Feature Engineering

In this notebook, we take our original dataset and condense its 600,000+ rows into roughly 2,700 rows, one per workout program. From there, we one-hot encode the categorical columns so the model can interpret them cleanly, and create the first version of the description column’s vector embeddings. These embeddings will allow matching to be very specific to a user’s program query.

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer
huge_data = pd.read_csv('../data/cleaned_600k.csv')
huge_data.drop(columns=['Unnamed: 0'], inplace=True)
goal_map = {
    'Olympic Weightlifting': 'goal_olympic_weightlifting',
    'Muscle & Sculpting': 'goal_muscle_&_sculpting',
    'Bodyweight Fitness': 'goal_bodyweight_fitness',
    'Powerbuilding': 'goal_powerbuilding',
    'Bodybuilding': 'goal_bodybuilding',
    'Powerlifting': 'goal_powerlifting',
    'Athletics': 'goal_athletics'
}

level_map = {
    'Beginner': 'level_beginner',
    'Novice': 'level_novice',
    'Intermediate': 'level_intermediate',
    'Advanced': 'level_advanced'
}

def add_multilabel_onehot(df, col, value_map, prefix):
    # Explode list-valued entries so each label gets its own row, then map raw
    # labels (e.g. 'Olympic Weightlifting') to their indicator column names,
    # falling back to a simple prefix for values not covered by the map.
    exploded = df[[col]].explode(col)
    exploded[col] = exploded[col].apply(lambda x: value_map.get(x, f"{prefix}{x}"))

    one_hot = pd.get_dummies(exploded[col])
    one_hot = one_hot.groupby(exploded.index).sum()

    expected_cols = list(value_map.values())
    for colname in expected_cols:
        if colname not in one_hot.columns:
            one_hot[colname] = 0
    one_hot = one_hot[expected_cols]

    for c in one_hot.columns:
        df[c] = one_hot[c]
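As a quick illustration on hypothetical toy data (not from the real dataset), the helper adds one indicator column per value in the map, and a multi-label row receives multiple 1s:

# Hypothetical toy data to illustrate add_multilabel_onehot
toy = pd.DataFrame({'goal': [['Bodybuilding', 'Powerlifting'], ['Athletics']]})
add_multilabel_onehot(toy, 'goal', goal_map, 'goal_')
toy[['goal_bodybuilding', 'goal_powerlifting', 'goal_athletics']]
# row 0 -> 1, 1, 0    row 1 -> 0, 0, 1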
# Clean up sets and reps columns, create new columns for the model to learn on
# Sets is average per week, reps is average per exercise

huge_data['is_rep_based'] = huge_data['reps'] > 0
huge_data['reps_count'] = huge_data['reps'].clip(lower=0)      # positive values are rep counts
huge_data['reps_time'] = (-huge_data['reps']).clip(lower=0)    # negative values are time-based, flipped positive


# Precompute program_length for each (title, description) pair to avoid repeated lookups
program_length_map = huge_data.drop_duplicates(['title', 'description']) \
    .set_index(['title', 'description'])['program_length'].to_dict()

def per_week(series, title, description):
    program_length = program_length_map.get((title, description), 0)
    return series.sum() / program_length if program_length else 0

# Group by program, aggregate features, and compute sets & reps per week

grouped = huge_data.groupby(['title', 'description'])
program_features = grouped.agg({
    'reps_count': 'mean',   # mean reps per exercise
    'reps_time': 'mean',
    'is_rep_based': 'mean'
}).reset_index()

# Compute sets per week and reps per week
program_features['sets'] = [
    per_week(group['sets'], title, description)
    for (title, description), group in grouped
]
program_features['reps_per_week'] = [
    per_week(group['reps_count'], title, description)
    for (title, description), group in grouped
]
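The list comprehensions above rely on grouped iterating in the same order as the rows produced by .reset_index(); both follow the sorted group keys, so the per-week values line up row by row. An optional sanity check using the variables already defined here:

# Optional check: group iteration order matches program_features' row order
assert list(grouped.groups.keys()) == list(
    zip(program_features['title'], program_features['description'])
)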
# Extract categorical and numerical features from original dataset
# Aggregate to program-level to ensure one row per program
program_metadata = huge_data.groupby(['title', 'description']).agg({
    'level': 'first',
    'goal': 'first',
    'equipment': 'first',
    'program_length': 'mean',
    'time_per_workout': 'mean',
    'intensity': 'mean'
}).reset_index()

# One-hot encode nested categorical features
categorical_cols = ['level', 'goal', 'equipment']
add_multilabel_onehot(program_metadata, 'level', level_map, 'level_')
add_multilabel_onehot(program_metadata, 'goal', goal_map, 'goal_')

# One-hot encode the single-valued categorical feature
ohe = OneHotEncoder(sparse_output=False)
equip_ohe = ohe.fit_transform(program_metadata[['equipment']])

feature_names = ohe.get_feature_names_out(['equipment'])
equip_df = pd.DataFrame(equip_ohe, columns=feature_names, index=program_metadata.index)

program_metadata = program_metadata.join(equip_df)

# Merge the sets and reps features back with the rest of the program metadata
program_features = program_features.merge(
    program_metadata,
    on=['title', 'description'],
    how='left'
)

program_features = program_features.drop(columns=['level', 'goal', 'equipment'])
# Combining textual columns for model training
program_features['text'] = program_features['title'] + ' ' + program_features['description']
program_features
program_features.describe()
texts = program_features['text'].to_list()
BATCH_SIZE = 64
embeddings = []

# The device can be changed to 'cuda' or 'cpu' for Windows machines or Colab notebooks
model = SentenceTransformer('all-MiniLM-L6-v2', device='mps')

for i in range(0, len(texts), BATCH_SIZE):
    batch = texts[i: i + BATCH_SIZE]
    embedding = model.encode(batch)
    embeddings.append(embedding)

embeddings = np.vstack(embeddings)

# Add embeddings back to features dataframe
embd_cols = [f'embd_{i}' for i in range(embeddings.shape[1])]
embd_df = pd.DataFrame(embeddings, columns=embd_cols, index=program_features.index)
program_features = pd.concat([program_features, embd_df], axis=1)
md_cols = [
    'reps_count', 'reps_time', 'is_rep_based',
    'sets', 'reps_per_week', 'program_length', 'time_per_workout',
    'intensity', 'level_beginner', 'level_novice', 'level_intermediate',
    'level_advanced', 'goal_olympic_weightlifting',
    'goal_muscle_&_sculpting', 'goal_bodyweight_fitness',
    'goal_powerbuilding', 'goal_bodybuilding', 'goal_powerlifting',
    'goal_athletics', 'equipment_at home', 'equipment_dumbbell only',
    'equipment_full gym', 'equipment_garage gym'
]

final_features = program_features[md_cols + embd_cols]
final_features

We have now converted our dataset into purely numerical features that we can feed into a clustering model like KMeans to cluster the programs quickly and efficiently. We will also be able to use cosine similarity to find the programs closest to the one a user searches for.
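As a minimal sketch of that next step (the cluster count and query string below are placeholders, and in practice the numeric columns would likely be scaled before clustering):

from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

# Cluster the program-level feature vectors (placeholder cluster count)
kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(final_features)

# Embed a hypothetical user query and rank programs by cosine similarity
query_vec = model.encode(["intermediate powerlifting program, 4 days per week"])
scores = cosine_similarity(query_vec, embeddings)[0]
program_features.iloc[np.argsort(scores)[::-1][:5]][['title', 'description']]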