Data Preparation
Data source
There are two popular options:
- Download MovieLens Directly - https://files.grouplens.org/datasets/movielens/ml-100k.zip
- AWS Retail Demo Store - https://github.com/aws-samples/retail-demo-store.git
Data for this project
The dataset is generated by the script below, for practice purposes only.
/dataset_prepared.py
import pandas as pd
import random
import time
# ===================
# Requirements
# ===================
# Minimum 1,000 interactions
# Minimum 25 unique users
# Minimum 2 interactions per user
def generate_interactions_with_events(
    num_users=100,
    num_items=200,
    num_interactions=2000
):
    """Generate a synthetic user-item interactions table with event types.

    Every interaction independently picks a random user, a random item, a
    weighted-random event type and a random Unix timestamp (in seconds)
    within the 60 days preceding the call. Users are drawn at random, so a
    minimum number of interactions per user is not enforced by this function.

    Args:
        num_users: Number of distinct user IDs (``user_1`` .. ``user_N``)
            to draw from.
        num_items: Number of distinct item IDs (``item_1`` .. ``item_N``)
            to draw from.
        num_interactions: Number of rows to generate.

    Returns:
        A pandas DataFrame with the columns ``USER_ID``, ``ITEM_ID``,
        ``TIMESTAMP`` and ``EVENT_TYPE``, sorted by ``TIMESTAMP`` ascending.
        When ``num_interactions`` is 0 the frame is empty but still carries
        those four columns.

    Raises:
        IndexError: If ``num_users`` or ``num_items`` is less than 1 while
            ``num_interactions`` is positive (there is nothing to choose
            from).
    """
    columns = ['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'EVENT_TYPE']

    # Event types with their sampling weights (some events are more common)
    event_types = {
        'view': 0.50,         # 50% of events
        'click': 0.25,        # 25% of events
        'add_to_cart': 0.15,  # 15% of events
        'purchase': 0.10,     # 10% of events
    }

    users = [f'user_{i}' for i in range(1, num_users + 1)]
    items = [f'item_{i}' for i in range(1, num_items + 1)]
    events = list(event_types)
    weights = list(event_types.values())

    # Timestamps are spread over the 60 days that end at call time
    window_seconds = 60 * 24 * 60 * 60
    base_timestamp = int(time.time()) - window_seconds

    data = []
    for _ in range(num_interactions):
        # Draw order (user, item, event, timestamp) is kept fixed so that a
        # seeded ``random`` module produces the same data as before.
        user = random.choice(users)
        item = random.choice(items)
        event_type = random.choices(events, weights=weights)[0]
        timestamp = base_timestamp + random.randint(0, window_seconds)
        data.append({
            'USER_ID': user,
            'ITEM_ID': item,
            'TIMESTAMP': timestamp,
            'EVENT_TYPE': event_type,
        })

    # Explicit columns keep the schema intact even when no rows were
    # generated; otherwise sorting an empty frame fails with KeyError.
    df = pd.DataFrame(data, columns=columns)
    return df.sort_values('TIMESTAMP')
# Generate the synthetic interactions and save them as CSV (index omitted)
interactions_df = generate_interactions_with_events()
interactions_df.to_csv('interactions.csv', index=False)

# Preview the first rows, then show how many rows each event type received
print(interactions_df.head(10))
print("\nEvent Type Distribution:")
print(interactions_df['EVENT_TYPE'].value_counts())