Data Preparation

Data source

There are two popular options:

Download MovieLens Directly - https://files.grouplens.org/datasets/movielens/ml-100k.zip
AWS Retail Demo Store - https://github.com/aws-samples/retail-demo-store.git

Data for this project

The synthetic dataset below is generated for practice purposes only.

/dataset_prepared.py

import pandas as pd
import random
import time
# ===================
# Requirements
# ===================
# Minimum 1,000 interactions
# Minimum 25 unique users
# Minimum 2 interactions per user

def generate_interactions_with_events(
    num_users=100,
    num_items=200,
    num_interactions=2000,
    seed=None,
    min_interactions_per_user=2,
):
    """Generate a synthetic user-item interactions table with event types.

    Each row pairs a user with an item, a Unix timestamp (seconds) drawn
    uniformly from the last 60 days, and an event type drawn according to
    fixed weights (view 50%, click 25%, add_to_cart 15%, purchase 10%).

    Args:
        num_users: Number of distinct user ids (``user_1`` .. ``user_N``).
        num_items: Number of distinct item ids (``item_1`` .. ``item_N``).
        num_interactions: Total number of rows to generate.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            is used so users, items, event types and timestamp offsets are
            reproducible. When ``None``, the module-level ``random``
            generator is used.
        min_interactions_per_user: Every user is guaranteed at least this
            many rows, provided ``num_users * min_interactions_per_user``
            fits in ``num_interactions``. Otherwise a ``UserWarning`` is
            issued and users are assigned purely at random.

    Returns:
        A ``pandas.DataFrame`` with columns ``USER_ID``, ``ITEM_ID``,
        ``TIMESTAMP`` and ``EVENT_TYPE``, sorted by ``TIMESTAMP`` ascending.
        The frame is empty (but still has these columns) when
        ``num_interactions`` is 0.

    Raises:
        IndexError: If a random user or item must be drawn while
            ``num_users`` or ``num_items`` is less than 1.
    """
    import warnings  # local import: only needed on the infeasible-guarantee path

    columns = ['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'EVENT_TYPE']

    # Event types with their sampling weights (some events are more common).
    event_types = {
        'view': 0.50,        # 50% of events
        'click': 0.25,       # 25% of events
        'add_to_cart': 0.15, # 15% of events
        'purchase': 0.10     # 10% of events
    }
    window_seconds = 60 * 24 * 60 * 60  # 60 days

    # A private generator keeps seeded runs reproducible without touching
    # the global random state; unseeded runs keep using the global one.
    rng = random if seed is None else random.Random(seed)

    users = [f'user_{i}' for i in range(1, num_users + 1)]
    items = [f'item_{i}' for i in range(1, num_items + 1)]
    events = list(event_types)
    weights = list(event_types.values())

    # Reserve the first rows so that every user reaches the minimum count.
    # Row order is irrelevant here because the result is sorted by timestamp.
    guaranteed_users = [
        user for user in users for _ in range(min_interactions_per_user)
    ]
    if len(guaranteed_users) > num_interactions:
        warnings.warn(
            f"Cannot give each of {num_users} users at least "
            f"{min_interactions_per_user} interactions with only "
            f"{num_interactions} interactions; assigning users at random.",
            stacklevel=2,
        )
        guaranteed_users = []

    base_timestamp = int(time.time()) - window_seconds  # 60 days ago

    data = []
    for row in range(num_interactions):
        if row < len(guaranteed_users):
            user = guaranteed_users[row]
        else:
            user = rng.choice(users)

        data.append({
            'USER_ID': user,
            'ITEM_ID': rng.choice(items),
            'EVENT_TYPE': rng.choices(events, weights=weights)[0],
            'TIMESTAMP': base_timestamp + rng.randint(0, window_seconds),
        })

    # Explicit columns keep the schema (and the sort below) valid even when
    # no rows were generated.
    df = pd.DataFrame(data, columns=columns)
    df = df.sort_values('TIMESTAMP')

    return df

# Build the dataset with the default sizes and write it to CSV
# (index=False keeps the DataFrame index out of the file).
interactions_df = generate_interactions_with_events()
interactions_df.to_csv(path_or_buf='interactions.csv', index=False)

# Print the first ten rows, then the number of rows per event type.
print(
    interactions_df.head(10),
    "\nEvent Type Distribution:",
    interactions_df['EVENT_TYPE'].value_counts(),
    sep="\n",
)

Data source

Data for this project

Data source

Data for this project