Nhảy tới nội dung

Data Preparation

Data source

There are two popular options, as listed below:

Data for this project

The dataset generated below is for practice purposes only.

/dataset_prepared.py

import pandas as pd
import random
import time
# ===================
# Requirements
# ===================
# Minimum 1,000 interactions
# Minimum 25 unique users
# Minimum 2 interactions per user

def generate_interactions_with_events(
    num_users=100,
    num_items=200,
    num_interactions=2000,
    seed=None,
):
    """Generate a synthetic user-item interactions table with event types.

    Each row pairs a user ID with an item ID, a Unix timestamp in seconds
    drawn uniformly from the 60 days preceding the call, and an event type
    drawn according to fixed weights (view 50%, click 25%, add_to_cart 15%,
    purchase 10%).

    When ``num_interactions >= 2 * num_users``, every user is guaranteed to
    appear in at least two rows; the remaining rows pick users uniformly at
    random. With fewer rows than that, all users are picked at random and no
    per-user minimum is guaranteed.

    Args:
        num_users: Number of distinct user IDs (``user_1`` .. ``user_N``).
        num_items: Number of distinct item IDs (``item_1`` .. ``item_N``).
        num_interactions: Number of rows to generate.
        seed: Optional seed for a private random generator, making the
            sampled users, items, event types and timestamp offsets
            reproducible. When ``None``, the module-level ``random`` state
            is used, exactly as before this parameter existed.

    Returns:
        A pandas DataFrame with columns ``USER_ID``, ``ITEM_ID``,
        ``TIMESTAMP`` and ``EVENT_TYPE``, sorted by ``TIMESTAMP`` ascending.
        If ``num_interactions`` is zero or negative, an empty DataFrame that
        still carries those four columns is returned.

    Raises:
        ValueError: If rows are requested but ``num_users`` or ``num_items``
            is less than 1.
    """
    columns = ['USER_ID', 'ITEM_ID', 'TIMESTAMP', 'EVENT_TYPE']

    # Without explicit columns an empty frame has no 'TIMESTAMP' to sort on,
    # so handle the "no rows" case up front.
    if num_interactions <= 0:
        return pd.DataFrame(columns=columns)

    if num_users < 1 or num_items < 1:
        raise ValueError(
            "num_users and num_items must both be at least 1 "
            "when num_interactions is positive"
        )

    # Define event types with weights (some events more common than others)
    event_types = {
        'view': 0.50,         # 50% of events
        'click': 0.25,        # 25% of events
        'add_to_cart': 0.15,  # 15% of events
        'purchase': 0.10,     # 10% of events
    }

    # A private generator keeps seeded runs reproducible without touching the
    # global random state; the module itself is used when no seed is given.
    rng = random if seed is None else random.Random(seed)

    users = [f'user_{i}' for i in range(1, num_users + 1)]
    items = [f'item_{i}' for i in range(1, num_items + 1)]
    events = list(event_types.keys())
    weights = list(event_types.values())

    # Pre-assign two rows to every user when there are enough rows to do so,
    # so the "minimum 2 interactions per user" requirement always holds.
    guaranteed_users = users * 2 if num_interactions >= 2 * num_users else []

    window_seconds = 60 * 24 * 60 * 60  # 60 days
    base_timestamp = int(time.time()) - window_seconds  # 60 days ago

    # Generate interactions
    data = []
    for row_number in range(num_interactions):
        if row_number < len(guaranteed_users):
            user = guaranteed_users[row_number]
        else:
            user = rng.choice(users)

        data.append({
            'USER_ID': user,
            'ITEM_ID': rng.choice(items),
            'TIMESTAMP': base_timestamp + rng.randint(0, window_seconds),
            'EVENT_TYPE': rng.choices(events, weights=weights)[0],
        })

    df = pd.DataFrame(data, columns=columns)
    df = df.sort_values('TIMESTAMP')

    return df

# Build the synthetic interaction log with the default sizes and write it
# to interactions.csv (row index omitted from the file).
interactions_df = generate_interactions_with_events()
interactions_df.to_csv('interactions.csv', index=False)

# Show the first ten rows, then how many rows fall under each event type.
preview_rows = interactions_df.head(10)
print(preview_rows)

event_type_counts = interactions_df['EVENT_TYPE'].value_counts()
print("\nEvent Type Distribution:")
print(event_type_counts)