def generate_advanced_dataset(n_transactions=5000, seed=42):
    """Build a synthetic retail-transactions DataFrame.

    Each row is one randomly generated transaction drawn from a fixed
    two-year window (730 consecutive days starting 2022-01-01).

    Args:
        n_transactions: Number of rows to generate. Must be at least 1.
        seed: Value passed to ``np.random.seed`` before sampling so the
            output is reproducible. Note that this reseeds NumPy's
            *global* legacy RNG. ``None`` lets NumPy pick fresh entropy.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Date``, ``Category``,
        ``Product``, ``Price``, ``Quantity``, ``Revenue``,
        ``Customer_Segment``, ``Age_Group``, ``Region``, ``Discount_%``,
        ``Marketing_Channel``, ``Customer_Satisfaction``, ``Month``,
        ``Year``, ``Quarter``, ``Profit_Margin`` and ``Days_Since_Start``.

    Raises:
        ValueError: If ``n_transactions`` is smaller than 1.
    """
    if n_transactions < 1:
        # An empty frame has no columns, so the derived-column step below
        # would fail with an opaque KeyError; reject it up front instead.
        raise ValueError("n_transactions must be a positive integer")

    np.random.seed(seed)

    start_date = datetime(2022, 1, 1)
    # Must be a concrete sequence: np.random.choice cannot sample a generator.
    dates = [start_date + timedelta(days=offset) for offset in range(730)]

    categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports', 'Books']
    products = {
        'Electronics': ['Laptop', 'Smartphone', 'Headphones', 'Tablet', 'Smartwatch'],
        'Clothing': ['T-Shirt', 'Jeans', 'Dress', 'Jacket', 'Sneakers'],
        'Home & Garden': ['Furniture', 'Lamp', 'Rug', 'Plant', 'Cookware'],
        'Sports': ['Yoga Mat', 'Dumbbell', 'Running Shoes', 'Bicycle', 'Tennis Racket'],
        'Books': ['Fiction', 'Non-Fiction', 'Biography', 'Science', 'History'],
    }
    # (low, high) bounds of the uniform price draw per category.
    # Loop-invariant, so defined once rather than on every iteration.
    base_prices = {
        'Electronics': (200, 1500),
        'Clothing': (20, 150),
        'Home & Garden': (30, 500),
        'Sports': (25, 300),
        'Books': (10, 50),
    }

    data = []
    # NOTE: the order of the random draws below is part of the seeded
    # output; reordering them changes the generated dataset.
    for _ in range(n_transactions):
        date = np.random.choice(dates)
        category = np.random.choice(categories)
        product = np.random.choice(products[category])
        price = np.random.uniform(*base_prices[category])
        # Repeated values are intentional: effectively P(1)=0.85,
        # P(2)=0.13, P(3)=0.02.
        quantity = np.random.choice(
            [1, 1, 1, 2, 2, 3], p=[0.5, 0.2, 0.15, 0.1, 0.03, 0.02]
        )
        customer_segment = np.random.choice(
            ['Premium', 'Standard', 'Budget'], p=[0.2, 0.5, 0.3]
        )
        age_group = np.random.choice(['18-25', '26-35', '36-45', '46-55', '56+'])
        region = np.random.choice(['North', 'South', 'East', 'West', 'Central'])

        # Revenue multiplier: x1.5 in Nov/Dec, x1.2 in Jun/Jul, else x1.0.
        month = date.month
        seasonal_factor = 1.0
        if month in (11, 12):
            seasonal_factor = 1.5
        elif month in (6, 7):
            seasonal_factor = 1.2
        revenue = price * quantity * seasonal_factor

        discount = np.random.choice(
            [0, 5, 10, 15, 20, 25], p=[0.4, 0.2, 0.15, 0.15, 0.07, 0.03]
        )
        marketing_channel = np.random.choice(
            ['Organic', 'Social Media', 'Email', 'Paid Ads']
        )

        # Satisfaction: baseline 4.0, bumped for Premium customers and for
        # discounts above 15%, plus Gaussian noise, clamped to [1, 5].
        base_satisfaction = 4.0
        if customer_segment == 'Premium':
            base_satisfaction += 0.5
        if discount > 15:
            base_satisfaction += 0.3
        satisfaction = np.clip(base_satisfaction + np.random.normal(0, 0.5), 1, 5)

        data.append({
            'Date': date,
            'Category': category,
            'Product': product,
            'Price': round(price, 2),
            'Quantity': quantity,
            'Revenue': round(revenue, 2),
            'Customer_Segment': customer_segment,
            'Age_Group': age_group,
            'Region': region,
            'Discount_%': discount,
            'Marketing_Channel': marketing_channel,
            'Customer_Satisfaction': round(satisfaction, 2),
            'Month': date.strftime('%B'),
            'Year': date.year,
            'Quarter': f'Q{(date.month - 1) // 3 + 1}',
        })

    df = pd.DataFrame(data)
    # Derived columns: 30% of the discounted revenue, and the day offset of
    # each row from the earliest sampled date.
    df['Profit_Margin'] = round(df['Revenue'] * (1 - df['Discount_%'] / 100) * 0.3, 2)
    df['Days_Since_Start'] = (df['Date'] - df['Date'].min()).dt.days
    return df