Creating New Dimensions#
We’ll create new dimensions for future analysis.
Table payments#
Create a new dimension indicating installment payments.
df_payments['has_installments'] = (df_payments.payment_installments > 1)
df_payments['has_installments'] = df_payments.has_installments.map({True: 'Has Installments', False: 'No Installments'}).astype('category')
Table orders#
Delivery Failure Reason
Categorize as follows (note: “approved” could fit both categories - we’ll classify it as service-side):
Service-side issues: shipped, processing, unavailable, approved
Customer-side issues: created, invoiced, canceled
rules = {
"No Issues": lambda x: x == 'Delivered',
"Service Issue": lambda x: x.isin(['Approved', 'Shipped', 'Processing', 'Unavailable']),
'Customer Issue': lambda x: x.isin(['Created', 'Invoiced', 'Canceled']),
"Missing Status": "default"
}
df_orders['delivery_issue_reason'] = df_orders.order_status.preproc.to_categorical(rules=rules)
Count | |
---|---|
No Issues | 96211 |
Service Issue | 2000 |
Customer Issue | 881 |
Delivery Delay Status
rules = {
"Missing Delivery Dt": lambda x: df_orders['order_delivered_customer_dt'].isna() | df_orders['order_estimated_delivery_dt'].isna(),
"Delayed": lambda x: df_orders['order_delivered_customer_dt'] > df_orders['order_estimated_delivery_dt'],
'Not Delayed': lambda x: df_orders['order_delivered_customer_dt'] <= df_orders['order_estimated_delivery_dt'],
"Missing Delivery Dt": "default"
}
df_orders['is_delayed'] = df_orders.order_delivered_customer_dt.preproc.to_categorical(rules=rules)
Count | |
---|---|
Not Delayed | 88389 |
Delayed | 7823 |
Missing Delivery Dt | 2880 |
Delivery Status
rules = {
"Delivered": lambda x: x == 'Delivered',
"Not Delivered": lambda x: x != 'Delivered',
"Missing Status": "default"
}
df_orders['is_delivered'] = df_orders.order_status.preproc.to_categorical(rules=rules)
Count | |
---|---|
Delivered | 96211 |
Not Delivered | 2881 |
Season
Create a season variable.
Note: South American seasons differ from Europe due to hemispheric positioning.
rules = {
"Summer": lambda x: x.dt.month.isin([12, 1, 2]),
"Autumn": lambda x: x.dt.month.isin([3, 4, 5]),
"Winter": lambda x: x.dt.month.isin([6, 7, 8]),
"Spring": lambda x: x.dt.month.isin([9, 10, 11]),
"Missing Purchase Dt": "default"
}
df_orders['purchase_season'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)
Count | |
---|---|
Winter | 30573 |
Autumn | 29809 |
Summer | 22250 |
Spring | 16460 |
Year
df_orders['purchase_year'] = df_orders['order_purchase_dt'].dt.year.fillna('Missing Purchase Dt')
df_orders['purchase_year'] = df_orders['purchase_year'].astype('category')
df_orders['purchase_year'].value_counts()
purchase_year
2018 53991
2017 45101
Name: count, dtype: int64
Month Name
Create a month name variable for the orders table.
df_orders['purchase_month'] = df_orders['order_purchase_dt'].dt.month_name().fillna('Missing Purchase Dt')
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
df_orders['purchase_month'] = df_orders['purchase_month'].astype('category').cat.reorder_categories(month_order, ordered=True)
df_orders['purchase_month'].value_counts()
purchase_month
August 10843
May 10573
July 10318
March 9893
June 9412
April 9343
February 8508
January 8069
November 7544
December 5673
October 4631
September 4285
Name: count, dtype: int64
Day Type
rules = {
"Weekend": lambda x: x.dt.day_of_week.isin([5, 6]),
"Weekday": lambda x: x.dt.day_of_week.isin(range(5)),
"Missing Purchase Dt": "default"
}
df_orders['purchase_day_type'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)
Count | |
---|---|
Weekday | 76317 |
Weekend | 22775 |
Day of Week
df_orders['purchase_weekday'] = df_orders['order_purchase_dt'].dt.day_name().fillna('Missing Purchase Dt')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_orders['purchase_weekday'] = df_orders['purchase_weekday'].astype('category').cat.reorder_categories(weekday_order, ordered=True)
df_orders['purchase_weekday'].value_counts(dropna=False)
purchase_weekday
Monday 16141
Tuesday 15896
Wednesday 15501
Thursday 14704
Friday 14075
Sunday 11932
Saturday 10843
Name: count, dtype: int64
Time of Day
rules = {
"Morning": lambda x: x.dt.hour.between(5,11),
"Afternoon": lambda x: x.dt.hour.between(12,16),
"Evening": lambda x: x.dt.hour.between(17,22),
"Night": lambda x: x.dt.hour.isin([23, 0, 1, 2, 3, 4]),
"Missing Purchase Dt": "default"
}
df_orders['purchase_time_of_day'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)
Count | |
---|---|
Evening | 36015 |
Afternoon | 32095 |
Morning | 22338 |
Night | 8644 |
Hour
df_orders['purchase_hour'] = df_orders['order_purchase_dt'].dt.hour.fillna('Missing Purchase Dt')
df_orders['purchase_hour'] = df_orders['purchase_hour'].astype('category')
df_orders['purchase_hour'].value_counts()
purchase_hour
16 6655
11 6554
14 6547
13 6494
15 6426
21 6201
20 6178
10 6151
17 6130
12 5973
19 5971
22 5782
18 5753
9 4765
23 4108
8 2953
0 2387
7 1225
1 1168
2 506
6 502
3 272
4 203
5 188
Name: count, dtype: int64
Delivery Time Category
Let’s look at quantiles
df_orders.delivery_time_days.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
0.00 0.53
0.05 3.01
0.25 6.76
0.50 10.21
0.75 15.68
0.95 29.22
1.00 209.63
Name: delivery_time_days, dtype: float64
Based on quantile analysis:
Fast: ≤5 days
Medium: 5-15 days
Long: >15 days
labels = ['Fast', 'Medium', 'Long']
bins = [-np.inf, 5, 15, np.inf]
df_orders['delivery_time_days_cat'] = df_orders.delivery_time_days.preproc.to_categorical(method='custom_bins', labels=labels, bins=bins)
Count | |
---|---|
Medium | 56558 |
Long | 26226 |
Fast | 13428 |
NaN | 2880 |
Table reviews#
Season
rules = {
"Summer": lambda x: x.dt.month.isin([12, 1, 2]),
"Autumn": lambda x: x.dt.month.isin([3, 4, 5]),
"Winter": lambda x: x.dt.month.isin([6, 7, 8]),
"Spring": lambda x: x.dt.month.isin([9, 10, 11]),
"Missing Review Dt": "default"
}
df_reviews['review_season'] = df_reviews.review_creation_dt.preproc.to_categorical(rules=rules)
Count | |
---|---|
Winter | 32975 |
Autumn | 31031 |
Summer | 22124 |
Spring | 13518 |
Day Type
rules = {
"Weekend": lambda x: x.dt.day_of_week.isin([5, 6]),
"Weekday": lambda x: x.dt.day_of_week.isin(range(5)),
"Missing Review Dt": "default"
}
df_reviews['review_day_type'] = df_reviews.review_creation_dt.preproc.to_categorical(rules=rules)
Count | |
---|---|
Weekday | 75263 |
Weekend | 24385 |
Day of Week
df_reviews['review_creation_weekday'] = df_reviews['review_creation_dt'].dt.day_name().fillna('Missing Review Dt')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_reviews['review_creation_weekday'] = df_reviews['review_creation_weekday'].astype('category').cat.reorder_categories(weekday_order, ordered=True)
df_reviews['review_creation_weekday'].value_counts()
review_creation_weekday
Wednesday 19232
Thursday 18907
Tuesday 18163
Friday 17858
Saturday 17846
Sunday 6539
Monday 1103
Name: count, dtype: int64
Table products#
Translate category names to English
Verify key consistency.
fron.analyze_join_keys(df_products, df_categories, 'product_category_name', how='left')
Type | Left-only keys | Right-only keys | Left size | Right size | Left join size |
---|---|---|---|---|---|
N:1 | 3 | 0 | 32,951 | 71 | 32,951 |
Examine category discrepancies.
np.setdiff1d(df_products.product_category_name.unique(), df_categories.product_category_name.unique())
array(['Missing in Products', 'pc_gamer',
'portateis_cozinha_e_preparadores_de_alimentos'], dtype=object)
Add English translations.
df_categories.loc[df_categories.shape[0]] = ('pc_gamer', 'Gaming Pc')
df_categories.loc[df_categories.shape[0]] = ('portateis_cozinha_e_preparadores_de_alimentos', 'Kitchen Appliances And Food Preparers')
df_categories.loc[df_categories.shape[0]] = ('Missing in Products', 'Missing in Products')
Replace categories with English versions.
df_products = (df_products.merge(df_categories, on='product_category_name', how='left')
.drop(columns=['product_category_name'])
.rename(columns={'product_category_name_english':'product_category'})
)
Check for missing values.
df_products.explore.detect_anomalies()
Count | Percent | |
---|---|---|
product_name_lenght | 610 | 1.85% |
product_description_lenght | 610 | 1.85% |
Generalized Product Categories
Create broader product categories for visualization.
category_mapping = {
'Furniture': [
'Office Furniture', 'Furniture Decor', 'Furniture Living Room',
'Kitchen Dining Laundry Garden Furniture', 'Bed Bath Table',
'Home Comfort', 'Home Comfort 2', 'Home Construction',
'Garden Tools', 'Furniture Bedroom', 'Furniture Mattress And Upholstery'
],
'Electronics': [
'Auto', 'Computers Accessories', 'Gaming Pc', 'Musical Instruments',
'Consoles Games', 'Watches Gifts', 'Air Conditioning', 'Telephony',
'Electronics', 'Fixed Telephony', 'Tablets Printing Image',
'Computers', 'Small Appliances Home Oven And Coffee',
'Small Appliances', 'Audio', 'Signaling And Security',
'Security And Services'
],
'Fashion': [
'Fashio Female Clothing', 'Fashion Male Clothing',
'Fashion Bags Accessories', 'Fashion Shoes', 'Fashion Sport',
'Fashion Underwear Beach', 'Fashion Childrens Clothes', 'Baby',
'Cool Stuff'
],
'Home & Garden': [
'Housewares', 'Kitchen Appliances And Food Preparers',
'Home Confort', 'Home Appliances', 'Home Appliances 2',
'Flowers', 'Costruction Tools Garden', 'Garden Tools',
'Construction Tools Lights', 'Costruction Tools Tools',
'Luggage Accessories', 'La Cuisine', 'Pet Shop', 'Market Place'
],
'Entertainment': [
'Sports Leisure', 'Toys', 'Cds Dvds Musicals', 'Music',
'Dvds Blu Ray', 'Cine Photo', 'Party Supplies',
'Christmas Supplies', 'Arts And Craftmanship', 'Art'
],
'Beauty & Health': [
'Health Beauty', 'Perfumery', 'Diapers And Hygiene'
],
'Food & Drinks': [
'Food Drink', 'Drinks', 'Food'
],
'Books & Stationery': [
'Books General Interest', 'Books Technical',
'Books Imported', 'Stationery'
],
'Industry & Construction': [
'Construction Tools Construction', 'Construction Tools Safety',
'Industry Commerce And Business', 'Agro Industry And Commerce'
]
}
category_dict = {item: category for category, items in category_mapping.items() for item in items}
df_products['general_product_category'] = df_products['product_category'].map(category_dict).fillna('Missing in Products').astype('category')
df_products['general_product_category'].value_counts(dropna=False)
general_product_category
Electronics 7822
Furniture 6530
Home & Garden 5070
Entertainment 4547
Beauty & Health 3324
Fashion 2929
Books & Stationery 1219
Industry & Construction 633
Missing in Products 610
Food & Drinks 267
Name: count, dtype: int64
Table df_geolocations#
Add a field indicating South American coordinates.
df_geolocations['in_south_america'] = (
(df_geolocations.geolocation_lat >= -53.90) & # Southern border
(df_geolocations.geolocation_lat <= 12.45) & # Northern border
(df_geolocations.geolocation_lng >= -81.32) & # Western border
(df_geolocations.geolocation_lng <= -34.79) # Eastern border
)