Creating New Dimensions#

We’ll create new dimensions for future analysis.

Table payments#

Create a new dimension indicating installment payments.

df_payments['has_installments'] = (df_payments.payment_installments > 1)
df_payments['has_installments'] = df_payments.has_installments.map({True: 'Has Installments', False: 'No Installments'}).astype('category')

Table orders#

Delivery Failure Reason

Categorize as follows (note: “approved” could fit both categories - we’ll classify it as service-side):

  • Service-side issues: shipped, processing, unavailable, approved

  • Customer-side issues: created, invoiced, canceled

rules = {
    "No Issues": lambda x: x == 'Delivered',
    "Service Issue": lambda x: x.isin(['Approved', 'Shipped', 'Processing', 'Unavailable']),
    'Customer Issue': lambda x: x.isin(['Created', 'Invoiced', 'Canceled']),
    "Missing Status": "default"
}
df_orders['delivery_issue_reason'] = df_orders.order_status.preproc.to_categorical(rules=rules)
Count
No Issues 96211
Service Issue 2000
Customer Issue 881

Delivery Delay Status

rules = {
    "Missing Delivery Dt": lambda x: df_orders['order_delivered_customer_dt'].isna() | df_orders['order_estimated_delivery_dt'].isna(),
    "Delayed": lambda x: df_orders['order_delivered_customer_dt'] > df_orders['order_estimated_delivery_dt'], 
    'Not Delayed': lambda x: df_orders['order_delivered_customer_dt'] <= df_orders['order_estimated_delivery_dt'],    
    "Missing Delivery Dt": "default"
}
df_orders['is_delayed'] = df_orders.order_delivered_customer_dt.preproc.to_categorical(rules=rules)
Count
Not Delayed 88389
Delayed 7823
Missing Delivery Dt 2880

Delivery Status

rules = {
    "Delivered": lambda x: x == 'Delivered',
    "Not Delivered": lambda x: x != 'Delivered', 
    "Missing Status": "default"
}
df_orders['is_delivered'] = df_orders.order_status.preproc.to_categorical(rules=rules)
Count
Delivered 96211
Not Delivered 2881

Season

Create a season variable.

Note: South American seasons differ from Europe due to hemispheric positioning.

rules = {
    "Summer": lambda x: x.dt.month.isin([12, 1, 2]),
    "Autumn": lambda x: x.dt.month.isin([3, 4, 5]),
    "Winter": lambda x: x.dt.month.isin([6, 7, 8]),
    "Spring": lambda x: x.dt.month.isin([9, 10, 11]),
    "Missing Purchase Dt": "default"
}
df_orders['purchase_season'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)
Count
Winter 30573
Autumn 29809
Summer 22250
Spring 16460

Year

df_orders['purchase_year'] = df_orders['order_purchase_dt'].dt.year.fillna('Missing Purchase Dt')
df_orders['purchase_year'] = df_orders['purchase_year'].astype('category')
df_orders['purchase_year'].value_counts()
purchase_year
2018    53991
2017    45101
Name: count, dtype: int64

Month Name

Create a month name variable for the orders table.

df_orders['purchase_month'] = df_orders['order_purchase_dt'].dt.month_name().fillna('Missing Purchase Dt')
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
df_orders['purchase_month'] = df_orders['purchase_month'].astype('category').cat.reorder_categories(month_order, ordered=True)
df_orders['purchase_month'].value_counts()
purchase_month
August       10843
May          10573
July         10318
March         9893
June          9412
April         9343
February      8508
January       8069
November      7544
December      5673
October       4631
September     4285
Name: count, dtype: int64

Day Type

rules = {
    "Weekend": lambda x: x.dt.day_of_week.isin([5, 6]),
    "Weekday": lambda x: x.dt.day_of_week.isin(range(5)),
    "Missing Purchase Dt": "default"
}
df_orders['purchase_day_type'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)
Count
Weekday 76317
Weekend 22775

Day of Week

df_orders['purchase_weekday'] = df_orders['order_purchase_dt'].dt.day_name().fillna('Missing Purchase Dt')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_orders['purchase_weekday'] = df_orders['purchase_weekday'].astype('category').cat.reorder_categories(weekday_order, ordered=True)
df_orders['purchase_weekday'].value_counts(dropna=False)
purchase_weekday
Monday       16141
Tuesday      15896
Wednesday    15501
Thursday     14704
Friday       14075
Sunday       11932
Saturday     10843
Name: count, dtype: int64

Time of Day

rules = {
    "Morning": lambda x: x.dt.hour.between(5,11),
    "Afternoon": lambda x: x.dt.hour.between(12,16),
    "Evening": lambda x: x.dt.hour.between(17,22),
    "Night": lambda x: x.dt.hour.isin([23, 0, 1, 2, 3, 4]),
    "Missing Purchase Dt": "default"
}
df_orders['purchase_time_of_day'] = df_orders.order_purchase_dt.preproc.to_categorical(rules=rules)
Count
Evening 36015
Afternoon 32095
Morning 22338
Night 8644

Hour

df_orders['purchase_hour'] = df_orders['order_purchase_dt'].dt.hour.fillna('Missing Purchase Dt')
df_orders['purchase_hour'] = df_orders['purchase_hour'].astype('category')
df_orders['purchase_hour'].value_counts()
purchase_hour
16    6655
11    6554
14    6547
13    6494
15    6426
21    6201
20    6178
10    6151
17    6130
12    5973
19    5971
22    5782
18    5753
9     4765
23    4108
8     2953
0     2387
7     1225
1     1168
2      506
6      502
3      272
4      203
5      188
Name: count, dtype: int64

Delivery Time Category

Let’s look at quantiles

df_orders.delivery_time_days.quantile([0, 0.05, 0.25, 0.5, 0.75, 0.95, 1])
0.00     0.53
0.05     3.01
0.25     6.76
0.50    10.21
0.75    15.68
0.95    29.22
1.00   209.63
Name: delivery_time_days, dtype: float64

Based on quantile analysis:

  • Fast: ≤5 days

  • Medium: 5-15 days

  • Long: >15 days

labels = ['Fast', 'Medium', 'Long']
bins = [-np.inf, 5, 15, np.inf]
df_orders['delivery_time_days_cat'] = df_orders.delivery_time_days.preproc.to_categorical(method='custom_bins', labels=labels, bins=bins)
Count
Medium 56558
Long 26226
Fast 13428
NaN 2880

Table reviews#

Season

rules = {
    "Summer": lambda x: x.dt.month.isin([12, 1, 2]),
    "Autumn": lambda x: x.dt.month.isin([3, 4, 5]),
    "Winter": lambda x: x.dt.month.isin([6, 7, 8]),
    "Spring": lambda x: x.dt.month.isin([9, 10, 11]),
    "Missing Review Dt": "default"
}
df_reviews['review_season'] = df_reviews.review_creation_dt.preproc.to_categorical(rules=rules)
Count
Winter 32975
Autumn 31031
Summer 22124
Spring 13518

Day Type

rules = {
    "Weekend": lambda x: x.dt.day_of_week.isin([5, 6]),
    "Weekday": lambda x: x.dt.day_of_week.isin(range(5)),
    "Missing Review Dt": "default"
}
df_reviews['review_day_type'] = df_reviews.review_creation_dt.preproc.to_categorical(rules=rules)
Count
Weekday 75263
Weekend 24385

Day of Week

df_reviews['review_creation_weekday'] = df_reviews['review_creation_dt'].dt.day_name().fillna('Missing Review Dt')
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
df_reviews['review_creation_weekday'] = df_reviews['review_creation_weekday'].astype('category').cat.reorder_categories(weekday_order, ordered=True)
df_reviews['review_creation_weekday'].value_counts()
review_creation_weekday
Wednesday    19232
Thursday     18907
Tuesday      18163
Friday       17858
Saturday     17846
Sunday        6539
Monday        1103
Name: count, dtype: int64

Table products#

Translate category names to English

Verify key consistency.

fron.analyze_join_keys(df_products, df_categories, 'product_category_name', how='left')
Join info
Type Left-only keys Right-only keys Left size Right size Left join size
N:1 3 0 32,951 71 32,951

Examine category discrepancies.

np.setdiff1d(df_products.product_category_name.unique(), df_categories.product_category_name.unique())
array(['Missing in Products', 'pc_gamer',
       'portateis_cozinha_e_preparadores_de_alimentos'], dtype=object)

Add English translations.

df_categories.loc[df_categories.shape[0]] = ('pc_gamer', 'Gaming Pc')
df_categories.loc[df_categories.shape[0]] = ('portateis_cozinha_e_preparadores_de_alimentos', 'Kitchen Appliances And Food Preparers')
df_categories.loc[df_categories.shape[0]] = ('Missing in Products', 'Missing in Products')

Replace categories with English versions.

df_products = (df_products.merge(df_categories, on='product_category_name', how='left')
               .drop(columns=['product_category_name'])
               .rename(columns={'product_category_name_english':'product_category'})
)

Check for missing values.

df_products.explore.detect_anomalies()
Missings by Column
  Count Percent
product_name_lenght 610 1.85%
product_description_lenght 610 1.85%

Generalized Product Categories

Create broader product categories for visualization.

category_mapping = {
    'Furniture': [
        'Office Furniture', 'Furniture Decor', 'Furniture Living Room', 
        'Kitchen Dining Laundry Garden Furniture', 'Bed Bath Table', 
        'Home Comfort', 'Home Comfort 2', 'Home Construction', 
        'Garden Tools', 'Furniture Bedroom', 'Furniture Mattress And Upholstery'
    ],
    'Electronics': [
        'Auto', 'Computers Accessories', 'Gaming Pc', 'Musical Instruments', 
        'Consoles Games', 'Watches Gifts', 'Air Conditioning', 'Telephony', 
        'Electronics', 'Fixed Telephony', 'Tablets Printing Image', 
        'Computers', 'Small Appliances Home Oven And Coffee', 
        'Small Appliances', 'Audio', 'Signaling And Security', 
        'Security And Services'
    ],
    'Fashion': [
        'Fashio Female Clothing', 'Fashion Male Clothing', 
        'Fashion Bags Accessories', 'Fashion Shoes', 'Fashion Sport', 
        'Fashion Underwear Beach', 'Fashion Childrens Clothes', 'Baby', 
        'Cool Stuff'
    ],
    'Home & Garden': [
        'Housewares', 'Kitchen Appliances And Food Preparers', 
        'Home Confort', 'Home Appliances', 'Home Appliances 2', 
        'Flowers', 'Costruction Tools Garden', 'Garden Tools', 
        'Construction Tools Lights', 'Costruction Tools Tools', 
        'Luggage Accessories', 'La Cuisine', 'Pet Shop', 'Market Place'
    ],
    'Entertainment': [
        'Sports Leisure', 'Toys', 'Cds Dvds Musicals', 'Music', 
        'Dvds Blu Ray', 'Cine Photo', 'Party Supplies', 
        'Christmas Supplies', 'Arts And Craftmanship', 'Art'
    ],
    'Beauty & Health': [
        'Health Beauty', 'Perfumery', 'Diapers And Hygiene'
    ],
    'Food & Drinks': [
        'Food Drink', 'Drinks', 'Food'
    ],
    'Books & Stationery': [
        'Books General Interest', 'Books Technical', 
        'Books Imported', 'Stationery'
    ],
    'Industry & Construction': [
        'Construction Tools Construction', 'Construction Tools Safety', 
        'Industry Commerce And Business', 'Agro Industry And Commerce'
    ]
}

category_dict = {item: category for category, items in category_mapping.items() for item in items}
df_products['general_product_category'] = df_products['product_category'].map(category_dict).fillna('Missing in Products').astype('category')
df_products['general_product_category'].value_counts(dropna=False)
general_product_category
Electronics                7822
Furniture                  6530
Home & Garden              5070
Entertainment              4547
Beauty & Health            3324
Fashion                    2929
Books & Stationery         1219
Industry & Construction     633
Missing in Products         610
Food & Drinks               267
Name: count, dtype: int64

Table df_geolocations#

Add a field indicating South American coordinates.

df_geolocations['in_south_america'] = (
    (df_geolocations.geolocation_lat >= -53.90) &  # Southern border
    (df_geolocations.geolocation_lat <= 12.45) &   # Northern border
    (df_geolocations.geolocation_lng >= -81.32) &  # Western border
    (df_geolocations.geolocation_lng <= -34.79)    # Eastern border
)