Creating New Metrics#
We’ll create new variables for future analysis.
Using 5-digit zip codes for geo-analysis may result in overly small sample sizes for some regions, so we’ll work with 3-digit prefixes.
df_geolocations['geolocation_zip_code_prefix_3_digits'] = df_geolocations['geolocation_zip_code_prefix'].astype(str).str[0:3].astype(int)
df_customers['customer_zip_code_prefix_3_digits'] = df_customers['customer_zip_code_prefix'].astype(str).str[0:3].astype(int)
df_sellers['seller_zip_code_prefix_3_digits'] = df_sellers['seller_zip_code_prefix'].astype(str).str[0:3].astype(int)
Table customers#
We’ll add population data (2018) for each customer state.
population = {
'AC': 869265,
'AL': 3322820,
'AP': 829494,
'AM': 4080611,
'BA': 14812617,
'CE': 9075649,
'DF': 3974703,
'ES': 3972388,
'GO': 6921161,
'MA': 7035055,
'MT': 3441998,
'MS': 2748023,
'MG': 21040662,
'PA': 8513497,
'PB': 3996496,
'PR': 11348937,
'PE': 9496294,
'PI': 3264531,
'RJ': 17159960,
'RN': 3479010,
'RS': 11329605,
'RO': 1757589,
'RR': 576568,
'SC': 7075494,
'SP': 45538936,
'SE': 2278308,
'TO': 1555229
}
df_customers['population'] = df_customers['customer_state'].str.upper().map(population)
Table orders#
Purchase-to-Payment Approval Time
df_orders['from_purchase_to_approved_hours'] = df_orders.order_approved_dt - df_orders.order_purchase_dt
(df_orders['from_purchase_to_approved_hours'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to hours.
df_orders['from_purchase_to_approved_hours'] = df_orders.from_purchase_to_approved_hours.dt.total_seconds() / 3600
Purchase-to-Carrier Handoff Time
df_orders['from_purchase_to_carrier_days'] = df_orders.order_delivered_carrier_dt - df_orders.order_purchase_dt
(df_orders['from_purchase_to_carrier_days'] < pd.Timedelta(0)).sum()
np.int64(166)
Replace negative values with the median…
median_ = df_orders['from_purchase_to_carrier_days'][lambda x: x >= pd.Timedelta(0)].median()
df_orders.loc[lambda x: x['from_purchase_to_carrier_days'] < pd.Timedelta(0), 'from_purchase_to_carrier_days'] = median_
(df_orders['from_purchase_to_carrier_days'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to days…
df_orders['from_purchase_to_carrier_days'] = df_orders.from_purchase_to_carrier_days.dt.total_seconds() / (24 * 3600)
Purchase-to-Customer Delivery Time
df_orders['delivery_time_days'] = df_orders.order_delivered_customer_dt - df_orders.order_purchase_dt
(df_orders['delivery_time_days'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to days…
df_orders['delivery_time_days'] = df_orders['delivery_time_days'].dt.total_seconds() / (24 * 3600)
Estimated Delivery Time
df_orders['delivery_time_estimated_days'] = df_orders.order_estimated_delivery_dt - df_orders.order_purchase_dt
(df_orders['delivery_time_estimated_days'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to days.
df_orders['delivery_time_estimated_days'] = df_orders['delivery_time_estimated_days'].dt.total_seconds() / (24 * 3600)
Actual vs. Estimated Delivery Time Difference
df_orders['delivery_delay_days'] = df_orders.order_delivered_customer_dt - df_orders.order_estimated_delivery_dt
Convert to days…
df_orders['delivery_delay_days'] = df_orders['delivery_delay_days'].dt.total_seconds() / (24 * 3600)
Payment Approval-to-Carrier Handoff Time
df_orders['from_approved_to_carrier_days'] = df_orders.order_delivered_carrier_dt - df_orders.order_approved_dt
(df_orders['from_approved_to_carrier_days'] < pd.Timedelta(0)).sum()
np.int64(1359)
Replace negative values with the median…
median_ = df_orders['from_approved_to_carrier_days'][lambda x: x >= pd.Timedelta(0)].median()
df_orders.loc[lambda x: x['from_approved_to_carrier_days'] < pd.Timedelta(0), 'from_approved_to_carrier_days'] = median_
(df_orders['from_approved_to_carrier_days'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to days…
df_orders['from_approved_to_carrier_days'] = df_orders.from_approved_to_carrier_days.dt.total_seconds() / (24 * 3600)
Carrier Delivery Time
df_orders['from_carrier_to_customer_days'] = df_orders.order_delivered_customer_dt - df_orders.order_delivered_carrier_dt
(df_orders['from_carrier_to_customer_days'] < pd.Timedelta(0)).sum()
np.int64(19)
Replace with median value.
median_ = df_orders['from_carrier_to_customer_days'][lambda x: x >= pd.Timedelta(0)].median()
df_orders.loc[lambda x: x['from_carrier_to_customer_days'] < pd.Timedelta(0), 'from_carrier_to_customer_days'] = median_
(df_orders['from_carrier_to_customer_days'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to days…
df_orders['from_carrier_to_customer_days'] = df_orders['from_carrier_to_customer_days'].dt.total_seconds() / (24 * 3600)
Table reviews#
Review Response Time
df_reviews['review_answer_time_days'] = df_reviews.review_answer_dt - df_reviews.review_creation_dt
(df_reviews['review_answer_time_days'] < pd.Timedelta(0)).sum()
np.int64(0)
Convert to days…
df_reviews['review_answer_time_days'] = df_reviews['review_answer_time_days'].dt.total_seconds() / (24 * 3600)
Review Character Length
df_reviews['review_comment_message_len'] = df_reviews['review_comment_message'].str.len()
Table items and product#
Create a new variable for total product cost including shipping.
df_items['total_price'] = df_items['price'] + df_items['freight_value']
Create a new variable for product volume.
df_products['product_volume_cm3'] = df_products['product_length_cm'] * df_products['product_height_cm'] * df_products['product_width_cm']
Create a new weight-to-volume ratio variable.
df_products['weight_to_volume_ratio'] = (df_products['product_weight_g'] / df_products['product_volume_cm3']).round(2)