Data Loading
Load the data.
Assign data types to columns where possible. Columns with missing values cannot be immediately converted to integer type. Leave such columns as is for now.
Display the first few rows of each dataframe.
Examine column types.
For categorical variables, perform normalization and convert to Title Case format for consistency and better visual presentation.
For text variables, normalize and convert to lowercase to eliminate implicit duplicates.
# Base URL of the raw Olist dataset files hosted on GitHub; every
# read_csv below appends a file name to this prefix.
base_url = "https://raw.githubusercontent.com/PavelGrigoryevDS/olist-deep-dive/main/data/"
Table df_orders
# Orders table: order_status is a small closed set, so read it as a
# categorical; all five timestamp columns are parsed on read.
date_cols = [
    'order_purchase_timestamp',
    'order_approved_at',
    'order_delivered_carrier_date',
    'order_delivered_customer_date',
    'order_estimated_delivery_date',
]
dtype = {'order_status': 'category'}
df_orders = pd.read_csv(
    f'{base_url}olist_orders_dataset.csv.gz',
    dtype=dtype,
    parse_dates=date_cols,
    date_format='%Y-%m-%d %H:%M:%S',
)
# Shorten the timestamp column names to the project-wide *_dt suffix.
rename_map = {
    'order_purchase_timestamp': 'order_purchase_dt',
    'order_approved_at': 'order_approved_dt',
    'order_delivered_carrier_date': 'order_delivered_carrier_dt',
    'order_delivered_customer_date': 'order_delivered_customer_dt',
    'order_estimated_delivery_date': 'order_estimated_delivery_dt',
}
df_orders = df_orders.rename(columns=rename_map)
df_orders = fo(df_orders)
df_orders.sample(5, random_state=7)
order_id | customer_id | order_status | order_purchase_dt | order_approved_dt | order_delivered_carrier_dt | order_delivered_customer_dt | order_estimated_delivery_dt | |
---|---|---|---|---|---|---|---|---|
77007 | f68550a60fc010eda8ef9a1e5cd4d651 | 4314121fb5cfec01588f42ef6bbb3db3 | delivered | 2018-04-13 16:19:20 | 2018-04-13 16:35:28 | 2018-04-18 19:02:40 | 2018-04-23 17:14:33 | 2018-05-08 |
34012 | bebbbe90253dadf5c7d8dbacfaf446f4 | 9709ffa24c12b2fb47c3ec2470b16952 | delivered | 2018-05-14 15:29:32 | 2018-05-14 15:56:40 | 2018-05-15 14:42:00 | 2018-06-04 17:28:51 | 2018-06-08 |
48017 | 417e44a3df3ce085aeafc1976354af8e | 39b0a374b8bbb7f162a55a042a3c2f50 | delivered | 2018-08-04 10:45:34 | 2018-08-04 11:55:17 | 2018-08-10 14:13:00 | 2018-08-21 16:16:42 | 2018-09-03 |
32469 | ec8cbb2f067bf3af1cdf3c82ce77e1c9 | 9dadc4bfa2ec469a923c2ac8ffdc3a0f | delivered | 2018-01-22 22:03:53 | 2018-01-22 22:16:24 | 2018-01-24 19:22:11 | 2018-01-26 21:33:24 | 2018-02-14 |
53647 | f31cf521d461f5b8b5b1456a0448495e | 7d9fd47eeb21776ed93f070d71499e0a | delivered | 2017-05-28 15:50:05 | 2017-05-29 21:45:10 | 2017-05-31 09:07:45 | 2017-06-07 11:51:24 | 2017-06-21 |
# Confirm the category/datetime dtype assignments took effect.
df_orders.dtypes
order_id object
customer_id object
order_status category
order_purchase_dt datetime64[ns]
order_approved_dt datetime64[ns]
order_delivered_carrier_dt datetime64[ns]
order_delivered_customer_dt datetime64[ns]
order_estimated_delivery_dt datetime64[ns]
dtype: object
# Normalize order_status values in place via the custom `.preproc`
# accessor (defined elsewhere in the project — presumably trims and
# case-standardizes strings; confirm against its implementation).
df_orders.order_status.preproc.normalize_string_series(inplace=True)
Table df_payments
# Payments table: payment_type is a low-cardinality code, so store it
# as a categorical right at read time.
payments_path = f'{base_url}olist_order_payments_dataset.csv.gz'
dtype = {'payment_type': 'category'}
df_payments = fo(pd.read_csv(payments_path, dtype=dtype))
df_payments.sample(5, random_state=7)
order_id | payment_sequential | payment_type | payment_installments | payment_value | |
---|---|---|---|---|---|
39797 | deb8ce115f74e504ae38e7db9bdd42e6 | 1 | credit_card | 3 | 65.94 |
76821 | f6089c47bab18faae3d90c64cfb08481 | 1 | credit_card | 8 | 127.50 |
97070 | 9da5902b9f9210f370a81ff8edeafe62 | 1 | boleto | 1 | 35.38 |
24825 | ccd7d6d23c5d29de6367dd5332aab16d | 1 | credit_card | 1 | 22.77 |
79754 | 99ca8b25a7b584ec1b48b0a857a9211b | 1 | voucher | 1 | 110.26 |
# Verify column types of the payments table.
df_payments.dtypes
order_id object
payment_sequential int64
payment_type category
payment_installments int64
payment_value float64
dtype: object
# Normalize payment_type in place (custom `.preproc` accessor, see note
# on the orders table).
df_payments.payment_type.preproc.normalize_string_series(inplace=True)
Table df_items
# Order items: parse the single timestamp column while reading, then
# rename it to the *_dt convention used across the project.
df_items = (
    pd.read_csv(
        f'{base_url}olist_order_items_dataset.csv.gz',
        parse_dates=['shipping_limit_date'],
        date_format='%Y-%m-%d %H:%M:%S',
    )
    .rename(columns={'shipping_limit_date': 'shipping_limit_dt'})
)
df_items = fo(df_items)
df_items.sample(5, random_state=7)
order_id | order_item_id | product_id | seller_id | shipping_limit_dt | price | freight_value | |
---|---|---|---|---|---|---|---|
64487 | 935879cc26cb79cd663525d2425c57dc | 1 | 94ad63831117fac82c6054d2056ed118 | 54a1852d1b8f10312c55e906355666ee | 2017-09-21 18:25:39 | 65.99 | 16.22 |
13892 | 1f9acfb03ed5bcc65242f54f6fec83a6 | 1 | f0e1372ab5cfef5ea974b45e419851cb | 270297ead4c65a6cd2593960d2af6b21 | 2017-09-01 20:35:10 | 329.99 | 17.07 |
17302 | 27a530fa8ac219ee1e453ccc7c575a07 | 1 | 389d119b48cf3043d311335e499d9c6b | 1f50f920176fa81dab994f9023523100 | 2017-12-20 22:36:31 | 59.00 | 17.66 |
95956 | d9b4c8cac94d984efe16ca1492bf3457 | 1 | 760bc344cfd2b318b8bf388fa995d3e4 | d91fb3b7d041e83b64a00a3edfb37e4f | 2017-06-27 02:26:05 | 159.10 | 15.86 |
8138 | 127b06830315a6224e76085943d1f565 | 1 | 746191562977a60292922c316e21720b | d65f31d2413268e671989503f6cf9993 | 2017-02-11 17:57:14 | 159.90 | 23.89 |
# Verify column types of the order-items table.
df_items.dtypes
order_id object
order_item_id int64
product_id object
seller_id object
shipping_limit_dt datetime64[ns]
price float64
freight_value float64
dtype: object
Table df_customers
# Customers: city and state repeat heavily, so both become categoricals.
dtype = dict.fromkeys(['customer_city', 'customer_state'], 'category')
df_customers = fo(
    pd.read_csv(f'{base_url}olist_customers_dataset.csv.gz', dtype=dtype)
)
df_customers.sample(5, random_state=7)
customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
---|---|---|---|---|---|
77007 | c70ebd6d93654220aed313d7c3130d13 | 5df8710e0e69a1b21ec96bca50cc6c64 | 4548 | sao paulo | SP |
34012 | bcad4bc4532c5833aa4ab073b2284d12 | 9b74a120a375eb707c4f17c801d3e9b1 | 41680 | salvador | BA |
48017 | 5094c0bd47f5fe28ddd902de34ebdd4d | b0dbb4f6439fc8a9bf1ddea6553272e0 | 15056 | sao jose do rio preto | SP |
32469 | d401ce7f8ea08b0b3f591933e07e3778 | 6638dd079cc44519923ac6c0e0059463 | 22210 | rio de janeiro | RJ |
53647 | b34983585f48f0c976e7694292618592 | 5f767d103332ebefbce36dd1581acafb | 8771 | mogi das cruzes | SP |
# Verify column types of the customers table.
df_customers.dtypes
customer_id object
customer_unique_id object
customer_zip_code_prefix int64
customer_city category
customer_state category
dtype: object
# Normalize city names (default case format — Title Case per the note at
# the top of this section) and uppercase the two-letter state codes.
df_customers.customer_city.preproc.normalize_string_series(inplace=True)
df_customers.customer_state.preproc.normalize_string_series(case_format='upper', inplace=True)
Table df_reviews
# Reviews (pre-translated to English): the two date columns use
# different formats, so date_format is supplied per column.
review_date_formats = {
    'review_creation_date': '%Y-%m-%d',
    'review_answer_timestamp': '%Y-%m-%d %H:%M:%S',
}
df_reviews = pd.read_csv(
    f'{base_url}olist_order_reviews_dataset_translated.csv.gz',
    parse_dates=list(review_date_formats),
    date_format=review_date_formats,
)
# Rename to the project-wide *_dt suffix.
df_reviews = df_reviews.rename(columns={
    'review_creation_date': 'review_creation_dt',
    'review_answer_timestamp': 'review_answer_dt',
})
df_reviews = fo(df_reviews)
df_reviews.sample(5, random_state=7)
review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_dt | review_answer_dt | |
---|---|---|---|---|---|---|---|
33909 | 7f7bf3668353655e6db2e427e2215340 | 8ef63805460994636446a1e1012ee8b6 | 5 | NaN | NaN | 2018-07-25 | 2018-07-27 13:45:02 |
92307 | 798203ad7e72182cecac941da9ef0df1 | 4dac0c9aec5f460f203ce439f8af0f8c | 5 | NaN | NaN | 2018-05-24 | 2018-05-24 21:42:59 |
89107 | d04ee3fbd4d019d9a116c111de74e524 | f1ffa84db833c490ced2c3bc52b1b833 | 5 | Good buy, I recommend! | NaN | 2018-08-10 | 2018-08-13 12:29:28 |
9538 | 9b9ce7ab5c8c8c1f162e69c34c31339c | 4cc00a9ac85dee3a197d788b827da7e8 | 1 | NaN | I didn't receive the product | 2017-12-30 | 2018-01-01 06:23:37 |
31008 | 7e5553ec57494e7cd98c3c95c1a56bca | 0644bbc829f241622c4c83b08436c465 | 5 | NaN | NaN | 2018-06-29 | 2018-07-02 01:52:38 |
# Verify column types of the reviews table.
df_reviews.dtypes
review_id object
order_id object
review_score int64
review_comment_title object
review_comment_message object
review_creation_dt datetime64[ns]
review_answer_dt datetime64[ns]
dtype: object
# Lowercase the free-text review fields to collapse implicit duplicates
# that differ only in casing.
df_reviews.review_comment_title.preproc.normalize_string_series(case_format='lower', inplace=True)
df_reviews.review_comment_message.preproc.normalize_string_series(case_format='lower', inplace=True)
Table df_products
# Products: the (Portuguese) category name becomes categorical; the
# numeric columns stay as read because they contain missing values.
df_products = fo(
    pd.read_csv(
        f'{base_url}olist_products_dataset.csv.gz',
        dtype={'product_category_name': 'category'},
    )
)
df_products.sample(5, random_state=7)
product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
---|---|---|---|---|---|---|---|---|---|
21978 | 6139fa69b55f0d2940a87602dd20892d | automotivo | 49.00 | 329.00 | 2.00 | 100.00 | 24.00 | 2.00 | 17.00 |
24632 | 150c699262a053229e89213dc9c358d2 | sinalizacao_e_seguranca | 60.00 | 480.00 | 1.00 | 300.00 | 16.00 | 14.00 | 15.00 |
24489 | c773bee67e093ae2c47a883109b577b1 | beleza_saude | 59.00 | 1,566.00 | 1.00 | 1,200.00 | 16.00 | 16.00 | 16.00 |
18870 | d69f56b9e4b6d98bc1521db5ba96846a | esporte_lazer | 47.00 | 1,198.00 | 4.00 | 400.00 | 20.00 | 34.00 | 44.00 |
9265 | 09e9d1c1a370af902b1b7175400021ee | automotivo | 56.00 | 1,040.00 | 1.00 | 150.00 | 16.00 | 11.00 | 11.00 |
# Verify column types of the products table.
df_products.dtypes
product_id object
product_category_name category
product_name_lenght float64
product_description_lenght float64
product_photos_qty float64
product_weight_g float64
product_length_cm float64
product_height_cm float64
product_width_cm float64
dtype: object
We will not normalize the product_category_name column because we will replace it with an English version.
Table df_categories
# Category translation table: both name columns are categoricals.
cat_cols = ['product_category_name', 'product_category_name_english']
df_categories = pd.read_csv(
    f'{base_url}product_category_name_translation.csv.gz',
    dtype=dict.fromkeys(cat_cols, 'category'),
)
df_categories = fo(df_categories)
df_categories.sample(5, random_state=7)
product_category_name | product_category_name_english | |
---|---|---|
18 | eletroportateis | small_appliances |
2 | automotivo | auto |
53 | pcs | computers |
27 | construcao_ferramentas_jardim | costruction_tools_garden |
45 | alimentos | food |
# Verify column types of the category-translation table.
df_categories.dtypes
product_category_name category
product_category_name_english category
dtype: object
We will perform normalization only for the English version of the column because we will be working with it exclusively.
# Normalize only the English category names — per the note above, the
# Portuguese column will not be used downstream.
df_categories.product_category_name_english.preproc.normalize_string_series(inplace=True)
Table df_sellers
# Sellers: city and state are low-cardinality, so read as categoricals.
seller_dtype = {'seller_city': 'category', 'seller_state': 'category'}
df_sellers = fo(
    pd.read_csv(f'{base_url}olist_sellers_dataset.csv.gz', dtype=seller_dtype)
)
df_sellers.sample(5, random_state=7)
seller_id | seller_zip_code_prefix | seller_city | seller_state | |
---|---|---|---|---|
1251 | cfd7ddab722b902f7ac5b5f3ba6d723d | 32372 | contagem | MG |
2098 | 1fa2d3def6adfa70e58c276bb64fe5bb | 4106 | sao paulo | SP |
1805 | 0936e1837d0c79253456bbb2ffaaef10 | 2050 | sao paulo | SP |
2593 | 5a4236931428800e9d9cc04af0987118 | 2916 | sao paulo | SP |
419 | 528ce32f560edd11add4169f6a5b65be | 18130 | sao roque | SP |
# Verify column types of the sellers table.
df_sellers.dtypes
seller_id object
seller_zip_code_prefix int64
seller_city category
seller_state category
dtype: object
# Normalize seller city names and uppercase the two-letter state codes,
# mirroring the treatment of the customers table.
df_sellers.seller_city.preproc.normalize_string_series(inplace=True)
df_sellers.seller_state.preproc.normalize_string_series(case_format='upper', inplace=True)
Table df_geolocations
# Geolocations: city and state become categoricals at read time.
geo_dtype = {'geolocation_city': 'category', 'geolocation_state': 'category'}
df_geolocations = fo(
    pd.read_csv(f'{base_url}olist_geolocation_dataset.csv.gz', dtype=geo_dtype)
)
df_geolocations.sample(5, random_state=7)
geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
---|---|---|---|---|---|
39817 | 2976 | -23.47 | -46.72 | sao paulo | SP |
198962 | 8131 | -23.50 | -46.38 | sao paulo | SP |
582845 | 34600 | -19.86 | -43.86 | sabara | MG |
193875 | 7901 | -23.28 | -46.74 | francisco morato | SP |
238522 | 9751 | -23.69 | -46.54 | são bernardo do campo | SP |
# Verify column types of the geolocation table.
df_geolocations.dtypes
geolocation_zip_code_prefix int64
geolocation_lat float64
geolocation_lng float64
geolocation_city category
geolocation_state category
dtype: object
# Normalize city names (note the sample above shows accented variants
# such as 'são bernardo do campo' alongside unaccented ones) and
# uppercase the state codes.
df_geolocations.geolocation_city.preproc.normalize_string_series(inplace=True)
df_geolocations.geolocation_state.preproc.normalize_string_series(case_format='upper', inplace=True)
We will combine all dataframes into a class for easier further work.
class Dfs:
    """Container bundling all project dataframes for convenient access.

    Attributes are the module-level dataframes loaded above; iterating
    yields ``(name, dataframe)`` pairs in a fixed order.
    """

    # Single source of truth for the attribute names. __init__ and
    # __iter__ both derive from it, so the two can no longer drift
    # apart (the original duplicated this list in both methods).
    _NAMES = (
        'orders', 'items', 'reviews', 'products', 'geolocations',
        'sellers', 'payments', 'customers', 'categories',
    )

    def __init__(self):
        # Bind the dataframes loaded earlier in this file.
        self.orders = df_orders
        self.items = df_items
        self.reviews = df_reviews
        self.products = df_products
        self.geolocations = df_geolocations
        self.sellers = df_sellers
        self.payments = df_payments
        self.customers = df_customers
        self.categories = df_categories

    def __iter__(self):
        # (name, dataframe) pairs in _NAMES order — same order and
        # contents as the original hand-written list.
        return iter([(name, getattr(self, name)) for name in self._NAMES])

dfs = Dfs()