Data Loading

Data Loading#

  • Load the data.

  • Assign data types to columns where possible. Columns with missing values cannot be immediately converted to integer type. Leave such columns as is for now.

  • Display the first few rows of each dataframe.

  • Examine column types.

  • For categorical variables, perform normalization and convert to Title Case format for consistency and better visual presentation.

  • For text variables, normalize and convert to lowercase to eliminate implicit duplicates.

base_url = "https://raw.githubusercontent.com/PavelGrigoryevDS/olist-deep-dive/main/data/"

Table df_orders

dtype = {'order_status': 'category'}
df_orders = pd.read_csv(f'{base_url}olist_orders_dataset.csv.gz', dtype=dtype
                , parse_dates=['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date'], date_format='%Y-%m-%d %H:%M:%S')
df_orders.rename(columns={'order_purchase_timestamp': 'order_purchase_dt'
                          , 'order_approved_at': 'order_approved_dt'
                          , 'order_delivered_carrier_date': 'order_delivered_carrier_dt'
                          , 'order_delivered_customer_date': 'order_delivered_customer_dt'
                          , 'order_estimated_delivery_date': 'order_estimated_delivery_dt'}, inplace=True)
df_orders = fo(df_orders)
df_orders.sample(5, random_state=7)
order_id customer_id order_status order_purchase_dt order_approved_dt order_delivered_carrier_dt order_delivered_customer_dt order_estimated_delivery_dt
77007 f68550a60fc010eda8ef9a1e5cd4d651 4314121fb5cfec01588f42ef6bbb3db3 delivered 2018-04-13 16:19:20 2018-04-13 16:35:28 2018-04-18 19:02:40 2018-04-23 17:14:33 2018-05-08
34012 bebbbe90253dadf5c7d8dbacfaf446f4 9709ffa24c12b2fb47c3ec2470b16952 delivered 2018-05-14 15:29:32 2018-05-14 15:56:40 2018-05-15 14:42:00 2018-06-04 17:28:51 2018-06-08
48017 417e44a3df3ce085aeafc1976354af8e 39b0a374b8bbb7f162a55a042a3c2f50 delivered 2018-08-04 10:45:34 2018-08-04 11:55:17 2018-08-10 14:13:00 2018-08-21 16:16:42 2018-09-03
32469 ec8cbb2f067bf3af1cdf3c82ce77e1c9 9dadc4bfa2ec469a923c2ac8ffdc3a0f delivered 2018-01-22 22:03:53 2018-01-22 22:16:24 2018-01-24 19:22:11 2018-01-26 21:33:24 2018-02-14
53647 f31cf521d461f5b8b5b1456a0448495e 7d9fd47eeb21776ed93f070d71499e0a delivered 2017-05-28 15:50:05 2017-05-29 21:45:10 2017-05-31 09:07:45 2017-06-07 11:51:24 2017-06-21
df_orders.dtypes
order_id                               object
customer_id                            object
order_status                         category
order_purchase_dt              datetime64[ns]
order_approved_dt              datetime64[ns]
order_delivered_carrier_dt     datetime64[ns]
order_delivered_customer_dt    datetime64[ns]
order_estimated_delivery_dt    datetime64[ns]
dtype: object
df_orders.order_status.preproc.normalize_string_series(inplace=True)

Table df_payments

dtype = {'payment_type': 'category'}
df_payments = pd.read_csv(f'{base_url}olist_order_payments_dataset.csv.gz', dtype=dtype)
df_payments = fo(df_payments)
df_payments.sample(5, random_state=7)
order_id payment_sequential payment_type payment_installments payment_value
39797 deb8ce115f74e504ae38e7db9bdd42e6 1 credit_card 3 65.94
76821 f6089c47bab18faae3d90c64cfb08481 1 credit_card 8 127.50
97070 9da5902b9f9210f370a81ff8edeafe62 1 boleto 1 35.38
24825 ccd7d6d23c5d29de6367dd5332aab16d 1 credit_card 1 22.77
79754 99ca8b25a7b584ec1b48b0a857a9211b 1 voucher 1 110.26
df_payments.dtypes
order_id                  object
payment_sequential         int64
payment_type            category
payment_installments       int64
payment_value            float64
dtype: object
df_payments.payment_type.preproc.normalize_string_series(inplace=True)

Table df_items

df_items = pd.read_csv(f'{base_url}olist_order_items_dataset.csv.gz'
                , parse_dates=['shipping_limit_date'], date_format='%Y-%m-%d %H:%M:%S')
df_items.rename(columns={'shipping_limit_date': 'shipping_limit_dt'}, inplace=True)
df_items = fo(df_items)
df_items.sample(5, random_state=7)
order_id order_item_id product_id seller_id shipping_limit_dt price freight_value
64487 935879cc26cb79cd663525d2425c57dc 1 94ad63831117fac82c6054d2056ed118 54a1852d1b8f10312c55e906355666ee 2017-09-21 18:25:39 65.99 16.22
13892 1f9acfb03ed5bcc65242f54f6fec83a6 1 f0e1372ab5cfef5ea974b45e419851cb 270297ead4c65a6cd2593960d2af6b21 2017-09-01 20:35:10 329.99 17.07
17302 27a530fa8ac219ee1e453ccc7c575a07 1 389d119b48cf3043d311335e499d9c6b 1f50f920176fa81dab994f9023523100 2017-12-20 22:36:31 59.00 17.66
95956 d9b4c8cac94d984efe16ca1492bf3457 1 760bc344cfd2b318b8bf388fa995d3e4 d91fb3b7d041e83b64a00a3edfb37e4f 2017-06-27 02:26:05 159.10 15.86
8138 127b06830315a6224e76085943d1f565 1 746191562977a60292922c316e21720b d65f31d2413268e671989503f6cf9993 2017-02-11 17:57:14 159.90 23.89
df_items.dtypes
order_id                     object
order_item_id                 int64
product_id                   object
seller_id                    object
shipping_limit_dt    datetime64[ns]
price                       float64
freight_value               float64
dtype: object

Table df_customers

dtype = {'customer_city': 'category', 'customer_state': 'category'}
df_customers = pd.read_csv(f'{base_url}olist_customers_dataset.csv.gz', dtype=dtype)
df_customers = fo(df_customers)
df_customers.sample(5, random_state=7)
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
77007 c70ebd6d93654220aed313d7c3130d13 5df8710e0e69a1b21ec96bca50cc6c64 4548 sao paulo SP
34012 bcad4bc4532c5833aa4ab073b2284d12 9b74a120a375eb707c4f17c801d3e9b1 41680 salvador BA
48017 5094c0bd47f5fe28ddd902de34ebdd4d b0dbb4f6439fc8a9bf1ddea6553272e0 15056 sao jose do rio preto SP
32469 d401ce7f8ea08b0b3f591933e07e3778 6638dd079cc44519923ac6c0e0059463 22210 rio de janeiro RJ
53647 b34983585f48f0c976e7694292618592 5f767d103332ebefbce36dd1581acafb 8771 mogi das cruzes SP
df_customers.dtypes
customer_id                   object
customer_unique_id            object
customer_zip_code_prefix       int64
customer_city               category
customer_state              category
dtype: object
df_customers.customer_city.preproc.normalize_string_series(inplace=True)
df_customers.customer_state.preproc.normalize_string_series(case_format='upper', inplace=True)

Table df_reviews

df_reviews = pd.read_csv(f'{base_url}olist_order_reviews_dataset_translated.csv.gz'
                , parse_dates=['review_creation_date', 'review_answer_timestamp']
                , date_format={'review_creation_date': '%Y-%m-%d', 'review_answer_timestamp': '%Y-%m-%d %H:%M:%S'})
df_reviews.rename(columns={'review_creation_date': 'review_creation_dt'
                           , 'review_answer_timestamp': 'review_answer_dt'}, inplace=True)
df_reviews = fo(df_reviews)
df_reviews.sample(5, random_state=7)
review_id order_id review_score review_comment_title review_comment_message review_creation_dt review_answer_dt
33909 7f7bf3668353655e6db2e427e2215340 8ef63805460994636446a1e1012ee8b6 5 NaN NaN 2018-07-25 2018-07-27 13:45:02
92307 798203ad7e72182cecac941da9ef0df1 4dac0c9aec5f460f203ce439f8af0f8c 5 NaN NaN 2018-05-24 2018-05-24 21:42:59
89107 d04ee3fbd4d019d9a116c111de74e524 f1ffa84db833c490ced2c3bc52b1b833 5 Good buy, I recommend! NaN 2018-08-10 2018-08-13 12:29:28
9538 9b9ce7ab5c8c8c1f162e69c34c31339c 4cc00a9ac85dee3a197d788b827da7e8 1 NaN I didn't receive the product 2017-12-30 2018-01-01 06:23:37
31008 7e5553ec57494e7cd98c3c95c1a56bca 0644bbc829f241622c4c83b08436c465 5 NaN NaN 2018-06-29 2018-07-02 01:52:38
df_reviews.dtypes
review_id                         object
order_id                          object
review_score                       int64
review_comment_title              object
review_comment_message            object
review_creation_dt        datetime64[ns]
review_answer_dt          datetime64[ns]
dtype: object
df_reviews.review_comment_title.preproc.normalize_string_series(case_format='lower', inplace=True)
df_reviews.review_comment_message.preproc.normalize_string_series(case_format='lower', inplace=True)

Table df_products

dtype = {'product_category_name': 'category'}
df_products = pd.read_csv(f'{base_url}olist_products_dataset.csv.gz', dtype=dtype)
df_products = fo(df_products)
df_products.sample(5, random_state=7)
product_id product_category_name product_name_lenght product_description_lenght product_photos_qty product_weight_g product_length_cm product_height_cm product_width_cm
21978 6139fa69b55f0d2940a87602dd20892d automotivo 49.00 329.00 2.00 100.00 24.00 2.00 17.00
24632 150c699262a053229e89213dc9c358d2 sinalizacao_e_seguranca 60.00 480.00 1.00 300.00 16.00 14.00 15.00
24489 c773bee67e093ae2c47a883109b577b1 beleza_saude 59.00 1,566.00 1.00 1,200.00 16.00 16.00 16.00
18870 d69f56b9e4b6d98bc1521db5ba96846a esporte_lazer 47.00 1,198.00 4.00 400.00 20.00 34.00 44.00
9265 09e9d1c1a370af902b1b7175400021ee automotivo 56.00 1,040.00 1.00 150.00 16.00 11.00 11.00
df_products.dtypes
product_id                      object
product_category_name         category
product_name_lenght            float64
product_description_lenght     float64
product_photos_qty             float64
product_weight_g               float64
product_length_cm              float64
product_height_cm              float64
product_width_cm               float64
dtype: object

We will not normalize the product_category_name column because we will replace it with an English version.

Table df_categories

dtype = {'product_category_name': 'category', 'product_category_name_english': 'category'}
df_categories = pd.read_csv(f'{base_url}product_category_name_translation.csv.gz', dtype=dtype)
df_categories = fo(df_categories)
df_categories.sample(5, random_state=7)
product_category_name product_category_name_english
18 eletroportateis small_appliances
2 automotivo auto
53 pcs computers
27 construcao_ferramentas_jardim costruction_tools_garden
45 alimentos food
df_categories.dtypes
product_category_name            category
product_category_name_english    category
dtype: object

We will perform normalization only for the English version of the column because we will be working with it exclusively.

df_categories.product_category_name_english.preproc.normalize_string_series(inplace=True)

Table df_sellers

dtype = {'seller_city': 'category', 'seller_state': 'category'}
df_sellers = pd.read_csv(f'{base_url}olist_sellers_dataset.csv.gz', dtype=dtype)
df_sellers = fo(df_sellers)
df_sellers.sample(5, random_state=7)
seller_id seller_zip_code_prefix seller_city seller_state
1251 cfd7ddab722b902f7ac5b5f3ba6d723d 32372 contagem MG
2098 1fa2d3def6adfa70e58c276bb64fe5bb 4106 sao paulo SP
1805 0936e1837d0c79253456bbb2ffaaef10 2050 sao paulo SP
2593 5a4236931428800e9d9cc04af0987118 2916 sao paulo SP
419 528ce32f560edd11add4169f6a5b65be 18130 sao roque SP
df_sellers.dtypes
seller_id                   object
seller_zip_code_prefix       int64
seller_city               category
seller_state              category
dtype: object
df_sellers.seller_city.preproc.normalize_string_series(inplace=True)
df_sellers.seller_state.preproc.normalize_string_series(case_format='upper', inplace=True)

Table df_geolocations

dtype = {'geolocation_city': 'category', 'geolocation_state': 'category'}
df_geolocations = pd.read_csv(f'{base_url}olist_geolocation_dataset.csv.gz', dtype=dtype)
df_geolocations = fo(df_geolocations)
df_geolocations.sample(5, random_state=7)
geolocation_zip_code_prefix geolocation_lat geolocation_lng geolocation_city geolocation_state
39817 2976 -23.47 -46.72 sao paulo SP
198962 8131 -23.50 -46.38 sao paulo SP
582845 34600 -19.86 -43.86 sabara MG
193875 7901 -23.28 -46.74 francisco morato SP
238522 9751 -23.69 -46.54 são bernardo do campo SP
df_geolocations.dtypes
geolocation_zip_code_prefix       int64
geolocation_lat                 float64
geolocation_lng                 float64
geolocation_city               category
geolocation_state              category
dtype: object
df_geolocations.geolocation_city.preproc.normalize_string_series(inplace=True)
df_geolocations.geolocation_state.preproc.normalize_string_series(case_format='upper', inplace=True)

We will combine all dataframes into a class for easier further work.

class Dfs:
    def __init__(self):
        self.orders = df_orders
        self.items = df_items
        self.reviews = df_reviews
        self.products = df_products
        self.geolocations = df_geolocations
        self.sellers = df_sellers
        self.payments = df_payments
        self.customers = df_customers
        self.categories = df_categories

    def __iter__(self):
        return iter([
            ('orders', self.orders),
            ('items', self.items),
            ('reviews', self.reviews),
            ('products', self.products),
            ('geolocations', self.geolocations),
            ('sellers', self.sellers),
            ('payments', self.payments),
            ('customers', self.customers),
            ('categories', self.categories),
        ])
dfs = Dfs()