Creating the Analysis Class
We’ll establish labels and category ordering for our visualizations, then integrate these specifications into our dataframes.
# Human-readable labels (axis titles, legend titles) keyed by dataframe column name.
# Key order is significant: it is the order in which dimensions are listed.
base_labels = {
    # df_orders
    'year': 'Year',
    'purchase_year': 'Year',
    'purchase_month': 'Month',
    'purchase_season': 'Season',
    'purchase_weekday': 'Day of Week',
    'purchase_day_type': 'Day Type',
    'purchase_time_of_day': 'Time of Day',
    'purchase_hour': 'Hour',
    'order_status': 'Order Status',
    'is_delayed': 'Delivery Delay Status',
    'is_canceled': 'Order Cancellation Status',
    'is_delivered': 'Delivery Status',
    'delivery_time_days_cat': 'Delivery Time Category',
    'delivery_issue_reason': 'Delivery Issue Reason',
    'is_purchase': 'Purchase Status',
    # from df_payments
    'order_has_installment': 'Installment Status',
    'order_total_payment_cat': 'Order Payment Category',
    'order_payment_types': 'Order Payment Types',
    # from df_items and df_products
    'order_is_free_shipping': 'Free Shipping Status',
    'order_general_product_categories': 'General Product Categories',
    'order_product_categories': 'Product Categories',
    'order_total_weight_cat': 'Order Weight Category',
    'order_total_volume_cat': 'Order Volume Category',
    # from df_reviews
    'order_avg_reviews_score': 'Order Review Score',
    'order_review_sentiment': 'Order Review Sentiment',
    # df_sales
    'sale_is_customer_first_purchase': 'First-Time Purchase',
    'sale_is_customer_first_purchase_month': 'First Purchase Month',
    # df_customers
    'customer_state': 'Customer State',
    'customer_city': 'Customer City',
    'customer_top_purchase_weekdays': 'Top Purchase Weekdays',
    'customer_payment_types': 'Payment Methods',
    'customer_top_product_categories': 'Top Product Categories',
    'customer_top_general_product_categories': 'Top General Product Categories',
    'activity_segment': 'Activity Segment',
    'value_segment': 'Value Segment',
    'purchase_freq_segment': 'Purchase Frequency Segment',
    'repeat_segment': 'Repeat Segment',
    'loyalty_segment': 'Loyalty Segment',
    'risk_segment': 'Risk Segment',
    'weekday_segment': 'Weekday Segment',
    'installment_segment': 'Installment Segment',
    'products_cnt_segment': 'Products Count Segment',
    'weight_segment': 'Weight Segment',
    # df_payments
    'has_installments': 'Installment Status',
    'payment_type': 'Payment Type',
    # df_products
    'general_product_category': 'General Product Category',
    'product_category': 'Product Category',
    # df_reviews
    'review_score': 'Review Score',
    'season_review': 'Season',
    'review_day_type': 'Day Type',
    'review_creation_weekday': 'Day of Week',
    # df_sellers
    'seller_state': 'Seller State',
    'seller_city': 'Seller City',
    # for all tables
    'day_of_month': 'Day of Month',
}
# Display order of category values, keyed by dataframe column name.
# Every value is a list of strings in the order the categories should appear.
base_category_orders = dict(
    is_purchase=['Purchase', 'Not Purchase'],
    purchase_year=['2017', '2018'],
    purchase_season=['Spring', 'Summer', 'Autumn', 'Winter'],
    purchase_month=[
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    purchase_weekday=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    purchase_day_type=['Weekday', 'Weekend'],
    purchase_time_of_day=['Morning', 'Afternoon', 'Evening', 'Night'],
    # Hours are 0-based: '0' .. '23'.
    purchase_hour=[str(hour) for hour in range(24)],
    season_review=['Spring', 'Summer', 'Autumn', 'Winter'],
    review_day_type=['Weekday', 'Weekend'],
    review_creation_weekday=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    # Review scores are '1' .. '5'.
    order_avg_reviews_score=[str(score) for score in range(1, 6)],
    review_score=[str(score) for score in range(1, 6)],
    order_review_sentiment=['Positive', 'Neutral', 'Negative'],
    is_delayed=['Delayed', 'Not Delayed'],
    is_delivered=['Delivered', 'Not Delivered'],
    order_status=[
        'Delivered', 'Shipped', 'Processing', 'Unavailable',
        'Canceled', 'Invoiced', 'Approved', 'Created',
    ],
    delivery_issue_reason=['Service Issue', 'Customer Issue', 'No Issues'],
    delivery_time_days_cat=['Fast', 'Medium', 'Long'],
    order_total_payment_cat=['Cheap', 'Medium', 'Expensive'],
    order_total_weight_cat=['Light', 'Medium', 'Heavy'],
    order_total_volume_cat=['Small', 'Medium', 'Large'],
    order_has_installment=['Has Installments', 'No Installments'],
    has_installments=['Has Installments', 'No Installments'],
    order_is_free_shipping=['Free Shipping', 'Paid Shipping'],
    # Days of month are 1-based: '1' .. '31'. (range(31) produced '0' .. '30',
    # i.e. a non-existent day 0 and no entry for the 31st.)
    day_of_month=[str(day) for day in range(1, 32)],
    # segments
    value_segment=['Low', 'Medium', 'High'],
    activity_segment=['Core', 'Potential Core', 'Short-Lived Repeat', 'One Time', 'Never Converted'],
    purchase_freq_segment=['Weekly', 'Monthly', 'Quarterly', 'Semiannual', 'Annual'],
    repeat_segment=['Fast Repeat', 'Medium Repeat', 'Slow Repeat'],
    loyalty_segment=['Critic', 'Neutral', 'Promoter'],
    risk_segment=['Reliable', 'Risky'],
    weekday_segment=['Weekday', 'Weekend'],
    installment_segment=['Full Pay', 'Installment'],
    products_cnt_segment=['Single Product', 'Multi Product', 'Bulk Buyer'],
    weight_segment=['Light', 'Medium', 'Heavy'],
)
# Register the shared labels and category ordering on every analysis dataframe
# through its `viz` accessor.
for df in (
    df_orders, df_sales, df_items, df_customers,
    df_products, df_sellers, df_reviews, df_payments,
):
    df.viz.update_plotly_settings(labels=base_labels, category_orders=base_category_orders)
We’ll develop a dedicated class for generating standardized visualizations.
class PlotBuilder:
    """
    Build standardized plots from a single class-level configuration.

    The class is used without instantiation: ``configure`` stores the dataframe,
    metric, time column and default plot options as class attributes, and every
    plotting classmethod derives labels, titles and default keyword arguments
    from them before delegating to the dataframe's ``viz`` accessor.
    """

    # --- Settings cleared by reset_configure() / configure() ---
    df = None
    time_column = None
    time_column_label = None
    metric = None
    metric_label = None  # for axis label
    metric_label_for_distribution = None  # used for box/histogram when metric_label describes an aggregate
    agg_func = None
    freq = None
    title_base = None
    norm_by = None
    cur_dim = None  # sequence of dimension names; lets x/y/color/cat1/cat2 be passed as int indices
    dimensions = base_labels
    axis_sort_order = None
    text_auto = None
    plotly_kwargs = {}  # extra keyword arguments merged into every plot call
    update_fig = {}  # layout parameters applied to every figure after creation

    # --- Slide export settings; NOT touched by reset_configure() ---
    block_save_fig_for_slides = True
    slide_path = 'for_slides/svg/'
    slide_img_fmt = 'svg'

    @classmethod
    def reset_configure(cls) -> None:
        """Reset global settings (slide export settings are left unchanged)."""
        for name in (
            'df', 'time_column', 'time_column_label', 'metric', 'metric_label',
            'metric_label_for_distribution', 'agg_func', 'freq', 'title_base',
            'norm_by', 'cur_dim', 'axis_sort_order', 'text_auto',
        ):
            setattr(cls, name, None)
        cls.dimensions = base_labels
        # Fresh dicts so a previous configuration is never shared or mutated.
        cls.plotly_kwargs = {}
        cls.update_fig = {}

    @classmethod
    def metric_info(cls, freq=None, agg_func=None, **kwargs):
        """
        Display distribution information and statistics for the metric column.

        Parameters:
        -----------
        freq : str, optional
            The time frequency for aggregation (e.g., 'ME' for month, 'W' for week, 'D' for day).
            If not provided, the analysis is performed on raw, non-aggregated data.
        agg_func : str or function, optional
            The aggregation function to apply when freq is specified (e.g., 'mean', 'sum', 'count').
            If not provided, uses the class's default agg_func.
        **kwargs : dict
            Additional arguments passed to ``explore.info``. ``labels`` and ``title``
            are filled in automatically unless supplied here.

        Returns:
        --------
        Whatever ``explore.info`` returns for the (optionally aggregated) metric series.

        Notes:
        ------
        - When freq is specified, each data point is the aggregated value of one period.
        - Frequencies other than 'ME', 'W' and 'D' are accepted; the raw frequency
          string is then used in the title and in the aggregated column name.
        """
        title_label = cls.get_metric_label_for_title_for_metric_info()
        axis_label = cls.metric_label_for_distribution or cls.metric_label

        if not freq:
            # Non-aggregated case
            kwargs.setdefault('labels', {cls.metric: axis_label})
            kwargs.setdefault('title', f'Distribution of {title_label}')
            return cls.df[cls.metric].explore.info(**kwargs)

        period_map = {'ME': 'Month', 'W': 'Week', 'D': 'Day'}
        # Fall back to the raw frequency so unmapped frequencies do not raise KeyError.
        period = period_map.get(freq, freq)
        if not agg_func:
            agg_func = cls.agg_func
        # Use a callable's name rather than its repr in the column name.
        agg_name = getattr(agg_func, '__name__', agg_func)
        agg_metric = f'{agg_name}_{cls.metric}_per_{str(period).lower()}'

        # Aggregate data by the specified frequency
        df_metric_per_period = (
            cls.df.groupby(pd.Grouper(key=cls.time_column, freq=freq), observed=False)[cls.metric]
            .agg(agg_func)
            .to_frame(agg_metric)
        )
        kwargs.setdefault('labels', {agg_metric: axis_label})
        kwargs.setdefault('title', f'Distribution of {title_label} per {period}')
        return df_metric_per_period[agg_metric].explore.info(**kwargs)

    @classmethod
    def get_metric_label_for_title_for_metric_info(cls):
        """
        Helper method to get the appropriate metric label for titles.

        Prefers ``metric_label_for_distribution`` over ``metric_label`` and keeps
        only the text before the first comma. If neither label is configured the
        metric column name is used instead.

        Raises:
        -------
        ValueError
            If no label and no metric are configured.
        """
        label_source = cls.metric_label_for_distribution or cls.metric_label
        if label_source is None:
            if cls.metric is None:
                raise ValueError('metric_label or metric must be define')
            label_source = str(cls.metric)
        # Use only the first part if the label is comma-separated
        return label_source.split(',')[0]

    @classmethod
    def metric_top(cls, id_column='order_id', n=10, freq=None, agg_func=None) -> pd.DataFrame:
        """
        Return the top n entries by metric value, with optional temporal aggregation.

        Parameters:
        -----------
        id_column : str, optional (default='order_id')
            Column used as the index of the result in the non-aggregated case.
        n : int, optional (default=10)
            Number of top entries to return.
        freq : str, optional
            The time frequency for aggregation (e.g., 'ME' for month, 'W' for week, 'D' for day).
            If not provided, raw rows are ranked.
        agg_func : str or function, optional
            The aggregation function to apply when freq is specified.
            If not provided, uses the class's default agg_func.

        Returns:
        --------
        pd.DataFrame
            One-column frame sorted by metric in descending order. Indexed by
            ``id_column`` without aggregation, by time period with aggregation.
        """
        if not freq:
            # Non-aggregated case
            return (
                cls.df.set_index(id_column)[cls.metric]
                .sort_values(ascending=False)
                .head(n)
                .to_frame()
            )
        if not agg_func:
            agg_func = cls.agg_func
        per_period = (
            cls.df.groupby(pd.Grouper(key=cls.time_column, freq=freq), observed=False)[cls.metric]
            .agg(agg_func)
        )
        return per_period.sort_values(ascending=False).head(n).to_frame()

    @classmethod
    def get_dim(cls, print_by_chunk=True) -> Union[None, list]:
        """
        Return or print the names of the configured dimensions.

        Parameters:
        -----------
        print_by_chunk : bool, optional (default=True)
            If True, print the names as a list literal with a line break after
            every fifth name (convenient to copy into a variable) and return None.
            If False, return the names as a list.
        """
        names = list(cls.dimensions.keys())
        if not print_by_chunk:
            return names
        last_index = len(names) - 1
        parts = ['[']
        for index, name in enumerate(names):
            parts.append(f"'{name}'")
            if index < last_index:
                parts.append(', ')
            if (index + 1) % 5 == 0:
                parts.append('\n')
        parts.append(']')
        print(''.join(parts))
        return None

    @classmethod
    def check_cur_dim(cls, kwargs) -> None:
        """
        Resolve integer dimension references in ``kwargs`` in place.

        For each of ``x``, ``y``, ``color``, ``cat1`` and ``cat2`` given as an int,
        the value is replaced with ``cls.cur_dim[index]``; a report heading
        (``**By <label>**``) and the resolved column name are printed.
        """
        for arg_name in ('x', 'y', 'color', 'cat1', 'cat2'):
            if not isinstance(kwargs.get(arg_name), int):
                continue
            kwargs[arg_name] = cls.cur_dim[kwargs[arg_name]]
            print('**By ' + cls.dimensions[kwargs[arg_name]] + '**')
            print(f'{arg_name}: ', kwargs[arg_name])

    @classmethod
    def configure(cls, **kwargs) -> None:
        """
        Set global settings.

        All resettable settings are reset first, then every keyword is stored as
        a class attribute.

        Raises:
        -------
        AttributeError
            If a keyword is not an existing attribute. Keys are validated before
            anything is reset, so a typo leaves the current configuration intact.
        """
        for key in kwargs:
            if not hasattr(cls, key):
                raise AttributeError(f"Invalid config parameter: {key}")
        cls.reset_configure()
        for key, value in kwargs.items():
            setattr(cls, key, value)

    @classmethod
    def to_slide(cls, fig: go.Figure, title_postfix: Union[str, bool, None] = None) -> None:
        """
        Save the figure as an image for slides.

        Does nothing while ``block_save_fig_for_slides`` is True. The file name is
        the figure title plus ``title_postfix`` (only when it is a string), written
        to ``slide_path`` with extension ``slide_img_fmt``.

        Raises:
        -------
        ValueError
            If the figure has no title and no string postfix is given.
        """
        if cls.block_save_fig_for_slides:
            return
        # An untitled figure has title text None; treat it as an empty name part.
        file_stem = fig.layout.title.text or ''
        if isinstance(title_postfix, str):
            file_stem += title_postfix
        if not file_stem:
            raise ValueError('Figure has no title and no title_postfix; can not build file name')
        fig.write_image(f"{cls.slide_path}{file_stem}.{cls.slide_img_fmt}")

    @classmethod
    def _render(cls, kwargs: dict, graph_type: str, viz_method: str, default_frame_hook=None) -> go.Figure:
        """
        Shared pipeline of the plotting methods.

        Steps: pop ``to_slide``; apply common settings; build labels; build the
        title unless one is given; plot with ``<frame>.viz.<viz_method>``; apply
        ``update_fig`` layout settings; optionally export for slides.

        Parameters:
        -----------
        kwargs : dict
            Keyword arguments received by the public plotting method.
        graph_type : str
            Settings profile used for defaults, labels and title.
        viz_method : str
            Name of the method to call on the dataframe's ``viz`` accessor.
        default_frame_hook : callable, optional
            ``hook(frame, kwargs) -> frame`` applied only when the class-level
            dataframe is plotted (no ``data_frame`` keyword was passed).
        """
        to_slide = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, graph_type)
        kwargs['labels'] = cls._prepare_labels(kwargs, graph_type)
        # NOTE(review): axis sort order is applied inside _prepare_title, so it is
        # skipped when the caller supplies a title — confirm this is intended.
        if not kwargs.get('title'):
            kwargs['title'] = cls._prepare_title(kwargs, graph_type)
        # Pop unconditionally so an explicit data_frame=None is not forwarded.
        frame = kwargs.pop('data_frame', None)
        if frame is None:
            frame = cls.df
            if default_frame_hook is not None:
                frame = default_frame_hook(frame, kwargs)
        fig = getattr(frame.viz, viz_method)(**kwargs)
        fig = cls._update_fig(fig, kwargs, graph_type)
        if to_slide:
            cls.to_slide(fig, to_slide)
        return fig

    @classmethod
    def line(cls, **kwargs) -> go.Figure:
        """Create line plot"""
        return cls._render(kwargs, 'line', 'line')

    @classmethod
    def bar_groupby(cls, **kwargs) -> go.Figure:
        """Create bar plot with groupby"""
        return cls._render(kwargs, 'bar_groupby', 'bar')

    @classmethod
    def line_resample(cls, **kwargs) -> go.Figure:
        """Create line plot with resample"""
        return cls._render(kwargs, 'line_resample', 'line')

    @classmethod
    def area_resample(cls, **kwargs) -> go.Figure:
        """Create area plot with resample (uses the same settings as line_resample)"""
        return cls._render(kwargs, 'line_resample', 'area')

    @classmethod
    def heatmap(cls, **kwargs) -> go.Figure:
        """Create heatmap plot"""
        return cls._render(kwargs, 'heatmap', 'heatmap')

    @classmethod
    def pie_bar(cls, **kwargs) -> go.Figure:
        """Create pie_bar plot"""
        return cls._render(kwargs, 'pie_bar', 'pie_bar')

    @classmethod
    def box(cls, **kwargs) -> go.Figure:
        """Create boxplot"""
        return cls._render(kwargs, 'box', 'box')

    @staticmethod
    def _review_score_as_category(frame, kwargs):
        """Return a copy with order_avg_reviews_score as string category when it is the color column."""
        if kwargs.get('color') != 'order_avg_reviews_score':
            return frame
        # Since order_avg_reviews_score is numerical, it is better to convert to string for plotly
        converted = frame.copy()
        converted['order_avg_reviews_score'] = (
            frame['order_avg_reviews_score'].astype(str).astype('category')
        )
        return converted

    @classmethod
    def histogram(cls, **kwargs) -> go.Figure:
        """Create histogram plot"""
        return cls._render(
            kwargs, 'histogram', 'histogram',
            default_frame_hook=cls._review_score_as_category,
        )

    @classmethod
    def cat_compare(cls, **kwargs) -> go.Figure:
        """
        Create plots for compare categorical columns.

        No automatic labels or title are generated and ``update_fig`` layout
        settings are not applied for this plot type.
        """
        to_slide = kwargs.pop('to_slide', False)
        kwargs = cls._prepare_common_settings(kwargs, 'cat_compare')
        # Common settings already merged plotly_kwargs, so this default applies only
        # when neither the caller nor plotly_kwargs provided category_orders.
        kwargs.setdefault(
            'category_orders',
            {kwargs.get('cat1'): 'descending', kwargs.get('cat2'): 'descending'},
        )
        frame = kwargs.pop('data_frame', None)
        if frame is None:
            frame = cls.df
        fig = frame.viz.cat_compare(**kwargs)
        if to_slide:
            cls.to_slide(fig, to_slide)
        return fig

    @classmethod
    def period_change(cls, **kwargs) -> go.Figure:
        """Plot period-over-period changes"""
        return cls._render(kwargs, 'period_change', 'period_change')

    @classmethod
    def _heatmap_x_label(cls, x):
        """Return the label of a heatmap x value (a time grouper or a dimension name)."""
        if isinstance(x, pd.core.resample.TimeGrouper):
            return cls.time_column_label
        if x not in cls.dimensions:
            raise ValueError(f"{x} not in cls.dimensions")
        return cls.dimensions[x]

    @classmethod
    def _prepare_labels(cls, kwargs: dict, graph_type: str) -> dict:
        """
        Labels preparation for graphs.

        Builds the default labels for the metric, the time column and (for
        heatmaps) the x/y/color axes, then overlays ``kwargs['labels']`` if given.

        Raises:
        -------
        ValueError
            For heatmaps, if x (when not a time grouper) or y is not a known dimension.
        """
        labels = {}
        # Label for base metric
        if graph_type in ('line', 'bar_groupby', 'line_resample', 'pie_bar'):
            if cls.metric and cls.metric_label:
                labels[cls.metric] = cls.metric_label
        elif graph_type in ('box', 'histogram'):
            if cls.metric and cls.metric_label_for_distribution:
                labels[cls.metric] = cls.metric_label_for_distribution
        # Label for time_column
        if graph_type in ('line', 'line_resample', 'box', 'period_change'):
            if cls.time_column and cls.time_column_label:
                labels[cls.time_column] = cls.time_column_label
        # Labels for specific graph_type
        if graph_type == 'heatmap':
            labels['x'] = cls._heatmap_x_label(kwargs['x'])
            if kwargs['y'] not in cls.dimensions:
                raise ValueError(f"{kwargs['y']} not in cls.dimensions")
            labels['y'] = cls.dimensions[kwargs['y']]
            labels['color'] = cls.metric_label
        # Caller-supplied labels win over the defaults
        if 'labels' in kwargs:
            labels.update(kwargs['labels'])
        return labels

    @classmethod
    def _prepare_title(cls, kwargs: dict, graph_type: str) -> str:
        """
        Title preparation for graphs.

        Side effect: for bar_groupby, pie_bar and non-time-series box plots the
        configured ``axis_sort_order`` is written into ``kwargs['category_orders']``
        for the categorical axis (unless that axis is 'purchase_season' or already
        has an order).

        Raises:
        -------
        ValueError
            If the categorical axis cannot be determined, a dimension is unknown,
            no label is configured, or a required frequency is missing.
        """
        is_box_time_series = graph_type == 'box' and kwargs.get('mode') == 'time_series'
        has_categorical_axis = (
            graph_type in ('bar_groupby', 'pie_bar')
            or (graph_type == 'box' and not is_box_time_series)
        )
        # Determine axis_dimension: the axis that is not the metric
        if has_categorical_axis:
            if kwargs.get('y') == cls.metric:
                axis_dimension = kwargs['x']
            elif kwargs.get('x') == cls.metric:
                axis_dimension = kwargs['y']
            else:
                raise ValueError('Can not define axis_dimension')
            can_sort = (
                cls.axis_sort_order
                and axis_dimension != 'purchase_season'
                and axis_dimension not in kwargs.get('category_orders', {})
                # Box plots are sorted by an aggregate, so they also need agg_func
                and (graph_type != 'box' or cls.agg_func)
            )
            if can_sort:
                # For bottom direction change sorting direction
                if kwargs.get('trim_top_n_direction') == 'bottom':
                    axis_sort_order = 'ascending'
                else:
                    axis_sort_order = cls.axis_sort_order
                if graph_type == 'box':
                    axis_sort_order = f'{cls.agg_func} {axis_sort_order}'
                # Build a new dict so a caller-supplied mapping is not mutated
                kwargs['category_orders'] = {
                    **kwargs.get('category_orders', {}),
                    axis_dimension: axis_sort_order,
                }
            if axis_dimension not in cls.dimensions:
                raise ValueError(f'{axis_dimension} not in cls.dimensions')

        # Base part of title; only the text before the first comma of a label is used
        if graph_type in ('box', 'histogram'):
            label_source = cls.metric_label_for_distribution or cls.metric_label
            if label_source is None:
                raise ValueError(
                    'For auto create title, metric_label_for_distribution or metric_label must be define'
                )
            short_label = label_source.split(',')[0]
            if graph_type == 'box':
                title = f'Boxplots of {short_label}'
            else:
                title = f'Distribution of {short_label}'
        elif graph_type != 'period_change':
            if cls.title_base:
                title = cls.title_base
            elif cls.metric_label:
                title = cls.metric_label.split(',')[0]
            else:
                raise ValueError('For auto create title, title_base or metric_label must be define')

        # Dimension and color part of title
        color_comes_first = graph_type in ('line', 'line_resample', 'histogram') or is_box_time_series
        if graph_type in ('line', 'bar_groupby', 'line_resample', 'pie_bar', 'box', 'histogram'):
            if not color_comes_first:
                title += f' by {cls.dimensions[axis_dimension]}'
            if 'color' in kwargs:
                if kwargs['color'] not in cls.dimensions:
                    raise ValueError(f"{kwargs['color']} not in cls.dimensions")
                joiner = ' by ' if color_comes_first else ' and '
                title += f"{joiner}{cls.dimensions[kwargs['color']]}"

        # Datetime part of title
        if graph_type in ('line', 'line_resample') or is_box_time_series:
            resample_freq_for_title = kwargs['freq'] if 'freq' in kwargs else cls.freq
            if not resample_freq_for_title:
                raise ValueError('freq must be define')
            freq_map = {'h': 'Hour', 'D': 'Day', 'W': 'Week', 'ME': 'Month', 'M': 'Month'}
            # Unmapped frequencies are shown as-is instead of raising KeyError
            freq_label = freq_map.get(resample_freq_for_title, resample_freq_for_title)
            if 'color' in kwargs:
                title += f' and {freq_label}'
            else:
                title += f' by {freq_label}'

        # Part for specific graph_type
        if graph_type == 'heatmap':
            title += f" by {cls._heatmap_x_label(kwargs['x'])}"
            title += f" and {cls.dimensions[kwargs['y']]}"
        elif graph_type in ('histogram', 'box'):
            if 'lower_quantile' in kwargs or 'upper_quantile' in kwargs:
                quantile_for_title = ' ('
                if kwargs.get('mode') == 'dual_box_trim':
                    quantile_for_title += 'Right: '
                if 'lower_quantile' in kwargs:
                    quantile_for_title += f"from {kwargs['lower_quantile']} "
                if 'upper_quantile' in kwargs:
                    quantile_for_title += f"to {kwargs['upper_quantile']} "
                quantile_for_title += 'Quantile)'
                title += quantile_for_title

        if graph_type == 'period_change':
            title_map = {
                'mom': 'Monthly Change in {metric}',
                'wow': 'Weekly Change in {metric}',
                'dod': 'Daily Change in {metric}',
                'yoy': 'Yearly Change in {metric}',
            }
            period = kwargs.get('period', 'mom')
            if not cls.metric_label:
                raise ValueError('metric_label must be define')
            metric_label = cls.metric_label.split(',')[0]
            title = title_map[period].format(metric=metric_label)
        return title

    @classmethod
    def _fill_metric_axis(cls, kwargs: dict) -> None:
        """Put the metric on whichever of x / y the caller left unset."""
        if 'x' not in kwargs and 'y' not in kwargs:
            raise ValueError('x or y must be define')
        if 'y' not in kwargs:
            kwargs['y'] = cls.metric
        elif 'x' not in kwargs:
            kwargs['x'] = cls.metric

    @classmethod
    def _prepare_common_settings(cls, kwargs: dict, graph_type: str) -> dict:
        """
        Preparation of general settings for graphs.

        Resolves integer dimension references, fills defaults that depend on
        ``graph_type`` and finally overlays ``cls.plotly_kwargs`` (which therefore
        win over both defaults and caller arguments).

        Raises:
        -------
        ValueError
            If a plot type that needs a categorical axis gets neither x nor y.
        """
        cls.check_cur_dim(kwargs)
        if graph_type not in ('heatmap', 'cat_compare'):
            # New dict so a caller-supplied hover_data mapping is not mutated
            kwargs['hover_data'] = {**kwargs.get('hover_data', {}), cls.metric: ':.3f'}
        if graph_type == 'cat_compare':
            if 'cat1' in kwargs and 'cat2' not in kwargs:
                kwargs['cat2'] = cls.metric
            elif 'cat2' in kwargs and 'cat1' not in kwargs:
                kwargs['cat1'] = cls.metric

        # top_n settings for dimensions with many distinct values
        if graph_type in ('line', 'line_resample'):
            many_valued_colors = (
                'customer_state', 'customer_city', 'order_product_categories',
                'order_general_product_categories', 'seller_state', 'seller_city',
            )
            if kwargs.get('color') in many_valued_colors:
                kwargs.setdefault('trim_top_n_color', 5)
        if graph_type in ('bar_groupby', 'box'):
            many_valued_axes = (
                'customer_state', 'customer_city', 'order_product_categories',
                'order_general_product_categories', 'product_category',
                'general_product_category', 'seller_state', 'seller_city',
            )
            if kwargs.get('y') in many_valued_axes:
                kwargs.setdefault('trim_top_n_y', 15)
                # NOTE(review): source indentation was lost; the height default is
                # assumed to belong to this long-category case — confirm.
                kwargs.setdefault('height', 500)

        # aggregation
        if graph_type in ('bar_groupby', 'line_resample', 'pie_bar', 'heatmap'):
            kwargs.setdefault('agg_func', cls.agg_func)
            if graph_type != 'heatmap':
                kwargs.setdefault('agg_column', cls.metric)
        # normalization
        if graph_type in ('bar_groupby', 'pie_bar'):
            if cls.text_auto:
                kwargs.setdefault('text_auto', cls.text_auto)
            kwargs.setdefault('norm_by', cls.norm_by)
        # time freq
        if graph_type == 'line_resample':
            kwargs.setdefault('freq', cls.freq)

        # settings for specific graph_type
        if graph_type in ('line', 'line_resample'):
            kwargs.setdefault('x', cls.time_column)
            kwargs.setdefault('y', cls.metric)
        elif graph_type == 'bar_groupby':
            cls._fill_metric_axis(kwargs)
        elif graph_type == 'heatmap':
            kwargs.setdefault('do_pivot', True)
            kwargs.setdefault('z', cls.metric)
            # Default only: an explicit width from the caller is respected
            kwargs.setdefault('width', 1100)
        elif graph_type == 'pie_bar':
            kwargs.setdefault('hole', 0.5)
            cls._fill_metric_axis(kwargs)
        elif graph_type == 'box':
            if kwargs.get('mode') == 'time_series':
                kwargs.setdefault('x', cls.time_column)
                kwargs.setdefault('y', cls.metric)
            else:
                # Defaults only: explicit caller values are respected
                kwargs.setdefault('show_dual', True)
                kwargs.setdefault('upper_quantile', 0.95)
                cls._fill_metric_axis(kwargs)
        elif graph_type == 'histogram':
            if ('x' in kwargs and isinstance(kwargs['x'], str)) or 'color' in kwargs:
                kwargs.setdefault('show_hist', False)
                kwargs.setdefault('show_kde', True)
                kwargs.setdefault('mode', 'dual_box_trim')
                kwargs.setdefault('show_legend_title', True)
            else:
                kwargs.setdefault('mode', 'dual_hist_trim')
            kwargs.setdefault('x', cls.metric)
            kwargs.setdefault('upper_quantile', 0.95)
        elif graph_type == 'period_change':
            kwargs.setdefault('metric_col', cls.metric)
            kwargs.setdefault('date_col', cls.time_column)
            kwargs.setdefault('agg_func', cls.agg_func)

        if cls.plotly_kwargs:
            # Copy nested dicts: later steps add entries to kwargs['category_orders'],
            # which must not leak into the class-level plotly_kwargs across calls.
            kwargs.update({
                key: value.copy() if isinstance(value, dict) else value
                for key, value in cls.plotly_kwargs.items()
            })
        return kwargs

    @classmethod
    def _update_fig(cls, fig: go.Figure, kwargs: dict, graph_type: str) -> go.Figure:
        """
        Apply the configured ``update_fig`` layout parameters to the figure.

        Only parameters that exist as attributes of ``fig.layout`` are applied.
        ``kwargs`` and ``graph_type`` are currently unused.
        """
        layout_updates = {
            param: value
            for param, value in cls.update_fig.items()
            if hasattr(fig.layout, param)
        }
        fig.update_layout(**layout_updates)
        return fig
# Short alias for PlotBuilder (the class itself, not an instance).
pb = PlotBuilder