Source code for olliepy.InteractiveDashboard

import copy
import json
import time
from json import JSONDecodeError
from typing import List

import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_numeric_dtype
from typeguard import typechecked

from .Report import Report


def _is_parsable_as_datetime(values: pd.Series) -> bool:
    """
    Tell whether ``pd.to_datetime`` can convert every entry of ``values``.

    :param values: the column to probe
    :return: True when the conversion succeeds and yields a datetime64 dtype, False otherwise
    """
    try:
        converted = pd.to_datetime(values)
    except (ValueError, TypeError, OverflowError):
        # Parsing problems are reported by the caller with a message naming the column.
        return False
    # A conversion that "succeeds" but stays object-typed is not a usable date column either.
    return is_datetime(converted)


def validate_attributes(dataframes: List[pd.DataFrame],
                        dataframes_names: List[str],
                        numerical_columns: List[str],
                        categorical_columns: List[str],
                        date_columns: List[str]):
    """
    Validate the arguments used to build a dashboard.

    :param dataframes: the dataframes to be displayed, at least one is required
    :param dataframes_names: one name per dataframe, in the same order
    :param numerical_columns: numerical columns that must exist in every dataframe
    :param categorical_columns: categorical columns that must exist in every dataframe
    :param date_columns: date columns that must exist in every dataframe and be convertible
        to datetime. None means there are no date columns
    :return: None
    :raises AttributeError: when no dataframe is given, when the number of names differs from
        the number of dataframes, when no numerical or categorical column is given, or when a
        listed column is missing from a dataframe
    :raises TypeError: when a date column contains values that can not be converted to datetime
    """
    # Treat a missing column list the same way as an empty one.
    numerical_columns = numerical_columns if numerical_columns is not None else []
    categorical_columns = categorical_columns if categorical_columns is not None else []

    if len(dataframes) == 0:
        raise AttributeError('You need to pass at least one pandas dataframe to create a dashboard.')

    if len(dataframes) != len(dataframes_names):
        raise AttributeError('You need to have a dataframe name for each dataframe you have in dataframes')

    if len(categorical_columns) + len(numerical_columns) == 0:
        raise AttributeError(
            'You need to pass categorical_columns and/or numerical_columns in order to create a dashboard.')

    for df, df_name in zip(dataframes, dataframes_names):
        # Build the lookup once per dataframe instead of scanning a list for every column.
        df_columns = set(df.columns.tolist())

        for col in numerical_columns:
            if col not in df_columns:
                raise AttributeError(f'Numerical column: {col} is not found in {df_name} dataframe.')

        for col in categorical_columns:
            if col not in df_columns:
                raise AttributeError(f'Categorical column: {col} is not found in {df_name} dataframe.')

        if date_columns is not None:
            for col in date_columns:
                if col not in df_columns:
                    raise AttributeError(f'Date column: {col} is not found in {df_name} dataframe.')
                if not _is_parsable_as_datetime(df[col]):
                    raise TypeError(
                        f'''Date column: {col} has one or more rows which are not a valid date format in {df_name} dataframe.
                            You can replace invalid values with None''')


def validate_bin_numerical_feature_attributes(dataframes: List[pd.DataFrame],
                                              dataframes_names: List[str],
                                              numerical_feature_name: str,
                                              new_feature_name: str):
    """
    Validate the arguments used to bin a numerical feature.

    :param dataframes: the dataframes that will receive the binned feature
    :param dataframes_names: one name per dataframe, used in the error messages
    :param numerical_feature_name: the column to bin; must exist in every dataframe with a numeric dtype
    :param new_feature_name: the name of the binned column; must contain at least one character
    :return: None
    :raises AttributeError: when the column is missing from a dataframe or new_feature_name is empty
    :raises TypeError: when the column does not have a numeric dtype
    """
    for frame, frame_name in zip(dataframes, dataframes_names):
        available_columns = frame.columns.tolist()
        if numerical_feature_name not in available_columns:
            raise AttributeError(
                f'Numerical column: {numerical_feature_name} is not found in {frame_name} dataframe.')

        if is_numeric_dtype(frame[numerical_feature_name]):
            continue
        raise TypeError(
            'the provided numerical_feature_name is not valid.\n'
            '            Please make sure that you are passing a numerical feature name')

    # The name is checked last, after every dataframe has been inspected.
    if len(new_feature_name) > 0:
        return
    raise AttributeError(
        'the provided new_feature_name is not valid.\n'
        '            Please make sure that you are passing new_feature_name as a string with at least one character')


def load_interactive_dashboard(dashboard_path: str) -> Report:
    """
    Load an existing dashboard from the folder it was saved in.

    The folder must contain a ``report_data.json`` file. The dashboard is rebuilt from that
    file: its title, dataframes, column lists, number displays and charts are restored, the
    parent of ``dashboard_path`` becomes the output directory and the last path component
    becomes the dashboard folder name.

    :param dashboard_path: file system path of the dashboard folder
    :return: Interactive dashboard
    :raises NotADirectoryError: when dashboard_path does not exist
    :raises FileNotFoundError: when report_data.json is not found in dashboard_path
    :raises ValueError: when report_data.json can not be parsed as JSON
    """
    import os
    from io import StringIO

    if not os.path.exists(path=dashboard_path):
        raise NotADirectoryError('provided dashboard_path is not valid. dashboard_path does not exist')

    # Drop every trailing separator so that os.path.split returns (parent, folder name).
    # Fall back to the raw value if the path consists of separators only.
    dashboard_path = dashboard_path.rstrip('/\\') or dashboard_path

    report_file_path = os.path.join(dashboard_path, 'report_data.json')
    if not os.path.exists(path=report_file_path):
        raise FileNotFoundError(f'report_data.json was not found in {dashboard_path}')

    output_directory, dashboard_folder_name = os.path.split(dashboard_path)

    # Only the parsing step can raise JSONDecodeError, so only that step is guarded.
    with open(report_file_path) as report_data_file:
        try:
            report_data = json.load(report_data_file)
        except JSONDecodeError as err:
            raise ValueError(
                'The provided dashboard JSON file has been encrypted and can not be parsed.') from err

    title = report_data['title']
    dataframes_names = report_data['datasets']

    # The generated id column is re-created when the dashboard is built, so it is not
    # passed back as a user supplied numerical column.
    numerical_columns = [col for col in report_data['numericalColumns'] if col != 'generated_id']
    categorical_columns = report_data['categoricalColumns']
    date_columns = report_data['dateColumns']

    # read_json expects a path or a file-like object; literal JSON strings are deprecated.
    dataframes = [pd.read_json(StringIO(json.dumps(report_data[dataframe_name])))
                  for dataframe_name in dataframes_names]

    dashboard = InteractiveDashboard(title=title,
                                     output_directory=output_directory,
                                     dataframes=dataframes,
                                     dataframes_names=dataframes_names,
                                     numerical_columns=numerical_columns,
                                     categorical_columns=categorical_columns,
                                     date_columns=date_columns,
                                     dashboard_folder_name=dashboard_folder_name)

    dashboard.number_displays = report_data['numberDisplays']
    dashboard.charts = report_data['charts']
    dashboard.report_data = report_data

    return dashboard


@typechecked
class InteractiveDashboard(Report):
    """
    InteractiveDashboard creates an interactive dashboard that can be used for EDA or error analysis.

    Attributes
    ----------
    title : str
        the title of the report
    output_directory : str
        the directory where the dashboard folder will be created
    dataframes : List[pd.DataFrame]
        a list of dataframes to be used in the dashboard
    dataframes_names : List[str]
        a list of the dataframes names
    numerical_columns : List[str] default=[]
        a list of the numerical columns to be included in the dashboard
    categorical_columns : List[str] default=[]
        a list of the categorical columns to be included in the dashboard
    date_columns : List[str] default=None
        a list of the date columns to be included in the dashboard
    dashboard_folder_name : str default=None
        the name of the folder that will contain all the generated report files.
        If not set, the title of the report will be used.
    encryption_secret : str default=None
        the 16 characters secret that will be used to encrypt the generated report data.
        If it is not set, the generated data won't be encrypted.
    generate_encryption_secret : bool default=False
        the encryption_secret will be generated and its value returned as output.
        you can also view encryption_secret to get the generated secret.

    Methods
    -------
    create_dashboard()
        creates the dashboard
    get_charts() / get_number_displays()
        return copies of the dashboard's charts / number displays
    update_charts() / update_number_displays()
        extend or replace the dashboard's charts / number displays
    bin_numerical_feature()
        adds a categorical feature obtained by binning a numerical feature
    serve_dashboard_from_local_server()
        serves the dashboard using a flask server
    save_dashboard()
        saves the dashboard to be used without a flask server.
    """

    def __init__(self,
                 title: str,
                 output_directory: str,
                 dataframes: List[pd.DataFrame],
                 dataframes_names: List[str],
                 numerical_columns: List[str] = [],
                 categorical_columns: List[str] = [],
                 date_columns: List[str] = None,
                 dashboard_folder_name: str = None,
                 encryption_secret: str = None,
                 generate_encryption_secret: bool = False):
        super().__init__(title, output_directory, '', dashboard_folder_name, encryption_secret,
                         generate_encryption_secret)

        validate_attributes(dataframes, dataframes_names, numerical_columns, categorical_columns, date_columns)

        # Everything is copied, so neither the caller's objects nor the shared list
        # defaults in the signature are ever mutated by the dashboard.
        self.dataframes = [df.copy() for df in dataframes]
        self.dataframes_names = dataframes_names[:]
        self.numerical_columns = numerical_columns[:]
        self.categorical_columns = categorical_columns[:]
        self.date_columns = date_columns[:] if date_columns is not None else []
        self.number_displays: List[dict] = []
        self.charts: List[dict] = []
        self._template_name = 'interactive-dashboard'
        self._generated_id_column = 'generated_id'

    def create_dashboard(self, auto_generate_distribution_plots: bool = False) -> None:
        """
        Creates a dashboard using the user defined data.

        :param auto_generate_distribution_plots: generate distribution plots and add them to the dashboard.
            default: False
        :return: None
        """
        # delete default report location created by parent class
        self.report_data.pop('report', None)

        tic = time.perf_counter()

        for df in self.dataframes:
            df[self._generated_id_column] = df.index
            for date_column in self.date_columns:
                # NOTE(review): infer_datetime_format is deprecated since pandas 2.0 - confirm the
                # supported pandas range before removing it.
                df[date_column] = pd.to_datetime(df[date_column], infer_datetime_format=True)
            for col in self.categorical_columns:
                df[col] = df[col].astype(str)

        # Guarded so that calling create_dashboard more than once does not list the id column twice.
        if self._generated_id_column not in self.numerical_columns:
            self.numerical_columns = [self._generated_id_column] + self.numerical_columns

        self.report_data['datasets'] = self.dataframes_names
        self.report_data['numericalColumns'] = self.numerical_columns if self.numerical_columns else []
        self.report_data['categoricalColumns'] = self.categorical_columns if self.categorical_columns else []
        self.report_data['dateColumns'] = self.date_columns if self.date_columns else []

        for df, df_name in zip(self.dataframes, self.dataframes_names):
            self.report_data[df_name] = json.loads(df.to_json(orient='records'))

        if auto_generate_distribution_plots:
            self.number_displays = self.number_displays + self._generate_number_displays()
            self.charts = self.charts + self._generate_charts()

        self.report_data['numberDisplays'] = self.number_displays
        self.report_data['charts'] = self.charts

        toc = time.perf_counter()
        print(f"The dashboard was created in {toc - tic:0.4f} seconds")

        if self.encryption_secret:
            print(f'Your encryption secret is {self.encryption_secret}')

    def get_charts(self) -> List[dict]:
        """
        Get a copy of the dashboard's charts

        :return: List[dict] the charts
        """
        return copy.deepcopy(self.charts)

    def get_number_displays(self) -> List[dict]:
        """
        Get a copy of the dashboard's number displays

        :return: List[dict] the number displays
        """
        return copy.deepcopy(self.number_displays)

    def update_charts(self, new_charts: List[dict], keep_existing=True) -> None:
        """
        Update the dashboard charts.
        If keep_existing is True, the dashboard's charts will be extended otherwise they will be replaced.
        The new charts are deep-copied before being stored.

        :param new_charts: List of dict representing the new charts
        :param keep_existing: boolean to flag whether existing charts should be extended.
        :return: None
        """
        if keep_existing:
            self.charts.extend(copy.deepcopy(new_charts))
        else:
            self.charts = copy.deepcopy(new_charts)

    def update_number_displays(self, new_number_displays: List[dict], keep_existing=True) -> None:
        """
        Update the dashboard number displays.
        If keep_existing is True, the dashboard's number displays will be extended otherwise they will be
        replaced. The new number displays are deep-copied before being stored.

        :param new_number_displays: List of dict representing the new number displays
        :param keep_existing: boolean to flag whether existing number displays should be extended.
        :return: None
        """
        if keep_existing:
            self.number_displays.extend(copy.deepcopy(new_number_displays))
        else:
            self.number_displays = copy.deepcopy(new_number_displays)

    @typechecked
    def bin_numerical_feature(self,
                              numerical_feature_name: str,
                              new_feature_name: str,
                              number_of_bins: int,
                              suffix: str = None) -> None:
        """
        Bin a numerical feature into a new categorical feature.
        OlliePy will get the bins from the first data frame and apply these bins on the rest of the dataframes.

        :param numerical_feature_name: the numerical feature to bin
        :param new_feature_name: the name of the new binned feature
        :param number_of_bins: the number of bins to apply
        :param suffix: suffix to add the bins value
        :return: None
        :raises AttributeError: when the numerical feature is missing from a dataframe or
            new_feature_name is empty
        :raises TypeError: when the numerical feature does not have a numeric dtype
        """
        validate_bin_numerical_feature_attributes(self.dataframes,
                                                  self.dataframes_names,
                                                  numerical_feature_name,
                                                  new_feature_name)

        first_df = self.dataframes[0]
        binned, bin_edges = pd.cut(first_df[numerical_feature_name],
                                   bins=number_of_bins,
                                   retbins=True,
                                   include_lowest=True)
        first_df[new_feature_name] = binned

        for df in self.dataframes[1:]:
            # include_lowest must match the first cut, otherwise the first bin gets a different
            # label here than in the first dataframe.
            df[new_feature_name] = pd.cut(df[numerical_feature_name], bins=bin_edges, include_lowest=True)

        # Register the feature once, even if the same name is binned again.
        if new_feature_name not in self.categorical_columns:
            self.categorical_columns.append(new_feature_name)

        if suffix is not None:
            for df in self.dataframes:
                # Plain column assignment replaces the categorical column with the string one;
                # the strings are not categories of the existing column.
                df[new_feature_name] = df[new_feature_name].astype(str) + '_' + suffix

    def _generate_number_displays(self) -> List[dict]:
        """
        generate number displays for the auto generate functionality

        :return: List[dict] the generated number displays
        """
        return [
            {
                'agg': 'count',
                'column': self._generated_id_column,
                'title': 'Number of observations',
                'type': 'number-display',
                'w': 12,
                'h': 1,
                'maxH': 2,
                'i': 0,
                'id': 'number_of_observations_number_display',
                'x': 0,
                'y': 0,
                'static': False
            }
        ]

    @staticmethod
    def _grid_position(index: int, width: int, height: int, max_width: int):
        """
        Compute the top-left grid coordinates of the chart at ``index``.
        Charts are laid out left to right and wrap to a new row when the next one would not fit.

        :return: tuple (x, y)
        """
        charts_per_row = max_width // width
        return (index % charts_per_row) * width, (index // charts_per_row) * height

    def _generate_charts(self) -> List[dict]:
        """
        generate charts for the auto generate functionality, using the first dataframe:
        a row chart for each categorical column with more than two unique values, a pie chart for
        the other categorical columns and a histogram for each numerical column.

        :return: List[dict] the generated charts
        """
        charts = []
        w = 4
        h = 2
        max_width = 12

        df = self.dataframes[0]

        for col in self.categorical_columns:
            i = len(charts)
            x, y = self._grid_position(i, w, h, max_width)
            if df[col].nunique() > 2:
                charts.append({
                    'dimension': col,
                    'agg': 'count',
                    'column': self._generated_id_column,
                    'title': f'{col} count',
                    'type': 'row-chart',
                    'cap': 10,
                    'w': w,
                    'h': h,
                    'minW': 2,
                    'minH': 2,
                    'i': i,
                    'id': f'chart_{i}',
                    'x': x,
                    'y': y,
                    'static': False
                })
            else:
                charts.append({
                    'id': f'chart_{i}',
                    'title': f'{col}',
                    'type': 'pie-chart',
                    'dimension': col,
                    'x': x,
                    'y': y,
                    'w': w,
                    'h': h,
                    'i': i,
                    'minW': 2,
                    'minH': 2,
                    'static': False
                })

        for col in self.numerical_columns:
            if col == self._generated_id_column:
                continue

            i = len(charts)
            x, y = self._grid_position(i, w, h, max_width)

            bin_width = float((df[col].max() - df[col].min()) / 100)
            # NaN (no data) and 0 (constant column) are not usable bin widths.
            if np.isnan(bin_width) or bin_width == 0:
                bin_width = 1.0

            charts.append({
                'dimension': col,
                'title': f'{col} histogram',
                'xAxisLabel': col,
                'yAxisLabel': 'Frequency',
                'binWidth': bin_width,
                'type': 'histogram-chart',
                'w': w,
                'h': h,
                'minW': 2,
                'minH': 2,
                'i': i,
                'id': f'chart_{i}',
                'x': x,
                'y': y,
                'static': False
            })

        return charts

    def serve_dashboard_from_local_server(self,
                                          mode: str = 'server',
                                          port: int = None,
                                          load_existing_dashboard: bool = False) -> None:
        """
        Serve the dashboard to the user using a web server.

        Available modes:
            - 'server': will open a new tab in the default browser using webbrowser package
            - 'js': will open a new tab in the default browser using IPython
            - 'jupyter': will open the dashboard in a jupyter notebook

        :param mode: the selected web server mode. default: 'server'
        :param port: the server port. default: None. a random port will be generated between (1024-49151)
        :param load_existing_dashboard: Load existing dashboard data.
        :return: None
        """
        if not port:
            import random
            port = random.randint(1024, 49151)

        super()._serve_report_using_flask(self._template_name, mode, port, load_existing_dashboard)

    def save_dashboard(self, zip_dashboard: bool = False) -> None:
        """
        Creates the dashboard directory, copies the web application based on the template name,
        saves the dashboard data.

        :param zip_dashboard: enable it in order to zip the directory for downloading. default: False
        :return: None
        """
        super()._save_the_report(self._template_name, zip_dashboard)