# Source code for pm4py.statistics.traces.generic.pandas.case_statistics

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from enum import Enum
from typing import Optional, Dict, Any, Union, Tuple, List

import pandas as pd

from pm4py.statistics.traces.generic.common import case_duration as case_duration_commons
from pm4py.util import exec_utils, constants, pandas_utils
from pm4py.util import variants_util
from pm4py.util import xes_constants as xes
from pm4py.util.business_hours import soj_time_business_hours_diff
from pm4py.util.constants import CASE_CONCEPT_NAME
from pm4py.util.xes_constants import DEFAULT_TIMESTAMP_KEY


class Parameters(Enum):
    """
    Keys that can be used in the ``parameters`` dictionary accepted by the
    functions of this module.
    """
    # keys whose values are defined in pm4py.util.constants
    ATTRIBUTE_KEY = constants.PARAMETER_CONSTANT_ATTRIBUTE_KEY
    ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
    TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
    CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY

    # read by get_variant_statistics
    MAX_VARIANTS_TO_RETURN = "max_variants_to_return"
    VARIANTS_DF = "variants_df"

    # read by get_cases_description (sorting / truncation of the cases)
    ENABLE_SORT = "enable_sort"
    SORT_BY_COLUMN = "sort_by_column"
    SORT_ASCENDING = "sort_ascending"
    MAX_RET_CASES = "max_ret_cases"

    # read when computing the case duration (business hours settings)
    BUSINESS_HOURS = "business_hours"
    WORKTIMING = "worktiming"
    WEEKENDS = "weekends"
    WORKCALENDAR = "workcalendar"
def get_variant_statistics(df: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Union[
    List[Dict[str, int]], List[Dict[List[str], int]]]:
    """
    Get variants from a Pandas dataframe

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            Parameters.MAX_VARIANTS_TO_RETURN -> Maximum number of variants to return
            Parameters.VARIANTS_DF -> If provided, avoid recalculation of the variants dataframe

    Returns
    -----------
    variants_list
        List of records, one per variant, sorted in descending order by
        (count, variant). Each record holds the variant under the key
        "variant" and the number of cases under the case ID column name.
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    max_variants_to_return = exec_utils.get_param_value(Parameters.MAX_VARIANTS_TO_RETURN, parameters, None)

    # Build the variants dataframe only when the caller did not supply one:
    # passing get_variants_df(...) as the default argument would evaluate it
    # eagerly and recompute the variants on every call.
    variants_df = exec_utils.get_param_value(Parameters.VARIANTS_DF, parameters, None)
    if variants_df is None:
        variants_df = get_variants_df(df, parameters=parameters)

    # move the index into a regular column so that it is counted per variant
    variants_df = variants_df.reset_index()
    variants_list = pandas_utils.to_dict_records(variants_df.groupby("variant").agg("count").reset_index())
    variants_list = sorted(variants_list, key=lambda x: (x[case_id_glue], x["variant"]), reverse=True)

    if max_variants_to_return:
        variants_list = variants_list[:max_variants_to_return]
    return variants_list
def get_variants_df_and_list(df: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Tuple[
    pd.DataFrame, Union[List[Dict[str, int]], List[Dict[List[str], int]]]]:
    """
    (Technical method) Provides variants_df and variants_list out of the box

    Parameters
    ------------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity

    Returns
    ------------
    variants_df
        Variants dataframe
    variants_list
        List of [variant, count] pairs sorted in descending order by
        (count, variant)
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    variants_df = get_variants_df(df, parameters=parameters)

    # Hand the dataframe computed above to get_variant_statistics so that it
    # does not have to be built a second time. A variants dataframe explicitly
    # supplied by the caller keeps being used for the statistics, as before.
    stats_parameters = parameters
    if exec_utils.get_param_value(Parameters.VARIANTS_DF, parameters, None) is None:
        # work on a copy: the caller's dictionary must not be modified
        stats_parameters = dict(parameters)
        stats_parameters[Parameters.VARIANTS_DF] = variants_df

    variants_stats = get_variant_statistics(df, parameters=stats_parameters)
    variants_list = [[vd["variant"], vd[case_id_glue]] for vd in variants_stats]
    variants_list = sorted(variants_list, key=lambda x: (x[1], x[0]), reverse=True)
    return variants_df, variants_list
def get_cases_description(df: pd.DataFrame, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Dict[
    str, Dict[str, Any]]:
    """
    Get a description of traces present in the Pandas dataframe

    Parameters
    -----------
    df
        Pandas dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that identifies the case ID
            Parameters.TIMESTAMP_KEY -> Column that identifies the timestamp
            Parameters.ENABLE_SORT -> Enable sorting of traces (default: True)
            Parameters.SORT_BY_COLUMN -> Sort traces inside the dataframe using the specified column.
            Admitted values: startTime, endTime, caseDuration (default: startTime)
            Parameters.SORT_ASCENDING -> Set sort direction (boolean; if true then the sort direction is ascending,
            otherwise descending; default: True)
            Parameters.MAX_RET_CASES -> Set the maximum number of returned traces
            Parameters.BUSINESS_HOURS -> If true, the case duration is computed through
            soj_time_business_hours_diff (default: False)
            Parameters.WORKTIMING -> Passed to soj_time_business_hours_diff (default: [7, 17])
            Parameters.WEEKENDS -> Passed to soj_time_business_hours_diff (default: [6, 7])
            Parameters.WORKCALENDAR -> Passed to soj_time_business_hours_diff
            (default: constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)

    Returns
    -----------
    ret
        Dictionary of traces associated to their start timestamp ("startTime"), their end timestamp ("endTime")
        and their duration ("caseDuration")
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    enable_sort = exec_utils.get_param_value(Parameters.ENABLE_SORT, parameters, True)
    sort_by_column = exec_utils.get_param_value(Parameters.SORT_BY_COLUMN, parameters, "startTime")
    sort_ascending = exec_utils.get_param_value(Parameters.SORT_ASCENDING, parameters, True)
    max_ret_cases = exec_utils.get_param_value(Parameters.MAX_RET_CASES, parameters, None)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])
    workcalendar = exec_utils.get_param_value(Parameters.WORKCALENDAR, parameters,
                                              constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)

    # first and last event of every case, placed side by side
    # (the columns of the last event get the "_2" suffix)
    grouped_df = df[[case_id_glue, timestamp_key]].groupby(df[case_id_glue])
    first_eve_df = grouped_df.first()
    last_eve_df = grouped_df.last()
    del grouped_df
    last_eve_df.columns = [str(col) + '_2' for col in first_eve_df.columns]
    stacked_df = pd.concat([first_eve_df, last_eve_df], axis=1)
    del first_eve_df
    del last_eve_df
    # the case ID is already the index of the frame
    del stacked_df[case_id_glue]
    del stacked_df[case_id_glue + "_2"]

    # The duration is computed exactly once, by the branch that applies
    # (it used to be computed unconditionally and then overwritten).
    if business_hours:
        stacked_df['caseDuration'] = stacked_df.apply(
            lambda x: soj_time_business_hours_diff(x[timestamp_key], x[timestamp_key + "_2"], worktiming, weekends,
                                                   workcalendar), axis=1)
    else:
        stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
        # NOTE(review): on pandas >= 2.0 astype('timedelta64[s]') is documented
        # to keep a timedelta dtype instead of producing a number of seconds;
        # confirm against the pandas versions that must be supported.
        stacked_df['caseDuration'] = stacked_df['caseDuration'].astype('timedelta64[s]')

    # timestamps as integers divided by 10 ** 9
    # (seconds, assuming nanosecond-resolution datetimes - TODO confirm)
    stacked_df[timestamp_key + "_2"] = stacked_df[timestamp_key + "_2"].astype('int64') // 10 ** 9
    stacked_df[timestamp_key] = stacked_df[timestamp_key].astype('int64') // 10 ** 9
    stacked_df = stacked_df.rename(columns={timestamp_key: 'startTime', timestamp_key + "_2": 'endTime'})

    if enable_sort:
        stacked_df = stacked_df.sort_values(sort_by_column, ascending=sort_ascending)

    if max_ret_cases is not None:
        stacked_df = stacked_df.head(n=min(max_ret_cases, len(stacked_df)))

    ret = pandas_utils.to_dict_index(stacked_df)
    return ret
def get_variants_df(df, parameters=None):
    """
    Get variants dataframe from a Pandas dataframe

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity

    Returns
    -----------
    variants_df
        Variants dataframe: indexed by the case ID, with a single column
        "variant" (a string joined with constants.DEFAULT_VARIANT_SEP or a
        tuple of activities, depending on variants_util.VARIANT_SPECIFICATION)

    Raises
    -----------
    ValueError
        If variants_util.VARIANT_SPECIFICATION is neither STRING nor LIST
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)

    def _variant_as_string(col):
        return constants.DEFAULT_VARIANT_SEP.join(col.to_list())

    def _variant_as_tuple(col):
        return tuple(col.to_list())

    specification = variants_util.VARIANT_SPECIFICATION
    if specification == variants_util.VariantsSpecifications.STRING:
        build_variant = _variant_as_string
    elif specification == variants_util.VariantsSpecifications.LIST:
        build_variant = _variant_as_tuple
    else:
        # fail explicitly instead of hitting an unbound local variable below
        raise ValueError("unsupported variant specification: " + str(specification))

    new_df = df.groupby(case_id_glue)[activity_key].agg(build_variant).to_frame()
    new_cols = list(new_df.columns)
    new_df = new_df.rename(columns={new_cols[0]: "variant"})
    return new_df
def get_variants_df_with_case_duration(df, parameters=None):
    """
    Get variants dataframe from a Pandas dataframe, with case duration that is included

    Parameters
    -----------
    df
        Dataframe
    parameters
        Parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column that contains the Case ID
            Parameters.ACTIVITY_KEY -> Column that contains the activity
            Parameters.TIMESTAMP_KEY -> Column that contains the timestamp
            Parameters.BUSINESS_HOURS -> If true, the case duration is computed through
            soj_time_business_hours_diff (default: False)
            Parameters.WORKTIMING -> Passed to soj_time_business_hours_diff (default: [7, 17])
            Parameters.WEEKENDS -> Passed to soj_time_business_hours_diff (default: [6, 7])
            Parameters.WORKCALENDAR -> Passed to soj_time_business_hours_diff
            (default: constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)

    Returns
    -----------
    variants_df
        Variants dataframe: the "variant" column side by side with the
        first/last event columns of each case and its "caseDuration"

    Raises
    -----------
    ValueError
        If variants_util.VARIANT_SPECIFICATION is neither STRING nor LIST
    """
    if parameters is None:
        parameters = {}

    case_id_glue = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
    activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
    timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
    business_hours = exec_utils.get_param_value(Parameters.BUSINESS_HOURS, parameters, False)
    worktiming = exec_utils.get_param_value(Parameters.WORKTIMING, parameters, [7, 17])
    weekends = exec_utils.get_param_value(Parameters.WEEKENDS, parameters, [6, 7])
    workcalendar = exec_utils.get_param_value(Parameters.WORKCALENDAR, parameters,
                                              constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR)

    def _variant_as_string(col):
        return constants.DEFAULT_VARIANT_SEP.join(col.to_list())

    def _variant_as_tuple(col):
        return tuple(col.to_list())

    specification = variants_util.VARIANT_SPECIFICATION
    if specification == variants_util.VariantsSpecifications.STRING:
        build_variant = _variant_as_string
    elif specification == variants_util.VariantsSpecifications.LIST:
        build_variant = _variant_as_tuple
    else:
        # fail explicitly instead of dereferencing a None dataframe below
        raise ValueError("unsupported variant specification: " + str(specification))

    grouped_df = df[[case_id_glue, timestamp_key, activity_key]].groupby(df[case_id_glue])

    # one row per case holding its variant
    df1 = grouped_df[activity_key].agg(build_variant).to_frame()
    new_cols = list(df1.columns)
    df1 = df1.rename(columns={new_cols[0]: "variant"})

    # first and last event of every case, placed side by side
    # (the columns of the last event get the "_2" suffix)
    first_eve_df = grouped_df.first()
    last_eve_df = grouped_df.last()
    del grouped_df
    last_eve_df.columns = [str(col) + '_2' for col in first_eve_df.columns]
    stacked_df = pd.concat([first_eve_df, last_eve_df], axis=1)
    del first_eve_df
    del last_eve_df
    # the case ID is already the index of the frame
    del stacked_df[case_id_glue]
    del stacked_df[case_id_glue + "_2"]

    # The duration is computed exactly once, by the branch that applies
    # (it used to be computed unconditionally and then overwritten).
    if business_hours:
        stacked_df['caseDuration'] = stacked_df.apply(
            lambda x: soj_time_business_hours_diff(x[timestamp_key], x[timestamp_key + "_2"], worktiming, weekends,
                                                   workcalendar), axis=1)
    else:
        stacked_df['caseDuration'] = stacked_df[timestamp_key + "_2"] - stacked_df[timestamp_key]
        # NOTE(review): on pandas >= 2.0 astype('timedelta64[s]') is documented
        # to keep a timedelta dtype instead of producing a number of seconds;
        # confirm against the pandas versions that must be supported.
        stacked_df['caseDuration'] = stacked_df['caseDuration'].astype('timedelta64[s]')

    new_df = pd.concat([df1, stacked_df], axis=1)
    del df1
    del stacked_df
    return new_df
def get_events(df: pd.DataFrame, case_id: str, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> List[
    Dict[str, Any]]:
    """
    Get events belonging to the specified case

    Parameters
    -----------
    df
        Pandas dataframe
    case_id
        Required case ID
    parameters
        Possible parameters of the algorithm, including:
            Parameters.CASE_ID_KEY -> Column in which the case ID is contained

    Returns
    ----------
    list_eve
        List of events (one record per dataframe row) belonging to the case
    """
    params = {} if parameters is None else parameters
    case_column = exec_utils.get_param_value(Parameters.CASE_ID_KEY, params, CASE_CONCEPT_NAME)

    # keep only the rows whose case ID equals the requested one
    belongs_to_case = df[case_column] == case_id
    case_events = df[belongs_to_case]
    return pandas_utils.to_dict_records(case_events)
def get_kde_caseduration(df, parameters=None):
    """
    Gets the estimation of KDE density for the case durations calculated on the dataframe

    Parameters
    --------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm. The same dictionary is passed
        both to get_cases_description (e.g. Parameters.CASE_ID_KEY) and to
        case_duration_commons.get_kde_caseduration (presumably where the
        number of graph points is configured; no such key is defined in this
        module)

    Returns
    --------------
    x
        X-axis values to represent
    y
        Y-axis values to represent
        (both as returned by case_duration_commons.get_kde_caseduration)
    """
    descriptions = get_cases_description(df, parameters=parameters)
    # one duration per case, in the order in which the cases are described
    durations = [description["caseDuration"] for description in descriptions.values()]

    return case_duration_commons.get_kde_caseduration(durations, parameters=parameters)
def get_kde_caseduration_json(df, parameters=None):
    """
    Gets the estimation of KDE density for the case durations calculated on the log/dataframe
    (expressed as JSON)

    Parameters
    --------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm. The same dictionary is passed
        both to get_cases_description (e.g. Parameters.CASE_ID_KEY) and to
        case_duration_commons.get_kde_caseduration_json (presumably where the
        number of graph points is configured; no such key is defined in this
        module)

    Returns
    --------------
    json
        JSON representing the graph points, as returned by
        case_duration_commons.get_kde_caseduration_json
    """
    descriptions = get_cases_description(df, parameters=parameters)
    # one duration per case, in the order in which the cases are described
    durations = [description["caseDuration"] for description in descriptions.values()]

    return case_duration_commons.get_kde_caseduration_json(durations, parameters=parameters)
def get_all_case_durations(df, parameters=None):
    """
    Gets all the case durations out of the log

    Parameters
    ------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm (forwarded to get_cases_description)

    Returns
    ------------
    duration_values
        List of all duration values, in ascending order
    """
    descriptions = get_cases_description(df, parameters=parameters)

    duration_values = [entry["caseDuration"] for entry in descriptions.values()]
    duration_values.sort()
    return duration_values
def get_first_quartile_case_duration(df, parameters=None):
    """
    Gets the first quartile out of the log

    Parameters
    -------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    value
        First quartile value (0 if there are no cases)
    """
    if parameters is None:
        parameters = {}

    duration_values = get_all_case_durations(df, parameters=parameters)
    if duration_values:
        # The durations are sorted in ascending order, so the first quartile
        # sits a quarter of the way into the list (three quarters of the way
        # in would be the third quartile).
        return duration_values[len(duration_values) // 4]
    return 0
def get_median_case_duration(df, parameters=None):
    """
    Gets the median case duration out of the log

    Parameters
    -------------
    df
        Pandas dataframe
    parameters
        Possible parameters of the algorithm

    Returns
    -------------
    value
        Median duration value (0 if there are no cases)
    """
    params = {} if parameters is None else parameters
    ordered_durations = get_all_case_durations(df, parameters=params)

    if not ordered_durations:
        return 0
    # element in the middle of the ordered list (the upper one of the two
    # central elements when the number of cases is even)
    middle = len(ordered_durations) // 2
    return ordered_durations[middle]