Source code for pm4py.stats

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from typing import Dict, Union, List, Tuple, Collection
from typing import Set
from collections import Counter

import pandas as pd

from pm4py.objects.ocel.obj import OCEL
from pm4py.objects.log.obj import EventLog, Trace, EventStream
from pm4py.util.pandas_utils import check_is_pandas_dataframe, check_pandas_dataframe_columns, insert_ev_in_tr_index
from pm4py.utils import get_properties
from pm4py.util import xes_constants, constants
from copy import copy
import deprecation


def get_start_activities(log: Union[EventLog, pd.DataFrame]) -> Dict[str, int]:
    """
    Returns the start activities of a log object, along with their count.

    The computation is delegated to the pandas-specific implementation when
    ``log`` is a dataframe, and to the log-specific implementation otherwise.

    Parameters
    ---------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ---------------
    start_activities
        Dictionary of start activities along with their count

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        # the dataframe column check runs before the pandas implementation is loaded
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.start_activities.pandas import get as pandas_get
        return pandas_get.get_start_activities(log, parameters=get_properties(log))

    from pm4py.statistics.start_activities.log import get as log_get
    return log_get.get_start_activities(log, parameters=get_properties(log))
def get_end_activities(log: Union[EventLog, pd.DataFrame]) -> Dict[str, int]:
    """
    Returns the end activities of a log object, along with their count.

    The computation is delegated to the pandas-specific implementation when
    ``log`` is a dataframe, and to the log-specific implementation otherwise.

    Parameters
    ---------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ---------------
    end_activities
        Dictionary of end activities along with their count

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        # the dataframe column check runs before the pandas implementation is loaded
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.end_activities.pandas import get as pandas_get
        return pandas_get.get_end_activities(log, parameters=get_properties(log))

    from pm4py.statistics.end_activities.log import get as log_get
    return log_get.get_end_activities(log, parameters=get_properties(log))
@deprecation.deprecated('2.2.10', '3.0.0', details="please use get_event_attributes instead")
def get_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]:
    """
    Deprecated alias of get_event_attributes.

    Parameters
    ---------------
    log
        Log object, forwarded unchanged to get_event_attributes

    Returns
    ---------------
    attributes_list
        Whatever get_event_attributes returns for the same log
    """
    attributes_list = get_event_attributes(log)
    return attributes_list
def get_event_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]:
    """
    Returns the attributes at the event level of the log.

    For a dataframe, these are all of its column names; otherwise they are
    collected by the log-specific attributes implementation.

    Parameters
    ---------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ---------------
    attributes_list
        List of attributes contained in the log

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        column_names = list(log.columns)
        return column_names

    from pm4py.statistics.attributes.log import get as attributes_get
    event_attributes = attributes_get.get_all_event_attributes_from_log(log)
    return list(event_attributes)
def get_trace_attributes(log: Union[EventLog, pd.DataFrame]) -> List[str]:
    """
    Gets the attributes at the trace level of a log object.

    For a dataframe, these are the columns whose name starts with
    ``constants.CASE_ATTRIBUTE_PREFIX``; otherwise they are collected by the
    log-specific attributes implementation.

    Parameters
    ----------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ---------------
    trace_attributes_list
        List of attributes at the trace level

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    from pm4py.util import constants

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        case_prefix = constants.CASE_ATTRIBUTE_PREFIX
        return [column for column in log.columns if column.startswith(case_prefix)]

    from pm4py.statistics.attributes.log import get as attributes_get
    trace_attributes = attributes_get.get_all_trace_attributes_from_log(log)
    return list(trace_attributes)
@deprecation.deprecated('2.2.10', '3.0.0', details="please use get_event_attribute_values instead")
def get_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str, count_once_per_case=False) -> Dict[str, int]:
    """
    Deprecated alias of get_event_attribute_values.

    Parameters
    ---------------
    log
        Log object
    attribute
        Attribute
    count_once_per_case
        Forwarded unchanged to get_event_attribute_values

    Returns
    ---------------
    attribute_values
        Whatever get_event_attribute_values returns for the same arguments
    """
    attribute_values = get_event_attribute_values(log, attribute, count_once_per_case=count_once_per_case)
    return attribute_values
def get_event_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str, count_once_per_case=False) -> Dict[str, int]:
    """
    Returns the values for a specified attribute, along with their count.

    Parameters
    ---------------
    log
        Log object (pandas dataframe, EventLog or EventStream)
    attribute
        Attribute
    count_once_per_case
        If True, consider only an occurrence of the given attribute value inside a case
        (if there are multiple events sharing the same attribute value, count only 1 occurrence)

    Returns
    ---------------
    attribute_values
        Dictionary of values along with their count

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    # Work on a shallow copy, as get_all_case_durations / get_case_duration do, so that
    # adding the extra key never alters the dictionary handed back by get_properties.
    parameters = copy(get_properties(log))
    parameters["keep_once_per_case"] = count_once_per_case

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get
        return get.get_attribute_values(log, attribute, parameters=parameters)
    else:
        from pm4py.statistics.attributes.log import get
        return get.get_attribute_values(log, attribute, parameters=parameters)
def get_trace_attribute_values(log: Union[EventLog, pd.DataFrame], attribute: str) -> Dict[str, int]:
    """
    Returns the values for a specified trace attribute, along with their count.

    Parameters
    ---------------
    log
        Log object (pandas dataframe, EventLog or EventStream)
    attribute
        Attribute

    Returns
    ---------------
    attribute_values
        Dictionary of values along with their count

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.attributes.pandas import get as pandas_get
        # NOTE(review): this calls the same pandas helper used for event attributes,
        # without any parameters; presumably it counts rows rather than cases - confirm.
        return pandas_get.get_attribute_values(log, attribute)

    from pm4py.statistics.attributes.log import get as log_get
    return log_get.get_trace_attribute_values(log, attribute)
def get_variants(log: Union[EventLog, pd.DataFrame]) -> Dict[str, List[Trace]]:
    """
    Gets the variants from the log.

    Emits a warning when the variants specification is STRING, and refuses to
    run when it is LIST (in which case get_variants_as_tuples must be used).

    Parameters
    --------------
    log
        Event log (pandas dataframe, EventLog or EventStream)

    Returns
    --------------
    variants
        Dictionary of variants: the result of ``get_variants_count`` for a
        dataframe, the result of ``get_variants`` for the other log types

    Raises
    --------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream,
        or if the variants specification is LIST
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    import pm4py

    variants_util = pm4py.util.variants_util
    current_specification = variants_util.VARIANT_SPECIFICATION

    if current_specification == variants_util.VariantsSpecifications.STRING:
        import warnings
        warnings.warn('pm4py.get_variants is deprecated. Please use pm4py.get_variants_as_tuples instead.')
    if current_specification == variants_util.VariantsSpecifications.LIST:
        raise Exception('Please use pm4py.get_variants_as_tuples')

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.variants.pandas import get as pandas_get
        return pandas_get.get_variants_count(log, parameters=get_properties(log))

    from pm4py.statistics.variants.log import get as log_get
    return log_get.get_variants(log, parameters=get_properties(log))
def get_variants_as_tuples(log: Union[EventLog, pd.DataFrame]) -> Dict[Tuple[str], List[Trace]]:
    """
    Gets the variants from the log (where the keys are tuples and not strings).

    Side effect: sets the module-level ``pm4py.util.variants_util.VARIANT_SPECIFICATION``
    to ``VariantsSpecifications.LIST``; the previous value is not restored.

    Parameters
    --------------
    log
        Event log (pandas dataframe, EventLog or EventStream)

    Returns
    --------------
    variants
        Dictionary of variants: the result of ``get_variants_count`` for a
        dataframe, the result of ``get_variants`` for the other log types

    Raises
    --------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    import pm4py

    # the behavior of PM4Py is changed to allow this to work
    variants_util = pm4py.util.variants_util
    variants_util.VARIANT_SPECIFICATION = variants_util.VariantsSpecifications.LIST

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.variants.pandas import get as pandas_get
        return pandas_get.get_variants_count(log, parameters=get_properties(log))

    from pm4py.statistics.variants.log import get as log_get
    return log_get.get_variants(log, parameters=get_properties(log))
def get_minimum_self_distances(log: EventLog) -> Dict[str, int]:
    """
    Computes the minimum self-distance for each activity observed in an event log.

    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc.
    The minimum self distance is the minimal observed self distance value in the event log.

    Parameters
    ----------
    log
        event log (either pandas.DataFrame, EventLog or EventStream)

    Returns
    -------
        dict mapping an activity to its self-distance, if it exists, otherwise it is not part of the dict.

    Raises
    ------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    accepted_types = (pd.DataFrame, EventLog, EventStream)
    if type(log) not in accepted_types:
        raise Exception("the method can be applied only to a traditional event log!")

    from pm4py.algo.discovery.minimum_self_distance import algorithm as msd_algo

    minimum_self_distances = msd_algo.apply(log, parameters=get_properties(log))
    return minimum_self_distances
def get_minimum_self_distance_witnesses(log: EventLog) -> Dict[str, Set[str]]:
    """
    Derives the minimum self distance witnesses.

    The self distance of a in <a> is infinity, of a in <a,a> is 0, in <a,b,a> is 1, etc.
    The minimum self distance is the minimal observed self distance value in the event log.
    A 'witness' is an activity that witnesses the minimum self distance.
    For example, if the minimum self distance of activity a in some log L is 2, then,
    if trace <a,b,c,a> is in log L, b and c are a witness of a.

    Parameters
    ----------
    log
        Event Log to use

    Returns
    -------
        Dictionary mapping each activity to a set of witnesses.

    Raises
    ------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    accepted_types = (pd.DataFrame, EventLog, EventStream)
    if type(log) not in accepted_types:
        raise Exception("the method can be applied only to a traditional event log!")

    from pm4py.algo.discovery.minimum_self_distance import algorithm as msd_algo
    from pm4py.algo.discovery.minimum_self_distance import utils as msdw_algo

    # first the minimum self distances, then the witnesses derived from them
    minimum_self_distances = msd_algo.apply(log, parameters=get_properties(log))
    return msdw_algo.derive_msd_witnesses(log, minimum_self_distances, parameters=get_properties(log))
def get_case_arrival_average(log: Union[EventLog, pd.DataFrame]) -> float:
    """
    Gets the average difference between the start times of two consecutive cases.

    Parameters
    ---------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ---------------
    case_arrival_average
        Average difference between the start times of two consecutive cases

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.traces.generic.pandas import case_arrival as pandas_case_arrival
        return pandas_case_arrival.get_case_arrival_avg(log, parameters=get_properties(log))

    from pm4py.statistics.traces.generic.log import case_arrival as log_case_arrival
    return log_case_arrival.get_case_arrival_avg(log, parameters=get_properties(log))
def get_rework_cases_per_activity(log: Union[EventLog, pd.DataFrame]) -> Dict[str, int]:
    """
    Finds the activities of the log for which rework occurs (more than one
    occurrence of the activity in the same trace).

    The output is a dictionary associating to each of these activities the
    number of cases for which the rework occurred.

    Parameters
    ------------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ------------------
    rework_dictionary
        Dictionary associating to each of the aforementioned activities the number of cases
        for which the rework occurred.

    Raises
    ------------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.rework.pandas import get as pandas_rework
        return pandas_rework.apply(log, parameters=get_properties(log))

    from pm4py.statistics.rework.log import get as log_rework
    return log_rework.apply(log, parameters=get_properties(log))
def get_case_overlap(log: Union[EventLog, pd.DataFrame]) -> List[int]:
    """
    Associates to each case in the log the number of cases concurrently open.

    Parameters
    ------------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    ------------------
    overlap_list
        List that for each case (identified by its index in the log) tells how many other cases
        are concurrently open.

    Raises
    ------------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.overlap.cases.pandas import get as pandas_overlap
        return pandas_overlap.apply(log, parameters=get_properties(log))

    from pm4py.statistics.overlap.cases.log import get as log_overlap
    return log_overlap.apply(log, parameters=get_properties(log))
def get_cycle_time(log: Union[EventLog, pd.DataFrame]) -> float:
    """
    Calculates the cycle time of the event log.

    The definition that has been followed is the one proposed in:
    https://www.presentationeze.com/presentations/lean-manufacturing-just-in-time/lean-manufacturing-just-in-time-full-details/process-cycle-time-analysis/calculate-cycle-time/#:~:text=Cycle%20time%20%3D%20Average%20time%20between,is%2024%20minutes%20on%20average.

    So:
    Cycle time  = Average time between completion of units.

    Example taken from the website:
    Consider a manufacturing facility, which is producing 100 units of product per 40 hour week.
    The average throughput rate is 1 unit per 0.4 hours, which is one unit every 24 minutes.
    Therefore the cycle time is 24 minutes on average.

    Parameters
    -----------------
    log
        Log object (pandas dataframe, EventLog or EventStream)

    Returns
    -----------------
    cycle_time
        Cycle time (calculated with the aforementioned formula).

    Raises
    -----------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.traces.cycle_time.pandas import get as pandas_cycle_time
        return pandas_cycle_time.apply(log, parameters=get_properties(log))

    from pm4py.statistics.traces.cycle_time.log import get as log_cycle_time
    return log_cycle_time.apply(log, parameters=get_properties(log))
def get_all_case_durations(log: Union[EventLog, pd.DataFrame], business_hours: bool = False,
                           worktiming: Union[List[int], None] = None,
                           weekends: Union[List[int], None] = None) -> List[float]:
    """
    Gets the durations of the cases in the event log

    Parameters
    ---------------
    log
        Event log (pandas dataframe, EventLog or EventStream)
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log are working
        (default when omitted or None: 7 to 17)
    weekends
        (If the business hours are enabled) The weekends days
        (default when omitted or None: Saturday (6), Sunday (7))

    Returns
    ---------------
    durations
        Case durations (as list)

    Raises
    ---------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    # Fresh lists are built on every call: the defaults are handed to downstream code through
    # the parameters dictionary, so a shared default list object must not be exposed.
    if worktiming is None:
        worktiming = [7, 17]
    if weekends is None:
        weekends = [6, 7]

    # copy, so that the extra keys do not end up in the dictionary returned by get_properties
    properties = copy(get_properties(log))
    properties["business_hours"] = business_hours
    properties["worktiming"] = worktiming
    properties["weekends"] = weekends

    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.traces.generic.pandas import case_statistics
        cd = case_statistics.get_cases_description(log, parameters=properties)
        # the dataframe branch sorts the durations taken from the cases description
        return sorted(x["caseDuration"] for x in cd.values())
    else:
        from pm4py.statistics.traces.generic.log import case_statistics
        return case_statistics.get_all_case_durations(log, parameters=properties)
def get_case_duration(log: Union[EventLog, pd.DataFrame], case_id: str, business_hours: bool = False,
                      worktiming: Union[List[int], None] = None,
                      weekends: Union[List[int], None] = None) -> float:
    """
    Gets the duration of a specific case

    Parameters
    -------------------
    log
        Event log (pandas dataframe, EventLog or EventStream)
    case_id
        Case identifier
    business_hours
        Enables/disables the computation based on the business hours (default: False)
    worktiming
        (If the business hours are enabled) The hour range in which the resources of the log are working
        (default when omitted or None: 7 to 17)
    weekends
        (If the business hours are enabled) The weekends days
        (default when omitted or None: Saturday (6), Sunday (7))

    Returns
    ------------------
    duration
        Duration of the given case

    Raises
    ------------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    KeyError
        Presumably raised when ``case_id`` is not a key of the cases description
        (the lookup is a plain subscript) - confirm against the returned type
    """
    if type(log) not in [pd.DataFrame, EventLog, EventStream]:
        raise Exception("the method can be applied only to a traditional event log!")

    # Fresh lists are built on every call: the defaults are handed to downstream code through
    # the parameters dictionary, so a shared default list object must not be exposed.
    if worktiming is None:
        worktiming = [7, 17]
    if weekends is None:
        weekends = [6, 7]

    # copy, so that the extra keys do not end up in the dictionary returned by get_properties
    properties = copy(get_properties(log))
    properties["business_hours"] = business_hours
    properties["worktiming"] = worktiming
    properties["weekends"] = weekends

    # only the implementation module differs between the two kinds of log;
    # the lookup of the case duration is the same
    if check_is_pandas_dataframe(log):
        check_pandas_dataframe_columns(log)
        from pm4py.statistics.traces.generic.pandas import case_statistics
    else:
        from pm4py.statistics.traces.generic.log import case_statistics

    cd = case_statistics.get_cases_description(log, parameters=properties)
    return cd[case_id]["caseDuration"]
def get_activity_position_summary(log: Union[EventLog, pd.DataFrame], activity: str) -> Dict[int, int]:
    """
    Given an event log, returns a dictionary which summarizes the positions of
    the given activity in the different cases of the event log.

    E.g., if an activity happens 1000 times in the position 1 (the second event of a case),
    and 500 times in the position 2 (the third event of a case), then the returned dictionary would be:
    {1: 1000, 2: 500}

    The activity key and the case identifier key are read from the properties of the log,
    falling back to ``xes_constants.DEFAULT_NAME_KEY`` and ``constants.CASE_CONCEPT_NAME``.

    Parameters
    -----------------
    log
        Event log object / Pandas dataframe
    activity
        Activity to consider

    Returns
    -----------------
    pos_dict_summary
        Summary of the positions of the activity in the trace (e.g. {1: 1000, 2: 500})

    Raises
    -----------------
    Exception
        If the exact type of ``log`` is not pd.DataFrame, EventLog or EventStream
    """
    if type(log) not in (pd.DataFrame, EventLog, EventStream):
        raise Exception("the method can be applied only to a traditional event log!")

    properties = get_properties(log)

    if constants.PARAMETER_CONSTANT_ACTIVITY_KEY in properties:
        activity_key = properties[constants.PARAMETER_CONSTANT_ACTIVITY_KEY]
    else:
        activity_key = xes_constants.DEFAULT_NAME_KEY

    if constants.PARAMETER_CONSTANT_CASEID_KEY in properties:
        case_id_key = properties[constants.PARAMETER_CONSTANT_CASEID_KEY]
    else:
        case_id_key = constants.CASE_CONCEPT_NAME

    if check_is_pandas_dataframe(log):
        # the helper's result carries the "@@index_in_trace" column that is counted below
        indexed = insert_ev_in_tr_index(log, case_id_key, "@@index_in_trace")
        matching_rows = indexed[indexed[activity_key] == activity]
        return matching_rows["@@index_in_trace"].value_counts().to_dict()

    # count, over all the traces, the indexes at which the activity is found
    position_counts = Counter(
        index
        for trace in log
        for index in range(len(trace))
        if trace[index][activity_key] == activity
    )
    return dict(position_counts)