Source code for pm4py.objects.log.util.pandas_numpy_variants

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
import pandas as pd
from enum import Enum
from pm4py.util import constants, xes_constants, pandas_utils, exec_utils
import numpy as np
from collections import Counter
from pm4py.util import variants_util


[docs]class Parameters(Enum): CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY INDEX_KEY = "index_key"
[docs]def apply(dataframe: pd.DataFrame, parameters=None): """ Returns the variants from a Pandas dataframe (through Numpy) Parameters ------------------ dataframe Dataframe parameters Parameters of the algorithm, including: - Parameters.CASE_ID_KEY => the case identifier - Parameters.ACTIVITY_KEY => the activity - Parameters.TIMESTAMP_KEY => the timestamp - Parameters.INDEX_KEY => the index Returns ------------------ variants_dict Dictionary associating to each variant the number of occurrences in the dataframe """ if parameters is None: parameters = {} case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, constants.CASE_CONCEPT_NAME) activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes_constants.DEFAULT_NAME_KEY) timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, xes_constants.DEFAULT_TIMESTAMP_KEY) index_key = exec_utils.get_param_value(Parameters.INDEX_KEY, parameters, constants.DEFAULT_INDEX_KEY) if not (hasattr(dataframe, "attrs") and dataframe.attrs): # dataframe has not been initialized through format_dataframe dataframe = pandas_utils.insert_index(dataframe, index_key) dataframe.sort_values([case_id_key, timestamp_key, index_key]) cases = dataframe[case_id_key].to_numpy() activities = dataframe[activity_key].to_numpy() c_unq, c_ind, c_counts = np.unique(cases, return_index=True, return_counts=True) variants = Counter() for i in range(len(c_ind)): si = c_ind[i] ei = si + c_counts[i] acts = tuple(activities[si:ei]) variants[acts] += 1 if variants_util.VARIANT_SPECIFICATION == variants_util.VariantsSpecifications.STRING: variants = {constants.DEFAULT_VARIANT_SEP.join(x): y for x, y in variants.items()} else: variants = {x: y for x, y in variants.items()} return variants