Source code for pm4py.util.pandas_utils

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
import pkgutil

import pandas as pd

from pm4py.util import constants, xes_constants
import deprecation
import numpy as np


[docs]def to_dict_records(df): """ Pandas dataframe to dictionary (records method) Parameters --------------- df Dataframe Returns -------------- list_dictio List containing a dictionary for each row """ return df.to_dict('records')
[docs]def to_dict_index(df): """ Pandas dataframe to dictionary (index method) Parameters --------------- df Dataframe Returns -------------- dict dict like {index -> {column -> value}} """ return df.to_dict('index')
[docs]def insert_index(df, column_name=constants.DEFAULT_INDEX_KEY, copy_dataframe=True): """ Inserts the dataframe index in the specified column Parameters -------------- df Dataframe column_name Name of the column that should host the index copy_dataframe Establishes if the original dataframe should be copied before inserting the column Returns -------------- df Dataframe with index """ if copy_dataframe: df = df.copy() df = df.reset_index(drop=True) df[column_name] = df.index return df
[docs]def insert_case_index(df, column_name=constants.DEFAULT_CASE_INDEX_KEY, case_id=constants.CASE_CONCEPT_NAME, copy_dataframe=True): """ Inserts the case number in the dataframe Parameters --------------- df Dataframe column_name Name of the column that should host the case index case_id Case identifier copy_dataframe Establishes if the original dataframe should be copied before inserting the column Returns --------------- df Dataframe with case index """ if copy_dataframe: df = df.copy() df[column_name] = df.groupby(case_id).ngroup() return df
[docs]def insert_ev_in_tr_index(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAME, column_name: str = constants.DEFAULT_INDEX_IN_TRACE_KEY) -> pd.DataFrame: """ Inserts a column that specify the index of the event inside the case Parameters --------------- df Dataframe case_id Column that hosts the case identifier column_name Name of the column that should host the index Returns -------------- df Dataframe with index """ df = df.copy() df_trace_idx = df.groupby(case_id).cumcount() df[column_name] = df_trace_idx return df
[docs]def insert_feature_activity_position_in_trace(df: pd.DataFrame, case_id: str = constants.CASE_CONCEPT_NAME, activity_key: str = xes_constants.DEFAULT_NAME_KEY, prefix="@@position_"): """ Inserts additional columns @@position_ACT1, @@position_ACT2 ... which are populated for every event having activity ACT1, ACT2 respectively, with the index of the event inside its case. Parameters ------------------ df Pandas dataframe case_id Case idntifier activity_key Activity prefix Prefix of the "activity position in trace" feature (default: @@position_) Returns ------------------ df Pandas dataframe """ df = insert_ev_in_tr_index(df, case_id=case_id) activities = set(df[activity_key].unique()) for act in activities: df[prefix + act] = df[activity_key].apply(lambda x: np.nan if x == act else -1) df[prefix + act] = df[prefix + act].fillna(df[constants.DEFAULT_INDEX_IN_TRACE_KEY]) df[prefix + act] = df[prefix + act].replace(-1, np.nan) return df
[docs]@deprecation.deprecated('2.2.8', '3.0.0', details="use check_is_pandas_dataframe instead") def check_is_dataframe(log): """ Checks if a log object is a dataframe Parameters ------------- log Log object Returns ------------- boolean Is dataframe? """ if pkgutil.find_loader("pandas"): import pandas as pd return type(log) is pd.DataFrame return False
[docs]@deprecation.deprecated('2.2.8', '3.0.0', details="use check_pandas_dataframe_columns instead") def check_dataframe_columns(df): """ Checks if the dataframe contains all the required columns. If not, raise an exception Parameters -------------- df Pandas dataframe """ if len(set(df.columns).intersection( set([constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_NAME_KEY, xes_constants.DEFAULT_TIMESTAMP_KEY]))) < 3: raise Exception( "please format your dataframe accordingly! df = pm4py.format_dataframe(df, case_id='<name of the case ID column>', activity_key='<name of the activity column>', timestamp_key='<name of the timestamp column>')")
[docs]def check_is_pandas_dataframe(log): """ Checks if a log object is a dataframe Parameters ------------- log Log object Returns ------------- boolean Is dataframe? """ if pkgutil.find_loader("pandas"): import pandas as pd return type(log) is pd.DataFrame return False
[docs]def check_pandas_dataframe_columns(df): """ Checks if the dataframe contains all the required columns. If not, raise an exception Parameters -------------- df Pandas dataframe """ if len(set(df.columns).intersection( set([constants.CASE_CONCEPT_NAME, xes_constants.DEFAULT_NAME_KEY, xes_constants.DEFAULT_TIMESTAMP_KEY]))) < 3: raise Exception( "please format your dataframe accordingly! df = pm4py.format_dataframe(df, case_id='<name of the case ID column>', activity_key='<name of the activity column>', timestamp_key='<name of the timestamp column>')")