# Source code for pm4py.ml

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
from typing import Union, Tuple
import pandas as pd
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.util import constants
import random


def split_train_test(log: Union[EventLog, pd.DataFrame], train_percentage: float = 0.8) -> Union[
    Tuple[EventLog, EventLog], Tuple[pd.DataFrame, pd.DataFrame]]:
    """
    Split an event log in a training log and a test log (for machine learning purposes)

    For a Pandas dataframe the split is done per case: every distinct case
    identifier gets one independent random draw and goes to the training set
    when the draw is <= train_percentage. The resulting proportion is therefore
    only approximately train_percentage, and all the events of a case always
    end up in the same partition.

    Parameters
    --------------
    log
        Event log / Pandas dataframe
    train_percentage
        Fraction of traces to be included in the training log (from 0.0 to 1.0)

    Returns
    --------------
    training_log
        Training event log
    test_log
        Test event log

    Raises
    --------------
    Exception
        If the provided object is not a Pandas dataframe, an EventLog or an EventStream
    """
    # isinstance (instead of an exact type() comparison) so that subclasses of
    # the supported containers are accepted as well
    if not isinstance(log, (pd.DataFrame, EventLog, EventStream)):
        raise Exception("the method can be applied only to a traditional event log!")

    if isinstance(log, pd.DataFrame):
        case_column = log[constants.CASE_CONCEPT_NAME]

        # unique() yields the identifiers in order of first appearance. Iterating
        # that sequence directly (rather than a set built from it) keeps the
        # case -> random draw pairing stable, so that seeding the "random"
        # module gives a reproducible split.
        train_cases = set()
        test_cases = set()
        for case_id in case_column.unique():
            if random.random() <= train_percentage:
                train_cases.add(case_id)
            else:
                test_cases.add(case_id)

        train_df = log[case_column.isin(train_cases)]
        test_df = log[case_column.isin(test_cases)]
        return train_df, test_df

    # aliased on import: the utility module has the same name as this function
    from pm4py.objects.log.util import split_train_test as log_splitter
    return log_splitter.split(log, train_percentage=train_percentage)


def get_prefixes_from_log(log: Union[EventLog, pd.DataFrame], length: int) -> Union[EventLog, pd.DataFrame]:
    """
    Gets the prefixes of a log of a given length

    Parameters
    ----------------
    log
        Event log / Pandas dataframe
    length
        Length

    Returns
    ----------------
    prefix_log
        Log containing the prefixes:
        - if a trace has lower or identical length, it is included as-is
        - if a trace has greater length, it is cut

    Raises
    ----------------
    Exception
        If the provided object is not a Pandas dataframe, an EventLog or an EventStream
    """
    # isinstance (instead of an exact type() comparison) so that subclasses of
    # the supported containers are accepted as well
    if not isinstance(log, (pd.DataFrame, EventLog, EventStream)):
        raise Exception("the method can be applied only to a traditional event log!")

    if isinstance(log, pd.DataFrame):
        from pm4py.util import pandas_utils

        # The helper returns a dataframe carrying the position of each event
        # inside its case (presumably zero-based - verify against pandas_utils).
        # That column is part of the returned dataframe.
        indexed_df = pandas_utils.insert_ev_in_tr_index(log)
        within_prefix = indexed_df[constants.DEFAULT_INDEX_IN_TRACE_KEY] <= (length - 1)
        return indexed_df[within_prefix]

    from pm4py.objects.log.util import get_prefixes
    return get_prefixes.get_prefixes_from_log(log, length)