# Source code for pm4py.algo.conformance.alignments.edit_distance.variants.edit_distance

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
import difflib
from enum import Enum
from typing import Optional, Dict, Any, List, Set, Union

from pm4py.objects.log.obj import EventLog, Trace
from pm4py.objects.log.util import log_regex
from pm4py.objects.petri_net.utils import align_utils
from pm4py.util import exec_utils
from pm4py.util import string_distance
from pm4py.util import typing
from pm4py.objects.conversion.log import converter as log_converter


class Parameters(Enum):
    """
    Keys accepted in the ``parameters`` dictionary of this variant.
    """
    # When the value associated to this key is true, each trace is paired with
    # the encoded trace selected by string_distance.argmax_levenshtein instead
    # of string_distance.argmin_levenshtein, and the shortcut for traces that
    # already appear in the second log is skipped. Defaults to False when read.
    PERFORM_ANTI_ALIGNMENT = "perform_anti_alignment"
def apply(log1: EventLog, log2: EventLog, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> typing.ListAlignments:
    """
    Aligns each trace of the first log against the second log, minimizing the edit distance

    Parameters
    --------------
    log1
        First log
    log2
        Second log
    parameters
        Parameters of the algorithm, including:
        - Parameters.PERFORM_ANTI_ALIGNMENT => if true, the candidate of the second log is chosen
          with the argmax (instead of argmin) Levenshtein helper (default: False)

    Returns
    ---------------
    aligned_traces
        List that contains, for each trace of the first log, the corresponding alignment.
        Each alignment is a dictionary with the keys "alignment", "cost", "fitness" and "bwc".

    Raises
    ---------------
    ValueError
        If the encoding of the second log contains no trace to align against
    """
    if parameters is None:
        parameters = {}

    log1 = log_converter.apply(log1, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)
    log2 = log_converter.apply(log2, variant=log_converter.Variants.TO_EVENT_LOG, parameters=parameters)

    anti_alignment = exec_utils.get_param_value(Parameters.PERFORM_ANTI_ALIGNMENT, parameters, False)

    # form a mapping dictionary associating each activity of the two logs to a character
    mapping = log_regex.form_encoding_dictio_from_two_logs(log1, log2, parameters=parameters)
    # encode the second log (against which we want to align each trace of the first log)
    list_encodings = log_regex.get_encoded_log(log2, mapping, parameters=parameters)
    # optimization: keep one item per variant
    set_encodings = set(list_encodings)

    # this initial sort helps in reducing the execution time in the following phases,
    # since the expense of all the successive sorts is reduced.
    # The encoding itself is used as secondary key: set iteration order depends on
    # string hashes (randomized per process), so sorting on the length alone would
    # leave equal-length variants in a run-dependent order and make the choice
    # among equally distant candidates non-reproducible.
    if anti_alignment:
        list_encodings = sorted(set_encodings, key=lambda enc: (-len(enc), enc))
    else:
        list_encodings = sorted(set_encodings, key=lambda enc: (len(enc), enc))

    if not list_encodings:
        # same exception type that min() would raise below, with an actionable message
        raise ValueError("cannot align against the second log: it contains no encoded traces")

    # keeps an alignment cache (to avoid re-calculating the same edit distances :) )
    cache_align = {}

    # length of the shortest variant of the second log; added to the length of each
    # trace it gives the denominator of the fitness and the "bwc" value
    best_worst_cost = min(len(enc) for enc in list_encodings)

    aligned_traces = [
        align_trace(trace, list_encodings, set_encodings, mapping, cache_align=cache_align, parameters=parameters)
        for trace in log1
    ]

    # assign fitness to traces
    # NOTE: traces sharing the same encoding share the same (cached) dictionary; they
    # also share the same length, so the values written below are identical for them.
    for trace, align in zip(log1, aligned_traces):
        if align is not None:
            worst_case_moves = len(trace) + best_worst_cost
            unfitness_upper_part = align['cost'] // align_utils.STD_MODEL_LOG_MOVE_COST
            if unfitness_upper_part == 0:
                align['fitness'] = 1
            elif worst_case_moves > 0:
                align['fitness'] = 1 - (unfitness_upper_part / worst_case_moves)
            else:
                align['fitness'] = 0
            align["bwc"] = worst_case_moves * align_utils.STD_MODEL_LOG_MOVE_COST

    return aligned_traces
def align_trace(trace: Trace, list_encodings: List[str], set_encodings: Set[str], mapping: Dict[str, str], cache_align: Optional[Dict[Any, Any]] = None, parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> typing.AlignmentResult:
    """
    Aligns a trace against a list of traces, minimizing the edit distance

    Parameters
    --------------
    trace
        Trace
    list_encodings
        List of encoded traces (the same as set_encodings, but as a list)
    set_encodings
        Set of encoded traces (the same as list_encodings, but as a set), useful to quickly check
        if the provided trace is contained in the traces of the other log
    mapping
        Mapping (of activities to characters)
    cache_align
        Cache of the alignments, keyed by the encoded trace; it is updated in place
    parameters
        Parameters of the algorithm, including:
        - Parameters.PERFORM_ANTI_ALIGNMENT => if true, the candidate is chosen with the argmax
          (instead of argmin) Levenshtein helper (default: False)

    Returns
    --------------
    aligned_trace
        Dictionary with the keys "alignment" (list of moves) and "cost". A move is a couple:
        (activity, ">>") when only the provided trace advances, (">>", activity) when only
        the selected trace of the other log advances, (activity, activity) when both advance.
        The very same dictionary object is returned for traces having the same encoding.
    """
    if parameters is None:
        parameters = {}

    # keeps an alignment cache (to avoid re-calculating the same edit distances :) )
    if cache_align is None:
        cache_align = {}

    # encode the current trace using the mapping dictionary
    encoded_trace = log_regex.get_encoded_trace(trace, mapping, parameters=parameters)

    # cache hit: nothing else needs to be computed
    if encoded_trace in cache_align:
        return cache_align[encoded_trace]

    anti_alignment = exec_utils.get_param_value(Parameters.PERFORM_ANTI_ALIGNMENT, parameters, False)

    if not anti_alignment and encoded_trace in set_encodings:
        # the trace is already in the encodings. we don't need to calculate any edit distance
        argmin_dist = encoded_trace
    else:
        # finds the encoded trace of the other log that is at minimal distance
        # (maximal, when the anti-alignment is requested)
        comparison_function = string_distance.argmax_levenshtein if anti_alignment else string_distance.argmin_levenshtein
        argmin_dist = comparison_function(encoded_trace, list_encodings)

    # only needed on a cache miss, to decode the characters back to the activities
    inv_mapping = {y: x for x, y in mapping.items()}

    # autojunk is disabled: with the default heuristic, when the second sequence has
    # 200+ items every frequently repeated character is dropped from the matching,
    # so long traces with recurring activities would get a largely inflated cost
    seq_match = difflib.SequenceMatcher(None, encoded_trace, argmin_dist, autojunk=False).get_matching_blocks()

    i = 0
    j = 0
    alignment = []
    total_cost = 0

    # the last block returned by get_matching_blocks is a dummy of size 0 placed at
    # the end of both sequences, so the two "while" loops also consume the tails
    for el in seq_match:
        # characters of the provided trace that are skipped before the block
        while i < el.a:
            alignment.append((inv_mapping[encoded_trace[i]], ">>"))
            total_cost += align_utils.STD_MODEL_LOG_MOVE_COST
            i = i + 1
        # characters of the selected trace that are skipped before the block
        while j < el.b:
            alignment.append((">>", inv_mapping[argmin_dist[j]]))
            total_cost += align_utils.STD_MODEL_LOG_MOVE_COST
            j = j + 1
        # matching block: both sequences advance together, no cost is added
        for _ in range(el.size):
            alignment.append((inv_mapping[encoded_trace[i]], inv_mapping[argmin_dist[j]]))
            i = i + 1
            j = j + 1

    align = {"alignment": alignment, "cost": total_cost}
    # saves the alignment in the cache
    cache_align[encoded_trace] = align
    return align