Source code for pm4py.util.string_distance

'''
    This file is part of PM4Py (More Info: https://pm4py.fit.fraunhofer.de).

    PM4Py is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    PM4Py is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with PM4Py.  If not, see <https://www.gnu.org/licenses/>.
'''
import sys
from typing import List, Union

import stringdist

levenshtein = lambda stru1, stru2: stringdist.levenshtein(stru1, stru2)


[docs]def argmin_levenshtein(stru: str, list_stri: List[str]) -> Union[str, None]: """ Given a string (stru), finds a string in a list of strings (list_stri) that minimizes the Levenshtein distance. Parameters -------------- stru String (that is compared) list_stri List of comparison strings Returns -------------- argmin_dist String (belonging to list_stri) that minimizes the Levenshtein distance with the 'stru' argument """ if list_stri: len_stru = len(stru) argmin_dist = None min_edit_dist = sys.maxsize # sort the strings in the list based on the actual length in comparison # to the provided string list_stri = sorted(list_stri, key=lambda x: abs(len(x) - len_stru)) for comp_stri in list_stri: this_length_diff = abs(len(comp_stri) - len_stru) # to make things faster, breaks whether we reach a string # which distance is greater or equal than the edit distance # with a previous string, because the edit distance would then # be trivially greater or equal in length if this_length_diff >= min_edit_dist: break dist_this_comp = levenshtein(stru, comp_stri) if dist_this_comp < min_edit_dist: argmin_dist = comp_stri min_edit_dist = dist_this_comp return argmin_dist return None
[docs]def argmax_levenshtein(stru: str, list_stri: List[str]) -> Union[str, None]: """ Given a string (stru), finds a string in a list of strings (list_stri) that maximizes the Levenshtein distance. Parameters -------------- stru String (that is compared) list_stri List of comparison strings Returns -------------- argmax_dist String (belonging to list_stri) that maximizes the Levenshtein distance with the 'stru' argument """ if list_stri: len_stru = len(stru) argmax_dist = None max_edit_dist = 0 # sort the strings in the list based on the actual length in comparison # to the provided string list_stri = sorted(list_stri, key=lambda x: -len(x)) for comp_stri in list_stri: # to make things faster, breaks whether the # following condition is reached if len(comp_stri) + len_stru <= max_edit_dist: break dist_this_comp = levenshtein(stru, comp_stri) if dist_this_comp > max_edit_dist: argmax_dist = comp_stri max_edit_dist = dist_this_comp return argmax_dist return None