Skip to content
Snippets Groups Projects
Commit b5de7ce3 authored by Sivert Kronen Hatteberg's avatar Sivert Kronen Hatteberg
Browse files

Add utils functions for calculating how similar two names are.

Copied from cerebrum, uses restricted damarau levenshtein.

Issue: GREG-68
parent 0fe9d23b
No related branches found
No related tags found
1 merge request!135ID-porten support and better user matching
Pipeline #99378 failed
def restricted_damarau_levenshtein(first: str, second:str) -> int:
"""
Calculate the edit distance between two string using restricted damarau
levenshtein. Allowing deletion, insertion, substitution and transposition
of characters.
"""
first = ' ' + first
second = ' ' + second
dist_matrix = [[0 for i in range(len(first))] for b in range(len(second))]
for i in range(len(second)):
dist_matrix[i][0] = i
for j in range(len(first)):
dist_matrix[0][j] = j
for i in range(1, len(dist_matrix)):
for j in range(1, len(dist_matrix[0])):
possible_choices = [dist_matrix[i-1][j] + 1, #del
dist_matrix[i][j-1] + 1] #ins
if first[j] == second[i]:
possible_choices.append(dist_matrix[i-1][j-1]) #equal
elif first[j] == second[i-1] and first[j-1] == second[i]:
possible_choices.append(dist_matrix[i-2][j-2] + 1) #trans
else:
possible_choices.append(dist_matrix[i-1][j-1] + 1) #sub
dist_matrix[i][j] = min(possible_choices)
return dist_matrix[-1][-1]
def name_diff(full_name1:str, full_name2:str, threshold:int=2) -> int:
"""
Calculate the difference between two names, allowing one name to be
longer than the other by only checking whether all names in the shortest
seem to be included in the longest.
e.g. name_diff('foo bar', 'foo test bar') = 0
If the total difference is larger than a given threshold after any part of
the name, the function will return early. Useful when only close matches
are interesting.
"""
total_difference = 0
names1 = full_name1.split()
names2 = full_name2.split()
if len(names1) > len(names2):
names1, names2 = names2, names1
for n1 in names1:
best_match = (0, 5)
for i, n2 in enumerate(names2, 1):
diff = restricted_damarau_levenshtein(n1, n2)
if diff < best_match[1]:
best_match = (i, diff)
total_difference += best_match[1]
names2 = names2[best_match[0]:]
if total_difference > threshold:
break
return total_difference
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment