Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 36 additions & 3 deletions py_stringmatching/simfunctions.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,10 @@ def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_id
>>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2 : (int(1 if s1 == s2 else 0)))
4.4
"""
# input validations
utils.sim_check_for_none(string1, string2)
utils.tok_check_for_string_input(string1, string2)
# if one of the strings is empty return 0
if utils.sim_check_for_empty(string1, string2):
return 0

Expand All @@ -59,21 +61,24 @@ def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_id
M = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
X = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
Y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)

# DP initialization
for i in _range(1, len(string1) + 1):
M[i][0] = -float("inf")
X[i][0] = gap_start + (i - 1) * gap_continuation
Y[i][0] = -float("inf")

# DP initialization
for j in _range(1, len(string2) + 1):
M[0][j] = -float("inf")
X[0][j] = -float("inf")
Y[0][j] = gap_start + (j - 1) * gap_continuation

# affine gap calculation using DP
for i in _range(1, len(string1) + 1):
for j in _range(1, len(string2) + 1):
# best score for x_1..x_i vs y_1..y_j with x_i aligned to y_j
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[flake8]

  • [E501] line too long (91 > 79 characters)

M[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(M[i - 1][j - 1], X[i - 1][j - 1], Y[i - 1][j - 1])
# the best score given that x_i is aligned to a gap
X[i][j] = max(gap_start + M[i - 1][j], gap_continuation + X[i - 1][j])
# the best score given that y_j is aligned to a gap
Y[i][j] = max(gap_start + M[i][j - 1], gap_continuation + Y[i][j - 1])
return max(M[len(string1)][len(string2)], X[len(string1)][len(string2)], Y[len(string1)][len(string2)])

Expand Down Expand Up @@ -311,14 +316,18 @@ def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident):
>>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
2.5
"""
# input validations
utils.sim_check_for_none(string1, string2)
utils.sim_check_for_string_inputs(string1, string2)

dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
# DP initialization
for i in _range(len(string1) + 1):
dist_mat[i, 0] = -(i * gap_cost)
# DP initialization
for j in _range(len(string2) + 1):
dist_mat[0, j] = -(j * gap_cost)
# Needleman-Wunsch DP calculation
for i in _range(1, len(string1) + 1):
for j in _range(1, len(string2) + 1):
match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
Expand Down Expand Up @@ -361,11 +370,13 @@ def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident):
>>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
6.5
"""
# input validations
utils.sim_check_for_none(string1, string2)
utils.sim_check_for_string_inputs(string1, string2)

dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
max_value = 0
# Smith Waterman DP calculations
for i in _range(1, len(string1) + 1):
for j in _range(1, len(string2) + 1):
match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
Expand Down Expand Up @@ -551,27 +562,37 @@ def tfidf(bag1, bag2, corpus_list = None, dampen=False):
>>> tfidf(['a', 'b', 'a'], ['a'])
0.7071067811865475
"""
# input validations
utils.sim_check_for_none(bag1, bag2)
utils.sim_check_for_list_or_set_inputs(bag1, bag2)
# if the two bags match exactly, return 1.0
if utils.sim_check_for_exact_match(bag1, bag2):
return 1.0
# if one of the bags is empty, return 0
if utils.sim_check_for_empty(bag1, bag2):
return 0
# if no corpus is provided, treat the two input bags as the corpus
if corpus_list is None:
corpus_list = [bag1, bag2]
corpus_size = len(corpus_list)
# term frequencies of the two input bags
tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
# number of documents in which each element appears
element_freq = {}
# set of unique elements
total_unique_elements = set()
for document in corpus_list:
temp_set = set()
for element in document:
# add the element only if it occurs in one of the two input bags
if element in bag1 or element in bag2:
temp_set.add(element)
total_unique_elements.add(element)
# update element document frequency for this document
for element in temp_set:
element_freq[element] = element_freq[element]+1 if element in element_freq else 1
idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
# tfidf calculation
for element in total_unique_elements:
idf_element = corpus_size * 1.0 / element_freq[element]
v_x = 0 if element not in tf_x else (math.log(idf_element) * math.log(tf_x[element] + 1)) if dampen else (idf_element * tf_x[element])
Expand Down Expand Up @@ -674,36 +695,48 @@ def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5):
References:
* Principles of Data Integration book
"""
# input validations
utils.sim_check_for_none(bag1, bag2)
utils.sim_check_for_list_or_set_inputs(bag1, bag2)
# if the two bags match exactly, return 1.0
if utils.sim_check_for_exact_match(bag1, bag2):
return 1.0
# if one of the bags is empty, return 0
if utils.sim_check_for_empty(bag1, bag2):
return 0
# if no corpus is provided, treat the two input bags as the corpus
if corpus_list is None:
corpus_list = [bag1, bag2]
corpus_size = len(corpus_list) * 1.0
# term frequencies of the two input bags
tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
# number of documents in which each element appears
element_freq = {}
# set of unique elements
total_unique_elements = set()
for document in corpus_list:
temp_set = set()
for element in document:
# add the element only if it occurs in one of the two input bags
if element in bag1 or element in bag2:
temp_set.add(element)
total_unique_elements.add(element)
# update element document frequency for this document
for element in temp_set:
element_freq[element] = element_freq[element]+1 if element in element_freq else 1
similarity_map = {}
# build the similarity map: best bag2 match for each bag1 term
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[flake8]

  • [E501] line too long (89 > 79 characters)

for x in bag1:
if x not in similarity_map:
max_score = 0.0
for y in bag2:
score = sim_func(x,y)
# keep only the best score above threshold for this element
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[flake8]

  • [E501] line too long (87 > 79 characters)

if score > threshold and score > max_score:
similarity_map[x] = utils.Similarity(x, y, score)
max_score = score
result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
# soft-tfidf calculation
for element in total_unique_elements:
# numerator
if element in similarity_map:
Expand Down