From 69bb857e36ce01ca3e5e47b5cb8d83952e3955e9 Mon Sep 17 00:00:00 2001 From: aliHitawala Date: Mon, 14 Mar 2016 15:26:18 -0500 Subject: [PATCH] Adding comments in sim function implementation --- py_stringmatching/simfunctions.py | 39 ++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/py_stringmatching/simfunctions.py b/py_stringmatching/simfunctions.py index cab351b..061fb4a 100644 --- a/py_stringmatching/simfunctions.py +++ b/py_stringmatching/simfunctions.py @@ -49,8 +49,10 @@ def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_id >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2 : (int(1 if s1 == s2 else 0))) 4.4 """ + # input validations utils.sim_check_for_none(string1, string2) utils.tok_check_for_string_input(string1, string2) + # if one of the strings is empty return 0 if utils.sim_check_for_empty(string1, string2): return 0 @@ -59,21 +61,24 @@ def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_id M = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) X = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) Y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) - + # DP initialization for i in _range(1, len(string1) + 1): M[i][0] = -float("inf") X[i][0] = gap_start + (i - 1) * gap_continuation Y[i][0] = -float("inf") - + # DP initialization for j in _range(1, len(string2) + 1): M[0][j] = -float("inf") X[0][j] = -float("inf") Y[0][j] = gap_start + (j - 1) * gap_continuation - + # affine gap calculation using DP for i in _range(1, len(string1) + 1): for j in _range(1, len(string2) + 1): + # best score between x_1....x_i and y_1....y_j given that x_i is aligned to y_j M[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(M[i - 1][j - 1], X[i - 1][j - 1], Y[i - 1][j - 1]) + # the best score given that x_i is aligned to a gap X[i][j] = max(gap_start + M[i - 1][j], gap_continuation + X[i - 1][j]) + # 
the best score given that y_j is aligned to a gap Y[i][j] = max(gap_start + M[i][j - 1], gap_continuation + Y[i][j - 1]) return max(M[len(string1)][len(string2)], X[len(string1)][len(string2)], Y[len(string1)][len(string2)]) @@ -311,14 +316,18 @@ def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident): >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0)) 2.5 """ + # input validations utils.sim_check_for_none(string1, string2) utils.sim_check_for_string_inputs(string1, string2) dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) + # DP initialization for i in _range(len(string1) + 1): dist_mat[i, 0] = -(i * gap_cost) + # DP initialization for j in _range(len(string2) + 1): dist_mat[0, j] = -(j * gap_cost) + # Needleman-Wunsch DP calculation for i in _range(1, len(string1) + 1): for j in _range(1, len(string2) + 1): match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1]) @@ -361,11 +370,13 @@ def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident): >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5)) 6.5 """ + # input validations utils.sim_check_for_none(string1, string2) utils.sim_check_for_string_inputs(string1, string2) dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) max_value = 0 + # Smith Waterman DP calculations for i in _range(1, len(string1) + 1): for j in _range(1, len(string2) + 1): match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1]) @@ -551,27 +562,37 @@ def tfidf(bag1, bag2, corpus_list = None, dampen=False): >>> tfidf(['a', 'b', 'a'], ['a']) 0.7071067811865475 """ + # input validations utils.sim_check_for_none(bag1, bag2) utils.sim_check_for_list_or_set_inputs(bag1, bag2) + # if the strings match exactly return 1.0 if utils.sim_check_for_exact_match(bag1, bag2): return 1.0 + # if one of the strings is empty return 0 
if utils.sim_check_for_empty(bag1, bag2): return 0 + # if corpus is not provided, treat the two input bags as the corpus if corpus_list is None: corpus_list = [bag1, bag2] corpus_size = len(corpus_list) + # term frequencies for the input bags tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) + # number of documents in which each element appears element_freq = {} + # set of unique elements total_unique_elements = set() for document in corpus_list: temp_set = set() for element in document: + # add the element only if it is present in one of the two input bags if element in bag1 or element in bag2: temp_set.add(element) total_unique_elements.add(element) + # update element document frequency for this document for element in temp_set: element_freq[element] = element_freq[element]+1 if element in element_freq else 1 idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 + # tfidf calculation for element in total_unique_elements: idf_element = corpus_size * 1.0 / element_freq[element] v_x = 0 if element not in tf_x else (math.log(idf_element) * math.log(tf_x[element] + 1)) if dampen else (idf_element * tf_x[element]) @@ -674,36 +695,48 @@ def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5): References: * Principles of Data Integration book """ + # input validations utils.sim_check_for_none(bag1, bag2) utils.sim_check_for_list_or_set_inputs(bag1, bag2) + # if the strings match exactly return 1.0 if utils.sim_check_for_exact_match(bag1, bag2): return 1.0 + # if one of the strings is empty return 0 if utils.sim_check_for_empty(bag1, bag2): return 0 + # if corpus is not provided, treat the two input bags as the corpus if corpus_list is None: corpus_list = [bag1, bag2] corpus_size = len(corpus_list) * 1.0 + # term frequencies for the input bags tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) + # number of documents in which each element appears element_freq = {} + # set of unique elements total_unique_elements = set() for document in corpus_list: 
temp_set = set() for element in document: + # add the element only if it is present in one of the two input bags if element in bag1 or element in bag2: temp_set.add(element) total_unique_elements.add(element) + # update element document frequency for this document for element in temp_set: element_freq[element] = element_freq[element]+1 if element in element_freq else 1 similarity_map = {} + # compute each bag1 term's best similarity score against the terms of bag2 and build the similarity map for x in bag1: if x not in similarity_map: max_score = 0.0 for y in bag2: score = sim_func(x,y) + # record the similarity only if it is above the threshold and the highest so far for this element if score > threshold and score > max_score: similarity_map[x] = utils.Similarity(x, y, score) max_score = score result, v_x_2, v_y_2 = 0.0, 0.0, 0.0 + # soft-tfidf calculation for element in total_unique_elements: # numerator if element in similarity_map: