kvpradap · alihitawala · Mar 14, 2016 · kvpradap · Mar 14, 2016 · kvpradap
diff --git a/py_stringmatching/simfunctions.py b/py_stringmatching/simfunctions.py
@@ -49,8 +49,10 @@ def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_id
         >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2 : (int(1 if s1 == s2 else 0)))
         4.4
     """
+    # input validations
     utils.sim_check_for_none(string1, string2)
     utils.tok_check_for_string_input(string1, string2)
+    # if one of the strings is empty return 0
     if utils.sim_check_for_empty(string1, string2):
         return 0
 
@@ -59,21 +61,24 @@ def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_id
     M = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
     X = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
     Y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
-
+    # DP initialization: first column (prefixes of string1 aligned only against gaps)
     for i in _range(1, len(string1) + 1):
         M[i][0] = -float("inf")
         X[i][0] = gap_start + (i - 1) * gap_continuation
         Y[i][0] = -float("inf")
-
+    # DP initialization: first row (prefixes of string2 aligned only against gaps)
     for j in _range(1, len(string2) + 1):
         M[0][j] = -float("inf")
         X[0][j] = -float("inf")
         Y[0][j] = gap_start + (j - 1) * gap_continuation
-
+    # affine gap calculation using DP
     for i in _range(1, len(string1) + 1):
         for j in _range(1, len(string2) + 1):
+            # best score between x_1....x_i and y_1....y_j given that x_i is aligned to y_j
             M[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(M[i - 1][j - 1], X[i - 1][j - 1], Y[i - 1][j - 1])
+            # the best score given that x_i is aligned to a gap
             X[i][j] = max(gap_start + M[i - 1][j], gap_continuation + X[i - 1][j])
+            # the best score given that y_j is aligned to a gap
             Y[i][j] = max(gap_start + M[i][j - 1], gap_continuation + Y[i][j - 1])
     return max(M[len(string1)][len(string2)], X[len(string1)][len(string2)], Y[len(string1)][len(string2)])
 
@@ -311,14 +316,18 @@ def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident):
         >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
         2.5
     """
+    # input validations
     utils.sim_check_for_none(string1, string2)
     utils.sim_check_for_string_inputs(string1, string2)
 
     dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
+    # DP initialization: first column (cumulative gap cost for prefixes of string1)
     for i in _range(len(string1) + 1):
         dist_mat[i, 0] = -(i * gap_cost)
+    # DP initialization: first row (cumulative gap cost for prefixes of string2)
     for j in _range(len(string2) + 1):
         dist_mat[0, j] = -(j * gap_cost)
+    # Needleman-Wunsch DP calculation
     for i in _range(1, len(string1) + 1):
         for j in _range(1, len(string2) + 1):
             match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
@@ -361,11 +370,13 @@ def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident):
         >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5))
         6.5
     """
+    # input validations
     utils.sim_check_for_none(string1, string2)
     utils.sim_check_for_string_inputs(string1, string2)
 
     dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float)
     max_value = 0
+    # Smith Waterman DP calculations
     for i in _range(1, len(string1) + 1):
         for j in _range(1, len(string2) + 1):
             match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1])
@@ -551,27 +562,37 @@ def tfidf(bag1, bag2, corpus_list = None, dampen=False):
         >>> tfidf(['a', 'b', 'a'], ['a'])
         0.7071067811865475
     """
+    # input validations
     utils.sim_check_for_none(bag1, bag2)
     utils.sim_check_for_list_or_set_inputs(bag1, bag2)
+    # if the strings match exactly return 1.0
     if utils.sim_check_for_exact_match(bag1, bag2):
         return 1.0
+    # if one of the strings is empty return 0
     if utils.sim_check_for_empty(bag1, bag2):
         return 0
+    # if no corpus is provided, treat the two input bags as the corpus
     if corpus_list is None:
         corpus_list = [bag1, bag2]
     corpus_size = len(corpus_list)
+    # term frequency for input strings
     tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
+    # number of documents in which each element appears
     element_freq = {}
+    # set of unique elements
     total_unique_elements = set()
     for document in corpus_list:
         temp_set = set()
         for element in document:
+            # add the element only if it is present in at least one of the two input bags
             if element in bag1 or element in bag2:
                 temp_set.add(element)
                 total_unique_elements.add(element)
+        # update element document frequency for this document
         for element in temp_set:
             element_freq[element] = element_freq[element]+1 if element in element_freq else 1
     idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+    # tfidf calculation
     for element in total_unique_elements:
         idf_element = corpus_size * 1.0 / element_freq[element]
         v_x = 0 if element not in tf_x else (math.log(idf_element) * math.log(tf_x[element] + 1)) if dampen else (idf_element * tf_x[element])
@@ -674,36 +695,48 @@ def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5):
     References:
         * Principles of Data Integration book
     """
+    # input validations
     utils.sim_check_for_none(bag1, bag2)
     utils.sim_check_for_list_or_set_inputs(bag1, bag2)
+    # if the strings match exactly return 1.0
     if utils.sim_check_for_exact_match(bag1, bag2):
         return 1.0
+    # if one of the strings is empty return 0
     if utils.sim_check_for_empty(bag1, bag2):
         return 0
+    # if no corpus is provided, treat the two input bags as the corpus
     if corpus_list is None:
         corpus_list = [bag1, bag2]
     corpus_size = len(corpus_list) * 1.0
+    # term frequency for input strings
     tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2)
+    # number of documents in which each element appears
     element_freq = {}
+    # set of unique elements
     total_unique_elements = set()
     for document in corpus_list:
         temp_set = set()
         for element in document:
+            # add the element only if it is present in at least one of the two input bags
             if element in bag1 or element in bag2:
                 temp_set.add(element)
                 total_unique_elements.add(element)
+        # update element document frequency for this document
         for element in temp_set:
             element_freq[element] = element_freq[element]+1 if element in element_freq else 1
     similarity_map = {}
+    # build the similarity map: for each element of bag1, score it against every element of bag2
     for x in bag1:
         if x not in similarity_map:
             max_score = 0.0
             for y in bag2:
                 score = sim_func(x,y)
+                # adding sim only if it is above threshold and highest for this element
                 if score > threshold and score > max_score:
                     similarity_map[x] = utils.Similarity(x, y, score)
                     max_score = score
     result, v_x_2, v_y_2 = 0.0, 0.0, 0.0
+    # soft-tfidf calculation
     for element in total_unique_elements:
         # numerator
         if element in similarity_map: