text-normalization/num_base.py at master · soshial/text-normalization · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# coding: utf-8
__author__ = 'soshial'

import re

class NumBase(object):
    language = None

    def __init__(self,language,logger):
        self.language = language
        # setting right numword classes
        if language == 'de': from numword import numword_de; self.numword = numword_de.NumWordDE()
        elif language == 'en': from numword import numword_en; self.numword = numword_en.NumWordEN()
        elif language == 'es': from numword import numword_es; self.numword = numword_es.NumWordES()
        elif language == 'fr': from numword import numword_fr; self.numword = numword_fr.NumWordFR()
        elif language == 'ru': from numword import numword_ru; self.numword = numword_ru.NumWordRU()
        else: from numword import numword_en as numword

        self.logger = logger
        self.decades = {}
        self.from_to = u"{from}/{to}"
        self.endings = {}
        self.roman_num_map =  [('M', 1000, 3), ('CM', 900, 1), ('D', 500, 1), ('CD', 400, 1),
                               ('C', 100, 3), ('XC', 90, 1), ('L', 50, 1), ('XL', 40, 1),
                               ('X', 10, 3), ('IX', 9, 1), ('V', 5, 1), ('IV', 4, 1), ('I', 1, 3)]
        self.plus = None
        self.degree= None
        self.number= None
        self.months = None

    def get_canonical_number_from_string(self,clean_number_string):
        """Converting @str into float or long number"""
        import math,re
        if re.search("[.,]",clean_number_string) and not math.isnan(float(clean_number_string)) and not math.isinf(float(clean_number_string)):
            return float(clean_number_string)
        elif not math.isnan(long(clean_number_string)) and not math.isinf(long(clean_number_string)):
            return long(clean_number_string)
        else:
            raise ValueError

    def check_and_convert_into_number(self,str,details):
        """This function checks the string for being number and returns spelled numeral in a string"""
        if not re.search("\d",str): # if numbers are not present then we just return the string back
            return str
        elif re.search("^-?\d+(\.|,)?\d*$",str): # if it is a canonic number string: '-30.3879'
            clean_number_string = re.sub(',','',str)
        else: # if it is some garbage
            if re.search("^.?-?\d+(\.|,)?\d*.?$",str): clean_number_string = re.sub("[^\d.-]","",str) # just cleaned number: # -30 ,.3879 -> 303879
            else: clean_number_string = re.sub("\D","",str) # just cleaned number: # -30 ,.3879 -> 303879
        try:
            canonical_number = self.get_canonical_number_from_string(clean_number_string) # cleaned number converting into long/float
        except StandardError:
            self.logger.info('Problem with processing type of variable (long, float): ' + str + ' -> ' + clean_number_string)
            raise StandardError
        # main processing of the word
        # usual/currency/year/temperature/time
        # todo 40-40 - счёт?
        # todo 1700s
        # todo 40x200
        # todo 6’2″
        type = "-"
#        if self.language == 'ru':
#            gr_case,gr_num,gr_gend,type = self.detect_inflection(details)
#            #print "@___слово:",str,repr(details).decode("unicode-escape"),"  // ",repr(gr_case).decode("unicode-escape"),repr(gr_gend).decode("unicode-escape"),repr(gr_num).decode("unicode-escape"),type
#            if len(gr_case):
#                if type == 'ord':
#                    self.numword.inflection_case = gr_case.pop() + u"," + gr_num.pop() + u"," + gr_gend.pop()
#                else:
#                    self.numword.inflection_case = gr_case.pop()
#            else: self.numword.inflection_case = u"им"
            #print self.numword.inflection_case
        if re.search("^\d+$",str): # simplest natural number
            if 1800 < canonical_number < 2000: return self.numword.year(canonical_number) # a year
            elif type == "ord" or self.is_date_near(details):
                return self.numword.ordinal(canonical_number) # the 1 of March -> the first of March
            else: return self.numword.cardinal(canonical_number) # usual number
        elif re.search("^-?\d+(\.|,)?\d*$",str):
            return self.numword.cardinal(canonical_number) # usual number
        elif re.search("^\d{4}-\d{2,4}$",str): # "1982-(19)95" -> "from 1982 to 1995"
            def daterepl(matchobj):
                try:
                    self.from_to.split("/")
                    return self.from_to.split("/")[0] + " " + self.numword.cardinal(self.get_canonical_number_from_string(matchobj.group(1)))\
                           + " " + self.from_to.split("/")[1] + " " + self.numword.cardinal(self.get_canonical_number_from_string(matchobj.group(2)))
                except StandardError:
                    self.logger.info('Problem with processing type of 2 variables (e.g. 1763-98): ' + str)
                    raise StandardError
            return re.sub("(\d{4})-(\d{4})",daterepl,str,0)
        elif re.search("(\d\d\d\d-\d\d-\d\d)|(\d\d-\d\d-\d\d\d\d)|(\d\d\d\d/\d\d/\d\d)|(\d\d/\d\d/\d\d\d\d)",str):
            #print "#1"
            self.logger.info('Omitting dates: ' + str)
            raise StandardError # 2011-10-17, 12/02/1997 are omitted, but the sentences are not
        elif re.search("^(\+?\d?[\(]\d{3}[\)][\.| |\-]?|^\d{3}[\.|\-| ]?)?\d{3}(\.|\-| )?\d\d(\.|\-| )?\d\d$",str):
            #print "#2"
            self.logger.info('Omitting phone number: ' + str)
            raise StandardError # the phone numbers like (607)-432-1000 (916) 934-45-54
        elif str in self.decades:
            #print "#3"
            return self.decades[str]
        elif re.search(u"^[#№]\d+$",str):
            return self.number+" "+self.numword.cardinal(canonical_number)
        elif re.search("^[$¢€£]-?\d+(\.|,)?\d*$",str): # currencies
            return self.numword.currency(canonical_number)
        elif re.search("^[IVXLCM]{2,}$",str): # roman numerals
            #print "#4"
            return self.numword.cardinal(self.roman_to_int(str))
        elif re.search(u"^-?\d+(°|°?C|°?F)$",str): # temperature
            return self.temperature(long(re.sub("[^\d-]","",str)))
        elif re.search("^-?\d+(\.|,)?\d*.$",str):
            #print "#5"
            if str.endswith(u"k"): return self.numword.cardinal(canonical_number*1000)
            elif str.endswith(u"m"): return self.numword.cardinal(canonical_number*1000000)
            # todo if "60s".endswith(u"s"): 60 seconds or sixties
            elif str.endswith(u"%"): return self.percentage(canonical_number)
            elif str.endswith(u"‰"): return self.percentage(canonical_number,1)
            elif str.endswith(u"‱"): return self.percentage(canonical_number,2)
            elif str.endswith(u"+"): return self.plus.split("/")[0] + " " + self.numword.cardinal(canonical_number) + " " + self.plus.split("/")[1]
            elif str.endswith(u"x"): return self.numword.cardinal(canonical_number) + u" times"
            else: return self.numword.cardinal(canonical_number) # should return self.numword.cardinal(self.short_endings(str))
        elif not self.ordinals(str) is False: # 21st, 9th, 1092nd
            #print "#6"
            return self.ordinals(str)
        # todo localization
        elif re.search("^\d+-?("+"|".join(self.endings)+")$",str):
            #print "#7",str
            return self.complex_endings(str,canonical_number)
        elif re.search("^-?[\d. ]+$",str): # if we have just clean numbers with math symbols or spaces
            #print "#8"
            print u"WARNING!!!_with", str
            self.logger.info("WARNING!!!_with", str + clean_number_string)
            return self.numword.cardinal(canonical_number)
        else:
            #print "#9"
            return re.sub("[\d.-]+",self.numword.cardinal(canonical_number),clean_number_string)

    def int_to_roman(self,i):
        result = []
        for numeral, integer, max_count in self.roman_num_map:
            count = int(i / integer)
            result.append(numeral * count)
            i -= integer * count
        return ''.join(result)

    def roman_to_int(self,str):
        i = result = 0
        repeating = []
        for numeral, integer, max_count in self.roman_num_map:
            repeating[numeral] = 0
            while str[i:i + len(numeral)] == numeral:
                result += integer
                repeating[numeral] += 1
                i += len(numeral)
            if repeating[numeral] > max_count: raise StandardError
        return result

    def date(self,str):
        pass

    def phone(self,str):
        pass

    def ordinals(self,str):
        pass

    def percentage(self,str,power=0):
        pass

    def temperature(self,str):
        pass

    def short_endings(self,str):
        return str

    def complex_endings(self,str,number):
        pass

    def detect_inflection(self,details):
        pass

    def is_date_near(self,details):
        pass