From dbddf0bdc4cd3e88d4ab6d664671dc51626979f6 Mon Sep 17 00:00:00 2001
From: Magdalena
Date: Sun, 18 Aug 2019 07:53:27 +0200
Subject: [PATCH] add docstrings (NumPy style) to functions in utils

---
 multi_rake/utils.py | 54 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/multi_rake/utils.py b/multi_rake/utils.py
index 1ad8079..24cb94e 100644
--- a/multi_rake/utils.py
+++ b/multi_rake/utils.py
@@ -11,6 +11,20 @@
 
 
 def detect_language(text, proba_threshold):
+    """Detect the language code of the input text with cld2.
+
+    Parameters
+    ----------
+    text : str
+        Text whose language should be detected.
+    proba_threshold : float
+        Minimum cld2 probability required to accept the detected language code.
+
+    Returns
+    -------
+    str or None
+        Language code detected by cld2; None if below proba_threshold.
+    """
     _, _, details = cld2.detect(text)
 
     language_code = details[0].language_code
@@ -20,11 +34,35 @@
         return language_code
 
 
-def keep_only_letters(string):
-    return ' '.join(token.group() for token in LETTERS_RE.finditer(string))
+def keep_only_letters(text):
+    """Apply a regex to keep only letters.
+
+    Parameters
+    ----------
+    text : str
+        Text to extract letters from.
+
+    Returns
+    -------
+    str
+        Input text cleaned by the regex so that only letters remain.
+    """
+    return ' '.join(token.group() for token in LETTERS_RE.finditer(text))
 
 
 def separate_words(text):
+    """Separate text into tokens by whitespace and dismiss numeric tokens.
+
+    Parameters
+    ----------
+    text : str
+        Text to tokenize.
+
+    Returns
+    -------
+    list of str
+        Tokenized text.
+    """
     words = []
 
     for word in text.split():
@@ -35,5 +73,17 @@
 
 
 def split_sentences(text):
+    """Split text into sentences at custom regex boundaries.
+
+    Parameters
+    ----------
+    text : str
+        Text to split on sentence delimiters.
+
+    Returns
+    -------
+    list of str
+        Text split into sentences.
+    """
     sentences = SENTENCE_DELIMITERS_RE.split(text)
     return sentences
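
A minimal usage sketch of the four documented helpers, assuming multi_rake is installed so that multi_rake.utils is importable with the signatures shown in the diff; the sample text and the 0.95 threshold are illustrative, and exact results depend on LETTERS_RE, SENTENCE_DELIMITERS_RE, and the installed cld2 version.

    from multi_rake.utils import (
        detect_language,
        keep_only_letters,
        separate_words,
        split_sentences,
    )

    text = 'Compatibility of systems of linear constraints over natural numbers.'

    # Language code such as 'en' when cld2 is confident enough; the docstring
    # notes that a result below proba_threshold is not accepted.
    code = detect_language(text, proba_threshold=0.95)

    # Split the text on the custom sentence-delimiter regex.
    sentences = split_sentences(text)

    # Keep only letters, then tokenize by whitespace, dismissing numeric tokens.
    cleaned = keep_only_letters(text)
    words = separate_words(cleaned)

    print(code, sentences, words)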