From 8fd90cfc1a2b0ae22aaf95dfed5d5dedb11c68ca Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Thu, 21 Jul 2022 10:37:16 +0300 Subject: [PATCH 1/9] added a new endpoint that get similarity of phases --- .idea/.gitignore | 8 ++++ .idea/modules.xml | 8 ++++ .idea/nlpserver.iml | 8 ++++ .idea/vcs.xml | 6 +++ nlpserver.py | 99 +++++++++++++++++++++++++++++++++------------ 5 files changed, 103 insertions(+), 26 deletions(-) create mode 100644 .idea/.gitignore create mode 100644 .idea/modules.xml create mode 100644 .idea/nlpserver.iml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..9408255 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/nlpserver.iml b/.idea/nlpserver.iml new file mode 100644 index 0000000..c956989 --- /dev/null +++ b/.idea/nlpserver.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/nlpserver.py b/nlpserver.py index 06191bf..13b8484 100644 --- a/nlpserver.py +++ b/nlpserver.py @@ -10,7 +10,7 @@ app = Flask(__name__) # configurations -#app.config['var1'] = 'test' +# app.config['var1'] = 'test' default_data = {} default_data['web64'] = { @@ -19,22 +19,24 @@ 'last_modified': '2019-01-15', 'documentation': 'http://nlpserver.web64.com/', 'github': 'https://github.com/web64/nlp-server', - 'endpoints': ['/status','/gensim/summarize', '/polyglot/neighbours', '/langid', '/polyglot/entities', 
'/polyglot/sentiment', '/newspaper', '/readability', '/spacy/entities', '/afinn'], + 'endpoints': ['/status', '/gensim/summarize', '/gensim/similarity', '/polyglot/neighbours', '/langid', '/polyglot/entities', '/polyglot/sentiment', '/newspaper', '/readability', '/spacy/entities', '/afinn'], } default_data['message'] = 'NLP Server by web64.com' data = default_data + @app.route("/") def main(): return render_template('form.html') - #return jsonify(data) + # return jsonify(data) + @app.route('/status') def status(): data = dict(default_data) data['missing_libraries'] = [] - + try: import textblob except ImportError: @@ -48,7 +50,10 @@ def status(): import gensim except ImportError: data['missing_libraries'].append('gensim') - + try: + import jieba + except ImportError: + data['missing_libraries'].append('jieba') try: import newspaper except ImportError: @@ -63,12 +68,12 @@ def status(): import readability except ImportError: data['missing_libraries'].append('readability') - + try: import bs4 except ImportError: data['missing_libraries'].append('bs4') - + try: import afinn except ImportError: @@ -106,7 +111,7 @@ def spacy_entities(): if request.method == 'GET': return jsonify(data) - params = request.form # postdata + params = request.form # postdata if not params: data['error'] = 'Missing parameters' @@ -121,20 +126,20 @@ def spacy_entities(): else: lang = params['lang'] - nlp = spacy.load( lang ) - doc = nlp( params['text'] ) - data['entities'] = {} - - counters = {} + nlp = spacy.load(lang) + doc = nlp(params['text']) + data['entities'] = {} + + counters = {} for ent in doc.ents: if not ent.label_ in data['entities']: data['entities'][ent.label_] = dict() counters[ent.label_] = 0 else: counters[ent.label_] += 1 - - data['entities'][ ent.label_ ][ counters[ent.label_] ] = ent.text - #data['entities'][ent.label_].add( ent.text ) + + data['entities'][ent.label_][counters[ent.label_]] = ent.text + # data['entities'][ent.label_].add( ent.text ) return jsonify(data) @@ 
-143,13 +148,11 @@ def spacy_entities(): def gensim_summarize(): from gensim.summarization.summarizer import summarize data = dict(default_data) - data['message'] = "Summarize long text - Usage: 'text' POST parameter" + data['message'] = "Summarize long text - Usage: 'text' POST parameter"+request.form['text'] + params = {} - if request.method == 'GET': - return jsonify(data) - - params = request.form # postdata + params = request.form # postdata if not params: data['error'] = 'Missing parameters' @@ -163,17 +166,61 @@ def gensim_summarize(): word_count = None else: word_count = int(params['word_count']) - - data['summarize'] = summarize( text=params['text'], word_count=word_count ) + + data['summarize'] = summarize(text=params['text'], word_count=word_count) return jsonify(data) +@app.route("/gensim/similarity", methods=['GET', 'POST']) +def gensim_similarity(): + + import jieba + + from gensim import corpora, models, similarities + data = dict(default_data) + data['message'] = "get similarity percentage of phases" + + params = {} + + params = request.form # postdata + + phases = [params['as'],'abcd efgh'] + + keyword = params['event'] + + texts = [] + + for phase in phases: + texts.append(list(jieba.cut(phase))) + + dictionary = corpora.Dictionary(texts) + + feature_cnt = len(dictionary.token2id) + + corpus = [dictionary.doc2bow(text) for text in texts] + + tfidf = models.TfidfModel(corpus) + + kw_vector = dictionary.doc2bow( list(jieba.cut(keyword)) ) + + index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt) + + sim = index[tfidf[kw_vector]] + + data['sim']= sim.tolist() + #for i in range(len(sim)): + #data['simlarity'+str((i+1))] = str(sim[i]) + + return jsonify(data) + + @app.route("/polyglot/neighbours", methods=['GET']) def embeddings(): from polyglot.text import Word data = dict(default_data) data['message'] = "Neighbours (Embeddings) - Find neighbors of word API - Parameters: 'word', 'lang' language (default: en)" + params 
= {} params['word']= request.args.get('word') @@ -360,7 +407,7 @@ def readability(): data['readability']['title'] = doc.title() data['readability']['short_title'] = doc.short_title() - #data['readability']['content'] = doc.content() + # data['readability']['content'] = doc.content() data['readability']['article_html'] = doc.summary( html_partial=True ) soup = BeautifulSoup( data['readability']['article_html'] ) @@ -378,7 +425,7 @@ def afinn_sentiment(): data['afinn'] = 0 - #data['afinn'] = afinn.score('This is utterly excellent!') + # data['afinn'] = afinn.score('This is utterly excellent!') params = request.form # postdata @@ -457,7 +504,7 @@ def newspaper(): data['newspaper']['source_url'] = article.source_url data['newspaper']['meta_lang'] = article.meta_lang - #Detect language + # Detect language if len(article.text) > 100: lang_data = langid.classify( article.title + ' ' + article.text ) data['langid']['language'] = lang_data[0] From c66f8086b58a15b476984bf231ec5d6c87b46126 Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Thu, 21 Jul 2022 10:40:40 +0300 Subject: [PATCH 2/9] update readme file --- README.md | 1 + nlpserver.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4d79a5..8945e74 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,7 @@ Endpoint|Method|Parameters|Info|Library /polyglot/neighbours|GET|word,lang|Embeddings: neighbouring words|polyglot /langid|GET,POST|text|Language detection for provided text|langid /gensim/summarize|POST|text,word_count|Summarization of long text|gensim +/gensim/similarity|POST|text1,text2|Similarity percentage of texts|gensim /spacy/entities|POST|text,lang|Entity extraction for provided text in given language|SpaCy ## Usage diff --git a/nlpserver.py b/nlpserver.py index 13b8484..3834baa 100644 --- a/nlpserver.py +++ b/nlpserver.py @@ -185,9 +185,9 @@ def gensim_similarity(): params = request.form # postdata - phases = [params['as'],'abcd efgh'] + phases = 
[params['text1'],'abcd efgh'] - keyword = params['event'] + keyword = params['text2'] texts = [] From ff7728bc96749c1827bbf8c9e2aceb16d42c11d9 Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Fri, 22 Jul 2022 10:45:47 +0300 Subject: [PATCH 3/9] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8945e74..1055317 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NLP Server -

+ NLP Server is a Python 3 Flask web service for easy access to multilingual Natural Language Processing tasks such as language detection, article extraction, entity extraction, sentiment analysis, summarization and more. From c5da9135d6efcafbd2f93beafac1f5986445fc1b Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Sun, 5 Feb 2023 00:13:11 +0300 Subject: [PATCH 4/9] Update nlpserver.py --- nlpserver.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nlpserver.py b/nlpserver.py index 3834baa..a0413c9 100644 --- a/nlpserver.py +++ b/nlpserver.py @@ -313,9 +313,11 @@ def polyglot_sentiment(): else: language = params['lang'] - - polyglot_text = Text(params['text'], hint_language_code=language) - data['sentiment'] = polyglot_text.polarity + try: + polyglot_text = Text(params['text'], hint_language_code=language) + data['sentiment'] = polyglot_text.polarity + except ZeroDivisionError: + data['sentiment'] = 0 return jsonify(data) From 7e95f6dc2621557f683644ea9ccdbf5efa1a9793 Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Sun, 5 Feb 2023 00:37:01 +0300 Subject: [PATCH 5/9] Update nlpserver.py --- nlpserver.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nlpserver.py b/nlpserver.py index a0413c9..21544d7 100644 --- a/nlpserver.py +++ b/nlpserver.py @@ -16,9 +16,8 @@ default_data['web64'] = { 'app': 'nlpserver', 'version': '1.0.1', - 'last_modified': '2019-01-15', - 'documentation': 'http://nlpserver.web64.com/', - 'github': 'https://github.com/web64/nlp-server', + 'last_modified': '2022-01-15', + 'github': 'https://github.com/abdelrahmankhedr/nlpserver', 'endpoints': ['/status', '/gensim/summarize', '/gensim/similarity', '/polyglot/neighbours', '/langid', '/polyglot/entities', '/polyglot/sentiment', '/newspaper', '/readability', '/spacy/entities', '/afinn'], } From 8c6bd3a5c48ef8b6276b69d921f736b91fa6a234 Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Sun, 5 Feb 2023 07:08:51 +0300 Subject: [PATCH 6/9] 
Update nlpserver.py --- nlpserver.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/nlpserver.py b/nlpserver.py index 21544d7..aef64bd 100644 --- a/nlpserver.py +++ b/nlpserver.py @@ -320,6 +320,27 @@ def polyglot_sentiment(): return jsonify(data) +@app.route("/trans/sentiment", methods=['GET','POST']) +def trans_sentiment(): + from transformers import pipeline + data = dict(default_data) + data['message'] = "Sentiment Analysis API - POST only" + data['sentiment'] = {} + + params = request.form # postdata + + if not params: + data['error'] = 'Missing parameters' + return jsonify(data) + + if not params['text']: + data['error'] = 'Text parameter not found' + return jsonify(data) + + classifier = pipeline('sentiment-analysis', model="distilbert-base-uncased-finetuned-sst-2-english") + data['sentiment'] = classifier([params['text']]) + return jsonify(data) + @app.route("/polyglot/entities", methods=['GET','POST']) def polyglot_entities(): from polyglot.text import Text From 93e35a30f62376405810303975bd46b9d2fba6c1 Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Sun, 5 Feb 2023 07:09:42 +0300 Subject: [PATCH 7/9] Update requirements.txt --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c375b42..d4ca16b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,6 @@ readability-lxml BeautifulSoup4 afinn textblob +transformers #summa -#pattern \ No newline at end of file +#pattern From e40511c98636abef7dca41b6d1e16d3ef81deb3c Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Sun, 5 Feb 2023 07:10:58 +0300 Subject: [PATCH 8/9] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 1055317..4d3e5f3 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ Endpoint|Method|Parameters|Info|Library /readability|POST|html|Article extraction for provided HTML|readability-lxml 
/polyglot/entities|POST|text,lang|Entity extraction and sentiment analysis for provided text|polyglot /polyglot/sentiment|POST|text,lang|Sentiment analysis for provided text|polyglot +/trans/sentiment|POST|text|Sentiment analysis for provided text using transformers|transformers /polyglot/neighbours|GET|word,lang|Embeddings: neighbouring words|polyglot /langid|GET,POST|text|Language detection for provided text|langid /gensim/summarize|POST|text,word_count|Summarization of long text|gensim From 5b1ebb5946ca513b6cb2403b1d04d1e9c74d51d4 Mon Sep 17 00:00:00 2001 From: abdelrahmankhedr Date: Sun, 5 Feb 2023 07:11:39 +0300 Subject: [PATCH 9/9] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4d3e5f3..ff2e12e 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ sudo pip3 install readability-lxml sudo pip3 install BeautifulSoup4 sudo pip3 install afinn sudo pip3 install textblob +sudo pip3 install transformers ``` The /status api endpoint will list missing python modules: http://localhost:6400/status