diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..9408255 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/nlpserver.iml b/.idea/nlpserver.iml new file mode 100644 index 0000000..c956989 --- /dev/null +++ b/.idea/nlpserver.iml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index d4d79a5..ff2e12e 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # NLP Server -

+ NLP Server is a Python 3 Flask web service for easy access to multilingual Natural Language Processing tasks such as language detection, article extraction, entity extraction, sentiment analysis, summarization and more. @@ -73,6 +73,7 @@ sudo pip3 install readability-lxml sudo pip3 install BeautifulSoup4 sudo pip3 install afinn sudo pip3 install textblob +sudo pip3 install transformers ``` The /status api endpoint will list missing python modules: http://localhost:6400/status @@ -125,9 +126,11 @@ Endpoint|Method|Parameters|Info|Library /readability|POST|html|Article extraction for provided HTML|readability-lxml /polyglot/entities|POST|text,lang|Entity extraction and sentiment analysis for provided text|polyglot /polyglot/sentiment|POST|text,lang|Sentiment analysis for provided text|polyglot +/trans/sentiment|POST|text,lang|Sentiment analysis for provided text using transformers|transformers /polyglot/neighbours|GET|word,lang|Embeddings: neighbouring words|polyglot /langid|GET,POST|text|Language detection for provided text|langid /gensim/summarize|POST|text,word_count|Summarization of long text|gensim +/gensim/similarity|POST|text1,text2|Similarity percentage of texts|gensim /spacy/entities|POST|text,lang|Entity extraction for provided text in given language|SpaCy ## Usage diff --git a/nlpserver.py b/nlpserver.py index 06191bf..aef64bd 100644 --- a/nlpserver.py +++ b/nlpserver.py @@ -10,31 +10,32 @@ app = Flask(__name__) # configurations -#app.config['var1'] = 'test' +# app.config['var1'] = 'test' default_data = {} default_data['web64'] = { 'app': 'nlpserver', 'version': '1.0.1', - 'last_modified': '2019-01-15', - 'documentation': 'http://nlpserver.web64.com/', - 'github': 'https://github.com/web64/nlp-server', - 'endpoints': ['/status','/gensim/summarize', '/polyglot/neighbours', '/langid', '/polyglot/entities', '/polyglot/sentiment', '/newspaper', '/readability', '/spacy/entities', '/afinn'], + 'last_modified': '2022-01-15', + 'github': 
'https://github.com/abdelrahmankhedr/nlpserver', + 'endpoints': ['/status', '/gensim/summarize', '/gensim/similarity', '/polyglot/neighbours', '/langid', '/polyglot/entities', '/polyglot/sentiment', '/newspaper', '/readability', '/spacy/entities', '/afinn'], } default_data['message'] = 'NLP Server by web64.com' data = default_data + @app.route("/") def main(): return render_template('form.html') - #return jsonify(data) + # return jsonify(data) + @app.route('/status') def status(): data = dict(default_data) data['missing_libraries'] = [] - + try: import textblob except ImportError: @@ -48,7 +49,10 @@ def status(): import gensim except ImportError: data['missing_libraries'].append('gensim') - + try: + import jieba + except ImportError: + data['missing_libraries'].append('jieba') try: import newspaper except ImportError: @@ -63,12 +67,12 @@ def status(): import readability except ImportError: data['missing_libraries'].append('readability') - + try: import bs4 except ImportError: data['missing_libraries'].append('bs4') - + try: import afinn except ImportError: @@ -106,7 +110,7 @@ def spacy_entities(): if request.method == 'GET': return jsonify(data) - params = request.form # postdata + params = request.form # postdata if not params: data['error'] = 'Missing parameters' @@ -121,20 +125,20 @@ def spacy_entities(): else: lang = params['lang'] - nlp = spacy.load( lang ) - doc = nlp( params['text'] ) - data['entities'] = {} - - counters = {} + nlp = spacy.load(lang) + doc = nlp(params['text']) + data['entities'] = {} + + counters = {} for ent in doc.ents: if not ent.label_ in data['entities']: data['entities'][ent.label_] = dict() counters[ent.label_] = 0 else: counters[ent.label_] += 1 - - data['entities'][ ent.label_ ][ counters[ent.label_] ] = ent.text - #data['entities'][ent.label_].add( ent.text ) + + data['entities'][ent.label_][counters[ent.label_]] = ent.text + # data['entities'][ent.label_].add( ent.text ) return jsonify(data) @@ -143,13 +147,11 @@ def 
spacy_entities(): def gensim_summarize(): from gensim.summarization.summarizer import summarize data = dict(default_data) - data['message'] = "Summarize long text - Usage: 'text' POST parameter" + data['message'] = "Summarize long text - Usage: 'text' POST parameter"+request.form['text'] + params = {} - if request.method == 'GET': - return jsonify(data) - - params = request.form # postdata + params = request.form # postdata if not params: data['error'] = 'Missing parameters' @@ -163,17 +165,61 @@ def gensim_summarize(): word_count = None else: word_count = int(params['word_count']) - - data['summarize'] = summarize( text=params['text'], word_count=word_count ) + + data['summarize'] = summarize(text=params['text'], word_count=word_count) return jsonify(data) +@app.route("/gensim/similarity", methods=['GET', 'POST']) +def gensim_similarity(): + + import jieba + + from gensim import corpora, models, similarities + data = dict(default_data) + data['message'] = "get similarity percentage of phases" + + params = {} + + params = request.form # postdata + + phases = [params['text1'],'abcd efgh'] + + keyword = params['text2'] + + texts = [] + + for phase in phases: + texts.append(list(jieba.cut(phase))) + + dictionary = corpora.Dictionary(texts) + + feature_cnt = len(dictionary.token2id) + + corpus = [dictionary.doc2bow(text) for text in texts] + + tfidf = models.TfidfModel(corpus) + + kw_vector = dictionary.doc2bow( list(jieba.cut(keyword)) ) + + index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features = feature_cnt) + + sim = index[tfidf[kw_vector]] + + data['sim']= sim.tolist() + #for i in range(len(sim)): + #data['simlarity'+str((i+1))] = str(sim[i]) + + return jsonify(data) + + @app.route("/polyglot/neighbours", methods=['GET']) def embeddings(): from polyglot.text import Word data = dict(default_data) data['message'] = "Neighbours (Embeddings) - Find neighbors of word API - Parameters: 'word', 'lang' language (default: en)" + params = {} params['word']= 
request.args.get('word') @@ -266,12 +312,35 @@ def polyglot_sentiment(): else: language = params['lang'] - - polyglot_text = Text(params['text'], hint_language_code=language) - data['sentiment'] = polyglot_text.polarity + try: + polyglot_text = Text(params['text'], hint_language_code=language) + data['sentiment'] = polyglot_text.polarity + except ZeroDivisionError: + data['sentiment'] = 0 return jsonify(data) +@app.route("/trans/sentiment", methods=['GET','POST']) +def trans_sentiment(): + from transformers import pipeline + data = dict(default_data) + data['message'] = "Sentiment Analysis API - POST only" + data['sentiment'] = {} + + params = request.form # postdata + + if not params: + data['error'] = 'Missing parameters' + return jsonify(data) + + if not params['text']: + data['error'] = 'Text parameter not found' + return jsonify(data) + + classifier = pipeline('sentiment-analysis', model="distilbert-base-uncased-finetuned-sst-2-english") + data['sentiment'] = classifier([params['text']]) + return jsonify(data) + @app.route("/polyglot/entities", methods=['GET','POST']) def polyglot_entities(): from polyglot.text import Text @@ -360,7 +429,7 @@ def readability(): data['readability']['title'] = doc.title() data['readability']['short_title'] = doc.short_title() - #data['readability']['content'] = doc.content() + # data['readability']['content'] = doc.content() data['readability']['article_html'] = doc.summary( html_partial=True ) soup = BeautifulSoup( data['readability']['article_html'] ) @@ -378,7 +447,7 @@ def afinn_sentiment(): data['afinn'] = 0 - #data['afinn'] = afinn.score('This is utterly excellent!') + # data['afinn'] = afinn.score('This is utterly excellent!') params = request.form # postdata @@ -457,7 +526,7 @@ def newspaper(): data['newspaper']['source_url'] = article.source_url data['newspaper']['meta_lang'] = article.meta_lang - #Detect language + # Detect language if len(article.text) > 100: lang_data = langid.classify( article.title + ' ' + 
article.text ) data['langid']['language'] = lang_data[0] diff --git a/requirements.txt b/requirements.txt index c375b42..d4ca16b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,5 +12,6 @@ readability-lxml BeautifulSoup4 afinn textblob +transformers #summa -#pattern \ No newline at end of file +#pattern