From e24ea66cb0548c6e1126694efbf0f6bd6d489ea5 Mon Sep 17 00:00:00 2001 From: sachet-mittal Date: Tue, 3 Oct 2017 21:22:00 -0700 Subject: [PATCH] make code flake8 compliant $ flake8 *.py ExtractMainContent.py:34:80: E501 line too long (144 > 79 characters) ExtractMainContent.py:48:80: E501 line too long (90 > 79 characters) ExtractMainContent.py:50:80: E501 line too long (90 > 79 characters) ExtractMainContent.py:52:80: E501 line too long (97 > 79 characters) ExtractMainContent.py:55:80: E501 line too long (131 > 79 characters) ExtractMainContent.py:57:80: E501 line too long (101 > 79 characters) Extractor.py:87:80: E501 line too long (84 > 79 characters) Main.py:40:80: E501 line too long (120 > 79 characters) Main.py:59:80: E501 line too long (89 > 79 characters) NewsPulling.py:15:80: E501 line too long (94 > 79 characters) NewsPulling.py:31:80: E501 line too long (113 > 79 characters) --- ExtractMainContent.py | 63 +++++++++-------- Extractor.py | 161 +++++++++++++++++++++++------------------- Main.py | 64 +++++++++-------- NewsPulling.py | 101 +++++++++++++------------- configReader.py | 30 ++++---- 5 files changed, 221 insertions(+), 198 deletions(-) diff --git a/ExtractMainContent.py b/ExtractMainContent.py index 2677a1e..0eead5e 100644 --- a/ExtractMainContent.py +++ b/ExtractMainContent.py @@ -6,40 +6,49 @@ """ import requests from configReader import ConfigurationReader -from Extractor import * +import Extractor import textwrap + class ExtractMainContent(object): - def __init__(self,source,articleurl): - self.extractorlist=[HuffingtonPost(),NYT(),BBC(),BloomBerg(),Guardian(),TheHindu(),TimesOfIndia()] - websites=ConfigurationReader().GetWebsiteSupported() - self.Mapping={} - for index,website in enumerate(websites): - self.Mapping[website]=self.extractorlist[index] - self.Source=source - self.url=articleurl - self.textWrap=textwrap.TextWrapper(initial_indent='\t',subsequent_indent='\t',width=100) - - + def __init__(self, source, articleurl): + 
self.extractorlist = [ + Extractor.HuffingtonPost(), + Extractor.NYT(), + Extractor.BBC(), + Extractor.BloomBerg(), + Extractor.Guardian(), + Extractor.TheHindu(), + Extractor.TimesOfIndia()] + websites = ConfigurationReader().GetWebsiteSupported() + self.Mapping = {} + for index, website in enumerate(websites): + self.Mapping[website] = self.extractorlist[index] + self.Source = source + self.url = articleurl + self.textWrap = textwrap.TextWrapper( + initial_indent='\t', subsequent_indent='\t', width=100) + def DownloadContent(self): - headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'} - req=requests.get(self.url,headers=headers) + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'} + req = requests.get(self.url, headers=headers) return req.text - - def AddExtractorList(self,extractor): + + def AddExtractorList(self, extractor): self.extractorlist.append(extractor) - + def Extract(self): - self.ExtractStrategy=self.Mapping[self.Source] - text=self.DownloadContent() + self.ExtractStrategy = self.Mapping[self.Source] + text = self.DownloadContent() return self.ExtractStrategy.ExtractionAlgo(text) - + def Beautify(self): - title,output=self.Extract() + title, output = self.Extract() print "==========================================================================" - print "\t"+title + print "\t" + title print "==========================================================================" - print (self.textWrap.fill(output)) #wrap of the line + print (self.textWrap.fill(output)) # wrap of the line print "*********************************************************************************" print "\n\n" if len(output) == 0: @@ -49,11 +58,7 @@ def Beautify(self): print "\n\n" def FileSave(self): - title,output=self.Extract() - article_file = open(title+".txt","w+") + title, output = 
self.Extract() + article_file = open(title + ".txt", "w+") article_file.write(output.encode('utf-8')) article_file.close() - - - - diff --git a/Extractor.py b/Extractor.py index 2e95af7..7036c48 100644 --- a/Extractor.py +++ b/Extractor.py @@ -6,122 +6,137 @@ """ from bs4 import BeautifulSoup + class Extractor(object): - - def ExtractionAlgo(self,text): + + def ExtractionAlgo(self, text): pass - - def TextExtractionAlgo(self,text,htmlelement,classname): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all(htmlelement, class_=classname) - #print maincontent + + def TextExtractionAlgo(self, text, htmlelement, classname): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all(htmlelement, class_=classname) + # print maincontent for content in maincontent: - scripttags=content.find_all(["script","br","figure","image"]) + scripttags = content.find_all(["script", "br", "figure", "image"]) for scripttag in scripttags: scripttag.extract() - #print content.text + # print content.text Result.append(content.text) - Result=''.join(Result) - return (title,Result) - - - + Result = ''.join(Result) + return (title, Result) + + class HuffingtonPost(Extractor): """class for Huffington Post parsing""" + def __init__(self): Extractor.__init__(self) - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"div","content-list-component text") - - + + def ExtractionAlgo(self, text): + return Extractor.TextExtractionAlgo( + self, text, "div", "content-list-component text") + + class NYT(Extractor): """class for New York Times parsing""" + def __init__(self): Extractor.__init__(self) - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"p","story-body-text story-content") - - + + def ExtractionAlgo(self, text): + return Extractor.TextExtractionAlgo( + self, text, "p", "story-body-text 
story-content") + + class BBC(Extractor): """class for BBC News parsing""" + def __init__(self): Extractor.__init__(self) - - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"div","story-body__inner") - - + + def ExtractionAlgo(self, text): + return Extractor.TextExtractionAlgo( + self, text, "div", "story-body__inner") + + class BloomBerg(Extractor): """class for BloomBerg parsing""" + def __init__(self): Extractor.__init__(self) - - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"div","body-copy") - + + def ExtractionAlgo(self, text): + return Extractor.TextExtractionAlgo(self, text, "div", "body-copy") + + class Guardian(Extractor): """class for Guardian parsing""" + def __init__(self): Extractor.__init__(self) - - def ExtractionAlgo(self,text): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all("div", class_="content__article-body from-content-api js-article__body") - #print maincontent + + def ExtractionAlgo(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all( + "div", class_="content__article-body from-content-api js-article__body") + # print maincontent for content in maincontent: - scripttags=content.find_all(["script","br","figure","image"]) + scripttags = content.find_all(["script", "br", "figure", "image"]) for scripttag in scripttags: scripttag.extract() - #print content.text + # print content.text for foundcontent in content.find_all("p"): Result.append(foundcontent.text) - Result=''.join(Result) - return (title,Result) - + Result = ''.join(Result) + return (title, Result) + + class TheHindu(Extractor): """class for BloomBerg parsing""" + def __init__(self): Extractor.__init__(self) - - def ExtractionAlgo(self,text): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - 
maincontent=soup.find_all("div", class_="article") - #print maincontent + + def ExtractionAlgo(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="article") + # print maincontent for content in maincontent: - scripttags=content.find_all(["script","br","figure","image","span"]) + scripttags = content.find_all( + ["script", "br", "figure", "image", "span"]) for scripttag in scripttags: scripttag.extract() - #print content.text + # print content.text for foundcontent in content.find_all("p"): Result.append(foundcontent.text) - Result=''.join(Result) - return (title,Result) - + Result = ''.join(Result) + return (title, Result) + + class TimesOfIndia(Extractor): """class for BloomBerg parsing""" + def __init__(self): Extractor.__init__(self) - - def ExtractionAlgo(self,text): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all("div", class_="Normal") - #print maincontent + + def ExtractionAlgo(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="Normal") + # print maincontent for content in maincontent: - #print content.text + # print content.text Result.append(content.text) - Result=''.join(Result) - return (title,Result) - - \ No newline at end of file + Result = ''.join(Result) + return (title, Result) diff --git a/Main.py b/Main.py index bea51da..c7a07ae 100644 --- a/Main.py +++ b/Main.py @@ -10,53 +10,57 @@ import sys import codecs +EXIT = 99 +BACK = 66 + def NewsSources(): - NewsSources=ConfigurationReader().GetWebsiteSupported() + NewsSources = ConfigurationReader().GetWebsiteSupported() return NewsSources + def App(): - newsSources=NewsSources() + newsSources = NewsSources() while True: - for i in xrange(len(newsSources)): - print ("["+str(i)+"]" +"\t" +newsSources[i]) - print ("Please 
enter the index of the news source or press 99 to quit") + for i, newsSource in enumerate(newsSources): + print "[%s] \t %s " % (i, newsSource) + print "Please enter the index of the news source or press 99 to quit" try: - newsSourceNumber=raw_input("News Source Number >>>> ") - except ValueError: - print ("That is not a valid News Source Number") - newsSourceNumber=int(newsSourceNumber) - if newsSourceNumber==99: + newsSourceNumber = raw_input("News Source Number >>>> ") + except ValueError: + print "That is not a valid News Source Number" + newsSourceNumber = int(newsSourceNumber) + if newsSourceNumber == EXIT: sys.exit() - if (newsSourceNumber >=len(newsSources)): - print ("Please select the index no less than "+ str(len(newsSources))) - obj=NewsPulling(newsSources[newsSourceNumber]) - Articles=obj.BeautifyArticles(); + if (newsSourceNumber >= len(newsSources)): + print "Please select the index no less than %s" % len(newsSources) + obj = NewsPulling(newsSources[newsSourceNumber]) + Articles = obj.BeautifyArticles() while True: - print ("Do you want to read any story further? If yes, please select the number corresponding to the article") - print ("Press 66 to go back to the main menu") - print ("Press 99 to quit") + print "Do you want to read any story further? 
If yes, please select the number corresponding to the article" + print "Press 66 to go back to the main menu" + print "Press 99 to quit" try: - articleNumber=raw_input("Article No >>>> ") + articleNumber = int(raw_input("Article No >>>> ")) except ValueError: - print ("That is not a valid Article Number") - articleNumber=int(articleNumber) - if articleNumber==99 : + print("That is not a valid Article Number") + continue + if articleNumber == EXIT: sys.exit() - elif articleNumber==66 : + elif articleNumber == BACK: break - elif (articleNumber >= len(Articles)): - print ("Please select the index no less than "+ str(len(Articles))) - #print Articles[articleNumber][2] + elif articleNumber >= len(Articles): + print "Please select the index no less than %s" % len(Articles) else: - extr=ExtractMainContent(newsSources[newsSourceNumber],Articles[articleNumber][2]) + extr = ExtractMainContent(newsSources[newsSourceNumber], + Articles[articleNumber][2]) extr.Beautify() - print ("Do you want to save this article in file") + print("Do you want to save this article in file") YesorNo = int(raw_input("Press 1 to save else press 0 to continue >>> ")) if YesorNo == 1: extr.FileSave() - -if __name__== "__main__": + +if __name__ == "__main__": sys.stdout = codecs.getwriter('utf8')(sys.stdout) - App(); + App() diff --git a/NewsPulling.py b/NewsPulling.py index 0b5e3e5..fc5c852 100644 --- a/NewsPulling.py +++ b/NewsPulling.py @@ -10,19 +10,22 @@ from requests import ConnectionError import sys + class NewsPulling(object): """This class is used to pull news from the internet depending on the source specified """ - def __init__(self,newsSource): - self.Source=newsSource - + + def __init__(self, newsSource): + self.Source = newsSource + def PullNews(self): Configuration = ConfigurationReader() - self.__APIKey=Configuration.GetAPIKEY() - self.__Limit=Configuration.GetLimit() - url='https://newsapi.org/v1/articles?source='+self.Source+'&sortBy=top&apiKey='+self.__APIKey + self.__APIKey = 
Configuration.GetAPIKEY() + self.__Limit = Configuration.GetLimit() + url = 'https://newsapi.org/v1/articles?source=' + \ + self.Source + '&sortBy=top&apiKey=' + self.__APIKey try: - req=requests.get(url) - if(req.status_code==200): + req = requests.get(url) + if(req.status_code == 200): return req else: print "There is some issue in connecting to the internet. Please check your firewall or internet" @@ -30,64 +33,62 @@ def PullNews(self): print "A connection Attempt failed" print e.message sys.exit() - + def JsonRead(self): - req=self.PullNews() + req = self.PullNews() # indicate if we need to convert to utf-8 needsconversion = False if req.encoding != 'utf-8': needsconversion = True - req=req.json() - articles=req['articles'] - noofarticles=len(articles) - maxarticles=min(noofarticles,self.__Limit) - - FilteredArticles=[] - + req = req.json() + articles = req['articles'] + noofarticles = len(articles) + maxarticles = min(noofarticles, self.__Limit) + + FilteredArticles = [] + for i in xrange(maxarticles): - article=articles[i] - #print article + article = articles[i] if needsconversion: - description=unicode(article['description'], 'utf-8') - #print description - title=unicode(article['title'], 'utf-8') - Article_url=unicode(article['url'], 'utf-8') - DateofPublication=unicode(article['publishedAt'], 'utf-8') - Author=unicode(article['author'], 'utf-8') - FilteredArticles.append([description,title,Article_url,DateofPublication,Author]) + description = unicode(article['description'], 'utf-8') + title = unicode(article['title'], 'utf-8') + Article_url = unicode(article['url'], 'utf-8') + DateofPublication = unicode(article['publishedAt'], 'utf-8') + Author = unicode(article['author'], 'utf-8') + FilteredArticles.append([description, + title, + Article_url, + DateofPublication, + Author]) else: - description=article['description'] - #print description - title=article['title'] - Article_url=article['url'] - DateofPublication=article['publishedAt'] - 
Author=article['author'] - FilteredArticles.append([description,title,Article_url,DateofPublication,Author]) + description = article['description'] + title = article['title'] + Article_url = article['url'] + DateofPublication = article['publishedAt'] + Author = article['author'] + FilteredArticles.append([description, + title, + Article_url, + DateofPublication, + Author]) return FilteredArticles - - #jsondict=json.load(req.json()) - #print jsondict - + def BeautifyArticles(self): - self.Articles=self.JsonRead() - if self.Articles is None or len(self.Articles)==0: + self.Articles = self.JsonRead() + if self.Articles is None or len(self.Articles) == 0: print "No articles found" sys.exit() print "=================STORIES==================================" for i in xrange(len(self.Articles)): - print "[" +str(i) +"]", - # print(sequence,end='') used for python 3.x + print "[" + str(i) + "]", + # print(sequence,end='') used for python 3.x if self.Articles[i][1] is not None: - print "\t"+self.Articles[i][1] + print "\t" + self.Articles[i][1] if self.Articles[i][0] is not None: - print "\t"+self.Articles[i][0] + print "\t" + self.Articles[i][0] if self.Articles[i][4] is not None: - print "\t"+self.Articles[i][4] + print "\t" + self.Articles[i][4] if self.Articles[i][3] is not None: - print "\t"+self.Articles[i][3]+"\n" + print "\t" + self.Articles[i][3] + "\n" print "***************************************************************" - return self.Articles - - - - + return self.Articles diff --git a/configReader.py b/configReader.py index a8b90c4..8bf5368 100644 --- a/configReader.py +++ b/configReader.py @@ -6,31 +6,29 @@ """ import yaml -##to do -#implement singleton pattern here + +# to do +# implement singleton pattern here class ConfigurationReader(): - __APIKEY=None - __WebsiteSupported=[] - __Limit=None + __APIKEY = None + __WebsiteSupported = [] + __Limit = None + def __init__(self): with open("config.yml", 'r') as ymlfile: cfg = yaml.load(ymlfile) - 
ConfigurationReader.__APIKEY=cfg['Apikey'] - #print ConfigurationReader.__APIKEY - ConfigurationReader.__Limit=cfg['Limit'] - #print ConfigurationReader.__Limit - ConfigurationReader.__WebsiteSupported=cfg['WebsiteSupported'] - #print ConfigurationReader.__WebsiteSupported - @staticmethod + ConfigurationReader.__APIKEY = cfg['Apikey'] + ConfigurationReader.__Limit = cfg['Limit'] + ConfigurationReader.__WebsiteSupported = cfg['WebsiteSupported'] + + @staticmethod def GetAPIKEY(): return ConfigurationReader.__APIKEY - + @staticmethod def GetLimit(): return ConfigurationReader.__Limit - + @staticmethod def GetWebsiteSupported(): return ConfigurationReader.__WebsiteSupported - - \ No newline at end of file