Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 34 additions & 29 deletions ExtractMainContent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,49 @@
"""
import requests
from configReader import ConfigurationReader
from Extractor import *
import Extractor
import textwrap


class ExtractMainContent(object):
def __init__(self,source,articleurl):
self.extractorlist=[HuffingtonPost(),NYT(),BBC(),BloomBerg(),Guardian(),TheHindu(),TimesOfIndia()]
websites=ConfigurationReader().GetWebsiteSupported()
self.Mapping={}
for index,website in enumerate(websites):
self.Mapping[website]=self.extractorlist[index]
self.Source=source
self.url=articleurl
self.textWrap=textwrap.TextWrapper(initial_indent='\t',subsequent_indent='\t',width=100)


def __init__(self, source, articleurl):
self.extractorlist = [
Extractor.HuffingtonPost(),
Extractor.NYT(),
Extractor.BBC(),
Extractor.BloomBerg(),
Extractor.Guardian(),
Extractor.TheHindu(),
Extractor.TimesOfIndia()]
websites = ConfigurationReader().GetWebsiteSupported()
self.Mapping = {}
for index, website in enumerate(websites):
self.Mapping[website] = self.extractorlist[index]
self.Source = source
self.url = articleurl
self.textWrap = textwrap.TextWrapper(
initial_indent='\t', subsequent_indent='\t', width=100)

def DownloadContent(self):
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
req=requests.get(self.url,headers=headers)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
req = requests.get(self.url, headers=headers)
return req.text
def AddExtractorList(self,extractor):

def AddExtractorList(self, extractor):
self.extractorlist.append(extractor)

def Extract(self):
self.ExtractStrategy=self.Mapping[self.Source]
text=self.DownloadContent()
self.ExtractStrategy = self.Mapping[self.Source]
text = self.DownloadContent()
return self.ExtractStrategy.ExtractionAlgo(text)

def Beautify(self):
title,output=self.Extract()
title, output = self.Extract()
print "=========================================================================="
print "\t"+title
print "\t" + title
print "=========================================================================="
print (self.textWrap.fill(output)) #wrap of the line
print (self.textWrap.fill(output)) # wrap of the line
print "*********************************************************************************"
print "\n\n"
if len(output) == 0:
Expand All @@ -49,11 +58,7 @@ def Beautify(self):
print "\n\n"

def FileSave(self):
title,output=self.Extract()
article_file = open(title+".txt","w+")
title, output = self.Extract()
article_file = open(title + ".txt", "w+")
article_file.write(output.encode('utf-8'))
article_file.close()




161 changes: 88 additions & 73 deletions Extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,122 +6,137 @@
"""
from bs4 import BeautifulSoup


class Extractor(object):
    """Base strategy class: subclasses extract an article from raw HTML."""

    def ExtractionAlgo(self, text):
        """Override in subclasses: return (title, body_text) for *text*."""
        pass

    def TextExtractionAlgo(self, text, htmlelement, classname):
        """Generic extraction shared by several subclasses.

        Collects the text of every *htmlelement* carrying *classname*,
        after stripping tags that carry no article prose.

        Returns (page_title, concatenated_text).
        """
        soup = BeautifulSoup(text, 'html.parser')
        # Guard added: soup.title is None on pages without a <title> tag,
        # which previously raised AttributeError.
        title = soup.title.string if soup.title else ''
        Result = []
        maincontent = soup.find_all(htmlelement, class_=classname)
        for content in maincontent:
            # Remove script/markup noise before reading .text.
            scripttags = content.find_all(["script", "br", "figure", "image"])
            for scripttag in scripttags:
                scripttag.extract()
            Result.append(content.text)
        Result = ''.join(Result)
        return (title, Result)


class HuffingtonPost(Extractor):
    """Extractor strategy for Huffington Post articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # Article paragraphs live in <div class="content-list-component text">.
        return self.TextExtractionAlgo(
            text, "div", "content-list-component text")


class NYT(Extractor):
    """Extractor strategy for New York Times articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # Story paragraphs are <p class="story-body-text story-content">.
        return self.TextExtractionAlgo(
            text, "p", "story-body-text story-content")


class BBC(Extractor):
    """Extractor strategy for BBC News articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # The article body sits in <div class="story-body__inner">.
        return self.TextExtractionAlgo(text, "div", "story-body__inner")


class BloomBerg(Extractor):
    """Extractor strategy for Bloomberg articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # The article body sits in <div class="body-copy">.
        return self.TextExtractionAlgo(text, "div", "body-copy")


class Guardian(Extractor):
    """Extractor strategy for Guardian articles.

    Unlike the generic TextExtractionAlgo, only <p> children of the
    article container are collected.
    """

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return (title, body_text) parsed from a Guardian page."""
        soup = BeautifulSoup(text, 'html.parser')
        page_title = soup.title.string
        paragraphs = []
        containers = soup.find_all(
            "div", class_="content__article-body from-content-api js-article__body")
        for body in containers:
            # Strip tags that carry no article prose before reading text.
            for junk in body.find_all(["script", "br", "figure", "image"]):
                junk.extract()
            paragraphs.extend(p.text for p in body.find_all("p"))
        return (page_title, ''.join(paragraphs))


class TheHindu(Extractor):
    """Extractor strategy for The Hindu articles.

    (Docstring fixed: it was a copy-paste of the BloomBerg one.)
    """

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return (title, body_text) parsed from a The Hindu page."""
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.title.string
        Result = []
        maincontent = soup.find_all("div", class_="article")
        for content in maincontent:
            # <span> is also stripped here — presumably captions/ads on
            # The Hindu are wrapped in spans; TODO confirm.
            scripttags = content.find_all(
                ["script", "br", "figure", "image", "span"])
            for scripttag in scripttags:
                scripttag.extract()
            # Only paragraph children count as article prose.
            for foundcontent in content.find_all("p"):
                Result.append(foundcontent.text)
        Result = ''.join(Result)
        return (title, Result)


class TimesOfIndia(Extractor):
    """Extractor strategy for Times of India articles.

    (Docstring fixed: it was a copy-paste of the BloomBerg one.)
    """

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return (title, body_text) parsed from a Times of India page."""
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.title.string
        # The full article body sits in <div class="Normal">; no tag
        # stripping is performed here (matching the original behavior).
        Result = []
        maincontent = soup.find_all("div", class_="Normal")
        for content in maincontent:
            Result.append(content.text)
        Result = ''.join(Result)
        return (title, Result)
64 changes: 34 additions & 30 deletions Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,57 @@
import sys
import codecs

EXIT = 99
BACK = 66


def NewsSources():
    """Return the list of supported news-source names from the config file.

    (Fixed: the local variable previously shadowed this function's own
    name, which is confusing and error-prone.)
    """
    return ConfigurationReader().GetWebsiteSupported()


def App():
newsSources=NewsSources()
newsSources = NewsSources()
while True:
for i in xrange(len(newsSources)):
print ("["+str(i)+"]" +"\t" +newsSources[i])
print ("Please enter the index of the news source or press 99 to quit")
for i, newsSource in enumerate(newsSources):
print "[%s] \t %s " % (i, newsSource)
print "Please enter the index of the news source or press 99 to quit"
try:
newsSourceNumber=raw_input("News Source Number >>>> ")
except ValueError:
print ("That is not a valid News Source Number")
newsSourceNumber=int(newsSourceNumber)
if newsSourceNumber==99:
newsSourceNumber = raw_input("News Source Number >>>> ")
except ValueError:
print "That is not a valid News Source Number"
newsSourceNumber = int(newsSourceNumber)
if newsSourceNumber == EXIT:
sys.exit()
if (newsSourceNumber >=len(newsSources)):
print ("Please select the index no less than "+ str(len(newsSources)))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the parentheses from `print` makes the code compliant only with Python 2.7, while keeping them makes it work under both Python 2.7 and Python 3.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the aim is to make it compatible between python2 and python3, generally either

  1. `from __future__ import print_function` is used
    Or
  2. six is made a part of the library. Refer http://six.readthedocs.io/ .

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better to go with `from __future__ import print_function` at this point in time.

obj=NewsPulling(newsSources[newsSourceNumber])
Articles=obj.BeautifyArticles();
if (newsSourceNumber >= len(newsSources)):
print "Please select the index no less than %s" % len(newsSources)
obj = NewsPulling(newsSources[newsSourceNumber])
Articles = obj.BeautifyArticles()
while True:
print ("Do you want to read any story further? If yes, please select the number corresponding to the article")
print ("Press 66 to go back to the main menu")
print ("Press 99 to quit")
print "Do you want to read any story further? If yes, please select the number corresponding to the article"
print "Press 66 to go back to the main menu"
print "Press 99 to quit"
try:
articleNumber=raw_input("Article No >>>> ")
articleNumber = int(raw_input("Article No >>>> "))
except ValueError:
print ("That is not a valid Article Number")
articleNumber=int(articleNumber)
if articleNumber==99 :
print("That is not a valid Article Number")
continue
if articleNumber == EXIT:
sys.exit()
elif articleNumber==66 :
elif articleNumber == BACK:
break
elif (articleNumber >= len(Articles)):
print ("Please select the index no less than "+ str(len(Articles)))
#print Articles[articleNumber][2]
elif articleNumber >= len(Articles):
print "Please select the index no less than %s" % len(Articles)
else:
extr=ExtractMainContent(newsSources[newsSourceNumber],Articles[articleNumber][2])
extr = ExtractMainContent(newsSources[newsSourceNumber],
Articles[articleNumber][2])
extr.Beautify()
print ("Do you want to save this article in file")
print("Do you want to save this article in file")
YesorNo = int(raw_input("Press 1 to save else press 0 to continue >>> "))
if YesorNo == 1:
extr.FileSave()


if __name__ == "__main__":
    # Wrap stdout in a UTF-8 writer so article text containing non-ASCII
    # characters prints without UnicodeEncodeError under Python 2.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    App()
Loading