Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 34 additions & 29 deletions ExtractMainContent.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,40 +6,49 @@
"""
import requests
from configReader import ConfigurationReader
from Extractor import *
import Extractor
import textwrap


class ExtractMainContent(object):
def __init__(self,source,articleurl):
self.extractorlist=[HuffingtonPost(),NYT(),BBC(),BloomBerg(),Guardian(),TheHindu(),TimesOfIndia()]
websites=ConfigurationReader().GetWebsiteSupported()
self.Mapping={}
for index,website in enumerate(websites):
self.Mapping[website]=self.extractorlist[index]
self.Source=source
self.url=articleurl
self.textWrap=textwrap.TextWrapper(initial_indent='\t',subsequent_indent='\t',width=100)


def __init__(self, source, articleurl):
self.extractorlist = [
Extractor.HuffingtonPost(),
Extractor.NYT(),
Extractor.BBC(),
Extractor.BloomBerg(),
Extractor.Guardian(),
Extractor.TheHindu(),
Extractor.TimesOfIndia()]
websites = ConfigurationReader().GetWebsiteSupported()
self.Mapping = {}
for index, website in enumerate(websites):
self.Mapping[website] = self.extractorlist[index]
self.Source = source
self.url = articleurl
self.textWrap = textwrap.TextWrapper(
initial_indent='\t', subsequent_indent='\t', width=100)

def DownloadContent(self):
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
req=requests.get(self.url,headers=headers)
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}
req = requests.get(self.url, headers=headers)
return req.text
def AddExtractorList(self,extractor):

def AddExtractorList(self, extractor):
self.extractorlist.append(extractor)

def Extract(self):
self.ExtractStrategy=self.Mapping[self.Source]
text=self.DownloadContent()
self.ExtractStrategy = self.Mapping[self.Source]
text = self.DownloadContent()
return self.ExtractStrategy.ExtractionAlgo(text)

def Beautify(self):
title,output=self.Extract()
title, output = self.Extract()
print "=========================================================================="
print "\t"+title
print "\t" + title
print "=========================================================================="
print (self.textWrap.fill(output)) #wrap of the line
print (self.textWrap.fill(output)) # wrap of the line
print "*********************************************************************************"
print "\n\n"
if len(output) == 0:
Expand All @@ -49,11 +58,7 @@ def Beautify(self):
print "\n\n"

def FileSave(self):
title,output=self.Extract()
article_file = open(title+".txt","w+")
title, output = self.Extract()
article_file = open(title + ".txt", "w+")
article_file.write(output.encode('utf-8'))
article_file.close()




161 changes: 88 additions & 73 deletions Extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,122 +6,137 @@
"""
from bs4 import BeautifulSoup


class Extractor(object):
    """Base strategy class: subclasses extract an article from raw HTML."""

    def ExtractionAlgo(self, text):
        """Override in subclasses: return (title, body_text) for *text*."""
        pass

    def TextExtractionAlgo(self, text, htmlelement, classname):
        """Generic extraction shared by several subclasses.

        Collects the text of every *htmlelement* carrying *classname*,
        after stripping tags that carry no article prose.

        Returns (page_title, concatenated_text).
        """
        soup = BeautifulSoup(text, 'html.parser')
        # Guard added: soup.title is None on pages without a <title> tag,
        # which previously raised AttributeError.
        title = soup.title.string if soup.title else ''
        Result = []
        maincontent = soup.find_all(htmlelement, class_=classname)
        for content in maincontent:
            # Remove script/markup noise before reading .text.
            scripttags = content.find_all(["script", "br", "figure", "image"])
            for scripttag in scripttags:
                scripttag.extract()
            Result.append(content.text)
        Result = ''.join(Result)
        return (title, Result)


class HuffingtonPost(Extractor):
    """Extractor strategy for Huffington Post articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # Article paragraphs live in <div class="content-list-component text">.
        return self.TextExtractionAlgo(
            text, "div", "content-list-component text")


class NYT(Extractor):
    """Extractor strategy for New York Times articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # Story paragraphs are <p class="story-body-text story-content">.
        return self.TextExtractionAlgo(
            text, "p", "story-body-text story-content")


class BBC(Extractor):
    """Extractor strategy for BBC News articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # The article body sits in <div class="story-body__inner">.
        return self.TextExtractionAlgo(text, "div", "story-body__inner")


class BloomBerg(Extractor):
    """Extractor strategy for Bloomberg articles."""

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        # The article body sits in <div class="body-copy">.
        return self.TextExtractionAlgo(text, "div", "body-copy")


class Guardian(Extractor):
    """Extractor strategy for Guardian articles.

    Unlike the generic TextExtractionAlgo, only <p> children of the
    article container are collected.
    """

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return (title, body_text) parsed from a Guardian page."""
        soup = BeautifulSoup(text, 'html.parser')
        page_title = soup.title.string
        paragraphs = []
        containers = soup.find_all(
            "div", class_="content__article-body from-content-api js-article__body")
        for body in containers:
            # Strip tags that carry no article prose before reading text.
            for junk in body.find_all(["script", "br", "figure", "image"]):
                junk.extract()
            paragraphs.extend(p.text for p in body.find_all("p"))
        return (page_title, ''.join(paragraphs))


class TheHindu(Extractor):
    """Extractor strategy for The Hindu articles.

    (Docstring fixed: it was a copy-paste of the BloomBerg one.)
    """

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return (title, body_text) parsed from a The Hindu page."""
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.title.string
        Result = []
        maincontent = soup.find_all("div", class_="article")
        for content in maincontent:
            # <span> is also stripped here — presumably captions/ads on
            # The Hindu are wrapped in spans; TODO confirm.
            scripttags = content.find_all(
                ["script", "br", "figure", "image", "span"])
            for scripttag in scripttags:
                scripttag.extract()
            # Only paragraph children count as article prose.
            for foundcontent in content.find_all("p"):
                Result.append(foundcontent.text)
        Result = ''.join(Result)
        return (title, Result)


class TimesOfIndia(Extractor):
    """Extractor strategy for Times of India articles.

    (Docstring fixed: it was a copy-paste of the BloomBerg one.)
    """

    def __init__(self):
        Extractor.__init__(self)

    def ExtractionAlgo(self, text):
        """Return (title, body_text) parsed from a Times of India page."""
        soup = BeautifulSoup(text, 'html.parser')
        title = soup.title.string
        # The full article body sits in <div class="Normal">; no tag
        # stripping is performed here (matching the original behavior).
        Result = []
        maincontent = soup.find_all("div", class_="Normal")
        for content in maincontent:
            Result.append(content.text)
        Result = ''.join(Result)
        return (title, Result)
64 changes: 34 additions & 30 deletions Main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,53 +10,57 @@
import sys
import codecs

EXIT = 99
BACK = 66


def NewsSources():
    """Return the list of supported news-source names from the config file.

    (Fixed: the local variable previously shadowed this function's own
    name, which is confusing and error-prone.)
    """
    return ConfigurationReader().GetWebsiteSupported()


def App():
newsSources=NewsSources()
newsSources = NewsSources()
while True:
for i in xrange(len(newsSources)):
print ("["+str(i)+"]" +"\t" +newsSources[i])
print ("Please enter the index of the news source or press 99 to quit")
for i, newsSource in enumerate(newsSources):
print "[%s] \t %s " % (i, newsSource)
print "Please enter the index of the news source or press 99 to quit"
try:
newsSourceNumber=raw_input("News Source Number >>>> ")
except ValueError:
print ("That is not a valid News Source Number")
newsSourceNumber=int(newsSourceNumber)
if newsSourceNumber==99:
newsSourceNumber = raw_input("News Source Number >>>> ")
except ValueError:
print "That is not a valid News Source Number"
newsSourceNumber = int(newsSourceNumber)
if newsSourceNumber == EXIT:
sys.exit()
if (newsSourceNumber >=len(newsSources)):
print ("Please select the index no less than "+ str(len(newsSources)))
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the parentheses from `print` makes the code compliant only with Python 2.7, while keeping them makes it work under both Python 2.7 and Python 3.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the aim is to make it compatible between python2 and python3, generally either

  1. `from __future__ import print_function` is used
    Or
  2. six is made a part of the library. Refer http://six.readthedocs.io/ .

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is better to go with `from __future__ import print_function` at this point in time.

obj=NewsPulling(newsSources[newsSourceNumber])
Articles=obj.BeautifyArticles();
if (newsSourceNumber >= len(newsSources)):
print "Please select the index no less than %s" % len(newsSources)
obj = NewsPulling(newsSources[newsSourceNumber])
Articles = obj.BeautifyArticles()
while True:
print ("Do you want to read any story further? If yes, please select the number corresponding to the article")
print ("Press 66 to go back to the main menu")
print ("Press 99 to quit")
print "Do you want to read any story further? If yes, please select the number corresponding to the article"
print "Press 66 to go back to the main menu"
print "Press 99 to quit"
try:
articleNumber=raw_input("Article No >>>> ")
articleNumber = int(raw_input("Article No >>>> "))
except ValueError:
print ("That is not a valid Article Number")
articleNumber=int(articleNumber)
if articleNumber==99 :
print("That is not a valid Article Number")
continue
if articleNumber == EXIT:
sys.exit()
elif articleNumber==66 :
elif articleNumber == BACK:
break
elif (articleNumber >= len(Articles)):
print ("Please select the index no less than "+ str(len(Articles)))
#print Articles[articleNumber][2]
elif articleNumber >= len(Articles):
print "Please select the index no less than %s" % len(Articles)
else:
extr=ExtractMainContent(newsSources[newsSourceNumber],Articles[articleNumber][2])
extr = ExtractMainContent(newsSources[newsSourceNumber],
Articles[articleNumber][2])
extr.Beautify()
print ("Do you want to save this article in file")
print("Do you want to save this article in file")
YesorNo = int(raw_input("Press 1 to save else press 0 to continue >>> "))
if YesorNo == 1:
extr.FileSave()


if __name__ == "__main__":
    # Wrap stdout in a UTF-8 writer so article text containing non-ASCII
    # characters prints without UnicodeEncodeError under Python 2.
    sys.stdout = codecs.getwriter('utf8')(sys.stdout)
    App()
Loading