diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a121595 --- /dev/null +++ b/.gitignore @@ -0,0 +1,108 @@ +# Saved News Files +saved_articles/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +.static_storage/ +.media/ +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/ExtractMainContent.py b/ExtractMainContent.py deleted file mode 100644 index dcbcb8c..0000000 --- a/ExtractMainContent.py +++ /dev/null @@ -1,60 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Jul 24 21:42:05 2016-2017 - -@author: Ankit Singh -""" -import requests -from configReader import ConfigurationReader -from Extractor import * -import textwrap -import os - -class ExtractMainContent(object): - def __init__(self,source,articleurl): - 
self.extractorlist=[HuffingtonPost(),NYT(),BBC(),BloomBerg(),Guardian(),TheHindu(),TimesOfIndia()] - websites=ConfigurationReader().GetWebsiteSupported() - self.Mapping={} - for index,website in enumerate(websites): - self.Mapping[website]=self.extractorlist[index] - self.Source=source - self.url=articleurl - self.textWrap=textwrap.TextWrapper(initial_indent='\t',subsequent_indent='\t',width=100) - - - def DownloadContent(self): - headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'} - req=requests.get(self.url,headers=headers) - return req.text - - def AddExtractorList(self,extractor): - self.extractorlist.append(extractor) - - def Extract(self): - self.ExtractStrategy=self.Mapping[self.Source] - text=self.DownloadContent() - return self.ExtractStrategy.ExtractionAlgo(text) - - def Beautify(self): - title,output=self.Extract() - print "==========================================================================" - print "\t"+title - print "==========================================================================" - print (self.textWrap.fill(output)) #wrap of the line - print "*********************************************************************************" - print "\n\n" - if len(output) == 0: - print "There isn't much on the site .It is media(video/image) post.To further view the media post Go to the below link" - print self.url - print "*********************************************************************************" - print "\n\n" - - def FileSave(self): - title,output=self.Extract() - current_directory = os.getcwd() - save_directory = os.path.join(current_directory, r'Saved_articles') - if not os.path.exists(save_directory): - os.makedirs(save_directory) - article_file = open("Saved_articles/"+title+".txt","w+") - article_file.write(output.encode('utf-8')) - article_file.close() diff --git a/Extractor.py b/Extractor.py deleted file mode 100644 index 2e95af7..0000000 --- 
a/Extractor.py +++ /dev/null @@ -1,127 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Jul 24 23:05:13 2016-2017 - -@author: Ankit Singh -""" -from bs4 import BeautifulSoup - -class Extractor(object): - - def ExtractionAlgo(self,text): - pass - - def TextExtractionAlgo(self,text,htmlelement,classname): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all(htmlelement, class_=classname) - #print maincontent - for content in maincontent: - scripttags=content.find_all(["script","br","figure","image"]) - for scripttag in scripttags: - scripttag.extract() - #print content.text - Result.append(content.text) - Result=''.join(Result) - return (title,Result) - - - -class HuffingtonPost(Extractor): - """class for Huffington Post parsing""" - def __init__(self): - Extractor.__init__(self) - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"div","content-list-component text") - - -class NYT(Extractor): - """class for New York Times parsing""" - def __init__(self): - Extractor.__init__(self) - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"p","story-body-text story-content") - - -class BBC(Extractor): - """class for BBC News parsing""" - def __init__(self): - Extractor.__init__(self) - - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"div","story-body__inner") - - -class BloomBerg(Extractor): - """class for BloomBerg parsing""" - def __init__(self): - Extractor.__init__(self) - - def ExtractionAlgo(self,text): - return Extractor.TextExtractionAlgo(self,text,"div","body-copy") - -class Guardian(Extractor): - """class for Guardian parsing""" - def __init__(self): - Extractor.__init__(self) - - def ExtractionAlgo(self,text): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all("div", class_="content__article-body from-content-api js-article__body") - 
#print maincontent - for content in maincontent: - scripttags=content.find_all(["script","br","figure","image"]) - for scripttag in scripttags: - scripttag.extract() - #print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result=''.join(Result) - return (title,Result) - -class TheHindu(Extractor): - """class for BloomBerg parsing""" - def __init__(self): - Extractor.__init__(self) - - def ExtractionAlgo(self,text): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all("div", class_="article") - #print maincontent - for content in maincontent: - scripttags=content.find_all(["script","br","figure","image","span"]) - for scripttag in scripttags: - scripttag.extract() - #print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result=''.join(Result) - return (title,Result) - -class TimesOfIndia(Extractor): - """class for BloomBerg parsing""" - def __init__(self): - Extractor.__init__(self) - - def ExtractionAlgo(self,text): - soup=BeautifulSoup(text,'html.parser') - title=soup.title.string - Result=[] - #print soup - maincontent=soup.find_all("div", class_="Normal") - #print maincontent - for content in maincontent: - #print content.text - Result.append(content.text) - Result=''.join(Result) - return (title,Result) - - \ No newline at end of file diff --git a/Images/Readme.md b/Images/Readme.md index b421bbf..6161bc4 100644 --- a/Images/Readme.md +++ b/Images/Readme.md @@ -1 +1 @@ -# This folder contain the sample ouput or screen shots of the program while running. +# This folder contains the sample output or screen shots of the program while running. 
diff --git a/Images/screenshot1.JPG b/Images/screenshot1.JPG index 86cfcfc..26f4330 100644 Binary files a/Images/screenshot1.JPG and b/Images/screenshot1.JPG differ diff --git a/Images/snapshot2.JPG b/Images/snapshot2.JPG index c22eee1..66613e1 100644 Binary files a/Images/snapshot2.JPG and b/Images/snapshot2.JPG differ diff --git a/Main.py b/Main.py deleted file mode 100644 index bea51da..0000000 --- a/Main.py +++ /dev/null @@ -1,62 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Jul 24 16:15:11 2016-2017 - -@author: Ankit Singh -""" -from NewsPulling import NewsPulling -from configReader import ConfigurationReader -from ExtractMainContent import ExtractMainContent -import sys -import codecs - - -def NewsSources(): - NewsSources=ConfigurationReader().GetWebsiteSupported() - return NewsSources - -def App(): - newsSources=NewsSources() - while True: - for i in xrange(len(newsSources)): - print ("["+str(i)+"]" +"\t" +newsSources[i]) - print ("Please enter the index of the news source or press 99 to quit") - try: - newsSourceNumber=raw_input("News Source Number >>>> ") - except ValueError: - print ("That is not a valid News Source Number") - newsSourceNumber=int(newsSourceNumber) - if newsSourceNumber==99: - sys.exit() - if (newsSourceNumber >=len(newsSources)): - print ("Please select the index no less than "+ str(len(newsSources))) - obj=NewsPulling(newsSources[newsSourceNumber]) - Articles=obj.BeautifyArticles(); - while True: - print ("Do you want to read any story further? 
If yes, please select the number corresponding to the article") - print ("Press 66 to go back to the main menu") - print ("Press 99 to quit") - try: - articleNumber=raw_input("Article No >>>> ") - except ValueError: - print ("That is not a valid Article Number") - articleNumber=int(articleNumber) - if articleNumber==99 : - sys.exit() - elif articleNumber==66 : - break - elif (articleNumber >= len(Articles)): - print ("Please select the index no less than "+ str(len(Articles))) - #print Articles[articleNumber][2] - else: - extr=ExtractMainContent(newsSources[newsSourceNumber],Articles[articleNumber][2]) - extr.Beautify() - print ("Do you want to save this article in file") - YesorNo = int(raw_input("Press 1 to save else press 0 to continue >>> ")) - if YesorNo == 1: - extr.FileSave() - - -if __name__== "__main__": - sys.stdout = codecs.getwriter('utf8')(sys.stdout) - App(); diff --git a/NewsPulling.py b/NewsPulling.py deleted file mode 100644 index 0b5e3e5..0000000 --- a/NewsPulling.py +++ /dev/null @@ -1,93 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Jul 24 20:01:34 2016-2017 - -@author: Ankit Singh -""" - -import requests -from configReader import ConfigurationReader -from requests import ConnectionError -import sys - -class NewsPulling(object): - """This class is used to pull news from the internet depending on the source specified """ - def __init__(self,newsSource): - self.Source=newsSource - - def PullNews(self): - Configuration = ConfigurationReader() - self.__APIKey=Configuration.GetAPIKEY() - self.__Limit=Configuration.GetLimit() - url='https://newsapi.org/v1/articles?source='+self.Source+'&sortBy=top&apiKey='+self.__APIKey - try: - req=requests.get(url) - if(req.status_code==200): - return req - else: - print "There is some issue in connecting to the internet. 
Please check your firewall or internet" - except ConnectionError as e: - print "A connection Attempt failed" - print e.message - sys.exit() - - def JsonRead(self): - req=self.PullNews() - # indicate if we need to convert to utf-8 - needsconversion = False - if req.encoding != 'utf-8': - needsconversion = True - req=req.json() - articles=req['articles'] - noofarticles=len(articles) - maxarticles=min(noofarticles,self.__Limit) - - FilteredArticles=[] - - for i in xrange(maxarticles): - article=articles[i] - #print article - if needsconversion: - description=unicode(article['description'], 'utf-8') - #print description - title=unicode(article['title'], 'utf-8') - Article_url=unicode(article['url'], 'utf-8') - DateofPublication=unicode(article['publishedAt'], 'utf-8') - Author=unicode(article['author'], 'utf-8') - FilteredArticles.append([description,title,Article_url,DateofPublication,Author]) - else: - description=article['description'] - #print description - title=article['title'] - Article_url=article['url'] - DateofPublication=article['publishedAt'] - Author=article['author'] - FilteredArticles.append([description,title,Article_url,DateofPublication,Author]) - return FilteredArticles - - #jsondict=json.load(req.json()) - #print jsondict - - def BeautifyArticles(self): - self.Articles=self.JsonRead() - if self.Articles is None or len(self.Articles)==0: - print "No articles found" - sys.exit() - print "=================STORIES==================================" - for i in xrange(len(self.Articles)): - print "[" +str(i) +"]", - # print(sequence,end='') used for python 3.x - if self.Articles[i][1] is not None: - print "\t"+self.Articles[i][1] - if self.Articles[i][0] is not None: - print "\t"+self.Articles[i][0] - if self.Articles[i][4] is not None: - print "\t"+self.Articles[i][4] - if self.Articles[i][3] is not None: - print "\t"+self.Articles[i][3]+"\n" - print "***************************************************************" - return self.Articles - - - - diff 
--git a/README.md index 0589ff2..e28ea0c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +## Please note that I have made changes that invalidate the information in the readme. To run create the saved_articles directory, if it isn't pulled from the repo, then use Python 3.6+ and run `python3 news.py`. I'll fix the readme soon, and provide a better way to install and run. Thanks. - Tom + # News at the Command line ### Want to be kept updated without visiting the news portals every now and then # Modules Requirements -- **Python 2.7 +** +- **Python 3.6+** - **Requests** - **Beautiful Soup** - **PyYAML** To install the module dependencies before running the application, simply navigate into the project folder and run `pip install -r requirements.txt`. -#### I recommend installing Python Anaconda so that all useful libraries are available at one go. # Working - All sample input images are placed under the **Images** folder. - You can change the maximum number of posts in **config.yml**. Look for **Limit** attribute. # How To Use - Just run the Main.py. - You can understand the rest while using it for the first time. - ### To execute - python Main.py - + Make sure you have installed required libraries, instructions above. + Just run news.py, do this by typing `python3 news.py`. + The rest is quite straightforward. + # Contributing We welcome your contributions. Please feel free to fork the code, play with it, make some patches and send us pull requests. - [MAIL me](anky.nits.cse@gmail.com) for any discussion on the project or contribution. # Support -Please [open an issue on GitHub](https://github.com/Griffintaur/News-At-Command-Line/issues/new) if you'd like to report a bug or request a feature. 
- -# ScreenShot - ![Input1](https://raw.githubusercontent.com/Griffintaur/News-At-Command-Line/master/Images/screenshot1.JPG) - - ![Ouput1](https://raw.githubusercontent.com/Griffintaur/News-At-Command-Line/master/Images/snapshot2.JPG) +Please open an issue on GitHub if you'd like to report a bug or request a feature. ## License The code is released under MIT license and free to use. diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..3795fbf --- /dev/null +++ b/TODO.md @@ -0,0 +1,10 @@ +* Move all files to proper locations (./news) +* Test all the things +* Update README +* Read from environment variables + config + command line args +* Edit configuration in program +* Move to plugins for news sources +* Move all prompts to string constants file for easy changes and translations +* Dependency inject BeautifulSoup in extractor + +* Consider REST client / server architecture diff --git a/config.yml b/config.yml index 54199a7..b11ddb4 100644 --- a/config.yml +++ b/config.yml @@ -1,20 +1,4 @@ -# -*- coding: utf-8 -*- -#""" -#Created on Jul 24 16:13:25 2016-2017 - -#@author: Ankit Singh -#""" - -# -*- coding: utf-8 -*- - -#Created on Jul 20 11:43:52 2016 - -#@author: Ankit Singh - - WebsiteSupported: - # - IndianExpress - # - TheHindu - the-huffington-post - the-new-york-times - bbc-news @@ -22,10 +6,8 @@ WebsiteSupported: - the-guardian-uk - the-hindu - the-times-of-india - # - The WashingtonPost - # - TheNewyorkTimes -#No of posts to be displayed in the one go +# Posts shown Limit: 10 Apikey: bda5818cc2af461e98330ccdf6fb9cbe diff --git a/configReader.py b/configReader.py deleted file mode 100644 index a8b90c4..0000000 --- a/configReader.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -""" -Created on Jul 24 20:14:29 2016-2017 - -@author: Ankit Singh -""" -import yaml - -##to do -#implement singleton pattern here -class ConfigurationReader(): - __APIKEY=None - __WebsiteSupported=[] - __Limit=None - def __init__(self): - with 
open("config.yml", 'r') as ymlfile: - cfg = yaml.load(ymlfile) - ConfigurationReader.__APIKEY=cfg['Apikey'] - #print ConfigurationReader.__APIKEY - ConfigurationReader.__Limit=cfg['Limit'] - #print ConfigurationReader.__Limit - ConfigurationReader.__WebsiteSupported=cfg['WebsiteSupported'] - #print ConfigurationReader.__WebsiteSupported - @staticmethod - def GetAPIKEY(): - return ConfigurationReader.__APIKEY - - @staticmethod - def GetLimit(): - return ConfigurationReader.__Limit - - @staticmethod - def GetWebsiteSupported(): - return ConfigurationReader.__WebsiteSupported - - \ No newline at end of file diff --git a/config_reader.py b/config_reader.py new file mode 100644 index 0000000..0f36639 --- /dev/null +++ b/config_reader.py @@ -0,0 +1,16 @@ +import yaml + + +class ConfigurationReader: + def __init__(self): + with open('config.yml') as ymlfile: + cfg = yaml.load(ymlfile) + self.APIKEY = cfg['Apikey'] + self.limit = cfg['Limit'] + self.websites_supported = cfg['WebsiteSupported'] + + # TODO: Move to using this, and reading it from env, config, defaults + self.user_agent = cfg.get('User-Agent', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' + ' AppleWebKit/537.36 (KHTML, like Gecko ' + 'Chrome/59.0.3071.115 Safari/537.36') diff --git a/dev-requirements.txt b/dev-requirements.txt new file mode 100644 index 0000000..f4413b9 --- /dev/null +++ b/dev-requirements.txt @@ -0,0 +1,14 @@ +autopep8==1.3.4 +beautifulsoup4==4.6.0 +bs4==0.0.1 +certifi==2017.7.27.1 +chardet==3.0.4 +flake8==3.5.0 +idna==2.6 +mccabe==0.6.1 +pkg-resources==0.0.0 +pycodestyle==2.3.1 +pyflakes==1.6.0 +PyYAML==3.12 +requests==2.18.4 +urllib3==1.22 diff --git a/extract_main_content.py b/extract_main_content.py new file mode 100644 index 0000000..d91ea78 --- /dev/null +++ b/extract_main_content.py @@ -0,0 +1,61 @@ +import requests +from config_reader import ConfigurationReader +from extractor import * +import textwrap + + +class ExtractMainContent: + def __init__(self, source, articleurl): + 
self.extractorlist = [HuffingtonPost(), NYT(), BBC( + ), BloomBerg(), Guardian(), TheHindu(), TimesOfIndia()] + websites = ConfigurationReader().websites_supported + self.Mapping = {} + for index, website in enumerate(websites): + self.Mapping[website] = self.extractorlist[index] + self.Source = source + self.url = articleurl + self.textWrap = textwrap.TextWrapper( + initial_indent='\t', subsequent_indent='\t', width=100) + + def download(self): + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/59.0.3071.115 Safari/537.36'} + req = requests.get(self.url, headers=headers) + return req.text + + # unused, but may be useful in the future + # def AddExtractorList(self, extractor): + # self.extractorlist.append(extractor) + + def _extract(self): + self.ExtractStrategy = self.Mapping[self.Source] + text = self.download() + return self.ExtractStrategy.extractor(text) + + def beautify(self): + title, output = self._extract() + print("=" * (len(title) + 15)) + print("\t" + title) + print("=" * (len(title) + 15)) + + print((self.textWrap.fill(output))) # wrap of the line + print("*" * 80) + if len(output) == 0: + print("Sorry :(") + print("There isn't much text on the site besides video/image. 
To " + "further view the media post, Go to the below link") + print(self.url) + print('*' * 80) + print("\n\n") + + def save(self): + title, output = self._extract() + + # Remove Chars not allowed in filenames + for char in ['<', '>', "/", ":", '"', "\\", "|", "?", "*"]: + if char in title: + title = title.replace(char, "") + + with open(f'saved_articles/{title}.txt', "w+") as f: + f.write(output) diff --git a/extractor.py b/extractor.py new file mode 100644 index 0000000..ee985d5 --- /dev/null +++ b/extractor.py @@ -0,0 +1,133 @@ +from bs4 import BeautifulSoup + + +class Extractor: + + def extractor(self, text): + pass + + def _extraction_algo(self, text, htmlelement, classname): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + result = [] + # print soup + maincontent = soup.find_all(htmlelement, class_=classname) + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + result.append(content.text) + result = ''.join(result) + return (title, result) + + +class HuffingtonPost(Extractor): + """class for Huffington Post parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "div", "content-list-component text") + + +class NYT(Extractor): + """class for New York Times parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "p", "story-body-text story-content") + + +class BBC(Extractor): + """class for BBC News parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "div", "story-body__inner") + + +class BloomBerg(Extractor): + """class for BloomBerg parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "div", 
"body-copy") + + +class Guardian(Extractor): + """class for Guardian parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all( + "div", class_="content__article-body from-content-api js-article__body") + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) + + +class TheHindu(Extractor): + """class for BloomBerg parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="article") + # print maincontent + for content in maincontent: + scripttags = content.find_all( + ["script", "br", "figure", "image", "span"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) + + +class TimesOfIndia(Extractor): + """class for BloomBerg parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="Normal") + # print maincontent + for content in maincontent: + # print content.text + Result.append(content.text) + Result = ''.join(Result) + return (title, Result) diff --git a/news.py b/news.py new file mode 100644 index 0000000..64bf84f --- /dev/null +++ b/news.py @@ -0,0 +1,111 @@ +import sys +from enum import Enum + +from news_pulling import 
NewsPulling +from config_reader import ConfigurationReader +from extract_main_content import ExtractMainContent + + +class SelectionStatus(Enum): + BACK = 1 + EXIT = 2 + READ = 3 + + +def news_sources(): + news_sources = ConfigurationReader().websites_supported + return news_sources + + +def display_sources(sources): + for index, source in enumerate(sources): + print(f'[{index + 1}]\t{source}') + print("\nPlease enter the index of the news source or type 'quit' to exit") + + +def display_title_banner(): + # Cool Title/Banner + print("=" * 40) + print("\tNews at the Command Line") + print("=" * 40) + print() + + +def prompt_for_source(sources): + while True: + display_sources(sources) + source_choice = input("News Source Number >>>> ") + # Quit + if(source_choice.lower() == "quit"): + sys.exit() + try: + source_choice = int(source_choice) - 1 + if(source_choice >= len(sources) or source_choice < 0): + print("Please select an index between 1-" + + str(len(sources))) + else: + return source_choice + except ValueError: + print("That is not a valid News Source Number") + + +def prompt_for_article(max=0): + print("Do you want to read a story further? If yes, please select the" + "number corresponding to the article") + print("Enter 'back' to go back to the main menu") + print("Press 'quit' to quit") + while True: + article_selection = input("Article No >>>> ") + + # Back + if(article_selection.lower()[0] == 'b'): + return SelectionStatus.BACK, None + # Exit + elif(article_selection.lower()[0] == 'q'): + return SelectionStatus.EXIT, None + + article_selection = int(article_selection) + if 0 > article_selection - 1 or article_selection > max: + print(f'Please select an index between 1-{max}.') + else: + return SelectionStatus.READ, article_selection - 1 + + +def prompt_for_save(): + while True: + print("Do you want to save this article in file") + selection = str(input("Want to save? 
y/n >>> ")) + if selection[0].lower() == 'y': + return True + elif selection[0].lower() == 'n': + return False + + +def main(): + display_title_banner() + + while True: + sources = news_sources() + source_choice = prompt_for_source(sources) + + while True: + puller = NewsPulling(sources[source_choice]) + articles = puller.beautify_articles() + status, article_selection = prompt_for_article(max=len(articles)) + if status == SelectionStatus.EXIT: + sys.exit() + elif status == SelectionStatus.BACK: + break + else: + print("\n" * 5) + extr = ExtractMainContent( + sources[source_choice], articles[article_selection][2]) + extr.beautify() + + if prompt_for_save(): + extr.save() + print("File saved!\n") + + +if __name__ == "__main__": + main() diff --git a/news_pulling.py b/news_pulling.py new file mode 100644 index 0000000..65c8501 --- /dev/null +++ b/news_pulling.py @@ -0,0 +1,92 @@ +import sys + +import requests +from requests import ConnectionError + +from config_reader import ConfigurationReader + + +class NewsPulling: + """This class is used to pull news from the internet depending on the source specified """ + + def __init__(self, newsSource): + self.Source = newsSource + + def pull_news(self): + config = ConfigurationReader() + self.__APIKey = config.APIKEY + self.__Limit = config.limit + url = 'https://newsapi.org/v1/articles?source=' + \ + self.Source + '&sortBy=top&apiKey=' + self.__APIKey + try: + req = requests.get(url) + if(req.status_code == 200): + return req + else: + print( + "There is some issue in connecting to the internet. 
Please check your firewall or internet") + except ConnectionError as e: + print("A connection Attempt failed") + print(e.message) + sys.exit() + + def json_read(self): + req = self.pull_news() + # indicate if we need to convert to utf-8 + needsconversion = False + if req.encoding != 'utf-8': + needsconversion = True + req = req.json() + articles = req['articles'] + noofarticles = len(articles) + maxarticles = min(noofarticles, self.__Limit) + + FilteredArticles = [] + + for i in range(maxarticles): + article = articles[i] + if needsconversion: + description = str(article['description'], 'utf-8') + # print description + title = str(article['title'], 'utf-8') + Article_url = str(article['url'], 'utf-8') + DateofPublication = str(article['publishedAt'], 'utf-8') + Author = str(article['author'], 'utf-8') + FilteredArticles.append( + [description, title, Article_url, DateofPublication, Author]) + else: + description = article['description'] + # print description + title = article['title'] + Article_url = article['url'] + DateofPublication = article['publishedAt'] + Author = article['author'] + FilteredArticles.append( + [description, title, Article_url, DateofPublication, Author]) + return FilteredArticles + + def beautify_articles(self): + self.Articles = self.json_read() + if self.Articles is None or len(self.Articles) == 0: + print("No articles found") + sys.exit() + print("\n" + ("=" * 16) + " STORIES " + ("=" * 16)) + for i in range(len(self.Articles)): + print("[" + str(i + 1) + "]", end=' ') + # Title + if self.Articles[i][1] is not None: + print("\t" + self.Articles[i][1]) + # Summary + if self.Articles[i][0] is not None: + # Limit Summary Size + summary = self.Articles[i][0][:85] + \ + (self.Articles[i][0][85:] and '...') + print("\t" + summary) + # Author + if self.Articles[i][4] is not None: + print("\t" + self.Articles[i][4]) + # Date + if self.Articles[i][3] is not None: + print("\t" + self.Articles[i][3] + "\n") + print("=" * 40) + return self.Articles