From 47aa4dc45accaaf1f0f4b7f96c99e01f899d9d35 Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: Tue, 20 Feb 2018 20:26:33 -0600 Subject: [PATCH 1/7] dos2unix... move files --- news/__init__.py | 0 news/config_reader.py | 16 +++++ news/extract_main_content.py | 61 ++++++++++++++++ news/extractor.py | 133 +++++++++++++++++++++++++++++++++++ news/news.py | 113 +++++++++++++++++++++++++++++ news/news_pulling.py | 92 ++++++++++++++++++++++++ 6 files changed, 415 insertions(+) create mode 100644 news/__init__.py create mode 100644 news/config_reader.py create mode 100644 news/extract_main_content.py create mode 100644 news/extractor.py create mode 100755 news/news.py create mode 100644 news/news_pulling.py diff --git a/news/__init__.py b/news/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/news/config_reader.py b/news/config_reader.py new file mode 100644 index 0000000..26831b3 --- /dev/null +++ b/news/config_reader.py @@ -0,0 +1,16 @@ +import yaml + + +class ConfigurationReader: + def __init__(self): + with open('config.yml') as ymlfile: + cfg = yaml.load(ymlfile) + self.APIKEY = cfg['Apikey'] + self.limit = cfg['Limit'] + self.websites_supported = cfg['WebsiteSupported'] + + # TODO: Move to using this, and reading it from env, config, defaults + self.user_agent = cfg.get('User-Agent', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' + ' AppleWebKit/537.36 (KHTML, like Gecko ' + 'Chrome/59.0.3071.115 Safari/537.36') diff --git a/news/extract_main_content.py b/news/extract_main_content.py new file mode 100644 index 0000000..2f00438 --- /dev/null +++ b/news/extract_main_content.py @@ -0,0 +1,61 @@ +import requests +from config_reader import ConfigurationReader +from extractor import * +import textwrap + + +class ExtractMainContent: + def __init__(self, source, articleurl): + self.extractorlist = [HuffingtonPost(), NYT(), BBC( + ), BloomBerg(), Guardian(), TheHindu(), TimesOfIndia()] + websites = ConfigurationReader().websites_supported + self.Mapping = 
{} + for index, website in enumerate(websites): + self.Mapping[website] = self.extractorlist[index] + self.Source = source + self.url = articleurl + self.textWrap = textwrap.TextWrapper( + initial_indent='\t', subsequent_indent='\t', width=100) + + def download(self): + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/59.0.3071.115 Safari/537.36'} + req = requests.get(self.url, headers=headers) + return req.text + + # unused, but may be useful in the future + # def AddExtractorList(self, extractor): + # self.extractorlist.append(extractor) + + def _extract(self): + self.ExtractStrategy = self.Mapping[self.Source] + text = self.download() + return self.ExtractStrategy.extractor(text) + + def beautify(self): + title, output = self._extract() + print("=" * (len(title) + 15)) + print("\t" + title) + print("=" * (len(title) + 15)) + + print((self.textWrap.fill(output))) # wrap of the line + print("*" * 80) + if len(output) == 0: + print("Sorry :(") + print("There isn't much text on the site besides video/image. 
To " + "further view the media post, Go to the below link") + print(self.url) + print('*' * 80) + print("\n\n") + + def save(self): + title, output = self._extract() + + # Remove Chars not allowed in filenames + for char in ['<', '>', "/", ":", '"', "\\", "|", "?", "*"]: + if char in title: + title = title.replace(char, "") + + with open(f'saved_articles/{title}.txt', "w+") as f: + f.write(output) diff --git a/news/extractor.py b/news/extractor.py new file mode 100644 index 0000000..8e1ac53 --- /dev/null +++ b/news/extractor.py @@ -0,0 +1,133 @@ +from bs4 import BeautifulSoup + + +class Extractor: + + def extractor(self, text): + pass + + def _extraction_algo(self, text, htmlelement, classname): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + result = [] + # print soup + maincontent = soup.find_all(htmlelement, class_=classname) + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + result.append(content.text) + result = ''.join(result) + return (title, result) + + +class HuffingtonPost(Extractor): + """class for Huffington Post parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "div", "content-list-component text") + + +class NYT(Extractor): + """class for New York Times parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "p", "story-body-text story-content") + + +class BBC(Extractor): + """class for BBC News parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return self._extraction_algo(text, "div", "story-body__inner") + + +class BloomBerg(Extractor): + """class for BloomBerg parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + return 
self._extraction_algo(text, "div", "body-copy") + + +class Guardian(Extractor): + """class for Guardian parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all( + "div", class_="content__article-body from-content-api js-article__body") + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) + + +class TheHindu(Extractor): + """class for BloomBerg parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="article") + # print maincontent + for content in maincontent: + scripttags = content.find_all( + ["script", "br", "figure", "image", "span"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) + + +class TimesOfIndia(Extractor): + """class for BloomBerg parsing""" + + def __init__(self): + Extractor.__init__(self) + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="Normal") + # print maincontent + for content in maincontent: + # print content.text + Result.append(content.text) + Result = ''.join(Result) + return (title, Result) diff --git a/news/news.py b/news/news.py new file mode 100755 index 0000000..5448741 --- /dev/null +++ b/news/news.py @@ -0,0 +1,113 @@ +#!/usr/bin/env 
python3 + +import sys +from enum import Enum + +from news_pulling import NewsPulling +from config_reader import ConfigurationReader +from extract_main_content import ExtractMainContent + + +class SelectionStatus(Enum): + BACK = 1 + EXIT = 2 + READ = 3 + + +def news_sources(): + news_sources = ConfigurationReader().websites_supported + return news_sources + + +def display_sources(sources): + for index, source in enumerate(sources): + print(f'[{index + 1}]\t{source}') + print("\nPlease enter the index of the news source or type 'quit' to exit") + + +def display_title_banner(): + # Cool Title/Banner + print("=" * 40) + print("\tNews at the Command Line") + print("=" * 40) + print() + + +def prompt_for_source(sources): + while True: + display_sources(sources) + source_choice = input("News Source Number >>>> ") + # Quit + if(source_choice.lower() == "quit"): + sys.exit() + try: + source_choice = int(source_choice) - 1 + if(source_choice >= len(sources) or source_choice < 0): + print("Please select an index between 1-" + + str(len(sources))) + else: + return source_choice + except ValueError: + print("That is not a valid News Source Number") + + +def prompt_for_article(max=0): + print("Do you want to read a story further? If yes, please select the" + "number corresponding to the article") + print("Enter 'back' to go back to the main menu") + print("Press 'quit' to quit") + while True: + article_selection = input("Article No >>>> ") + + # Back + if(article_selection.lower()[0] == 'b'): + return SelectionStatus.BACK, None + # Exit + elif(article_selection.lower()[0] == 'q'): + return SelectionStatus.EXIT, None + + article_selection = int(article_selection) + if 0 > article_selection - 1 or article_selection > max: + print(f'Please select an index between 1-{max}.') + else: + return SelectionStatus.READ, article_selection - 1 + + +def prompt_for_save(): + while True: + print("Do you want to save this article in file") + selection = str(input("Want to save? 
y/n >>> ")) + if selection[0].lower() == 'y': + return True + elif selection[0].lower() == 'n': + return False + + +def main(): + display_title_banner() + + while True: + sources = news_sources() + source_choice = prompt_for_source(sources) + + while True: + puller = NewsPulling(sources[source_choice]) + articles = puller.beautify_articles() + status, article_selection = prompt_for_article(max=len(articles)) + if status == SelectionStatus.EXIT: + sys.exit() + elif status == SelectionStatus.BACK: + break + else: + print("\n" * 5) + extr = ExtractMainContent( + sources[source_choice], articles[article_selection][2]) + extr.beautify() + + if prompt_for_save(): + extr.save() + print("File saved!\n") + + +if __name__ == "__main__": + main() diff --git a/news/news_pulling.py b/news/news_pulling.py new file mode 100644 index 0000000..613829d --- /dev/null +++ b/news/news_pulling.py @@ -0,0 +1,92 @@ +import sys + +import requests +from requests import ConnectionError + +from config_reader import ConfigurationReader + + +class NewsPulling: + """This class is used to pull news from the internet depending on the source specified """ + + def __init__(self, newsSource): + self.Source = newsSource + + def pull_news(self): + config = ConfigurationReader() + self.__APIKey = config.APIKEY + self.__Limit = config.limit + url = 'https://newsapi.org/v1/articles?source=' + \ + self.Source + '&sortBy=top&apiKey=' + self.__APIKey + try: + req = requests.get(url) + if(req.status_code == 200): + return req + else: + print( + "There is some issue in connecting to the internet. 
Please check your firewall or internet") + except ConnectionError as e: + print("A connection Attempt failed") + print(e.message) + sys.exit() + + def json_read(self): + req = self.pull_news() + # indicate if we need to convert to utf-8 + needsconversion = False + if req.encoding != 'utf-8': + needsconversion = True + req = req.json() + articles = req['articles'] + noofarticles = len(articles) + maxarticles = min(noofarticles, self.__Limit) + + FilteredArticles = [] + + for i in range(maxarticles): + article = articles[i] + if needsconversion: + description = str(article['description'], 'utf-8') + # print description + title = str(article['title'], 'utf-8') + Article_url = str(article['url'], 'utf-8') + DateofPublication = str(article['publishedAt'], 'utf-8') + Author = str(article['author'], 'utf-8') + FilteredArticles.append( + [description, title, Article_url, DateofPublication, Author]) + else: + description = article['description'] + # print description + title = article['title'] + Article_url = article['url'] + DateofPublication = article['publishedAt'] + Author = article['author'] + FilteredArticles.append( + [description, title, Article_url, DateofPublication, Author]) + return FilteredArticles + + def beautify_articles(self): + self.Articles = self.json_read() + if self.Articles is None or len(self.Articles) == 0: + print("No articles found") + sys.exit() + print("\n" + ("=" * 16) + " STORIES " + ("=" * 16)) + for i in range(len(self.Articles)): + print("[" + str(i + 1) + "]", end=' ') + # Title + if self.Articles[i][1] is not None: + print("\t" + self.Articles[i][1]) + # Summary + if self.Articles[i][0] is not None: + # Limit Summary Size + summary = self.Articles[i][0][:85] + \ + (self.Articles[i][0][85:] and '...') + print("\t" + summary) + # Author + if self.Articles[i][4] is not None: + print("\t" + self.Articles[i][4]) + # Date + if self.Articles[i][3] is not None: + print("\t" + self.Articles[i][3] + "\n") + print("=" * 40) + return self.Articles 
From d53f551dc4efe71ab96f22a1727c2f76893a711f Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: Tue, 20 Feb 2018 20:59:57 -0600 Subject: [PATCH 2/7] dos2unix everything else --- CONTRIBUTORS.md | 2 + __version__.py | 1 + config.yml | 28 ++++---- config_reader.py | 16 ----- extract_main_content.py | 61 ---------------- extractor.py | 133 ----------------------------------- news.py | 111 ----------------------------- news/extract_main_content.py | 5 +- news/news.py | 6 +- news/news_pulling.py | 2 +- news_pulling.py | 92 ------------------------ requirements.txt | 1 + setup.py | 36 ++++++++++ 13 files changed, 60 insertions(+), 434 deletions(-) create mode 100644 CONTRIBUTORS.md create mode 100644 __version__.py delete mode 100644 config_reader.py delete mode 100644 extract_main_content.py delete mode 100644 extractor.py delete mode 100644 news.py delete mode 100644 news_pulling.py create mode 100644 setup.py diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..fee71c8 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,2 @@ +Ankit Singh https://github.com/Griffintaur +Tom Faulkner https://github.com/TomFaulkner diff --git a/__version__.py b/__version__.py new file mode 100644 index 0000000..b8023d8 --- /dev/null +++ b/__version__.py @@ -0,0 +1 @@ +__version__ = '0.0.1' diff --git a/config.yml b/config.yml index b11ddb4..7421ceb 100644 --- a/config.yml +++ b/config.yml @@ -1,15 +1,13 @@ -WebsiteSupported: - - the-huffington-post - - the-new-york-times - - bbc-news - - bloomberg - - the-guardian-uk - - the-hindu - - the-times-of-india - -# Posts shown -Limit: 10 - -Apikey: bda5818cc2af461e98330ccdf6fb9cbe - - \ No newline at end of file +WebsiteSupported: + - the-huffington-post + - the-new-york-times + - bbc-news + - bloomberg + - the-guardian-uk + - the-hindu + - the-times-of-india + +# Posts shown +Limit: 10 + +Apikey: bda5818cc2af461e98330ccdf6fb9cbe diff --git a/config_reader.py b/config_reader.py deleted file mode 100644 index 
0f36639..0000000 --- a/config_reader.py +++ /dev/null @@ -1,16 +0,0 @@ -import yaml - - -class ConfigurationReader: - def __init__(self): - with open('config.yml') as ymlfile: - cfg = yaml.load(ymlfile) - self.APIKEY = cfg['Apikey'] - self.limit = cfg['Limit'] - self.websites_supported = cfg['WebsiteSupported'] - - # TODO: Move to using this, and reading it from env, config, defaults - self.user_agent = cfg.get('User-Agent', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' - ' AppleWebKit/537.36 (KHTML, like Gecko ' - 'Chrome/59.0.3071.115 Safari/537.36') diff --git a/extract_main_content.py b/extract_main_content.py deleted file mode 100644 index d91ea78..0000000 --- a/extract_main_content.py +++ /dev/null @@ -1,61 +0,0 @@ -import requests -from config_reader import ConfigurationReader -from extractor import * -import textwrap - - -class ExtractMainContent: - def __init__(self, source, articleurl): - self.extractorlist = [HuffingtonPost(), NYT(), BBC( - ), BloomBerg(), Guardian(), TheHindu(), TimesOfIndia()] - websites = ConfigurationReader().websites_supported - self.Mapping = {} - for index, website in enumerate(websites): - self.Mapping[website] = self.extractorlist[index] - self.Source = source - self.url = articleurl - self.textWrap = textwrap.TextWrapper( - initial_indent='\t', subsequent_indent='\t', width=100) - - def download(self): - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/59.0.3071.115 Safari/537.36'} - req = requests.get(self.url, headers=headers) - return req.text - - # unused, but may be useful in the future - # def AddExtractorList(self, extractor): - # self.extractorlist.append(extractor) - - def _extract(self): - self.ExtractStrategy = self.Mapping[self.Source] - text = self.download() - return self.ExtractStrategy.extractor(text) - - def beautify(self): - title, output = self._extract() - print("=" * (len(title) + 15)) - print("\t" + title) - print("=" * (len(title) 
+ 15)) - - print((self.textWrap.fill(output))) # wrap of the line - print("*" * 80) - if len(output) == 0: - print("Sorry :(") - print("There isn't much text on the site besides video/image. To " - "further view the media post, Go to the below link") - print(self.url) - print('*' * 80) - print("\n\n") - - def save(self): - title, output = self._extract() - - # Remove Chars not allowed in filenames - for char in ['<', '>', "/", ":", '"', "\\", "|", "?", "*"]: - if char in title: - title = title.replace(char, "") - - with open(f'saved_articles/{title}.txt', "w+") as f: - f.write(output) diff --git a/extractor.py b/extractor.py deleted file mode 100644 index ee985d5..0000000 --- a/extractor.py +++ /dev/null @@ -1,133 +0,0 @@ -from bs4 import BeautifulSoup - - -class Extractor: - - def extractor(self, text): - pass - - def _extraction_algo(self, text, htmlelement, classname): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - result = [] - # print soup - maincontent = soup.find_all(htmlelement, class_=classname) - # print maincontent - for content in maincontent: - scripttags = content.find_all(["script", "br", "figure", "image"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - result.append(content.text) - result = ''.join(result) - return (title, result) - - -class HuffingtonPost(Extractor): - """class for Huffington Post parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "content-list-component text") - - -class NYT(Extractor): - """class for New York Times parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "p", "story-body-text story-content") - - -class BBC(Extractor): - """class for BBC News parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "story-body__inner") - 
- -class BloomBerg(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "body-copy") - - -class Guardian(Extractor): - """class for Guardian parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all( - "div", class_="content__article-body from-content-api js-article__body") - # print maincontent - for content in maincontent: - scripttags = content.find_all(["script", "br", "figure", "image"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result = ''.join(Result) - return (title, Result) - - -class TheHindu(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all("div", class_="article") - # print maincontent - for content in maincontent: - scripttags = content.find_all( - ["script", "br", "figure", "image", "span"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result = ''.join(Result) - return (title, Result) - - -class TimesOfIndia(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all("div", class_="Normal") - # print maincontent - for content in maincontent: - # print content.text - Result.append(content.text) - Result = ''.join(Result) - return (title, 
Result) diff --git a/news.py b/news.py deleted file mode 100644 index 64bf84f..0000000 --- a/news.py +++ /dev/null @@ -1,111 +0,0 @@ -import sys -from enum import Enum - -from news_pulling import NewsPulling -from config_reader import ConfigurationReader -from extract_main_content import ExtractMainContent - - -class SelectionStatus(Enum): - BACK = 1 - EXIT = 2 - READ = 3 - - -def news_sources(): - news_sources = ConfigurationReader().websites_supported - return news_sources - - -def display_sources(sources): - for index, source in enumerate(sources): - print(f'[{index + 1}]\t{source}') - print("\nPlease enter the index of the news source or type 'quit' to exit") - - -def display_title_banner(): - # Cool Title/Banner - print("=" * 40) - print("\tNews at the Command Line") - print("=" * 40) - print() - - -def prompt_for_source(sources): - while True: - display_sources(sources) - source_choice = input("News Source Number >>>> ") - # Quit - if(source_choice.lower() == "quit"): - sys.exit() - try: - source_choice = int(source_choice) - 1 - if(source_choice >= len(sources) or source_choice < 0): - print("Please select an index between 1-" + - str(len(sources))) - else: - return source_choice - except ValueError: - print("That is not a valid News Source Number") - - -def prompt_for_article(max=0): - print("Do you want to read a story further? 
If yes, please select the" - "number corresponding to the article") - print("Enter 'back' to go back to the main menu") - print("Press 'quit' to quit") - while True: - article_selection = input("Article No >>>> ") - - # Back - if(article_selection.lower()[0] == 'b'): - return SelectionStatus.BACK, None - # Exit - elif(article_selection.lower()[0] == 'q'): - return SelectionStatus.EXIT, None - - article_selection = int(article_selection) - if 0 > article_selection - 1 or article_selection > max: - print(f'Please select an index between 1-{max}.') - else: - return SelectionStatus.READ, article_selection - 1 - - -def prompt_for_save(): - while True: - print("Do you want to save this article in file") - selection = str(input("Want to save? y/n >>> ")) - if selection[0].lower() == 'y': - return True - elif selection[0].lower() == 'n': - return False - - -def main(): - display_title_banner() - - while True: - sources = news_sources() - source_choice = prompt_for_source(sources) - - while True: - puller = NewsPulling(sources[source_choice]) - articles = puller.beautify_articles() - status, article_selection = prompt_for_article(max=len(articles)) - if status == SelectionStatus.EXIT: - sys.exit() - elif status == SelectionStatus.BACK: - break - else: - print("\n" * 5) - extr = ExtractMainContent( - sources[source_choice], articles[article_selection][2]) - extr.beautify() - - if prompt_for_save(): - extr.save() - print("File saved!\n") - - -if __name__ == "__main__": - main() diff --git a/news/extract_main_content.py b/news/extract_main_content.py index 2f00438..791ff93 100644 --- a/news/extract_main_content.py +++ b/news/extract_main_content.py @@ -1,8 +1,9 @@ import requests -from config_reader import ConfigurationReader -from extractor import * import textwrap +from .config_reader import ConfigurationReader +from .extractor import * + class ExtractMainContent: def __init__(self, source, articleurl): diff --git a/news/news.py b/news/news.py index 5448741..d74ec56 100755 
--- a/news/news.py +++ b/news/news.py @@ -3,9 +3,9 @@ import sys from enum import Enum -from news_pulling import NewsPulling -from config_reader import ConfigurationReader -from extract_main_content import ExtractMainContent +from .news_pulling import NewsPulling +from .config_reader import ConfigurationReader +from .extract_main_content import ExtractMainContent class SelectionStatus(Enum): diff --git a/news/news_pulling.py b/news/news_pulling.py index 613829d..c0c272b 100644 --- a/news/news_pulling.py +++ b/news/news_pulling.py @@ -3,7 +3,7 @@ import requests from requests import ConnectionError -from config_reader import ConfigurationReader +from .config_reader import ConfigurationReader class NewsPulling: diff --git a/news_pulling.py b/news_pulling.py deleted file mode 100644 index 65c8501..0000000 --- a/news_pulling.py +++ /dev/null @@ -1,92 +0,0 @@ -import sys - -import requests -from requests import ConnectionError - -from config_reader import ConfigurationReader - - -class NewsPulling: - """This class is used to pull news from the internet depending on the source specified """ - - def __init__(self, newsSource): - self.Source = newsSource - - def pull_news(self): - config = ConfigurationReader() - self.__APIKey = config.APIKEY - self.__Limit = config.limit - url = 'https://newsapi.org/v1/articles?source=' + \ - self.Source + '&sortBy=top&apiKey=' + self.__APIKey - try: - req = requests.get(url) - if(req.status_code == 200): - return req - else: - print( - "There is some issue in connecting to the internet. 
Please check your firewall or internet") - except ConnectionError as e: - print("A connection Attempt failed") - print(e.message) - sys.exit() - - def json_read(self): - req = self.pull_news() - # indicate if we need to convert to utf-8 - needsconversion = False - if req.encoding != 'utf-8': - needsconversion = True - req = req.json() - articles = req['articles'] - noofarticles = len(articles) - maxarticles = min(noofarticles, self.__Limit) - - FilteredArticles = [] - - for i in range(maxarticles): - article = articles[i] - if needsconversion: - description = str(article['description'], 'utf-8') - # print description - title = str(article['title'], 'utf-8') - Article_url = str(article['url'], 'utf-8') - DateofPublication = str(article['publishedAt'], 'utf-8') - Author = str(article['author'], 'utf-8') - FilteredArticles.append( - [description, title, Article_url, DateofPublication, Author]) - else: - description = article['description'] - # print description - title = article['title'] - Article_url = article['url'] - DateofPublication = article['publishedAt'] - Author = article['author'] - FilteredArticles.append( - [description, title, Article_url, DateofPublication, Author]) - return FilteredArticles - - def beautify_articles(self): - self.Articles = self.json_read() - if self.Articles is None or len(self.Articles) == 0: - print("No articles found") - sys.exit() - print("\n" + ("=" * 16) + " STORIES " + ("=" * 16)) - for i in range(len(self.Articles)): - print("[" + str(i + 1) + "]", end=' ') - # Title - if self.Articles[i][1] is not None: - print("\t" + self.Articles[i][1]) - # Summary - if self.Articles[i][0] is not None: - # Limit Summary Size - summary = self.Articles[i][0][:85] + \ - (self.Articles[i][0][85:] and '...') - print("\t" + summary) - # Author - if self.Articles[i][4] is not None: - print("\t" + self.Articles[i][4]) - # Date - if self.Articles[i][3] is not None: - print("\t" + self.Articles[i][3] + "\n") - print("=" * 40) - return self.Articles 
diff --git a/requirements.txt b/requirements.txt index 345cd29..f3917db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ idna==2.6 PyYAML==3.12 requests==2.18.4 urllib3==1.22 +wheel diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..562f227 --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +from setuptools import setup, find_packages +from os import path + +from __version__ import __version__ +here = path.abspath(path.dirname(__file__)) + +with open('README.md') as f: + long_description = f.read() + +setup( + name="News At The Command Line", + version=__version__, + description="Read your news on your favourite terminal", + author="Ankit Singh", + packages=['news'], + package_dir={'news': 'news'}, + long_description=long_description, + + install_requires=[ + 'bs4>=0.0.1', + 'beautifulsoup4>=4.6.0', + 'PyYAML>=3.12', + 'requests>=2.18.4', + ], + + license='MIT', + entry_points={ + 'console_scripts': [ + 'newsctl=news.news:main' + ] + }, + classifiers=[ + 'Environment :: Console', + 'Intended Audience :: End Users/Desktop', + ] +) From caaab64978a340649e09c7c14ac36adb2b267d3f Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: Tue, 20 Feb 2018 21:08:17 -0600 Subject: [PATCH 3/7] readme and todo --- README.md | 16 +++++++++------- TODO.md | 5 +++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index e28ea0c..45c6ef9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -## Please note that I have made changes that invalidate the information in the readme. To run create the saved_articles directory, if it isn't pulled from the repo, then use Python 3.6+ and run `python3 news.py`. I'll fix the readme soon, and provide a better way to install and run. Thanks. 
- Tom - # News at the Command line ### Want to be kept updated without visiting the news portals every now and then @@ -9,11 +7,16 @@ # Modules Requirements - **Python 3.6+** -- **Requests** -- **Beautiful Soup** +- **Requests** +- **Beautiful Soup** - **PyYAML** -To install the module dependencies before running the application, simply navigate into the project folder and run `pip install -r requirements.txt`. +# Installation +1. `git clone` the repository, preferably into a virtual environment. +2. Copy `config.yml` into your home directory. +3. Run with `newsctl` + +At present `config.yml` is only read from pwd when the script is run, I'll fix this soon. # Working - All sample input images are placed under the **Images** folder. @@ -21,7 +24,7 @@ To install the module dependencies before running the application, simply naviga # How To Use Make sure you have installed required libraries, instructions above. - Just run the main.py, do this by typing `py main.py`. + Just run the main.py, do this by typing `py main.py`. The rest is quite straight forward. # Contributing @@ -32,4 +35,3 @@ Please open an issue on GitHub if you'd like to report a bug or request a featur ## License The code is released under MIT license and free to use. 
- diff --git a/TODO.md b/TODO.md index 3795fbf..7a3ebe3 100644 --- a/TODO.md +++ b/TODO.md @@ -1,6 +1,7 @@ -* Move all files to proper locations (./news) +* ~~ Move all files to proper locations (./news) ~~ +* Read config.yml from a default location, or ENV variable location * Test all the things -* Update README +* Update README (further updates) * Read from environment variables + config + command line args * Edit configuration in program * Move to plugins for news sources From 7bfaab89927e2e8d6f780325f9bf91c1f3455e39 Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: Tue, 20 Feb 2018 21:20:02 -0600 Subject: [PATCH 4/7] Update TODO.md --- TODO.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index 7a3ebe3..e3d83e0 100644 --- a/TODO.md +++ b/TODO.md @@ -1,4 +1,4 @@ -* ~~ Move all files to proper locations (./news) ~~ +* ~~Move all files to proper locations (./news)~~ * Read config.yml from a default location, or ENV variable location * Test all the things * Update README (further updates) From 4db0e8fb9b4bc621a70f560cd085042afecbf8df Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: Tue, 20 Feb 2018 21:21:59 -0600 Subject: [PATCH 5/7] Delete TODO.md --- TODO.md | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 TODO.md diff --git a/TODO.md b/TODO.md deleted file mode 100644 index e3d83e0..0000000 --- a/TODO.md +++ /dev/null @@ -1,11 +0,0 @@ -* ~~Move all files to proper locations (./news)~~ -* Read config.yml from a default location, or ENV variable location -* Test all the things -* Update README (further updates) -* Read from environment variables + config + command line args -* Edit configuration in program -* Move to plugins for news sources -* Move all prompts to string constants file for easy changes and translations -* Dependency inject BeautifulSoup in extractor - -* Consider REST client / server architecture From 0615d64ad0002a6af05f9c854b982dfb5bc838da Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: 
Tue, 20 Feb 2018 23:13:06 -0600 Subject: [PATCH 6/7] plugin system in place, config moved --- __version__.py | 1 - news/__version__.py | 2 + news/config_reader.py | 26 +++- news/constants.py | 6 + news/extract_main_content.py | 17 ++- news/extractor.py | 133 --------------------- news/news.py | 5 +- news/news_pulling.py | 16 ++- news/reader.py | 19 +++ news/reader_plugins/bbc.py | 9 ++ news/reader_plugins/bloomberg.py | 7 ++ news/reader_plugins/guardian.py | 26 ++++ news/reader_plugins/hindu.py | 24 ++++ news/reader_plugins/huffington_post.py | 8 ++ news/reader_plugins/new_york_times.py | 8 ++ news/reader_plugins/plugin_registration.py | 17 +++ news/reader_plugins/times_of_india.py | 19 +++ requirements.txt | 1 + setup.py | 2 +- 19 files changed, 190 insertions(+), 156 deletions(-) delete mode 100644 __version__.py create mode 100644 news/__version__.py create mode 100644 news/constants.py delete mode 100644 news/extractor.py create mode 100644 news/reader.py create mode 100644 news/reader_plugins/bbc.py create mode 100644 news/reader_plugins/bloomberg.py create mode 100644 news/reader_plugins/guardian.py create mode 100644 news/reader_plugins/hindu.py create mode 100644 news/reader_plugins/huffington_post.py create mode 100644 news/reader_plugins/new_york_times.py create mode 100644 news/reader_plugins/plugin_registration.py create mode 100644 news/reader_plugins/times_of_india.py diff --git a/__version__.py b/__version__.py deleted file mode 100644 index b8023d8..0000000 --- a/__version__.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = '0.0.1' diff --git a/news/__version__.py b/news/__version__.py new file mode 100644 index 0000000..75c1d9c --- /dev/null +++ b/news/__version__.py @@ -0,0 +1,2 @@ +__app_name__ = 'newsctl' +__version__ = '0.0.1' diff --git a/news/config_reader.py b/news/config_reader.py index 26831b3..d68755d 100644 --- a/news/config_reader.py +++ b/news/config_reader.py @@ -1,13 +1,29 @@ +import os +from contextlib import suppress + import yaml 
+from appdirs import AppDirs + +from .__version__ import __app_name__ +from .constants import constants + +dirs = AppDirs(__app_name__) class ConfigurationReader: def __init__(self): - with open('config.yml') as ymlfile: - cfg = yaml.load(ymlfile) - self.APIKEY = cfg['Apikey'] - self.limit = cfg['Limit'] - self.websites_supported = cfg['WebsiteSupported'] + try: + with open(f'{dirs.user_config_dir}/config.yml') as ymlfile: + cfg = yaml.load(ymlfile) + except FileNotFoundError: + with suppress(FileExistsError): + os.makedirs(dirs.user_config_dir) + with open(f'{dirs.user_config_dir}/config.yml', 'w') as ymlfile: + ymlfile.write(yaml.dump(constants['config_defaults'])) + cfg = constants['config_defaults'] + + self.APIKEY = cfg['api_key'] + self.limit = cfg['article_limit'] # TODO: Move to using this, and reading it from env, config, defaults self.user_agent = cfg.get('User-Agent', diff --git a/news/constants.py b/news/constants.py new file mode 100644 index 0000000..cb258e2 --- /dev/null +++ b/news/constants.py @@ -0,0 +1,6 @@ +constants = { + 'config_defaults': { + 'api_key': 'bda5818cc2af461e98330ccdf6fb9cbe', + 'article_limit': 10, + } +} diff --git a/news/extract_main_content.py b/news/extract_main_content.py index 791ff93..576a3da 100644 --- a/news/extract_main_content.py +++ b/news/extract_main_content.py @@ -2,18 +2,17 @@ import textwrap from .config_reader import ConfigurationReader -from .extractor import * +from .reader_plugins.plugin_registration import sites +from .reader import Reader class ExtractMainContent: def __init__(self, source, articleurl): - self.extractorlist = [HuffingtonPost(), NYT(), BBC( - ), BloomBerg(), Guardian(), TheHindu(), TimesOfIndia()] - websites = ConfigurationReader().websites_supported - self.Mapping = {} - for index, website in enumerate(websites): - self.Mapping[website] = self.extractorlist[index] - self.Source = source + + self.mapping = {} + for index, website in enumerate(sites): + self.mapping[website] = 
self.extractorlist[index] + self.source = source self.url = articleurl self.textWrap = textwrap.TextWrapper( initial_indent='\t', subsequent_indent='\t', width=100) @@ -30,7 +29,7 @@ def download(self): # self.extractorlist.append(extractor) def _extract(self): - self.ExtractStrategy = self.Mapping[self.Source] + self.ExtractStrategy = self.mapping[self.source] text = self.download() return self.ExtractStrategy.extractor(text) diff --git a/news/extractor.py b/news/extractor.py deleted file mode 100644 index 8e1ac53..0000000 --- a/news/extractor.py +++ /dev/null @@ -1,133 +0,0 @@ -from bs4 import BeautifulSoup - - -class Extractor: - - def extractor(self, text): - pass - - def _extraction_algo(self, text, htmlelement, classname): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - result = [] - # print soup - maincontent = soup.find_all(htmlelement, class_=classname) - # print maincontent - for content in maincontent: - scripttags = content.find_all(["script", "br", "figure", "image"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - result.append(content.text) - result = ''.join(result) - return (title, result) - - -class HuffingtonPost(Extractor): - """class for Huffington Post parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "content-list-component text") - - -class NYT(Extractor): - """class for New York Times parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "p", "story-body-text story-content") - - -class BBC(Extractor): - """class for BBC News parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "story-body__inner") - - -class BloomBerg(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, 
text): - return self._extraction_algo(text, "div", "body-copy") - - -class Guardian(Extractor): - """class for Guardian parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all( - "div", class_="content__article-body from-content-api js-article__body") - # print maincontent - for content in maincontent: - scripttags = content.find_all(["script", "br", "figure", "image"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result = ''.join(Result) - return (title, Result) - - -class TheHindu(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all("div", class_="article") - # print maincontent - for content in maincontent: - scripttags = content.find_all( - ["script", "br", "figure", "image", "span"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result = ''.join(Result) - return (title, Result) - - -class TimesOfIndia(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all("div", class_="Normal") - # print maincontent - for content in maincontent: - # print content.text - Result.append(content.text) - Result = ''.join(Result) - return (title, Result) diff --git a/news/news.py b/news/news.py index d74ec56..3f5421a 100755 --- a/news/news.py +++ b/news/news.py @@ -4,8 +4,8 @@ from enum 
import Enum from .news_pulling import NewsPulling -from .config_reader import ConfigurationReader from .extract_main_content import ExtractMainContent +from .reader_plugins.plugin_registration import sites class SelectionStatus(Enum): @@ -15,7 +15,8 @@ class SelectionStatus(Enum): def news_sources(): - news_sources = ConfigurationReader().websites_supported + news_sources = tuple(sites.keys()) + print('news sources', news_sources) return news_sources diff --git a/news/news_pulling.py b/news/news_pulling.py index c0c272b..b80015e 100644 --- a/news/news_pulling.py +++ b/news/news_pulling.py @@ -5,21 +5,27 @@ from .config_reader import ConfigurationReader +# TODO: Take reader plugin, read name from it for source in pull_news +# TODO: Rewrite this entire module +# TODO: Rewrite everything +# TODO: make newsapi.org replaceable, especially since there is a v2 api + class NewsPulling: - """This class is used to pull news from the internet depending on the source specified """ + """Pull news from the internet depending on the source specified.""" - def __init__(self, newsSource): - self.Source = newsSource + def __init__(self, source): + self.source = source def pull_news(self): config = ConfigurationReader() - self.__APIKey = config.APIKEY self.__Limit = config.limit url = 'https://newsapi.org/v1/articles?source=' + \ - self.Source + '&sortBy=top&apiKey=' + self.__APIKey + self.source + '&sortBy=top&apiKey=' + config.APIKEY + print(url) try: req = requests.get(url) + print(req) if(req.status_code == 200): return req else: diff --git a/news/reader.py b/news/reader.py new file mode 100644 index 0000000..f2fcc0a --- /dev/null +++ b/news/reader.py @@ -0,0 +1,19 @@ +from bs4 import BeautifulSoup + + +class Reader: + def _extraction_algo(self, text, htmlelement, classname): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + result = [] + # print soup + maincontent = soup.find_all(htmlelement, class_=classname) + # print maincontent + for content in 
maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + result.append(content.text) + result = ''.join(result) + return (title, result) diff --git a/news/reader_plugins/bbc.py b/news/reader_plugins/bbc.py new file mode 100644 index 0000000..bafb167 --- /dev/null +++ b/news/reader_plugins/bbc.py @@ -0,0 +1,9 @@ +from news.reader import Reader + + +class BBC(Reader): + """class for BBC News parsing""" + source_name = 'bbc-news' + + def extractor(self, text): + return self._extraction_algo(text, "div", "story-body__inner") diff --git a/news/reader_plugins/bloomberg.py b/news/reader_plugins/bloomberg.py new file mode 100644 index 0000000..848060d --- /dev/null +++ b/news/reader_plugins/bloomberg.py @@ -0,0 +1,7 @@ +from news.reader import Reader + + +class Bloomberg(Reader): + """class for BloomBerg parsing""" + def extractor(self, text): + return self._extraction_algo(text, "div", "body-copy") diff --git a/news/reader_plugins/guardian.py b/news/reader_plugins/guardian.py new file mode 100644 index 0000000..a05377f --- /dev/null +++ b/news/reader_plugins/guardian.py @@ -0,0 +1,26 @@ +from bs4 import BeautifulSoup + +from news.reader import Reader + + +class Guardian(Reader): + """class for Guardian parsing""" + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all( + "div", + class_="content__article-body from-content-api js-article__body" + ) + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) diff --git a/news/reader_plugins/hindu.py b/news/reader_plugins/hindu.py new file mode 100644 index 
0000000..6e32f65 --- /dev/null +++ b/news/reader_plugins/hindu.py @@ -0,0 +1,24 @@ +from bs4 import BeautifulSoup + +from news.reader import Reader + + +class TheHindu(Reader): + """class for The Hindu parsing""" + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="article") + # print maincontent + for content in maincontent: + scripttags = content.find_all( + ["script", "br", "figure", "image", "span"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) diff --git a/news/reader_plugins/huffington_post.py b/news/reader_plugins/huffington_post.py new file mode 100644 index 0000000..4d4be25 --- /dev/null +++ b/news/reader_plugins/huffington_post.py @@ -0,0 +1,8 @@ +from news.reader import Reader + + +class HuffingtonPost(Reader): + """class for Huffington Post parsing""" + def extractor(self, text): + return self._extraction_algo(text, "div", + "content-list-component text") diff --git a/news/reader_plugins/new_york_times.py b/news/reader_plugins/new_york_times.py new file mode 100644 index 0000000..b12576c --- /dev/null +++ b/news/reader_plugins/new_york_times.py @@ -0,0 +1,8 @@ +from news.reader import Reader + + +class NYT(Reader): + """class for New York Times parsing""" + def extractor(self, text): + return self._extraction_algo(text, "p", + "story-body-text story-content") diff --git a/news/reader_plugins/plugin_registration.py b/news/reader_plugins/plugin_registration.py new file mode 100644 index 0000000..d987cf2 --- /dev/null +++ b/news/reader_plugins/plugin_registration.py @@ -0,0 +1,17 @@ +from news.reader_plugins.huffington_post import HuffingtonPost +from news.reader_plugins.new_york_times import NYT +from news.reader_plugins.bbc import BBC +from news.reader_plugins.bloomberg 
import Bloomberg +from news.reader_plugins.guardian import Guardian +from news.reader_plugins.hindu import TheHindu +from news.reader_plugins.times_of_india import TimesOfIndia + +sites = { + 'Huffington Post': HuffingtonPost, + 'New York Times': NYT, + 'BBC': BBC, + 'Bloomberg': Bloomberg, + 'Guardian': Guardian, + 'The Hindu': TheHindu, + 'Times of India': TimesOfIndia +} diff --git a/news/reader_plugins/times_of_india.py b/news/reader_plugins/times_of_india.py new file mode 100644 index 0000000..266a4ad --- /dev/null +++ b/news/reader_plugins/times_of_india.py @@ -0,0 +1,19 @@ +from bs4 import BeautifulSoup + +from news.reader import Reader + + +class TimesOfIndia(Reader): + """class for Times of India parsing""" + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="Normal") + # print maincontent + for content in maincontent: + # print content.text + Result.append(content.text) + Result = ''.join(Result) + return (title, Result) diff --git a/requirements.txt b/requirements.txt index f3917db..4d4e19c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ PyYAML==3.12 requests==2.18.4 urllib3==1.22 wheel +appdirs==1.4.3 diff --git a/setup.py b/setup.py index 562f227..83b1adf 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages from os import path -from __version__ import __version__ +from news.__version__ import __version__ here = path.abspath(path.dirname(__file__)) with open('README.md') as f: From bc12fae1396ccbc46f5e7b6d6f2b302332108826 Mon Sep 17 00:00:00 2001 From: Tom Faulkner Date: Thu, 22 Feb 2018 21:13:13 -0600 Subject: [PATCH 7/7] updated plugins, more overhaul, a bit of pep8 --- news/config_reader.py | 6 ------ news/extract_main_content.py | 13 +------------ news/news.py | 5 +++-- news/news_pulling.py | 21 ++++++++------------- news/reader_plugins/bloomberg.py | 2 ++ 
news/reader_plugins/guardian.py | 2 ++ news/reader_plugins/hindu.py | 2 ++ news/reader_plugins/huffington_post.py | 2 ++ news/reader_plugins/new_york_times.py | 2 ++ news/reader_plugins/times_of_india.py | 2 ++ 10 files changed, 24 insertions(+), 33 deletions(-) diff --git a/news/config_reader.py b/news/config_reader.py index d68755d..95b6132 100644 --- a/news/config_reader.py +++ b/news/config_reader.py @@ -24,9 +24,3 @@ def __init__(self): self.APIKEY = cfg['api_key'] self.limit = cfg['article_limit'] - - # TODO: Move to using this, and reading it from env, config, defaults - self.user_agent = cfg.get('User-Agent', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' - ' AppleWebKit/537.36 (KHTML, like Gecko ' - 'Chrome/59.0.3071.115 Safari/537.36') diff --git a/news/extract_main_content.py b/news/extract_main_content.py index 576a3da..2b9f1c2 100644 --- a/news/extract_main_content.py +++ b/news/extract_main_content.py @@ -1,17 +1,11 @@ import requests import textwrap -from .config_reader import ConfigurationReader from .reader_plugins.plugin_registration import sites -from .reader import Reader class ExtractMainContent: def __init__(self, source, articleurl): - - self.mapping = {} - for index, website in enumerate(sites): - self.mapping[website] = self.extractorlist[index] self.source = source self.url = articleurl self.textWrap = textwrap.TextWrapper( @@ -24,14 +18,9 @@ def download(self): req = requests.get(self.url, headers=headers) return req.text - # unused, but may be useful in the future - # def AddExtractorList(self, extractor): - # self.extractorlist.append(extractor) - def _extract(self): - self.ExtractStrategy = self.mapping[self.source] text = self.download() - return self.ExtractStrategy.extractor(text) + return sites[self.source]().extractor(text) def beautify(self): title, output = self._extract() diff --git a/news/news.py b/news/news.py index 3f5421a..305cba0 100755 --- a/news/news.py +++ b/news/news.py @@ -16,7 +16,6 @@ class SelectionStatus(Enum): def 
news_sources(): news_sources = tuple(sites.keys()) - print('news sources', news_sources) return news_sources @@ -92,7 +91,9 @@ def main(): source_choice = prompt_for_source(sources) while True: - puller = NewsPulling(sources[source_choice]) + # TODO: This is ugly, but functional. + # Getting the name of the source as used in the API from the plugin. + puller = NewsPulling(sites[sources[source_choice]]().source_name) articles = puller.beautify_articles() status, article_selection = prompt_for_article(max=len(articles)) if status == SelectionStatus.EXIT: diff --git a/news/news_pulling.py b/news/news_pulling.py index b80015e..958582d 100644 --- a/news/news_pulling.py +++ b/news/news_pulling.py @@ -5,11 +5,6 @@ from .config_reader import ConfigurationReader -# TODO: Take reader plugin, read name from it for source in pull_news -# TODO: Rewrite this entire module -# TODO: Rewrite everything -# TODO: make newsapi.org replaceable, especially since there is a v2 api - class NewsPulling: """Pull news from the internet depending on the source specified.""" @@ -22,17 +17,16 @@ def pull_news(self): self.__Limit = config.limit url = 'https://newsapi.org/v1/articles?source=' + \ self.source + '&sortBy=top&apiKey=' + config.APIKEY - print(url) try: req = requests.get(url) print(req) - if(req.status_code == 200): + if req.status_code == 200: return req else: - print( - "There is some issue in connecting to the internet. Please check your firewall or internet") + print("There is some issue in connecting to the internet." 
+ "Please check your firewall or internet") except ConnectionError as e: - print("A connection Attempt failed") + print("A connection attempt failed") print(e.message) sys.exit() @@ -58,8 +52,8 @@ def json_read(self): Article_url = str(article['url'], 'utf-8') DateofPublication = str(article['publishedAt'], 'utf-8') Author = str(article['author'], 'utf-8') - FilteredArticles.append( - [description, title, Article_url, DateofPublication, Author]) + FilteredArticles.append([description, title, Article_url, + DateofPublication, Author]) else: description = article['description'] # print description @@ -68,7 +62,8 @@ def json_read(self): DateofPublication = article['publishedAt'] Author = article['author'] FilteredArticles.append( - [description, title, Article_url, DateofPublication, Author]) + [description, title, Article_url, + DateofPublication, Author]) return FilteredArticles def beautify_articles(self): diff --git a/news/reader_plugins/bloomberg.py b/news/reader_plugins/bloomberg.py index 848060d..893573b 100644 --- a/news/reader_plugins/bloomberg.py +++ b/news/reader_plugins/bloomberg.py @@ -3,5 +3,7 @@ class Bloomberg(Reader): """class for BloomBerg parsing""" + source_name = 'bloomberg' + def extractor(self, text): return self._extraction_algo(text, "div", "body-copy") diff --git a/news/reader_plugins/guardian.py b/news/reader_plugins/guardian.py index a05377f..82f7a1b 100644 --- a/news/reader_plugins/guardian.py +++ b/news/reader_plugins/guardian.py @@ -5,6 +5,8 @@ class Guardian(Reader): """class for Guardian parsing""" + source_name = 'the-guardian-uk' + def extractor(self, text): soup = BeautifulSoup(text, 'html.parser') title = soup.title.string diff --git a/news/reader_plugins/hindu.py b/news/reader_plugins/hindu.py index 6e32f65..e9c3bed 100644 --- a/news/reader_plugins/hindu.py +++ b/news/reader_plugins/hindu.py @@ -5,6 +5,8 @@ class TheHindu(Reader): """class for The Hindu parsing""" + source_name = 'the-hindu' + def extractor(self, text): soup = 
BeautifulSoup(text, 'html.parser') title = soup.title.string diff --git a/news/reader_plugins/huffington_post.py b/news/reader_plugins/huffington_post.py index 4d4be25..6ba7403 100644 --- a/news/reader_plugins/huffington_post.py +++ b/news/reader_plugins/huffington_post.py @@ -3,6 +3,8 @@ class HuffingtonPost(Reader): """class for Huffington Post parsing""" + source_name = 'the-huffington-post' + def extractor(self, text): return self._extraction_algo(text, "div", "content-list-component text") diff --git a/news/reader_plugins/new_york_times.py b/news/reader_plugins/new_york_times.py index b12576c..641c766 100644 --- a/news/reader_plugins/new_york_times.py +++ b/news/reader_plugins/new_york_times.py @@ -2,6 +2,8 @@ class NYT(Reader): + source_name = 'the-new-york-times' + """class for New York Times parsing""" def extractor(self, text): return self._extraction_algo(text, "p", diff --git a/news/reader_plugins/times_of_india.py b/news/reader_plugins/times_of_india.py index 266a4ad..37b11b9 100644 --- a/news/reader_plugins/times_of_india.py +++ b/news/reader_plugins/times_of_india.py @@ -5,6 +5,8 @@ class TimesOfIndia(Reader): """class for Times of India parsing""" + source_name = 'the-times-of-india' + def extractor(self, text): soup = BeautifulSoup(text, 'html.parser') title = soup.title.string