diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..fee71c8 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,2 @@ +Ankit Singh https://github.com/Griffintaur +Tom Faulkner https://github.com/TomFaulkner diff --git a/README.md b/README.md index e28ea0c..45c6ef9 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,3 @@ -## Please note that I have made changes that invalidate the information in the readme. To run create the saved_articles directory, if it isn't pulled from the repo, then use Python 3.6+ and run `python3 news.py`. I'll fix the readme soon, and provide a better way to install and run. Thanks. - Tom - # News at the Command line ### Want to be kept updated without visiting the news portals every now and then @@ -9,11 +7,16 @@ # Modules Requirements - **Python 3.6+** -- **Requests** -- **Beautiful Soup** +- **Requests** +- **Beautiful Soup** - **PyYAML** -To install the module dependencies before running the application, simply navigate into the project folder and run `pip install -r requirements.txt`. +# Installation +1. `git clone` the repository, preferably into a virtual environment. +2. Copy `config.yml` into your home directory. +3. Run with `newsctl` + +At present `config.yml` is only read from pwd when the script is run, I'll fix this soon. # Working - All sample input images are placed under the **Images** folder. @@ -21,7 +24,7 @@ To install the module dependencies before running the application, simply naviga # How To Use Make sure you have installed required libraries, instructions above. - Just run the main.py, do this by typing `py main.py`. + Just run the main.py, do this by typing `py main.py`. The rest is quite straight forward. # Contributing @@ -32,4 +35,3 @@ Please open an issue on GitHub if you'd like to report a bug or request a featur ## License The code is released under MIT license and free to use. - diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 3795fbf..0000000 --- a/TODO.md +++ /dev/null @@ -1,10 +0,0 @@ -* Move all files to proper locations (./news) -* Test all the things -* Update README -* Read from environment variables + config + command line args -* Edit configuration in program -* Move to plugins for news sources -* Move all prompts to string constants file for easy changes and translations -* Dependency inject BeautifulSoup in extractor - -* Consider REST client / server architecture diff --git a/config.yml b/config.yml index b11ddb4..7421ceb 100644 --- a/config.yml +++ b/config.yml @@ -1,15 +1,13 @@ -WebsiteSupported: - - the-huffington-post - - the-new-york-times - - bbc-news - - bloomberg - - the-guardian-uk - - the-hindu - - the-times-of-india - -# Posts shown -Limit: 10 - -Apikey: bda5818cc2af461e98330ccdf6fb9cbe - - \ No newline at end of file +WebsiteSupported: + - the-huffington-post + - the-new-york-times + - bbc-news + - bloomberg + - the-guardian-uk + - the-hindu + - the-times-of-india + +# Posts shown +Limit: 10 + +Apikey: bda5818cc2af461e98330ccdf6fb9cbe diff --git a/config_reader.py b/config_reader.py deleted file mode 100644 index 0f36639..0000000 --- a/config_reader.py +++ /dev/null @@ -1,16 +0,0 @@ -import yaml - - -class ConfigurationReader: - def __init__(self): - with open('config.yml') as ymlfile: - cfg = yaml.load(ymlfile) - self.APIKEY = cfg['Apikey'] - self.limit = cfg['Limit'] - self.websites_supported = cfg['WebsiteSupported'] - - # TODO: Move to using this, and reading it from env, config, defaults - self.user_agent = cfg.get('User-Agent', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' - ' AppleWebKit/537.36 (KHTML, like Gecko ' - 'Chrome/59.0.3071.115 Safari/537.36') diff --git a/extractor.py b/extractor.py deleted file mode 100644 index ee985d5..0000000 --- a/extractor.py +++ /dev/null @@ -1,133 +0,0 @@ -from bs4 import BeautifulSoup - - -class Extractor: - - def extractor(self, text): - pass - - def _extraction_algo(self, text, htmlelement, classname): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - result = [] - # print soup - maincontent = soup.find_all(htmlelement, class_=classname) - # print maincontent - for content in maincontent: - scripttags = content.find_all(["script", "br", "figure", "image"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - result.append(content.text) - result = ''.join(result) - return (title, result) - - -class HuffingtonPost(Extractor): - """class for Huffington Post parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "content-list-component text") - - -class NYT(Extractor): - """class for New York Times parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "p", "story-body-text story-content") - - -class BBC(Extractor): - """class for BBC News parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "story-body__inner") - - -class BloomBerg(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - return self._extraction_algo(text, "div", "body-copy") - - -class Guardian(Extractor): - """class for Guardian parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all( - "div", class_="content__article-body from-content-api js-article__body") - # print maincontent - for content in maincontent: - scripttags = content.find_all(["script", "br", "figure", "image"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result = ''.join(Result) - return (title, Result) - - -class TheHindu(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all("div", class_="article") - # print maincontent - for content in maincontent: - scripttags = content.find_all( - ["script", "br", "figure", "image", "span"]) - for scripttag in scripttags: - scripttag.extract() - # print content.text - for foundcontent in content.find_all("p"): - Result.append(foundcontent.text) - Result = ''.join(Result) - return (title, Result) - - -class TimesOfIndia(Extractor): - """class for BloomBerg parsing""" - - def __init__(self): - Extractor.__init__(self) - - def extractor(self, text): - soup = BeautifulSoup(text, 'html.parser') - title = soup.title.string - Result = [] - # print soup - maincontent = soup.find_all("div", class_="Normal") - # print maincontent - for content in maincontent: - # print content.text - Result.append(content.text) - Result = ''.join(Result) - return (title, Result) diff --git a/news/__init__.py b/news/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/news/__version__.py b/news/__version__.py new file mode 100644 index 0000000..75c1d9c --- /dev/null +++ b/news/__version__.py @@ -0,0 +1,2 @@ +__app_name__ = 'newsctl' +__version__ = '0.0.1' diff --git a/news/config_reader.py b/news/config_reader.py new file mode 100644 index 0000000..95b6132 --- /dev/null +++ b/news/config_reader.py @@ -0,0 +1,26 @@ +import os +from contextlib import suppress + +import yaml +from appdirs import AppDirs + +from .__version__ import __app_name__ +from .constants import constants + +dirs = AppDirs(__app_name__) + + +class ConfigurationReader: + def __init__(self): + try: + with open(f'{dirs.user_config_dir}/config.yml') as ymlfile: + cfg = yaml.load(ymlfile) + except FileNotFoundError: + with suppress(FileExistsError): + os.makedirs(dirs.user_config_dir) + with open(f'{dirs.user_config_dir}/config.yml', 'w') as ymlfile: + ymlfile.write(yaml.dump(constants['config_defaults'])) + cfg = constants['config_defaults'] + + self.APIKEY = cfg['api_key'] + self.limit = cfg['article_limit'] diff --git a/news/constants.py b/news/constants.py new file mode 100644 index 0000000..cb258e2 --- /dev/null +++ b/news/constants.py @@ -0,0 +1,6 @@ +constants = { + 'config_defaults': { + 'api_key': 'bda5818cc2af461e98330ccdf6fb9cbe', + 'article_limit': 10, + } +} diff --git a/extract_main_content.py b/news/extract_main_content.py similarity index 67% rename from extract_main_content.py rename to news/extract_main_content.py index d91ea78..2b9f1c2 100644 --- a/extract_main_content.py +++ b/news/extract_main_content.py @@ -1,61 +1,50 @@ -import requests -from config_reader import ConfigurationReader -from extractor import * -import textwrap - - -class ExtractMainContent: - def __init__(self, source, articleurl): - self.extractorlist = [HuffingtonPost(), NYT(), BBC( - ), BloomBerg(), Guardian(), TheHindu(), TimesOfIndia()] - websites = ConfigurationReader().websites_supported - self.Mapping = {} - for index, website in enumerate(websites): - self.Mapping[website] = self.extractorlist[index] - self.Source = source - self.url = articleurl - self.textWrap = textwrap.TextWrapper( - initial_indent='\t', subsequent_indent='\t', width=100) - - def download(self): - headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' - 'AppleWebKit/537.36 (KHTML, like Gecko) ' - 'Chrome/59.0.3071.115 Safari/537.36'} - req = requests.get(self.url, headers=headers) - return req.text - - # unused, but may be useful in the future - # def AddExtractorList(self, extractor): - # self.extractorlist.append(extractor) - - def _extract(self): - self.ExtractStrategy = self.Mapping[self.Source] - text = self.download() - return self.ExtractStrategy.extractor(text) - - def beautify(self): - title, output = self._extract() - print("=" * (len(title) + 15)) - print("\t" + title) - print("=" * (len(title) + 15)) - - print((self.textWrap.fill(output))) # wrap of the line - print("*" * 80) - if len(output) == 0: - print("Sorry :(") - print("There isn't much text on the site besides video/image. To " - "further view the media post, Go to the below link") - print(self.url) - print('*' * 80) - print("\n\n") - - def save(self): - title, output = self._extract() - - # Remove Chars not allowed in filenames - for char in ['<', '>', "/", ":", '"', "\\", "|", "?", "*"]: - if char in title: - title = title.replace(char, "") - - with open(f'saved_articles/{title}.txt', "w+") as f: - f.write(output) +import requests +import textwrap + +from .reader_plugins.plugin_registration import sites + + +class ExtractMainContent: + def __init__(self, source, articleurl): + self.source = source + self.url = articleurl + self.textWrap = textwrap.TextWrapper( + initial_indent='\t', subsequent_indent='\t', width=100) + + def download(self): + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/59.0.3071.115 Safari/537.36'} + req = requests.get(self.url, headers=headers) + return req.text + + def _extract(self): + text = self.download() + return sites[self.source]().extractor(text) + + def beautify(self): + title, output = self._extract() + print("=" * (len(title) + 15)) + print("\t" + title) + print("=" * (len(title) + 15)) + + print((self.textWrap.fill(output))) # wrap of the line + print("*" * 80) + if len(output) == 0: + print("Sorry :(") + print("There isn't much text on the site besides video/image. To " + "further view the media post, Go to the below link") + print(self.url) + print('*' * 80) + print("\n\n") + + def save(self): + title, output = self._extract() + + # Remove Chars not allowed in filenames + for char in ['<', '>', "/", ":", '"', "\\", "|", "?", "*"]: + if char in title: + title = title.replace(char, "") + + with open(f'saved_articles/{title}.txt', "w+") as f: + f.write(output) diff --git a/news.py b/news/news.py old mode 100644 new mode 100755 similarity index 87% rename from news.py rename to news/news.py index 64bf84f..305cba0 --- a/news.py +++ b/news/news.py @@ -1,111 +1,115 @@ -import sys -from enum import Enum - -from news_pulling import NewsPulling -from config_reader import ConfigurationReader -from extract_main_content import ExtractMainContent - - -class SelectionStatus(Enum): - BACK = 1 - EXIT = 2 - READ = 3 - - -def news_sources(): - news_sources = ConfigurationReader().websites_supported - return news_sources - - -def display_sources(sources): - for index, source in enumerate(sources): - print(f'[{index + 1}]\t{source}') - print("\nPlease enter the index of the news source or type 'quit' to exit") - - -def display_title_banner(): - # Cool Title/Banner - print("=" * 40) - print("\tNews at the Command Line") - print("=" * 40) - print() - - -def prompt_for_source(sources): - while True: - display_sources(sources) - source_choice = input("News Source Number >>>> ") - # Quit - if(source_choice.lower() == "quit"): - sys.exit() - try: - source_choice = int(source_choice) - 1 - if(source_choice >= len(sources) or source_choice < 0): - print("Please select an index between 1-" + - str(len(sources))) - else: - return source_choice - except ValueError: - print("That is not a valid News Source Number") - - -def prompt_for_article(max=0): - print("Do you want to read a story further? If yes, please select the" - "number corresponding to the article") - print("Enter 'back' to go back to the main menu") - print("Press 'quit' to quit") - while True: - article_selection = input("Article No >>>> ") - - # Back - if(article_selection.lower()[0] == 'b'): - return SelectionStatus.BACK, None - # Exit - elif(article_selection.lower()[0] == 'q'): - return SelectionStatus.EXIT, None - - article_selection = int(article_selection) - if 0 > article_selection - 1 or article_selection > max: - print(f'Please select an index between 1-{max}.') - else: - return SelectionStatus.READ, article_selection - 1 - - -def prompt_for_save(): - while True: - print("Do you want to save this article in file") - selection = str(input("Want to save? y/n >>> ")) - if selection[0].lower() == 'y': - return True - elif selection[0].lower() == 'n': - return False - - -def main(): - display_title_banner() - - while True: - sources = news_sources() - source_choice = prompt_for_source(sources) - - while True: - puller = NewsPulling(sources[source_choice]) - articles = puller.beautify_articles() - status, article_selection = prompt_for_article(max=len(articles)) - if status == SelectionStatus.EXIT: - sys.exit() - elif status == SelectionStatus.BACK: - break - else: - print("\n" * 5) - extr = ExtractMainContent( - sources[source_choice], articles[article_selection][2]) - extr.beautify() - - if prompt_for_save(): - extr.save() - print("File saved!\n") - - -if __name__ == "__main__": - main() +#!/usr/bin/env python3 + +import sys +from enum import Enum + +from .news_pulling import NewsPulling +from .extract_main_content import ExtractMainContent +from .reader_plugins.plugin_registration import sites + + +class SelectionStatus(Enum): + BACK = 1 + EXIT = 2 + READ = 3 + + +def news_sources(): + news_sources = tuple(sites.keys()) + return news_sources + + +def display_sources(sources): + for index, source in enumerate(sources): + print(f'[{index + 1}]\t{source}') + print("\nPlease enter the index of the news source or type 'quit' to exit") + + +def display_title_banner(): + # Cool Title/Banner + print("=" * 40) + print("\tNews at the Command Line") + print("=" * 40) + print() + + +def prompt_for_source(sources): + while True: + display_sources(sources) + source_choice = input("News Source Number >>>> ") + # Quit + if(source_choice.lower() == "quit"): + sys.exit() + try: + source_choice = int(source_choice) - 1 + if(source_choice >= len(sources) or source_choice < 0): + print("Please select an index between 1-" + + str(len(sources))) + else: + return source_choice + except ValueError: + print("That is not a valid News Source Number") + + +def prompt_for_article(max=0): + print("Do you want to read a story further? If yes, please select the" + "number corresponding to the article") + print("Enter 'back' to go back to the main menu") + print("Press 'quit' to quit") + while True: + article_selection = input("Article No >>>> ") + + # Back + if(article_selection.lower()[0] == 'b'): + return SelectionStatus.BACK, None + # Exit + elif(article_selection.lower()[0] == 'q'): + return SelectionStatus.EXIT, None + + article_selection = int(article_selection) + if 0 > article_selection - 1 or article_selection > max: + print(f'Please select an index between 1-{max}.') + else: + return SelectionStatus.READ, article_selection - 1 + + +def prompt_for_save(): + while True: + print("Do you want to save this article in file") + selection = str(input("Want to save? y/n >>> ")) + if selection[0].lower() == 'y': + return True + elif selection[0].lower() == 'n': + return False + + +def main(): + display_title_banner() + + while True: + sources = news_sources() + source_choice = prompt_for_source(sources) + + while True: + # TODO: This is ugly, but functional. + # Getting the name of thesource as used in the API from the plugin. + puller = NewsPulling(sites[sources[source_choice]]().source_name) + articles = puller.beautify_articles() + status, article_selection = prompt_for_article(max=len(articles)) + if status == SelectionStatus.EXIT: + sys.exit() + elif status == SelectionStatus.BACK: + break + else: + print("\n" * 5) + extr = ExtractMainContent( + sources[source_choice], articles[article_selection][2]) + extr.beautify() + + if prompt_for_save(): + extr.save() + print("File saved!\n") + + +if __name__ == "__main__": + main() diff --git a/news_pulling.py b/news/news_pulling.py similarity index 76% rename from news_pulling.py rename to news/news_pulling.py index 65c8501..958582d 100644 --- a/news_pulling.py +++ b/news/news_pulling.py @@ -1,92 +1,93 @@ -import sys - -import requests -from requests import ConnectionError - -from config_reader import ConfigurationReader - - -class NewsPulling: - """This class is used to pull news from the internet depending on the source specified """ - - def __init__(self, newsSource): - self.Source = newsSource - - def pull_news(self): - config = ConfigurationReader() - self.__APIKey = config.APIKEY - self.__Limit = config.limit - url = 'https://newsapi.org/v1/articles?source=' + \ - self.Source + '&sortBy=top&apiKey=' + self.__APIKey - try: - req = requests.get(url) - if(req.status_code == 200): - return req - else: - print( - "There is some issue in connecting to the internet. Please check your firewall or internet") - except ConnectionError as e: - print("A connection Attempt failed") - print(e.message) - sys.exit() - - def json_read(self): - req = self.pull_news() - # indicate if we need to convert to utf-8 - needsconversion = False - if req.encoding != 'utf-8': - needsconversion = True - req = req.json() - articles = req['articles'] - noofarticles = len(articles) - maxarticles = min(noofarticles, self.__Limit) - - FilteredArticles = [] - - for i in range(maxarticles): - article = articles[i] - if needsconversion: - description = str(article['description'], 'utf-8') - # print description - title = str(article['title'], 'utf-8') - Article_url = str(article['url'], 'utf-8') - DateofPublication = str(article['publishedAt'], 'utf-8') - Author = str(article['author'], 'utf-8') - FilteredArticles.append( - [description, title, Article_url, DateofPublication, Author]) - else: - description = article['description'] - # print description - title = article['title'] - Article_url = article['url'] - DateofPublication = article['publishedAt'] - Author = article['author'] - FilteredArticles.append( - [description, title, Article_url, DateofPublication, Author]) - return FilteredArticles - - def beautify_articles(self): - self.Articles = self.json_read() - if self.Articles is None or len(self.Articles) == 0: - print("No articles found") - sys.exit() - print("\n" + ("=" * 16) + " STORIES " + ("=" * 16)) - for i in range(len(self.Articles)): - print("[" + str(i + 1) + "]", end=' ') - # Title - if self.Articles[i][1] is not None: - print("\t" + self.Articles[i][1]) - # Summary - if self.Articles[i][0] is not None: - # Limit Summary Size - summary = self.Articles[i][0][:85] + \ - (self.Articles[i][0][85:] and '...') - print("\t" + summary) - # Author - if self.Articles[i][4] is not None: - print("\t" + self.Articles[i][4]) - # Date - if self.Articles[i][3] is not None: - print("\t" + self.Articles[i][3] + "\n") - print("=" * 40) - return self.Articles +import sys + +import requests +from requests import ConnectionError + +from .config_reader import ConfigurationReader + + +class NewsPulling: + """Pull news from the internet depending on the source specified.""" + + def __init__(self, source): + self.source = source + + def pull_news(self): + config = ConfigurationReader() + self.__Limit = config.limit + url = 'https://newsapi.org/v1/articles?source=' + \ + self.source + '&sortBy=top&apiKey=' + config.APIKEY + try: + req = requests.get(url) + print(req) + if req.status_code == 200: + return req + else: + print("There is some issue in connecting to the internet." + "Please check your firewall or internet") + except ConnectionError as e: + print("A connection attempt failed") + print(e.message) + sys.exit() + + def json_read(self): + req = self.pull_news() + # indicate if we need to convert to utf-8 + needsconversion = False + if req.encoding != 'utf-8': + needsconversion = True + req = req.json() + articles = req['articles'] + noofarticles = len(articles) + maxarticles = min(noofarticles, self.__Limit) + + FilteredArticles = [] + + for i in range(maxarticles): + article = articles[i] + if needsconversion: + description = str(article['description'], 'utf-8') + # print description + title = str(article['title'], 'utf-8') + Article_url = str(article['url'], 'utf-8') + DateofPublication = str(article['publishedAt'], 'utf-8') + Author = str(article['author'], 'utf-8') + FilteredArticles.append([description, title, Article_url, + DateofPublication, Author]) + else: + description = article['description'] + # print description + title = article['title'] + Article_url = article['url'] + DateofPublication = article['publishedAt'] + Author = article['author'] + FilteredArticles.append( + [description, title, Article_url, + DateofPublication, Author]) + return FilteredArticles + + def beautify_articles(self): + self.Articles = self.json_read() + if self.Articles is None or len(self.Articles) == 0: + print("No articles found") + sys.exit() + print("\n" + ("=" * 16) + " STORIES " + ("=" * 16)) + for i in range(len(self.Articles)): + print("[" + str(i + 1) + "]", end=' ') + # Title + if self.Articles[i][1] is not None: + print("\t" + self.Articles[i][1]) + # Summary + if self.Articles[i][0] is not None: + # Limit Summary Size + summary = self.Articles[i][0][:85] + \ + (self.Articles[i][0][85:] and '...') + print("\t" + summary) + # Author + if self.Articles[i][4] is not None: + print("\t" + self.Articles[i][4]) + # Date + if self.Articles[i][3] is not None: + print("\t" + self.Articles[i][3] + "\n") + print("=" * 40) + return self.Articles diff --git a/news/reader.py b/news/reader.py new file mode 100644 index 0000000..f2fcc0a --- /dev/null +++ b/news/reader.py @@ -0,0 +1,19 @@ +from bs4 import BeautifulSoup + + +class Reader: + def _extraction_algo(self, text, htmlelement, classname): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + result = [] + # print soup + maincontent = soup.find_all(htmlelement, class_=classname) + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + result.append(content.text) + result = ''.join(result) + return (title, result) diff --git a/news/reader_plugins/bbc.py b/news/reader_plugins/bbc.py new file mode 100644 index 0000000..bafb167 --- /dev/null +++ b/news/reader_plugins/bbc.py @@ -0,0 +1,9 @@ +from news.reader import Reader + + +class BBC(Reader): + """class for BBC News parsing""" + source_name = 'bbc-news' + + def extractor(self, text): + return self._extraction_algo(text, "div", "story-body__inner") diff --git a/news/reader_plugins/bloomberg.py b/news/reader_plugins/bloomberg.py new file mode 100644 index 0000000..893573b --- /dev/null +++ b/news/reader_plugins/bloomberg.py @@ -0,0 +1,9 @@ +from news.reader import Reader + + +class Bloomberg(Reader): + """class for BloomBerg parsing""" + source_name = 'bloomberg' + + def extractor(self, text): + return self._extraction_algo(text, "div", "body-copy") diff --git a/news/reader_plugins/guardian.py b/news/reader_plugins/guardian.py new file mode 100644 index 0000000..82f7a1b --- /dev/null +++ b/news/reader_plugins/guardian.py @@ -0,0 +1,28 @@ +from bs4 import BeautifulSoup + +from news.reader import Reader + + +class Guardian(Reader): + """class for Guardian parsing""" + source_name = 'the-guardian-uk' + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all( + "div", + class_="content__article-body from-content-api js-article__body" + ) + # print maincontent + for content in maincontent: + scripttags = content.find_all(["script", "br", "figure", "image"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) diff --git a/news/reader_plugins/hindu.py b/news/reader_plugins/hindu.py new file mode 100644 index 0000000..e9c3bed --- /dev/null +++ b/news/reader_plugins/hindu.py @@ -0,0 +1,26 @@ +from bs4 import BeautifulSoup + +from news.reader import Reader + + +class TheHindu(Reader): + """class for The Hindu parsing""" + source_name = 'the-hindu' + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="article") + # print maincontent + for content in maincontent: + scripttags = content.find_all( + ["script", "br", "figure", "image", "span"]) + for scripttag in scripttags: + scripttag.extract() + # print content.text + for foundcontent in content.find_all("p"): + Result.append(foundcontent.text) + Result = ''.join(Result) + return (title, Result) diff --git a/news/reader_plugins/huffington_post.py b/news/reader_plugins/huffington_post.py new file mode 100644 index 0000000..6ba7403 --- /dev/null +++ b/news/reader_plugins/huffington_post.py @@ -0,0 +1,10 @@ +from news.reader import Reader + + +class HuffingtonPost(Reader): + """class for Huffington Post parsing""" + source_name = 'the-huffington-post' + + def extractor(self, text): + return self._extraction_algo(text, "div", + "content-list-component text") diff --git a/news/reader_plugins/new_york_times.py b/news/reader_plugins/new_york_times.py new file mode 100644 index 0000000..641c766 --- /dev/null +++ b/news/reader_plugins/new_york_times.py @@ -0,0 +1,10 @@ +from news.reader import Reader + + +class NYT(Reader): + source_name = 'the-new-york-times' + + """class for New York Times parsing""" + def extractor(self, text): + return self._extraction_algo(text, "p", + "story-body-text story-content") diff --git a/news/reader_plugins/plugin_registration.py b/news/reader_plugins/plugin_registration.py new file mode 100644 index 0000000..d987cf2 --- /dev/null +++ b/news/reader_plugins/plugin_registration.py @@ -0,0 +1,17 @@ +from news.reader_plugins.huffington_post import HuffingtonPost +from news.reader_plugins.new_york_times import NYT +from news.reader_plugins.bbc import BBC +from news.reader_plugins.bloomberg import Bloomberg +from news.reader_plugins.guardian import Guardian +from news.reader_plugins.hindu import TheHindu +from news.reader_plugins.times_of_india import TimesOfIndia + +sites = { + 'Huffington Post': HuffingtonPost, + 'New York Times': NYT, + 'BBC': BBC, + 'Bloomberg': Bloomberg, + 'Guardian': Guardian, + 'The Hindu': TheHindu, + 'Times of India': TimesOfIndia +} diff --git a/news/reader_plugins/times_of_india.py b/news/reader_plugins/times_of_india.py new file mode 100644 index 0000000..37b11b9 --- /dev/null +++ b/news/reader_plugins/times_of_india.py @@ -0,0 +1,21 @@ +from bs4 import BeautifulSoup + +from news.reader import Reader + + +class TimesOfIndia(Reader): + """class for Times of India parsing""" + source_name = 'the-times-of-india' + + def extractor(self, text): + soup = BeautifulSoup(text, 'html.parser') + title = soup.title.string + Result = [] + # print soup + maincontent = soup.find_all("div", class_="Normal") + # print maincontent + for content in maincontent: + # print content.text + Result.append(content.text) + Result = ''.join(Result) + return (title, Result) diff --git a/requirements.txt b/requirements.txt index 345cd29..4d4e19c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,5 @@ idna==2.6 PyYAML==3.12 requests==2.18.4 urllib3==1.22 +wheel +appdirs==1.4.3 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..83b1adf --- /dev/null +++ b/setup.py @@ -0,0 +1,36 @@ +from setuptools import setup, find_packages +from os import path + +from news.__version__ import __version__ +here = path.abspath(path.dirname(__file__)) + +with open('README.md') as f: + long_description = f.read() + +setup( + name="News At The Command Line", + version=__version__, + description="Read your news on your favourite terminal", + author="Ankit Singh", + packages=['news'], + package_dir={'news': 'news'}, + long_description=long_description, + + install_requires=[ + 'bs4>=0.0.1', + 'beautifulsoup4>=4.6.0', + 'PyYAML>=3.12', + 'requests>=2.18.4', + ], + + license='MIT', + entry_points={ + 'console_scripts': [ + 'newsctl=news.news:main' + ] + }, + classifiers=[ + 'Environment :: Console', + 'Intended Audience :: End Users/Desktop', + ] +)