From f2e7cbf275e32fad426b4d3096cb6c5af768b4e7 Mon Sep 17 00:00:00 2001 From: KareemAbuzaid <36644028+KareemAbuzaid@users.noreply.github.com> Date: Tue, 12 Mar 2019 18:02:28 +0200 Subject: [PATCH 1/4] Add proxy settings handling --- News/NewsPulling.py | 12 +++++++++++- News/configReader.py | 14 ++++++++++++++ README.md | 2 ++ config.yml | 3 +++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/News/NewsPulling.py b/News/NewsPulling.py index a16aa5d..2ce43a1 100644 --- a/News/NewsPulling.py +++ b/News/NewsPulling.py @@ -20,9 +20,19 @@ def PullNews(self): Configuration = ConfigurationReader() self.__APIKey=Configuration.GetAPIKEY() self.__Limit=Configuration.GetLimit() + self.__ProxyIP=Configuration.GetProxyIP() + self.__ProxyPortNumber=Configuration.GetProxyPortNumber() url='https://newsapi.org/v2/top-headlines?sources='+self.Source+'&sortBy=top&apiKey='+self.__APIKey + proxies = {} + if self.__ProxyIP and self.__ProxyPortNumber: + proxies = { + 'http': "http://{}:{}".format(self.__ProxyIP, + self.__ProxyPortNumber), + 'https': "https://{}:{}".format(self.__ProxyIP, + self.__ProxyPortNumber), + } try: - req=requests.get(url) + req=requests.get(url, proxies=proxies) if(req.status_code==200): return req else: diff --git a/News/configReader.py b/News/configReader.py index a8b90c4..172367d 100644 --- a/News/configReader.py +++ b/News/configReader.py @@ -12,6 +12,8 @@ class ConfigurationReader(): __APIKEY=None __WebsiteSupported=[] __Limit=None + __ProxyIP=None + __ProxyPortNumber=None def __init__(self): with open("config.yml", 'r') as ymlfile: cfg = yaml.load(ymlfile) @@ -21,6 +23,10 @@ def __init__(self): #print ConfigurationReader.__Limit ConfigurationReader.__WebsiteSupported=cfg['WebsiteSupported'] #print ConfigurationReader.__WebsiteSupported + ConfigurationReader.__ProxyIP = cfg['ProxyIP'] + # print ConfigurationReader.__ProxyIP + ConfigurationReader.__ProxyPortNumber = cfg['ProxyPortNumber'] + # print ConfigurationReader.__ProxyPortNumber 
@staticmethod def GetAPIKEY(): return ConfigurationReader.__APIKEY @@ -32,5 +38,13 @@ def GetLimit(): @staticmethod def GetWebsiteSupported(): return ConfigurationReader.__WebsiteSupported + + @staticmethod + def GetProxyIP(): + return ConfigurationReader.__ProxyIP + + @staticmethod + def GetProxyPortNumber(): + return ConfigurationReader.__ProxyPortNumber \ No newline at end of file diff --git a/README.md b/README.md index caf6f22..8c825ed 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ To install the module dependencies before running the application, simply naviga ### To install python setup.py install newsctl + ### If you are accessing the web through a proxy service: + Please add the proxy IP and port number to the config.yml file # Contributing We welcome your contributions. Please feel free to fork the code, play with it, make some patches and send us pull requests. diff --git a/config.yml b/config.yml index f124022..cead5fc 100644 --- a/config.yml +++ b/config.yml @@ -31,4 +31,7 @@ Limit: 10 Apikey: bda5818cc2af461e98330ccdf6fb9cbe +ProxyIP: +ProxyPortNumber: + \ No newline at end of file From c12070248603a47a4cddd25a53e671976224bcaf Mon Sep 17 00:00:00 2001 From: KareemAbuzaid <36644028+KareemAbuzaid@users.noreply.github.com> Date: Tue, 12 Mar 2019 18:38:50 +0200 Subject: [PATCH 2/4] Add proxy handling for downloading the articles as well --- News/ExtractMainContent.py | 14 +++++++++++++- News/NewsPulling.py | 2 +- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/News/ExtractMainContent.py b/News/ExtractMainContent.py index 2677a1e..3ffb814 100644 --- a/News/ExtractMainContent.py +++ b/News/ExtractMainContent.py @@ -11,6 +11,7 @@ class ExtractMainContent(object): def __init__(self,source,articleurl): + #import pdb; pdb.set_trace(); self.extractorlist=[HuffingtonPost(),NYT(),BBC(),BloomBerg(),Guardian(),TheHindu(),TimesOfIndia()] websites=ConfigurationReader().GetWebsiteSupported() self.Mapping={} @@ -23,7 +24,18 @@ def 
__init__(self,source,articleurl): def DownloadContent(self): headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'} - req=requests.get(self.url,headers=headers) + Configuration = ConfigurationReader() + self.__ProxyIP=Configuration.GetProxyIP() + self.__ProxyPortNumber=Configuration.GetProxyPortNumber() + proxies = {} + if self.__ProxyIP and self.__ProxyPortNumber: + proxies = { + 'http': "http://{}:{}".format(self.__ProxyIP, + self.__ProxyPortNumber), + 'https': "https://{}:{}".format(self.__ProxyIP, + self.__ProxyPortNumber), + } + req=requests.get(self.url,headers=headers,proxies=proxies) return req.text def AddExtractorList(self,extractor): diff --git a/News/NewsPulling.py b/News/NewsPulling.py index 2ce43a1..0472d8f 100644 --- a/News/NewsPulling.py +++ b/News/NewsPulling.py @@ -32,7 +32,7 @@ def PullNews(self): self.__ProxyPortNumber), } try: - req=requests.get(url, proxies=proxies) + req=requests.get(url,proxies=proxies) if(req.status_code==200): return req else: From 7f2a9041df132010cbe70f0632c7f2c489a2a9c7 Mon Sep 17 00:00:00 2001 From: KareemAbuzaid <36644028+KareemAbuzaid@users.noreply.github.com> Date: Tue, 12 Mar 2019 18:45:27 +0200 Subject: [PATCH 3/4] Use string formatting rather than concatenation --- News/NewsPulling.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/News/NewsPulling.py b/News/NewsPulling.py index 0472d8f..b1d9ae1 100644 --- a/News/NewsPulling.py +++ b/News/NewsPulling.py @@ -22,7 +22,8 @@ def PullNews(self): self.__Limit=Configuration.GetLimit() self.__ProxyIP=Configuration.GetProxyIP() self.__ProxyPortNumber=Configuration.GetProxyPortNumber() - url='https://newsapi.org/v2/top-headlines?sources='+self.Source+'&sortBy=top&apiKey='+self.__APIKey + import pdb; pdb.set_trace(); + url = 'https://newsapi.org/v2/top-headlines?sources={}&sortBy=top&apiKey={}'.format(self.Source, self.__APIKey) proxies = {} if self.__ProxyIP and
self.__ProxyPortNumber: proxies = { From 6176ff71a0e8f94ca6cfb4cb2eb1acb1539b16d2 Mon Sep 17 00:00:00 2001 From: KareemAbuzaid <36644028+KareemAbuzaid@users.noreply.github.com> Date: Tue, 12 Mar 2019 19:15:08 +0200 Subject: [PATCH 4/4] Revert "Use string formatting rather than concatenation" This reverts commit 7f2a9041df132010cbe70f0632c7f2c489a2a9c7. --- News/NewsPulling.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/News/NewsPulling.py b/News/NewsPulling.py index b1d9ae1..0472d8f 100644 --- a/News/NewsPulling.py +++ b/News/NewsPulling.py @@ -22,8 +22,7 @@ def PullNews(self): self.__Limit=Configuration.GetLimit() self.__ProxyIP=Configuration.GetProxyIP() self.__ProxyPortNumber=Configuration.GetProxyPortNumber() - import pdb; pdb.set_trace(); - url = 'https://newsapi.org/v2/top-headlines?sources={}&sortBy=top&apiKey={}'.format(self.Source, self.__APIKey) + url='https://newsapi.org/v2/top-headlines?sources='+self.Source+'&sortBy=top&apiKey='+self.__APIKey proxies = {} if self.__ProxyIP and self.__ProxyPortNumber: proxies = {