From 6aeb926a4baee80a2179d3e8b72925d9c42c3a94 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Wed, 3 Aug 2016 16:31:53 -0400 Subject: [PATCH 1/7] took out spot_check --- verifier.py | 202 +--------------------------------------------------- 1 file changed, 1 insertion(+), 201 deletions(-) diff --git a/verifier.py b/verifier.py index 8a7f350..00a6d0f 100644 --- a/verifier.py +++ b/verifier.py @@ -62,276 +62,76 @@ def size_comparison(self): self.pages.remove(page) return - # Check that specified elements are supposed to exist and a loading bar isn't present instead - # Check that specified elements or their alternates are present and non-empty in each page - # Alternate: different elements appear if there isn't supposed to be content, so it has to check both - # Format: Filled-in : Alternate - def spot_check(self): - for page in self.pages[:]: - soup = page.get_content() - # Existential crisis: - for element in self.loading_elements: - final_element = self.loading_elements[element] # What is supposed to be there - loading_bar_result = soup.select(element) # Is a loading bar present? - if len(loading_bar_result) > 0: # A loading bar exists (so content does not exist completely) - print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element, " present.") - self.failed_pages.append(page.url) - self.pages.remove(page) - break - else: - # Alternate checker: - for element in self.alternate_elements: - alt = self.alternate_elements[element] - result = soup.select(element) - # No results or empty results, with alternate - if (len(result) == 0 or len(result[0].contents) == 0) and alt != '': - alt_result = soup.select(alt) - - # Element's alternate has no or empty results - if len(alt_result) == 0 or len(alt_result[0].contents) == 0: - print("Failed: alternate spot_check(): ", page, alt, '\n') - self.failed_pages.append(page.url) - self.pages.remove(page) - break - - # Element has no alternate and no results or empty results - elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '': - print('Failed: spot_check(): ', page, element, "No alt.", '\n') - self.failed_pages.append(page.url) - self.pages.remove(page) - break - return - def run_verifier(self, json_filename, json_list): self.harvest_pages(json_filename, json_list) self.size_comparison() - # self.spot_check() - # Verifier subclasses + class ProjectDashboardVerifier(Verifier): def __init__(self): super().__init__(410, ProjectDashboardPage, '') - self.loading_elements = { - "#treeGrid > div > p": '#tb-tbody', # Files list - "#containment": "#render-node", # Exists if there are supposed to be components / Is it filled? - } - self.alternate_elements = { - '#nodeTitleEditable': '', # Title - '#contributors span.date.node-last-modified-date': '', # Last modified - '#contributorsList > ol': '', # Contributor list - '#tb-tbody': '', # File list - '#logScope > div > div > div.panel-body > span > dl': '#logFeed > div > p' - # Activity / "Unable to retrieve at this time" - } - - # Override: the loader for loading_elements is still supposed to exist - # Check that specified elements are supposed to exist and a loading bar isn't present instead - # Check that specified elements or their alternates are present and non-empty in each page - # Alternate: different elements appear if there isn't supposed to be content, so it has to check both - # Format: Filled-in : Alternate - def spot_check(self): - for page in self.pages[:]: - soup = page.get_content() - # Existential crisis: - for element in self.loading_elements: - final_element = self.loading_elements[element] # What is supposed to be there - loading_bar_result = soup.select(element) - if len(loading_bar_result) > 0: # Container div is present - final_result = soup.select(final_element) - if len(final_result) == 0: # Final element isn't in place - print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element, - " present.") - self.failed_pages.append(page.url) - self.pages.remove(page) - break - else: - # Alternate checker: - for element in self.alternate_elements: - alt = self.alternate_elements[element] - result = soup.select(element) - # No results or empty results, with alternate - if (len(result) == 0 or len(result[0].contents) == 0) and alt != '': - alt_result = soup.select(alt) - - # Element's alternate has no or empty results - if len(alt_result) == 0 or len(alt_result[0].contents) == 0: - print("Failed: alternate spot_check(): ", page, alt, '\n') - self.failed_pages.append(page.url) - self.pages.remove(page) - break - - # Element has no alternate and no results or empty results - elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '': - print('Failed: spot_check(): ', page, element, "No alt.", '\n') - self.failed_pages.append(page.url) - self.pages.remove(page) - break - return class ProjectFilesVerifier(Verifier): def __init__(self): super().__init__(380, ProjectFilesPage, "files/") - self.alternate_elements = { - '.fg-file-links': '', # Links to files (names them) - } class ProjectWikiVerifier(Verifier): def __init__(self): super().__init__(410, ProjectWikiPage, "wiki/") - self.alternate_elements = { - '#wikiViewRender': '#wikiViewRender > p > em', # Wiki content / `No wiki content` - '#viewVersionSelect option': '', # Current version date modified - '.fg-file-links': '' # Links to other pages (names them) - } class ProjectAnalyticsVerifier(Verifier): def __init__(self): super().__init__(380, ProjectAnalyticsPage, "analytics/") - self.alternate_elements = { - '#adBlock': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center', - # Warning about AdBlock - 'iframe': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center', - # External frame for analytics - } class ProjectRegistrationsVerifier(Verifier): def __init__(self): super().__init__(380, ProjectRegistrationsPage, "registrations/") - self.alternate_elements = { - '#renderNode': '#registrations > div > div > p' # List of nodes - } class ProjectForksVerifier(Verifier): def __init__(self): super().__init__(380, ProjectForksPage, "forks/") - self.alternate_elements = { - '#renderNode': 'div.watermarked > div > div.row > div.col-xs-9.col-sm-8 > p' # List - } class RegistrationDashboardVerifier(Verifier): def __init__(self): super().__init__(410, RegistrationDashboardPage, "") - self.loading_elements = { - "#treeGrid > div > p": '#tb-tbody', # Files list - "#containment": "#render-node", # Exists if there are supposed to be components / Is it filled? - } - self.alternate_elements = { - '#nodeTitleEditable': '', # Title - '#contributors > div > p:nth-of-type(5) > span': '', # Last modified - '#contributorsList > ol': '', # Contributor list - '#logScope > div > div > div.panel-body > span > dl': '#logFeed > div > p' - # Activity / "Unable to retrieve at this time" - } - - # Override: the loader for loading_elements is still supposed to exist - # Check that specified elements are supposed to exist and a loading bar isn't present instead - # Check that specified elements or their alternates are present and non-empty in each page - # Alternate: different elements appear if there isn't supposed to be content, so it has to check both - # Format: Filled-in : Alternate - def spot_check(self): - for page in self.pages[:]: - soup = page.get_content() - # Existential crisis: - for element in self.loading_elements: - final_element = self.loading_elements[element] # What is supposed to be there - loading_bar_result = soup.select(element) - if len(loading_bar_result) > 0: # Container div is present - final_result = soup.select(final_element) - if len(final_result) == 0: # Final element isn't in place - print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element, - " present.") - self.failed_pages.append(page.url) - self.pages.remove(page) - break - else: - # Alternate checker: - for element in self.alternate_elements: - alt = self.alternate_elements[element] - result = soup.select(element) - # No results or empty results, with alternate - if (len(result) == 0 or len(result[0].contents) == 0) and alt != '': - alt_result = soup.select(alt) - - # Element's alternate has no or empty results - if len(alt_result) == 0 or len(alt_result[0].contents) == 0: - print("Failed: alternate spot_check(): ", page, alt, '\n') - self.failed_pages.append(page.url) - self.pages.remove(page) - break - - # Element has no alternate and no results or empty results - elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '': - print('Failed: spot_check(): ', page, element, "No alt.", '\n') - self.failed_pages.append(page.url) - self.pages.remove(page) - break - return class RegistrationFilesVerifier(Verifier): def __init__(self): super().__init__(380, RegistrationFilesPage, "files/") - self.alternate_elements = { - '.fg-file-links': '', # Links to files (names them) - } class RegistrationWikiVerifier(Verifier): def __init__(self): super().__init__(410, RegistrationWikiPage, "wiki/") - self.alternate_elements = { - '#wikiViewRender': '#wikiViewRender > p > em', # Wiki content / `No wiki content` - '#viewVersionSelect option': '', # Current version date modified - '.fg-file-links': '' # Links to other pages (names them) - } class RegistrationAnalyticsVerifier(Verifier): def __init__(self): super().__init__(380, RegistrationAnalyticsPage, "analytics/") - self.alternate_elements = { - '#adBlock': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center', - # Warning about AdBlock - 'iframe': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center', - # External frame for analytics - } class RegistrationForksVerifier(Verifier): def __init__(self): super().__init__(380, RegistrationForksPage, "forks/") - self.alternate_elements = { - '#renderNode': 'div.watermarked > div > div.row > div.col-xs-9.col-sm-8 > p' # List - } class UserProfileVerifier(Verifier): def __init__(self): super().__init__(80, UserProfilePage, "") - self.alternate_elements = { - '#projects': 'div > div:nth-of-type(1) > div > div.panel-body > div', # Project list / "No projects" - '#components': 'div > div:nth-of-type(2) > div > div.panel-body > div', # Component list / "No components" - 'body h2': '' # Activity points, project count - } class InstitutionDashboardVerifier(Verifier): def __init__(self): super().__init__(350, InstitutionDashboardPage, "") - self.loading_elements = { - '#fileBrowser > div.db-main > div.line-loader > div.load-message': '.fg-file-links' # "loading" / Project browser - } - self.alternate_elements = { - '#fileBrowser > div.db-infobar > div > div': '#fileBrowser > div.db-infobar > div > div' # Project preview / "Select a project" - } # Called when json file had scrape_nodes = true From a878aad9ec58ad3659ddf029704bae275381e593 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Thu, 4 Aug 2016 09:32:55 -0400 Subject: [PATCH 2/7] no more page modularization? bad idea? good idea? --- verifier.py | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/verifier.py b/verifier.py index 00a6d0f..7d8b635 100644 --- a/verifier.py +++ b/verifier.py @@ -4,7 +4,6 @@ ProjectForksPage, ProjectRegistrationsPage, ProjectWikiPage, RegistrationDashboardPage, RegistrationFilesPage, \ RegistrationAnalyticsPage, RegistrationForksPage, RegistrationWikiPage, UserProfilePage, InstitutionDashboardPage from crawler import Crawler -import bs4 # Verifier superclass @@ -19,14 +18,6 @@ def __init__(self, min_size, pg_type, end): self.page_type = pg_type self.url_end = end - # Certain elements will be absent if there's no content for them to display, so we check if there is a loading - # bar in its place. This means the element should exist, but it doesn't. - self.loading_elements = {} - - # Other elements will be replaced by a message if there's no content for them (e.g. "This user has no projects") - # We check for the elements and their alternates if the original isn't found. - self.alternate_elements = {} - self.pages = [] # All the page objects self.failed_pages = [] @@ -37,19 +28,20 @@ def harvest_pages(self, json_dictionary, json_list): :param json_list: The list in the json file of found URLs :return: Null, but self.pages is populated. """ - for url in json_list[:]: - if self.url_end in url: - print('rel: ', url) - if url in json_dictionary['error_list']: - self.failed_pages.append(url) - print('error: ', url) - else: - try: - obj = self.page_type(url) - self.pages.append(obj) - except FileNotFoundError: + if json_dictionary['error_list'] is not None: + for url in json_list[:]: + if self.url_end in url: + print('rel: ', url) + if url in json_dictionary['error_list']: self.failed_pages.append(url) - json_list.remove(url) + print('error: ', url) + else: + try: + obj = self.page_type(url) + self.pages.append(obj) + except FileNotFoundError: + self.failed_pages.append(url) + json_list.remove(url) # Compare page size to page-specific minimum that any fully-scraped page should have def size_comparison(self): From e5c0089482021633273aee4644ce72701147a5c9 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Wed, 10 Aug 2016 14:07:38 -0400 Subject: [PATCH 3/7] made big big changes up in here --- crawler.py | 17 ++-- pages.py | 99 -------------------- settings.py | 2 +- verifier.py | 263 +++++++++++++++++++++------------------------------- 4 files changed, 118 insertions(+), 263 deletions(-) delete mode 100644 pages.py diff --git a/crawler.py b/crawler.py index 94d6dbd..0cca91b 100644 --- a/crawler.py +++ b/crawler.py @@ -523,14 +523,14 @@ def scrape_nodes(self, async=True): """ self.debug_logger.info("Scraping nodes, async = " + str(async)) if async: - self._scrape_pages(self.node_urls) + self.scrape_pages(self.node_urls) else: for elem in self.node_url_tuples: lst = [] while len(self.node_urls) > 0 and elem[0] in self.node_urls[0]: lst.append(self.node_urls.pop(0)) if len(lst) > 0: - self._scrape_pages(lst) + self.scrape_pages(lst) self.debug_logger.info("Finished scraping nodes, async = " + str(async)) def scrape_registrations(self, async=True): @@ -540,13 +540,13 @@ def scrape_registrations(self, async=True): """ self.debug_logger.info("Scraping registrations, async = " + str(async)) if async: - self._scrape_pages(self.registration_urls) + self.scrape_pages(self.registration_urls) else: for elem in self.registration_url_tuples: lst = [] while len(self.registration_urls) > 0 and elem[0] in self.registration_urls: lst.append(self.registration_urls.pop(0)) - self._scrape_pages(lst) + self.scrape_pages(lst) self.debug_logger.info("Finished scraping registrations, async = " + str(async)) def scrape_users(self): @@ -554,7 +554,7 @@ def scrape_users(self): Wrapper method that scrape all urls in self.user_urls. Calls _scrape_pages(). """ self.debug_logger.info("Scraping users") - self._scrape_pages(self.user_urls) + self.scrape_pages(self.user_urls) self.debug_logger.info("Finished scraping users") def scrape_institutions(self): @@ -562,7 +562,7 @@ def scrape_institutions(self): Wrapper method that scrape all institution_urls. Calls _scrape_pages(). """ self.debug_logger.info("Scraping institutions") - self._scrape_pages(self.institution_urls) + self.scrape_pages(self.institution_urls) self.debug_logger.info("Finished scraping institutions") def scrape_general(self): @@ -570,11 +570,12 @@ def scrape_general(self): Wrapper method that scrape all general_urls. Calls _scrape_pages(). """ self.debug_logger.info("Scraping general pages") - self._scrape_pages(self.general_urls) + self.scrape_pages(self.general_urls) self.debug_logger.info("Finished scraping general pages") + # TODO Make semaphore value a parameter - def _scrape_pages(self, aspect_list): + def scrape_pages(self, aspect_list): """ Runner method that runs scrape_url() :param aspect_list: list of url of pages to scrape diff --git a/pages.py b/pages.py deleted file mode 100644 index e5de596..0000000 --- a/pages.py +++ /dev/null @@ -1,99 +0,0 @@ -""" The page superclass and subclasses for verifier""" - -from bs4 import BeautifulSoup -from settings import base_urls -import os - -MIRROR = 'archive/' - - -# Superclass for page-specific page instances -class Page: - def __init__(self, url): - self.url = url - self.path = self.get_path_from_url(url) - # Set size attribute in KB, inherently checks if file exists - try: - self.file_size = os.path.getsize(self.path) / 1000 - except FileNotFoundError: - raise FileNotFoundError - - def __str__(self): - return self.path - - # Takes a URL and produces its relative file name. - def get_path_from_url(self, url): - # Remove http://domain - tail = url.replace(base_urls[0], '') + 'index.html' - path = MIRROR + tail - return path - - def get_content(self): - soup = BeautifulSoup(open(self.path), 'html.parser') - return soup - - -# Page-specific subclasses -class ProjectDashboardPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectFilesPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectWikiPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectAnalyticsPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectRegistrationsPage(Page): - def __init__(self, url): - super().__init__(url) - - -class ProjectForksPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationDashboardPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationFilesPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationWikiPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationAnalyticsPage(Page): - def __init__(self, url): - super().__init__(url) - - -class RegistrationForksPage(Page): - def __init__(self, url): - super().__init__(url) - - -class UserProfilePage(Page): - def __init__(self, url): - super().__init__(url) - - -class InstitutionDashboardPage(Page): - def __init__(self, url): - super().__init__(url) \ No newline at end of file diff --git a/settings.py b/settings.py index 5106b05..3d66fae 100644 --- a/settings.py +++ b/settings.py @@ -1,5 +1,5 @@ # The OSF website URL, and the API -base_urls = ['https://osf.io/', 'https://api.osf.io/v2/'] +base_urls = ['https://staging.osf.io/', 'https://staging-api.osf.io/v2/'] DEBUG_LOG_FILENAME = 'debug_log.txt' ERROR_LOG_FILENAME = 'error_log.txt' \ No newline at end of file diff --git a/verifier.py b/verifier.py index 7d8b635..a103e75 100644 --- a/verifier.py +++ b/verifier.py @@ -1,28 +1,57 @@ import json import codecs -from pages import ProjectDashboardPage, ProjectFilesPage, ProjectAnalyticsPage, \ - ProjectForksPage, ProjectRegistrationsPage, ProjectWikiPage, RegistrationDashboardPage, RegistrationFilesPage, \ - RegistrationAnalyticsPage, RegistrationForksPage, RegistrationWikiPage, UserProfilePage, InstitutionDashboardPage from crawler import Crawler +from bs4 import BeautifulSoup +from settings import base_urls +import os + +MIRROR = 'archive/' + + +# Superclass for page-specific page instances +class Page: + def __init__(self, url): + self.url = url + self.path = self.get_path_from_url(url) + # Set size attribute in KB, inherently checks if file exists + try: + self.file_size = os.path.getsize(self.path) / 1000 + except FileNotFoundError: + raise FileNotFoundError + + def __str__(self): + return self.path + + # Takes a URL and produces its relative file name. + def get_path_from_url(self, url): + # Remove http://domain + tail = url.replace(base_urls[0], '') + 'index.html' + print(tail) + path = MIRROR + tail + return path + + def get_content(self): + soup = BeautifulSoup(open(self.path), 'html.parser') + return soup # Verifier superclass class Verifier: - def __init__(self, min_size, pg_type, end): + def __init__(self): """ :param min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file. :param pg_type: The class to instantiate page objects with. :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL) """ - self.minimum_size = min_size - self.page_type = pg_type - self.url_end = end + self.minimum_size = 8 + # self.page_type = pg_type + # self.url_end = end self.pages = [] # All the page objects self.failed_pages = [] # Populate self.pages with the relevant files - def harvest_pages(self, json_dictionary, json_list): + def harvest_pages(self, json_dictionary, json_list, first_run): """ :param json_dictionary: The dictionary created from the json file :param json_list: The list in the json file of found URLs @@ -30,135 +59,70 @@ def harvest_pages(self, json_dictionary, json_list): """ if json_dictionary['error_list'] is not None: for url in json_list[:]: - if self.url_end in url: - print('rel: ', url) - if url in json_dictionary['error_list']: + # if self.url_end in url: + print('rel: ', url) + if url in json_dictionary['error_list'] and first_run: + self.failed_pages.append(url) + print('error: ', url) + else: + try: + obj = Page(url) + self.pages.append(obj) + print(obj.path) + except FileNotFoundError: + print("Failed harvest_pages ", url) self.failed_pages.append(url) - print('error: ', url) - else: - try: - obj = self.page_type(url) - self.pages.append(obj) - except FileNotFoundError: - self.failed_pages.append(url) - json_list.remove(url) + json_list.remove(url) # Compare page size to page-specific minimum that any fully-scraped page should have def size_comparison(self): for page in self.pages[:]: - # print(page) - # print(page.file_size) + print("Size comparison on ", page) + print(page.file_size) if not page.file_size > self.minimum_size: print('Failed: size_comparison(): ', page, ' has size: ', page.file_size) self.failed_pages.append(page.url) self.pages.remove(page) return - def run_verifier(self, json_filename, json_list): - self.harvest_pages(json_filename, json_list) + def run_verifier(self, json_filename, json_list, first_run): + self.harvest_pages(json_filename, json_list, first_run) self.size_comparison() -# Verifier subclasses - - -class ProjectDashboardVerifier(Verifier): - def __init__(self): - super().__init__(410, ProjectDashboardPage, '') - - -class ProjectFilesVerifier(Verifier): - def __init__(self): - super().__init__(380, ProjectFilesPage, "files/") - - -class ProjectWikiVerifier(Verifier): - def __init__(self): - super().__init__(410, ProjectWikiPage, "wiki/") - - -class ProjectAnalyticsVerifier(Verifier): - def __init__(self): - super().__init__(380, ProjectAnalyticsPage, "analytics/") - - -class ProjectRegistrationsVerifier(Verifier): - def __init__(self): - super().__init__(380, ProjectRegistrationsPage, "registrations/") - - -class ProjectForksVerifier(Verifier): - def __init__(self): - super().__init__(380, ProjectForksPage, "forks/") - - -class RegistrationDashboardVerifier(Verifier): - def __init__(self): - super().__init__(410, RegistrationDashboardPage, "") - - -class RegistrationFilesVerifier(Verifier): - def __init__(self): - super().__init__(380, RegistrationFilesPage, "files/") - - -class RegistrationWikiVerifier(Verifier): - def __init__(self): - super().__init__(410, RegistrationWikiPage, "wiki/") - - -class RegistrationAnalyticsVerifier(Verifier): - def __init__(self): - super().__init__(380, RegistrationAnalyticsPage, "analytics/") - - -class RegistrationForksVerifier(Verifier): - def __init__(self): - super().__init__(380, RegistrationForksPage, "forks/") - - -class UserProfileVerifier(Verifier): - def __init__(self): - super().__init__(80, UserProfilePage, "") - - -class InstitutionDashboardVerifier(Verifier): - def __init__(self): - super().__init__(350, InstitutionDashboardPage, "") - # Called when json file had scrape_nodes = true # Checks for all the components of a project and if they were scraped # Verifies them and returns a list of the failed pages -def verify_nodes(verification_dictionary, list_name): +def verify_nodes(verification_dictionary, list_name, first_run): nodes_list_verified = [] if verification_dictionary['include_files']: - project_files_verifier = ProjectFilesVerifier() - project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + project_files_verifier = Verifier() + project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) project_files = project_files_verifier.failed_pages nodes_list_verified += project_files if verification_dictionary['include_wiki']: - project_wiki_verifier = ProjectWikiVerifier() - project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + project_wiki_verifier = Verifier() + project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) project_wiki = project_wiki_verifier.failed_pages nodes_list_verified += project_wiki if verification_dictionary['include_analytics']: - project_analytics_verifier = ProjectAnalyticsVerifier() - project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + project_analytics_verifier = Verifier() + project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) project_analytics = project_analytics_verifier.failed_pages nodes_list_verified += project_analytics if verification_dictionary['include_registrations']: - project_registrations_verifier = ProjectRegistrationsVerifier() - project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + project_registrations_verifier = Verifier() + project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) project_registrations = project_registrations_verifier.failed_pages nodes_list_verified += project_registrations if verification_dictionary['include_forks']: - project_forks_verifier = ProjectForksVerifier() - project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + project_forks_verifier = Verifier() + project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) project_forks = project_forks_verifier.failed_pages nodes_list_verified += project_forks if verification_dictionary['include_dashboard']: # This must go last because its URLs don't have a specific ending. - project_dashboards_verifier = ProjectDashboardVerifier() - project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + project_dashboards_verifier = Verifier() + project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) project_dashboards = project_dashboards_verifier.failed_pages nodes_list_verified += project_dashboards return nodes_list_verified @@ -166,26 +130,26 @@ def verify_nodes(verification_dictionary, list_name): # Called when json file had scrape_registrations = true # Verifies the components of a registration and returns a list of the failed pages -def verify_registrations(verification_dictionary, list_name): +def verify_registrations(verification_dictionary, list_name, first_run): # Must run all page types automatically - registration_files_verifier = RegistrationFilesVerifier() - registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + registration_files_verifier = Verifier() + registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) registration_files = registration_files_verifier.failed_pages - registration_wiki_verifier = RegistrationWikiVerifier() - registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + registration_wiki_verifier = Verifier() + registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) registration_wiki = registration_wiki_verifier.failed_pages - registration_analytics_verifier = RegistrationAnalyticsVerifier() - registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + registration_analytics_verifier = Verifier() + registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) registration_analytics = registration_analytics_verifier.failed_pages - registration_forks_verifier = RegistrationForksVerifier() - registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + registration_forks_verifier = Verifier() + registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) registration_forks = registration_forks_verifier.failed_pages - registration_dashboards_verifier = RegistrationDashboardVerifier() - registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) + registration_dashboards_verifier = Verifier() + registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) registration_dashboards = registration_dashboards_verifier.failed_pages registrations_list_verified = registration_files + registration_wiki + registration_analytics + \ @@ -195,68 +159,57 @@ def verify_registrations(verification_dictionary, list_name): # Called when json file had scrape_users = true # Verifies all user profile pages and returns a list of the failed pages -def verify_users(verification_dictionary, list_name): - user_profiles_verifier = UserProfileVerifier() - user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) +def verify_users(verification_dictionary, list_name, first_run): + user_profiles_verifier = Verifier() + user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) user_profiles = user_profiles_verifier.failed_pages return user_profiles # Called when json file had scrape_institutions = true # Verifies all user profile pages and returns a list of the failed pages -def verify_institutions(verification_dictionary, list_name): - institution_dashboards_verifier = InstitutionDashboardVerifier() - institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name]) +def verify_institutions(verification_dictionary, list_name, first_run): + institution_dashboards_verifier = Verifier() + institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) institution_dashboards = institution_dashboards_verifier.failed_pages return institution_dashboards -def call_rescrape(json_dictionary, verification_json_dictionary): +def call_rescrape(verification_json_dictionary): print("Called rescrape.") second_chance = Crawler() - if json_dictionary['scrape_nodes']: - second_chance.node_urls = verification_json_dictionary['node_urls_failed_verification'] - second_chance.scrape_nodes(async=True) - if json_dictionary['scrape_registrations']: - second_chance.registration_urls = verification_json_dictionary['registration_urls_failed_verification'] - second_chance.scrape_registrations(async=True) - if json_dictionary['scrape_users']: - second_chance.user_urls = verification_json_dictionary['user_urls_failed_verification'] - second_chance.scrape_users() - if json_dictionary['scrape_institutions']: - second_chance.institution_urls = verification_json_dictionary['institution_urls_failed_verification'] - second_chance.scrape_institutions() + second_chance.scrape_pages(verification_json_dictionary['error_list']) -def setup_verification(json_dictionary, verification_json_dictionary, first_scrape): +def setup_verification(json_dictionary, first_run): + verification_list = [] print("Check verification") if json_dictionary['scrape_nodes']: - if first_scrape: + if first_run: list_name = 'node_urls' else: - list_name = 'node_urls_failed_verification' - verification_json_dictionary['node_urls_failed_verification'] = verify_nodes(json_dictionary, list_name) + list_name = 'error_list' + verification_list += verify_nodes(json_dictionary, list_name, first_run) if json_dictionary['scrape_registrations']: - if first_scrape: + if first_run: list_name = 'registration_urls' else: - list_name = 'registration_urls_failed_verification' - verification_json_dictionary['registration_urls_failed_verification'] = verify_registrations(json_dictionary, - list_name) + list_name = 'error_list' + verification_list += verify_registrations(json_dictionary, list_name, first_run) if json_dictionary['scrape_users']: - if first_scrape: + if first_run: list_name = 'user_urls' else: - list_name = 'user_urls_failed_verification' - verification_json_dictionary['user_urls_failed_verification'] = \ - verify_users(json_dictionary, list_name) + list_name = 'error_list' + verification_list += verify_users(json_dictionary, list_name, first_run) if json_dictionary['scrape_institutions']: - if first_scrape: + if first_run: list_name = 'institution_urls' else: - list_name = 'institution_urls_failed_verification' - verification_json_dictionary['institution_urls_failed_verification'] = verify_institutions(json_dictionary, - list_name) + list_name = 'error_list' + verification_list += verify_institutions(json_dictionary, list_name, first_run) + + return verification_list def run_verification(json_file, i): @@ -267,30 +220,30 @@ def run_verification(json_file, i): if i == 0: print("Begun 1st run") if run_info['scrape_finished']: - setup_verification(run_info, run_copy, True) + run_copy['error_list'] = setup_verification(run_info, True) run_copy['1st_verification_finished'] = True with codecs.open(json_file, mode='w', encoding='utf-8') as file: json.dump(run_copy, file, indent=4) print("Dumped json run_copy 1st verify") - call_rescrape(run_info, run_copy) + call_rescrape(run_copy) else: print("Begun next run") - setup_verification(run_copy, run_copy, False) + run_copy['error_list'] = setup_verification(run_copy, False) # truncates json and dumps new lists with codecs.open(json_file, mode='w', encoding='utf-8') as file: json.dump(run_copy, file, indent=4) - call_rescrape(run_copy, run_copy) + call_rescrape(run_copy) def resume_verification(json_filename): with codecs.open(json_filename, mode='r', encoding='utf-8') as failure_file: run_copy = json.load(failure_file) print("Resumed verification.") - setup_verification(run_copy, run_info, False) + run_copy['error_list'] = setup_verification(run_copy, False) # truncates json and dumps new lists with codecs.open(json_filename, mode='w', encoding='utf-8') as file: json.dump(run_copy, file, indent=4) - call_rescrape(run_copy, run_copy) + call_rescrape(run_copy) def main(json_filename, num_retries): From 8291ee3e0a9c7b656fed0e7cca2129078b5fb195 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Wed, 10 Aug 2016 16:40:19 -0400 Subject: [PATCH 4/7] trying to fix scraping errors, why are pages not being scraped --- verifier.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/verifier.py b/verifier.py index a103e75..1cd6a21 100644 --- a/verifier.py +++ b/verifier.py @@ -4,6 +4,7 @@ from bs4 import BeautifulSoup from settings import base_urls import os +import pdb MIRROR = 'archive/' @@ -26,7 +27,6 @@ def __str__(self): def get_path_from_url(self, url): # Remove http://domain tail = url.replace(base_urls[0], '') + 'index.html' - print(tail) path = MIRROR + tail return path @@ -44,9 +44,6 @@ def __init__(self): :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL) """ self.minimum_size = 8 - # self.page_type = pg_type - # self.url_end = end - self.pages = [] # All the page objects self.failed_pages = [] @@ -59,7 +56,6 @@ def harvest_pages(self, json_dictionary, json_list, first_run): """ if json_dictionary['error_list'] is not None: for url in json_list[:]: - # if self.url_end in url: print('rel: ', url) if url in json_dictionary['error_list'] and first_run: self.failed_pages.append(url) @@ -68,7 +64,6 @@ def harvest_pages(self, json_dictionary, json_list, first_run): try: obj = Page(url) self.pages.append(obj) - print(obj.path) except FileNotFoundError: print("Failed harvest_pages ", url) self.failed_pages.append(url) @@ -77,8 +72,6 @@ def harvest_pages(self, json_dictionary, json_list, first_run): # Compare page size to page-specific minimum that any fully-scraped page should have def size_comparison(self): for page in self.pages[:]: - print("Size comparison on ", page) - print(page.file_size) if not page.file_size > self.minimum_size: print('Failed: size_comparison(): ', page, ' has size: ', page.file_size) self.failed_pages.append(page.url) @@ -178,6 +171,7 @@ def verify_institutions(verification_dictionary, list_name, first_run): def call_rescrape(verification_json_dictionary): print("Called rescrape.") second_chance = Crawler() + pdb.set_trace() second_chance.scrape_pages(verification_json_dictionary['error_list']) @@ -247,7 +241,4 @@ def resume_verification(json_filename): def main(json_filename, num_retries): - # For testing: - # num_retries = 2 - # call two verification/scraping methods depending on num retries run_verification(json_filename, num_retries) From 756051c2e95ad318dfd02a56edf896492fe80514 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Fri, 12 Aug 2016 14:54:42 -0400 Subject: [PATCH 5/7] added docstrings, took out multiple instances of verifier for components of registrations and nodes --- cli.py | 4 +- verifier.py | 230 +++++++++++++++++++++++++++++++++------------------- 2 files changed, 148 insertions(+), 86 deletions(-) diff --git a/cli.py b/cli.py index ac45380..34933bd 100644 --- a/cli.py +++ b/cli.py @@ -315,7 +315,7 @@ def resume_scrape(db, tf): def verify_mirror(tf, rn): for i in range(rn): - verifier.main(tf, i) + verifier.run_verification(tf, i) def resume_verify_mirror(tf, rn): @@ -326,7 +326,7 @@ def resume_verify_mirror(tf, rn): verifier.resume_verification(tf) else: for i in range(rn): - verifier.main(tf, i) + verifier.run_verification(tf, i) def delete_nodes(ptf, ctf): diff --git a/verifier.py b/verifier.py index 1cd6a21..f47ba2e 100644 --- a/verifier.py +++ b/verifier.py @@ -11,7 +11,19 @@ # Superclass for page-specific page instances class Page: + """ + A Page class is designed to hold an instance of a page scraped. + It's attributes are: + url = the url of the page + path = the file path of the page + """ + def __init__(self, url): + """ + Constructor for the Page class + + :param url: The url of the page + """ self.url = url self.path = self.get_path_from_url(url) # Set size attribute in KB, inherently checks if file exists @@ -25,23 +37,45 @@ def __str__(self): # Takes a URL and produces its relative file name. def get_path_from_url(self, url): + """ + Specifies the file path the page scraped is meant to have. + + :param url: The url of the page + """ # Remove http://domain tail = url.replace(base_urls[0], '') + 'index.html' path = MIRROR + tail return path def get_content(self): + """ + Returns the content of the page scraped. + """ soup = BeautifulSoup(open(self.path), 'html.parser') return soup # Verifier superclass class Verifier: + """ + A Verifier class for verification of the OSF Mirror. + A CLI is designed to work with this verifier in order to ensure that everything that is scraped, is verified. + Basic Workflow: + 1. Init + 2. All urls from json file run through harvest_pages. Failed pages get sent to rescrape. + 3. Remaining urls run through size_comparison. Failed pages get sent to rescrape. + 4. Rescrape failed urls. + 5. Verify the pages that were just rescraped. + + """ + def __init__(self): """ - :param min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file. - :param pg_type: The class to instantiate page objects with. - :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL) + Constructor for the Verifier class + + min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file. + pages: All the page objects + failed_pages: Pages that failed verification and are being sent to rescrape. """ self.minimum_size = 8 self.pages = [] # All the page objects @@ -50,9 +84,13 @@ def __init__(self): # Populate self.pages with the relevant files def harvest_pages(self, json_dictionary, json_list, first_run): """ + On the first run of verification, puts all urls in error_list directly into failed_pages. + Otherwise, tries to create page objects unless scraped file cannot be found in which case the url is added + to failed pages. + :param json_dictionary: The dictionary created from the json file :param json_list: The list in the json file of found URLs - :return: Null, but self.pages is populated. + :param first_run: True if the first_run of verification has been completed. False, otherwise. """ if json_dictionary['error_list'] is not None: for url in json_list[:]: @@ -71,6 +109,10 @@ def harvest_pages(self, json_dictionary, json_list, first_run): # Compare page size to page-specific minimum that any fully-scraped page should have def size_comparison(self): + """ + Checks the file size of every page instance against the minimum size specified in the constructor. + Pages that fail get added to failed_pages to be sent to rescrape. + """ for page in self.pages[:]: if not page.file_size > self.minimum_size: print('Failed: size_comparison(): ', page, ' has size: ', page.file_size) @@ -78,140 +120,159 @@ def size_comparison(self): self.pages.remove(page) return - def run_verifier(self, json_filename, json_list, first_run): - self.harvest_pages(json_filename, json_list, first_run) + def run_verifier(self, json_dictionary, json_list, first_run): + """ + Runs the verifier. + + :param json_dictionary: The dictionary created from the json file + :param json_list: The list in the json file of found URLs + :param first_run: True if the first_run of verification has been completed. False, otherwise. + """ + self.harvest_pages(json_dictionary, json_list, first_run) self.size_comparison() # Called when json file had scrape_nodes = true -# Checks for all the components of a project and if they were scraped -# Verifies them and returns a list of the failed pages def verify_nodes(verification_dictionary, list_name, first_run): - nodes_list_verified = [] - if verification_dictionary['include_files']: - project_files_verifier = Verifier() - project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - project_files = project_files_verifier.failed_pages - nodes_list_verified += project_files - if verification_dictionary['include_wiki']: - project_wiki_verifier = Verifier() - project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - project_wiki = project_wiki_verifier.failed_pages - nodes_list_verified += project_wiki - if verification_dictionary['include_analytics']: - project_analytics_verifier = Verifier() - project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - project_analytics = project_analytics_verifier.failed_pages - nodes_list_verified += project_analytics - if verification_dictionary['include_registrations']: - project_registrations_verifier = Verifier() - project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - project_registrations = project_registrations_verifier.failed_pages - nodes_list_verified += project_registrations - if verification_dictionary['include_forks']: - project_forks_verifier = Verifier() - project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - project_forks = project_forks_verifier.failed_pages - nodes_list_verified += project_forks - if verification_dictionary['include_dashboard']: # This must go last because its URLs don't have a specific ending. - project_dashboards_verifier = Verifier() - project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - project_dashboards = project_dashboards_verifier.failed_pages - nodes_list_verified += project_dashboards - return nodes_list_verified + """ + Called when scrape_nodes = True + + :param verification_dictionary: The dictionary created from the json file. + :param list_name: The list in the json file of found URLs. + :param first_run: True if the first_run of verification has been completed. False, otherwise. + :return: nodes_list_failed_verification: List of all the node urls that need to be rescraped. + """ + projects_verifier = Verifier() + projects_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) + nodes_list_failed_verification = projects_verifier.failed_pages + + return nodes_list_failed_verification # Called when json file had scrape_registrations = true -# Verifies the components of a registration and returns a list of the failed pages def verify_registrations(verification_dictionary, list_name, first_run): + """ + Called when scrape_registrations = True + + :param verification_dictionary: The dictionary created from the json file. + :param list_name: The list in the json file of found URLs. + :param first_run: True if the first_run of verification has been completed. False, otherwise. + :return: registrations_list_failed_verification: List of all the registration urls that need to be rescraped. + """ # Must run all page types automatically - registration_files_verifier = Verifier() - registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - registration_files = registration_files_verifier.failed_pages - - registration_wiki_verifier = Verifier() - registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - registration_wiki = registration_wiki_verifier.failed_pages - - registration_analytics_verifier = Verifier() - registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - registration_analytics = registration_analytics_verifier.failed_pages - - registration_forks_verifier = Verifier() - registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - registration_forks = registration_forks_verifier.failed_pages - - registration_dashboards_verifier = Verifier() - registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - registration_dashboards = registration_dashboards_verifier.failed_pages + registrations_verifier = Verifier() + registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) + registrations_list_failed_verification = registrations_verifier.failed_pages - registrations_list_verified = registration_files + registration_wiki + registration_analytics + \ - registration_forks + registration_dashboards - return registrations_list_verified + return registrations_list_failed_verification # Called when json file had scrape_users = true # Verifies all user profile pages and returns a list of the failed pages def verify_users(verification_dictionary, list_name, first_run): + """ + Called when scrape_users = True + + :param verification_dictionary: The dictionary created from the json file. + :param list_name: The list in the json file of found URLs. + :param first_run: True if the first_run of verification has been completed. False, otherwise. + :return: user_profiles_failed_verification: List of all the user urls that need to be rescraped. + """ user_profiles_verifier = Verifier() user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - user_profiles = user_profiles_verifier.failed_pages - return user_profiles + user_profiles_failed_verification = user_profiles_verifier.failed_pages + return user_profiles_failed_verification # Called when json file had scrape_institutions = true # Verifies all user profile pages and returns a list of the failed pages def verify_institutions(verification_dictionary, list_name, first_run): + """ + Called when scrape_institutions = True + + :param verification_dictionary: The dictionary created from the json file. + :param list_name: The list in the json file of found URLs. + :param first_run: True if the first_run of verification has been completed. False, otherwise. + :return: institutions_dashboards_failed_verification: List of all the institution urls that need to be rescraped. + """ institution_dashboards_verifier = Verifier() institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) - institution_dashboards = institution_dashboards_verifier.failed_pages - return institution_dashboards + institution_dashboards_failed_verification = institution_dashboards_verifier.failed_pages + return institution_dashboards_failed_verification -def call_rescrape(verification_json_dictionary): +def call_rescrape(verification_dictionary): + """ + Rescrapes all urls that failed verification + Creates an instance of the crawler and calls scrape_pages on all urls dumped into 'error_list' in the json file + + :param verification_dictionary: The dictionary created from the json file. + """ print("Called rescrape.") second_chance = Crawler() pdb.set_trace() - second_chance.scrape_pages(verification_json_dictionary['error_list']) + second_chance.scrape_pages(verification_dictionary['error_list']) def setup_verification(json_dictionary, first_run): - verification_list = [] + """ + Specified which lists in the json task file need to be read from based on conditions specified in the json task + file. Also, if its after the first run of verification all urls to be verified are read from error_list. + + :param json_dictionary: The dictionary created from the json file. + :param first_run: True if the first_run of verification has been completed. False, otherwise. + :return: failed_verification_list: List of all the urls that need to be rescraped. + + """ + failed_verification_list = [] print("Check verification") if json_dictionary['scrape_nodes']: if first_run: list_name = 'node_urls' else: list_name = 'error_list' - verification_list += verify_nodes(json_dictionary, list_name, first_run) + failed_verification_list += verify_nodes(json_dictionary, list_name, first_run) if json_dictionary['scrape_registrations']: if first_run: list_name = 'registration_urls' else: list_name = 'error_list' - verification_list += verify_registrations(json_dictionary, list_name, first_run) + failed_verification_list += verify_registrations(json_dictionary, list_name, first_run) if json_dictionary['scrape_users']: if first_run: list_name = 'user_urls' else: list_name = 'error_list' - verification_list += verify_users(json_dictionary, list_name, first_run) + failed_verification_list += verify_users(json_dictionary, list_name, first_run) if json_dictionary['scrape_institutions']: if first_run: list_name = 'institution_urls' else: list_name = 'error_list' - verification_list += verify_institutions(json_dictionary, list_name, first_run) + failed_verification_list += verify_institutions(json_dictionary, list_name, first_run) + + return failed_verification_list - return verification_list +def run_verification(json_file, retry_number): + """ + CLI Endpoint for a normal run of verification. + Controls the main workflow of verification. + Two copies of the json task file are opened. One to preserve the original lists of urls to be verified, + and one to alter to dump all urls to be rescraped into. + On the first run of verification, certain conditions in the json file are checked to determine what lists + in the json file to read from based on what was scraped. An additional condition is added to the json file + when the first run of verification is finished to specify that all subsequent runs of verification need only + read from and dump to the list 'error_list'. -def run_verification(json_file, i): + :param json_file: Name of the json task file. + :param retry_number: Number of what iteration of verification is being run. + """ with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: run_info = json.load(failure_file) with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: run_copy = json.load(failure_file) - if i == 0: + if retry_number == 0: print("Begun 1st run") if run_info['scrape_finished']: run_copy['error_list'] = setup_verification(run_info, True) @@ -229,16 +290,17 @@ def run_verification(json_file, i): call_rescrape(run_copy) -def resume_verification(json_filename): - with codecs.open(json_filename, mode='r', encoding='utf-8') as failure_file: +def resume_verification(json_file): + """ + CLI Endpoint for resuming interrupted verification + + :param json_file: The dictionary created from the json file. + """ + with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: run_copy = json.load(failure_file) print("Resumed verification.") run_copy['error_list'] = setup_verification(run_copy, False) # truncates json and dumps new lists - with codecs.open(json_filename, mode='w', encoding='utf-8') as file: + with codecs.open(json_file, mode='w', encoding='utf-8') as file: json.dump(run_copy, file, indent=4) call_rescrape(run_copy) - - -def main(json_filename, num_retries): - run_verification(json_filename, num_retries) From 01fa9b6c8f4a11b99e30c0aa78f027076ad558c9 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Fri, 12 Aug 2016 14:58:11 -0400 Subject: [PATCH 6/7] Commented out print statements used for testing and removed pdb trace --- verifier.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/verifier.py b/verifier.py index f47ba2e..122df09 100644 --- a/verifier.py +++ b/verifier.py @@ -4,7 +4,6 @@ from bs4 import BeautifulSoup from settings import base_urls import os -import pdb MIRROR = 'archive/' @@ -94,16 +93,16 @@ def harvest_pages(self, json_dictionary, json_list, first_run): """ if json_dictionary['error_list'] is not None: for url in json_list[:]: - print('rel: ', url) + # print('rel: ', url) if url in json_dictionary['error_list'] and first_run: self.failed_pages.append(url) - print('error: ', url) + # print('error: ', url) else: try: obj = Page(url) self.pages.append(obj) except FileNotFoundError: - print("Failed harvest_pages ", url) + # print("Failed harvest_pages ", url) self.failed_pages.append(url) json_list.remove(url) @@ -115,7 +114,7 @@ def size_comparison(self): """ for page in self.pages[:]: if not page.file_size > self.minimum_size: - print('Failed: size_comparison(): ', page, ' has size: ', page.file_size) + # print('Failed: size_comparison(): ', page, ' has size: ', page.file_size) self.failed_pages.append(page.url) self.pages.remove(page) return @@ -210,7 +209,6 @@ def call_rescrape(verification_dictionary): """ print("Called rescrape.") second_chance = Crawler() - pdb.set_trace() second_chance.scrape_pages(verification_dictionary['error_list']) From 12fddc0386d2bd5df3d298be30b88962ad810ba0 Mon Sep 17 00:00:00 2001 From: Sadiyah Faruk Date: Fri, 12 Aug 2016 15:03:16 -0400 Subject: [PATCH 7/7] fixed indentation errors --- verifier.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/verifier.py b/verifier.py index 122df09..871383f 100644 --- a/verifier.py +++ b/verifier.py @@ -65,7 +65,6 @@ class Verifier: 3. Remaining urls run through size_comparison. Failed pages get sent to rescrape. 4. Rescrape failed urls. 5. Verify the pages that were just rescraped. - """ def __init__(self): @@ -157,7 +156,7 @@ def verify_registrations(verification_dictionary, list_name, first_run): :param list_name: The list in the json file of found URLs. :param first_run: True if the first_run of verification has been completed. False, otherwise. :return: registrations_list_failed_verification: List of all the registration urls that need to be rescraped. - """ + """ # Must run all page types automatically registrations_verifier = Verifier() registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) @@ -176,7 +175,7 @@ def verify_users(verification_dictionary, list_name, first_run): :param list_name: The list in the json file of found URLs. :param first_run: True if the first_run of verification has been completed. False, otherwise. :return: user_profiles_failed_verification: List of all the user urls that need to be rescraped. - """ + """ user_profiles_verifier = Verifier() user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) user_profiles_failed_verification = user_profiles_verifier.failed_pages @@ -193,7 +192,7 @@ def verify_institutions(verification_dictionary, list_name, first_run): :param list_name: The list in the json file of found URLs. :param first_run: True if the first_run of verification has been completed. False, otherwise. :return: institutions_dashboards_failed_verification: List of all the institution urls that need to be rescraped. - """ + """ institution_dashboards_verifier = Verifier() institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run) institution_dashboards_failed_verification = institution_dashboards_verifier.failed_pages @@ -206,7 +205,7 @@ def call_rescrape(verification_dictionary): Creates an instance of the crawler and calls scrape_pages on all urls dumped into 'error_list' in the json file :param verification_dictionary: The dictionary created from the json file. - """ + """ print("Called rescrape.") second_chance = Crawler() second_chance.scrape_pages(verification_dictionary['error_list']) @@ -221,7 +220,7 @@ def setup_verification(json_dictionary, first_run): :param first_run: True if the first_run of verification has been completed. False, otherwise. :return: failed_verification_list: List of all the urls that need to be rescraped. - """ + """ failed_verification_list = [] print("Check verification") if json_dictionary['scrape_nodes']: @@ -265,7 +264,7 @@ def run_verification(json_file, retry_number): :param json_file: Name of the json task file. :param retry_number: Number of what iteration of verification is being run. - """ + """ with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: run_info = json.load(failure_file) with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: @@ -294,11 +293,11 @@ def resume_verification(json_file): :param json_file: The dictionary created from the json file. """ - with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: - run_copy = json.load(failure_file) - print("Resumed verification.") - run_copy['error_list'] = setup_verification(run_copy, False) - # truncates json and dumps new lists - with codecs.open(json_file, mode='w', encoding='utf-8') as file: - json.dump(run_copy, file, indent=4) - call_rescrape(run_copy) + with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file: + run_copy = json.load(failure_file) + print("Resumed verification.") + run_copy['error_list'] = setup_verification(run_copy, False) + # truncates json and dumps new lists + with codecs.open(json_file, mode='w', encoding='utf-8') as file: + json.dump(run_copy, file, indent=4) + call_rescrape(run_copy)