From 6aeb926a4baee80a2179d3e8b72925d9c42c3a94 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Wed, 3 Aug 2016 16:31:53 -0400
Subject: [PATCH 1/7] took out spot_check

---
 verifier.py | 202 +---------------------------------------------------
 1 file changed, 1 insertion(+), 201 deletions(-)

diff --git a/verifier.py b/verifier.py
index 8a7f350..00a6d0f 100644
--- a/verifier.py
+++ b/verifier.py
@@ -62,276 +62,76 @@ def size_comparison(self):
                 self.pages.remove(page)
         return
 
-    # Check that specified elements are supposed to exist and a loading bar isn't present instead
-    # Check that specified elements or their alternates are present and non-empty in each page
-    # Alternate: different elements appear if there isn't supposed to be content, so it has to check both
-    # Format: Filled-in : Alternate
-    def spot_check(self):
-        for page in self.pages[:]:
-            soup = page.get_content()
-            # Existential crisis:
-            for element in self.loading_elements:
-                final_element = self.loading_elements[element]  # What is supposed to be there
-                loading_bar_result = soup.select(element)  # Is a loading bar present?
-                if len(loading_bar_result) > 0:  # A loading bar exists (so content does not exist completely)
-                    print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element, " present.")
-                    self.failed_pages.append(page.url)
-                    self.pages.remove(page)
-                    break
-            else:
-                # Alternate checker:
-                for element in self.alternate_elements:
-                    alt = self.alternate_elements[element]
-                    result = soup.select(element)
-                    # No results or empty results, with alternate
-                    if (len(result) == 0 or len(result[0].contents) == 0) and alt != '':
-                        alt_result = soup.select(alt)
-
-                        # Element's alternate has no or empty results
-                        if len(alt_result) == 0 or len(alt_result[0].contents) == 0:
-                            print("Failed: alternate spot_check(): ", page, alt, '\n')
-                            self.failed_pages.append(page.url)
-                            self.pages.remove(page)
-                            break
-
-                    # Element has no alternate and no results or empty results
-                    elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '':
-                        print('Failed: spot_check(): ', page, element, "No alt.", '\n')
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-        return
-
     def run_verifier(self, json_filename, json_list):
         self.harvest_pages(json_filename, json_list)
         self.size_comparison()
-        # self.spot_check()
-
 
 # Verifier subclasses
 
+
 class ProjectDashboardVerifier(Verifier):
     def __init__(self):
         super().__init__(410, ProjectDashboardPage, '')
-        self.loading_elements = {
-            "#treeGrid > div > p": '#tb-tbody',  # Files list
-            "#containment": "#render-node",  # Exists if there are supposed to be components / Is it filled?
-        }
-        self.alternate_elements = {
-            '#nodeTitleEditable': '',  # Title
-            '#contributors span.date.node-last-modified-date': '',  # Last modified
-            '#contributorsList > ol': '',  # Contributor list
-            '#tb-tbody': '',  # File list
-            '#logScope > div > div > div.panel-body > span > dl': '#logFeed > div > p'
-            # Activity / "Unable to retrieve at this time"
-        }
-
-    # Override: the loader for loading_elements is still supposed to exist
-    # Check that specified elements are supposed to exist and a loading bar isn't present instead
-    # Check that specified elements or their alternates are present and non-empty in each page
-    # Alternate: different elements appear if there isn't supposed to be content, so it has to check both
-    # Format: Filled-in : Alternate
-    def spot_check(self):
-        for page in self.pages[:]:
-            soup = page.get_content()
-            # Existential crisis:
-            for element in self.loading_elements:
-                final_element = self.loading_elements[element]  # What is supposed to be there
-                loading_bar_result = soup.select(element)
-                if len(loading_bar_result) > 0:  # Container div is present
-                    final_result = soup.select(final_element)
-                    if len(final_result) == 0:  # Final element isn't in place
-                        print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element,
-                              " present.")
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-            else:
-                # Alternate checker:
-                for element in self.alternate_elements:
-                    alt = self.alternate_elements[element]
-                    result = soup.select(element)
-                    # No results or empty results, with alternate
-                    if (len(result) == 0 or len(result[0].contents) == 0) and alt != '':
-                        alt_result = soup.select(alt)
-
-                        # Element's alternate has no or empty results
-                        if len(alt_result) == 0 or len(alt_result[0].contents) == 0:
-                            print("Failed: alternate spot_check(): ", page, alt, '\n')
-                            self.failed_pages.append(page.url)
-                            self.pages.remove(page)
-                            break
-
-                    # Element has no alternate and no results or empty results
-                    elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '':
-                        print('Failed: spot_check(): ', page, element, "No alt.", '\n')
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-        return
 
 
 class ProjectFilesVerifier(Verifier):
     def __init__(self):
         super().__init__(380, ProjectFilesPage, "files/")
-        self.alternate_elements = {
-            '.fg-file-links': '',  # Links to files (names them)
-        }
 
 
 class ProjectWikiVerifier(Verifier):
     def __init__(self):
         super().__init__(410, ProjectWikiPage, "wiki/")
-        self.alternate_elements = {
-            '#wikiViewRender': '#wikiViewRender > p > em',  # Wiki content / `No wiki content`
-            '#viewVersionSelect option': '',  # Current version date modified
-            '.fg-file-links': ''  # Links to other pages (names them)
-        }
 
 
 class ProjectAnalyticsVerifier(Verifier):
     def __init__(self):
         super().__init__(380, ProjectAnalyticsPage, "analytics/")
-        self.alternate_elements = {
-            '#adBlock': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # Warning about AdBlock
-            'iframe': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # External frame for analytics
-        }
 
 
 class ProjectRegistrationsVerifier(Verifier):
     def __init__(self):
         super().__init__(380, ProjectRegistrationsPage, "registrations/")
-        self.alternate_elements = {
-            '#renderNode': '#registrations > div > div > p'  # List of nodes
-        }
 
 
 class ProjectForksVerifier(Verifier):
     def __init__(self):
         super().__init__(380, ProjectForksPage, "forks/")
-        self.alternate_elements = {
-            '#renderNode': 'div.watermarked > div > div.row > div.col-xs-9.col-sm-8 > p'  # List
-        }
 
 
 class RegistrationDashboardVerifier(Verifier):
     def __init__(self):
         super().__init__(410, RegistrationDashboardPage, "")
-        self.loading_elements = {
-            "#treeGrid > div > p": '#tb-tbody',  # Files list
-            "#containment": "#render-node",  # Exists if there are supposed to be components / Is it filled?
-        }
-        self.alternate_elements = {
-            '#nodeTitleEditable': '',  # Title
-            '#contributors > div > p:nth-of-type(5) > span': '',  # Last modified
-            '#contributorsList > ol': '',  # Contributor list
-            '#logScope > div > div > div.panel-body > span > dl': '#logFeed > div > p'
-            # Activity / "Unable to retrieve at this time"
-        }
-
-    # Override: the loader for loading_elements is still supposed to exist
-    # Check that specified elements are supposed to exist and a loading bar isn't present instead
-    # Check that specified elements or their alternates are present and non-empty in each page
-    # Alternate: different elements appear if there isn't supposed to be content, so it has to check both
-    # Format: Filled-in : Alternate
-    def spot_check(self):
-        for page in self.pages[:]:
-            soup = page.get_content()
-            # Existential crisis:
-            for element in self.loading_elements:
-                final_element = self.loading_elements[element]  # What is supposed to be there
-                loading_bar_result = soup.select(element)
-                if len(loading_bar_result) > 0:  # Container div is present
-                    final_result = soup.select(final_element)
-                    if len(final_result) == 0:  # Final element isn't in place
-                        print("Failed: existential spot_check() ", page, final_element, " doesn't exist, loader ", element,
-                              " present.")
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-            else:
-                # Alternate checker:
-                for element in self.alternate_elements:
-                    alt = self.alternate_elements[element]
-                    result = soup.select(element)
-                    # No results or empty results, with alternate
-                    if (len(result) == 0 or len(result[0].contents) == 0) and alt != '':
-                        alt_result = soup.select(alt)
-
-                        # Element's alternate has no or empty results
-                        if len(alt_result) == 0 or len(alt_result[0].contents) == 0:
-                            print("Failed: alternate spot_check(): ", page, alt, '\n')
-                            self.failed_pages.append(page.url)
-                            self.pages.remove(page)
-                            break
-
-                    # Element has no alternate and no results or empty results
-                    elif (len(result) == 0 or len(result[0].contents) == 0) and alt == '':
-                        print('Failed: spot_check(): ', page, element, "No alt.", '\n')
-                        self.failed_pages.append(page.url)
-                        self.pages.remove(page)
-                        break
-        return
 
 
 class RegistrationFilesVerifier(Verifier):
     def __init__(self):
         super().__init__(380, RegistrationFilesPage, "files/")
-        self.alternate_elements = {
-            '.fg-file-links': '',  # Links to files (names them)
-        }
 
 
 class RegistrationWikiVerifier(Verifier):
     def __init__(self):
         super().__init__(410, RegistrationWikiPage, "wiki/")
-        self.alternate_elements = {
-            '#wikiViewRender': '#wikiViewRender > p > em',  # Wiki content / `No wiki content`
-            '#viewVersionSelect option': '',  # Current version date modified
-            '.fg-file-links': ''  # Links to other pages (names them)
-        }
 
 
 class RegistrationAnalyticsVerifier(Verifier):
     def __init__(self):
         super().__init__(380, RegistrationAnalyticsPage, "analytics/")
-        self.alternate_elements = {
-            '#adBlock': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # Warning about AdBlock
-            'iframe': 'div.watermarked > div > div.m-b-md.p-md.osf-box-lt.box-round.text-center',
-            # External frame for analytics
-        }
 
 
 class RegistrationForksVerifier(Verifier):
     def __init__(self):
         super().__init__(380, RegistrationForksPage, "forks/")
-        self.alternate_elements = {
-            '#renderNode': 'div.watermarked > div > div.row > div.col-xs-9.col-sm-8 > p'  # List
-        }
 
 
 class UserProfileVerifier(Verifier):
     def __init__(self):
         super().__init__(80, UserProfilePage, "")
-        self.alternate_elements = {
-            '#projects': 'div > div:nth-of-type(1) > div > div.panel-body > div',  # Project list / "No projects"
-            '#components': 'div > div:nth-of-type(2) > div > div.panel-body > div',  # Component list / "No components"
-            'body h2': ''  # Activity points, project count
-        }
 
 
 class InstitutionDashboardVerifier(Verifier):
     def __init__(self):
         super().__init__(350, InstitutionDashboardPage, "")
-        self.loading_elements = {
-            '#fileBrowser > div.db-main > div.line-loader > div.load-message': '.fg-file-links'  # "loading" / Project browser
-        }
-        self.alternate_elements = {
-            '#fileBrowser > div.db-infobar > div > div': '#fileBrowser > div.db-infobar > div > div'  # Project preview / "Select a project"
-        }
 
 
 # Called when json file had scrape_nodes = true

From a878aad9ec58ad3659ddf029704bae275381e593 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Thu, 4 Aug 2016 09:32:55 -0400
Subject: [PATCH 2/7] no more page modularization? bad idea? good idea?

---
 verifier.py | 34 +++++++++++++---------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/verifier.py b/verifier.py
index 00a6d0f..7d8b635 100644
--- a/verifier.py
+++ b/verifier.py
@@ -4,7 +4,6 @@
     ProjectForksPage, ProjectRegistrationsPage, ProjectWikiPage, RegistrationDashboardPage, RegistrationFilesPage, \
     RegistrationAnalyticsPage, RegistrationForksPage, RegistrationWikiPage, UserProfilePage, InstitutionDashboardPage
 from crawler import Crawler
-import bs4
 
 
 # Verifier superclass
@@ -19,14 +18,6 @@ def __init__(self, min_size, pg_type, end):
         self.page_type = pg_type
         self.url_end = end
 
-        # Certain elements will be absent if there's no content for them to display, so we check if there is a loading
-        # bar in its place. This means the element should exist, but it doesn't.
-        self.loading_elements = {}
-
-        # Other elements will be replaced by a message if there's no content for them (e.g. "This user has no projects")
-        # We check for the elements and their alternates if the original isn't found.
-        self.alternate_elements = {}
-
         self.pages = []  # All the page objects
         self.failed_pages = []
 
@@ -37,19 +28,20 @@ def harvest_pages(self, json_dictionary, json_list):
         :param json_list: The list in the json file of found URLs
         :return: Null, but self.pages is populated.
         """
-        for url in json_list[:]:
-            if self.url_end in url:
-                print('rel: ', url)
-                if url in json_dictionary['error_list']:
-                    self.failed_pages.append(url)
-                    print('error: ', url)
-                else:
-                    try:
-                        obj = self.page_type(url)
-                        self.pages.append(obj)
-                    except FileNotFoundError:
+        if json_dictionary['error_list'] is not None:
+            for url in json_list[:]:
+                if self.url_end in url:
+                    print('rel: ', url)
+                    if url in json_dictionary['error_list']:
                         self.failed_pages.append(url)
-                json_list.remove(url)
+                        print('error: ', url)
+                    else:
+                        try:
+                            obj = self.page_type(url)
+                            self.pages.append(obj)
+                        except FileNotFoundError:
+                            self.failed_pages.append(url)
+                    json_list.remove(url)
 
     # Compare page size to page-specific minimum that any fully-scraped page should have
     def size_comparison(self):

From e5c0089482021633273aee4644ce72701147a5c9 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Wed, 10 Aug 2016 14:07:38 -0400
Subject: [PATCH 3/7] made big big changes up in here

---
 crawler.py  |  17 ++--
 pages.py    |  99 --------------------
 settings.py |   2 +-
 verifier.py | 263 +++++++++++++++++++++-------------------------------
 4 files changed, 118 insertions(+), 263 deletions(-)
 delete mode 100644 pages.py

diff --git a/crawler.py b/crawler.py
index 94d6dbd..0cca91b 100644
--- a/crawler.py
+++ b/crawler.py
@@ -523,14 +523,14 @@ def scrape_nodes(self, async=True):
         """
         self.debug_logger.info("Scraping nodes, async = " + str(async))
         if async:
-            self._scrape_pages(self.node_urls)
+            self.scrape_pages(self.node_urls)
         else:
             for elem in self.node_url_tuples:
                 lst = []
                 while len(self.node_urls) > 0 and elem[0] in self.node_urls[0]:
                     lst.append(self.node_urls.pop(0))
                 if len(lst) > 0:
-                    self._scrape_pages(lst)
+                    self.scrape_pages(lst)
         self.debug_logger.info("Finished scraping nodes, async = " + str(async))
 
     def scrape_registrations(self, async=True):
@@ -540,13 +540,13 @@ def scrape_registrations(self, async=True):
         """
         self.debug_logger.info("Scraping registrations, async = " + str(async))
         if async:
-            self._scrape_pages(self.registration_urls)
+            self.scrape_pages(self.registration_urls)
         else:
             for elem in self.registration_url_tuples:
                 lst = []
                 while len(self.registration_urls) > 0 and elem[0] in self.registration_urls:
                     lst.append(self.registration_urls.pop(0))
-                self._scrape_pages(lst)
+                self.scrape_pages(lst)
         self.debug_logger.info("Finished scraping registrations, async = " + str(async))
 
     def scrape_users(self):
@@ -554,7 +554,7 @@ def scrape_users(self):
         Wrapper method that scrape all urls in self.user_urls. Calls _scrape_pages().
         """
         self.debug_logger.info("Scraping users")
-        self._scrape_pages(self.user_urls)
+        self.scrape_pages(self.user_urls)
         self.debug_logger.info("Finished scraping users")
 
     def scrape_institutions(self):
@@ -562,7 +562,7 @@ def scrape_institutions(self):
         Wrapper method that scrape all institution_urls. Calls _scrape_pages().
         """
         self.debug_logger.info("Scraping institutions")
-        self._scrape_pages(self.institution_urls)
+        self.scrape_pages(self.institution_urls)
         self.debug_logger.info("Finished scraping institutions")
 
     def scrape_general(self):
@@ -570,11 +570,12 @@ def scrape_general(self):
         Wrapper method that scrape all general_urls. Calls _scrape_pages().
         """
         self.debug_logger.info("Scraping general pages")
-        self._scrape_pages(self.general_urls)
+        self.scrape_pages(self.general_urls)
         self.debug_logger.info("Finished scraping general pages")
 
+
     # TODO Make semaphore value a parameter
-    def _scrape_pages(self, aspect_list):
+    def scrape_pages(self, aspect_list):
         """
         Runner method that runs scrape_url()
         :param aspect_list: list of url of pages to scrape
diff --git a/pages.py b/pages.py
deleted file mode 100644
index e5de596..0000000
--- a/pages.py
+++ /dev/null
@@ -1,99 +0,0 @@
-""" The page superclass and subclasses for verifier"""
-
-from bs4 import BeautifulSoup
-from settings import base_urls
-import os
-
-MIRROR = 'archive/'
-
-
-# Superclass for page-specific page instances
-class Page:
-    def __init__(self, url):
-        self.url = url
-        self.path = self.get_path_from_url(url)
-        # Set size attribute in KB, inherently checks if file exists
-        try:
-            self.file_size = os.path.getsize(self.path) / 1000
-        except FileNotFoundError:
-            raise FileNotFoundError
-
-    def __str__(self):
-        return self.path
-
-    # Takes a URL and produces its relative file name.
-    def get_path_from_url(self, url):
-        # Remove http://domain
-        tail = url.replace(base_urls[0], '') + 'index.html'
-        path = MIRROR + tail
-        return path
-
-    def get_content(self):
-        soup = BeautifulSoup(open(self.path), 'html.parser')
-        return soup
-
-
-# Page-specific subclasses
-class ProjectDashboardPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class ProjectFilesPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class ProjectWikiPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class ProjectAnalyticsPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class ProjectRegistrationsPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class ProjectForksPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class RegistrationDashboardPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class RegistrationFilesPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class RegistrationWikiPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class RegistrationAnalyticsPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class RegistrationForksPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class UserProfilePage(Page):
-    def __init__(self, url):
-        super().__init__(url)
-
-
-class InstitutionDashboardPage(Page):
-    def __init__(self, url):
-        super().__init__(url)
\ No newline at end of file
diff --git a/settings.py b/settings.py
index 5106b05..3d66fae 100644
--- a/settings.py
+++ b/settings.py
@@ -1,5 +1,5 @@
 # The OSF website URL, and the API
-base_urls = ['https://osf.io/', 'https://api.osf.io/v2/']
+base_urls = ['https://staging.osf.io/', 'https://staging-api.osf.io/v2/']
 
 DEBUG_LOG_FILENAME = 'debug_log.txt'
 ERROR_LOG_FILENAME = 'error_log.txt'
\ No newline at end of file
diff --git a/verifier.py b/verifier.py
index 7d8b635..a103e75 100644
--- a/verifier.py
+++ b/verifier.py
@@ -1,28 +1,57 @@
 import json
 import codecs
-from pages import ProjectDashboardPage, ProjectFilesPage, ProjectAnalyticsPage, \
-    ProjectForksPage, ProjectRegistrationsPage, ProjectWikiPage, RegistrationDashboardPage, RegistrationFilesPage, \
-    RegistrationAnalyticsPage, RegistrationForksPage, RegistrationWikiPage, UserProfilePage, InstitutionDashboardPage
 from crawler import Crawler
+from bs4 import BeautifulSoup
+from settings import base_urls
+import os
+
+MIRROR = 'archive/'
+
+
+# Superclass for page-specific page instances
+class Page:
+    def __init__(self, url):
+        self.url = url
+        self.path = self.get_path_from_url(url)
+        # Set size attribute in KB, inherently checks if file exists
+        try:
+            self.file_size = os.path.getsize(self.path) / 1000
+        except FileNotFoundError:
+            raise FileNotFoundError
+
+    def __str__(self):
+        return self.path
+
+    # Takes a URL and produces its relative file name.
+    def get_path_from_url(self, url):
+        # Remove http://domain
+        tail = url.replace(base_urls[0], '') + 'index.html'
+        print(tail)
+        path = MIRROR + tail
+        return path
+
+    def get_content(self):
+        soup = BeautifulSoup(open(self.path), 'html.parser')
+        return soup
 
 
 # Verifier superclass
 class Verifier:
-    def __init__(self, min_size, pg_type, end):
+    def __init__(self):
         """
         :param min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file.
         :param pg_type: The class to instantiate page objects with.
         :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL)
         """
-        self.minimum_size = min_size
-        self.page_type = pg_type
-        self.url_end = end
+        self.minimum_size = 8
+        # self.page_type = pg_type
+        # self.url_end = end
 
         self.pages = []  # All the page objects
         self.failed_pages = []
 
     # Populate self.pages with the relevant files
-    def harvest_pages(self, json_dictionary, json_list):
+    def harvest_pages(self, json_dictionary, json_list, first_run):
         """
         :param json_dictionary: The dictionary created from the json file
         :param json_list: The list in the json file of found URLs
@@ -30,135 +59,70 @@ def harvest_pages(self, json_dictionary, json_list):
         """
         if json_dictionary['error_list'] is not None:
             for url in json_list[:]:
-                if self.url_end in url:
-                    print('rel: ', url)
-                    if url in json_dictionary['error_list']:
+                # if self.url_end in url:
+                print('rel: ', url)
+                if url in json_dictionary['error_list'] and first_run:
+                    self.failed_pages.append(url)
+                    print('error: ', url)
+                else:
+                    try:
+                        obj = Page(url)
+                        self.pages.append(obj)
+                        print(obj.path)
+                    except FileNotFoundError:
+                        print("Failed harvest_pages ", url)
                         self.failed_pages.append(url)
-                        print('error: ', url)
-                    else:
-                        try:
-                            obj = self.page_type(url)
-                            self.pages.append(obj)
-                        except FileNotFoundError:
-                            self.failed_pages.append(url)
-                    json_list.remove(url)
+                json_list.remove(url)
 
     # Compare page size to page-specific minimum that any fully-scraped page should have
     def size_comparison(self):
         for page in self.pages[:]:
-            # print(page)
-            # print(page.file_size)
+            print("Size comparison on ", page)
+            print(page.file_size)
             if not page.file_size > self.minimum_size:
                 print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
                 self.failed_pages.append(page.url)
                 self.pages.remove(page)
         return
 
-    def run_verifier(self, json_filename, json_list):
-        self.harvest_pages(json_filename, json_list)
+    def run_verifier(self, json_filename, json_list, first_run):
+        self.harvest_pages(json_filename, json_list, first_run)
         self.size_comparison()
 
-# Verifier subclasses
-
-
-class ProjectDashboardVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, ProjectDashboardPage, '')
-
-
-class ProjectFilesVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectFilesPage, "files/")
-
-
-class ProjectWikiVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, ProjectWikiPage, "wiki/")
-
-
-class ProjectAnalyticsVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectAnalyticsPage, "analytics/")
-
-
-class ProjectRegistrationsVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectRegistrationsPage, "registrations/")
-
-
-class ProjectForksVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, ProjectForksPage, "forks/")
-
-
-class RegistrationDashboardVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, RegistrationDashboardPage, "")
-
-
-class RegistrationFilesVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, RegistrationFilesPage, "files/")
-
-
-class RegistrationWikiVerifier(Verifier):
-    def __init__(self):
-        super().__init__(410, RegistrationWikiPage, "wiki/")
-
-
-class RegistrationAnalyticsVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, RegistrationAnalyticsPage, "analytics/")
-
-
-class RegistrationForksVerifier(Verifier):
-    def __init__(self):
-        super().__init__(380, RegistrationForksPage, "forks/")
-
-
-class UserProfileVerifier(Verifier):
-    def __init__(self):
-        super().__init__(80, UserProfilePage, "")
-
-
-class InstitutionDashboardVerifier(Verifier):
-    def __init__(self):
-        super().__init__(350, InstitutionDashboardPage, "")
-
 
 # Called when json file had scrape_nodes = true
 # Checks for all the components of a project and if they were scraped
 # Verifies them and returns a list of the failed pages
-def verify_nodes(verification_dictionary, list_name):
+def verify_nodes(verification_dictionary, list_name, first_run):
     nodes_list_verified = []
     if verification_dictionary['include_files']:
-        project_files_verifier = ProjectFilesVerifier()
-        project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+        project_files_verifier = Verifier()
+        project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
         project_files = project_files_verifier.failed_pages
         nodes_list_verified += project_files
     if verification_dictionary['include_wiki']:
-        project_wiki_verifier = ProjectWikiVerifier()
-        project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+        project_wiki_verifier = Verifier()
+        project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
         project_wiki = project_wiki_verifier.failed_pages
         nodes_list_verified += project_wiki
     if verification_dictionary['include_analytics']:
-        project_analytics_verifier = ProjectAnalyticsVerifier()
-        project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+        project_analytics_verifier = Verifier()
+        project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
         project_analytics = project_analytics_verifier.failed_pages
         nodes_list_verified += project_analytics
     if verification_dictionary['include_registrations']:
-        project_registrations_verifier = ProjectRegistrationsVerifier()
-        project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+        project_registrations_verifier = Verifier()
+        project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
         project_registrations = project_registrations_verifier.failed_pages
         nodes_list_verified += project_registrations
     if verification_dictionary['include_forks']:
-        project_forks_verifier = ProjectForksVerifier()
-        project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+        project_forks_verifier = Verifier()
+        project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
         project_forks = project_forks_verifier.failed_pages
         nodes_list_verified += project_forks
     if verification_dictionary['include_dashboard']:  # This must go last because its URLs don't have a specific ending.
-        project_dashboards_verifier = ProjectDashboardVerifier()
-        project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+        project_dashboards_verifier = Verifier()
+        project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
         project_dashboards = project_dashboards_verifier.failed_pages
         nodes_list_verified += project_dashboards
     return nodes_list_verified
@@ -166,26 +130,26 @@ def verify_nodes(verification_dictionary, list_name):
 
 # Called when json file had scrape_registrations = true
 # Verifies the components of a registration and returns a list of the failed pages
-def verify_registrations(verification_dictionary, list_name):
+def verify_registrations(verification_dictionary, list_name, first_run):
     # Must run all page types automatically
-    registration_files_verifier = RegistrationFilesVerifier()
-    registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+    registration_files_verifier = Verifier()
+    registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     registration_files = registration_files_verifier.failed_pages
 
-    registration_wiki_verifier = RegistrationWikiVerifier()
-    registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+    registration_wiki_verifier = Verifier()
+    registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     registration_wiki = registration_wiki_verifier.failed_pages
 
-    registration_analytics_verifier = RegistrationAnalyticsVerifier()
-    registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+    registration_analytics_verifier = Verifier()
+    registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     registration_analytics = registration_analytics_verifier.failed_pages
 
-    registration_forks_verifier = RegistrationForksVerifier()
-    registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+    registration_forks_verifier = Verifier()
+    registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     registration_forks = registration_forks_verifier.failed_pages
 
-    registration_dashboards_verifier = RegistrationDashboardVerifier()
-    registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+    registration_dashboards_verifier = Verifier()
+    registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     registration_dashboards = registration_dashboards_verifier.failed_pages
 
     registrations_list_verified = registration_files + registration_wiki + registration_analytics + \
@@ -195,68 +159,57 @@ def verify_registrations(verification_dictionary, list_name):
 
 # Called when json file had scrape_users = true
 # Verifies all user profile pages and returns a list of the failed pages
-def verify_users(verification_dictionary, list_name):
-    user_profiles_verifier = UserProfileVerifier()
-    user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+def verify_users(verification_dictionary, list_name, first_run):
+    user_profiles_verifier = Verifier()
+    user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     user_profiles = user_profiles_verifier.failed_pages
     return user_profiles
 
 
 # Called when json file had scrape_institutions = true
 # Verifies all user profile pages and returns a list of the failed pages
-def verify_institutions(verification_dictionary, list_name):
-    institution_dashboards_verifier = InstitutionDashboardVerifier()
-    institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name])
+def verify_institutions(verification_dictionary, list_name, first_run):
+    institution_dashboards_verifier = Verifier()
+    institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     institution_dashboards = institution_dashboards_verifier.failed_pages
     return institution_dashboards
 
 
-def call_rescrape(json_dictionary, verification_json_dictionary):
+def call_rescrape(verification_json_dictionary):
     print("Called rescrape.")
     second_chance = Crawler()
-    if json_dictionary['scrape_nodes']:
-        second_chance.node_urls = verification_json_dictionary['node_urls_failed_verification']
-        second_chance.scrape_nodes(async=True)
-    if json_dictionary['scrape_registrations']:
-        second_chance.registration_urls = verification_json_dictionary['registration_urls_failed_verification']
-        second_chance.scrape_registrations(async=True)
-    if json_dictionary['scrape_users']:
-        second_chance.user_urls = verification_json_dictionary['user_urls_failed_verification']
-        second_chance.scrape_users()
-    if json_dictionary['scrape_institutions']:
-        second_chance.institution_urls = verification_json_dictionary['institution_urls_failed_verification']
-        second_chance.scrape_institutions()
+    second_chance.scrape_pages(verification_json_dictionary['error_list'])
 
 
-def setup_verification(json_dictionary, verification_json_dictionary, first_scrape):
+def setup_verification(json_dictionary, first_run):
+    verification_list = []
     print("Check verification")
     if json_dictionary['scrape_nodes']:
-        if first_scrape:
+        if first_run:
             list_name = 'node_urls'
         else:
-            list_name = 'node_urls_failed_verification'
-        verification_json_dictionary['node_urls_failed_verification'] = verify_nodes(json_dictionary, list_name)
+            list_name = 'error_list'
+        verification_list += verify_nodes(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_registrations']:
-        if first_scrape:
+        if first_run:
             list_name = 'registration_urls'
         else:
-            list_name = 'registration_urls_failed_verification'
-        verification_json_dictionary['registration_urls_failed_verification'] = verify_registrations(json_dictionary,
-                                                                                                     list_name)
+            list_name = 'error_list'
+        verification_list += verify_registrations(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_users']:
-        if first_scrape:
+        if first_run:
             list_name = 'user_urls'
         else:
-            list_name = 'user_urls_failed_verification'
-        verification_json_dictionary['user_urls_failed_verification'] = \
-            verify_users(json_dictionary, list_name)
+            list_name = 'error_list'
+        verification_list += verify_users(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_institutions']:
-        if first_scrape:
+        if first_run:
             list_name = 'institution_urls'
         else:
-            list_name = 'institution_urls_failed_verification'
-        verification_json_dictionary['institution_urls_failed_verification'] = verify_institutions(json_dictionary,
-                                                                                                   list_name)
+            list_name = 'error_list'
+        verification_list += verify_institutions(json_dictionary, list_name, first_run)
+
+    return verification_list
 
 
 def run_verification(json_file, i):
@@ -267,30 +220,30 @@ def run_verification(json_file, i):
     if i == 0:
         print("Begun 1st run")
         if run_info['scrape_finished']:
-            setup_verification(run_info, run_copy, True)
+            run_copy['error_list'] = setup_verification(run_info, True)
             run_copy['1st_verification_finished'] = True
             with codecs.open(json_file, mode='w', encoding='utf-8') as file:
                 json.dump(run_copy, file, indent=4)
                 print("Dumped json run_copy 1st verify")
-        call_rescrape(run_info, run_copy)
+        call_rescrape(run_copy)
     else:
         print("Begun next run")
-        setup_verification(run_copy, run_copy, False)
+        run_copy['error_list'] = setup_verification(run_copy, False)
         # truncates json and dumps new lists
         with codecs.open(json_file, mode='w', encoding='utf-8') as file:
             json.dump(run_copy, file, indent=4)
-        call_rescrape(run_copy, run_copy)
+        call_rescrape(run_copy)
 
 
 def resume_verification(json_filename):
         with codecs.open(json_filename, mode='r', encoding='utf-8') as failure_file:
             run_copy = json.load(failure_file)
         print("Resumed verification.")
-        setup_verification(run_copy, run_info, False)
+        run_copy['error_list'] = setup_verification(run_copy, False)
         # truncates json and dumps new lists
         with codecs.open(json_filename, mode='w', encoding='utf-8') as file:
             json.dump(run_copy, file, indent=4)
-        call_rescrape(run_copy, run_copy)
+        call_rescrape(run_copy)
 
 
 def main(json_filename, num_retries):

From 8291ee3e0a9c7b656fed0e7cca2129078b5fb195 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Wed, 10 Aug 2016 16:40:19 -0400
Subject: [PATCH 4/7] trying to fix scraping errors, why are pages not being
 scraped

---
 verifier.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/verifier.py b/verifier.py
index a103e75..1cd6a21 100644
--- a/verifier.py
+++ b/verifier.py
@@ -4,6 +4,7 @@
 from bs4 import BeautifulSoup
 from settings import base_urls
 import os
+import pdb
 
 MIRROR = 'archive/'
 
@@ -26,7 +27,6 @@ def __str__(self):
     def get_path_from_url(self, url):
         # Remove http://domain
         tail = url.replace(base_urls[0], '') + 'index.html'
-        print(tail)
         path = MIRROR + tail
         return path
 
@@ -44,9 +44,6 @@ def __init__(self):
         :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL)
         """
         self.minimum_size = 8
-        # self.page_type = pg_type
-        # self.url_end = end
-
         self.pages = []  # All the page objects
         self.failed_pages = []
 
@@ -59,7 +56,6 @@ def harvest_pages(self, json_dictionary, json_list, first_run):
         """
         if json_dictionary['error_list'] is not None:
             for url in json_list[:]:
-                # if self.url_end in url:
                 print('rel: ', url)
                 if url in json_dictionary['error_list'] and first_run:
                     self.failed_pages.append(url)
@@ -68,7 +64,6 @@ def harvest_pages(self, json_dictionary, json_list, first_run):
                     try:
                         obj = Page(url)
                         self.pages.append(obj)
-                        print(obj.path)
                     except FileNotFoundError:
                         print("Failed harvest_pages ", url)
                         self.failed_pages.append(url)
@@ -77,8 +72,6 @@ def harvest_pages(self, json_dictionary, json_list, first_run):
     # Compare page size to page-specific minimum that any fully-scraped page should have
     def size_comparison(self):
         for page in self.pages[:]:
-            print("Size comparison on ", page)
-            print(page.file_size)
             if not page.file_size > self.minimum_size:
                 print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
                 self.failed_pages.append(page.url)
@@ -178,6 +171,7 @@ def verify_institutions(verification_dictionary, list_name, first_run):
 def call_rescrape(verification_json_dictionary):
     print("Called rescrape.")
     second_chance = Crawler()
+    pdb.set_trace()
     second_chance.scrape_pages(verification_json_dictionary['error_list'])
 
 
@@ -247,7 +241,4 @@ def resume_verification(json_filename):
 
 
 def main(json_filename, num_retries):
-    # For testing:
-    # num_retries = 2
-    # call two verification/scraping methods depending on num retries
     run_verification(json_filename, num_retries)

From 756051c2e95ad318dfd02a56edf896492fe80514 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Fri, 12 Aug 2016 14:54:42 -0400
Subject: [PATCH 5/7] added docstrings, took out multiple instances of verifier
 for components of registrations and nodes

---
 cli.py      |   4 +-
 verifier.py | 230 +++++++++++++++++++++++++++++++++-------------------
 2 files changed, 148 insertions(+), 86 deletions(-)

diff --git a/cli.py b/cli.py
index ac45380..34933bd 100644
--- a/cli.py
+++ b/cli.py
@@ -315,7 +315,7 @@ def resume_scrape(db, tf):
 
 def verify_mirror(tf, rn):
     for i in range(rn):
-        verifier.main(tf, i)
+        verifier.run_verification(tf, i)
 
 
 def resume_verify_mirror(tf, rn):
@@ -326,7 +326,7 @@ def resume_verify_mirror(tf, rn):
             verifier.resume_verification(tf)
     else:
         for i in range(rn):
-            verifier.main(tf, i)
+            verifier.run_verification(tf, i)
 
 
 def delete_nodes(ptf, ctf):
diff --git a/verifier.py b/verifier.py
index 1cd6a21..f47ba2e 100644
--- a/verifier.py
+++ b/verifier.py
@@ -11,7 +11,19 @@
 
 # Superclass for page-specific page instances
 class Page:
+    """
+        A Page class is designed to hold an instance of a page scraped.
+        Its attributes are:
+            url = the url of the page
+            path = the file path of the page
+    """
+
     def __init__(self, url):
+        """
+            Constructor for the Page class
+
+            :param url: The url of the page
+        """
         self.url = url
         self.path = self.get_path_from_url(url)
         # Set size attribute in KB, inherently checks if file exists
@@ -25,23 +37,45 @@ def __str__(self):
 
     # Takes a URL and produces its relative file name.
     def get_path_from_url(self, url):
+        """
+            Specifies the file path the page scraped is meant to have.
+
+            :param url: The url of the page
+        """
         # Remove http://domain
         tail = url.replace(base_urls[0], '') + 'index.html'
         path = MIRROR + tail
         return path
 
     def get_content(self):
+        """
+            Returns the content of the page scraped.
+        """
         soup = BeautifulSoup(open(self.path), 'html.parser')
         return soup
 
 
 # Verifier superclass
 class Verifier:
+    """
+        A Verifier class for verification of the OSF Mirror.
+        A CLI is designed to work with this verifier in order to ensure that everything that is scraped, is verified.
+        Basic Workflow:
+            1. Init
+            2. All urls from json file run through harvest_pages. Failed pages get sent to rescrape.
+            3. Remaining urls run through size_comparison. Failed pages get sent to rescrape.
+            4. Rescrape failed urls.
+            5. Verify the pages that were just rescraped.
+
+        """
+
     def __init__(self):
         """
-        :param min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file.
-        :param pg_type: The class to instantiate page objects with.
-        :param end: Indentifier in the URL, e.g. 'files/', 'end' is a misnomer ('wiki/' in the middle of a URL)
+        Constructor for the Verifier class
+
+            min_size: File size minimum for a page. Anything below this couldn't possibly be a complete file.
+            pages: All the page objects
+            failed_pages: Pages that failed verification and are being sent to rescrape.
         """
         self.minimum_size = 8
         self.pages = []  # All the page objects
@@ -50,9 +84,13 @@ def __init__(self):
     # Populate self.pages with the relevant files
     def harvest_pages(self, json_dictionary, json_list, first_run):
         """
+        On the first run of verification, puts all urls in error_list directly into failed_pages.
+        Otherwise, tries to create page objects unless scraped file cannot be found in which case the url is added
+        to failed pages.
+
         :param json_dictionary: The dictionary created from the json file
         :param json_list: The list in the json file of found URLs
-        :return: Null, but self.pages is populated.
+        :param first_run: True if this is the first run of verification. False, otherwise.
         """
         if json_dictionary['error_list'] is not None:
             for url in json_list[:]:
@@ -71,6 +109,10 @@ def harvest_pages(self, json_dictionary, json_list, first_run):
 
     # Compare page size to page-specific minimum that any fully-scraped page should have
     def size_comparison(self):
+        """
+            Checks the file size of every page instance against the minimum size specified in the constructor.
+            Pages that fail get added to failed_pages to be sent to rescrape.
+        """
         for page in self.pages[:]:
             if not page.file_size > self.minimum_size:
                 print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
@@ -78,140 +120,159 @@ def size_comparison(self):
                 self.pages.remove(page)
         return
 
-    def run_verifier(self, json_filename, json_list, first_run):
-        self.harvest_pages(json_filename, json_list, first_run)
+    def run_verifier(self, json_dictionary, json_list, first_run):
+        """
+            Runs the verifier.
+
+            :param json_dictionary: The dictionary created from the json file
+            :param json_list: The list in the json file of found URLs
+            :param first_run: True if this is the first run of verification. False, otherwise.
+        """
+        self.harvest_pages(json_dictionary, json_list, first_run)
         self.size_comparison()
 
 
 # Called when json file had scrape_nodes = true
-# Checks for all the components of a project and if they were scraped
-# Verifies them and returns a list of the failed pages
 def verify_nodes(verification_dictionary, list_name, first_run):
-    nodes_list_verified = []
-    if verification_dictionary['include_files']:
-        project_files_verifier = Verifier()
-        project_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-        project_files = project_files_verifier.failed_pages
-        nodes_list_verified += project_files
-    if verification_dictionary['include_wiki']:
-        project_wiki_verifier = Verifier()
-        project_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-        project_wiki = project_wiki_verifier.failed_pages
-        nodes_list_verified += project_wiki
-    if verification_dictionary['include_analytics']:
-        project_analytics_verifier = Verifier()
-        project_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-        project_analytics = project_analytics_verifier.failed_pages
-        nodes_list_verified += project_analytics
-    if verification_dictionary['include_registrations']:
-        project_registrations_verifier = Verifier()
-        project_registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-        project_registrations = project_registrations_verifier.failed_pages
-        nodes_list_verified += project_registrations
-    if verification_dictionary['include_forks']:
-        project_forks_verifier = Verifier()
-        project_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-        project_forks = project_forks_verifier.failed_pages
-        nodes_list_verified += project_forks
-    if verification_dictionary['include_dashboard']:  # This must go last because its URLs don't have a specific ending.
-        project_dashboards_verifier = Verifier()
-        project_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-        project_dashboards = project_dashboards_verifier.failed_pages
-        nodes_list_verified += project_dashboards
-    return nodes_list_verified
+    """
+       Called when scrape_nodes = True
+
+       :param verification_dictionary: The dictionary created from the json file.
+       :param list_name: The list in the json file of found URLs.
+       :param first_run: True if this is the first run of verification. False, otherwise.
+       :return: nodes_list_failed_verification: List of all the node urls that need to be rescraped.
+    """
+    projects_verifier = Verifier()
+    projects_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
+    nodes_list_failed_verification = projects_verifier.failed_pages
+
+    return nodes_list_failed_verification
 
 
 # Called when json file had scrape_registrations = true
-# Verifies the components of a registration and returns a list of the failed pages
 def verify_registrations(verification_dictionary, list_name, first_run):
+    """
+        Called when scrape_registrations = True
+
+        :param verification_dictionary: The dictionary created from the json file.
+        :param list_name: The list in the json file of found URLs.
+        :param first_run: True if the first_run of verification has been completed. False, otherwise.
+        :return: registrations_list_failed_verification: List of all the registration urls that need to be rescraped.
+     """
     # Must run all page types automatically
-    registration_files_verifier = Verifier()
-    registration_files_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    registration_files = registration_files_verifier.failed_pages
-
-    registration_wiki_verifier = Verifier()
-    registration_wiki_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    registration_wiki = registration_wiki_verifier.failed_pages
-
-    registration_analytics_verifier = Verifier()
-    registration_analytics_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    registration_analytics = registration_analytics_verifier.failed_pages
-
-    registration_forks_verifier = Verifier()
-    registration_forks_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    registration_forks = registration_forks_verifier.failed_pages
-
-    registration_dashboards_verifier = Verifier()
-    registration_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    registration_dashboards = registration_dashboards_verifier.failed_pages
+    registrations_verifier = Verifier()
+    registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
+    registrations_list_failed_verification = registrations_verifier.failed_pages
 
-    registrations_list_verified = registration_files + registration_wiki + registration_analytics + \
-        registration_forks + registration_dashboards
-    return registrations_list_verified
+    return registrations_list_failed_verification
 
 
 # Called when json file had scrape_users = true
 # Verifies all user profile pages and returns a list of the failed pages
 def verify_users(verification_dictionary, list_name, first_run):
+    """
+        Called when scrape_users = True
+
+        :param verification_dictionary: The dictionary created from the json file.
+        :param list_name: The list in the json file of found URLs.
+        :param first_run: True if the first_run of verification has been completed. False, otherwise.
+        :return: user_profiles_failed_verification: List of all the user urls that need to be rescraped.
+     """
     user_profiles_verifier = Verifier()
     user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    user_profiles = user_profiles_verifier.failed_pages
-    return user_profiles
+    user_profiles_failed_verification = user_profiles_verifier.failed_pages
+    return user_profiles_failed_verification
 
 
 # Called when json file had scrape_institutions = true
 # Verifies all user profile pages and returns a list of the failed pages
 def verify_institutions(verification_dictionary, list_name, first_run):
+    """
+        Called when scrape_institutions = True
+
+        :param verification_dictionary: The dictionary created from the json file.
+        :param list_name: The list in the json file of found URLs.
+        :param first_run: True if the first_run of verification has been completed. False, otherwise.
+        :return: institutions_dashboards_failed_verification: List of all the institution urls that need to be rescraped.
+     """
     institution_dashboards_verifier = Verifier()
     institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
-    institution_dashboards = institution_dashboards_verifier.failed_pages
-    return institution_dashboards
+    institution_dashboards_failed_verification = institution_dashboards_verifier.failed_pages
+    return institution_dashboards_failed_verification
 
 
-def call_rescrape(verification_json_dictionary):
+def call_rescrape(verification_dictionary):
+    """
+        Rescrapes all urls that failed verification
+        Creates an instance of the crawler and calls scrape_pages on all urls dumped into 'error_list' in the json file
+
+        :param verification_dictionary: The dictionary created from the json file.
+     """
     print("Called rescrape.")
     second_chance = Crawler()
     pdb.set_trace()
-    second_chance.scrape_pages(verification_json_dictionary['error_list'])
+    second_chance.scrape_pages(verification_dictionary['error_list'])
 
 
 def setup_verification(json_dictionary, first_run):
-    verification_list = []
+    """
+        Specifies which lists in the json task file need to be read from, based on conditions specified in the json task
+        file. Also, if it's after the first run of verification, all urls to be verified are read from error_list.
+
+        :param json_dictionary: The dictionary created from the json file.
+        :param first_run: True if the first_run of verification has been completed. False, otherwise.
+        :return: failed_verification_list: List of all the urls that need to be rescraped.
+
+     """
+    failed_verification_list = []
     print("Check verification")
     if json_dictionary['scrape_nodes']:
         if first_run:
             list_name = 'node_urls'
         else:
             list_name = 'error_list'
-        verification_list += verify_nodes(json_dictionary, list_name, first_run)
+        failed_verification_list += verify_nodes(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_registrations']:
         if first_run:
             list_name = 'registration_urls'
         else:
             list_name = 'error_list'
-        verification_list += verify_registrations(json_dictionary, list_name, first_run)
+        failed_verification_list += verify_registrations(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_users']:
         if first_run:
             list_name = 'user_urls'
         else:
             list_name = 'error_list'
-        verification_list += verify_users(json_dictionary, list_name, first_run)
+        failed_verification_list += verify_users(json_dictionary, list_name, first_run)
     if json_dictionary['scrape_institutions']:
         if first_run:
             list_name = 'institution_urls'
         else:
             list_name = 'error_list'
-        verification_list += verify_institutions(json_dictionary, list_name, first_run)
+        failed_verification_list += verify_institutions(json_dictionary, list_name, first_run)
+
+    return failed_verification_list
 
-    return verification_list
 
+def run_verification(json_file, retry_number):
+    """
+        CLI Endpoint for a normal run of verification.
+        Controls the main workflow of verification.
+            Two copies of the json task file are opened. One to preserve the original lists of urls to be verified,
+            and one to alter to dump all urls to be rescraped into.
+            On the first run of verification, certain conditions in the json file are checked to determine what lists
+            in the json file to read from based on what was scraped. An additional condition is added to the json file
+            when the first run of verification is finished to specify that all subsequent runs of verification need only
+            read from and dump to the list 'error_list'.
 
-def run_verification(json_file, i):
+        :param json_file: Name of the json task file.
+        :param retry_number: Number of what iteration of verification is being run.
+     """
     with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
         run_info = json.load(failure_file)
     with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
         run_copy = json.load(failure_file)
-    if i == 0:
+    if retry_number == 0:
         print("Begun 1st run")
         if run_info['scrape_finished']:
             run_copy['error_list'] = setup_verification(run_info, True)
@@ -229,16 +290,17 @@ def run_verification(json_file, i):
         call_rescrape(run_copy)
 
 
-def resume_verification(json_filename):
-        with codecs.open(json_filename, mode='r', encoding='utf-8') as failure_file:
+def resume_verification(json_file):
+    """
+        CLI Endpoint for resuming interrupted verification
+
+        :param json_file: The dictionary created from the json file.
+     """
+        with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
             run_copy = json.load(failure_file)
         print("Resumed verification.")
         run_copy['error_list'] = setup_verification(run_copy, False)
         # truncates json and dumps new lists
-        with codecs.open(json_filename, mode='w', encoding='utf-8') as file:
+        with codecs.open(json_file, mode='w', encoding='utf-8') as file:
             json.dump(run_copy, file, indent=4)
         call_rescrape(run_copy)
-
-
-def main(json_filename, num_retries):
-    run_verification(json_filename, num_retries)

From 01fa9b6c8f4a11b99e30c0aa78f027076ad558c9 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Fri, 12 Aug 2016 14:58:11 -0400
Subject: [PATCH 6/7] Commented out print statements used for testing and
 removed pdb trace

---
 verifier.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/verifier.py b/verifier.py
index f47ba2e..122df09 100644
--- a/verifier.py
+++ b/verifier.py
@@ -4,7 +4,6 @@
 from bs4 import BeautifulSoup
 from settings import base_urls
 import os
-import pdb
 
 MIRROR = 'archive/'
 
@@ -94,16 +93,16 @@ def harvest_pages(self, json_dictionary, json_list, first_run):
         """
         if json_dictionary['error_list'] is not None:
             for url in json_list[:]:
-                print('rel: ', url)
+                # print('rel: ', url)
                 if url in json_dictionary['error_list'] and first_run:
                     self.failed_pages.append(url)
-                    print('error: ', url)
+                    # print('error: ', url)
                 else:
                     try:
                         obj = Page(url)
                         self.pages.append(obj)
                     except FileNotFoundError:
-                        print("Failed harvest_pages ", url)
+                        # print("Failed harvest_pages ", url)
                         self.failed_pages.append(url)
                 json_list.remove(url)
 
@@ -115,7 +114,7 @@ def size_comparison(self):
         """
         for page in self.pages[:]:
             if not page.file_size > self.minimum_size:
-                print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
+                # print('Failed: size_comparison(): ', page, ' has size: ', page.file_size)
                 self.failed_pages.append(page.url)
                 self.pages.remove(page)
         return
@@ -210,7 +209,6 @@ def call_rescrape(verification_dictionary):
      """
     print("Called rescrape.")
     second_chance = Crawler()
-    pdb.set_trace()
     second_chance.scrape_pages(verification_dictionary['error_list'])
 
 

From 12fddc0386d2bd5df3d298be30b88962ad810ba0 Mon Sep 17 00:00:00 2001
From: Sadiyah Faruk <sf2ne@virginia.edu>
Date: Fri, 12 Aug 2016 15:03:16 -0400
Subject: [PATCH 7/7] Fix indentation errors in docstrings and resume_verification

---
 verifier.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/verifier.py b/verifier.py
index 122df09..871383f 100644
--- a/verifier.py
+++ b/verifier.py
@@ -65,7 +65,6 @@ class Verifier:
             3. Remaining urls run through size_comparison. Failed pages get sent to rescrape.
             4. Rescrape failed urls.
             5. Verify the pages that were just rescraped.
-
         """
 
     def __init__(self):
@@ -157,7 +156,7 @@ def verify_registrations(verification_dictionary, list_name, first_run):
         :param list_name: The list in the json file of found URLs.
         :param first_run: True if the first_run of verification has been completed. False, otherwise.
         :return: registrations_list_failed_verification: List of all the registration urls that need to be rescraped.
-     """
+    """
     # Must run all page types automatically
     registrations_verifier = Verifier()
     registrations_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
@@ -176,7 +175,7 @@ def verify_users(verification_dictionary, list_name, first_run):
         :param list_name: The list in the json file of found URLs.
         :param first_run: True if the first_run of verification has been completed. False, otherwise.
         :return: user_profiles_failed_verification: List of all the user urls that need to be rescraped.
-     """
+    """
     user_profiles_verifier = Verifier()
     user_profiles_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     user_profiles_failed_verification = user_profiles_verifier.failed_pages
@@ -193,7 +192,7 @@ def verify_institutions(verification_dictionary, list_name, first_run):
         :param list_name: The list in the json file of found URLs.
         :param first_run: True if the first_run of verification has been completed. False, otherwise.
         :return: institutions_dashboards_failed_verification: List of all the institution urls that need to be rescraped.
-     """
+    """
     institution_dashboards_verifier = Verifier()
     institution_dashboards_verifier.run_verifier(verification_dictionary, verification_dictionary[list_name], first_run)
     institution_dashboards_failed_verification = institution_dashboards_verifier.failed_pages
@@ -206,7 +205,7 @@ def call_rescrape(verification_dictionary):
         Creates an instance of the crawler and calls scrape_pages on all urls dumped into 'error_list' in the json file
 
         :param verification_dictionary: The dictionary created from the json file.
-     """
+    """
     print("Called rescrape.")
     second_chance = Crawler()
     second_chance.scrape_pages(verification_dictionary['error_list'])
@@ -221,7 +220,7 @@ def setup_verification(json_dictionary, first_run):
         :param first_run: True if the first_run of verification has been completed. False, otherwise.
         :return: failed_verification_list: List of all the urls that need to be rescraped.
 
-     """
+    """
     failed_verification_list = []
     print("Check verification")
     if json_dictionary['scrape_nodes']:
@@ -265,7 +264,7 @@ def run_verification(json_file, retry_number):
 
         :param json_file: Name of the json task file.
         :param retry_number: Number of what iteration of verification is being run.
-     """
+    """
     with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
         run_info = json.load(failure_file)
     with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
@@ -294,11 +293,11 @@ def resume_verification(json_file):
 
         :param json_file: The dictionary created from the json file.
      """
-        with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
-            run_copy = json.load(failure_file)
-        print("Resumed verification.")
-        run_copy['error_list'] = setup_verification(run_copy, False)
-        # truncates json and dumps new lists
-        with codecs.open(json_file, mode='w', encoding='utf-8') as file:
-            json.dump(run_copy, file, indent=4)
-        call_rescrape(run_copy)
+    with codecs.open(json_file, mode='r', encoding='utf-8') as failure_file:
+        run_copy = json.load(failure_file)
+    print("Resumed verification.")
+    run_copy['error_list'] = setup_verification(run_copy, False)
+    # truncates json and dumps new lists
+    with codecs.open(json_file, mode='w', encoding='utf-8') as file:
+        json.dump(run_copy, file, indent=4)
+    call_rescrape(run_copy)