Added --wait option #19

Open · wants to merge 2 commits into master
Changes from 1 commit
Adding wait option
Jim Priest committed Sep 22, 2016
commit dad32eb2eec5eac9a75d43d42ccc9b11a4d1c899
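In short, the commit threads a new --wait value from the command-line parser (models.py) through WorkerConfig into the crawl loop (crawler.py), where each worker sleeps for that many seconds before every request; a small test in tests.py checks the resulting slowdown. Most of the remaining churn in models.py is whitespace and line-wrapping cleanup.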
5 changes: 4 additions & 1 deletion pylinkvalidator/crawler.py
@@ -240,16 +240,17 @@ def __init__(self, worker_init):
self.urlopen = get_url_open()
self.request_class = get_url_request()
self.logger = worker_init.logger

if not self.logger:
# Get a new one!
self.logger = get_logger()

# We do this here to allow patching by gevent
import socket
self.timeout_exception = socket.timeout

self.auth_header = None


if self.worker_config.username and self.worker_config.password:
base64string = unicode(
base64.encodestring(
@@ -280,6 +281,7 @@ def _crawl_page(self, worker_input):
url_split_to_crawl = worker_input.url_split

try:
time.sleep(self.worker_config.wait)
response = open_url(
self.urlopen, self.request_class,
url_split_to_crawl.geturl(), self.worker_config.timeout,
@@ -375,6 +377,7 @@ def _crawl_page(self, worker_input):
site_origin=worker_input.site_origin,
missing_content=missing_content,
erroneous_content=erroneous_content)

except Exception as exc:
exception = ExceptionStr(unicode(type(exc)), unicode(exc))
page_crawl = PageCrawl(
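
The functional core of the crawler.py change is the single time.sleep(self.worker_config.wait) before each open_url call in _crawl_page. In isolation the pattern looks roughly like the sketch below (hypothetical fetch_all helper using plain urllib on Python 3; the real worker resolves urlopen via get_url_open(), reads WorkerInput items from a queue, and wraps failures in ExceptionStr):

    import time
    from urllib.request import urlopen


    def fetch_all(urls, wait=0, timeout=10):
        """Fetch each URL, sleeping `wait` seconds before every request."""
        results = {}
        for url in urls:
            time.sleep(wait)  # throttle: pause before each request
            try:
                with urlopen(url, timeout=timeout) as response:
                    results[url] = response.status
            except Exception as exc:  # keep going; record the error instead
                results[url] = repr(exc)
        return results
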
92 changes: 40 additions & 52 deletions pylinkvalidator/models.py
@@ -43,30 +43,25 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):

DEFAULT_TYPES = ['a', 'img', 'script', 'link']


TYPE_ATTRIBUTES = {
'a': 'href',
'img': 'src',
'script': 'src',
'link': 'href',
}


DEFAULT_TIMEOUT = 10


MODE_THREAD = "thread"
MODE_PROCESS = "process"
MODE_GREEN = "green"


DEFAULT_WORKERS = {
MODE_THREAD: 1,
MODE_PROCESS: 1,
MODE_GREEN: 1000,
}


PARSER_STDLIB = "html.parser"
PARSER_LXML = "lxml"
PARSER_HTML5 = "html5lib"
@@ -80,23 +75,20 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
FORMAT_HTML = "html"
FORMAT_JSON = "json"


WHEN_ALWAYS = "always"
WHEN_ON_ERROR = "error"


REPORT_TYPE_ERRORS = "errors"
REPORT_TYPE_SUMMARY = "summary"
REPORT_TYPE_ALL = "all"


VERBOSE_QUIET = "0"
VERBOSE_NORMAL = "1"
VERBOSE_INFO = "2"


HTML_MIME_TYPE = "text/html"

DEFAULT_WAIT = 0

PAGE_QUEUED = '__PAGE_QUEUED__'
PAGE_CRAWLED = '__PAGE_CRAWLED__'
@@ -108,48 +100,39 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
"WorkerInit",
["worker_config", "input_queue", "output_queue", "logger"])


WorkerConfig = namedtuple_with_defaults(
"WorkerConfig",
["username", "password", "types", "timeout", "parser", "strict_mode",
"prefer_server_encoding", "extra_headers"])

"prefer_server_encoding", "extra_headers", "wait"])

WorkerInput = namedtuple_with_defaults(
"WorkerInput",
["url_split", "should_crawl", "depth", "site_origin", "content_check"])


Response = namedtuple_with_defaults(
"Response", ["content", "status", "exception", "original_url",
"final_url", "is_redirect", "is_timeout", "response_time"])


ExceptionStr = namedtuple_with_defaults(
"ExceptionStr", ["type_name", "message"])


Link = namedtuple_with_defaults(
"Link",
["type", "url_split", "original_url_split", "source_str"])


PageCrawl = namedtuple_with_defaults(
"PageCrawl", ["original_url_split", "final_url_split",
"status", "is_timeout", "is_redirect", "links",
"exception", "is_html", "depth", "response_time",
"process_time", "site_origin", "missing_content",
"erroneous_content"])


PageStatus = namedtuple_with_defaults(
"PageStatus", ["status", "sources"])


PageSource = namedtuple_with_defaults(
"PageSource", ["origin", "origin_str"])


ContentCheck = namedtuple_with_defaults(
"ContentCheck",
["html_presence", "html_absence", "text_presence", "text_absence",
@@ -162,6 +145,7 @@ def namedtuple_with_defaults(typename, field_names, default_values=[]):
class UTF8Class(object):
"""Handles unicode string from __unicode__() in: __str__() and __repr__()
"""

def __str__(self):
return get_safe_str(self.__unicode__())

@@ -202,8 +186,8 @@ def __init__(self):

def should_crawl(self, url_split, depth):
"""Returns True if url split is local AND depth is acceptable"""
return (self.options.depth < 0 or depth < self.options.depth) and\
self.is_local(url_split)
return (self.options.depth < 0 or depth < self.options.depth) and \
self.is_local(url_split)

def is_local(self, url_split, site_origin=None):
"""Returns true if url split is in the accepted hosts. site_origin must
@@ -307,7 +291,7 @@ def _build_worker_config(self, options):
return WorkerConfig(
options.username, options.password, types, options.timeout,
options.parser, options.strict_mode,
options.prefer_server_encoding, headers)
options.prefer_server_encoding, headers, options.wait)

def _build_accepted_hosts(self, options, start_urls):
if options.multi:
@@ -427,7 +411,7 @@ def _get_prefix_content(self, content, prefix=None):
if not prefix:
index = content.find(",")
prefix = get_clean_url_split(content[:index])
content = content[index+1:]
content = content[index + 1:]

return (prefix, content)

@@ -454,14 +438,14 @@ def _build_parser(self):
help="fetch resources from other domains without crawling them")
crawler_group.add_option(
"-H", "--accepted-hosts",
dest="accepted_hosts", action="store", default=None,
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
"example.com,subdomain.another.com)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,
help="comma-separated list of host/path prefixes to ignore "
"(e.g., www.example.com/ignore_this_and_after/)")
"(e.g., www.example.com/ignore_this_and_after/)")
crawler_group.add_option(
"-u", "--username", dest="username",
action="store", default=None,
@@ -476,9 +460,9 @@ def _build_parser(self):
help="each argument is considered to be a different site")
crawler_group.add_option(
"-D", "--header",
dest="headers", action="append", metavar="HEADER",
dest="headers", action="append", metavar="HEADER",
help="custom header of the form Header: Value "
"(repeat for multiple headers)")
"(repeat for multiple headers)")
crawler_group.add_option(
"--url-file-path", dest="url_file_path",
action="store", default=None,
@@ -489,7 +473,7 @@ def _build_parser(self):
"-t", "--types", dest="types", action="store",
default=",".join(DEFAULT_TYPES),
help="Comma-separated values of tags to look for when crawling"
"a site. Default (and supported types): a,img,link,script")
"a site. Default (and supported types): a,img,link,script")
crawler_group.add_option(
"-T", "--timeout", dest="timeout",
type="int", action="store", default=DEFAULT_TIMEOUT,
@@ -518,30 +502,30 @@ def _build_parser(self):
"--check-presence", dest="content_presence",
action="append",
help="Check presence of raw or HTML content on all pages. e.g., "
"<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content")
"<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content")
crawler_group.add_option(
"--check-absence", dest="content_absence",
action="append",
help="Check absence of raw or HTML content on all pages. e.g., "
"<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content")
"<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content")
crawler_group.add_option(
"--check-presence-once", dest="content_presence_once",
action="append",
help="Check presence of raw or HTML content for one page: "
"path,content, e.g.,: "
"/path,<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content. "
"Path can be either relative or absolute with domain.")
"path,content, e.g.,: "
"/path,<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content. "
"Path can be either relative or absolute with domain.")
crawler_group.add_option(
"--check-absence-once", dest="content_absence_once",
action="append",
help="Check absence of raw or HTML content for one page: "
"path,content, e.g.,"
"path,<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content. "
"Path can be either relative or absolute with domain.")
"path,content, e.g.,"
"path,<tag attr1=\"val\">regex:content</tag>. "
"Content can be either regex:pattern or plain content. "
"Path can be either relative or absolute with domain.")

# TODO Add follow redirect option.

@@ -565,6 +549,10 @@ def _build_parser(self):
help="Types of HTML parse: html.parser (default), lxml, html5lib",
default=PARSER_STDLIB, choices=[PARSER_STDLIB, PARSER_LXML,
PARSER_HTML5])
perf_group.add_option(
"--wait", dest="wait", type="int", action="store",
default=DEFAULT_WAIT,
help="Number of seconds to wait between worker requests: 0 (default). "
"Combine with --workers to control concurrency.")

parser.add_option_group(perf_group)
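
As the help text hints, the sleep is applied per worker, so --wait caps each worker's request rate and the aggregate rate grows with --workers. A rough, illustrative bound (ignoring response time):

    def max_requests_per_second(workers, wait):
        """Upper bound on the aggregate request rate when every worker
        sleeps `wait` seconds before each request."""
        if wait <= 0:
            return float("inf")  # --wait 0 (the default) means no throttling
        return workers / float(wait)

    print(max_requests_per_second(4, 2))  # 4 workers, --wait 2 -> 2.0 req/s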

@@ -584,18 +572,18 @@ def _build_parser(self):
"-W", "--when", dest="when", action="store",
default=WHEN_ALWAYS, choices=[WHEN_ALWAYS, WHEN_ON_ERROR],
help="When to print the report. error (only if a "
"crawling error occurs) or always (default)")
"crawling error occurs) or always (default)")
output_group.add_option(
"-E", "--report-type", dest="report_type",
help="Type of report to print: errors (default, summary and "
"erroneous links), summary, all (summary and all links)",
"erroneous links), summary, all (summary and all links)",
action="store", default=REPORT_TYPE_ERRORS,
choices=[REPORT_TYPE_ERRORS, REPORT_TYPE_SUMMARY, REPORT_TYPE_ALL])
output_group.add_option(
"-c", "--console", dest="console",
action="store_true", default=False,
help="Prints report to the console in addition to other output"
" options such as file or email.")
" options such as file or email.")
crawler_group.add_option(
"-S", "--show-source", dest="show_source",
action="store_true", default=False,
@@ -611,12 +599,12 @@ def _build_parser(self):
"-a", "--address", dest="address", action="store",
default=None,
help="Comma-separated list of email addresses used to send a "
"report")
"report")
email_group.add_option(
"--from", dest="from_address", action="store",
default=None,
help="Email address to use in the from field of the email "
"(optional)")
"(optional)")
email_group.add_option(
"-s", "--smtp", dest="smtp", action="store",
default=None,
@@ -673,8 +661,8 @@ def __init__(self, url_split, status=200, is_timeout=False, exception=None,
self.exception = exception
self.is_html = is_html
self.is_local = is_local
self.is_ok = status and status < 400 and not missing_content and\
not erroneous_content
self.is_ok = status and status < 400 and not missing_content and \
not erroneous_content
self.response_time = response_time
self.process_time = process_time
self.site_origin = site_origin
@@ -723,10 +711,10 @@ def get_content_messages(self):
"""Gets missing and erroneous content
"""
messages = [
"missing content: {0}".format(content) for content in
self.missing_content] + [
"erroneous content: {0}".format(content) for content in
self.erroneous_content]
"missing content: {0}".format(content) for content in
self.missing_content] + [
"erroneous content: {0}".format(content) for content in
self.erroneous_content]

return messages

9 changes: 9 additions & 0 deletions pylinkvalidator/tests.py
@@ -432,6 +432,15 @@ def test_depth_0(self):
self.assertEqual(7, len(site.pages))
self.assertEqual(1, len(site.error_pages))

def test_wait_1(self):
start_time = time.time()
site = self._run_crawler_plain(
ThreadSiteCrawler, ["--wait", "1"], "/depth/root.html")
end_time = time.time()
crawl_time = end_time - start_time
# With the default single thread worker, each page fetch is preceded
# by a 1 second sleep, so the crawl time in seconds should be at
# least the number of pages crawled.
self.assertTrue(crawl_time >= len(site.pages))

def test_strict_mode(self):
site = self._run_crawler_plain(ThreadSiteCrawler, ["--strict"])