Add excluded_urls option
URLs matching any of the supplied regular expressions will be ignored
Jim Priest committed Sep 1, 2015
1 parent 3012073 commit 8838f8c
Showing 2 changed files with 21 additions and 3 deletions.
3 changes: 3 additions & 0 deletions README.rst
@@ -102,6 +102,9 @@ usage examples.
-H ACCEPTED_HOSTS, --accepted-hosts=ACCEPTED_HOSTS
Comma-separated list of additional hosts to crawl
(e.g., example.com,subdomain.another.com)
+-x EXCLUDED_URLS, --exclude=EXCLUDED_URLS
+Comma-separated list of regular expressions; URLs matching
+any of them will be ignored (e.g., /private/)
-i IGNORED_PREFIXES, --ignore=IGNORED_PREFIXES
Comma-separated list of host/path prefixes to ignore
(e.g., www.example.com/ignore_this_and_after/)
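To make the new flag concrete, a hypothetical invocation (the pylinkvalidate.py entry point name, the pattern list, and the target URL are illustrative assumptions; per the parsing code below, the option value is split on commas, so several patterns can be passed at once):

    pylinkvalidate.py -x '/private/,\.pdf$' http://www.example.com/

Any crawled URL containing /private/ or ending in .pdf would then be skipped.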
21 changes: 18 additions & 3 deletions pylinkvalidator/models.py
@@ -12,7 +12,7 @@
from optparse import OptionParser, OptionGroup

from pylinkvalidator.compat import get_safe_str
-from pylinkvalidator.urlutil import get_clean_url_split
+from pylinkvalidator.urlutil import get_clean_url_split, re


DEFAULT_TYPES = ['a', 'img', 'script', 'link']
@@ -148,6 +148,7 @@ def __init__(self):
self.worker_config = None
self.accepted_hosts = []
self.ignored_prefixes = []
+self.excluded_urls = []
self.worker_size = 0

def should_crawl(self, url_split, depth):
@@ -160,15 +161,21 @@ def is_local(self, url_split):
return url_split.netloc in self.accepted_hosts

def should_download(self, url_split):
"""Returns True if the url does not start with an ignored prefix and if
it is local or outside links are allowed."""
"""Returns True if the url does not start with
* an ignored prefix
* it does not match excluded url regex
* if it is local or outside links are allowed."""
local = self.is_local(url_split)

if not self.options.test_outside and not local:
return False

url = url_split.geturl()

+for exclude_url in self.excluded_urls:
+if re.search(exclude_url, url):
+return False
+
for ignored_prefix in self.ignored_prefixes:
if url.startswith(ignored_prefix):
return False
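A standalone sketch of the matching rule introduced above: re.search matches a pattern anywhere in the URL string, so a pattern like /private/ excludes any URL that contains that segment (the patterns and URLs below are purely illustrative):

    import re

    excluded_urls = ['/private/', r'\.pdf$']  # illustrative patterns
    urls = [
        'http://example.com/private/page.html',  # matches /private/
        'http://example.com/docs/report.pdf',    # matches \.pdf$
        'http://example.com/public/page.html',   # matches neither
    ]
    for url in urls:
        excluded = any(re.search(pattern, url) for pattern in excluded_urls)
        print(url, '-> excluded' if excluded else '-> downloaded')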
@@ -207,6 +214,9 @@ def _parse_config(self):
if self.options.ignored_prefixes:
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

+if self.options.excluded_urls:
+self.excluded_urls = self.options.excluded_urls.split(',')
+
if self.options.workers:
self.worker_size = self.options.workers
else:
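One side effect of the comma-based parsing above (an observation about this code, not documented behavior): a regular expression that itself contains a comma, such as a{2,4}, cannot be passed intact, since split(',') breaks it apart:

    >>> 'a{2,4},/private/'.split(',')
    ['a{2', '4}', '/private/']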
@@ -274,6 +284,11 @@ def _build_parser(self):
dest="accepted_hosts", action="store", default=None,
help="comma-separated list of additional hosts to crawl (e.g., "
"example.com,subdomain.another.com)")
+crawler_group.add_option(
+"-x", "--exclude", dest="excluded_urls",
+action="store", default=None,
+help="comma-separated list of regular expressions; URLs matching "
+"any of them will be ignored (e.g., /private/)")
crawler_group.add_option(
"-i", "--ignore", dest="ignored_prefixes",
action="store", default=None,
