Skip to content

Commit

Permalink
Add excluded urls option
Browse files Browse the repository at this point in the history
* refactor from pull request comments to compile regex
* add exclude test
  • Loading branch information
Jim Priest committed Sep 2, 2015
1 parent 8838f8c commit 86a8679
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 2 deletions.
5 changes: 3 additions & 2 deletions pylinkvalidator/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def should_download(self, url_split):
url = url_split.geturl()

for exclude_url in self.excluded_urls:
- if re.search(exclude_url, url):
+ if exclude_url.search(url):
return False

for ignored_prefix in self.ignored_prefixes:
Expand Down Expand Up @@ -215,7 +215,8 @@ def _parse_config(self):
self.ignored_prefixes = self.options.ignored_prefixes.split(',')

if self.options.excluded_urls:
- self.excluded_urls = self.options.excluded_urls.split(',')
+ self.excluded_urls = [re.compile(pattern) for pattern in self.options.excluded_urls.split(',')]


if self.options.workers:
self.worker_size = self.options.workers
Expand Down
7 changes: 7 additions & 0 deletions pylinkvalidator/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,13 @@ def test_run_once(self):
self.assertEqual(8, len(site.pages))
self.assertEqual(0, len(site.error_pages))

def test_exclude(self):
site = self._run_crawler_plain(ThreadSiteCrawler, ["--exclude=/sub/"])

# exclude /sub/ directory = 4 pages linked on the index
self.assertEqual(4, len(site.pages))
self.assertEqual(0, len(site.error_pages))

def test_depth_0(self):
site = self._run_crawler_plain(
ThreadSiteCrawler, ["--depth", "0"], "/depth/root.html")
Expand Down

3 comments on commit 86a8679

@jimpriest
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the feedback! I've updated the code and added a simple test.

If this looks OK I'll add the other option next.

@bartdag
Copy link
Owner

@bartdag bartdag commented on 86a8679 Sep 2, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That looks good! Thanks!

@jimpriest
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the re.compile() tip. Reading about that now - very interesting!

Please sign in to comment.