From a77969bc16f928d3e28fb77afe4490778fb1850b Mon Sep 17 00:00:00 2001
From: Hartator
Date: Tue, 28 Feb 2012 14:36:31 +0100
Subject: [PATCH] Added only_links_like Feature

---
 lib/anemone/core.rb | 20 ++++++++++++++++++--
 spec/core_spec.rb   | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index 208c6d4c..1a757f79 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -77,6 +77,7 @@ def initialize(urls, opts = {})
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
+      @only_link_patterns = []
       @after_crawl_blocks = []
       @opts = opts
 
@@ -111,6 +112,16 @@ def skip_links_like(*patterns)
       self
     end
 
+    #
+    # Add one or more Regex patterns for URLs which should be the only
+    # ones followed; once any pattern is added, links whose path matches
+    # none of them are skipped
+    #
+    def only_links_like(*patterns)
+      @only_link_patterns.concat [patterns].flatten.compact
+      self
+    end
+
     #
     # Add a block to be executed on every Page as they are encountered
     # during the crawl
@@ -292,10 +303,15 @@ def skip_query_string?(link)
 
     #
     # Returns +true+ if *link* should not be visited because
-    # its URL matches a skip_link pattern.
+    # its URL matches a skip_link pattern, or because only_link patterns
+    # were given and its URL matches none of them.
     #
     def skip_link?(link)
-      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+      # A skip pattern always wins, even if an only_link pattern also matches.
+      return true if @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+      return false if @only_link_patterns.empty?
+
+      @only_link_patterns.none? { |pattern| link.path =~ pattern }
     end
 
   end
diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 775c79f1..2d93e1c9 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -109,6 +109,39 @@ module Anemone
         core.pages.keys.should_not include(pages[1].url)
         core.pages.keys.should_not include(pages[3].url)
       end
+
+      it "should be able to follow only links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.only_links_like /1/, /3/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should include(pages[1].url)
+        core.pages.keys.should_not include(pages[2].url)
+        core.pages.keys.should_not include(pages[3].url)
+      end
+
+      it "should skip links matching a skip RegEx even if they match an only RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '12'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('12')
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.only_links_like /1/
+          a.skip_links_like /2/
+        end
+
+        core.should have(2).pages
+        core.pages.keys.should include(pages[1].url)
+        core.pages.keys.should_not include(pages[2].url)
+      end
 
       it "should be able to call a block on every page" do
         pages = []