From a77969bc16f928d3e28fb77afe4490778fb1850b Mon Sep 17 00:00:00 2001
From: Hartator <hartator@gmail.com>
Date: Tue, 28 Feb 2012 14:36:31 +0100
Subject: [PATCH] Added only_links_like Feature

---
 lib/anemone/core.rb | 22 ++++++++++++++++++++--
 spec/core_spec.rb   | 16 ++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index 208c6d4c..1a757f79 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -77,6 +77,7 @@ def initialize(urls, opts = {})
       @on_every_page_blocks = []
       @on_pages_like_blocks = Hash.new { |hash,key| hash[key] = [] }
       @skip_link_patterns = []
+      @only_link_patterns = []
       @after_crawl_blocks = []
       @opts = opts
 
@@ -111,6 +112,15 @@ def skip_links_like(*patterns)
       self
     end
 
+    #
+    # Add one or more Regex patterns; only URLs matching at least one
+    # of them will be followed
+    #
+    def only_links_like(*patterns)
+      @only_link_patterns.concat(patterns.flatten.compact)
+      self
+    end
+    
     #
     # Add a block to be executed on every Page as they are encountered
     # during the crawl
@@ -292,10 +302,18 @@ def skip_query_string?(link)
 
     #
     # Returns +true+ if *link* should not be visited because
-    # its URL matches a skip_link pattern.
+    # its URL matches a skip_link pattern or does not match any only_link pattern.
     #
     def skip_link?(link)
-      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
+      # When only_link patterns are registered, a link has to match at
+      # least one of them; anything else is skipped outright.
+      unless @only_link_patterns.empty?
+        allowed = @only_link_patterns.any? { |pattern| link.path =~ pattern }
+        return true unless allowed
+      end
+      # Skip patterns still apply to links that passed the only_link
+      # filter, so both kinds of pattern can be combined.
+      @skip_link_patterns.any? { |pattern| link.path =~ pattern }
     end
 
   end
diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 775c79f1..2d93e1c9 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -109,6 +109,22 @@ module Anemone
         core.pages.keys.should_not include(pages[1].url)
         core.pages.keys.should_not include(pages[3].url)
       end
+      
+      it "should be able to follow only links based on a RegEx" do
+        pages = []
+        pages << FakePage.new('0', :links => ['1', '2'])
+        pages << FakePage.new('1')
+        pages << FakePage.new('2')
+        pages << FakePage.new('3') # not listed in any :links array above
+
+        core = Anemone.crawl(pages[0].url, @opts) do |a|
+          a.only_links_like /1/, /3/ # NOTE(review): /3/ looks unexercised, nothing links to '3' - confirm
+        end
+
+        core.should have(2).pages # presumably the start page plus page '1' - TODO confirm
+        core.pages.keys.should include(pages[1].url)
+        core.pages.keys.should_not include(pages[3].url) # NOTE(review): page '3' is never linked, so this likely passes regardless
+      end
 
       it "should be able to call a block on every page" do
         pages = []