diff --git a/lib/anemone/core.rb b/lib/anemone/core.rb
index d1629a49..33c844ec 100644
--- a/lib/anemone/core.rb
+++ b/lib/anemone/core.rb
@@ -55,7 +55,9 @@ class Core
       # proxy server port number
       :proxy_port => false,
       # HTTP read timeout in seconds
-      :read_timeout => nil
+      :read_timeout => nil,
+      # follow links to subdomains of the start URLs' domains
+      :crawl_subdomains => false,
     }
 
     # Create setter methods for all options to be called from the crawl block
@@ -72,6 +74,7 @@ class Core
     def initialize(urls, opts = {})
       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
       @urls.each{ |url| url.path = '/' if url.path.empty? }
+      @valid_domains = @urls.map{ |u| u.host }.compact.map{ |host| [host, host.sub(/\Awww\./, '.')] }.flatten.uniq
 
       @tentacles = []
       @on_every_page_blocks = []
@@ -256,7 +259,31 @@ def visit_link?(link, from_page = nil)
       !skip_link?(link) &&
       !skip_query_string?(link) &&
       allowed(link) &&
-      !too_deep?(from_page)
+      !too_deep?(from_page) &&
+      (in_allowed_domain?(link) || in_allowed_subdomain?(link))
+    end
+
+    #
+    # Returns +true+ if the host of *link* is exactly the host of
+    # one of the start URLs.
+    #
+    def in_allowed_domain?(link)
+      @valid_domains.include?(link.host)
+    end
+
+    #
+    # Returns +true+ if the :crawl_subdomains option is set and the host of
+    # *link* is a subdomain of one of the start URLs' domains.
+    #
+    def in_allowed_subdomain?(link)
+      return false unless opts[:crawl_subdomains] && link.host
+
+      @valid_domains.any? do |domain|
+        # Match on a label boundary so 'notexample.com' is not treated as a
+        # subdomain of 'example.com'.
+        suffix = domain.start_with?('.') ? domain : ".#{domain}"
+        link.host.end_with?(suffix)
+      end
     end
 
     #
diff --git a/lib/anemone/page.rb b/lib/anemone/page.rb
index b157ad63..3c914722 100644
--- a/lib/anemone/page.rb
+++ b/lib/anemone/page.rb
@@ -63,7 +63,7 @@ def links
         u = a['href']
         next if u.nil? or u.empty?
         abs = to_absolute(u) rescue next
-        @links << abs if in_domain?(abs)
+        @links << abs
       end
       @links.uniq!
       @links
diff --git a/spec/core_spec.rb b/spec/core_spec.rb
index 775c79f1..cc84ff84 100644
--- a/spec/core_spec.rb
+++ b/spec/core_spec.rb
@@ -42,6 +42,18 @@ module Anemone
       core.pages.keys.should_not include('http://www.other.com/')
     end
 
+    it "should follow links to subdomains" do
+      pages = []
+      pages << FakePage.new('0', :links => ['1'], :hrefs => [ 'http://www.other.com/', 'http://subdomain.example.com/'] )
+      pages << FakePage.new('1')
+
+      core = Anemone.crawl(pages[0].url, @opts.merge({:crawl_subdomains => true}))
+
+      core.should have(3).pages
+      core.pages.keys.should_not include('http://www.other.com/')
+      core.pages.keys.should include('http://subdomain.example.com/')
+    end
+
     it "should follow http redirects" do
       pages = []
       pages << FakePage.new('0', :links => ['1'])