
Commit

Subdomain support (サブドメイン対応)
takundao71 committed Feb 20, 2019
1 parent 72b699e commit 90a0cca
Showing 3 changed files with 27 additions and 3 deletions.
16 changes: 14 additions & 2 deletions lib/anemone/core.rb
@@ -55,7 +55,9 @@ class Core
# proxy server port number
:proxy_port => false,
# HTTP read timeout in seconds
-   :read_timeout => nil
+   :read_timeout => nil,
+   # Crawl subdomains?
+   :crawl_subdomains => false,
}

# Create setter methods for all options to be called from the crawl block
@@ -72,6 +74,7 @@ class Core
def initialize(urls, opts = {})
@urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
@urls.each{ |url| url.path = '/' if url.path.empty? }
+   @valid_domains = @urls.map{|u| [u.host,u.host.gsub(/^www\./,'.')]}.flatten.compact.uniq

@tentacles = []
@on_every_page_blocks = []
@@ -256,7 +259,16 @@ def visit_link?(link, from_page = nil)
!skip_link?(link) &&
!skip_query_string?(link) &&
allowed(link) &&
-   !too_deep?(from_page)
+   !too_deep?(from_page) &&
+   (in_allowed_domain?(link) or in_allowed_subdomain?(link))
end
+
+ def in_allowed_domain?(link)
+   @valid_domains.index(link.host)
+ end
+
+ def in_allowed_subdomain?(link)
+   opts[:crawl_subdomains] and @valid_domains.find{|domain| link.host.end_with?(domain)}
+ end

#
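For orientation, a minimal sketch of the new matching rule using hypothetical hostnames (not part of the commit): @valid_domains stores each seed host plus a variant whose leading www. is rewritten to a bare dot, so a seed of http://www.example.com/ yields ["www.example.com", ".example.com"].

    # in_allowed_domain? is an exact host lookup (Array#index is truthy on a hit)
    ["www.example.com", ".example.com"].index("www.example.com")   # => 0
    ["www.example.com", ".example.com"].index("blog.example.com")  # => nil

    # in_allowed_subdomain? additionally requires :crawl_subdomains to be set
    # and accepts any host that ends with one of the stored suffixes
    "blog.example.com".end_with?(".example.com")   # => true
    "www.other.com".end_with?(".example.com")      # => false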
2 changes: 1 addition & 1 deletion lib/anemone/page.rb
@@ -63,7 +63,7 @@ def links
u = a['href']
next if u.nil? or u.empty?
abs = to_absolute(u) rescue next
-       @links << abs if in_domain?(abs)
+       @links << abs
end
@links.uniq!
@links
12 changes: 12 additions & 0 deletions spec/core_spec.rb
@@ -42,6 +42,18 @@ module Anemone
core.pages.keys.should_not include('http://www.other.com/')
end

it "should follow links to subdomains" do
pages = []
pages << FakePage.new('0', :links => ['1'], :hrefs => [ 'http://www.other.com/', 'http://subdomain.example.com/'] )
pages << FakePage.new('1')

core = Anemone.crawl(pages[0].url, @opts.merge({:crawl_subdomains => true}))

core.should have(3).pages
core.pages.keys.should_not include('http://www.other.com/')
core.pages.keys.should include('http://subdomain.example.com/')
end

it "should follow http redirects" do
pages = []
pages << FakePage.new('0', :links => ['1'])
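A hedged usage sketch of the new option (the URL and block below are illustrative, not from the commit): subdomain crawling stays off by default and can be enabled either in the options hash or through the setter generated for every DEFAULT_OPTS key inside the crawl block.

    require 'anemone'

    # Enable via the options hash...
    Anemone.crawl("http://www.example.com/", :crawl_subdomains => true) do |anemone|
      anemone.on_every_page { |page| puts page.url }
    end

    # ...or via the generated option setter inside the crawl block
    Anemone.crawl("http://www.example.com/") do |anemone|
      anemone.crawl_subdomains = true
      anemone.on_every_page { |page| puts page.url }
    end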

1 comment on commit 90a0cca

@takundao71 (Author)
