Skip to content

Commit 5202af5

Browse files
committed
Added a separate import:news task (issue #2).
* Extracted common logic into methods. * Enqueue the RSS feeds and then spider the news directories for a more complete news import. * Scrape the archive pages for the author information for each post.
1 parent 52dd170 commit 5202af5

File tree

1 file changed

+188
-80
lines changed

1 file changed

+188
-80
lines changed

_tasks/import.rb

Lines changed: 188 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,113 +1,221 @@
1+
# encoding: utf-8
12
require 'fileutils'
3+
require 'open-uri'
4+
require 'rss'
25

36
gem 'spidr', '~> 0.4'
47
require 'spidr'
58

69
gem 'kramdown', '~> 0.13'
710
require 'kramdown'
811

12+
HOST = 'www.ruby-lang.org'
913
OUTPUT_DIR = '_import'
10-
LAYOUTS = {
11-
:default => 'page',
12-
:post => 'news_post'
13-
}
1414

15-
desc 'Spiders ruby-lang.org and converts HTML to Markdown'
16-
task :import do
17-
Spidr.site('http://www.ruby-lang.org/index.html') do |agent|
18-
agent.ignore_links_like /\/cgi-bin\//
19-
agent.ignore_links_like /\.cgi[\/]?$/
20-
agent.ignore_links_like /\/[a-z_]+\/old-man\//
15+
# Maps the URL of a spidered page to the local path it is saved at,
# rooted at OUTPUT_DIR.
#
# * "/dir/"      -> "OUTPUT_DIR/dir/index.md"
# * "/page.html" -> "OUTPUT_DIR/page.md"
# * "/page"      -> "OUTPUT_DIR/page/index.md"
# * any other extension (images, stylesheets, ...) is kept unchanged.
#
# @param url [#path] the page URL; anything responding to #path works
#   (e.g. a URI::HTTP).
# @return [String] the local path for the page.
def url_to_path(url)
  # Strip the leading '/'. An empty path (bare host URL) is treated like
  # the site root instead of raising inside File.join.
  relative_path = url.path.to_s.sub(%r{\A/}, '')
  local_path = File.join(OUTPUT_DIR, relative_path)

  if local_path.end_with?('/')
    # Directory URL. Checked before the extension so that a dotted last
    # segment such as "/ruby-1.9/" is not mistaken for a file name that
    # carries an extension.
    local_path << 'index.md'
  else
    case File.extname(local_path)
    when '.html' then local_path.sub!(/\.html\z/, '.md')
    when ''      then local_path << '/index.md'
    end
  end

  local_path
end
28+
29+
# Cleans up the content element of a scraped page and converts its inner
# HTML to Markdown using Kramdown.
#
# The element is modified in place before the conversion: comments and
# anchors carrying an id are removed, span.caps elements and the code
# inside pre tags are replaced by plain text, and the #extended wrapper is
# replaced by its own contents.
#
# @param content_div [Object, nil] the content element, presumably the
#   Nokogiri node returned by page.at (NOTE(review): confirm against Spidr).
#   nil is accepted so callers may pass the result of a failed lookup.
# @return [String] the Markdown text; an empty String when content_div is nil.
def html_to_markdown(content_div)
  # a page without a content element has nothing to convert
  return '' if content_div.nil?

  # remove all comments
  content_div.traverse do |node|
    node.remove if node.comment?
  end

  # remove all page anchors
  content_div.search('//a[@id]').remove

  # replace all caps spans with their text
  content_div.search('span.caps').each do |span|
    span.replace(span.inner_text)
  end

  # remove the 'class' attribute from all pre tags
  content_div.search('pre').remove_attr('class')

  # map all code elements to their inner_text, turning <br> into newlines
  content_div.search('pre > code').each do |code|
    code_text = code.children.map { |node|
      node.name == 'br' ? $/ : node.inner_text
    }.join

    code.replace(code_text)
  end

  # replace the #extended div with its children
  if (extended_div = content_div.at('#extended'))
    extended_div.replace(extended_div.inner_html)
  end

  # convert from HTML to Markdown
  Kramdown::Document.new(
    content_div.inner_html,
    :input => :html
  ).to_kramdown
end
68+
69+
namespace :import do
70+
desc 'Spiders ruby-lang.org and converts HTML to Markdown'
# Spiders the site starting at the index page. Every successfully fetched
# page is written below OUTPUT_DIR (see url_to_path): HTML pages become
# Markdown with YAML front matter, all other pages are saved verbatim.
# Files that already exist are never overwritten.
task :pages do
  # HOST already carries the "www." prefix, so it must not be added again.
  Spidr.site("http://#{HOST}/index.html") do |agent|
    # CGI scripts, feeds, news and the old manual are excluded from this task
    agent.ignore_links_like(/\/cgi-bin\//)
    agent.ignore_links_like(/\.cgi[\/]?$/)
    agent.ignore_links_like(/\/[a-z_]+\/feeds\//)
    agent.ignore_links_like(/\/[a-z_]+\/news\//)
    agent.ignore_links_like(/\/[a-z_]+\/old-man\//)

    agent.every_ok_page do |page|
      local_path = url_to_path(page.url)

      # ensure the parent directory exists
      mkdir_p File.dirname(local_path)

      # don't overwrite existing files
      next if File.exist?(local_path)

      puts "Importing #{page.url} -> #{local_path} ..."

      File.open(local_path,'w') do |file|
        if page.html?
          # a page without a <title> gets an empty title instead of raising
          title = page.title.to_s.strip
          # the language is the first segment of the URL path
          lang = page.url.path.to_s.sub(%r{\A/}, '').split('/',2).first

          # add the YAML front matter
          file.puts(
            '---',
            "layout: default",
            "title: #{title.inspect}",
            "lang: #{lang}",
            '---',
            ''
          )

          if (content_div = page.at('#content'))
            file.puts(html_to_markdown(content_div))
          end
        else
          file.write(page.body)
        end
      end
    end
  end
end
45115

46-
# ensure the parent directory exists
47-
FileUtils.mkdir_p(File.dirname(local_path))
116+
desc "Imports news posts from the RSS feed"
117+
task :news do
118+
languages = %w[bg de en es fr id it ja ko pl pt tr zh_TW zh_cn]
119+
by_lines = {
120+
'bg' => /Публикувана от (.+) на/,
121+
'de' => /Geschrieben von (.+) am/,
122+
'en' => /Posted by (.+) on/,
123+
'es' => /Publicado por (.+) Caro el/,
124+
'fr' => /par (.+)/,
125+
'id' => /Ditulis oleh (.+) tanggal/,
126+
'it' => /Inserito da (.+) il/,
127+
'ja' => /Posted by (.+) on/,
128+
'ko' => /작성자 (.+) \(/,
129+
'pl' => /Zamieszczone przez (.+) \d+/,
130+
'pt' => /Escrito por (.+) em/,
131+
'tr' => /Posted by (.+) on/,
132+
'zh_TW' => /Posted by (.+) on/,
133+
'zh_cn' => /由 (.+) 发表于/
134+
}
135+
136+
Spidr.host(HOST) do |agent|
137+
languages.each do |lang|
138+
feed, news_dir = case lang
139+
when 'pt' then ['noticias', 'noticias-recentes']
140+
else ['news', 'news']
141+
end
142+
143+
agent.visit_urls_like do |url|
144+
url.path.start_with?("/#{lang}/#{news_dir}/")
145+
end
48146

49-
# don't overwrite existing files
50-
unless File.exist?(local_path)
51-
puts "Saving #{page.url} -> #{local_path} ..."
147+
agent.enqueue("http://#{HOST}/#{lang}/#{news_dir}/")
52148

53-
File.open(local_path,'w') do |file|
54-
if page.html?
55-
title = page.title.strip
56-
lang = path.split('/',2).first
149+
begin
150+
rss = RSS::Parser.parse(open("http://#{HOST}/#{lang}/feeds/#{feed}.rss"))
151+
rss.items.each do |item|
152+
puts "Queuing #{item.link} ..."
153+
agent.enqueue(item.link)
154+
end
155+
rescue OpenURI::HTTPError
156+
end
157+
end
57158

159+
agent.every_ok_page do |page|
160+
lang, news_dir, year, month, day, slug = page.url.path[1..-2].split('/')
161+
title = page.title.strip
162+
163+
if page.url.path =~ /^\/#{lang}\/#{news_dir}\/\d{4}\/\d{2}\/\d{2}\//
164+
# news post
165+
local_path = File.join(OUTPUT_DIR,lang,news_dir,'_posts',"#{year}-#{month}-#{day}-#{slug}.md")
166+
layout = 'news_post'
167+
author = nil
168+
169+
archive_url = URI("http://#{HOST}/#{lang}/#{news_dir}/#{year}/#{month}/")
170+
begin
171+
agent.get_page(archive_url) do |archive|
172+
if archive.is_ok?
173+
if (post_div = archive.at("//div[@class='post']/h3/a[@href=#{page.url.path.dump}]/../.."))
174+
post_info = post_div.at("//p[@class='post-info']").inner_text
175+
176+
author = if (match = post_info.match(by_lines[lang]))
177+
match[1]
178+
else
179+
''
180+
end
181+
end
182+
end
183+
end
184+
rescue Net::HTTPNotFound
185+
end
186+
else
187+
# archive page
188+
local_path = url_to_path(page.url)
189+
layout = 'default'
190+
end
191+
192+
# ensure the parent directory exists
193+
FileUtils.mkdir_p File.dirname(local_path)
194+
195+
unless File.exists?(local_path)
196+
puts "Importing #{page.url} -> #{local_path} ..."
197+
198+
File.open(local_path,'w') do |file|
58199
# add the YAML front matter
59200
file.puts(
60201
'---',
61-
"layout: #{LAYOUTS[layout]}",
62-
"title: #{title.inspect}",
202+
"layout: #{layout}",
203+
"title: #{title.inspect}"
204+
)
205+
206+
if author
207+
file.puts "author: #{author.inspect}"
208+
end
209+
210+
file.puts(
63211
"lang: #{lang}",
64212
'---',
65213
''
66214
)
67215

68-
if (content_div = page.at('#content'))
69-
# remove all comments
70-
content_div.traverse do |node|
71-
node.remove if node.comment?
72-
end
73-
74-
# remove all page anchors
75-
content_div.search('//a[@id]').remove
216+
content_div = page.at('div.post') || page.at('#content')
76217

77-
# replace all caps spans with their text
78-
content_div.search('span.caps').each do |span|
79-
span.replace(span.inner_text)
80-
end
81-
82-
# remove the 'class' attribute from all pre tags
83-
content_div.search('pre').remove_attr('class')
84-
85-
# map all code elements to their inner_text
86-
content_div.search('pre > code').each do |code|
87-
code.replace(code.children.map { |node|
88-
if node.name == 'br'
89-
$/
90-
else
91-
node.inner_text
92-
end
93-
}.join)
94-
end
95-
96-
# replace the #extended div with it's children
97-
if (extended_div = content_div.at('#extended'))
98-
extended_div.replace(extended_div.inner_html)
99-
end
100-
101-
# convert from HTML to Markdown
102-
content = Kramdown::Document.new(
103-
content_div.inner_html,
104-
:input => :html
105-
).to_kramdown
106-
107-
file.puts(content)
108-
end
109-
else
110-
file.write(page.body)
218+
file.puts(html_to_markdown(content_div))
111219
end
112220
end
113221
end

0 commit comments

Comments
 (0)