+# encoding: utf-8
 require 'fileutils'
+require 'open-uri'
+require 'rss'
 
 gem 'spidr', '~> 0.4'
 require 'spidr'
 
 gem 'kramdown', '~> 0.13'
 require 'kramdown'
 
+HOST = 'www.ruby-lang.org'
 OUTPUT_DIR = '_import'
-LAYOUTS = {
-  :default => 'page',
-  :post => 'news_post'
-}
 
-desc 'Spiders ruby-lang.org and converts HTML to Markdown'
-task :import do
-  Spidr.site('http://www.ruby-lang.org/index.html') do |agent|
-    agent.ignore_links_like /\/cgi-bin\//
-    agent.ignore_links_like /\.cgi[\/]?$/
-    agent.ignore_links_like /\/[a-z_]+\/old-man\//
+def url_to_path(url)
+  local_path = File.join(OUTPUT_DIR,url.path[1..-1])
 
-    agent.every_ok_page do |page|
-      path = page.url.path[1..-1]
+  case File.extname(local_path)
+  when '.html'
+    local_path.chomp!('.html') << '.md'
+  when ''
+    local_path << '/' unless local_path.end_with?('/')
+    local_path << 'index.md'
+  end
+
+  return local_path
+end
+
+def html_to_markdown(content_div)
+  # remove all comments
+  content_div.traverse do |node|
+    node.remove if node.comment?
+  end
+
+  # remove all page anchors
+  content_div.search('//a[@id]').remove
 
-      layout = :default
+  # replace all caps spans with their text
+  content_div.search('span.caps').each do |span|
+    span.replace(span.inner_text)
+  end
 
-      if path =~ %r{^[a-z_-]+/news/\d{4}/\d{1,2}/\d{1,2}/[^/]+/$}
-        # map news posts in to news/_posts/
-        dirs = path.split('/')
-        local_path = File.join(OUTPUT_DIR,dirs[0,2],'_posts',dirs[2..-1].join('-')) + '.md'
+  # remove the 'class' attribute from all pre tags
+  content_div.search('pre').remove_attr('class')
 
-        layout = :post
+  # map all code elements to their inner_text
+  content_div.search('pre > code').each do |code|
+    code.replace(code.children.map { |node|
+      if node.name == 'br'
+        $/
       else
-        # normal page
-        local_path = File.join(OUTPUT_DIR,path)
-
-        case File.extname(local_path)
-        when '.html'
-          local_path.gsub!(/\.html$/,'.md')
-        when ''
-          local_path += '/' unless local_path.end_with?('/')
-          local_path += 'index.md'
+        node.inner_text
+      end
+    }.join)
+  end
+
+  # replace the #extended div with its children
+  if (extended_div = content_div.at('#extended'))
+    extended_div.replace(extended_div.inner_html)
+  end
+
+  # convert from HTML to Markdown
+  return Kramdown::Document.new(
+    content_div.inner_html,
+    :input => :html
+  ).to_kramdown
+end
+
+namespace :import do
+  desc 'Spiders ruby-lang.org and converts HTML to Markdown'
+  task :pages do
+    Spidr.site("http://#{HOST}/index.html") do |agent|
+      agent.ignore_links_like /\/cgi-bin\//
+      agent.ignore_links_like /\.cgi[\/]?$/
+      agent.ignore_links_like /\/[a-z_]+\/feeds\//
+      agent.ignore_links_like /\/[a-z_]+\/news\//
+      agent.ignore_links_like /\/[a-z_]+\/old-man\//
+
+      agent.every_ok_page do |page|
+        local_path = url_to_path(page.url)
+
+        # ensure the parent directory exists
+        mkdir_p File.dirname(local_path)
+
+        # don't overwrite existing files
+        unless File.exist?(local_path)
+          puts "Importing #{page.url} -> #{local_path} ..."
+
+          File.open(local_path,'w') do |file|
+            if page.html?
+              title = page.title.strip
+              lang = page.url.path[1..-1].split('/',2).first
+
+              # add the YAML front matter
+              file.puts(
+                '---',
+                "layout: default",
+                "title: #{title.inspect}",
+                "lang: #{lang}",
+                '---',
+                ''
+              )
+
+              if (content_div = page.at('#content'))
+                file.puts(html_to_markdown(content_div))
+              end
+            else
+              file.write(page.body)
+            end
+          end
         end
       end
+    end
+  end
 
-      # ensure the parent directory exists
-      FileUtils.mkdir_p(File.dirname(local_path))
+  desc "Imports news posts from the RSS feed"
+  task :news do
+    languages = %w[bg de en es fr id it ja ko pl pt tr zh_TW zh_cn]
+    by_lines = {
+      'bg' => /Публикувана от (.+) на/,
+      'de' => /Geschrieben von (.+) am/,
+      'en' => /Posted by (.+) on/,
+      'es' => /Publicado por (.+) Caro el/,
+      'fr' => /par (.+)/,
+      'id' => /Ditulis oleh (.+) tanggal/,
+      'it' => /Inserito da (.+) il/,
+      'ja' => /Posted by (.+) on/,
+      'ko' => /작성자 (.+) \(/,
+      'pl' => /Zamieszczone przez (.+) \d+/,
+      'pt' => /Escrito por (.+) em/,
+      'tr' => /Posted by (.+) on/,
+      'zh_TW' => /Posted by (.+) on/,
+      'zh_cn' => /由 (.+) 发表于/
+    }
+
+    Spidr.host(HOST) do |agent|
+      languages.each do |lang|
+        feed, news_dir = case lang
+                         when 'pt' then ['noticias', 'noticias-recentes']
+                         else ['news', 'news']
+                         end
+
+        agent.visit_urls_like do |url|
+          url.path.start_with?("/#{lang}/#{news_dir}/")
+        end
 
-      # don't overwrite existing files
-      unless File.exist?(local_path)
-        puts "Saving #{page.url} -> #{local_path} ..."
+        agent.enqueue("http://#{HOST}/#{lang}/#{news_dir}/")
 
-        File.open(local_path,'w') do |file|
-          if page.html?
-            title = page.title.strip
-            lang = path.split('/',2).first
+        begin
+          rss = RSS::Parser.parse(open("http://#{HOST}/#{lang}/feeds/#{feed}.rss"))
+          rss.items.each do |item|
+            puts "Queuing #{item.link} ..."
+            agent.enqueue(item.link)
+          end
+        rescue OpenURI::HTTPError
+        end
+      end
 
+      agent.every_ok_page do |page|
+        lang, news_dir, year, month, day, slug = page.url.path[1..-2].split('/')
+        title = page.title.strip
+
+        if page.url.path =~ /^\/#{lang}\/#{news_dir}\/\d{4}\/\d{2}\/\d{2}\//
+          # news post
+          local_path = File.join(OUTPUT_DIR,lang,news_dir,'_posts',"#{year}-#{month}-#{day}-#{slug}.md")
+          layout = 'news_post'
+          author = nil
+
+          archive_url = URI("http://#{HOST}/#{lang}/#{news_dir}/#{year}/#{month}/")
+          begin
+            agent.get_page(archive_url) do |archive|
+              if archive.is_ok?
+                if (post_div = archive.at("//div[@class='post']/h3/a[@href=#{page.url.path.dump}]/../.."))
+                  post_info = post_div.at("//p[@class='post-info']").inner_text
+
+                  author = if (match = post_info.match(by_lines[lang]))
+                    match[1]
+                  else
+                    ''
+                  end
+                end
+              end
+            end
+          rescue Net::HTTPNotFound
+          end
+        else
+          # archive page
+          local_path = url_to_path(page.url)
+          layout = 'default'
+        end
+
+        # ensure the parent directory exists
+        FileUtils.mkdir_p File.dirname(local_path)
+
+        unless File.exist?(local_path)
| 196 | + puts "Importing #{page.url} -> #{local_path} ..." |
| 197 | + |
| 198 | + File.open(local_path,'w') do |file| |
58 | 199 | # add the YAML front matter
|
59 | 200 | file.puts(
|
60 | 201 | '---',
|
61 |
| - "layout: #{LAYOUTS[layout]}", |
62 |
| - "title: #{title.inspect}", |
| 202 | + "layout: #{layout}", |
| 203 | + "title: #{title.inspect}" |
| 204 | + ) |
| 205 | + |
| 206 | + if author |
| 207 | + file.puts "author: #{author.inspect}" |
| 208 | + end |
| 209 | + |
| 210 | + file.puts( |
63 | 211 | "lang: #{lang}",
|
64 | 212 | '---',
|
65 | 213 | ''
|
66 | 214 | )
|
67 | 215 |
|
68 |
| - if (content_div = page.at('#content')) |
69 |
| - # remove all comments |
70 |
| - content_div.traverse do |node| |
71 |
| - node.remove if node.comment? |
72 |
| - end |
73 |
| - |
74 |
| - # remove all page anchors |
75 |
| - content_div.search('//a[@id]').remove |
| 216 | + content_div = page.at('div.post') || page.at('#content') |
76 | 217 |
|
77 |
| - # replace all caps spans with their text |
78 |
| - content_div.search('span.caps').each do |span| |
79 |
| - span.replace(span.inner_text) |
80 |
| - end |
81 |
| - |
82 |
| - # remove the 'class' attribute from all pre tags |
83 |
| - content_div.search('pre').remove_attr('class') |
84 |
| - |
85 |
| - # map all code elements to their inner_text |
86 |
| - content_div.search('pre > code').each do |code| |
87 |
| - code.replace(code.children.map { |node| |
88 |
| - if node.name == 'br' |
89 |
| - $/ |
90 |
| - else |
91 |
| - node.inner_text |
92 |
| - end |
93 |
| - }.join) |
94 |
| - end |
95 |
| - |
96 |
| - # replace the #extended div with it's children |
97 |
| - if (extended_div = content_div.at('#extended')) |
98 |
| - extended_div.replace(extended_div.inner_html) |
99 |
| - end |
100 |
| - |
101 |
| - # convert from HTML to Markdown |
102 |
| - content = Kramdown::Document.new( |
103 |
| - content_div.inner_html, |
104 |
| - :input => :html |
105 |
| - ).to_kramdown |
106 |
| - |
107 |
| - file.puts(content) |
108 |
| - end |
109 |
| - else |
110 |
| - file.write(page.body) |
| 218 | + file.puts(html_to_markdown(content_div)) |
111 | 219 | end
|
112 | 220 | end
|
113 | 221 | end
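
With this change the single import task becomes an import: namespace with two tasks, runnable as rake import:pages and rake import:news. A minimal sketch of what the new url_to_path helper produces, assuming the definitions above (the example URLs are hypothetical):

    require 'uri'

    # .html pages map to .md files under _import/
    url_to_path(URI('http://www.ruby-lang.org/en/about.html'))
    # => "_import/en/about.md"

    # extensionless paths get a trailing index.md
    url_to_path(URI('http://www.ruby-lang.org/en/downloads'))
    # => "_import/en/downloads/index.md"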
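The Markdown conversion itself is delegated to kramdown's HTML parser, now shared by both tasks through html_to_markdown. A standalone sketch of the same round-trip, assuming the kramdown ~> 0.13 pin above (the HTML snippet is made up):

    require 'kramdown'

    html = '<h2>Download</h2><p>Ruby is <em>free</em> software.</p>'

    # parse as HTML, then serialize back out as kramdown text
    puts Kramdown::Document.new(html, :input => :html).to_kramdown
    # prints (roughly):
    #   ## Download
    #
    #   Ruby is *free* software.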
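The by_lines table exists because the news pages only expose the author inside a localized "posted by" paragraph; the task locates the post in its monthly archive page and scrapes the by-line with the language's pattern. A quick check of the English pattern, assuming the hash above (the sample by-line is invented):

    by_line = /Posted by (.+) on/

    post_info = 'Posted by matz on 24 Feb 2012'
    post_info.match(by_line)[1]
    # => "matz"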