Added a separate import:news task (issue #2).

postmodern · postmodern · commit 5202af507c22 · 2013-02-07T23:22:47.000-08:00
* Extracted common logic into methods.
* Enqueued the RSS feed items, then spidered the news directories for a more
  complete news import.
* Scraped the archive pages for the author information of each post.
diff --git a/_tasks/import.rb b/_tasks/import.rb
@@ -1,113 +1,221 @@
+# encoding: utf-8
 require 'fileutils'
+require 'open-uri'
+require 'rss'
 
 gem 'spidr', '~> 0.4'
 require 'spidr'
 
 gem 'kramdown', '~> 0.13'
 require 'kramdown'
 
+HOST = 'www.ruby-lang.org'
 OUTPUT_DIR = '_import'
-LAYOUTS = {
-  :default => 'page',
-  :post => 'news_post'
-}
 
-desc 'Spiders ruby-lang.org and converts HTML to Markdown'
-task :import do
-  Spidr.site('http://www.ruby-lang.org/index.html') do |agent|
-    agent.ignore_links_like /\/cgi-bin\//
-    agent.ignore_links_like /\.cgi[\/]?$/
-    agent.ignore_links_like /\/[a-z_]+\/old-man\//
+def url_to_path(url)
+  local_path = File.join(OUTPUT_DIR,url.path[1..-1])
 
-    agent.every_ok_page do |page|
-      path = page.url.path[1..-1]
+  case File.extname(local_path)
+  when '.html'
+    local_path.chomp!('.html') << '.md'
+  when ''
+    local_path << '/' unless local_path.end_with?('/')
+    local_path << 'index.md'
+  end
+
+  return local_path
+end
+
+def html_to_markdown(content_div)
+  # remove all comments
+  content_div.traverse do |node|
+    node.remove if node.comment?
+  end
+
+  # remove all page anchors
+  content_div.search('//a[@id]').remove
 
-      layout = :default
+  # replace all caps spans with their text
+  content_div.search('span.caps').each do |span|
+    span.replace(span.inner_text)
+  end
 
-      if path =~ %r{^[a-z_-]+/news/\d{4}/\d{1,2}/\d{1,2}/[^/]+/$}
-        # map news posts in to news/_posts/
-        dirs = path.split('/')
-        local_path = File.join(OUTPUT_DIR,dirs[0,2],'_posts',dirs[2..-1].join('-')) + '.md'
+  # remove the 'class' attribute from all pre tags
+  content_div.search('pre').remove_attr('class')
 
-        layout = :post
+  # map all code elements to their inner_text
+  content_div.search('pre > code').each do |code|
+    code.replace(code.children.map { |node|
+      if node.name == 'br'
+        $/
       else
-        # normal page
-        local_path = File.join(OUTPUT_DIR,path)
-
-        case File.extname(local_path)
-        when '.html'
-          local_path.gsub!(/\.html$/,'.md')
-        when ''
-          local_path += '/' unless local_path.end_with?('/')
-          local_path += 'index.md'
+        node.inner_text
+      end
+    }.join)
+  end
+
+  # replace the #extended div with its children
+  if (extended_div = content_div.at('#extended'))
+    extended_div.replace(extended_div.inner_html)
+  end
+
+  # convert from HTML to Markdown
+  return Kramdown::Document.new(
+    content_div.inner_html,
+    :input => :html
+  ).to_kramdown
+end
+
+namespace :import do
+  desc 'Spiders ruby-lang.org and converts HTML to Markdown'
+  task :pages do
+    Spidr.site("http://www.#{HOST}/index.html") do |agent|
+      agent.ignore_links_like /\/cgi-bin\//
+      agent.ignore_links_like /\.cgi[\/]?$/
+      agent.ignore_links_like /\/[a-z_]+\/feeds\//
+      agent.ignore_links_like /\/[a-z_]+\/news\//
+      agent.ignore_links_like /\/[a-z_]+\/old-man\//
+
+      agent.every_ok_page do |page|
+        local_path = url_to_path(page.url)
+
+        # ensure the parent directory exists
+        mkdir_p File.dirname(local_path)
+
+        # don't overwrite existing files
+        unless File.exist?(local_path)
+          puts "Importing #{page.url} -> #{local_path} ..."
+
+          File.open(local_path,'w') do |file|
+            if page.html?
+              title = page.title.strip
+              lang  = path.split('/',2).first
+
+              # add the YAML front matter
+              file.puts(
+                '---',
+                "layout: default",
+                "title: #{title.inspect}",
+                "lang: #{lang}",
+                '---',
+                  ''
+              )
+
+              if (content_div = page.at('#content'))
+                file.puts(html_to_markdown(content_div))
+              end
+            else
+              file.write(page.body)
+            end
+          end
         end
       end
+    end
+  end
 
-      # ensure the parent directory exists
-      FileUtils.mkdir_p(File.dirname(local_path))
+  desc "Imports news posts from the RSS feed"
+  task :news do
+    languages = %w[bg de en es fr id it ja ko pl pt tr zh_TW zh_cn]
+    by_lines = {
+      'bg' => /Публикувана от (.+) на/,
+      'de' => /Geschrieben von (.+) am/,
+      'en' => /Posted by (.+) on/,
+      'es' => /Publicado por (.+) Caro el/,
+      'fr' => /par (.+)/,
+      'id' => /Ditulis oleh (.+) tanggal/,
+      'it' => /Inserito da (.+) il/,
+      'ja' => /Posted by (.+) on/,
+      'ko' => /작성자 (.+) \(/,
+      'pl' => /Zamieszczone przez (.+) \d+/,
+      'pt' => /Escrito por (.+) em/,
+      'tr' => /Posted by (.+) on/,
+      'zh_TW' => /Posted by (.+) on/,
+      'zh_cn' => /由 (.+) 发表于/
+    }
+
+    Spidr.host(HOST) do |agent|
+      languages.each do |lang|
+        feed, news_dir = case lang
+                         when 'pt' then ['noticias', 'noticias-recentes']
+                         else           ['news', 'news']
+                         end
+
+        agent.visit_urls_like do |url|
+          url.path.start_with?("/#{lang}/#{news_dir}/")
+        end
 
-      # don't overwrite existing files
-      unless File.exist?(local_path)
-        puts "Saving #{page.url} -> #{local_path} ..."
+        agent.enqueue("http://#{HOST}/#{lang}/#{news_dir}/")
 
-        File.open(local_path,'w') do |file|
-          if page.html?
-            title = page.title.strip
-            lang = path.split('/',2).first
+        begin
+          rss = RSS::Parser.parse(open("http://#{HOST}/#{lang}/feeds/#{feed}.rss"))
+          rss.items.each do |item|
+            puts "Queuing #{item.link} ..."
+            agent.enqueue(item.link)
+          end
+        rescue OpenURI::HTTPError
+        end
+      end
 
+      agent.every_ok_page do |page|
+        lang, news_dir, year, month, day, slug = page.url.path[1..-2].split('/')
+        title = page.title.strip
+
+        if page.url.path =~ /^\/#{lang}\/#{news_dir}\/\d{4}\/\d{2}\/\d{2}\//
+          # news post
+          local_path = File.join(OUTPUT_DIR,lang,news_dir,'_posts',"#{year}-#{month}-#{day}-#{slug}.md")
+          layout     = 'news_post'
+          author     = nil
+
+          archive_url = URI("http://#{HOST}/#{lang}/#{news_dir}/#{year}/#{month}/")
+          begin
+            agent.get_page(archive_url) do |archive|
+              if archive.is_ok?
+                if (post_div = archive.at("//div[@class='post']/h3/a[@href=#{page.url.path.dump}]/../.."))
+                  post_info = post_div.at("//p[@class='post-info']").inner_text
+
+                  author = if (match = post_info.match(by_lines[lang]))
+                             match[1]
+                           else
+                             ''
+                           end
+                end
+              end
+            end
+          rescue Net::HTTPNotFound
+          end
+        else
+          # archive page
+          local_path = url_to_path(page.url)
+          layout     = 'default'
+        end
+
+        # ensure the parent directory exists
+        FileUtils.mkdir_p File.dirname(local_path)
+
+        unless File.exists?(local_path)
+          puts "Importing #{page.url} -> #{local_path} ..."
+
+          File.open(local_path,'w') do |file|
             # add the YAML front matter
             file.puts(
               '---',
-              "layout: #{LAYOUTS[layout]}",
-              "title: #{title.inspect}",
+              "layout: #{layout}",
+              "title: #{title.inspect}"
+            )
+
+            if author
+              file.puts "author: #{author.inspect}"
+            end
+
+            file.puts(
               "lang: #{lang}",
               '---',
               ''
             )
 
-            if (content_div = page.at('#content'))
-              # remove all comments
-              content_div.traverse do |node|
-                node.remove if node.comment?
-              end
-
-              # remove all page anchors
-              content_div.search('//a[@id]').remove
+            content_div = page.at('div.post') || page.at('#content')
 
-              # replace all caps spans with their text
-              content_div.search('span.caps').each do |span|
-                span.replace(span.inner_text)
-              end
-
-              # remove the 'class' attribute from all pre tags
-              content_div.search('pre').remove_attr('class')
-
-              # map all code elements to their inner_text
-              content_div.search('pre > code').each do |code|
-                code.replace(code.children.map { |node|
-                  if node.name == 'br'
-                    $/
-                  else
-                    node.inner_text
-                  end
-                }.join)
-              end
-
-              # replace the #extended div with it's children
-              if (extended_div = content_div.at('#extended'))
-                extended_div.replace(extended_div.inner_html)
-              end
-
-              # convert from HTML to Markdown
-              content = Kramdown::Document.new(
-                content_div.inner_html,
-                :input => :html
-              ).to_kramdown
-
-              file.puts(content)
-            end
-          else
-            file.write(page.body)
+            file.puts(html_to_markdown(content_div))
           end
         end
       end