-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscrape-starred-posts.rb
122 lines (101 loc) · 2.66 KB
/
scrape-starred-posts.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Script to download starred posts from Google Reader (including post content).
# Requires that your starred posts are in a public feed.
#
# To find the public feed, open "Reader settings", then "Folders and Tags",
# set "starred items" to public, and follow "view public page". The URL of that
# page is the one to pass to the script.
#
# I guess it would work for other folders too.
#
# Usage:
#
# ruby scrape-starred-posts.rb "http://view public page starred posts link" > all_starred_posts.json
require 'net/http'
require 'nokogiri'
require 'json'
module GoogleReader
class StarredFetcher
# One page of a public feed. The page is downloaded and parsed lazily,
# the first time its content is needed.
class Page
  # url - String URL of the page. Construction only logs the URL to stderr.
  def initialize(url)
    @url = url
    $stderr.puts "new page:" + @url
  end

  # Response body of the page as returned by Net::HTTP.get (memoized).
  def html
    @html ||= Net::HTTP.get(URI.parse(@url))
  end

  # The page parsed with Nokogiri::HTML (memoized).
  def doc
    @doc ||= Nokogiri::HTML(html)
  end

  # Returns an Item for every node matching ".item", in document order.
  def items
    doc.search(".item").map { |el| Item.new(el) }
  end

  # Returns a Page for the following page, or nil when there is no link to one.
  def next
    # Look the link up once instead of once for the test and once for the value.
    url = next_page_url
    Page.new(url) if url
  end

  # Returns the "href" of the first link inside the first "#more" node, or
  # nil when the node, the link or the attribute is missing.
  #
  # Only the "nothing found" cases produce nil. Errors raised while loading
  # or parsing the page propagate to the caller, so a failed download is not
  # mistaken for the end of the feed.
  def next_page_url
    more = doc.search("#more").first
    link = more && more.search("a").first
    href = link && link.attributes["href"]
    href && href.value
  end
end
# Read-only view over the node of a single feed entry.
class Item
  # el - node of one entry; every reader below queries it with #search.
  def initialize(el)
    @el = el
  end

  # Text of the ".item-title" node(s).
  def title
    title_nodes.text
  end

  # "href" of the first child of the first child of ".item-title".
  def href
    wrapper = title_nodes.children.first
    anchor = wrapper.children.first
    anchor.attributes["href"].value
  end

  # "href" of the first link inside the first ".item-info" node.
  def blog_href
    blog_link.attributes["href"].value
  end

  # Text of the first link inside the first ".item-info" node.
  def blog_title
    blog_link.text
  end

  # First "by ... on" capture found in the ".item-info" texts, or nil.
  def author
    captures = blog_info.map { |info| info.text[/by (.*) on/, 1] }
    captures.compact.first
  end

  # The ".item-info" nodes of this entry (memoized).
  def blog_info
    @blog_info ||= @el.search(".item-info")
  end

  # First "on d/d/d" date capture found in the ".item-info" texts, or nil.
  def date
    captures = blog_info.map { |info| info.text[/on (\d\d?\/\d\d?\/\d\d?)/, 1] }
    captures.compact.first
  end

  # Serialized children of the ".item-body" node(s).
  def body
    @el.search(".item-body").children.to_s
  end

  # All fields as a Hash with symbol keys.
  def to_hash
    {
      :title => title,
      :href => href,
      :blog => { :href => blog_href, :title => blog_title },
      :author => author,
      :date => date,
      :body => body
    }
  end

  private

  # Looked up on every call, not cached.
  def title_nodes
    @el.search(".item-title")
  end

  # First link inside the first ".item-info" node.
  def blog_link
    blog_info.first.search("a").first
  end
end
# Creates a fetcher for the feed that starts at first_page_url.
# The URL is only stored here.
def initialize(first_page_url)
  @first_page_url = first_page_url
end
# Builds a Page for the URL given to the constructor.
# A new Page is created on every call.
def first_page
  Page.new(@first_page_url)
end
# Yields every item of the feed in page order, moving to the next page
# until a page reports none.
#
# Returns nil when called with a block, or an Enumerator over the same
# items when called without one.
#
# Nothing is cached: each call walks the feed again from the first page.
def each_item
  return enum_for(:each_item) unless block_given?

  page = first_page
  while page
    page.items.each { |item| yield item }
    page = page.next
  end
end
end
end
# Command-line entry point: writes every scraped item to stdout as one JSON
# array, while progress messages go to stderr.
page_one = ARGV[0]

# Stop before any output is written when no URL was given; otherwise the
# run would fail only after "[" had already been printed.
if page_one.nil? || page_one.empty?
  abort "usage: ruby scrape-starred-posts.rb \"<public page url>\" > all_starred_posts.json"
end

$stderr.puts "starting scrape at #{page_one}"
fetcher = GoogleReader::StarredFetcher.new(page_one)

puts "["
fetcher.each_item do |item|
  print JSON.pretty_generate(item.to_hash)
  puts ","
end
# Every item is followed by a comma, so a final null element keeps the
# array syntactically valid.
puts " null"
puts "]"