-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.rb
52 lines (34 loc) · 973 Bytes
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
require 'uri'

require 'byebug'
require 'httparty'
require 'nokogiri'
# Scrapes the paginated catalogue of books.toscrape.com and prints what it finds.
#
# Pages are requested in order (page-1.html, page-2.html, ...) until a request
# does not answer with HTTP 200 or a page contains no `article.product_pod`
# entries. Each book is collected as a Hash with :thumbnail, :price and :title
# keys; the total count and the collected hashes are then written to stdout.
#
# @return [nil] the return value of the final `puts`
def scraper
  books = []
  page = 1

  loop do
    puts "We are on page #{page}"
    url = "http://books.toscrape.com/catalogue/page-#{page}.html"
    response = HTTParty.get(url)
    # An error response (e.g. past the last page) carries no products to parse.
    break unless response.code == 200

    book_boxes = Nokogiri::HTML(response.body).css('article.product_pod')
    # A page without any product entries marks the end of the catalogue.
    break if book_boxes.empty?

    book_boxes.each { |book_box| books << extract_book(book_box, url) }
    page += 1
  end

  puts "We collected #{books.count} books."
  puts "This is our array of books"
  puts books
end

# Builds the Hash describing one book from its `article.product_pod` node.
#
# @param book_box [Nokogiri::XML::Node] the product element for a single book
# @param page_url [String] URL of the page the node came from; relative image
#   paths are resolved against it
# @return [Hash] :thumbnail (absolute URL String, or nil when no image src is
#   present), :price (String with the pound sign removed), :title (String, or
#   nil when the node has no heading)
def extract_book(book_box, page_url)
  image = book_box.at_css('a img')
  image_src = image && image['src']
  heading = book_box.at_css('h3')
  title_link = book_box.at_css('h3 a')
  # The visible link text is shortened with an ellipsis for long titles; the
  # link's title attribute holds the full name, so prefer it when present.
  full_title = title_link && title_link['title']

  {
    # URI.join resolves "../" segments instead of leaving them in the URL.
    thumbnail: image_src && URI.join(page_url, image_src).to_s,
    price: book_box.css('p.price_color').text.delete('£'),
    title: full_title || (heading && heading.text)
  }
end
# Script entry point: runs the scrape as soon as this file is executed or loaded.
scraper