-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBoardGameCapital-selenium.py
66 lines (51 loc) · 1.83 KB
/
BoardGameCapital-selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
from selenium import webdriver # powers the browser interaction
from selenium.webdriver.support.ui import Select # selects menu options
from bs4 import BeautifulSoup # to parse HTML
import csv # to write CSV
import pandas as pd # to see CSV
import time
import os
import random
import requests
import time as time_lib
from urllib.parse import urljoin  # stdlib; resolves relative hrefs against a page URL

# Directory the downloaded rule PDFs are written to (relative to the CWD).
PDF_DIR = 'pdfs'


def _stat_values(page, css_class):
    """Return the value part of every ``label: value`` line in a stats div.

    The text of the first ``<div class=css_class>`` in *page* is split on
    newlines; for each line everything after the FIRST colon is kept and
    stripped, so values that themselves contain a colon are not truncated.
    Lines without a colon yield an empty string, which keeps positional
    indexing stable.

    Raises AttributeError if the div is absent (``find`` returns None).
    """
    stats_div = page.find('div', {'class': css_class})
    return [line.partition(':')[2].strip()
            for line in stats_div.text.split('\n')]


driver = webdriver.Chrome()
try:
    # ---- Step 1: collect every game name and its detail-page link ----
    next_page = "http://www.boardgamecapital.com/board-game-rules.htm"
    driver.get(next_page)
    soup = BeautifulSoup(driver.page_source, 'html5lib')

    # The cells live in a <tbody> nested inside the first <tbody>; the last
    # <td> is dropped.
    # NOTE(review): presumably the last cell is not a game entry - confirm.
    game_cells = soup.find('tbody').find('tbody').find_all('td')[:-1]

    game_dict = {}
    for g in game_cells:
        anchor = g.find('a')
        if anchor is None or not anchor.get('href'):
            # A cell without a link cannot be visited; skip it rather
            # than abort the whole run.
            continue
        # urljoin yields the same URL as plain concatenation for ordinary
        # relative hrefs, and additionally copes with absolute or
        # root-relative ones.
        game_dict[g.text] = {'link': urljoin(next_page, anchor['href'])}

    # open(..., 'wb') does not create missing directories.
    os.makedirs(PDF_DIR, exist_ok=True)

    # ---- Step 2: visit each game page, extract stats, download PDFs ----
    for k, info in game_dict.items():
        print(k)
        driver.get(info['link'])
        soup = BeautifulSoup(driver.page_source, 'html5lib')

        # 'gstats1' is read as price (line 0) and time (line 1);
        # 'gstats2' as age (line 0) and players (line 1).
        gstats1 = _stat_values(soup, 'gstats1')
        price = gstats1[0][1:]  # drop first char (presumably the currency sign - confirm)
        play_time = gstats1[1]  # not named `time`: that would shadow the time module
        gstats2 = _stat_values(soup, 'gstats2')
        age = gstats2[0]
        players = gstats2[1]

        # attrs must be a dict; the original set literal {'class', 'mainbody'}
        # only worked through bs4's non-dict fallback.
        main_body = soup.find('div', {'class': 'mainbody'})
        text = main_body.text
        pdf_links = [a for a in main_body.find_all('a')
                     if 'Game Rules' in a.text]

        paths = []
        for url in pdf_links:
            href = url.get('href')
            if not href:
                continue
            response = requests.get(urljoin(info['link'], href), timeout=30)
            if not response.ok:
                # Do not save an HTTP error page under a .pdf name.
                print('  skipped {} (HTTP {})'.format(href, response.status_code))
                continue
            # The link text becomes the file name; a '/' in it would be
            # interpreted as a directory separator.
            file_stem = url.text.strip().replace('/', '_')
            path = '{}/{}.pdf'.format(PDF_DIR, file_stem)
            with open(path, 'wb') as f:
                f.write(response.content)
            paths.append(path)

        info['price'] = price
        info['time'] = play_time
        info['age'] = age
        info['players'] = players
        info['paths'] = ';'.join(paths)  # semicolon-separated local file paths
        info['web_text'] = text

        time_lib.sleep(1)  # pause one second between page loads
finally:
    # Always shut the browser down, even if scraping fails part-way.
    driver.quit()