forked from remotephone/brightwheel-crawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbrightscraper.py
196 lines (155 loc) · 6.37 KB
/
brightscraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
import logging
import os
import re
import time

import requests
import yaml
from selenium import webdriver
from selenium.common.exceptions import ElementNotVisibleException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select, WebDriverWait
"""I was saving pictures ad hoc through the brightwheel app, but got way behind
and didn't want to lose them if my kid changed schools or lost access to the app.
This uses selenium to crawl a BrightWheel (https://mybrightwheel.com/) profile
for images, find all of them, pass the cookies to requests, and then download
all images in bulk. Works with current site design as off 6/24/19"""
def config_parser():
    """Load scraper settings from ``config.yml`` in the working directory.

    Returns:
        tuple: (username, password, signin_url, kidlist_url, startdate,
        enddate) read from the config file.

    Raises:
        SystemExit: if the config file is missing or lacks a required key.
    """
    try:
        with open("config.yml", 'r') as config:
            cfg = yaml.safe_load(config)
        username = cfg['bwuser']
        password = cfg['bwpass']
        signin_url = cfg['bwsignin']
        kidlist_url = cfg['bwlist']
        startdate = cfg['startdate']
        enddate = cfg['enddate']
    except FileNotFoundError as err:
        logging.error('[!] No config file found, check config file!')
        raise SystemExit from err
    except (KeyError, TypeError) as err:
        # A key is missing, or the YAML did not parse to a mapping at all.
        logging.error('[!] Config file is missing required setting: %s', err)
        raise SystemExit from err
    return username, password, signin_url, kidlist_url, startdate, enddate
# Get the first URL and populate the fields
def signme_in(browser, username, password, signin_url):
    """Populate and submit the brightwheel login form.

    Args:
        browser: a selenium WebDriver instance.
        username: brightwheel account user name from the config.
        password: brightwheel account password from the config.
        signin_url: URL of the sign-in page.

    Returns:
        The same browser, now past the sign-in page.

    Raises:
        SystemExit: if the URL never changes off the sign-in page within
            5 seconds (bad credentials or a changed page layout).
    """
    browser.get(signin_url)
    loginuser = browser.find_element('id', 'username')
    loginpass = browser.find_element('id', 'password')
    loginuser.send_keys(username)
    loginpass.send_keys(password)
    # Submit login; success is detected by the URL changing away from
    # the sign-in page rather than by looking for a specific element.
    try:
        loginpass.submit()
        WebDriverWait(browser, 5).until(EC.url_changes(signin_url))
    except Exception as err:  # was a bare except; selenium raises TimeoutException here
        logging.error('[!] - Unable to authenticate - Check credentials')
        raise SystemExit from err
    return browser
def pic_finder(browser, kidlist_url, startdate, enddate):
    """Navigate to the student's feed and collect the photo URLs.

    Walks from the student list to the first student profile, opens the
    Feed tab, filters it to photos in the configured date range, clicks
    "Load more" / scrolls until the page stops growing, then scans the
    fully loaded page source for CDN image links.

    Args:
        browser: an authenticated selenium WebDriver.
        kidlist_url: URL of the student list page.
        startdate: start of the date range, typed into the feed filter.
        enddate: end of the date range, typed into the feed filter.

    Returns:
        tuple: (browser, matches) where matches is a list of image URLs.

    Raises:
        SystemExit: if no student profile link can be found.
    """
    browser.get(kidlist_url)
    time.sleep(3)
    # This xpath is generic enough to find any student listed.
    # You need to iterate through a list you create if you have more than one
    try:
        students = browser.find_element(
            'xpath',
            "//a[contains(@href, '/students/')]"
        )
        profile_url = students.get_property('href')
        browser.get(profile_url)
    except Exception as err:  # was a bare except
        logging.error('[!] - Unable to find profile page, check target')
        raise SystemExit from err
    time.sleep(3)
    # Get to feed, this is where the pictures are
    pics = browser.find_element("link text", 'Feed')
    pics.click()
    time.sleep(3)
    # Populate the selector for date range to load all images
    start_date = browser.find_element('name', 'activity-start-date')
    start_date.send_keys(startdate)
    end_date = browser.find_element('name', 'activity-end-date')
    end_date.send_keys(enddate)
    select = browser.find_element('id', 'select-input-2')
    select.send_keys('Photo')
    select.send_keys(Keys.ENTER)
    # Absolute XPATH for the filter form's Apply button — brittle if the
    # page layout changes.
    browser.find_element(
        'xpath',
        '/html/body/div[2]/div/main/div/div/div[2]/div/form/button'
    ).click()
    try:
        last_height = browser.execute_script(
            "return document.body.scrollHeight"
        )
        counter = 0
        loading = True
        while loading:
            try:
                counter += 1
                button = WebDriverWait(browser, 7).until(
                    EC.presence_of_element_located((
                        By.XPATH, '//button[text()="Load more"]')))
                button.click()
            except Exception:  # was a bare except; WebDriverWait times out here
                # No "Load more" button this pass; fall through to scrolling.
                if counter == 1:
                    logging.info('[!] No Loading button found!')
                else:
                    logging.debug('[!] No loading button found')
            browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")
            # Wait to load the page.
            time.sleep(2)
            # Stop once scrolling no longer grows the document.
            new_height = browser.execute_script(
                "return document.body.scrollHeight")
            if new_height == last_height:
                logging.info('[!] Page fully loaded, finding images...')
                loading = False
            last_height = new_height
    except ElementNotVisibleException:
        # NOTE(review): handler kept from the original code; replaced a
        # stray debug print. The scroll loop does not obviously raise this.
        logging.debug('[!] Element not visible while loading the feed')
    matches = re.findall(
        r'(?<=href=\")https:\/\/cdn\.mybrightwheel\.com\/media_images\/images\/.*cover.*jpg(?="?)',
        browser.page_source)
    count_matches = len(matches)
    if count_matches == 0:
        logging.error(
            '[!] No Images found to download! Check the source target page')
    else:
        logging.info('[!] Found %s files to download...', count_matches)
    return browser, matches
def get_images(browser, matches):
    """Download every matched image URL into ./pics/ using requests.

    Selenium doesn't handle saving images well, so the browser's session
    cookies are copied into a requests.Session which does the downloads.

    Args:
        browser: the authenticated selenium WebDriver (source of cookies).
        matches: iterable of image URL strings from pic_finder.
    """
    cookies = browser.get_cookies()
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie['name'], cookie['value'])
    # Make sure the output directory exists before writing into it;
    # previously a missing ./pics/ made every single download "fail".
    os.makedirs('./pics', exist_ok=True)
    for match in matches:
        filename = match.split("/")[-1]
        try:
            request = session.get(match)
            # Don't save error pages as .jpg files.
            request.raise_for_status()
            # Context manager guarantees the file handle is closed
            # (the original leaked one handle per image).
            with open('./pics/' + filename, 'wb') as outfile:
                outfile.write(request.content)
            logging.info('[-] - Downloading %s', filename)
        except Exception:  # was a bare except; keep best-effort semantics
            logging.error('[!] - Failed to save %s', match)
    try:
        session.cookies.clear()
        browser.delete_all_cookies()
        logging.info('[-] - Cleared cookies')
    except Exception:  # was a bare except
        logging.error('[!] - Failed to clear cookies')
def main():
    """Init logging, launch the browser, and run the scrape end to end."""
    # basicConfig defaults to WARNING, which silently dropped every
    # logging.info progress message in this script — set INFO explicitly.
    logging.basicConfig(filename='scraper.log', filemode='w',
                        level=logging.INFO)
    browser = webdriver.Firefox()
    try:
        username, password, signin_url, kidlist_url, startdate, enddate = config_parser()
        browser = signme_in(browser, username, password, signin_url)
        browser, matches = pic_finder(browser, kidlist_url, startdate, enddate)
        get_images(browser, matches)
    finally:
        # Always shut the driver down so no Firefox process is leaked,
        # even when a step aborts with SystemExit.
        browser.quit()


if __name__ == "__main__":
    main()