-
Notifications
You must be signed in to change notification settings - Fork 0
/
cl-scraper.py
55 lines (46 loc) · 1.77 KB
/
cl-scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import json
import os
import re
import sys
import time
from urllib.request import urlopen, urlretrieve
# Root directory for downloaded images; scrape_bikes() creates one subfolder
# per city beneath it, and one subfolder per posting beneath each city.
image_folder_path = "images/cl"
def scrape_bikes(city):
    """Download the images of every bike posting linked from a city's search page.

    Fetches ``https://<city>.craigslist.org/search/bia``, collects the distinct
    site-relative ``.../bik/<10-digit id>.html`` links found in that page, and
    for each posting whose HTML embeds a ``var imgList = ...;`` JSON array
    saves every listed image as
    ``<image_folder_path>/<city>/<posting id>/<index>.jpg``.

    Args:
        city: Craigslist subdomain to scrape; it is inserted verbatim into
            the host name and used as the name of the per-city image folder.

    Returns:
        None. The results are the files written to disk.

    Raises:
        Nothing is caught here: errors from ``urlopen``/``urlretrieve``
        (network failures), ``bytes.decode`` and ``json.loads`` (malformed
        pages) propagate to the caller and abort the scrape.
    """
    base_url = "https://" + city + ".craigslist.org"
    city_folder = os.path.join(image_folder_path, city)
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(city_folder, exist_ok=True)

    # Use the response as a context manager so the connection is closed
    # deterministically instead of being left to the garbage collector.
    with urlopen(base_url + "/search/bia") as response:
        search_html = response.read().decode('utf-8')

    # Some link suffixes carry a three-letter prefix before /bik/.  The dot is
    # escaped so that only a literal ".html" ending matches.
    link_regex = r"(?:/[a-z]{3})*/bik/[0-9]{10}\.html"
    # A set drops the duplicate links that appear for the same posting.
    bike_pages = set(re.findall(link_regex, search_html))

    for bp in bike_pages:
        with urlopen(base_url + bp) as response:
            page_html = response.read().decode('utf-8')

        image_urls = re.findall("var imgList = (.*?);", page_html)
        if image_urls:
            img_list = json.loads(image_urls[0])

            # The 10-digit posting id names the per-bike folder; link_regex
            # guarantees it is present in bp.
            bike_num = re.search(r"[0-9]{10}", bp).group(0)
            bike_folder = os.path.join(city_folder, bike_num)
            os.makedirs(bike_folder, exist_ok=True)

            for i, img in enumerate(img_list):
                urlretrieve(img["url"], os.path.join(bike_folder, str(i) + ".jpg"))

        # Throttle requests to the site.
        # NOTE(review): the original indentation was lost; the pause is applied
        # once per posting page here -- confirm it was not meant per image.
        time.sleep(10)
# set up folders
if not os.path.isdir(image_folder_path):
    os.makedirs(image_folder_path)
    print("made folder for craigslist images")

# Each non-blank line of cities.txt is passed to scrape_bikes() as a city.
# The with-block closes the file even if a scrape raises.
with open('cities.txt', 'r') as cities:
    for line in cities:
        # strip() instead of line[:-1]: slicing cut the last character off a
        # final line with no trailing newline and left "\r" on CRLF files.
        city = line.strip()
        if not city:
            # A blank line would otherwise yield the host ".craigslist.org".
            continue
        print("scraping " + city + "...", end="")
        # Show the progress text now; the scrape below can take a long time.
        sys.stdout.flush()
        scrape_bikes(city)
        print("done")