# scrape_unsplash.py — scrape free images from Unsplash.
# (Original listing: 118 lines / 91 loc / 3.4 KB; page-scrape residue removed.)
import argparse
import json
import os
import urllib.parse
import requests
def url_encode_string(s):
    """Return *s* percent-encoded for safe use inside a URL."""
    encoded = urllib.parse.quote(s)
    return encoded
def get_photo_query(query: str, page: int):
    """Query the unsplash API for a list of images that match the query string.

    Args:
        query (str): Query string.
        page (int): The page of results to get.

    Returns:
        dict: Dict of photo results.

    Raises:
        requests.HTTPError: If the API returns a non-2xx status.
        requests.Timeout: If the request exceeds the timeout.
    """
    # Let requests build and encode the query string; a timeout prevents
    # the scraper from hanging forever on a stalled connection.
    response = requests.get(
        "https://unsplash.com/napi/search/photos",
        params={"query": query, "per_page": 20, "page": page, "plus": "none"},
        timeout=30,
    )
    # Check if the request was successful.
    response.raise_for_status()
    # response.json() respects the response's declared encoding.
    return response.json()
def download_image(url, save_path):
    """Download a single image from url and save it to save_path.

    Args:
        url (str): The image URL to fetch.
        save_path (str): Destination file path; overwritten if it exists.

    Raises:
        requests.HTTPError: If the server returns a non-2xx status.
        requests.Timeout: If the request exceeds the timeout.
    """
    # Timeout keeps one dead URL from stalling the whole scrape.
    response = requests.get(url, timeout=30)
    # Check if the request was successful.
    response.raise_for_status()
    with open(save_path, "wb") as f:
        f.write(response.content)
def download_photos(query: str, max_images: int, out_dir: str):
    """Download photos from unsplash. Photos are saved to disk and a manifest
    is returned describing the downloaded images.

    Args:
        query (str): The photo query string.
        max_images (int): The maximum number of images to download. The script
            will stop when it hits this number or has processed all available
            images.
        out_dir (str): The output directory where photos will be saved. Must
            already exist.

    Returns:
        List[dict]: A manifest describing all of the downloaded photos. Each
        entry records the photo id, source url, local file name, the query
        that produced it, and the photo's alt description.
    """
    manifest = []
    total_images = 0
    cur_page = 0
    while total_images < max_images:
        cur_page += 1
        photo_list = get_photo_query(query, cur_page)["results"]
        if len(photo_list) == 0:
            print("Scraped all available images.")
            return manifest
        # TODO: Parallel async download.
        for photo in photo_list:
            if total_images >= max_images:
                return manifest
            # Skip paid/premium results; .get() tolerates the key being
            # absent from the API payload.
            if photo.get("premium", False):
                continue
            url = photo["urls"]["regular"]
            file_name = f"{photo['id']}.jpg"
            out_path = os.path.join(out_dir, file_name)
            try:
                download_image(url, out_path)
            except requests.RequestException as e:
                # Best-effort scrape: one bad image shouldn't abort the run.
                print(f"Skipping '{url}': {e}")
                continue
            manifest.append(
                {
                    "id": photo["id"],
                    "url": url,
                    "file_name": file_name,
                    "query": query,
                    "alt_description": photo["alt_description"],
                }
            )
            total_images += 1
            print(f"{total_images} / {max_images}: '{out_path}'")
    return manifest
def main():
    """Parse CLI arguments, scrape matching images, and write a manifest.

    Exits with an OSError if --out-dir already exists (deliberate, to avoid
    mixing results from separate runs).
    """
    parser = argparse.ArgumentParser(
        prog="scrape_unsplash",
        description="Scrape free images from unsplash.",
    )
    parser.add_argument("query")
    parser.add_argument("--max-images", type=int, default=10)
    parser.add_argument("--out-dir", type=str, default="out/")
    args = parser.parse_args()
    print(f"Scraping images for query '{args.query}'...")
    # Create output directory. Throws if the directory already exists.
    os.makedirs(args.out_dir)
    manifest = download_photos(args.query, args.max_images, args.out_dir)
    # Save manifest file. indent/ensure_ascii make the manifest human-readable
    # and preserve non-ASCII alt descriptions verbatim.
    manifest_path = os.path.join(args.out_dir, "manifest.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2, ensure_ascii=False)
    print(f"Saved manifest to '{manifest_path}'.")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()