imdb_backup.py
#!/usr/bin/env python3
import json
import os
import re
import sys
import time
import zipfile
from collections.abc import Generator, Iterable
from pathlib import Path

import requests
import unidecode
from bs4 import BeautifulSoup

REQUIRED_COOKIES = {'at-main', 'ubid-main', 'uu'}
COOKIE_FNAME = 'imdb_cookie.json'
ZIP_FNAME = 'imdb_exported_lists.zip'
README_REF = (
    'For more info check README.md.\n'
    '[https://github.com/monk-time/imdb-backup-lists/blob/master/README.md]'
)

MList = dict[str, str | bytes]


class LoginError(Exception):
    pass


class UrlParseError(Exception):
    pass


def slugify(s: str) -> str:
    """Convert to lowercase ASCII with hyphens instead of underscores/spaces.

    Remove all non-alphanumeric characters and strip
    leading and trailing whitespace.
    """
    s = unidecode.unidecode(s)
    s = re.sub(r'[^\w\s-]', '', s).strip().lower()
    return re.sub(r'[-_\s]+', '-', s)
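
# Illustrative example (made-up input): slugify('Café: List_2') -> 'cafe-list-2'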


def load_imdb_cookies(cookie_path: Path) -> dict:
    """Read IMDb cookies from the folder with the script or executable."""
    # https://pyinstaller.readthedocs.io/en/stable/runtime-information.html#using-sys-executable-and-sys-argv-0
    if cookie_path.exists():
        cookies = json.loads(cookie_path.read_text())
        if not set(cookies) >= REQUIRED_COOKIES:
            msg = (
                f'\n\n{COOKIE_FNAME} must contain the following cookies: '
                f'{", ".join(REQUIRED_COOKIES)}.'
            )
            raise ValueError(msg)
        return cookies
    msg = (
        f'\n\nCreate a file "{COOKIE_FNAME}" in the script directory\n'
        f'and put your IMDb cookie inside.\n{README_REF}'
    )
    raise FileNotFoundError(msg)
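
# The cookie file is a plain JSON object mapping cookie names to values; the
# values below are placeholders, not real cookies:
#   {"at-main": "...", "ubid-main": "...", "uu": "..."}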


def fetch_userid(cookies: dict) -> str:
    """Fetch the user ID that is required for exporting any lists.

    Cookie validity is also checked here.
    """
    # requests.head doesn't follow redirects, so the redirect target stays in
    # the Location header; a missing header falls through to LoginError
    r = requests.head('https://www.imdb.com/profile', cookies=cookies)
    r.raise_for_status()
    m = re.search(r'ur\d+', r.headers.get('Location', ''))
    if not m:
        msg = (
            "\n\nCan't log into IMDb.\n"
            f'Make sure that your IMDb cookie in {COOKIE_FNAME} is correct.\n'
            f'{README_REF}'
        )
        raise LoginError(msg)
    return m.group()
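
# Illustrative: when logged in, /profile redirects to a URL along the lines of
# 'https://www.imdb.com/user/ur12345678/', so the extracted ID is 'ur12345678'.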


def get_fname(url: str, title: str) -> str:
    """Turn an IMDb list into {LIST_OR_USER_ID}_{TITLE_SLUG}.csv."""
    match = re.search(r'..\d{6,}', url)
    if not match:
        msg = (
            f"\n\nCan't extract list/user ID from {url} "
            f'for the list "{title}"'
        )
        raise UrlParseError(msg)
    return match.group() + '_' + slugify(title) + '.csv'
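
# Illustrative example (made-up list ID):
#   get_fname('/list/ls012345678/', 'My List') -> 'ls012345678_my-list.csv'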


def fetch_lists_info(
    userid: str, cookies: dict
) -> Generator[dict, None, None]:
    r = requests.get(
        f'https://www.imdb.com/user/{userid}/lists', cookies=cookies
    )
    r.raise_for_status()
    # Fetch two special lists: ratings and watchlist.
    # /lists has an old link for ratings; it's easier to hardcode it
    yield {
        'url': f'/user/{userid}/ratings/',
        'fname': get_fname(userid, 'ratings'),
        'title': 'Ratings',
    }
    # /lists doesn't have a link for the watchlist that can be used for exporting
    r_wl = requests.get(
        f'https://www.imdb.com/user/{userid}/watchlist', cookies=cookies
    )
    r_wl.raise_for_status()
    listid = (
        BeautifulSoup(r_wl.text, 'html.parser')
        .find('meta', property='pageId')
        .get('content')
    )
    yield {
        'url': f'/list/{listid}/',
        'fname': get_fname(userid, 'watchlist'),
        'title': 'Watchlist',
    }
    # Fetch the rest of the user's lists
    links = BeautifulSoup(r.text, 'html.parser').select('a.list-name')
    for link in links:
        url = link.get('href')
        title = link.string
        yield {'url': url, 'fname': get_fname(url, title), 'title': title}
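
# Each yielded dict describes one list to export: a site-relative 'url',
# the target 'fname' inside the zip, and the human-readable 'title'.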


def export(mlist: MList, cookies: dict) -> MList:
    """All requests are throttled just in case."""
    time.sleep(0.5)
    print('Downloading:', mlist['title'].replace('\n', ' '))
    r = requests.get(
        f'https://www.imdb.com{mlist["url"]}export', cookies=cookies
    )
    r.raise_for_status()
    mlist['content'] = r.content
    return mlist
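
# Illustrative (made-up ID): for a list with url '/list/ls012345678/' this
# requests https://www.imdb.com/list/ls012345678/export and keeps the CSV bytes.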


def zip_all(mlists: Iterable[MList], zip_fname=ZIP_FNAME):
    """Write all downloaded movielists into a zip archive.

    A file with the original list names (quoted if multi-line) is also added.
    """
    with zipfile.ZipFile(
        zip_fname, mode='w', compression=zipfile.ZIP_DEFLATED
    ) as zf:
        titles = []
        for ml in mlists:
            print(' ->', ml['fname'])
            zf.writestr(ml['fname'], ml['content'])
            # After the Dec'17 redesign IMDb lists can have multi-line titles
            title = ml['title']
            if '\n' in title:
                # zipfile.writestr doesn't do automatic line ending conversion
                title = f'"{title}"'.replace('\n', os.linesep)
            titles.append(f'{ml["fname"]}: {title}')
        zf.writestr('lists.txt', os.linesep.join(titles))
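
# lists.txt maps each archived file to its original title, one entry per line,
# e.g. (made-up entry): 'ls012345678_my-list.csv: My List'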


def backup(cookie_path):
    cookies = load_imdb_cookies(cookie_path)
    userid = fetch_userid(cookies)
    print(f'Successfully logged in as user {userid}')
    mlists = fetch_lists_info(userid, cookies)
    zip_all(export(ml, cookies) for ml in mlists)
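
# Note: fetch_lists_info and the export(...) generator expression above are
# both lazy, so each list is downloaded only when zip_all is ready to write it.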


def pause_before_exit_unless_run_with_flag():
    """Pause the script before exiting unless it was run with --nopause.

    This will cause the script to show a standard "Press any key" prompt
    even if it crashes, keeping a console window visible when it wasn't
    launched in a terminal (e.g. by double-clicking the file on Windows).
    """

    def prompt():
        input('\nPress <ENTER> to exit ... ')

    import argparse

    parser = argparse.ArgumentParser()
    # Optional positional argument for the input file with cookies
    # noinspection PyTypeChecker
    parser.add_argument(
        'path',
        nargs='?',
        type=Path,
        default=Path(sys.argv[0]).resolve().parent / COOKIE_FNAME,
        help='path to the .json file with IMDb cookies',
    )
    parser.add_argument(
        '-n',
        '--nopause',
        action='store_true',
        help="don't pause the script before exiting",
    )
    args = parser.parse_args()
    if not args.nopause:
        import atexit

        atexit.register(prompt)
    backup(cookie_path=args.path)


if __name__ == '__main__':
    pause_before_exit_unless_run_with_flag()
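
# Usage (from a terminal):
#   python imdb_backup.py                    # uses imdb_cookie.json next to the script
#   python imdb_backup.py my_cookies.json --nopause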