# -*- coding: utf-8 -*-
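"""Scrape map metadata from the worldspawn map archive (https://en.ws.q3df.org)
and download the corresponding pk3 files into a local directory."""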
import argparse
import os
import requests
import shutil
import sys
import time
from collections import OrderedDict
from contextlib import closing
from lxml import html
WS_URL = 'https://en.ws.q3df.org'
WS_URL_LIST_TEMPLATE = '{}/maps/?show=50&page={{}}'.format(WS_URL)
WS_URL_PK3_TEMPLATE = '{}/maps/downloads/{{}}.pk3'.format(WS_URL)
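# Expanded, the templates produce URLs like these ('somepk3' is a
# hypothetical pk3 name):
#   WS_URL_LIST_TEMPLATE.format(0)
#       -> 'https://en.ws.q3df.org/maps/?show=50&page=0'
#   WS_URL_PK3_TEMPLATE.format('somepk3')
#       -> 'https://en.ws.q3df.org/maps/downloads/somepk3.pk3'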


def collect_pk3_data(pk3_data, final_date=None, final_pk3=None, count=None):
    failure_count = 0
    current_page = 0
    # Flag: True while we are still collecting pk3 data
    collecting = True
    while collecting:
        page = requests.get(WS_URL_LIST_TEMPLATE.format(current_page))
        # If we didn't get a 200 response, take a break and try again;
        # if we fail 3 times, give up.
        if page.status_code != requests.codes['ok']:
            if failure_count >= 3:
                return 1
            failure_count += 1
            error = "Page returned status code ({}), retrying in 30s".format(page.status_code)
            print(error, file=sys.stderr, flush=True)
            time.sleep(30)
            continue
        tree = html.fromstring(page.content)
        # Grab all the map rows, slicing off the header row
        maps_table = tree.xpath('//tr')[1:]
        # There can be multiple bsp map files per pk3, so we need to retain
        # state between rows
        pk3_name = None
        pk3_size = None
        release_date = None
        for row in maps_table:
            columns = row.getchildren()
            # End-of-collection condition: reached the requested pk3 count
            if count and len(pk3_data) >= count:
                collecting = False
                break
            # Each pk3 name is a link containing the text; if there is no
            # link, this row is another map from the previous pk3
            pk3_name_cell = columns[2].find('a')
            if pk3_name_cell is not None:
                pk3_name = pk3_name_cell.text
                # End-of-collection condition: reached the requested pk3
                if final_pk3 and final_pk3 == pk3_name:
                    collecting = False
                    break
                # The file size sits after some alignment spans:
                # [-1] gets the last span spacer within the element,
                # strip() removes extra spaces, [:-3] strips the " MB"
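                # e.g. a tail of ' 12.34 MB ' strips down to '12.34 MB',
                # then [:-3] leaves '12.34' (the size here is hypothetical)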
                pk3_size = float(columns[3].findall('span')[-1].tail.strip()[:-3])
                # Each new-pk3 row has a 'time' element holding the release date
                release_date = columns[0].find('time').text
                # End-of-collection condition: passed the requested release date
                if final_date and release_date < final_date:
                    collecting = False
                    break
                print("{} collected".format(pk3_name))
                # Initialize the pk3 data structure
                pk3_data[pk3_name] = dict()
                pk3_data[pk3_name]['release_date'] = release_date
                pk3_data[pk3_name]['size'] = pk3_size
                pk3_data[pk3_name]['maps'] = list()
            # Each map bsp can be taken from the link href;
            # [5:-1] strips the leading /map/ and the trailing /
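            # e.g. an href of '/map/somemap/' yields 'somemap'
            # (the map name here is hypothetical)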
            current_map = {}
            current_map['bsp'] = columns[1].find('a').attrib['href'][5:-1]
            # Get the mod name; if there is no 'a', there is no mod
            mod_cell = columns[4].find('a')
            if mod_cell is not None:
                current_map['mod'] = mod_cell.find('img').attrib['alt']
            # Get the gametypes
            current_map['gametypes'] = [c.attrib['title'] for c in columns[5].findall('a')]
            # TODO columns[6] = weapons
            # TODO columns[7] = items
            # TODO columns[8] = functions
            pk3_data[pk3_name]['maps'].append(current_map)
        # Increment the page number
        current_page += 1
        # Don't DoS pan :)
        time.sleep(2)
    return 0
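

# collect_pk3_data() fills pk3_data with entries shaped like this
# (the pk3/map names and values here are hypothetical):
#   pk3_data['somepk3'] = {
#       'release_date': '2019-01-01',
#       'size': 12.34,
#       'maps': [{'bsp': 'somemap', 'mod': 'somemod', 'gametypes': ['Defrag']}],
#   }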


def process_arguments(argv):
    parser = argparse.ArgumentParser(description="DeFRaG map scraper")
    parser.add_argument('-d', '--date',
                        metavar='ISO_DATE',
                        dest='date',
                        help="Stop scraping once we reach this release date (exclusive), formatted as ISO 8601 (ex: 2019-01-01)")
    parser.add_argument('-p', '--pk3',
                        metavar='PK3_NAME',
                        dest='pk3',
                        help="Stop scraping once we reach this pk3 name")
    parser.add_argument('-m', '--max',
                        metavar='MAX_PK3S',
                        dest='max',
                        type=int,
                        help="Stop scraping once we reach this number of pk3s")
    parser.add_argument('-o', '--output_directory',
                        metavar='DIRECTORY_NAME',
                        dest='directory',
                        required=True,
                        help="The directory to save the downloaded pk3s to")
    return parser.parse_args(argv[1:])
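

# Example invocations (the directory and pk3 names here are hypothetical):
#   python scraper.py -o maps                    # scrape every pk3
#   python scraper.py -o maps -d 2019-01-01      # stop at this release date
#   python scraper.py -o maps -p somepk3 -m 100  # stop at this pk3 or 100 pk3s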


def download_pk3s(pk3_data, directory):
    # Create the output directory if it doesn't exist
    if not os.path.exists(directory):
        os.mkdir(directory)
    # Make sure the output path is a directory
    if not os.path.isdir(directory):
        print("The file {} is not a directory".format(directory), file=sys.stderr)
        return 1
    for pk3_name in pk3_data.keys():
        url = WS_URL_PK3_TEMPLATE.format(pk3_name)
        output_filename = '{}.pk3'.format(pk3_name)
        path = os.path.join(directory, output_filename)
        print("Downloading {}...".format(pk3_name), end='', flush=True)
        # closing() (and its import) can be dropped once requests >= 2.18 is
        # guaranteed, where Response is already a context manager
        try:
            with closing(requests.get(url,
                                      stream=True,
                                      headers={'User-agent': 'defrag-server-scraper'})) as data:
                data.raise_for_status()
                with open(path, 'wb') as file_descriptor:
                    shutil.copyfileobj(data.raw, file_descriptor)
            print("DONE!")
        except requests.exceptions.HTTPError:
            print("FAIL!")
            # Although continuing here is an option, returning keeps automated
            # download scripts (which can retry) from silently losing maps
            return 1
        # Wait a little bit before starting the next file
        time.sleep(1)
    return 0


def main(argv):
    args = process_arguments(argv)
    # Gather the pk3 data from worldspawn
    pk3_data = OrderedDict()
    rc = collect_pk3_data(pk3_data, final_date=args.date, final_pk3=args.pk3, count=args.max)
    if rc != 0:
        return rc
    # Download the pk3s from worldspawn
    return download_pk3s(pk3_data, args.directory)


if __name__ == '__main__':
    sys.exit(main(sys.argv))

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4