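'''Scrape item data (sell prices, crafting ingredients) from
starbounder.org, the Starbound wiki, using asyncio, the MediaWiki API
for category listings, and BeautifulSoup for page parsing.'''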
import re
import asyncio as aio
from yarl import URL
# yarl seems to have a bug in quoting, so use urllib's quote instead
from urllib.parse import quote, unquote
from bs4 import BeautifulSoup, SoupStrainer

base = 'http://starbounder.org'
api = base + '/mediawiki/api.php'
crops_page = '/Crops'
category = '/Category:'
category_food = category + 'Food'
food_categories = ['Food', 'Prepared_Food', 'Drink', 'Cooking Ingredient']
crafts_categories = ['Crafting materials', 'Craftables']
ignore_categories = ['Removed', 'Disabled']
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:51.0) '
                         'Gecko/20100101 Firefox/51.0'}

redirected = {}  # requested url -> canonical url the wiki served instead
cache = {}       # url -> Future fetching and parsing that page's item


def get_real_url(soup, u):
    # The main-namespace tab ('ca-nstab-main') links to the page's canonical
    # url; record the mapping so redirects can be resolved without refetching.
    _u = unquote(soup.find(id='ca-nstab-main').find('a')['href'])
    redirected[u] = _u
    return _u


def should_ignore(soup):
    # True if the page belongs to a category of removed or disabled content.
    ul = soup.find(id='mw-normal-catlinks').ul
    for li in ul.find_all('li', recursive=False):
        if li.a.string.strip() in ignore_categories:
            return True
    return False


async def get_soup(session, url, parse_only=None):
    await aio.sleep(0.1)  # crude rate limiting between requests
    async with session.get(URL(base + quote(url), encoded=True),
                           headers=headers) as res:
        content = await res.text()
    return BeautifulSoup(content, 'html.parser', parse_only=parse_only)


async def url_soup(session, url, parse_only=None):
    soup = await get_soup(session, url, parse_only)
    print("GET %s" % url)
    return soup, get_real_url(soup, url)


def get_price(infobox):
    # The sell price sits next to the pixel icon, whose markup has several
    # variants across pages; try each known variant in turn.
    pixel_a = infobox.find('a', href='/File:Pixels-Sell.png')
    if pixel_a:
        return int(pixel_a.parent.get_text())
    pixel_img = infobox.find('img', alt='Pixels-Sell.png')
    if pixel_img:
        return int(pixel_img.parent.get_text())
    pixel_a = infobox.find('a', href='/Pixel')
    return int(pixel_a.next_sibling.string)


async def _get_item(session, url):
    soup, url = await url_soup(session, url)
    if should_ignore(soup):
        return None, url
    item = {'name': url[1:], 'url': url}
    infobox = soup.find('div', class_='infoboxwrapper')
    item['price'] = get_price(infobox)
    ing_text_div = infobox.find('div', text='INGREDIENTS')
    if ing_text_div:
        ingredients = {}
        for div in ing_text_div.next_siblings:
            if not div.name:
                continue  # skip bare text nodes between ingredient rows
            a = div.a
            n = a.parent.next_sibling.div.string
            ingredients[a['href']] = int(n)
        item['ingredients'] = ingredients
    return item, url


async def get_item(session, url):
    # Since asyncio is single threaded, only await statements can be
    # interrupted, so the check-then-set on the cache below is race-free.
    url = unquote(url)
    url = redirected.get(url, url)  # url may be a known redirect
    f = cache.get(url)
    if not f:
        f = aio.ensure_future(_get_item(session, url))
        cache[url] = f
    it, url = await f  # _get_item returns the canonical (direct) url
    _f = cache.get(url)  # entry under the direct url
    # If no direct entry exists, register this future under the direct url;
    # the redirected map is never consulted when a direct entry exists.
    # If another future already owns the direct entry, defer to its result.
    if not _f:
        cache[url] = f  # the direct url must not have been queried yet
    elif _f is not f:
        it, _ = await _f
    return it


async def get_category_urls(session, category, limit=50):
    # Page through the category via the MediaWiki categorymembers API,
    # yielding one wiki path ('/Title_With_Underscores') per member.
    params = {'action': 'query', 'list': 'categorymembers', 'continue': '',
              'format': 'json', 'cmprop': 'title', 'cmlimit': str(limit),
              'cmtitle': 'Category:' + category}
    while True:
        async with session.get(api, params=params) as res:
            data = await res.json()
        print("LIST %s" % category)
        for member in data['query']['categorymembers']:
            title = member['title']
            yield '/' + title.replace(' ', '_')
        cont = data.get('continue')
        if not cont:
            break
        params['cmcontinue'] = cont['cmcontinue']
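

# NOTE: get_items_in_category is called by get_items_all_categories below but
# is not defined anywhere in this file. A minimal sketch, assuming each
# category member should simply be fetched through get_item and that ignored
# pages (for which get_item returns None) are dropped:
async def get_items_in_category(session, category):
    urls = [u async for u in get_category_urls(session, category)]
    items = await aio.gather(*[get_item(session, u) for u in urls])
    return {it['url']: it for it in items if it}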


async def get_items_all_categories(session, categories):
    # Wrap the coroutines in tasks; modern asyncio.wait no longer accepts
    # bare coroutines or a loop argument.
    tasks = [aio.ensure_future(get_items_in_category(session, c))
             for c in categories]
    done, _ = await aio.wait(tasks)
    res = {}
    for d in done:
        res.update(await d)
    return res


async def get_crops_time(session, url=crops_page):
    '''
    This is currently unused.
    Return a dictionary from crop url to harvest time in minutes.
    '''
    harvest_re = re.compile(r"^\s*Harvest:\s*([0-9]+)\s*minutes\s*", re.I)
    soup, _ = await url_soup(session, url)
    tag = soup.find(id='Crop_Images_and_Average_Growth_Times')
    # The id marks the section heading; walk forward to the table it heads.
    while tag.name != 'table':
        tag = tag.next_element
    times = {}
    for table in tag.find_all('table', class_='gametable'):
        a = table.find('a')
        p = table.find('p')
        times[a['href']] = harvest_re.match(p.string).group(1)
    return times
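

# NOTE: get_food_all_categories and get_redirect are used by gen_food_data
# below but are not defined anywhere in this file. The two sketches here are
# assumptions built only from helpers that do exist above, not the original
# implementations.
async def get_food_all_categories(session):
    # Presumably all items across the food categories.
    return await get_items_all_categories(session, food_categories)


async def get_redirect(session, url):
    # Resolve a redirect by fetching the page and reading its canonical url
    # (url_soup records and returns it); None means the page did not redirect.
    soup, real = await url_soup(session, url)
    return real if real != url else None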


async def gen_food_data(session):
    '''
    This is currently unused.
    Get a list of all kinds of food.
    Try to get simple ingredients, since ingredients may be compound.
    Return a dictionary from food url to food dictionary.
    '''
    food, time = await aio.gather(get_food_all_categories(session),
                                  get_crops_time(session))
    for u, t in time.items():
        try:
            f = food[u]
        except KeyError as e:
            # The crops page may link to a redirect; resolve it and retry.
            u = await get_redirect(session, u)
            if not u:
                raise e
            f = food[u]
        f['time'] = t
    return food
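

if __name__ == '__main__':
    # Minimal usage sketch; the original file has no entry point. Assumes
    # aiohttp, whose ClientSession provides the session interface used above
    # (session.get(...) as an async context manager).
    import aiohttp

    async def main():
        async with aiohttp.ClientSession() as session:
            items = await get_items_all_categories(session, food_categories)
            for u, item in sorted(items.items()):
                print(u, item.get('price'))

    aio.run(main())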