# scrape_poe_maps.py

#! python3
"""
# scrape_poe_maps.py - scrapes poe map data via the API and all the individual map articles, see http://pathofexile.gamepedia.com/Map for example.
"""

import requests, bs4, re, datetime, time, json, os
from bs4 import NavigableString
from multiprocessing.dummy import Pool as ThreadPool

# Directory containing this script; the data files read and written below are located relative to it.
SCRIPTDIR = os.path.dirname(os.path.abspath(__file__))

# Root of the wiki; map article URLs are built by appending the page name to it.
base_url = 'http://pathofexile.gamepedia.com'

# Matches numeric tokens: "(a word b)" pairs, percentages, "a-b" ranges and plain signed numbers.
# NOTE(review): not referenced by any code visible in this file - confirm before removing.
rx_search = re.compile(r'\+*\(([\d\.]+)\s[a-z]+\s([\d\.]+)[)]|(\+*[\d\.]+\%)|([\d\.]+-[\d\.]+)|(\([\d\.]+-[\d\.]+)\s\w+\s([\d\.]+-[\d\.]+\))|(-?\+?[\d\.]+)')
# Case-insensitive "yield(s) one" / "produce(s) one"; only referenced from the disabled find_vendor_recipe helper.
vendor_regex = re.compile('yields? one|produces? one', re.IGNORECASE)
# Case-insensitive "Map type"; only referenced from the disabled find_setting helper.
maptype_regex = re.compile('Map type', re.IGNORECASE)
# A trailing " (letters)" group at the end of a name; clean_up_api_results strips it as the league suffix.
# Only letters are allowed inside the parentheses, so a suffix containing spaces or digits is left in place.
league_suffix_regex = re.compile(r' \([a-z]+\)$', re.IGNORECASE)


def write_file_headers():
	"""
	Build the comment banner and the initial object declarations for MapList.txt.

	The banner records the moment of generation. Some entries end in a newline of
	their own, in addition to whatever line terminator the writer adds.

	:return: list of str, one entry per output row
	"""
	timestamp = datetime.datetime.now().strftime('%Y-%m-%d at %H:%M:%S')
	generated_by = '; This file was auto-generated by scrape_poe_maps.py on {}'.format(timestamp)

	return [
		'; Data from http://pathofexile.gamepedia.com',
		'; Comments can be made with ";", blank lines will be ignored.',
		';',
		generated_by + '\n',
		'mapList := Object()',
		'mapList["Unknown Map"] := "Map not recognised or not supported"\n',
		'uniqueMapList := Object()',
		'uniqueMapList["Unknown Map"] := "Map not recognised or not supported"\n',
	]
	
	
def clean_up_api_results(api_results):
	"""
	Convert raw API rows into the map dicts used by the rest of the script.

	Every returned dict carries three keys:
	'name'   - the page name with the part matched by league_suffix_regex removed
	'url'    - base_url plus the name, with spaces replaced by underscores
	'unique' - False when the raw page name contains ' Map (', True otherwise

	:param api_results: iterable of rows holding the page name under ['title']['name']
	:return: list of dict
	"""
	cleaned = []
	for row in api_results:
		page_name = row['title']['name']
		name = league_suffix_regex.sub('', page_name)
		cleaned.append({
			'name': name,
			'url': base_url + '/' + name.replace(' ', '_'),
			# Unique maps don't have the "Map" before the league suffix in the query data
			'unique': ' Map (' not in page_name,
		})

	return cleaned
	
	
def get_api_results():
	"""
	Fetch the map page names from the wiki's cargoquery API (JSON format).

	See this HTML version to get a better idea how the API response is structured:
	https://pathofexile.gamepedia.com/api.php?action=cargoquery&format=json&limit=500&tables=atlas_maps&fields=_pageName=name&formatversion=1

	:return: list of map dicts as produced by clean_up_api_results()
	:raises requests.HTTPError: if the API answers with a 4xx/5xx status
	"""
	
	print('Getting data for maps')
	r = requests.get('https://pathofexile.gamepedia.com/api.php?action=cargoquery&format=json&limit=500&tables=atlas_maps&fields=_pageName=name&formatversion=1')
	# Fail loudly on an HTTP error instead of trying to decode an error page as JSON
	# (parse_map_data performs the same check for the article requests).
	r.raise_for_status()
	api_results = r.json()['cargoquery']
	
	return clean_up_api_results(api_results)


def get_wiki_data():
	"""
	Collect the map entries from the wiki API.

	Prints an empty line once the API call has finished.

	:return: list containing every entry returned by get_api_results()
	"""
	maps = list(get_api_results())
	print('')
	return maps
	
	
def parse_map_data(map_info):
	"""
	Download the wiki article of one map and extract its data.

	:param map_info: dict with the article address under 'url'; handed on to build_data
	:return: the value build_data() produces for the parsed page
	:raises requests.HTTPError: if the article request answers with a 4xx/5xx status
	"""
	response = requests.get(map_info['url'])
	response.raise_for_status()
	parsed_page = bs4.BeautifulSoup(response.text, 'html.parser')
	return build_data(parsed_page, map_info)

# NOTE: the three helpers below are disabled - they sit inside a bare string literal,
# so Python evaluates the string and discards it without defining any function.
# They are kept for reference only.
"""
def find_divcards(div):
	for h2 in div.find_all('h2'):
		if h2.find('span', id='Divination_cards'):
			return h2
	return None
	
def find_vendor_recipe(div):
	try:
		for yields in div.find_all(text=vendor_regex):
			return yields.next_sibling.findNext('a').findNext('a').text
	except:
		return None
	return None
	

def find_setting(div):
	try:
		for maptype in div.find_all(text=maptype_regex):
			return maptype.parent.next_sibling.replace(':', '').strip()
	except:
		return None
	return None
"""


def _section_item_names(page, heading_id, span_class):
	"""
	Return the text of every matching <span> in the first table after a heading.

	:param page: parsed page offering find(id=...)
	:param heading_id: id attribute of the section heading to look for
	:param span_class: class attribute value the item spans must have
	:return: list of str in document order; empty if the heading or its table is missing
	"""
	heading = page.find(id=heading_id)
	if heading is None:
		return []
	
	table = heading.find_next('table')
	if table is None:
		# The heading exists but no table follows it: treat the section as empty
		# instead of failing on None.find_all.
		return []
	
	return [span.text for span in table.find_all('span', class_=span_class)]


def _without_duplicates(names):
	"""Return the names with repeats removed, keeping the first occurrence of each."""
	unique = []
	for name in names:
		if name not in unique:
			unique.append(name)
	return unique


def build_data(data, mapinfo):
	"""
	Parse the data of one map from its wiki page.

	A copy of mapinfo is extended with three lists:
	'divcards'   - item names from the table after the 'Items_found_in_this_area' heading
	'producedby' - distinct item names from the table after the 'Upgrade_paths' heading
	'upgradesto' - distinct item names from the table after the 'Usage_in_upgrade_paths'
	               heading, leaving out the map's own name
	A section whose heading or table is missing yields an empty list.

	:param data: parsed page (BeautifulSoup object) of the map article
	:param mapinfo: dict with at least 'name'; it is not modified
	:return: dict
	"""
	map_data = dict(mapinfo)
	print('Getting data for {}'.format(map_data['name']))
	
	# Divination cards are listed as found on the page; repeats are kept.
	map_data['divcards'] = _section_item_names(data, 'Items_found_in_this_area', 'divicard-header')
	
	map_data['producedby'] = _without_duplicates(
		_section_item_names(data, 'Upgrade_paths', 'header -single'))
	
	# The table also contains the map itself, which is not an upgrade target.
	map_data['upgradesto'] = _without_duplicates(
		name
		for name in _section_item_names(data, 'Usage_in_upgrade_paths', 'header -single')
		if name != map_data['name'])
	
	return map_data


def _strip_leading_ahk_newlines(text):
	"""
	Remove every leading AHK newline escape (`n) from text.

	str.lstrip('`n') is not suitable here: it strips any run of the characters
	'`' and 'n', so it would also eat the first letters of a text starting with "n".
	"""
	while text.startswith('`n'):
		text = text[2:]
	return text


def _describe_map(mymap, map_descriptions):
	"""
	Build the AHK description text of a single map.

	:param mymap: map dict with 'name', 'unique', 'producedby', 'upgradesto' and 'divcards'
	:param map_descriptions: dict with the prewritten texts under 'maps' and 'uniqueMaps'
	:return: str using `n as line separator, without leading line breaks
	"""
	line = ''
	
	# Vendor recipes and connected maps only apply to non-unique maps
	if not mymap['unique']:
		produced_by = ', '.join(mymap['producedby']) if mymap['producedby'] else 'none'
		upgrades_to = ', '.join(mymap['upgradesto']) if mymap['upgradesto'] else 'none'
		line += '3 to 1 vendor recipe:'
		line += '`n Produced by: ' + produced_by
		line += '`n Upgrades to: ' + upgrades_to
	
	if mymap['divcards']:
		line += '`n`nDivination cards:'
		line += ''.join('`n ' + divcard for divcard in mymap['divcards'])
	
	# Insert the prewritten text description for the maps that have one
	descriptions = map_descriptions['uniqueMaps' if mymap['unique'] else 'maps']
	if mymap['name'] in descriptions:
		line += '`n`n' + descriptions[mymap['name']]
	
	# A unique map has no vendor block, so its text may start with separators
	return _strip_leading_ahk_newlines(line)


def convert_data_to_AHK_readable_format(all_data):
	"""
	Convert the scraped map data into lines readable by the Poe_item_info AHK script.

	Reads MapDescriptions.json (prewritten descriptions) and MapNameFromBase.txt
	(inserted verbatim) from the script directory. The result starts with the
	mapMatchList declaration and the MapNameFromBase.txt content, followed by one
	mapList / uniqueMapList assignment per map in the order of all_data.

	Tier/level/tileset, shaped-map and unique-version lines are not emitted; the
	code for them was already commented out in the previous version of this function.

	:param all_data: list of map dicts as returned by build_data
	:return: list of str
	"""
	
	# Read the descriptions of various maps
	with open(SCRIPTDIR + '\\MapDescriptions.json', 'r') as f:
		map_descriptions = json.load(f)
	
	with open(SCRIPTDIR + '\\MapNameFromBase.txt', 'r') as f:
		uniqueMapNameFromBase = f.read()
	
	# sorted by descending name length to avoid mismatching ("Spider Lair Map" before "Lair Map" etc.)
	matchList = sorted(
		(mymap['name'] for mymap in all_data if not mymap['unique']),
		key=len, reverse=True)
	
	new_data = [
		'mapMatchList := ["' + '","'.join(matchList) + '"]\n',
		'\n' + uniqueMapNameFromBase + '\n',
	]
	
	for mymap in all_data:
		target = 'uniqueMapList' if mymap['unique'] else 'mapList'
		line = _describe_map(mymap, map_descriptions)
		new_data.append(target + '["' + mymap['name'] + '"] := "' + line + '"\n')
	
	return new_data


def write(new_data):
	"""
	Append rows to MapList.txt in the script directory.

	Each row is encoded as cp1252 and followed by a newline byte. The file is
	closed even when encoding or writing a row fails.

	:param new_data: iterable of str
	:raises UnicodeEncodeError: if a row contains a character cp1252 cannot represent
	"""
	with open(SCRIPTDIR + '\\MapList.txt', 'a+b') as out:
		for row in new_data:
			out.write(row.encode('cp1252'))
			out.write(b'\n')


def main():
	"""
	Run the whole scrape: fetch the map list, reset MapList.txt, write the headers,
	download every map article with 4 worker threads and append the converted data.
	"""
	maps = get_wiki_data()
	
	# create file (or overwrite it if it exists)
	with open(SCRIPTDIR + '\\MapList.txt', 'w'):
		pass
	write(write_file_headers())
	
	workers = ThreadPool(4)
	scraped = sorted(workers.map(parse_map_data, maps), key=lambda entry: entry['name'])
	write(convert_data_to_AHK_readable_format(scraped))
	workers.close()
	workers.join()


if __name__ == '__main__':
	# Only scrape when executed as a script, so that importing this module
	# does not trigger network requests or rewrite MapList.txt.
	startTime = datetime.datetime.now()
	main()
	print('Program execution time: ',(datetime.datetime.now() - startTime))