generate_regions_data.py
"""Collect region and sub-region listing URLs from the itp.ne.jp yellow pages
and write one JSON file per region under the `regions/` directory."""
import json
import os
from glob import glob

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString

from core_scraper import change_ip

YELLOW_PAGE_URL = 'https://itp.ne.jp'
OUTPUT_DIR = 'regions'


def get_soup(url):
    # Fetch a page and parse it; a failed request or non-200 status raises and is handled by the caller.
    response = requests.get(url)
    assert response.status_code == 200
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


def get_soup_vpn(url):
    # Retry once behind a fresh IP (via core_scraper.change_ip) before giving up.
    try:
        return get_soup(url)
    except Exception:
        change_ip()
        try:
            return get_soup(url)
        except Exception:
            print('ERROR COULD NOT FETCH THIS URL {0}'.format(url))
            return None


def get_sub_sub_region(sub_region_url):
    soup = get_soup_vpn(sub_region_url)
    if soup is None:
        return []
    refine_block = [b for b in soup.find_all('div', {'class': 'refineBlock'}) if 'address_narrowing' in str(b)]
    if len(refine_block) == 0:
        return []
    refine_block = refine_block[0]
    links = [a.attrs['href'] for a in refine_block.find_all('a')]
    links = list(filter(lambda x: x.startswith('/'), links))
    links = [YELLOW_PAGE_URL + link for link in links]
    return links


def get_sub_regions(region_url):
    # level_1: links taken from the region map; level_2: address-narrowing links found on each level_1 page.
    soup = get_soup_vpn(region_url)
    resp = dict()
    if soup is None:
        return resp
    all_links_1 = [a.attrs['href'] for a in soup.find('section', {'class': 'Japamap'}).find_all('a')]
    all_links_1 = sorted(list(filter(lambda x: 'http' in x, all_links_1)))
    all_links_2 = []
    for link in all_links_1:
        print('-> {} [from the map]'.format(link))
        links_found = get_sub_sub_region(link)
        # print('Found links {}'.format(links_found))
        all_links_2.extend(links_found)
    resp['level_1'] = all_links_1
    resp['level_2'] = sorted(list(set(all_links_2)))
    print('Found {0} links for level 1.'.format(len(all_links_1)))
    print('Found {0} links for level 2.'.format(len(all_links_2)))
    return resp


def main():
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    # Regions that already have a JSON file are skipped, so the script can be re-run after an interruption.
    json_regions = [v.split('/')[1].split('.')[0] for v in glob(OUTPUT_DIR + '/*.json')]
    soup = get_soup_vpn(YELLOW_PAGE_URL)
    regions = soup.find('div', {'class': 'txt-table'}).find_all('a')
    for region in regions:
        print('-' * 80)
        if not isinstance(region, NavigableString):
            region_name = str(region.contents[0])
            output = {}
            if region_name not in json_regions:
                url = YELLOW_PAGE_URL + str(region.attrs['href'])
                print(region_name, url)
                sub_regions = get_sub_regions(region_url=url)
                output[region_name] = dict()
                output[region_name]['url'] = url
                output[region_name]['sub_region'] = sub_regions
                with open(OUTPUT_DIR + '/{}.json'.format(region_name), 'wb') as w:
                    w.write(json.dumps(output, ensure_ascii=False, indent=4).encode('utf8'))
            else:
                print('REGION ALREADY FETCHED = {0}'.format(region_name))


if __name__ == '__main__':
    main()