json_downloader.py (forked from EL-S/DanMurphysDataCollector)
from tornado import ioloop, httpclient
import requests
import xml.etree.ElementTree as ET
import os
# TODO: optimise speed and add an option to fix invalid data
# TODO: create a server running this that constantly tracks products and price changes
# TODO: store data in a database that the website can query (PHP or JS, API/widget)
# TODO: rate limit and comply with robots.txt (see the throttle sketch below)
# TODO: fix timeouts and 404s
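# A minimal throttle sketch for the rate-limit TODO above. The 1-second
# minimum interval is an assumption, not a value read from robots.txt; a
# compliant version would parse Crawl-delay from the file fetched in
# get_product_ids(). Defined here but not yet wired into the request path.
import time

MIN_INTERVAL = 1.0    # assumed seconds between requests
_last_request = 0.0

def throttle():
    # sleep just long enough to keep at least MIN_INTERVAL between requests
    global _last_request
    wait = MIN_INTERVAL - (time.monotonic() - _last_request)
    if wait > 0:
        time.sleep(wait)
    _last_request = time.monotonic()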
file_name = "data.json"
directory = "data/"
i = 0                 # requests still in flight
product_ids = []
threads = 2           # max concurrent connections for the async client
first = True          # True until the first JSON chunk is written
last = False          # True once the final chunk is being written
total = 0             # total number of product ids queued
counter = 1           # index used in each "ProductsN" key
http_client = None    # shared AsyncHTTPClient, created in get_info()
def init():
    # make sure the output directory exists and start each run with an empty
    # data file; appending to a previous run's file would produce invalid JSON
    global file_name, directory
    if not os.path.isdir(directory):
        os.mkdir(directory)
    with open(directory + file_name, "w") as file:
        file.write("")
def get_product_ids():
    # find the product sitemap URL advertised in robots.txt,
    # then pull every product id out of it
    global product_ids, total
    url = "https://www.danmurphys.com.au/robots.txt"
    headers = {"User-Agent": "Mozilla/5.0"}
    robots_text = requests.get(url, headers=headers).text.strip().split("\n")
    product_sitemap_url = None
    for line in robots_text:
        if "Sitemap:" in line and "Products" in line:
            product_sitemap_url = line.split("Sitemap:")[1].strip()
            break
    print(product_sitemap_url)
    product_sitemap = requests.get(product_sitemap_url, headers=headers).text
    print("SiteMap FileSize:", len(product_sitemap.encode('utf-8')))
    with open("product_sitemap.xml", "w") as file:
        file.write(product_sitemap)
    root = ET.fromstring(product_sitemap)
    product_ids = []
    for child in root:
        # each <url> entry's first child is its <loc>; product locations
        # contain a .../DM_<id>/... segment second from the end
        if "product" in child[0].text:
            product_id = child[0].text.split("/")[-2]
            product_ids.append(product_id.split("DM_")[-1])
    total = len(product_ids)
    print("Products:", total)
    with open("product_ids.txt", "w") as file:
        file.write("\n".join(product_ids))
def get_info():
    # queue an async fetch for every product id; max_clients caps concurrency
    global product_ids, threads, http_client, i
    http_client = httpclient.AsyncHTTPClient(
        force_instance=True,
        defaults=dict(user_agent="Mozilla/5.0"),
        max_clients=threads)
    for product_id_new in product_ids:
        i += 1
        url = "https://api.danmurphys.com.au/apis/ui/Product/" + str(product_id_new)
        # note: fetch() callbacks were removed in Tornado 6, so this script
        # needs Tornado 5.x; timeouts are in seconds (the original 10000
        # looks like it was meant as milliseconds)
        http_client.fetch(url.strip(), handle_request_index, method='GET',
                          connect_timeout=10, request_timeout=10)
    ioloop.IOLoop.current().start()
def save_to_json(html, url):
    global file_name, directory, first, last, counter
    # wrap the response body as one "ProductsN" entry of the larger JSON file
    # (renamed from `json` so it no longer shadows the stdlib module)
    chunk = "\"Products" + str(counter) + "\":[" + html + "]"
    full_path = directory + file_name
    if i == 0:
        last = True
    with open(full_path, "a", encoding="utf-8") as data_json:
        if first:
            data_json.write("{" + chunk)
            first = False
            if last:
                data_json.write("}")    # single-product run: open and close at once
        elif last:
            data_json.write("," + chunk + "}")
        else:
            data_json.write("," + chunk)
    counter += 1
def handle_request_index(response):
    global total, i
    if response.code == 599:
        # 599 is tornado's client-side timeout; retry the same URL
        # (the original called an undefined handle_request here)
        print(response.effective_url)
        http_client.fetch(response.effective_url.strip(), handle_request_index,
                          method='GET', connect_timeout=10, request_timeout=10)
    else:
        i -= 1
        html = response.body.decode('utf-8')
        url = response.effective_url
        time = response.request_time
        percent = 100 - ((i / total) * 100)
        percent_str = '{:.2f}'.format(round(percent, 2)) + "%"
        print(percent_str, url, time)
        save_to_json(html, url)
        if i == 0:
            # every queued request has completed, so stop the event loop
            ioloop.IOLoop.current().stop()
if __name__ == "__main__":
    init()
    get_product_ids()
    get_info()
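# Quick sanity check of the output (illustrative, assuming each API response
# body is itself valid JSON so the concatenated file parses):
#   import json
#   with open("data/data.json", encoding="utf-8") as f:
#       data = json.load(f)    # {"Products1": [...], "Products2": [...], ...}
#   print(len(data), "chunks")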