-
Notifications
You must be signed in to change notification settings - Fork 0
/
resources_crawler.py
executable file
·125 lines (86 loc) · 3.03 KB
/
resources_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python3
'''Given car url, download its resources'''
__author__ = "Abdurrahman Ghanem"
import sys
import os
from lxml import html
import requests
import json
import datetime
def extract_info(url):
    '''Fetch a car ad page and extract its mileage, model, price and phone.

    Parameters:
        url: the car ad page URL (assumed to follow the site's layout —
             the xpath expressions below are site-specific).

    Returns:
        dict with keys "mileage", "model", "price" and, when a tel: link
        is present on the page, "tel" (prefixed with the Qatar code +974).

    Raises:
        IndexError: if the page does not contain the expected elements.
    '''
    page = requests.get(url)
    tree = html.fromstring(page.content)
    mileage = tree.xpath("(//div[contains(@class,'h-carBestDetails') and contains(@class,'clearfix')]/div[@class='item'])[2]/text()")
    mileage = mileage[0].strip()
    model = tree.xpath('//h2[@class="h-carName"]/text()')
    model = model[0].strip()
    price = tree.xpath('//span[@class="h-carPrice"]/text()')
    price = price[0].strip()
    # Strip only the leading "tel:" scheme; str.replace would also mangle a
    # number that happened to contain the substring "tel:" elsewhere.
    tel_nums = [href[len("tel:"):]
                for href in tree.xpath('//a/@href')
                if href.startswith("tel:")]
    info_dict = {
        "mileage": mileage,
        # Drop the "Used" marker in both English and Arabic variants.
        "model": model.replace("Used", "").replace("مستعملة", "").strip(),
        "price": price.replace("QAR", "").strip(),
    }
    if tel_nums:
        info_dict["tel"] = "+974 " + tel_nums[0]
    print(info_dict)
    return info_dict
def save_info(info_dict, res_dir):
    '''Serialize info_dict as UTF-8 JSON to <res_dir>/data.json.

    Parameters:
        info_dict: JSON-serializable mapping with the extracted ad info.
        res_dir:   existing output directory for this ad.
    '''
    # os.path.join avoids the hard-coded "/" separator of the original
    # string concatenation; ensure_ascii=False keeps Arabic text readable.
    path = os.path.join(res_dir, "data.json")
    with open(path, "w", encoding='utf-8') as json_file:
        json.dump(info_dict, json_file, ensure_ascii=False)
def extract_imgs(url):
    '''Fetch a car ad page and return its full-size image URLs.

    Skips the first matched image (presumably a banner/logo rather than a
    car photo — kept from the original logic; confirm against the site)
    and rewrites thumbnail paths ("thumb/") to the original-size variant
    ("o_").

    Returns:
        list of image URL strings (possibly empty).
    '''
    page = requests.get(url)
    tree = html.fromstring(page.content)
    imgs = tree.xpath('//img[@class="img-responsive"]/@src')
    # Slice instead of `del imgs[0]` so an empty xpath result does not
    # raise IndexError.
    fixed_img_urls = [img.replace("thumb/", "o_") for img in imgs[1:]]
    print(fixed_img_urls)
    return fixed_img_urls
def download_imgs(img_urls, out_dir):
    '''Download each URL in img_urls into out_dir as 1.jpg, 2.jpg, ...

    Parameters:
        img_urls: iterable of image URLs to fetch.
        out_dir:  target directory; created (with parents) if missing.
    '''
    # exist_ok=True already handles the pre-existing case; the original
    # os.path.exists() pre-check was redundant (and race-prone).
    os.makedirs(out_dir, exist_ok=True)
    for counter, url in enumerate(img_urls, start=1):
        print("downloading image " + url)
        # `with` guarantees the handle is closed even if the request raises;
        # the original leaked the open file on any exception.
        with open(os.path.join(out_dir, str(counter) + ".jpg"), 'wb') as f:
            f.write(requests.get(url).content)
def download_url(url):
    '''Download all resources (images + bilingual info JSON) for a car ad.

    The output directory is YYYY/MM/DD/<name>, where <name> is the
    second-to-last path segment of the URL (ignoring a trailing slash) —
    kept from the original logic. If the directory already exists the
    download is skipped entirely.

    Parameters:
        url: the ad URL; must contain "/en/" or "/ar/" so both language
             variants can be fetched.

    Returns:
        the output directory path (str).

    Raises:
        ValueError: if the URL contains neither "/en/" nor "/ar/".
    '''
    # Second-to-last non-empty segment names the ad; negative indexing
    # replaces the original len(...)-k arithmetic.
    parts = url.rstrip("/").split("/")
    out_dir = parts[-2]
    out_dir = datetime.datetime.now().strftime("%Y/%m/%d/") + out_dir
    if os.path.exists(out_dir):
        # Already downloaded today — skip.
        return out_dir
    imgs = extract_imgs(url)
    download_imgs(imgs, out_dir)
    if "/en/" in url:
        page_info_en = extract_info(url)
        page_info_ar = extract_info(url.replace("/en/", "/ar/"))
    elif "/ar/" in url:
        page_info_ar = extract_info(url)
        page_info_en = extract_info(url.replace("/ar/", "/en/"))
    else:
        # The original fell through here and hit a NameError below;
        # fail fast with a clear message instead.
        raise ValueError("url must contain '/en/' or '/ar/': " + url)
    page_info = {"ar": page_info_ar, "en": page_info_en}
    save_info(page_info, out_dir)
    return out_dir
##MAIN##
if __name__ == "__main__":
    # CLI entry point: the single required argument is the car ad URL.
    if len(sys.argv) >= 2:
        download_url(sys.argv[1])
    else:
        print("You must enter the resources directory")
        sys.exit(1)