### DO BEFORE RUNNING THIS SCRIPT!
### in cmd, run "pip install selenium beautifulsoup4 pandas" to install Selenium, BeautifulSoup 4 and pandas
### (BeautifulSoup details: https://www.tutorialspoint.com/beautiful_soup/beautiful_soup_installation.htm)
### change the ChromeDriver path in the "EXTRACT" sections below if yours is different
### in cmd, run "pip install azure-storage-blob" to install the Azure Blob Storage client library for Python
### in cmd, run: setx AZURE_STORAGE_CONNECTION_STRING "<yourconnectionstring>" to set the environment variable
### (details: https://docs.microsoft.com/en-us/azure/storage/blobs/storage-quickstart-blobs-python)
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
import pandas as pd
from azure.storage.blob import BlobServiceClient
import os
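# Optional sanity check (an addition, not part of the original workflow): warn early if the
# Azure connection string is missing, since the upload step at the very end depends on it.
if not os.getenv("AZURE_STORAGE_CONNECTION_STRING"):
    print("Warning: AZURE_STORAGE_CONNECTION_STRING is not set; the Azure upload step will fail.")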
#--------EXTRACT FROM QASA, CREATE QASA.CSV--------#
browser = webdriver.Chrome(service = Service("/Users/artc/Desktop/chromedriver.exe")) # change the chromedriver.exe path if yours is different
browser.get("https://en.qasa.se/p2/en/find-home/sweden?searchAreas[]=Solna%20kommun%3B%3B398040")
time.sleep(10) # time to load
try: # ACCEPT COOKIES to access other buttons
    browser.find_element(By.XPATH, "//button[@class='sc-hKgILt hPMLcZ general-cookie-consent__AcceptButton-mls226-1 ejXQzy']").click()
    print("Agreed to cookies.")
except Exception: # no cookie banner found, so nothing to accept
    print("No cookie banner to accept.")
can_load_more = True # click the "LOAD MORE" button until all listings are shown and there is nothing left to load
while can_load_more:
    try: # find and click "Load more", then give the page time to load
        browser.find_element(By.XPATH, "//button[@class='sc-hKgILt hPMLcZ pagination__StyledButton-sc-17c2n48-2 kFYVIO']").click()
        time.sleep(4)
    except Exception: # "Load more" disappears once all listings are loaded, so end this loop
        print("All listings loaded.")
        time.sleep(4)
        can_load_more = False
names = []
urls = []
rooms = []
areas = []
monthly_prices_sek = []
content = browser.page_source
browser.quit()
soup = BeautifulSoup(content, "html.parser")
key_info_soup = soup.find_all("div", attrs = {"class": None, "id": None, "style":None}) # find all <div>s with no attributes
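# The listing cards on the rendered page sit in plain, attribute-less <div>s, hence the
# class/id/style = None filter above; stray attribute-less <div>s that are not listings
# are skipped by the try/except in the loop below.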
for n in range(len(key_info_soup)):
    try: # some <div>s with no attributes are not listings, so try-except
        # extract listing name
        name = key_info_soup[n].find("div", attrs = {"class": "text__Text-sc-1ttkyfd-0 hBZlMU"}).text.strip()
        names.append(name)
        # extract listing url
        url_soup = key_info_soup[n].find("a")
        urls.append("https://en.qasa.se" + url_soup["href"]) # href in <a> is relative, so prepend the domain
        # extract listing rooms and area (split on "•"; field 1 = rooms, field 2 = area)
        info_header = key_info_soup[n].find("div", attrs = {"class": "text__Text-sc-1ttkyfd-0 IFApQ"}).text.strip()
        key_info = info_header.split("•")
        if key_info[1].endswith("rooms"): # room info reads either "1 room" or "N rooms"
            rooms_int = float(key_info[1].replace(" rooms", ""))
        else:
            rooms_int = float(key_info[1].replace(" room", ""))
        area = int(key_info[2].replace("m²", ""))
        rooms.append(rooms_int)
        areas.append(area)
        # extract listing price; assumes text like "SEK 5,500": drop the currency prefix and the thousands comma
        price_soup = key_info_soup[n].find("div", attrs = {"class": "text__Text-sc-1ttkyfd-0 home-item__Rent-sc-1tbgb3o-9 fijJmN hPnVyJ"}).text.strip()
        price = int(price_soup[4:].replace(",", ""))
        monthly_prices_sek.append(price)
    except Exception as ex:
        print("Not a listing <div>. " + str(ex))
qasa_dict = {
    "name": names,
    "url": urls,
    "rooms": rooms,
    "area": areas,
    "monthly_price_sek": monthly_prices_sek
}
qasa_df = pd.DataFrame(qasa_dict, columns = ["name", "url", "rooms", "area", "monthly_price_sek"])
qasa_df.index.name = "id"
qasa_df.index += 1 # start ids at 1 instead of 0
# qasa.csv might already exist, so check that it is ok to overwrite
input("If qasa.csv already exists, it will be overwritten. Press Enter to proceed, Ctrl + C to terminate.")
qasa_df.to_csv("qasa.csv")
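# qasa.csv now holds one row per listing with columns: id, name, url, rooms, area, monthly_price_sek.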
#--------EXTRACT FROM BLOCKET BOSTAD, CREATE BLOCKET.CSV--------#
browser = webdriver.Chrome(service = Service("/Users/artc/Desktop/chromedriver.exe")) # change the chromedriver.exe path if yours is different
browser.get("https://bostad.blocket.se/p2/sv/find-home/Stockholm%20Solna/l%C3%A4genhet?homeTypes[]=apartment&searchAreas[]=Stockholm%20Solna%3BStockholm%20Solna%3BStockholm%20Solna")
time.sleep(10) # time to load
try: # ACCEPT COOKIES to access listings
    browser.find_element(By.XPATH, "//button[@class='sc-hKgILt esLDzC blocket-cookie-consent___StyledButton-fscird-8 epUktU']").click()
    print("Agreed to cookies.")
except Exception: # no cookie banner found, so nothing to accept
    print("No cookie banner to accept.")
can_load_more = True # click the "LOAD MORE" button until all listings are shown and there is nothing left to load
while can_load_more:
    try: # find and click "Load more", then give the page time to load
        browser.find_element(By.XPATH, "//button[@class='sc-hKgILt evzPbN pagination__StyledButton-sc-17c2n48-2 kFYVIO']").click()
        time.sleep(4)
    except Exception: # "Load more" disappears once all listings are loaded, so end this loop
        print("All listings loaded.")
        time.sleep(4)
        can_load_more = False
names = []
urls = []
rooms = []
areas = []
monthly_prices_sek = []
content = browser.page_source
browser.quit()
soup = BeautifulSoup(content, "html.parser")
key_info_soup = soup.find_all("div", attrs = {"class": None, "id": None, "style":None}) # find all <div>s with no attributes
for n in range(len(key_info_soup)):
    try: # some <div>s with no attributes are not listings, so try-except
        # extract listing name
        name = key_info_soup[n].find("div", attrs = {"class": "text__Text-sc-1ttkyfd-0 iOhQqm"}).text.strip()
        names.append(name)
        # extract listing url
        url_soup = key_info_soup[n].find("a")
        urls.append("https://bostad.blocket.se/" + url_soup["href"]) # href in <a> is relative, so prepend the domain
        # extract listing rooms and area (split on "•"; field 1 = rooms, field 2 = area)
        info_header = key_info_soup[n].find("div", attrs = {"class": "text__Text-sc-1ttkyfd-0 IFApQ"}).text.strip()
        key_info = info_header.split("•")
        rooms_int = float(key_info[1].replace(" rum", "")) # Swedish page, so "rum" rather than "room(s)"
        area = int(key_info[2].replace("m²", ""))
        rooms.append(rooms_int)
        areas.append(area)
        # extract listing price; splices the digits around the thousands separator, assuming text like "5 500 kr"
        price_soup = key_info_soup[n].find("div", attrs = {"class": "text__Text-sc-1ttkyfd-0 home-item__Rent-sc-1tbgb3o-9 fijJmN hPnVyJ"}).text.strip()
        price = int(price_soup[:-7] + price_soup[-6:-3])
        monthly_prices_sek.append(price)
    except Exception as ex:
        print("Not a listing <div>. " + str(ex))
blocket_dict = {
    "name": names,
    "url": urls,
    "rooms": rooms,
    "area": areas,
    "monthly_price_sek": monthly_prices_sek
}
blocket_df = pd.DataFrame(blocket_dict, columns = ["name", "url", "rooms", "area", "monthly_price_sek"])
blocket_df.index.name = "id"
blocket_df.index += 1 # start ids at 1 instead of 0
# blocket.csv might already exist, so check that it is ok to overwrite
input("If blocket.csv already exists, it will be overwritten. Press Enter to proceed, Ctrl + C to terminate.")
blocket_df.to_csv("blocket.csv")
#--------STORE QASA.CSV AND BLOCKET.CSV AS AZURE BLOBS--------#
try:
    connect_str = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
    blob_service_client = BlobServiceClient.from_connection_string(connect_str)
    to_upload = ["qasa.csv", "blocket.csv"] # list of .csvs to upload
    for csv in to_upload:
        # assumes a container named "solna-rent" already exists in the storage account
        blob_client = blob_service_client.get_blob_client(container = "solna-rent", blob = csv)
        with open("./" + csv, "rb") as data:
            blob_client.upload_blob(data, overwrite = True)
except Exception as ex:
    print("Exception: " + str(ex))