# webscraping.py
# forked from arsentievalex/otodom-scraping
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import math
import pandas as pd
import os
def write_to_excel(file, df, sheet_name):
    # append to the workbook if it already exists, otherwise create a new file
    if os.path.exists(file):
        with pd.ExcelWriter(file, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
            df.to_excel(writer, index=False, sheet_name=sheet_name)
    else:
        df.to_excel(file, index=False, sheet_name=sheet_name)
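# Example call for the helper above (file and sheet names are illustrative only;
# the script below writes its output with a plain df.to_excel instead):
# write_to_excel('otodom_listings.xlsx', df, sheet_name='warszawa secondary')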
# declare the search URL and start the Chrome driver
website = 'https://www.otodom.pl/pl/oferty/sprzedaz/mieszkanie/warszawa?&market=SECONDARY&&limit=72'
path = 'chromedriver.exe'
driver = webdriver.Chrome(path)
driver.maximize_window()
driver.get(website)
time.sleep(2)
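# Note: webdriver.Chrome(path) is the Selenium 3-style constructor. A sketch of the
# Selenium 4+ equivalent, if the newer library is installed:
# from selenium.webdriver.chrome.service import Service
# driver = webdriver.Chrome(service=Service(path))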
# click the accept-cookies button
cookies_button = driver.find_element(by=By.XPATH, value='//button[@id="onetrust-accept-btn-handler"]')
cookies_button.click()
time.sleep(2)
# get the total number of results (digit groups are joined in case the count uses a thousands separator)
total = driver.find_element(by=By.XPATH, value='//strong[@data-cy="search.listing-panel.label.ads-number"]')
total_num = int(''.join(re.findall(r'\d+', total.text)))
# calculate the number of result pages (72 listings per page, rounded up so the last partial page is kept)
num_of_pages = math.ceil(total_num / 72)
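# e.g. 1234 listings -> math.ceil(1234 / 72) = 18 pages, whereas round() would give 17 and skip the last page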
# initialise empty lists to collect the scraped data
location = []
prices = []
m2_price = []
rooms = []
m2 = []
urls = []
# scrape one results page, then click through to the next one
def scrape():
    # scroll to the bottom of the page so all results are loaded
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    # the class-based selectors below depend on otodom's generated CSS class names and may break when the site changes
    loc = driver.find_elements(by=By.XPATH, value='//p[@class="css-14aokuk e1ualqfi4"]')
    details = driver.find_elements(by=By.XPATH, value='//span[@class="css-1on0450 ei6hyam2"]')
    links = driver.find_elements(by=By.XPATH, value='//a[@data-cy="listing-item-link"]')
    # collect the locations
    for t in loc:
        location.append(t.text)
    # the details come in groups of four per listing: price, price per m2, rooms, area
    x = 0
    while x + 3 < len(details):
        prices.append(details[x].text)
        m2_price.append(details[x + 1].text)
        rooms.append(details[x + 2].text)
        m2.append(details[x + 3].text)
        x += 4
    # collect the listing URLs
    for l in links:
        urls.append(l.get_attribute('href'))
    # click the next-page button
    next_button = driver.find_element(by=By.XPATH, value='//button[@data-cy="pagination.next-page"]')
    driver.execute_script("arguments[0].click();", next_button)
    time.sleep(5)
# run the scraper on each results page
for i in range(1, num_of_pages + 1):
    print('Working on page ' + str(i) + ' of ' + str(num_of_pages))
    try:
        scrape()
    except Exception as e:
        # log the error and carry on with the next page
        print(e)
# convert the lists into a dataframe
# (if a selector missed items on some page, the lists can differ in length and pd.DataFrame will raise a ValueError)
data = {'Location': location, 'Price': prices, 'Price per m2': m2_price, 'Rooms': rooms, 'm2': m2, 'URL': urls}
df = pd.DataFrame(data)
# write the dataframe to an excel file
df.to_excel('data July 23.xlsx', index=False)
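# Optional sanity check before closing the browser (illustrative only):
# print(len(df), 'listings scraped')
# print(df.head())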
# close the browser
driver.quit()