Merge pull request #642 from flathunters/bugfix/immowelt-new-website
Fix immowelt crawler
codders authored Dec 8, 2024
2 parents 57cc225 + a39a738 commit ac7b37c
Showing 2 changed files with 35 additions and 27 deletions.
58 changes: 33 additions & 25 deletions flathunter/crawler/immowelt.py
@@ -49,61 +49,70 @@ def get_expose_details(self, expose):
     def extract_data(self, soup: BeautifulSoup):
         """Extracts all exposes from a provided Soup object"""
         entries = []
-        soup_res = soup.find("main")
+        soup_res = soup
         if not isinstance(soup_res, Tag):
             return []
 
-        title_elements = soup_res.find_all("h2")
-        expose_ids = soup_res.find_all("a", id=True)
+        advertisements = soup_res.find_all("div", attrs={"class": "css-79elbk"})
+        for adv in advertisements:
+            try:
+                title = adv.find("div", {"class": "css-1cbj9xw"}).text
+            except:
+                title = ""
 
-        for idx, title_el in enumerate(title_elements):
             try:
-                price = expose_ids[idx].find(
-                    "div", attrs={"data-test": "price"}).text
-            except IndexError:
+                price = adv.find(
+                    "div", attrs={"data-testid": "cardmfe-price-testid"}).text
+            except:
                 price = ""
 
             try:
-                size = expose_ids[idx].find(
-                    "div", attrs={"data-test": "area"}).text
+                descriptions = adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children
+                descriptions = [result.text for result in descriptions]
+            except:
+                descriptions = []
+
+            size = list(filter(lambda x: "m²" in x, descriptions))
+            try:
+                size = size[0]
             except IndexError:
                 size = ""
 
+            rooms = list(filter(lambda x: "Zimmer" in x, descriptions))
             try:
-                rooms = expose_ids[idx].find(
-                    "div", attrs={"data-test": "rooms"}).text.replace(" Zi.", "")
+                rooms = rooms[0]
             except IndexError:
                 rooms = ""
 
+            id_element = adv.find("a")
             try:
-                url = expose_ids[idx].get("href")
+                url = id_element.get("href")
+                if "https" not in url:
+                    url = "https://immowelt.de/" + url
             except IndexError:
                 continue
 
-            picture = expose_ids[idx].find("picture")
+            picture = adv.find("img")
             image = None
             if picture:
-                src = picture.find("source")
-                if src:
-                    image = src.get("data-srcset")
+                image = picture.get('src')
 
             try:
-                address = expose_ids[idx].find(
-                    "div", attrs={"class": re.compile("IconFact.*")}
-                )
-                address = address.find("span").text
+                address = adv.find(
+                    "div", attrs={"data-testid": "cardmfe-description-box-address"}
+                ).text
             except (IndexError, AttributeError):
                 address = ""
 
+            ad_id = url.split('/')[-1]
             processed_id = int(
-                hashlib.sha256(expose_ids[idx].get("id").encode('utf-8')).hexdigest(), 16
+                hashlib.sha256(ad_id.encode('utf-8')).hexdigest(), 16
             ) % 10**16
 
             details = {
                 'id': processed_id,
                 'image': image,
                 'url': url,
-                'title': title_el.text.strip(),
+                'title': title.strip(),
                 'rooms': rooms,
                 'price': price,
                 'size': size,
@@ -113,5 +122,4 @@ def extract_data(self, soup: BeautifulSoup):
             entries.append(details)
 
         logger.debug('Number of entries found: %d', len(entries))
-
-        return entries
+        return entries
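For reference, a minimal standalone sketch of the extraction logic the new selectors rely on. The class names (css-79elbk, css-1cbj9xw) and data-testid attributes are taken from the diff, but SAMPLE_HTML is invented for illustration and is not a capture of immowelt.de:

# Standalone sketch (not part of the commit): exercises the same selectors the
# updated extract_data() uses against an invented advertisement card.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<main>
  <div class="css-79elbk">
    <a href="expose/abc123">
      <img src="https://example.com/photo.jpg">
      <div class="css-1cbj9xw">Helle 2-Zimmer-Wohnung in Mitte</div>
      <div data-testid="cardmfe-price-testid">850 €</div>
      <div data-testid="cardmfe-keyfacts-testid"><div>2 Zimmer</div><div>54 m²</div><div>2. OG</div></div>
      <div data-testid="cardmfe-description-box-address">10115 Berlin, Mitte</div>
    </a>
  </div>
</main>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
for adv in soup.find_all("div", attrs={"class": "css-79elbk"}):
    title = adv.find("div", {"class": "css-1cbj9xw"}).text
    price = adv.find("div", attrs={"data-testid": "cardmfe-price-testid"}).text
    # The key-facts container yields one cell per fact; size and rooms are picked out by keyword.
    facts = [el.text for el in adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children]
    size = next((f for f in facts if "m²" in f), "")
    rooms = next((f for f in facts if "Zimmer" in f), "")
    url = adv.find("a").get("href")
    if "https" not in url:
        url = "https://immowelt.de/" + url
    print(title, "|", price, "|", size, "|", rooms, "|", url)

As in the committed code, the listing id is no longer read from an id attribute; it is derived by hashing the last path segment of the expose URL.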
4 changes: 2 additions & 2 deletions test/crawler/test_crawl_immowelt.py
@@ -6,10 +6,10 @@
 
 DUMMY_CONFIG = """
 urls:
-  - https://www.immowelt.de/liste/muenchen/wohnungen/mieten?roomi=2&primi=600&prima=1000
+  - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=House,Apartment&locations=AD08DE8634&order=Default&m=homepage_new_search_classified_search_result
 """
 
-TEST_URL = 'https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc'
+TEST_URL = 'https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=House,Apartment&locations=AD08DE8634&order=Default&m=homepage_new_search_classified_search_result'
 
 @pytest.fixture
 def crawler():
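The new /classified-search URL format is plain query parameters, so a search URL can also be assembled programmatically. The helper below is hypothetical (it is not part of flathunter); the parameter names and the location id AD08DE8634 come from the test config above, and the trailing m= tracking parameter is omitted:

# Hypothetical helper, not part of flathunter: builds a URL in the new
# /classified-search format using the parameters seen in DUMMY_CONFIG.
from urllib.parse import urlencode

def build_search_url(location_id: str, estate_types=("House", "Apartment")) -> str:
    params = {
        "distributionTypes": "Rent",
        "estateTypes": ",".join(estate_types),
        "locations": location_id,
        "order": "Default",
    }
    # keep commas unescaped, matching the URL used in the test fixture
    return "https://www.immowelt.de/classified-search?" + urlencode(params, safe=",")

print(build_search_url("AD08DE8634"))
# https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=House,Apartment&locations=AD08DE8634&order=Default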
