Merge pull request #642 from flathunters/bugfix/immowelt-new-website
Fix immowelt crawler
codders authored Dec 8, 2024
2 parents 57cc225 + a39a738 commit ac7b37c
Showing 2 changed files with 35 additions and 27 deletions.
58 changes: 33 additions & 25 deletions flathunter/crawler/immowelt.py
@@ -49,61 +49,70 @@ def get_expose_details(self, expose):
     def extract_data(self, soup: BeautifulSoup):
         """Extracts all exposes from a provided Soup object"""
         entries = []
-        soup_res = soup.find("main")
+        soup_res = soup
         if not isinstance(soup_res, Tag):
             return []
 
-        title_elements = soup_res.find_all("h2")
-        expose_ids = soup_res.find_all("a", id=True)
+        advertisements = soup_res.find_all("div", attrs={"class": "css-79elbk"})
+        for adv in advertisements:
+            try:
+                title = adv.find("div", {"class": "css-1cbj9xw"}).text
+            except:
+                title = ""
 
-        for idx, title_el in enumerate(title_elements):
             try:
-                price = expose_ids[idx].find(
-                    "div", attrs={"data-test": "price"}).text
-            except IndexError:
+                price = adv.find(
+                    "div", attrs={"data-testid": "cardmfe-price-testid"}).text
+            except:
                 price = ""
 
             try:
-                size = expose_ids[idx].find(
-                    "div", attrs={"data-test": "area"}).text
+                descriptions = adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children
+                descriptions = [result.text for result in descriptions]
+            except:
+                descriptions = []
+
+            size = list(filter(lambda x: "m²" in x, descriptions))
+            try:
+                size = size[0]
             except IndexError:
                 size = ""
 
+            rooms = list(filter(lambda x: "Zimmer" in x, descriptions))
             try:
-                rooms = expose_ids[idx].find(
-                    "div", attrs={"data-test": "rooms"}).text.replace(" Zi.", "")
+                rooms = rooms[0]
             except IndexError:
                 rooms = ""
 
+            id_element = adv.find("a")
             try:
-                url = expose_ids[idx].get("href")
+                url = id_element.get("href")
+                if "https" not in url:
+                    url = "https://immowelt.de/" + url
             except IndexError:
                 continue
 
-            picture = expose_ids[idx].find("picture")
+            picture = adv.find("img")
             image = None
             if picture:
-                src = picture.find("source")
-                if src:
-                    image = src.get("data-srcset")
+                image = picture.get('src')
 
             try:
-                address = expose_ids[idx].find(
-                    "div", attrs={"class": re.compile("IconFact.*")}
-                )
-                address = address.find("span").text
+                address = adv.find(
+                    "div", attrs={"data-testid": "cardmfe-description-box-address"}
+                ).text
             except (IndexError, AttributeError):
                 address = ""
 
+            ad_id = url.split('/')[-1]
             processed_id = int(
-                hashlib.sha256(expose_ids[idx].get("id").encode('utf-8')).hexdigest(), 16
+                hashlib.sha256(ad_id.encode('utf-8')).hexdigest(), 16
             ) % 10**16
 
             details = {
                 'id': processed_id,
                 'image': image,
                 'url': url,
-                'title': title_el.text.strip(),
+                'title': title.strip(),
                 'rooms': rooms,
                 'price': price,
                 'size': size,
@@ -113,5 +122,4 @@ def extract_data(self, soup: BeautifulSoup):
             entries.append(details)
 
         logger.debug('Number of entries found: %d', len(entries))
-
-        return entries
+        return entries
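For reference, a minimal standalone sketch of the extraction logic the new selectors rely on. The class names (css-79elbk, css-1cbj9xw) and data-testid attributes are taken from the diff, but SAMPLE_HTML is invented for illustration and is not a capture of immowelt.de:

# Standalone sketch (not part of the commit): exercises the same selectors the
# updated extract_data() uses against an invented advertisement card.
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<main>
  <div class="css-79elbk">
    <a href="expose/abc123">
      <img src="https://example.com/photo.jpg">
      <div class="css-1cbj9xw">Helle 2-Zimmer-Wohnung in Mitte</div>
      <div data-testid="cardmfe-price-testid">850 €</div>
      <div data-testid="cardmfe-keyfacts-testid"><div>2 Zimmer</div><div>54 m²</div><div>2. OG</div></div>
      <div data-testid="cardmfe-description-box-address">10115 Berlin, Mitte</div>
    </a>
  </div>
</main>
"""

soup = BeautifulSoup(SAMPLE_HTML, "html.parser")
for adv in soup.find_all("div", attrs={"class": "css-79elbk"}):
    title = adv.find("div", {"class": "css-1cbj9xw"}).text
    price = adv.find("div", attrs={"data-testid": "cardmfe-price-testid"}).text
    # The key-facts container yields one cell per fact; size and rooms are picked out by keyword.
    facts = [el.text for el in adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children]
    size = next((f for f in facts if "m²" in f), "")
    rooms = next((f for f in facts if "Zimmer" in f), "")
    url = adv.find("a").get("href")
    if "https" not in url:
        url = "https://immowelt.de/" + url
    print(title, "|", price, "|", size, "|", rooms, "|", url)

As in the committed code, the listing id is no longer read from an id attribute; it is derived by hashing the last path segment of the expose URL.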
4 changes: 2 additions & 2 deletions test/crawler/test_crawl_immowelt.py
@@ -6,10 +6,10 @@
 
 DUMMY_CONFIG = """
 urls:
-  - https://www.immowelt.de/liste/muenchen/wohnungen/mieten?roomi=2&primi=600&prima=1000
+  - https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=House,Apartment&locations=AD08DE8634&order=Default&m=homepage_new_search_classified_search_result
 """
 
-TEST_URL = 'https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc'
+TEST_URL = 'https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=House,Apartment&locations=AD08DE8634&order=Default&m=homepage_new_search_classified_search_result'
 
 @pytest.fixture
 def crawler():
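The new /classified-search URL format is plain query parameters, so a search URL can also be assembled programmatically. The helper below is hypothetical (it is not part of flathunter); the parameter names and the location id AD08DE8634 come from the test config above, and the trailing m= tracking parameter is omitted:

# Hypothetical helper, not part of flathunter: builds a URL in the new
# /classified-search format using the parameters seen in DUMMY_CONFIG.
from urllib.parse import urlencode

def build_search_url(location_id: str, estate_types=("House", "Apartment")) -> str:
    params = {
        "distributionTypes": "Rent",
        "estateTypes": ",".join(estate_types),
        "locations": location_id,
        "order": "Default",
    }
    # keep commas unescaped, matching the URL used in the test fixture
    return "https://www.immowelt.de/classified-search?" + urlencode(params, safe=",")

print(build_search_url("AD08DE8634"))
# https://www.immowelt.de/classified-search?distributionTypes=Rent&estateTypes=House,Apartment&locations=AD08DE8634&order=Default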
