diff --git a/flathunter/crawler/kleinanzeigen.py b/flathunter/crawler/kleinanzeigen.py index c1134af9..49c69e86 100644 --- a/flathunter/crawler/kleinanzeigen.py +++ b/flathunter/crawler/kleinanzeigen.py @@ -43,21 +43,22 @@ def extract_data(self, soup): entries = [] soup = soup.find(id="srchrslt-adtable") - try: - title_elements = soup.find_all(lambda e: e.has_attr('class') - and 'ellipsis' in e['class']) - except AttributeError: - return entries + exposes = soup.find_all("article", class_="aditem") + for expose in exposes: - expose_ids = soup.find_all("article", class_="aditem") + title_elem = expose.find(class_="ellipsis") + if title_elem.get("href"): + url = title_elem.get("href") + else: + # If there is no title element, just continue since we can't provide an URL + continue - for idx, title_el in enumerate(title_elements): try: - price = expose_ids[idx].find( + price = expose.find( class_="aditem-main--middle--price-shipping--price").text.strip() - tags = expose_ids[idx].find_all(class_="simpletag") - address = expose_ids[idx].find("div", {"class": "aditem-main--top--left"}) - image_element = expose_ids[idx].find("div", {"class": "galleryimage-element"}) + tags = expose.find_all(class_="simpletag") + address = expose.find("div", {"class": "aditem-main--top--left"}) + image_element = expose.find("div", {"class": "galleryimage-element"}) except AttributeError as error: logger.warning("Unable to process eBay expose: %s", str(error)) continue @@ -73,19 +74,20 @@ def extract_data(self, soup): rooms = "" if len(tags) > 1: - rooms_match = re.match(r'(\d+)', tags[1].text) + rooms_match = re.search(r'\d+[.|,]*\d*', tags[1].text, flags=re.MULTILINE) if rooms_match is not None: - rooms = rooms_match[1] + rooms = rooms_match.group() try: - size = tags[0].text + size = tags[0].text.strip() except (IndexError, TypeError): size = "" + details = { - 'id': int(expose_ids[idx].get("data-adid")), + 'id': int(expose.get("data-adid")), 'image': image, - 'url': ("https://www.kleinanzeigen.de" + title_el.get("href")), - 'title': title_el.text.strip(), + 'url': ("https://www.kleinanzeigen.de" + url), + 'title': title_elem.text.strip(), 'price': price, 'size': size, 'rooms': rooms,