Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove characters other than digits, commas (,) and periods (.) when getting price for Amazon #258

Merged
merged 1 commit into from
Jan 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions scraper/domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,9 +254,8 @@ def _get_product_name(self) -> str:
return self.request_data.find("span", id="productTitle").text.strip()

def _get_product_price(self) -> float:
    """Return the product price read from the page's ``a-price`` element.

    The text of the first nested ``<span>`` inside ``span.a-price`` is
    stripped of everything except digits and periods, then converted
    with ``float``.

    Raises:
        ValueError: if the remaining text is not a valid float literal.
    """
    # NOTE(review): presumably ``find`` yields None when the element is
    # missing, which would surface here as an AttributeError - confirm.
    price_text = self.request_data.find("span", class_="a-price").span.text
    # get_number_string drops currency symbols/codes and whitespace but keeps
    # commas, which float() cannot parse, so they are removed afterwards.
    # NOTE(review): commas are presumably thousands separators; a price using
    # a decimal comma would be misread - confirm against the target locales.
    return float(get_number_string(price_text).replace(",", ""))

def _get_product_currency(self) -> str:
regex_pattern = "%22currencyCode%22%3A%22(.{3})%22"
Expand Down Expand Up @@ -548,6 +547,13 @@ def get_website_handler(url: str) -> BaseWebsiteHandler:
return website_handler(url)


def get_number_string(value: str) -> str:
    """Strip every character from *value* except digits, commas and periods."""
    return re.sub(r"[^\d.,]+", "", value)


SUPPORTED_DOMAINS: dict[str, BaseWebsiteHandler] = {
"komplett": KomplettHandler,
"proshop": ProshopHandler,
Expand Down
23 changes: 22 additions & 1 deletion tests/test_domains.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from dataclasses import dataclass
import pytest

from scraper.domains import get_website_name
from scraper.domains import get_website_name, get_number_string


@dataclass
Expand Down Expand Up @@ -42,3 +42,24 @@ def test_get_website_name(url: str, setting: UrlSetting, expected: str) -> None:
keep_subdomain=setting.keep_subdomain,
)
assert result == expected


# (input, expected) pairs fed to test_get_number_string: each raw price text
# is paired with the string get_number_string should return for it, i.e. the
# input with currency symbols, currency codes and spaces removed while the
# digits, commas and periods are left untouched.
test_price_values = [
("USD 12.40", "12.40"),
("$234.00", "234.00"),
("£345.37", "345.37"),
("486,89 kr", "486,89"),
("$345.37", "345.37"),
("£1345.37", "1345.37"),
("1345,37 DKK", "1345,37"),
("1345.37 DKK", "1345.37"),
("USD 1345.37", "1345.37"),
("USD 10345.37", "10345.37"),
]


@pytest.mark.parametrize("value,expected", test_price_values)
def test_get_number_string(value: str, expected: str) -> None:
    """Check that get_number_string turns each sample into its expected text."""
    assert get_number_string(value) == expected
8 changes: 4 additions & 4 deletions tests/test_objects.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
"expected_currency": "DKK"
},
"amazon": {
"link": "https://www.amazon.com/Sony-WH-1000XM5-Canceling-Headphones-Hands-Free/dp/B09XS7JWHH",
"expected_title": "Sony WH-1000XM5 The Best Wireless Noise Canceling Headphones with Auto Noise Canceling Optimizer, Crystal Clear Hands-Free Calling, and Alexa Voice Control, Black",
"expected_id": "B09XS7JWHH",
"expected_currency": "USD"
"link": "https://www.amazon.de/-/en/Google-Pixel-Pro-Smartphone-Obsidian/dp/B0DG9DD9VN",
"expected_title": "Google Pixel 9 Pro (512GB, Obsi, EU / UK) + Pixel 9/9 Pro Case, Obsidian",
"expected_id": "B0DG9DD9VN",
"expected_currency": "EUR"
},
"ebay_with_itm": {
"link": "https://www.ebay.com/itm/265771092654",
Expand Down
Loading