Skip to content

Commit

Permalink
Fix parsing (#6)
Browse files Browse the repository at this point in the history
* Download test files with a script

* Fix frontpage parsing

* Fix article comment parsing

* Add mocks
  • Loading branch information
timotk committed Jul 29, 2023
1 parent 380f416 commit fbc163b
Show file tree
Hide file tree
Showing 12 changed files with 13,062 additions and 7 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ classifiers=[
]

[tool.poetry.dependencies]
python = ">=3.6"
python = ">=3.7"
requests-html = "^0.10.0"
dateparser = "^0.7.0"
dateparser = "1.1.8"
tenacity = "^6.2.0"

[tool.poetry.dev-dependencies]
Expand Down
23 changes: 23 additions & 0 deletions scripts/download_test_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
from pathlib import Path

from tweakers.utils import get

# Pages to snapshot as HTML test fixtures. Each key becomes the file name
# ``tests/pages/<key>.html``.
URLS = {
    "topic": "https://gathering.tweakers.net/forum/list_messages/1908208",
    "frontpage": "https://tweakers.net/",
    "article": "https://tweakers.net/nieuws/212172/whatsapp-introduceert-functie-voor-korte-videoberichten.html",
    "user": "https://tweakers.net/gallery/1/",
    "active_topics": "https://gathering.tweakers.net/forum/list_activetopics",
    "find": "https://tweakers.net/forum/find?keyword=playstation",
}


for name, url in URLS.items():
    print("Downloading", name)
    html = get(url).text
    # This marker text is treated as a rate-limit notice; abort rather than
    # store it as a fixture.
    if "Sorry, je gaat even iets te snel" in html:
        raise Exception("Still rate limited???")
    path = Path(f"tests/pages/{name}.html")
    path.parent.mkdir(parents=True, exist_ok=True)
    # Write with an explicit encoding so the stored bytes do not depend on the
    # platform's locale default.
    path.write_text(html, encoding="utf-8")
25 changes: 25 additions & 0 deletions tests/mocks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pytest
from requests_html import HTML, HTMLResponse, HTMLSession


def mock_get(url: str):
    """Return an ``HTMLResponse`` whose HTML comes from a stored page.

    The fixture is chosen by substring checks on *url* (or an exact match for
    the front page) and read from ``tests/pages/<case>.html``, relative to the
    current working directory.

    Args:
        url: The URL the code under test asked for.

    Returns:
        An ``HTMLResponse`` with its ``_html`` attribute pre-populated from
        the fixture file.

    Raises:
        NotImplementedError: If *url* matches none of the known page kinds.
    """
    if "/nieuws/" in url:
        case = "article"
    elif "/list_messages/" in url:
        case = "topic"
    elif "/gallery/" in url:
        case = "user"
    elif "/list_activetopics" in url:
        case = "active_topics"
    elif "forum/find" in url:
        case = "find"
    elif url in ("https://tweakers.net/", "https://tweakers.net"):
        case = "frontpage"
    else:
        raise NotImplementedError(f"Url {url} not implemented")

    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which can differ between machines.
    with open(f"tests/pages/{case}.html", encoding="utf-8") as f:
        content = f.read()

    # NOTE(review): the HTMLSession class itself (not an instance) is passed
    # as the session, exactly as before; confirm whether an instance is needed.
    response = HTMLResponse(session=HTMLSession)
    response._html = HTML(html=content)
    return response

363 changes: 363 additions & 0 deletions tests/pages/active_topics.html

Large diffs are not rendered by default.

2,319 changes: 2,319 additions & 0 deletions tests/pages/article.html

Large diffs are not rendered by default.

4,308 changes: 4,308 additions & 0 deletions tests/pages/find.html

Large diffs are not rendered by default.

4,378 changes: 4,378 additions & 0 deletions tests/pages/frontpage.html

Large diffs are not rendered by default.

609 changes: 609 additions & 0 deletions tests/pages/topic.html

Large diffs are not rendered by default.

1,022 changes: 1,022 additions & 0 deletions tests/pages/user.html

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions tests/test_frontpage.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
from unittest import mock
from tests.mocks import mock_get
from tweakers import frontpage


@mock.patch("tweakers.frontpage.get", mock_get)
def test_articles():
    """``frontpage.articles()`` is non-empty with ``tweakers.frontpage.get`` replaced by ``mock_get``."""
    assert len(frontpage.articles()) > 0


@mock.patch("tweakers.frontpage.get", mock_get)
def test_article_comments():
article = frontpage.Article(
url="https://tweakers.net/nieuws/148534/amd-maakt-meer-winst-dankzij-goede-verkopen-van-ryzen-cpus.html"
Expand Down
4 changes: 4 additions & 0 deletions tests/test_gathering.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
from unittest import mock
from tests.mocks import mock_get
from tweakers import gathering


@mock.patch("tweakers.gathering.get", mock_get)
def test_active_topics():
    """``gathering.active_topics()`` is non-empty with ``tweakers.gathering.get`` replaced by ``mock_get``."""
    topics = gathering.active_topics()
    assert len(topics) > 0


@mock.patch("tweakers.gathering.get", mock_get)
def test_search():
    """``gathering.search("tweakers")`` is non-empty with ``tweakers.gathering.get`` replaced by ``mock_get``."""
    results = gathering.search("tweakers")
    assert len(results) > 0
10 changes: 5 additions & 5 deletions tweakers/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,12 @@ def topic_comments(html: Union[HTML, str]) -> Generator[dict, None, None]:


def frontpage_articles(html: HTML) -> Generator[dict, None, None]:
for tr in html.find("tr.headline.news"):
for tr in html.find(".headline"):
topic: Dict = {
"title": tr.find(".title a", first=True).text,
"url": tr.find(".title a", first=True).attrs["href"],
"title": tr.text.strip(),
"url": tr.find("a.headline--anchor", first=True).attrs["href"],
"comment_count": get_comment_count(tr),
"publication_time": tr.find(".publicationTime", first=True).text,
"publication_time": tr.find(".headline--time", first=True).text,
}
yield topic

Expand All @@ -102,7 +102,7 @@ def article_comments(html: HTML) -> Generator[dict, None, None]:
_get_text(div, selector="a.date"), languages=["nl"]
),
"text": _get_text(div, selector=".reactieContent"),
"score": int(_get_text(div, selector="a.scoreButton")),
"score": _get_text(div, selector="a.scoreButton"),
}
yield comment

Expand Down

0 comments on commit fbc163b

Please sign in to comment.