Commit 2c3de28 (1 parent: eb3b841)

test(scraper): refactor tests for new result format and grace_period_seconds support

2 files changed: +77 -70 lines changed
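
The refactored tests assert against a dict result instead of the old "Title:" / "Markdown Content:" prefixed string. A minimal sketch of the shape they expect, inferred from the assertions in this diff (key names come from the tests; the example values are illustrative only):

    # Result shape the refactored tests assert against (keys taken from the
    # assertions below; values here are illustrative, not from the repo).
    result = {
        "title": "Example Domain",            # page title; None on failure
        "markdown_content": "Example ...",    # extracted page body as markdown
        "final_url": "https://example.com/",  # URL after any redirects
        "error": None,                        # error message string on failure
    }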

tests/test_mcp_server.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 async def test_call_tool_with_string_result():
     arguments = {
         "url": "http://example.com",
-        "max_length": 5000,
+        "max_length": None,
         "timeout_seconds": 30,
         "wait_for_network_idle": True
     }
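
The fixture's max_length moves from 5000 to None, which reads as "no truncation by default". A sketch of that interpretation (an assumption; the tool handler itself is not part of this diff):

    # Assumed handling of max_length, not shown in this commit:
    # None returns the full markdown, an int caps its length.
    def apply_max_length(markdown: str, max_length: int | None) -> str:
        if max_length is None:
            return markdown
        return markdown[:max_length]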

tests/test_scraper.py

Lines changed: 76 additions & 69 deletions
@@ -5,6 +5,7 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from src.config import DEFAULT_MIN_SECONDS_BETWEEN_REQUESTS, DEFAULT_TEST_REQUEST_TIMEOUT, DEFAULT_TEST_NO_DELAY_THRESHOLD, DEFAULT_MIN_CONTENT_LENGTH
+import time
 
 from src.scraper import extract_text_from_url, get_domain_from_url, apply_rate_limiting
 
@@ -13,51 +14,58 @@
 async def test_extract_text_from_example_com():
     url = "https://example.com"
     result = await extract_text_from_url(url)
-
-    assert result.startswith("Title:")
-    assert "Example Domain" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    assert "[ERROR]" not in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Example Domain" in (result.get("title") or "") or "Example Domain" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") in [
+        url, url + "/", "https://www.example.com", "https://www.example.com/"]
+    assert not result.get("error")
 
 
 @pytest.mark.asyncio
 async def test_extract_text_from_wikipedia():
     url = "https://en.wikipedia.org/wiki/Web_scraping"
     result = await extract_text_from_url(url)
-
-    assert result.startswith("Title:")
-    assert "Web scraping" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    assert "[ERROR]" not in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Web scraping" in (result.get("title") or "") or "Web scraping" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") == url or result.get(
+        "final_url", "").startswith("https://en.wikipedia.org/wiki/")
+    assert not result.get("error")
 
 
 @pytest.mark.asyncio
 async def test_nonexistent_domain():
     url = "https://nonexistent-domain-for-testing-12345.com/somepage"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "Could not resolve" in result or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "Could not resolve" in result.get(
+        "error") or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_invalid_url_format():
     url = "not-a-valid-url"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "invalid url" in result.lower() or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "invalid url" in result.get(
+        "error").lower() or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_http_404_page():
     url = "https://httpstat.us/404"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "404" in result or "not found" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "404" in result.get(
+        "error") or "not found" in result.get("error").lower()
 
 
 def test_get_domain_from_url():
@@ -99,13 +107,15 @@ async def test_rate_limiting():
 async def test_extract_real_article():
     url = "https://en.wikipedia.org/wiki/Web_scraping"
     result = await extract_text_from_url(url)
-
-    if "[ERROR]" in result:
+    if result.get("error"):
         pytest.skip(f"Extraction failed: {result}")
-    assert result.startswith("Title:")
-    assert "Web scraping" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Web scraping" in (result.get("title") or "") or "Web scraping" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") == url or result.get(
+        "final_url", "").startswith("https://en.wikipedia.org/wiki/")
 
 
 @pytest.mark.asyncio
@@ -133,57 +143,36 @@ async def test_extract_real_article():
 async def test_dynamic_article_extraction(domain_info):
     domain, start_path = domain_info
     start_url = f"https://{domain}{start_path or '/'}"
-
     try:
         resp = requests.get(start_url, timeout=DEFAULT_TEST_REQUEST_TIMEOUT)
         soup = BeautifulSoup(resp.text, "html.parser")
         link = None
-
         for a in soup.find_all("a", href=True):
             href = a["href"]
-
-            if any(
-                    x in href for x in [
-                        "/article",
-                        "/news",
-                        "/story",
-                        "/202",
-                        "/p/"]):
-
+            if any(x in href for x in ["/article", "/news", "/story", "/202", "/p/"]):
                 if href.startswith("/"):
                     link = f"https://{domain}{href}"
-
                 elif href.startswith("http"):
                     link = href
-
                 break
-
         if not link:
             pytest.skip(
                 f"Could not dynamically find an article link on {start_url}")
-
             return
-
     except Exception as e:
         pytest.skip(f"Failed to fetch homepage for {domain}: {e}")
-
         return
     result = await extract_text_from_url(link)
-
-    if "Cloudflare challenge" in result:
+    if result.get("error") and "Cloudflare challenge" in result.get("error"):
         pytest.skip(f"Cloudflare challenge detected for {link}")
-
         return
-
-    if "[ERROR]" in result:
+    if result.get("error"):
         pytest.skip(f"Extraction failed for {link}: {result}")
-
         return
-    assert result.startswith("Title:")
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    content = result.split("Markdown Content:", 1)[-1].strip()
-
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert result.get("markdown_content") is not None
+    content = result.get("markdown_content") or ""
     if 'dev.to' not in link and 'forem.com' not in link:
         assert len(
             content) >= DEFAULT_MIN_CONTENT_LENGTH, f"Extracted text too short ({len(content)} chars) for {link}"
@@ -192,24 +181,42 @@ async def test_dynamic_article_extraction(domain_info):
 @pytest.mark.asyncio
 async def test_missing_url_argument():
     result = await extract_text_from_url("")
-
-    assert "[ERROR]" in result
-    assert "url" in result.lower() or "invalid" in result.lower() or "error" in result.lower()
-
-
-@pytest.mark.asyncio
-async def test_nonexistent_domain():
-    url = "https://nonexistent-domain-for-testing-12345.com/somepage"
-    result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "Could not resolve" in result or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "url" in result.get("error").lower() or "invalid" in result.get(
+        "error").lower() or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_404_page():
     url = "https://httpbin.org/status/404"
     result = await extract_text_from_url(url)
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "404" in result.get(
+        "error") or "not found" in result.get("error").lower()
+
 
-    assert "[ERROR]" in result
-    assert "404" in result or "not found" in result.lower()
+@pytest.mark.asyncio
+async def test_grace_period_seconds_js_delay():
+    """
+    This test checks that increasing grace_period_seconds allows the scraper to capture JS-rendered content that appears after a delay.
+    It uses a public test page that adds content after a JS timeout. If no such page is available, this test will be skipped.
+    """
+    # Example public test page that adds content after 2 seconds via JS
+    # This is a placeholder; ideally use a real JS-delay demo page
+    test_url = "https://httpbin.org/delay/2"
+
+    # Try with a short grace period (should not capture delayed content)
+    result_short = await extract_text_from_url(test_url, grace_period_seconds=0.5)
+    # Try with a longer grace period (should capture delayed content)
+    result_long = await extract_text_from_url(test_url, grace_period_seconds=3.0)
+
+    # If the page does not actually use JS to delay content, skip the test
+    if result_short.get("markdown_content") == result_long.get("markdown_content"):
+        pytest.skip(
+            "Test page does not have JS-delayed content or is not suitable for this test.")
+
+    # The longer grace period should yield more content
+    assert len(result_long.get("markdown_content") or "") > len(result_short.get(
+        "markdown_content") or ""), "Longer grace period did not capture more content."
