 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from src.config import DEFAULT_MIN_SECONDS_BETWEEN_REQUESTS, DEFAULT_TEST_REQUEST_TIMEOUT, DEFAULT_TEST_NO_DELAY_THRESHOLD, DEFAULT_MIN_CONTENT_LENGTH
+import time
 
 from src.scraper import extract_text_from_url, get_domain_from_url, apply_rate_limiting
 
 async def test_extract_text_from_example_com():
     url = "https://example.com"
     result = await extract_text_from_url(url)
-
-    assert result.startswith("Title:")
-    assert "Example Domain" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    assert "[ERROR]" not in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Example Domain" in (result.get("title") or "") or "Example Domain" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") in [
+        url, url + "/", "https://www.example.com", "https://www.example.com/"]
+    assert not result.get("error")
 
 
 @pytest.mark.asyncio
 async def test_extract_text_from_wikipedia():
     url = "https://en.wikipedia.org/wiki/Web_scraping"
     result = await extract_text_from_url(url)
-
-    assert result.startswith("Title:")
-    assert "Web scraping" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    assert "[ERROR]" not in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Web scraping" in (result.get("title") or "") or "Web scraping" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") == url or result.get(
+        "final_url", "").startswith("https://en.wikipedia.org/wiki/")
+    assert not result.get("error")
 
 
 @pytest.mark.asyncio
 async def test_nonexistent_domain():
     url = "https://nonexistent-domain-for-testing-12345.com/somepage"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "Could not resolve" in result or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "Could not resolve" in result.get(
+        "error") or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_invalid_url_format():
     url = "not-a-valid-url"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "invalid url" in result.lower() or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "invalid url" in result.get(
+        "error").lower() or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_http_404_page():
     url = "https://httpstat.us/404"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "404" in result or "not found" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "404" in result.get(
+        "error") or "not found" in result.get("error").lower()
 
 
 def test_get_domain_from_url():
@@ -99,13 +107,15 @@ async def test_rate_limiting():
 async def test_extract_real_article():
     url = "https://en.wikipedia.org/wiki/Web_scraping"
     result = await extract_text_from_url(url)
-
-    if "[ERROR]" in result:
+    if result.get("error"):
         pytest.skip(f"Extraction failed: {result}")
-    assert result.startswith("Title:")
-    assert "Web scraping" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Web scraping" in (result.get("title") or "") or "Web scraping" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") == url or result.get(
+        "final_url", "").startswith("https://en.wikipedia.org/wiki/")
 
 
 @pytest.mark.asyncio
@@ -133,57 +143,36 @@ async def test_extract_real_article():
 async def test_dynamic_article_extraction(domain_info):
     domain, start_path = domain_info
     start_url = f"https://{domain}{start_path or '/'}"
-
     try:
         resp = requests.get(start_url, timeout=DEFAULT_TEST_REQUEST_TIMEOUT)
         soup = BeautifulSoup(resp.text, "html.parser")
         link = None
-
         for a in soup.find_all("a", href=True):
             href = a["href"]
-
-            if any(
-                x in href for x in [
-                    "/article",
-                    "/news",
-                    "/story",
-                    "/202",
-                    "/p/"]):
-
+            if any(x in href for x in ["/article", "/news", "/story", "/202", "/p/"]):
                 if href.startswith("/"):
                     link = f"https://{domain}{href}"
-
                 elif href.startswith("http"):
                     link = href
-
                 break
-
         if not link:
             pytest.skip(
                 f"Could not dynamically find an article link on {start_url}")
-
             return
-
     except Exception as e:
         pytest.skip(f"Failed to fetch homepage for {domain}: {e}")
-
         return
     result = await extract_text_from_url(link)
-
-    if "Cloudflare challenge" in result:
+    if result.get("error") and "Cloudflare challenge" in result.get("error"):
         pytest.skip(f"Cloudflare challenge detected for {link}")
-
         return
-
-    if "[ERROR]" in result:
+    if result.get("error"):
         pytest.skip(f"Extraction failed for {link}: {result}")
-
         return
-    assert result.startswith("Title:")
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    content = result.split("Markdown Content:", 1)[-1].strip()
-
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert result.get("markdown_content") is not None
+    content = result.get("markdown_content") or ""
     if 'dev.to' not in link and 'forem.com' not in link:
         assert len(
             content) >= DEFAULT_MIN_CONTENT_LENGTH, f"Extracted text too short ({len(content)} chars) for {link}"
@@ -192,24 +181,42 @@ async def test_dynamic_article_extraction(domain_info):
 @pytest.mark.asyncio
 async def test_missing_url_argument():
     result = await extract_text_from_url("")
-
-    assert "[ERROR]" in result
-    assert "url" in result.lower() or "invalid" in result.lower() or "error" in result.lower()
-
-
-@pytest.mark.asyncio
-async def test_nonexistent_domain():
-    url = "https://nonexistent-domain-for-testing-12345.com/somepage"
-    result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "Could not resolve" in result or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "url" in result.get("error").lower() or "invalid" in result.get(
+        "error").lower() or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_404_page():
     url = "https://httpbin.org/status/404"
     result = await extract_text_from_url(url)
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "404" in result.get(
+        "error") or "not found" in result.get("error").lower()
+
 
-    assert "[ERROR]" in result
-    assert "404" in result or "not found" in result.lower()
+@pytest.mark.asyncio
+async def test_grace_period_seconds_js_delay():
+    """
+    This test checks that increasing grace_period_seconds allows the scraper to capture JS-rendered content that appears after a delay.
+    It uses a public test page that adds content after a JS timeout. If no such page is available, this test will be skipped.
+    """
+    # Example public test page that adds content after 2 seconds via JS
+    # This is a placeholder; ideally use a real JS-delay demo page
+    test_url = "https://httpbin.org/delay/2"
+
+    # Try with a short grace period (should not capture delayed content)
+    result_short = await extract_text_from_url(test_url, grace_period_seconds=0.5)
+    # Try with a longer grace period (should capture delayed content)
+    result_long = await extract_text_from_url(test_url, grace_period_seconds=3.0)
+
+    # If the page does not actually use JS to delay content, skip the test
+    if result_short.get("markdown_content") == result_long.get("markdown_content"):
+        pytest.skip(
+            "Test page does not have JS-delayed content or is not suitable for this test.")
+
+    # The longer grace period should yield more content
+    assert len(result_long.get("markdown_content") or "") > len(result_short.get(
+        "markdown_content") or ""), "Longer grace period did not capture more content."