 from bs4 import BeautifulSoup
 from urllib.parse import urlparse
 from src.config import DEFAULT_MIN_SECONDS_BETWEEN_REQUESTS, DEFAULT_TEST_REQUEST_TIMEOUT, DEFAULT_TEST_NO_DELAY_THRESHOLD, DEFAULT_MIN_CONTENT_LENGTH
+import time
 
 from src.scraper import extract_text_from_url, get_domain_from_url, apply_rate_limiting
 
 async def test_extract_text_from_example_com():
     url = "https://example.com"
     result = await extract_text_from_url(url)
-
-    assert result.startswith("Title:")
-    assert "Example Domain" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    assert "[ERROR]" not in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Example Domain" in (result.get("title") or "") or "Example Domain" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") in [
+        url, url + "/", "https://www.example.com", "https://www.example.com/"]
+    assert not result.get("error")
 
 
 @pytest.mark.asyncio
 async def test_extract_text_from_wikipedia():
     url = "https://en.wikipedia.org/wiki/Web_scraping"
     result = await extract_text_from_url(url)
-
-    assert result.startswith("Title:")
-    assert "Web scraping" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    assert "[ERROR]" not in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Web scraping" in (result.get("title") or "") or "Web scraping" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") == url or result.get(
+        "final_url", "").startswith("https://en.wikipedia.org/wiki/")
+    assert not result.get("error")
 
 
 @pytest.mark.asyncio
 async def test_nonexistent_domain():
     url = "https://nonexistent-domain-for-testing-12345.com/somepage"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "Could not resolve" in result or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "Could not resolve" in result.get(
+        "error") or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_invalid_url_format():
     url = "not-a-valid-url"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "invalid url" in result.lower() or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "invalid url" in result.get(
+        "error").lower() or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_http_404_page():
     url = "https://httpstat.us/404"
     result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "404" in result or "not found" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "404" in result.get(
+        "error") or "not found" in result.get("error").lower()
 
 
 def test_get_domain_from_url():
@@ -99,13 +107,15 @@ async def test_rate_limiting():
 async def test_extract_real_article():
     url = "https://en.wikipedia.org/wiki/Web_scraping"
     result = await extract_text_from_url(url)
-
-    if "[ERROR]" in result:
+    if result.get("error"):
         pytest.skip(f"Extraction failed: {result}")
-    assert result.startswith("Title:")
-    assert "Web scraping" in result
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert "Web scraping" in (result.get("title") or "") or "Web scraping" in (
+        result.get("markdown_content") or "")
+    assert result.get("markdown_content") is not None
+    assert result.get("final_url") == url or result.get(
+        "final_url", "").startswith("https://en.wikipedia.org/wiki/")
 
 
 @pytest.mark.asyncio
@@ -133,57 +143,36 @@ async def test_extract_real_article():
 async def test_dynamic_article_extraction(domain_info):
     domain, start_path = domain_info
     start_url = f"https://{domain}{start_path or '/'}"
-
     try:
         resp = requests.get(start_url, timeout=DEFAULT_TEST_REQUEST_TIMEOUT)
         soup = BeautifulSoup(resp.text, "html.parser")
         link = None
-
         for a in soup.find_all("a", href=True):
             href = a["href"]
-
-            if any(
-                x in href for x in [
-                    "/article",
-                    "/news",
-                    "/story",
-                    "/202",
-                    "/p/"]):
-
+            if any(x in href for x in ["/article", "/news", "/story", "/202", "/p/"]):
                 if href.startswith("/"):
                     link = f"https://{domain}{href}"
-
                 elif href.startswith("http"):
                     link = href
-
                 break
-
         if not link:
             pytest.skip(
                 f"Could not dynamically find an article link on {start_url}")
-
             return
-
     except Exception as e:
         pytest.skip(f"Failed to fetch homepage for {domain}: {e}")
-
         return
     result = await extract_text_from_url(link)
-
-    if "Cloudflare challenge" in result:
+    if result.get("error") and "Cloudflare challenge" in result.get("error"):
         pytest.skip(f"Cloudflare challenge detected for {link}")
-
         return
-
-    if "[ERROR]" in result:
+    if result.get("error"):
         pytest.skip(f"Extraction failed for {link}: {result}")
-
         return
-    assert result.startswith("Title:")
-    assert "Markdown Content:" in result
-    assert "URL Source:" in result
-    content = result.split("Markdown Content:", 1)[-1].strip()
-
+    assert isinstance(result, dict)
+    assert result.get("title") is not None
+    assert result.get("markdown_content") is not None
+    content = result.get("markdown_content") or ""
     if 'dev.to' not in link and 'forem.com' not in link:
         assert len(
             content) >= DEFAULT_MIN_CONTENT_LENGTH, f"Extracted text too short ({len(content)} chars) for {link}"
@@ -192,24 +181,42 @@ async def test_dynamic_article_extraction(domain_info):
 @pytest.mark.asyncio
 async def test_missing_url_argument():
     result = await extract_text_from_url("")
-
-    assert "[ERROR]" in result
-    assert "url" in result.lower() or "invalid" in result.lower() or "error" in result.lower()
-
-
-@pytest.mark.asyncio
-async def test_nonexistent_domain():
-    url = "https://nonexistent-domain-for-testing-12345.com/somepage"
-    result = await extract_text_from_url(url)
-
-    assert "[ERROR]" in result
-    assert "Could not resolve" in result or "error" in result.lower()
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "url" in result.get("error").lower() or "invalid" in result.get(
+        "error").lower() or "error" in result.get("error").lower()
 
 
 @pytest.mark.asyncio
 async def test_404_page():
     url = "https://httpbin.org/status/404"
     result = await extract_text_from_url(url)
+    assert isinstance(result, dict)
+    assert result.get("error")
+    assert "404" in result.get(
+        "error") or "not found" in result.get("error").lower()
+
 
-    assert "[ERROR]" in result
-    assert "404" in result or "not found" in result.lower()
+@pytest.mark.asyncio
+async def test_grace_period_seconds_js_delay():
+    """
+    This test checks that increasing grace_period_seconds allows the scraper to capture JS-rendered content that appears after a delay.
+    It uses a public test page that adds content after a JS timeout. If no such page is available, this test will be skipped.
+    """
+    # Example public test page that adds content after 2 seconds via JS
+    # This is a placeholder; ideally use a real JS-delay demo page
+    test_url = "https://httpbin.org/delay/2"
+
+    # Try with a short grace period (should not capture delayed content)
+    result_short = await extract_text_from_url(test_url, grace_period_seconds=0.5)
+    # Try with a longer grace period (should capture delayed content)
+    result_long = await extract_text_from_url(test_url, grace_period_seconds=3.0)
+
+    # If the page does not actually use JS to delay content, skip the test
+    if result_short.get("markdown_content") == result_long.get("markdown_content"):
+        pytest.skip(
+            "Test page does not have JS-delayed content or is not suitable for this test.")
+
+    # The longer grace period should yield more content
+    assert len(result_long.get("markdown_content") or "") > len(result_short.get(
+        "markdown_content") or ""), "Longer grace period did not capture more content."