✅ Test fixes
hynky1999 committed Nov 13, 2023
1 parent 7385420 commit d8bc441
Showing 3 changed files with 12 additions and 65 deletions.
10 changes: 5 additions & 5 deletions cmoncrawl/aggregator/index_query.py
@@ -139,7 +139,7 @@ async def get_number_of_pages(
         max_retry: int,
         sleep_base: float,
         page_size: int | None = None,
-    ):
+    ) -> int:
         params: Dict[str, str | int] = {
             "showNumPages": "true",
             "output": "json",
@@ -279,7 +279,7 @@ async def __prefetch_next_crawl(self) -> int:
            next_crawl = self.__crawls_remaining.popleft()

            try:
-                pages = await IndexAggregator.get_number_of_pages(
+                num_pages = await IndexAggregator.get_number_of_pages(
                    self.__client,
                    next_crawl.cdx_server,
                    next_crawl.domain,
@@ -293,10 +293,10 @@ async def __prefetch_next_crawl(self) -> int:
                )
                continue
            all_purpose_logger.info(
-                f"Found {pages} pages for {next_crawl.domain} from {next_crawl.cdx_server}"
+                f"Found {num_pages} pages for {next_crawl.domain} from {next_crawl.cdx_server}"
            )

            for i in range(pages):
                dc = DomainCrawl(
                    next_crawl.domain, next_crawl.cdx_server, i
                )
@@ -315,7 +315,7 @@ async def __prefetch_next_crawl(self) -> int:
                    ),
                )
            )
-            return pages
+            return num_pages
        return 0

    async def __await_next_prefetch(self):
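
Note: the `-> int` annotation makes the helper's contract explicit — it returns a bare page count rather than a response object to unwrap. A minimal usage sketch, assuming only the call shape visible in this diff; `client` and `cdx_server` are hypothetical placeholders for an already-configured HTTP client and a CDX index URL:

# Hypothetical sketch, not part of the commit.
from cmoncrawl.aggregator.index_query import IndexAggregator

async def count_and_schedule(client, cdx_server):
    num_pages = await IndexAggregator.get_number_of_pages(
        client,
        cdx_server,
        "idnes.cz",      # domain, as queried in test_indexer_num_pages
        max_retry=20,
        sleep_base=4,
    )
    # A plain int feeds range() directly, mirroring the per-page
    # DomainCrawl scheduling in __prefetch_next_crawl.
    for page in range(num_pages):
        ...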
17 changes: 2 additions & 15 deletions tests/aggregator_tests.py
@@ -64,18 +64,15 @@ async def asyncTearDown(self) -> None:
         await self.di.aclose(None, None, None)

     async def test_indexer_num_pages(self):
-        response = await self.di.get_number_of_pages(
+        num_pages = await self.di.get_number_of_pages(
             self.client,
             self.CC_SERVERS[0],
             "idnes.cz",
             max_retry=20,
             sleep_base=4,
             match_type=MatchType.DOMAIN,
         )
-        self.assertIsNotNone(response)
-        num, size = response.content
-        self.assertEqual(num, 14)
-        self.assertEqual(size, 5)
+        self.assertEqual(num_pages, 14)

     async def test_indexer_all_CC(self):
         indexes = await get_all_CC_indexes(
@@ -189,16 +186,6 @@ async def test_unify_urls_id(self):
         for i, url in enumerate(urls):
             self.assertEquals(unify_url_id(url), urls_ids[i])

-    async def test_logging_failure_page(self):
-        async for record in self.di:
-            self.assertIsNotNone(record)
-            self.assertIsNotNone(record.url)
-            self.assertIsNotNone(record.filename)
-            self.assertIsNotNone(record.offset)
-            self.assertIsNotNone(record.length)
-            self.assertIsNotNone(record.timestamp)
-            self.assertIsNotNone(record.status)
-

 if __name__ == "__main__":
     unittest.main()
50 changes: 5 additions & 45 deletions tests/end_to_end_tests.py
@@ -16,7 +16,7 @@
 from cmoncrawl.processor.connectors.base import ICC_Dao


-class Download_files(unittest.IsolatedAsyncioTestCase):
+class ExtractFiles(unittest.IsolatedAsyncioTestCase):
     """
     CLI Testing
     """
@@ -64,9 +64,10 @@ async def test_extract_from_records(self, dao: DAOname):
         with open(output_folder / "0_file.jsonl") as f:
             lines = f.readlines()
         self.assertEqual(len(lines), 5)
-        self.assertEqual(
-            json.loads(lines[4])["title"],
+        titles = [json.loads(line)["title"] for line in lines]
+        self.assertIn(
             '<title data-document-head-keeper="0">Seznam – najdu tam, co neznám</title>',
+            titles,
         )

     async def test_extract_from_html(self):
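
Note: the reworked assertion above is order-independent — instead of pinning the expected title to line index 4, it collects every title and checks membership, so the test no longer depends on the order in which records land in the output file. A minimal sketch of that pattern, with hypothetical file and title values:

# Hypothetical sketch of the order-independent JSONL assertion pattern.
import json

with open("output/0_file.jsonl") as f:
    titles = [json.loads(line)["title"] for line in f]
# Membership, not position: the record may appear on any line.
assert "<title>expected page title</title>" in titles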
@@ -95,45 +95,4 @@ async def test_extract_from_html(self):
             '<title data-document-head-keeper="0">Seznam – najdu tam, co neznám</title>',
         )

-
-class Download_files(unittest.IsolatedAsyncioTestCase):
-    """
-    CLI Testing
-    """
-
-    async def asyncSetUp(self) -> None:
-        all_purpose_logger.setLevel("DEBUG")
-        metadata_logger.setLevel("DEBUG")
-        self.base_folder = Path(__file__).parent / "test_extract"
-        self.output_folder = self.base_folder / "output"
-
-    async def asyncTearDown(self) -> None:
-        # remoev output folder
-        if self.output_folder.exists():
-            shutil.rmtree(self.output_folder)
-
-    @parameterized.expand([(DAOname.API,), (DAOname.S3,)])
-    async def test_download_records(self, dao: DAOname):
-        await url_download(
-            url="https://example.com",
-            match_type=None,
-            output=self.output_folder / "directory_0" / "0_file.jsonl",
-            cc_server=None,
-            since=datetime(2021, 1, 1),
-            to=datetime(2021, 12, 31),
-            limit=100,
-            max_retry=40,
-            sleep_base=1.4,
-            mode=DownloadOutputFormat.RECORD,
-            max_crawls_per_file=1,
-            max_directory_size=10,
-            filter_non_200=True,
-            encoding="utf-8",
-            download_method=dao,
-        )
-        self.assertTrue((self.output_folder / "downloaded_file.txt").exists())
-        with open(self.output_folder / "directory_0" / "0_file.jsonl") as f:
-            lines = f.readlines()
-
-        lines = [json.loads(line) for line in lines]
-        self.assertEqual(len(lines), 5)
-        # TODO Add test for download

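Note: the deletion above pairs with the rename in the first hunk. Before this commit the module defined two classes named Download_files, and in Python a second class statement with the same name silently rebinds it, so unittest discovery only ever saw the later class. A minimal sketch of that shadowing behavior (hypothetical classes, not from the repo):

# Hypothetical sketch: the second definition rebinds the name.
class Download_files:
    def test_extract(self):  # effectively unreachable after rebinding
        ...

class Download_files:  # shadows the class above
    def test_download(self):
        ...

print(hasattr(Download_files, "test_extract"))   # False
print(hasattr(Download_files, "test_download"))  # True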