From 826f8865837090b7a01b7e14bddf17f0e24bcef2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Wed, 25 Oct 2023 00:26:55 +0200 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=94=A5=20removal=20of=20cc=20indexes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- cmoncrawl/aggregator/athena_query.py | 8 ++------ cmoncrawl/aggregator/index_query.py | 8 ++------ cmoncrawl/aggregator/utils/constants.py | 1 + tests/aggregator_tests.py | 3 ++- 4 files changed, 7 insertions(+), 13 deletions(-) create mode 100644 cmoncrawl/aggregator/utils/constants.py diff --git a/cmoncrawl/aggregator/athena_query.py b/cmoncrawl/aggregator/athena_query.py index 8d984dba..d7fadc99 100644 --- a/cmoncrawl/aggregator/athena_query.py +++ b/cmoncrawl/aggregator/athena_query.py @@ -14,6 +14,7 @@ crawl_url_to_name, prepare_athena_sql_query, ) +from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes from cmoncrawl.common.loggers import all_purpose_logger @@ -86,7 +87,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]): Args: domains (List[str]): A list of domains to search for. - cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json". match_type (MatchType, optional): Match type for cdx-api. Defaults to MatchType.EXACT. cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to []. since (datetime, optional): The start date for the search. Defaults to datetime.min. @@ -112,7 +112,6 @@ class AthenaAggregator(AsyncIterable[DomainRecord]): def __init__( self, domains: List[str], - cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json", match_type: MatchType = MatchType.EXACT, cc_servers: List[str] = [], since: datetime = datetime.min, @@ -129,7 +128,6 @@ def __init__( table_name: str = "ccindex", ) -> None: self.domains = domains - self.cc_indexes_server = cc_indexes_server self.match_type = match_type self.cc_servers = cc_servers self.since = since @@ -174,9 +172,7 @@ async def aopen(self) -> AthenaAggregator: ) async with ClientSession() as client: if len(self.cc_servers) == 0: - self.cc_servers = await get_all_CC_indexes( - client, self.cc_indexes_server - ) + self.cc_servers = await get_all_CC_indexes(client, CC_INDEXES_SERVER) # create bucket if not exists async with self.aws_client.client("s3") as s3: # Check if bucket exists diff --git a/cmoncrawl/aggregator/index_query.py b/cmoncrawl/aggregator/index_query.py index fbca1fd9..75b87126 100644 --- a/cmoncrawl/aggregator/index_query.py +++ b/cmoncrawl/aggregator/index_query.py @@ -2,6 +2,7 @@ from collections import deque from datetime import datetime import re +from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes, retrieve from types import TracebackType @@ -40,7 +41,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]): Args: domains (List[str]): A list of domains to search for. - cc_indexes_server (str, optional): The commoncrawl index server to use. Defaults to "http://index.commoncrawl.org/collinfo.json". match_type (MatchType, optional): Match type for cdx-api. Defaults to None. cc_servers (List[str], optional): A list of commoncrawl servers to use. If [], then indexes will be retrieved from the cc_indexes_server. Defaults to []. since (datetime, optional): The start date for the search. Defaults to datetime.min. @@ -60,7 +60,6 @@ class IndexAggregator(AsyncIterable[DomainRecord]): def __init__( self, domains: List[str], - cc_indexes_server: str = "http://index.commoncrawl.org/collinfo.json", match_type: MatchType | None = None, cc_servers: List[str] = [], since: datetime = datetime.min, @@ -71,7 +70,6 @@ def __init__( sleep_step: int = 20, ) -> None: self.domains = domains - self.cc_indexes_server = cc_indexes_server self.cc_servers = cc_servers self.since = since self.to = to @@ -87,9 +85,7 @@ async def aopen(self) -> IndexAggregator: await self.client.__aenter__() if len(self.cc_servers) == 0: - self.cc_servers = await get_all_CC_indexes( - self.client, self.cc_indexes_server - ) + self.cc_servers = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER) return self async def __aenter__(self) -> IndexAggregator: diff --git a/cmoncrawl/aggregator/utils/constants.py b/cmoncrawl/aggregator/utils/constants.py new file mode 100644 index 00000000..cb92395e --- /dev/null +++ b/cmoncrawl/aggregator/utils/constants.py @@ -0,0 +1 @@ +CC_INDEXES_SERVER = "https://index.commoncrawl.org/collinfo.json" diff --git a/tests/aggregator_tests.py b/tests/aggregator_tests.py index 72075f49..ecf0fdd7 100644 --- a/tests/aggregator_tests.py +++ b/tests/aggregator_tests.py @@ -3,6 +3,7 @@ from pathlib import Path import boto3 +from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER from tests.utils import MySQLRecordsDB import aioboto3 @@ -72,7 +73,7 @@ async def test_indexer_num_pages(self): self.assertEqual(size, 5) async def test_indexer_all_CC(self): - indexes = await get_all_CC_indexes(self.client, self.di.cc_indexes_server) + indexes = await get_all_CC_indexes(self.client, CC_INDEXES_SERVER) indexes = sorted(indexes) indexes = indexes[ : indexes.index("https://index.commoncrawl.org/CC-MAIN-2022-27-index") + 1 From d5311466f42c93f5d7f529d5e7174cd2fcef9be2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Mon, 20 Nov 2023 00:58:02 +0100 Subject: [PATCH 2/2] fix --- cmoncrawl/aggregator/athena_query.py | 3 +-- cmoncrawl/aggregator/gateway_query.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cmoncrawl/aggregator/athena_query.py b/cmoncrawl/aggregator/athena_query.py index f9490681..03d854b7 100644 --- a/cmoncrawl/aggregator/athena_query.py +++ b/cmoncrawl/aggregator/athena_query.py @@ -28,13 +28,12 @@ crawl_url_to_name, prepare_athena_sql_query, ) +from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER from cmoncrawl.aggregator.utils.helpers import ( get_all_CC_indexes, remove_bucket_prefix, run_athena_query, ) -from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER -from cmoncrawl.aggregator.utils.helpers import get_all_CC_indexes from cmoncrawl.common.loggers import all_purpose_logger from cmoncrawl.common.types import ( DomainRecord, diff --git a/cmoncrawl/aggregator/gateway_query.py b/cmoncrawl/aggregator/gateway_query.py index d6088608..a846c098 100644 --- a/cmoncrawl/aggregator/gateway_query.py +++ b/cmoncrawl/aggregator/gateway_query.py @@ -3,7 +3,6 @@ import asyncio from collections import deque from datetime import datetime -from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER from types import TracebackType from typing import ( AsyncIterator, @@ -20,6 +19,7 @@ ) from cmoncrawl.aggregator.base import IAggregator +from cmoncrawl.aggregator.utils.constants import CC_INDEXES_SERVER from cmoncrawl.aggregator.utils.helpers import ( crawl_to_year, get_all_CC_indexes,