From bbacd713b2db03d1bb812a98d5f8fd973a8dc7cc Mon Sep 17 00:00:00 2001 From: spacewaterbear Date: Sun, 11 Feb 2024 11:46:21 +0900 Subject: [PATCH 1/2] max_requests_per_second cannot be zero otherwise it will break here : cmoncrawl/processor/pipeline/downloader.py", line 97, in __init__ self.throttler = Throttler(int(1000 / max_requests_per_second)) --- cmoncrawl/integrations/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmoncrawl/integrations/download.py b/cmoncrawl/integrations/download.py index 816a5857..d4758d56 100644 --- a/cmoncrawl/integrations/download.py +++ b/cmoncrawl/integrations/download.py @@ -339,7 +339,7 @@ def run_download(args: argparse.Namespace): args.max_crawls_per_file if mode == DownloadOutputFormat.RECORD else 1 ) max_requests_per_second = ( - args.max_requests_per_second if mode == DownloadOutputFormat.RECORD else 0 + args.max_requests_per_second if mode == DownloadOutputFormat.RECORD else 1 ) # HTML exlusives encoding = args.encoding if mode == DownloadOutputFormat.HTML else None From 5b3ddd404a9d320412ccdc679af969dc26cb1dea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Tue, 13 Feb 2024 23:12:29 +0100 Subject: [PATCH 2/2] fix max requests per second --- cmoncrawl/integrations/download.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cmoncrawl/integrations/download.py b/cmoncrawl/integrations/download.py index d4758d56..d3d4e742 100644 --- a/cmoncrawl/integrations/download.py +++ b/cmoncrawl/integrations/download.py @@ -338,9 +338,6 @@ def run_download(args: argparse.Namespace): max_crawls_per_file = ( args.max_crawls_per_file if mode == DownloadOutputFormat.RECORD else 1 ) - max_requests_per_second = ( - args.max_requests_per_second if mode == DownloadOutputFormat.RECORD else 1 - ) # HTML exlusives encoding = args.encoding if mode == DownloadOutputFormat.HTML else None download_method = ( @@ -357,7 +354,7 @@ def run_download(args: argparse.Namespace): limit=args.limit, max_retry=args.max_retry, sleep_base=args.sleep_base, - max_requests_per_second=max_requests_per_second, + max_requests_per_second=args.max_requests_per_second, mode=mode, max_crawls_per_file=max_crawls_per_file, encoding=encoding,