From b99c1d5c0b38d9118ec1e5ef0d7b19b39a98a976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hynek=20Kydl=C3=AD=C4=8Dek?= Date: Mon, 8 Apr 2024 00:56:22 +0200 Subject: [PATCH] add better error when parsing config --- README.md | 2 ++ cmoncrawl/integrations/extract.py | 17 ++++++++++++----- tests/end_to_end_test.py | 10 +++++++--- tests/test_extract/cfg_invalid.json | 17 +++++++++++++++++ 4 files changed, 38 insertions(+), 8 deletions(-) create mode 100644 tests/test_extract/cfg_invalid.json diff --git a/README.md b/README.md index cec882f9..3ba94c61 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,8 @@ Set up a configuration file, `config.json`, to specify the behavior of your extr ] } ``` +Please note that the configuration file `config.json` must be valid JSON. Therefore, comments as shown in the example above cannot be included directly in the JSON file. + ### Step: 4 Run the extractor Test your extractor with the following command: diff --git a/cmoncrawl/integrations/extract.py b/cmoncrawl/integrations/extract.py index 5de1d79a..f220ae40 100644 --- a/cmoncrawl/integrations/extract.py +++ b/cmoncrawl/integrations/extract.py @@ -174,8 +174,13 @@ def get_domain_records_html( def load_config(config_path: Path) -> ExtractConfig: - with open(config_path, "r") as f: - config = json.load(f) + try: + with open(config_path, "r") as f: + config = json.load(f) + except Exception as e: + raise ValueError( + f"Failed to load extractor config. Ensure it's valid JSON." 
+ ) from e return ExtractConfig.model_validate(config) @@ -273,9 +278,11 @@ def run_extract(args: argparse.Namespace): _extract_task, [ ( - args.output_path / f"{file.stem}" - if args.n_proc != 1 - else args.output_path, + ( + args.output_path / f"{file.stem}" + if args.n_proc != 1 + else args.output_path + ), config, [file], args, diff --git a/tests/end_to_end_test.py b/tests/end_to_end_test.py index 0258da4a..9dd81ef6 100644 --- a/tests/end_to_end_test.py +++ b/tests/end_to_end_test.py @@ -10,6 +10,7 @@ from cmoncrawl.integrations.extract import ( ExtractMode, extract_from_files, + load_config, ) from cmoncrawl.integrations.utils import DAOname @@ -30,12 +31,15 @@ async def asyncTearDown(self) -> None: async def test_load_config(self): cfg_path = self.base_folder / "cfg.json" - with open(cfg_path, "r") as f: - js = json.load(f) - cfg: ExtractConfig = ExtractConfig.model_validate(js) + cfg: ExtractConfig = load_config(cfg_path) self.assertEqual(cfg.routes[0].extractors[0].name, "test_extractor") + async def test_load_config_invalid_json(self): + cfg_path = self.base_folder / "cfg_invalid.json" + with self.assertRaises(ValueError): + load_config(cfg_path) + @parameterized.expand([(DAOname.API,), (DAOname.S3,)]) async def test_extract_from_records(self, dao: DAOname): cfg_path = self.base_folder / "cfg.json" diff --git a/tests/test_extract/cfg_invalid.json b/tests/test_extract/cfg_invalid.json new file mode 100644 index 00000000..e4311fa0 --- /dev/null +++ b/tests/test_extract/cfg_invalid.json @@ -0,0 +1,17 @@ +{ + # Comment + "extractors_path": "tests/test_extract/extractors", + "routes": [ + { + "regexes": [ + ".*" + ], + "extractors": [ + { + "name": "test_extractor", + "since": "2009-01-01T00:00:00+00:00" + } + ] + } + ] +} \ No newline at end of file