Skip to content

Commit

Permalink
add better error when parsing config
Browse files Browse the repository at this point in the history
  • Loading branch information
hynky1999 committed Apr 7, 2024
1 parent c00b3cd commit b99c1d5
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 8 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ Set up a configuration file, `config.json`, to specify the behavior of your extr
]
}
```
Please note that the configuration file `config.json` must be valid JSON. JSON has no comment syntax, so comments like the ones shown in the example above must be removed from the actual file.


### Step: 4 Run the extractor
Test your extractor with the following command:
Expand Down
17 changes: 12 additions & 5 deletions cmoncrawl/integrations/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,13 @@ def get_domain_records_html(


def load_config(config_path: Path) -> ExtractConfig:
    """Load the extractor configuration stored at ``config_path``.

    The file is parsed as JSON and the parsed object is handed to
    ``ExtractConfig.model_validate``.

    Args:
        config_path: Location of the JSON configuration file.

    Returns:
        Whatever ``ExtractConfig.model_validate`` returns for the parsed JSON.

    Raises:
        ValueError: If the file cannot be opened/read or its content is not
            valid JSON. The original error is chained as ``__cause__``.
            Errors raised by ``ExtractConfig.model_validate`` itself are not
            wrapped and propagate unchanged.
    """
    try:
        # Read the file exactly once, inside the guarded region, so that every
        # read/parse failure is reported through the friendlier error below.
        # The encoding is pinned because JSON interchange is UTF-8 and the
        # platform default encoding is not guaranteed to be.
        with open(config_path, "r", encoding="utf-8") as f:
            config = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError
        # subclasses; OSError covers missing/unreadable files.
        raise ValueError(
            f"Failed to load extractor config from '{config_path}'. "
            "Ensure it's valid JSON."
        ) from e
    return ExtractConfig.model_validate(config)


Expand Down Expand Up @@ -273,9 +278,11 @@ def run_extract(args: argparse.Namespace):
_extract_task,
[
(
args.output_path / f"{file.stem}"
if args.n_proc != 1
else args.output_path,
(
args.output_path / f"{file.stem}"
if args.n_proc != 1
else args.output_path
),
config,
[file],
args,
Expand Down
10 changes: 7 additions & 3 deletions tests/end_to_end_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from cmoncrawl.integrations.extract import (
ExtractMode,
extract_from_files,
load_config,
)
from cmoncrawl.integrations.utils import DAOname

Expand All @@ -30,12 +31,15 @@ async def asyncTearDown(self) -> None:

async def test_load_config(self):
    """Check that ``load_config`` reads ``cfg.json`` from the test folder.

    The assertion looks at the name of the first extractor on the first
    route, which is expected to be ``"test_extractor"``.
    """
    cfg_path = self.base_folder / "cfg.json"
    # Go through load_config only: a separate manual json.load/model_validate
    # pass would be overwritten by this assignment and test nothing extra.
    cfg: ExtractConfig = load_config(cfg_path)

    self.assertEqual(cfg.routes[0].extractors[0].name, "test_extractor")

async def test_load_config_invalid_json(self):
    """``load_config`` must raise ``ValueError`` for ``cfg_invalid.json``."""
    invalid_cfg = self.base_folder / "cfg_invalid.json"
    # Callable form of assertRaises: invokes load_config(invalid_cfg) and
    # fails the test unless a ValueError is raised.
    self.assertRaises(ValueError, load_config, invalid_cfg)

@parameterized.expand([(DAOname.API,), (DAOname.S3,)])
async def test_extract_from_records(self, dao: DAOname):
cfg_path = self.base_folder / "cfg.json"
Expand Down
17 changes: 17 additions & 0 deletions tests/test_extract/cfg_invalid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
# Comment
"extractors_path": "tests/test_extract/extractors",
"routes": [
{
"regexes": [
".*"
],
"extractors": [
{
"name": "test_extractor",
"since": "2009-01-01T00:00:00+00:00"
}
]
}
]
}

0 comments on commit b99c1d5

Please sign in to comment.