Skip to content

Commit

Permalink
Add validate JSON script
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Oct 9, 2023
1 parent b334893 commit 1a64e45
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 1 deletion.
36 changes: 36 additions & 0 deletions json_schema/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
import json
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from pathlib import Path


def worker(line, schema={}):
from jsonschema import validate

validate(instance=json.loads(line), schema=schema)


def main():
    """
    Check every line of an extracted JSONL file against a JSON schema.

    Two positional command-line arguments are expected: the path of the
    JSONL file and the path of the JSON schema file.  Lines are handed to
    ``worker`` through a process pool in chunks of 1000.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("jsonl_path", type=Path)
    arg_parser.add_argument("schema_path", type=Path)
    cli_args = arg_parser.parse_args()

    with cli_args.jsonl_path.open(encoding="utf-8") as jsonl_file:
        with cli_args.schema_path.open(encoding="utf-8") as schema_file:
            with ProcessPoolExecutor() as pool:
                # Bind the parsed schema once; every line reuses it.
                validate_line = partial(worker, schema=json.load(schema_file))
                results = pool.map(validate_line, jsonl_file, chunksize=1000)
                # The return values are irrelevant; the iterator is drained
                # only so that an exception raised in a worker is re-raised
                # here.
                for _result in results:
                    pass


# Run the validation only when this file is executed as a script, not when
# it is imported (worker processes may re-import this module).
if __name__ == "__main__":
    main()
10 changes: 9 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ dependencies = [
[project.optional-dependencies]
dev = [
"black",
"jsonschema",
"mypy",
"nose2[coverage_plugin]",
"ruff",
Expand All @@ -55,7 +56,14 @@ homepage = "https://github.com/tatuylonen/wiktextract"
zip-safe = false

[tool.setuptools.packages.find]
exclude = ["languages", "overrides", "tests", "tools", "usertools"]
exclude = [
"languages",
"overrides",
"tests",
"tools",
"usertools",
"json_schema"
]

[tool.setuptools.package-data]
wiktextract = [
Expand Down

0 comments on commit 1a64e45

Please sign in to comment.