Skip to content

Commit

Permalink
Merge pull request #356 from xxyzz/json_schema
Browse files Browse the repository at this point in the history
Add French and Chinese Wiktionary JSON schema
  • Loading branch information
xxyzz authored Oct 10, 2023
2 parents 2d1a893 + 095781c commit b4a54f7
Show file tree
Hide file tree
Showing 8 changed files with 694 additions and 10 deletions.
316 changes: 316 additions & 0 deletions json_schema/fr.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,316 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://kaikki.org/fr.json",
"title": "French Wiktionary",
"description": "JSON schema of the French Wiktionary extractor",
"type": "object",
"properties": {
"lang": {
"description": "Localized langauge name of the word",
"type": "string"
},
"lang_code": {
"description": "ISO 639-1 language code",
"type": "string"
},
"word": {
"description": "word string",
"type": "string"
},
"pos": {
"description": "Part of speech type",
"type": "string"
},
"pos_title": {
"description": "Original POS title for matching etymology texts",
"type": "string"
},
"etymology_texts": {
"description": "Etymology list",
"type": "array",
"items": {
"type": "string"
}
},
"senses": {
"description": "Sense list",
"type": "array",
"items": {
"$ref": "#/$defs/sense"
}
},
"forms": {
"description": "Inflection forms list",
"type": "array",
"items": {
"$ref": "#/$defs/form"
}
},
"sounds": {
"type": "array",
"items": {
"$ref": "#/$defs/sound"
}
},
"translations": {
"type": "array",
"items": {
"$ref": "#/$defs/translation"
}
},
"synonyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"hyponyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"hypernyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"holonyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"meronyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"derived": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"troponyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"paronyms": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"related": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"abbreviation": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"proverbs": {
"type": "array",
"items": {
"$ref": "#/$defs/linkage"
}
},
"title": {
"description": "Redirect page source title",
"type": "string"
},
"redirect": {
"description": "Redirect page target title",
"type": "string"
},
"categories": {
"type": "array",
"items": {
"type": "string"
}
}
},
"$defs": {
"sense": {
"type": "object",
"properties": {
"glosses": {
"type": "array",
"items": {
"type": "string"
}
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"categories": {
"type": "array",
"items": {
"type": "string"
}
},
"examples": {
"type": "array",
"items": {
"$ref": "#/$defs/example"
}
}
}
},
"example": {
"type": "object",
"properties": {
"text": {
"description": "Example usage sentence",
"type": "string"
},
"translation": {
"description": "French translation of the example sentence",
"type": "string"
},
"roman": {
"description": "Romanization of the example sentence",
"type": "string"
},
"ref": {
"description": "Source of the sentence, like book title and page number",
"type": "string"
},
"type": {
"description": "This value is 'quotation' if 'source' exists",
"type": "string",
"enum": [
"example",
"quotation"
]
}
}
},
"form": {
"type": "object",
"properties": {
"form": {
"type": "string"
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"ipas": {
"description": "has more than one ipa",
"type": "array",
"items": {
"type": "string"
}
},
"ipa": {
"description": "only has one ipa",
"type": "string"
},
"source": {
"description": "form line template name",
"type": "string"
}
}
},
"sound": {
"type": "object",
"properties": {
"zh-pron": {
"description": "Chinese word pronunciation",
"type": "string"
},
"ipa": {
"description": "International Phonetic Alphabet",
"type": "string"
},
"audio": {
"description": "Audio file name",
"type": "string"
},
"wav_url": {
"type": "string"
},
"ogg_url": {
"type": "string"
},
"mp3_url": {
"type": "string"
}
}
},
"translation": {
"type": "object",
"properties": {
"code": {
"description": "ISO 639-1 code of the translation term",
"type": "string"
},
"lang": {
"description": "Transation language name",
"type": "string"
},
"word": {
"description": "Translation term",
"type": "string"
},
"sense": {
"description": "Translation gloss",
"type": "string"
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"roman": {
"type": "string"
},
"traditional_writing": {
"description": "Alternative writting for Chinese, Korean and Mongolian",
"type": "string"
}
}
},
"linkage": {
"type": "object",
"properties": {
"word": {
"type": "string"
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"roman": {
"type": "string"
},
"alt": {
"description": "ALternative form",
"type": "string"
},
"translation": {
"description": "French translation",
"type": "string"
}
}
}
}
}
36 changes: 36 additions & 0 deletions json_schema/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import argparse
import json
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from pathlib import Path


def worker(line, schema={}):
from jsonschema import validate

validate(instance=json.loads(line), schema=schema)


def main():
"""
Validate extracted JSONL file with JSON schema.
"""
parser = argparse.ArgumentParser()
parser.add_argument("jsonl_path", type=Path)
parser.add_argument("schema_path", type=Path)
args = parser.parse_args()

with (
args.jsonl_path.open(encoding="utf-8") as jsonl_f,
args.schema_path.open(encoding="utf-8") as schema_f,
ProcessPoolExecutor() as executor,
):
schema = json.load(schema_f)
for _ in executor.map(
partial(worker, schema=schema), jsonl_f, chunksize=1000
):
pass


if __name__ == "__main__":
main()
Loading

0 comments on commit b4a54f7

Please sign in to comment.