From b334893013939c36b572ef7bb1fcfb7f72465072 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 9 Oct 2023 17:16:13 +0800 Subject: [PATCH 1/6] Add French Wiktionary JSON schema JSON schema doc: https://json-schema.org/learn/getting-started-step-by-step --- json_schema/fr.json | 310 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 json_schema/fr.json diff --git a/json_schema/fr.json b/json_schema/fr.json new file mode 100644 index 00000000..04c355d0 --- /dev/null +++ b/json_schema/fr.json @@ -0,0 +1,310 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kaikki.org/fr.json", + "title": "French Wiktionary", + "description": "JSON schema of the French Wiktionary extractor", + "type": "object", + "properties": { + "lang": { + "description": "Localized language name of the word", + "type": "string" + }, + "lang_code": { + "description": "ISO 639-1 language code", + "type": "string" + }, + "word": { + "description": "word string", + "type": "string" + }, + "pos": { + "description": "Part of speech type", + "type": "string" + }, + "pos_title": { + "description": "Original POS title for matching etymology texts", + "type": "string" + }, + "etymology_texts": { + "description": "Etymology list", + "type": "array", + "items": { + "type": "string" + } + }, + "senses": { + "description": "Sense list", + "type": "array", + "items": { + "$ref": "#/$defs/sense" + } + }, + "forms": { + "description": "Inflection forms list", + "type": "array", + "items": { + "$ref": "#/$defs/form" + } + }, + "sounds": { + "type": "array", + "items": { + "$ref": "#/$defs/sound" + } + }, + "translations": { + "type": "array", + "items": { + "$ref": "#/$defs/translation" + } + }, + "synonyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "hyponyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "hypernyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage"
+ } + }, + "holonyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "meronyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "derived": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "troponyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "paronyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "related": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "abbreviation": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "proverbs": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "title": { + "description": "Redirect page source title", + "type": "string" + }, + "redirect": { + "description": "Redirect page target title", + "type": "string" + } + }, + "$defs": { + "sense": { + "type": "object", + "properties": { + "glosses": { + "type": "array", + "items": { + "type": "string" + } + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "categories": { + "type": "array", + "items": { + "type": "string" + } + }, + "examples": { + "type": "array", + "items": { + "$ref": "#/$defs/example" + } + } + } + }, + "example": { + "type": "object", + "properties": { + "text": { + "description": "Example usage sentence", + "type": "string" + }, + "translation": { + "description": "French translation of the example sentence", + "type": "string" + }, + "roman": { + "description": "Romanization of the example sentence", + "type": "string" + }, + "source": { + "description": "Source of the sentence, like book title and page number", + "type": "string" + }, + "type": { + "description": "This value is 'quotation' if 'source' exists", + "type": "string", + "enum": [ + "example", + "quotation" + ] + } + } + }, + "form": { + "type": "object", + "properties": { + "form": { + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + 
} + }, + "ipas": { + "description": "has more than one ipa", + "type": "array", + "items": { + "type": "string" + } + }, + "ipa": { + "description": "only has one ipa", + "type": "string" + }, + "source": { + "description": "form line template name", + "type": "string" + } + } + }, + "sound": { + "type": "object", + "properties": { + "zh-pron": { + "description": "Chinese word pronunciation", + "type": "string" + }, + "ipa": { + "description": "International Phonetic Alphabet", + "type": "string" + }, + "audio": { + "description": "Audio file name", + "type": "string" + }, + "wav_url": { + "type": "string" + }, + "ogg_url": { + "type": "string" + }, + "mp3_url": { + "type": "string" + } + } + }, + "translation": { + "type": "object", + "properties": { + "code": { + "description": "ISO 639-1 code of the translation term", + "type": "string" + }, + "lang": { + "description": "Translation language name", + "type": "string" + }, + "word": { + "description": "Translation term", + "type": "string" + }, + "sense": { + "description": "Translation gloss", + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "roman": { + "type": "string" + }, + "traditional_writing": { + "description": "Alternative writing for Chinese, Korean and Mongolian", + "type": "string" + } + } + }, + "linkage": { + "type": "object", + "properties": { + "word": { + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "roman": { + "type": "string" + }, + "alt": { + "description": "Alternative form", + "type": "string" + }, + "translation": { + "description": "French translation", + "type": "string" + } + } + } + } +} From 1a64e45e9ffc8050a914f19324b2724453e9cef1 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Mon, 9 Oct 2023 17:17:34 +0800 Subject: [PATCH 2/6] Add validate JSON script --- json_schema/validate.py | 36 ++++++++++++++++++++++++++++++++++++ pyproject.toml | 10 +++++++++- 2 files changed, 45 insertions(+), 1
deletion(-) create mode 100644 json_schema/validate.py diff --git a/json_schema/validate.py b/json_schema/validate.py new file mode 100644 index 00000000..1fb53e46 --- /dev/null +++ b/json_schema/validate.py @@ -0,0 +1,36 @@ +import argparse +import json +from concurrent.futures import ProcessPoolExecutor +from functools import partial +from pathlib import Path + + +def worker(line, schema={}): + from jsonschema import validate + + validate(instance=json.loads(line), schema=schema) + + +def main(): + """ + Validate extracted JSONL file with JSON schema. + """ + parser = argparse.ArgumentParser() + parser.add_argument("jsonl_path", type=Path) + parser.add_argument("schema_path", type=Path) + args = parser.parse_args() + + with ( + args.jsonl_path.open(encoding="utf-8") as jsonl_f, + args.schema_path.open(encoding="utf-8") as schema_f, + ProcessPoolExecutor() as executor, + ): + schema = json.load(schema_f) + for _ in executor.map( + partial(worker, schema=schema), jsonl_f, chunksize=1000 + ): + pass + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 8556bbcd..32afc44b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,6 +39,7 @@ dependencies = [ [project.optional-dependencies] dev = [ "black", + "jsonschema", "mypy", "nose2[coverage_plugin]", "ruff", @@ -55,7 +56,14 @@ homepage = "https://github.com/tatuylonen/wiktextract" zip-safe = false [tool.setuptools.packages.find] -exclude = ["languages", "overrides", "tests", "tools", "usertools"] +exclude = [ + "languages", + "overrides", + "tests", + "tools", + "usertools", + "json_schema" +] [tool.setuptools.package-data] wiktextract = [ From fd116222f7e8201aab63060bf1031e996304848e Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 10 Oct 2023 09:56:32 +0800 Subject: [PATCH 3/6] Change French JSON example "source" key to "ref" --- json_schema/fr.json | 8 +++++++- tests/test_fr_gloss.py | 4 ++-- wiktextract/extractor/fr/gloss.py | 4 ++-- 3 files changed, 11 insertions(+), 5 
deletions(-) diff --git a/json_schema/fr.json b/json_schema/fr.json index 04c355d0..1652ddd1 100644 --- a/json_schema/fr.json +++ b/json_schema/fr.json @@ -131,6 +131,12 @@ "redirect": { "description": "Redirect page target title", "type": "string" + }, + "categories": { + "type": "array", + "items": { + "type": "string" + } } }, "$defs": { @@ -178,7 +184,7 @@ "description": "Romanization of the example sentence", "type": "string" }, - "source": { + "ref": { "description": "Source of the sentence, like book title and page number", "type": "string" }, diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py index 43744e5b..5f5d11d6 100644 --- a/tests/test_fr_gloss.py +++ b/tests/test_fr_gloss.py @@ -73,7 +73,7 @@ def test_example_template(self): "text": "text", "translation": "translation", "roman": "roman", - "source": "source", + "ref": "source", "type": "quotation", } ], @@ -104,7 +104,7 @@ def test_example_source_template(self, mock_node_to_html): "examples": [ { "text": "example", - "source": "source_title", + "ref": "source_title", "type": "quotation", } ], diff --git a/wiktextract/extractor/fr/gloss.py b/wiktextract/extractor/fr/gloss.py index 3cba7635..bcb03994 100644 --- a/wiktextract/extractor/fr/gloss.py +++ b/wiktextract/extractor/fr/gloss.py @@ -90,7 +90,7 @@ def extract_examples( example_data = {"type": "example"} example_data["text"] = clean_node(wxr, None, example_nodes) if source_template is not None: - example_data["source"] = clean_node( + example_data["ref"] = clean_node( wxr, None, source_template ).strip("— ()") example_data["type"] = "quotation" @@ -125,7 +125,7 @@ def process_exemple_template( if len(transcription) > 0: example_data["roman"] = clean_node(wxr, None, transcription) if len(source) > 0: - example_data["source"] = clean_node(wxr, None, source) + example_data["ref"] = clean_node(wxr, None, source) example_data["type"] = "quotation" if "text" in example_data: gloss_data["examples"].append(example_data) From 
a7815f9e4e1be70a216b250a754281cfba8003ce Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 10 Oct 2023 11:38:47 +0800 Subject: [PATCH 4/6] Add Chinese Wiktionary JSON schema --- json_schema/zh.json | 320 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 320 insertions(+) create mode 100644 json_schema/zh.json diff --git a/json_schema/zh.json b/json_schema/zh.json new file mode 100644 index 00000000..2cdf8fab --- /dev/null +++ b/json_schema/zh.json @@ -0,0 +1,320 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://kaikki.org/zh.json", + "title": "Chinese Wiktionary", + "description": "JSON schema of the Chinese Wiktionary extractor", + "type": "object", + "properties": { + "lang": { + "description": "Localized language name of the word", + "type": "string" + }, + "lang_code": { + "description": "ISO 639-1 language code", + "type": "string" + }, + "word": { + "description": "word string", + "type": "string" + }, + "pos": { + "description": "Part of speech type", + "type": "string" + }, + "etymology_text": { + "type": "string" + }, + "senses": { + "description": "Sense list", + "type": "array", + "items": { + "$ref": "#/$defs/sense" + } + }, + "forms": { + "description": "Inflection forms list", + "type": "array", + "items": { + "$ref": "#/$defs/form" + } + }, + "sounds": { + "type": "array", + "items": { + "$ref": "#/$defs/sound" + } + }, + "translations": { + "type": "array", + "items": { + "$ref": "#/$defs/translation" + } + }, + "synonyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "hyponyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "hypernyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "holonyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "meronyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "derived": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + }
+ }, + "troponyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "paronyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "related": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "abbreviation": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "proverbs": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "antonyms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "coordinate_terms": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "various": { + "type": "array", + "items": { + "$ref": "#/$defs/linkage" + } + }, + "title": { + "description": "Redirect page source title", + "type": "string" + }, + "redirect": { + "description": "Redirect page target title", + "type": "string" + }, + "categories": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "$defs": { + "sense": { + "type": "object", + "properties": { + "glosses": { + "type": "array", + "items": { + "type": "string" + } + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "categories": { + "type": "array", + "items": { + "type": "string" + } + }, + "examples": { + "type": "array", + "items": { + "$ref": "#/$defs/example" + } + } + } + }, + "example": { + "type": "object", + "properties": { + "texts": { + "description": "Example usage sentences, some might have both Simplified and Traditional Chinese forms", + "type": "array", + "items": { + "type": "string" + } + }, + "translation": { + "description": "Chinese translation of the example sentence", + "type": "string" + }, + "roman": { + "description": "Romanization of the example sentence", + "type": "string" + }, + "ref": { + "description": "Source of the sentence, like book title and page number", + "type": "string" + }, + "type": { + "description": "This value is 'quotation' if 'ref' exists", + "type": "string", + "enum": [ +
"example", + "quotation" + ] + } + } + }, + "form": { + "type": "object", + "properties": { + "form": { + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "source": { + "type": "string" + }, + "ruby": { + "description": "Japanese Kanji and furigana", + "type": "array", + "items": { + "type": "array", + "items": { + "type": "string" + } + } + } + } + }, + "sound": { + "type": "object", + "properties": { + "zh-pron": { + "description": "Chinese word pronunciation", + "type": "string" + }, + "ipa": { + "description": "International Phonetic Alphabet", + "type": "string" + }, + "audio": { + "description": "Audio file name", + "type": "string" + }, + "wav_url": { + "type": "string" + }, + "ogg_url": { + "type": "string" + }, + "mp3_url": { + "type": "string" + } + } + }, + "translation": { + "type": "object", + "properties": { + "code": { + "description": "ISO 639-1 code of the translation term", + "type": "string" + }, + "lang": { + "description": "Transation language name", + "type": "string" + }, + "word": { + "description": "Translation term", + "type": "string" + }, + "sense": { + "description": "Translation gloss", + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "roman": { + "type": "string" + } + } + }, + "linkage": { + "type": "object", + "properties": { + "word": { + "type": "string" + }, + "tags": { + "type": "array", + "items": { + "type": "string" + } + }, + "roman": { + "type": "string" + }, + "language_variant": { + "description": "Chinese character variant", + "type": "string", + "enum": ["zh-Hant", "zh-Hans"] + } + } + } + } +} From 10430942fd0721091b796b8edfa25766ffd5c3f8 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 10 Oct 2023 11:39:11 +0800 Subject: [PATCH 5/6] Change Chinese Wiktionary example data "type" value from "quote" to "quotation" --- tests/test_zh_example.py | 8 ++++++-- wiktextract/extractor/zh/example.py | 6 +++--- 2 files changed, 9 
insertions(+), 5 deletions(-) diff --git a/tests/test_zh_example.py b/tests/test_zh_example.py index caad8362..433b4cab 100644 --- a/tests/test_zh_example.py +++ b/tests/test_zh_example.py @@ -34,7 +34,11 @@ def test_example_list(self) -> None: self.assertEqual( sense_data.get("examples"), [ - {"ref": "ref text", "text": "example text", "type": "quote"}, + { + "ref": "ref text", + "text": "example text", + "type": "quotation", + }, ], ) @@ -57,7 +61,7 @@ def test_quote_example(self, mock_clean_node) -> None: "ref": "ref text", "text": "quote text", "translation": "translation text", - "type": "quote", + "type": "quotation", }, ], ) diff --git a/wiktextract/extractor/zh/example.py b/wiktextract/extractor/zh/example.py index f90e0f57..0ba257c7 100644 --- a/wiktextract/extractor/zh/example.py +++ b/wiktextract/extractor/zh/example.py @@ -56,7 +56,7 @@ def extract_example_list( isinstance(child_node, WikiNode) and child_node.kind == NodeKind.LIST ): - example_data["type"] = "quote" + example_data["type"] = "quotation" example_data["ref"] = clean_node(wxr, None, node.children[:index]) example_data["text"] = clean_node( wxr, None, child_node.children[0].children @@ -69,7 +69,7 @@ def extract_quote_templates( """ Process template `quote-book` and "RQ:*". 
""" - example_data["type"] = "quote" + example_data["type"] = "quotation" expanded_text = clean_node(wxr, None, node) for line_num, expanded_line in enumerate(expanded_text.splitlines()): if line_num == 0: @@ -128,7 +128,7 @@ def extract_template_zh_usex( example_data["roman"] = expanded_line elif expanded_line.startswith("來自:"): example_data["ref"] = expanded_line[3:] - example_data["type"] = "quote" + example_data["type"] = "quotation" else: example_data["translation"] = expanded_line From 095781cb5bda8fb54344c2fdc65bbed131b34725 Mon Sep 17 00:00:00 2001 From: xxyzz Date: Tue, 10 Oct 2023 13:48:31 +0800 Subject: [PATCH 6/6] language code could be null in Chinese Wiktionary JSON --- json_schema/zh.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/json_schema/zh.json b/json_schema/zh.json index 2cdf8fab..429915d8 100644 --- a/json_schema/zh.json +++ b/json_schema/zh.json @@ -11,7 +11,7 @@ }, "lang_code": { "description": "ISO 639-1 language code", - "type": "string" + "type": ["string", "null"] }, "word": { "description": "word string", @@ -269,7 +269,7 @@ "properties": { "code": { "description": "ISO 639-1 code of the translation term", - "type": "string" + "type": ["string", "null"] }, "lang": { "description": "Transation language name",