From b334893013939c36b572ef7bb1fcfb7f72465072 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Mon, 9 Oct 2023 17:16:13 +0800
Subject: [PATCH 1/6] Add French Wiktionary JSON schema

JSON schema doc: https://json-schema.org/learn/getting-started-step-by-step
---
 json_schema/fr.json | 310 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 310 insertions(+)
 create mode 100644 json_schema/fr.json

diff --git a/json_schema/fr.json b/json_schema/fr.json
new file mode 100644
index 00000000..04c355d0
--- /dev/null
+++ b/json_schema/fr.json
@@ -0,0 +1,310 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://kaikki.org/fr.json",
+  "title": "French Wiktionary",
+  "description": "JSON schema of the French Wiktionary extractor",
+  "type": "object",
+  "properties": {
+    "lang": {
+      "description": "Localized language name of the word",
+      "type": "string"
+    },
+    "lang_code": {
+      "description": "ISO 639-1 language code",
+      "type": "string"
+    },
+    "word": {
+      "description": "word string",
+      "type": "string"
+    },
+    "pos": {
+      "description": "Part of speech type",
+      "type": "string"
+    },
+    "pos_title": {
+      "description": "Original POS title for matching etymology texts",
+      "type": "string"
+    },
+    "etymology_texts": {
+      "description": "Etymology list",
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "senses": {
+      "description": "Sense list",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/sense"
+      }
+    },
+    "forms": {
+      "description": "Inflection forms list",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/form"
+      }
+    },
+    "sounds": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/sound"
+      }
+    },
+    "translations": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/translation"
+      }
+    },
+    "synonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "hyponyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "hypernyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "holonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "meronyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "derived": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "troponyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "paronyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "related": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "abbreviation": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "proverbs": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "title": {
+      "description": "Redirect page source title",
+      "type": "string"
+    },
+    "redirect": {
+      "description": "Redirect page target title",
+      "type": "string"
+    }
+  },
+  "$defs": {
+    "sense": {
+      "type": "object",
+      "properties": {
+        "glosses": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "categories": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "examples": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/example"
+          }
+        }
+      }
+    },
+    "example": {
+      "type": "object",
+      "properties": {
+        "text": {
+          "description": "Example usage sentence",
+          "type": "string"
+        },
+        "translation": {
+          "description": "French translation of the example sentence",
+          "type": "string"
+        },
+        "roman": {
+          "description": "Romanization of the example sentence",
+          "type": "string"
+        },
+        "source": {
+          "description": "Source of the sentence, like book title and page number",
+          "type": "string"
+        },
+        "type": {
+          "description": "This value is 'quotation' if 'source' exists",
+          "type": "string",
+          "enum": [
+            "example",
+            "quotation"
+          ]
+        }
+      }
+    },
+    "form": {
+      "type": "object",
+      "properties": {
+        "form": {
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "ipas": {
+          "description": "has more than one ipa",
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "ipa": {
+          "description": "only has one ipa",
+          "type": "string"
+        },
+        "source": {
+          "description": "form line template name",
+          "type": "string"
+        }
+      }
+    },
+    "sound": {
+      "type": "object",
+      "properties": {
+        "zh-pron": {
+          "description": "Chinese word pronunciation",
+          "type": "string"
+        },
+        "ipa": {
+          "description": "International Phonetic Alphabet",
+          "type": "string"
+        },
+        "audio": {
+          "description": "Audio file name",
+          "type": "string"
+        },
+        "wav_url": {
+          "type": "string"
+        },
+        "ogg_url": {
+          "type": "string"
+        },
+        "mp3_url": {
+          "type": "string"
+        }
+      }
+    },
+    "translation": {
+      "type": "object",
+      "properties": {
+        "code": {
+          "description": "ISO 639-1 code of the translation term",
+          "type": "string"
+        },
+        "lang": {
+          "description": "Translation language name",
+          "type": "string"
+        },
+        "word": {
+          "description": "Translation term",
+          "type": "string"
+        },
+        "sense": {
+          "description": "Translation gloss",
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "roman": {
+          "type": "string"
+        },
+        "traditional_writing": {
+          "description": "Alternative writing for Chinese, Korean and Mongolian",
+          "type": "string"
+        }
+      }
+    },
+    "linkage": {
+      "type": "object",
+      "properties": {
+        "word": {
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "roman": {
+          "type": "string"
+        },
+        "alt": {
+          "description": "Alternative form",
+          "type": "string"
+        },
+        "translation": {
+          "description": "French translation",
+          "type": "string"
+        }
+      }
+    }
+  }
+}

From 1a64e45e9ffc8050a914f19324b2724453e9cef1 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Mon, 9 Oct 2023 17:17:34 +0800
Subject: [PATCH 2/6] Add validate JSON script

---
 json_schema/validate.py | 36 ++++++++++++++++++++++++++++++++++++
 pyproject.toml          | 10 +++++++++-
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 json_schema/validate.py

diff --git a/json_schema/validate.py b/json_schema/validate.py
new file mode 100644
index 00000000..1fb53e46
--- /dev/null
+++ b/json_schema/validate.py
@@ -0,0 +1,36 @@
+import argparse
+import json
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+from pathlib import Path
+
+
+def worker(line, schema={}):  # check one JSONL line against the given schema
+    from jsonschema import validate  # third-party; imported inside the function
+
+    validate(instance=json.loads(line), schema=schema)  # result not returned
+
+
+def main():
+    """
+    Validate extracted JSONL file with JSON schema.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("jsonl_path", type=Path)  # file with one JSON value per line
+    parser.add_argument("schema_path", type=Path)  # JSON schema file
+    args = parser.parse_args()
+
+    with (
+        args.jsonl_path.open(encoding="utf-8") as jsonl_f,
+        args.schema_path.open(encoding="utf-8") as schema_f,
+        ProcessPoolExecutor() as executor,
+    ):
+        schema = json.load(schema_f)
+        for _ in executor.map(  # each line of jsonl_f is passed to worker()
+            partial(worker, schema=schema), jsonl_f, chunksize=1000  # 1000 lines per task
+        ):
+            pass  # draining the iterator surfaces any exception raised in a worker
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 8556bbcd..32afc44b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -39,6 +39,7 @@ dependencies = [
 [project.optional-dependencies]
 dev = [
     "black",
+    "jsonschema",
     "mypy",
     "nose2[coverage_plugin]",
     "ruff",
@@ -55,7 +56,14 @@ homepage = "https://github.com/tatuylonen/wiktextract"
 zip-safe = false
 
 [tool.setuptools.packages.find]
-exclude = ["languages", "overrides", "tests", "tools", "usertools"]
+exclude = [
+    "languages",
+    "overrides",
+    "tests",
+    "tools",
+    "usertools",
+    "json_schema"
+]
 
 [tool.setuptools.package-data]
 wiktextract = [

From fd116222f7e8201aab63060bf1031e996304848e Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Tue, 10 Oct 2023 09:56:32 +0800
Subject: [PATCH 3/6] Change French JSON example "source" key to "ref"

---
 json_schema/fr.json               | 8 +++++++-
 tests/test_fr_gloss.py            | 4 ++--
 wiktextract/extractor/fr/gloss.py | 4 ++--
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/json_schema/fr.json b/json_schema/fr.json
index 04c355d0..1652ddd1 100644
--- a/json_schema/fr.json
+++ b/json_schema/fr.json
@@ -131,6 +131,12 @@
     "redirect": {
       "description": "Redirect page target title",
       "type": "string"
+    },
+    "categories": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
     }
   },
   "$defs": {
@@ -178,7 +184,7 @@
           "description": "Romanization of the example sentence",
           "type": "string"
         },
-        "source": {
+        "ref": {
           "description": "Source of the sentence, like book title and page number",
           "type": "string"
         },
diff --git a/tests/test_fr_gloss.py b/tests/test_fr_gloss.py
index 43744e5b..5f5d11d6 100644
--- a/tests/test_fr_gloss.py
+++ b/tests/test_fr_gloss.py
@@ -73,7 +73,7 @@ def test_example_template(self):
                                     "text": "text",
                                     "translation": "translation",
                                     "roman": "roman",
-                                    "source": "source",
+                                    "ref": "source",
                                     "type": "quotation",
                                 }
                             ],
@@ -104,7 +104,7 @@ def test_example_source_template(self, mock_node_to_html):
                             "examples": [
                                 {
                                     "text": "example",
-                                    "source": "source_title",
+                                    "ref": "source_title",
                                     "type": "quotation",
                                 }
                             ],
diff --git a/wiktextract/extractor/fr/gloss.py b/wiktextract/extractor/fr/gloss.py
index 3cba7635..bcb03994 100644
--- a/wiktextract/extractor/fr/gloss.py
+++ b/wiktextract/extractor/fr/gloss.py
@@ -90,7 +90,7 @@ def extract_examples(
             example_data = {"type": "example"}
             example_data["text"] = clean_node(wxr, None, example_nodes)
             if source_template is not None:
-                example_data["source"] = clean_node(
+                example_data["ref"] = clean_node(
                     wxr, None, source_template
                 ).strip("— ()")
                 example_data["type"] = "quotation"
@@ -125,7 +125,7 @@ def process_exemple_template(
     if len(transcription) > 0:
         example_data["roman"] = clean_node(wxr, None, transcription)
     if len(source) > 0:
-        example_data["source"] = clean_node(wxr, None, source)
+        example_data["ref"] = clean_node(wxr, None, source)
         example_data["type"] = "quotation"
     if "text" in example_data:
         gloss_data["examples"].append(example_data)

From a7815f9e4e1be70a216b250a754281cfba8003ce Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Tue, 10 Oct 2023 11:38:47 +0800
Subject: [PATCH 4/6] Add Chinese Wiktionary JSON schema

---
 json_schema/zh.json | 320 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 320 insertions(+)
 create mode 100644 json_schema/zh.json

diff --git a/json_schema/zh.json b/json_schema/zh.json
new file mode 100644
index 00000000..2cdf8fab
--- /dev/null
+++ b/json_schema/zh.json
@@ -0,0 +1,320 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://kaikki.org/zh.json",
+  "title": "Chinese Wiktionary",
+  "description": "JSON schema of the Chinese Wiktionary extractor",
+  "type": "object",
+  "properties": {
+    "lang": {
+      "description": "Localized language name of the word",
+      "type": "string"
+    },
+    "lang_code": {
+      "description": "ISO 639-1 language code",
+      "type": "string"
+    },
+    "word": {
+      "description": "word string",
+      "type": "string"
+    },
+    "pos": {
+      "description": "Part of speech type",
+      "type": "string"
+    },
+    "etymology_text": {
+      "type": "string"
+    },
+    "senses": {
+      "description": "Sense list",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/sense"
+      }
+    },
+    "forms": {
+      "description": "Inflection forms list",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/form"
+      }
+    },
+    "sounds": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/sound"
+      }
+    },
+    "translations": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/translation"
+      }
+    },
+    "synonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "hyponyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "hypernyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "holonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "meronyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "derived": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "troponyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "paronyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "related": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "abbreviation": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "proverbs": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "antonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "coordinate_terms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "various": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "title": {
+      "description": "Redirect page source title",
+      "type": "string"
+    },
+    "redirect": {
+      "description": "Redirect page target title",
+      "type": "string"
+    },
+    "categories": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    }
+  },
+  "$defs": {
+    "sense": {
+      "type": "object",
+      "properties": {
+        "glosses": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "categories": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "examples": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/example"
+          }
+        }
+      }
+    },
+    "example": {
+      "type": "object",
+      "properties": {
+        "texts": {
+          "description": "Example usage sentences, some might have both Simplified and Traditional Chinese forms",
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "translation": {
+          "description": "Chinese translation of the example sentence",
+          "type": "string"
+        },
+        "roman": {
+          "description": "Romanization of the example sentence",
+          "type": "string"
+        },
+        "ref": {
+          "description": "Source of the sentence, like book title and page number",
+          "type": "string"
+        },
+        "type": {
+          "description": "This value is 'quotation' if 'ref' exists",
+          "type": "string",
+          "enum": [
+            "example",
+            "quotation"
+          ]
+        }
+      }
+    },
+    "form": {
+      "type": "object",
+      "properties": {
+        "form": {
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "source": {
+          "type": "string"
+        },
+        "ruby": {
+          "description": "Japanese Kanji and furigana",
+          "type": "array",
+          "items": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          }
+        }
+      }
+    },
+    "sound": {
+      "type": "object",
+      "properties": {
+        "zh-pron": {
+          "description": "Chinese word pronunciation",
+          "type": "string"
+        },
+        "ipa": {
+          "description": "International Phonetic Alphabet",
+          "type": "string"
+        },
+        "audio": {
+          "description": "Audio file name",
+          "type": "string"
+        },
+        "wav_url": {
+          "type": "string"
+        },
+        "ogg_url": {
+          "type": "string"
+        },
+        "mp3_url": {
+          "type": "string"
+        }
+      }
+    },
+    "translation": {
+      "type": "object",
+      "properties": {
+        "code": {
+          "description": "ISO 639-1 code of the translation term",
+          "type": "string"
+        },
+        "lang": {
+          "description": "Transation language name",
+          "type": "string"
+        },
+        "word": {
+          "description": "Translation term",
+          "type": "string"
+        },
+        "sense": {
+          "description": "Translation gloss",
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "roman": {
+          "type": "string"
+        }
+      }
+    },
+    "linkage": {
+      "type": "object",
+      "properties": {
+        "word": {
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "roman": {
+          "type": "string"
+        },
+        "language_variant": {
+          "description": "Chinese character variant",
+          "type": "string",
+          "enum": ["zh-Hant", "zh-Hans"]
+        }
+      }
+    }
+  }
+}

From 10430942fd0721091b796b8edfa25766ffd5c3f8 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Tue, 10 Oct 2023 11:39:11 +0800
Subject: [PATCH 5/6] Change Chinese Wiktionary example data "type" value

from "quote" to "quotation"
---
 tests/test_zh_example.py            | 8 ++++++--
 wiktextract/extractor/zh/example.py | 6 +++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/test_zh_example.py b/tests/test_zh_example.py
index caad8362..433b4cab 100644
--- a/tests/test_zh_example.py
+++ b/tests/test_zh_example.py
@@ -34,7 +34,11 @@ def test_example_list(self) -> None:
         self.assertEqual(
             sense_data.get("examples"),
             [
-                {"ref": "ref text", "text": "example text", "type": "quote"},
+                {
+                    "ref": "ref text",
+                    "text": "example text",
+                    "type": "quotation",
+                },
             ],
         )
 
@@ -57,7 +61,7 @@ def test_quote_example(self, mock_clean_node) -> None:
                     "ref": "ref text",
                     "text": "quote text",
                     "translation": "translation text",
-                    "type": "quote",
+                    "type": "quotation",
                 },
             ],
         )
diff --git a/wiktextract/extractor/zh/example.py b/wiktextract/extractor/zh/example.py
index f90e0f57..0ba257c7 100644
--- a/wiktextract/extractor/zh/example.py
+++ b/wiktextract/extractor/zh/example.py
@@ -56,7 +56,7 @@ def extract_example_list(
             isinstance(child_node, WikiNode)
             and child_node.kind == NodeKind.LIST
         ):
-            example_data["type"] = "quote"
+            example_data["type"] = "quotation"
             example_data["ref"] = clean_node(wxr, None, node.children[:index])
             example_data["text"] = clean_node(
                 wxr, None, child_node.children[0].children
@@ -69,7 +69,7 @@ def extract_quote_templates(
     """
     Process template `quote-book` and "RQ:*".
     """
-    example_data["type"] = "quote"
+    example_data["type"] = "quotation"
     expanded_text = clean_node(wxr, None, node)
     for line_num, expanded_line in enumerate(expanded_text.splitlines()):
         if line_num == 0:
@@ -128,7 +128,7 @@ def extract_template_zh_usex(
             example_data["roman"] = expanded_line
         elif expanded_line.startswith("來自："):
             example_data["ref"] = expanded_line[3:]
-            example_data["type"] = "quote"
+            example_data["type"] = "quotation"
         else:
             example_data["translation"] = expanded_line
 

From 095781cb5bda8fb54344c2fdc65bbed131b34725 Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Tue, 10 Oct 2023 13:48:31 +0800
Subject: [PATCH 6/6] language code could be null in Chinese Wiktionary JSON

---
 json_schema/zh.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/json_schema/zh.json b/json_schema/zh.json
index 2cdf8fab..429915d8 100644
--- a/json_schema/zh.json
+++ b/json_schema/zh.json
@@ -11,7 +11,7 @@
     },
     "lang_code": {
       "description": "ISO 639-1 language code",
-      "type": "string"
+      "type": ["string", "null"]
     },
     "word": {
       "description": "word string",
@@ -269,7 +269,7 @@
       "properties": {
         "code": {
           "description": "ISO 639-1 code of the translation term",
-          "type": "string"
+          "type": ["string", "null"]
         },
         "lang": {
           "description": "Transation language name",