Merge pull request #356 from xxyzz/json_schema

Add French and Chinese Wiktionary JSON schema
tatuylonen · Oct 10, 2023 · b4a54f7 · b4a54f7
2 parents 2d1a893 + 095781c
commit b4a54f7
Show file tree

Hide file tree

Showing 8 changed files with 694 additions and 10 deletions.
diff --git a/json_schema/fr.json b/json_schema/fr.json
@@ -0,0 +1,316 @@
+{
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "$id": "https://kaikki.org/fr.json",
+  "title": "French Wiktionary",
+  "description": "JSON schema of the French Wiktionary extractor",
+  "type": "object",
+  "properties": {
+    "lang": {
+      "description": "Localized langauge name of the word",
+      "type": "string"
+    },
+    "lang_code": {
+      "description": "ISO 639-1 language code",
+      "type": "string"
+    },
+    "word": {
+      "description": "word string",
+      "type": "string"
+    },
+    "pos": {
+      "description": "Part of speech type",
+      "type": "string"
+    },
+    "pos_title": {
+      "description": "Original POS title for matching etymology texts",
+      "type": "string"
+    },
+    "etymology_texts": {
+      "description": "Etymology list",
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "senses": {
+      "description": "Sense list",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/sense"
+      }
+    },
+    "forms": {
+      "description": "Inflection forms list",
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/form"
+      }
+    },
+    "sounds": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/sound"
+      }
+    },
+    "translations": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/translation"
+      }
+    },
+    "synonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "hyponyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "hypernyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "holonyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "meronyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "derived": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "troponyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "paronyms": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "related": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "abbreviation": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "proverbs": {
+      "type": "array",
+      "items": {
+        "$ref": "#/$defs/linkage"
+      }
+    },
+    "title": {
+      "description": "Redirect page source title",
+      "type": "string"
+    },
+    "redirect": {
+      "description": "Redirect page target title",
+      "type": "string"
+    },
+    "categories": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    }
+  },
+  "$defs": {
+    "sense": {
+      "type": "object",
+      "properties": {
+        "glosses": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "categories": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "examples": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/example"
+          }
+        }
+      }
+    },
+    "example": {
+      "type": "object",
+      "properties": {
+        "text": {
+          "description": "Example usage sentence",
+          "type": "string"
+        },
+        "translation": {
+          "description": "French translation of the example sentence",
+          "type": "string"
+        },
+        "roman": {
+          "description": "Romanization of the example sentence",
+          "type": "string"
+        },
+        "ref": {
+          "description": "Source of the sentence, like book title and page number",
+          "type": "string"
+        },
+        "type": {
+          "description": "This value is 'quotation' if 'source' exists",
+          "type": "string",
+          "enum": [
+            "example",
+            "quotation"
+          ]
+        }
+      }
+    },
+    "form": {
+      "type": "object",
+      "properties": {
+        "form": {
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "ipas": {
+          "description": "has more than one ipa",
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "ipa": {
+          "description": "only has one ipa",
+          "type": "string"
+        },
+        "source": {
+          "description": "form line template name",
+          "type": "string"
+        }
+      }
+    },
+    "sound": {
+      "type": "object",
+      "properties": {
+        "zh-pron": {
+          "description": "Chinese word pronunciation",
+          "type": "string"
+        },
+        "ipa": {
+          "description": "International Phonetic Alphabet",
+          "type": "string"
+        },
+        "audio": {
+          "description": "Audio file name",
+          "type": "string"
+        },
+        "wav_url": {
+          "type": "string"
+        },
+        "ogg_url": {
+          "type": "string"
+        },
+        "mp3_url": {
+          "type": "string"
+        }
+      }
+    },
+    "translation": {
+      "type": "object",
+      "properties": {
+        "code": {
+          "description": "ISO 639-1 code of the translation term",
+          "type": "string"
+        },
+        "lang": {
+          "description": "Transation language name",
+          "type": "string"
+        },
+        "word": {
+          "description": "Translation term",
+          "type": "string"
+        },
+        "sense": {
+          "description": "Translation gloss",
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "roman": {
+          "type": "string"
+        },
+        "traditional_writing": {
+          "description": "Alternative writting for Chinese, Korean and Mongolian",
+          "type": "string"
+        }
+      }
+    },
+    "linkage": {
+      "type": "object",
+      "properties": {
+        "word": {
+          "type": "string"
+        },
+        "tags": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "roman": {
+          "type": "string"
+        },
+        "alt": {
+          "description": "ALternative form",
+          "type": "string"
+        },
+        "translation": {
+          "description": "French translation",
+          "type": "string"
+        }
+      }
+    }
+  }
+}
diff --git a/json_schema/validate.py b/json_schema/validate.py
@@ -0,0 +1,36 @@
+import argparse
+import json
+from concurrent.futures import ProcessPoolExecutor
+from functools import partial
+from pathlib import Path
+
+
+def worker(line, schema={}):
+    from jsonschema import validate
+
+    validate(instance=json.loads(line), schema=schema)
+
+
+def main():
+    """
+    Validate extracted JSONL file with JSON schema.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("jsonl_path", type=Path)
+    parser.add_argument("schema_path", type=Path)
+    args = parser.parse_args()
+
+    with (
+        args.jsonl_path.open(encoding="utf-8") as jsonl_f,
+        args.schema_path.open(encoding="utf-8") as schema_f,
+        ProcessPoolExecutor() as executor,
+    ):
+        schema = json.load(schema_f)
+        for _ in executor.map(
+            partial(worker, schema=schema), jsonl_f, chunksize=1000
+        ):
+            pass
+
+
+if __name__ == "__main__":
+    main()