Add pydantic models and parse_page for Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Oct 31, 2023
1 parent a64e729 commit e110791
Showing 4 changed files with 298 additions and 0 deletions.
111 changes: 111 additions & 0 deletions json_schema/es.json
@@ -0,0 +1,111 @@
{
"$defs": {
"Sense": {
"properties": {
"categories": {
"default": [],
"description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"glosses": {
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
"items": {
"type": "string"
},
"title": "Glosses",
"type": "array"
},
"subsenses": {
"default": [],
"description": "List of subsenses",
"items": {
"$ref": "#/$defs/Sense"
},
"title": "Subsenses",
"type": "array"
},
"tags": {
"default": [],
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
"items": {
"type": "string"
},
"title": "Tags",
"type": "array"
}
},
"required": [
"glosses"
],
"title": "Sense",
"type": "object"
}
},
"$id": "https://kaikki.org/es.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
"properties": {
"categories": {
"default": [],
"description": "list of non-disambiguated categories for the word",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"lang_code": {
"description": "Wiktionary language code",
"examples": [
"es"
],
"title": "Lang Code",
"type": "string"
},
"lang_name": {
"description": "Localized language name of the word",
"examples": [
"español"
],
"title": "Lang Name",
"type": "string"
},
"pos": {
"default": null,
"description": "Part of speech type",
"title": "Pos",
"type": "string"
},
"senses": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Sense"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"title": "Senses"
},
"word": {
"description": "word string",
"title": "Word",
"type": "string"
}
},
"required": [
"word",
"lang_code",
"lang_name"
],
"title": "Spanish Wiktionary",
"type": "object"
}
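
For a quick sanity check of the generated file, one can validate a sample entry against it. This is a sketch, not part of the commit: it assumes the third-party jsonschema package, and the entry itself is hypothetical.

import json

from jsonschema import validate

with open("json_schema/es.json", encoding="utf-8") as f:
    schema = json.load(f)

# Hypothetical minimal entry; "word", "lang_code" and "lang_name" are
# the required properties.
entry = {
    "word": "perro",
    "lang_code": "es",
    "lang_name": "español",
    "senses": [
        {"glosses": ["Mamífero doméstico de la familia de los cánidos."]}
    ],
}

validate(instance=entry, schema=schema)  # raises ValidationError on failure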
1 change: 1 addition & 0 deletions pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
"importlib_resources; python_version < '3.10'",
"levenshtein",
"nltk",
"pydantic",
"wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git",
]

113 changes: 113 additions & 0 deletions src/wiktextract/extractor/es/models.py
@@ -0,0 +1,113 @@
import json
import logging
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic.json_schema import GenerateJsonSchema

from wiktextract.wxr_context import WiktextractContext


class PydanticLogger:
wxr: Optional[WiktextractContext] = None

@classmethod
def debug(
cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted"
):
if cls.wxr:
cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid)
else:
logging.debug(msg)


class BaseModelWrap(BaseModel):
    model_config = ConfigDict(extra="ignore", validate_assignment=True)

    def update(self, data: dict):
        # Merge `data` into the fields that are currently set, re-validate
        # the merged dict, and write the validated values back.
        update = self.model_dump(exclude_defaults=True, exclude_none=True)
        update.update(data)
        for k, v in (
            self.model_validate(update)
            .model_dump(exclude_defaults=True, exclude_none=True)
            .items()
        ):
            setattr(self, k, v)
        return self


class LoggingExtraFieldsModel(BaseModelWrap):
    @model_validator(mode="before")
    @classmethod
    def log_extra_fields(cls, values):
        # Report input fields that are not declared on the model, so data
        # silently dropped by extra="ignore" still shows up in the logs.
        extra_fields = {
            name: str(value)
            for name, value in values.items()
            if name not in cls.model_fields
        }
        if extra_fields:
            PydanticLogger.debug(
                msg=f"Pydantic - Got extra fields in {cls.__name__}: {extra_fields}",
                sortid="wiktextract/extractor/es/pydantic/extra_fields/33",
            )
        return values


class Sense(LoggingExtraFieldsModel):
glosses: list[str] = Field(
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging."
)
    tags: list[str] = Field(
        default=[],
        description="list of qualifier and tag strings for the word sense",
    )
categories: list[str] = Field(
default=[],
description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
)
# examples: list[SenseExample] = []
subsenses: list["Sense"] = Field(
default=[], description="List of subsenses"
)


class WordEntry(LoggingExtraFieldsModel):
"""WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""

word: str = Field(description="word string")
    pos: Optional[str] = Field(
        default=None, description="Part of speech type"
    )
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
lang_name: str = Field(
description="Localized language name of the word", examples=["español"]
)
senses: Optional[list[Sense]] = []
categories: list[str] = Field(
default=[],
description="list of non-disambiguated categories for the word",
)


if __name__ == "__main__":

class JsonSchemaGenerator(GenerateJsonSchema):
def generate(self, schema, mode="validation"):
json_schema = super().generate(schema, mode=mode)
json_schema["title"] = "Spanish Wiktionary"
json_schema["$id"] = "https://kaikki.org/es.json"
json_schema["$schema"] = self.schema_dialect
return json_schema

with open("json_schema/es.json", "w") as f:
json.dump(
WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
f,
indent=2,
ensure_ascii=False,
sort_keys=True,
)
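
A usage sketch for these models (illustrative, not part of the commit; all field values are made up). Passing an unknown field exercises the log_extra_fields validator, and update() shows the merge-and-revalidate round trip:

import logging

from wiktextract.extractor.es.models import Sense, WordEntry

logging.basicConfig(level=logging.DEBUG)

# "bogus_field" is ignored (extra="ignore") but reported via
# PydanticLogger, which falls back to logging.debug while no
# WiktextractContext is attached.
sense = Sense(glosses=["ejemplo"], bogus_field="dropped")

entry = WordEntry(word="perro", lang_code="es", lang_name="español")
entry.update({"senses": [sense.model_dump(exclude_defaults=True)]})
print(entry.model_dump(exclude_defaults=True))
# {'word': 'perro', 'lang_code': 'es', 'lang_name': 'español',
#  'senses': [{'glosses': ['ejemplo']}]}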
73 changes: 73 additions & 0 deletions src/wiktextract/extractor/es/page.py
@@ -0,0 +1,73 @@
import copy
import logging
from collections import defaultdict
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.extractor.es.models import PydanticLogger, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES = set()

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES = set()

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = set()


def parse_section(
    wxr: WiktextractContext,
    page_data: List[Dict],
    base_data: Dict,
    level_node: WikiNode,
) -> None:
    # Placeholder: language subsections (etymology, part-of-speech
    # headings, etc.) are not extracted yet.
    pass


def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> List[Dict[str, str]]:
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")
    # Pass the current WiktextractContext to pydantic for better logging
PydanticLogger.wxr = wxr

wxr.config.word = page_title
wxr.wtp.start_page(page_title)

# Parse the page, pre-expanding those templates that are likely to
# influence parsing
tree = wxr.wtp.parse(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
)

page_data: List[WordEntry] = []
for level2_node in tree.find_child(NodeKind.LEVEL2):
for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
# https://es.wiktionary.org/wiki/Plantilla:lengua
# https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
if subtitle_template.template_name == "lengua":
categories_and_links = defaultdict(list)
lang_code = subtitle_template.template_parameters.get(1)
lang_name = clean_node(
wxr, categories_and_links, subtitle_template
)
wxr.wtp.start_section(lang_name)
base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
)
base_data.update(categories_and_links)
page_data.append(copy.deepcopy(base_data))
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)

return [d.model_dump(exclude_defaults=True) for d in page_data]
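
To see the intended round trip, a hedged end-to-end sketch, modeled on how the wiktextract test suite typically builds a context; the page title, wikitext, and the "Español" template body are illustrative assumptions, and the exact constructor signatures should be checked against the installed versions:

from wikitextprocessor import Wtp
from wiktextract.config import WiktextractConfig
from wiktextract.extractor.es.page import parse_page
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="es", project="wiktionary"), WiktextractConfig()
)
# Minimal stand-in for Plantilla:lengua so the heading expands to a name.
wxr.wtp.add_page("Plantilla:lengua", 10, "Español")

print(parse_page(wxr, "perro", "=={{lengua|es}}==\n===Etimología===\n"))
# With parse_section still a stub, roughly:
# [{'word': 'perro', 'lang_code': 'es', 'lang_name': 'Español'}]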
