Update typings and es.json for Spanish Wiktionary

This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
tatuylonen · Dec 1, 2023 · 88a79b2
1 parent cbc62fe
commit 88a79b2
Show file tree

Hide file tree

Showing 3 changed files with 26 additions and 30 deletions.
diff --git a/json_schema/es.json b/json_schema/es.json
@@ -19,6 +19,19 @@
           "title": "Glosses",
           "type": "array"
         },
+        "senseid": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Sense number used in Wiktionary",
+          "title": "Senseid"
+        },
         "subsenses": {
           "default": [],
           "description": "List of subsenses",
@@ -80,6 +93,12 @@
       "title": "Pos",
       "type": "string"
     },
+    "pos_title": {
+      "default": null,
+      "description": "Original POS title",
+      "title": "Pos Title",
+      "type": "string"
+    },
     "senses": {
       "anyOf": [
         {

diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
@@ -2,13 +2,7 @@
 import logging
 from typing import Optional
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    ValidationError,
-    model_validator,
-)
+from pydantic import BaseModel, ConfigDict, Field, model_validator
 from pydantic.json_schema import GenerateJsonSchema
 
 from wiktextract.wxr_context import WiktextractContext
@@ -30,23 +24,6 @@ def debug(
 class BaseModelWrap(BaseModel):
     model_config = ConfigDict(validate_assignment=True)
 
-    # def update(self, data: dict):
-    #     for k, v in data.items():
-    #         setattr(self, k, v)
-    #     return self
-
-    # def get(self, key: str, _=None):
-    #     return getattr(self, key)
-
-    # def __getitem__(self, item):
-    #     return getattr(self, item)
-
-    # def __setitem__(self, item, value):
-    #     try:
-    #         setattr(self, item, value)
-    #     except ValidationError:
-    #         pass
-
 
 class LoggingExtraFieldsModel(BaseModelWrap):
     @model_validator(mode="before")

diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
@@ -25,8 +25,8 @@
 
 def parse_section(
     wxr: WiktextractContext,
-    page_data: List[Dict],
-    base_data: Dict,
+    page_data: List[WordEntry],
+    base_data: WordEntry,
     level_node: WikiNode,
 ) -> None:
     # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura
@@ -53,13 +53,14 @@ def parse_section(
 
 def process_pos_block(
     wxr: WiktextractContext,
-    page_data: List[Dict],
-    base_data: Dict,
+    page_data: List[WordEntry],
+    base_data: WordEntry,
     pos_level_node: WikiNode,
     pos_template_name: str,
     pos_title: str,
 ):
     pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"]
+
     page_data.append(copy.deepcopy(base_data))
     page_data[-1].pos = pos_type
     page_data[-1].pos_title = pos_title
@@ -91,7 +92,7 @@ def process_pos_block(
 
 def parse_page(
     wxr: WiktextractContext, page_title: str, page_text: str
-) -> List[Dict[str, str]]:
+) -> List[Dict[str, any]]:
     if wxr.config.verbose:
         logging.info(f"Parsing page: {page_title}")
         # Pass current wiktextractcontext to pydantic for more better logging
@@ -128,7 +129,6 @@ def parse_page(
                     lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
                 )
                 base_data.categories.extend(categories["categories"])
-                # page_data.append(copy.deepcopy(base_data))
                 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     parse_section(wxr, page_data, base_data, level3_node)