From 88a79b247ac5ca8a8a481bc1890f7fd0a2e93d33 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Fri, 1 Dec 2023 08:56:23 +0100 Subject: [PATCH] Update typings and es.json for Spanish Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. --- json_schema/es.json | 19 +++++++++++++++++++ src/wiktextract/extractor/es/models.py | 25 +------------------------ src/wiktextract/extractor/es/page.py | 12 ++++++------ 3 files changed, 26 insertions(+), 30 deletions(-) diff --git a/json_schema/es.json b/json_schema/es.json index 5e44e361..748bdb6a 100644 --- a/json_schema/es.json +++ b/json_schema/es.json @@ -19,6 +19,19 @@ "title": "Glosses", "type": "array" }, + "senseid": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Sense number used in Wiktionary", + "title": "Senseid" + }, "subsenses": { "default": [], "description": "List of subsenses", @@ -80,6 +93,12 @@ "title": "Pos", "type": "string" }, + "pos_title": { + "default": null, + "description": "Original POS title", + "title": "Pos Title", + "type": "string" + }, "senses": { "anyOf": [ { diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index 331fce60..7c8911dc 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -2,13 +2,7 @@ import logging from typing import Optional -from pydantic import ( - BaseModel, - ConfigDict, - Field, - ValidationError, - model_validator, -) +from pydantic import BaseModel, ConfigDict, Field, model_validator from pydantic.json_schema import GenerateJsonSchema from wiktextract.wxr_context import WiktextractContext @@ -30,23 +24,6 @@ def debug( class BaseModelWrap(BaseModel): model_config = ConfigDict(validate_assignment=True) - # def update(self, data: dict): - # for k, v in data.items(): - # setattr(self, k, v) - # return self - - # def get(self, key: str, _=None): - # return getattr(self, key) - - # def __getitem__(self, item): - # return getattr(self, item) - - # def __setitem__(self, item, value): - # try: - # setattr(self, item, value) - # except ValidationError: - # pass - class LoggingExtraFieldsModel(BaseModelWrap): @model_validator(mode="before") diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index b612671b..1ac75dde 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -25,8 +25,8 @@ def parse_section( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: List[WordEntry], + base_data: WordEntry, level_node: WikiNode, ) -> None: # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura @@ -53,13 +53,14 @@ def parse_section( def process_pos_block( wxr: WiktextractContext, - page_data: List[Dict], - base_data: Dict, + page_data: List[WordEntry], + base_data: WordEntry, pos_level_node: WikiNode, pos_template_name: str, pos_title: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"] + page_data.append(copy.deepcopy(base_data)) page_data[-1].pos = pos_type page_data[-1].pos_title = pos_title @@ -91,7 +92,7 @@ def process_pos_block( def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, str]]: +) -> List[Dict[str, any]]: if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") # Pass current wiktextractcontext to pydantic for more better logging @@ -128,7 +129,6 @@ def parse_page( lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title ) base_data.categories.extend(categories["categories"]) - # page_data.append(copy.deepcopy(base_data)) for level3_node in level2_node.find_child(NodeKind.LEVEL3): parse_section(wxr, page_data, base_data, level3_node)