From cc2b3e907a79da22f0bb638c57332be9d012b19c Mon Sep 17 00:00:00 2001
From: Empiriker
Date: Tue, 31 Oct 2023 17:43:57 +0200
Subject: [PATCH 1/7] Add pydantic models and parse_page for Spanish Wiktionary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work is a contribution to the EWOK project, which receives funding
from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of
the "Investissements d'Avenir" program initiated and overseen by the
Agence Nationale de la Recherche (ANR) in France.
---
 json_schema/es.json                    | 111 ++++++++++++++++++++++++
 pyproject.toml                         |   1 +
 src/wiktextract/extractor/es/models.py | 113 +++++++++++++++++++++++++
 src/wiktextract/extractor/es/page.py   |  73 ++++++++++++++++
 4 files changed, 298 insertions(+)
 create mode 100644 json_schema/es.json
 create mode 100644 src/wiktextract/extractor/es/models.py
 create mode 100644 src/wiktextract/extractor/es/page.py

diff --git a/json_schema/es.json b/json_schema/es.json
new file mode 100644
index 00000000..5e44e361
--- /dev/null
+++ b/json_schema/es.json
@@ -0,0 +1,111 @@
+{
+  "$defs": {
+    "Sense": {
+      "properties": {
+        "categories": {
+          "default": [],
+          "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
+          "items": {
+            "type": "string"
+          },
+          "title": "Categories",
+          "type": "array"
+        },
+        "glosses": {
+          "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
+          "items": {
+            "type": "string"
+          },
+          "title": "Glosses",
+          "type": "array"
+        },
+        "subsenses": {
+          "default": [],
+          "description": "List of subsenses",
+          "items": {
+            "$ref": "#/$defs/Sense"
+          },
+          "title": "Subsenses",
+          "type": "array"
+        },
+        "tags": {
+          "default": [],
+          "description": "list of tags (usage labels) for the word sense. 
This has been cleaned, and should be straightforward text with no tagging.", + "items": { + "type": "string" + }, + "title": "Tags", + "type": "array" + } + }, + "required": [ + "glosses" + ], + "title": "Sense", + "type": "object" + } + }, + "$id": "https://kaikki.org/es.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", + "properties": { + "categories": { + "default": [], + "description": "list of non-disambiguated categories for the word", + "items": { + "type": "string" + }, + "title": "Categories", + "type": "array" + }, + "lang_code": { + "description": "Wiktionary language code", + "examples": [ + "es" + ], + "title": "Lang Code", + "type": "string" + }, + "lang_name": { + "description": "Localized language name of the word", + "examples": [ + "español" + ], + "title": "Lang Name", + "type": "string" + }, + "pos": { + "default": null, + "description": "Part of speech type", + "title": "Pos", + "type": "string" + }, + "senses": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Sense" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Senses" + }, + "word": { + "description": "word string", + "title": "Word", + "type": "string" + } + }, + "required": [ + "word", + "lang_code", + "lang_name" + ], + "title": "Spanish Wiktionary", + "type": "object" +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 545b444d..d596dbfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "importlib_resources; python_version < '3.10'", "levenshtein", "nltk", + "pydantic", "wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git", ] diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py new file mode 100644 index 00000000..a0927144 --- /dev/null +++ b/src/wiktextract/extractor/es/models.py @@ -0,0 +1,113 @@ +from typing import Optional +import json + +import logging + +from pydantic import BaseModel, Field, model_validator +from pydantic.json_schema import GenerateJsonSchema + +from wiktextract.wxr_context import WiktextractContext + + +class PydanticLogger: + wxr: Optional[WiktextractContext] = None + + @classmethod + def debug( + cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted" + ): + if cls.wxr: + cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid) + else: + logging.debug(msg) + + +class BaseModelWrap(BaseModel): + class Config: + extra = "ignore" + validate_assignment = True + + def update(self, data: dict): + update = self.dict(exclude_defaults=True, exclude_none=True) + update.update(data) + for k, v in ( + self.validate(update) + .dict(exclude_defaults=True, exclude_none=True) + .items() + ): + setattr(self, k, v) + return self + + +class LoggingExtraFieldsModel(BaseModelWrap): + @model_validator(mode="before") + def log_extra_fields(cls, values): + all_allowed_field_names = {key for key in cls.__fields__.keys()} + extra_fields = { + name: str(value) + for name, value in values.items() + if name not in all_allowed_field_names + } + if extra_fields: + class_full_name = cls.__name__ + PydanticLogger.debug( + msg=f"Pydantic - Got extra fields in {class_full_name}: {extra_fields}", + sortid="wiktextract/extractor/es/pydantic/extra_fields/33", + ) + return values + + +class Sense(LoggingExtraFieldsModel): + glosses: list[str] = Field( + description="list of gloss strings for 
the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging."
+    )
+    tags: list[str] = Field(
+        default=[],
+        description="list of tags (usage labels) for the word sense. This has been cleaned, and should be straightforward text with no tagging.",
+    )
+    categories: list[str] = Field(
+        default=[],
+        description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
+    )
+    # examples: list[SenseExample] = []
+    subsenses: list["Sense"] = Field(
+        default=[], description="List of subsenses"
+    )
+
+
+class WordEntry(LoggingExtraFieldsModel):
+    """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""
+
+    word: str = Field(description="word string")
+    pos: str = Field(default=None, description="Part of speech type")
+    lang_code: str = Field(
+        description="Wiktionary language code", examples=["es"]
+    )
+    lang_name: str = Field(
+        description="Localized language name of the word", examples=["español"]
+    )
+    senses: Optional[list[Sense]] = []
+    categories: list[str] = Field(
+        default=[],
+        description="list of non-disambiguated categories for the word",
+    )
+
+
+if __name__ == "__main__":
+
+    class JsonSchemaGenerator(GenerateJsonSchema):
+        def generate(self, schema, mode="validation"):
+            json_schema = super().generate(schema, mode=mode)
+            json_schema["title"] = "Spanish Wiktionary"
+            json_schema["$id"] = "https://kaikki.org/es.json"
+            json_schema["$schema"] = self.schema_dialect
+            return json_schema
+
+    with open("json_schema/es.json", "w") as f:
+        json.dump(
+            WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
+            f,
+            indent=2,
+            ensure_ascii=False,
+            sort_keys=True,
+        )
diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
new file mode 100644
index 00000000..f97808ab
--- /dev/null
+++ b/src/wiktextract/extractor/es/page.py
@@ -0,0 +1,73 @@
+import copy
+import logging
+from collections import defaultdict
+from typing import Dict, List
+
+from wikitextprocessor import NodeKind, WikiNode
+from wiktextract.extractor.es.models import WordEntry, PydanticLogger
+
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+# Templates that are used to form panels on pages and that
+# should be ignored in various positions
+PANEL_TEMPLATES = set()
+
+# Template name prefixes used for language-specific panel templates (i.e.,
+# templates that create side boxes or notice boxes or that should generally
+# be ignored).
+PANEL_PREFIXES = set()
+
+# Additional templates to be expanded in the pre-expand phase
+ADDITIONAL_EXPAND_TEMPLATES = set()
+
+
+def parse_section(
+    wxr: WiktextractContext,
+    page_data: List[Dict],
+    base_data: Dict,
+    level_node: WikiNode,
+) -> None:
+    pass
+
+
+def parse_page(
+    wxr: WiktextractContext, page_title: str, page_text: str
+) -> List[Dict[str, str]]:
+    if wxr.config.verbose:
+        logging.info(f"Parsing page: {page_title}")
+    # Pass the current WiktextractContext to pydantic for better logging
+    PydanticLogger.wxr = wxr
+
+    wxr.config.word = page_title
+    wxr.wtp.start_page(page_title)
+
+    # Parse the page, pre-expanding those templates that are likely to
+    # influence parsing
+    tree = wxr.wtp.parse(
+        page_text,
+        pre_expand=True,
+        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
+    )
+
+    page_data: List[WordEntry] = []
+    for level2_node in tree.find_child(NodeKind.LEVEL2):
+        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
+            # https://es.wiktionary.org/wiki/Plantilla:lengua
+            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
+            if subtitle_template.template_name == "lengua":
+                categories_and_links = defaultdict(list)
+                lang_code = subtitle_template.template_parameters.get(1)
+                lang_name = clean_node(
+                    wxr, categories_and_links, subtitle_template
+                )
+                wxr.wtp.start_section(lang_name)
+                base_data = WordEntry(
+                    lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
+                )
+                base_data.update(categories_and_links)
+                page_data.append(copy.deepcopy(base_data))
+                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
+                    parse_section(wxr, page_data, base_data, level3_node)
+
+    return [d.model_dump(exclude_defaults=True) for d in page_data]

From 5033f00d193911ffc6747c67a47624fcf19686d6 Mon Sep 17 00:00:00 2001
From: Empiriker
Date: Thu, 23 Nov 2023 13:48:06 +0100
Subject: [PATCH 2/7] Extract POS for Spanish Wiktionary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work is a contribution to the EWOK project, which receives funding
from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of
the "Investissements d'Avenir" program initiated and overseen by the
Agence Nationale de la Recherche (ANR) in France.
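
A note for reviewers: POS dispatch is driven by the new
src/wiktextract/data/es/pos_subtitles.json table, which maps a Spanish POS
section template name to a normalized POS tag. A minimal sketch of the
intended lookup (illustrative only; in wiktextract the table is loaded into
wxr.config.POS_SUBTITLES at startup rather than opened directly):

    import json
    from typing import Optional

    # Load the table added by this patch (path as in the diff below).
    with open("src/wiktextract/data/es/pos_subtitles.json", encoding="utf-8") as f:
        POS_SUBTITLES = json.load(f)

    def lookup_pos(section_template_name: str) -> Optional[str]:
        # Each value is an object such as {"pos": "noun"}.
        entry = POS_SUBTITLES.get(section_template_name)
        return entry["pos"] if entry is not None else None

    print(lookup_pos("sustantivo femenino"))  # noun
    print(lookup_pos("verbo transitivo"))     # verb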
--- src/wiktextract/data/es/other_subtitles.json | 5 ++ src/wiktextract/data/es/pos_subtitles.json | 90 +++++++++++++++++++ src/wiktextract/datautils.py | 3 +- src/wiktextract/extractor/es/models.py | 23 +++-- src/wiktextract/extractor/es/page.py | 78 ++++++++++++++++ src/wiktextract/extractor/es/pronunciation.py | 9 ++ 6 files changed, 199 insertions(+), 9 deletions(-) create mode 100644 src/wiktextract/data/es/other_subtitles.json create mode 100644 src/wiktextract/data/es/pos_subtitles.json create mode 100644 src/wiktextract/extractor/es/pronunciation.py diff --git a/src/wiktextract/data/es/other_subtitles.json b/src/wiktextract/data/es/other_subtitles.json new file mode 100644 index 00000000..e7c1ed26 --- /dev/null +++ b/src/wiktextract/data/es/other_subtitles.json @@ -0,0 +1,5 @@ +{ + "etymology": ["Etimología"], + "pronunciation": ["pronunciación"], + "ignored_sections": ["Véase también"] +} diff --git a/src/wiktextract/data/es/pos_subtitles.json b/src/wiktextract/data/es/pos_subtitles.json new file mode 100644 index 00000000..8da3193c --- /dev/null +++ b/src/wiktextract/data/es/pos_subtitles.json @@ -0,0 +1,90 @@ +{ + "abreviatura": { "pos": "abbrev" }, + "acrónimo": { "pos": "abbrev" }, + "adjetivo": { "pos": "adj" }, + "adjetivo cardinal": { "pos": "num" }, + "adjetivo demostrativo": { "pos": "adj" }, + "adjetivo indefinido": { "pos": "adj" }, + "adjetivo indeterminado": { "pos": "adj" }, + "adjetivo interrogativo": { "pos": "adj" }, + "adjetivo numeral": { "pos": "num" }, + "adjetivo ordinal": { "pos": "num" }, + "adjetivo posesivo": { "pos": "adj" }, + "adjetivo relativo": { "pos": "adj" }, + "adverbio": { "pos": "adv" }, + "adverbio comparativo": { "pos": "adv" }, + "adverbio de afirmación": { "pos": "adv" }, + "adverbio de cantidad": { "pos": "adv" }, + "adverbio de duda": { "pos": "adv" }, + "adverbio de lugar": { "pos": "adv" }, + "adverbio de modo": { "pos": "adv" }, + "adverbio de negación": { "pos": "adv" }, + "adverbio de orden": { "pos": "adv" }, + "adverbio de tiempo": { "pos": "adv" }, + "adverbio demostrativo": { "pos": "adv" }, + "adverbio interrogativo": { "pos": "adv" }, + "adverbio relativo": { "pos": "adv" }, + "afijo": { "pos": "affix" }, + "artículo": { "pos": "article" }, + "artículo determinado": { "pos": "article" }, + "artículo indeterminado": { "pos": "article" }, + "circunfijo": { "pos": "circumfix" }, + "conjunción": { "pos": "conj" }, + "conjunción adversativa": { "pos": "conj" }, + "conjunción ilativa": { "pos": "conj" }, + "dígrafo": { "pos": "character" }, + "expresión": { "pos": "phrase" }, + "forma verbal": { "pos": "verb" }, + "interjección": { "pos": "intj" }, + "letra": { "pos": "character" }, + "locución": { "pos": "phrase" }, + "locución adjetiva": { "pos": "phrase" }, + "locución adverbial": { "pos": "phrase" }, + "locución conjuntiva": { "pos": "phrase" }, + "locución interjectiva": { "pos": "phrase" }, + "locución prepositiva": { "pos": "phrase" }, + "locución pronominal": { "pos": "phrase" }, + "locución sustantiva": { "pos": "phrase" }, + "locución verbal": { "pos": "phrase" }, + "onomatopeya": { "pos": "noun" }, + "partícula": { "pos": "particle" }, + "postposición": { "pos": "postp" }, + "prefijo": { "pos": "prefix" }, + "preposición": { "pos": "prep" }, + "preposición de ablativo": { "pos": "prep" }, + "preposición de acusativo": { "pos": "prep" }, + "preposición de acusativo o ablativo": { "pos": "prep" }, + "preposición de genitivo": { "pos": "prep" }, + "pronombre": { "pos": "pron" }, + "pronombre demostrativo": { "pos": "pron" }, 
+ "pronombre indefinido": { "pos": "pron" }, + "pronombre interrogativo": { "pos": "pron" }, + "pronombre personal": { "pos": "pron" }, + "pronombre posesivo": { "pos": "det" }, + "pronombre relativo": { "pos": "pron" }, + "refrán": { "pos": "proverb" }, + "sigla": { "pos": "abbrev" }, + "sufijo": { "pos": "suffix" }, + "sufijo flexivo": { "pos": "suffix" }, + "sustantivo": { "pos": "noun" }, + "sustantivo ambiguo": { "pos": "noun" }, + "sustantivo animado": { "pos": "noun" }, + "sustantivo común": { "pos": "noun" }, + "sustantivo femenino": { "pos": "noun" }, + "sustantivo femenino y masculino": { "pos": "noun" }, + "sustantivo inanimado": { "pos": "noun" }, + "sustantivo masculino": { "pos": "noun" }, + "sustantivo neutro": { "pos": "noun" }, + "sustantivo neutro y masculino": { "pos": "noun" }, + "sustantivo propio": { "pos": "name" }, + "sustantivo propio/pruebas": { "pos": "name" }, + "símbolo": { "pos": "symbol" }, + "verbo": { "pos": "verb" }, + "verbo auxiliar": { "pos": "verb" }, + "verbo impersonal": { "pos": "verb" }, + "verbo intransitivo": { "pos": "verb" }, + "verbo modal": { "pos": "verb" }, + "verbo perfectivo": { "pos": "verb" }, + "verbo pronominal": { "pos": "verb" }, + "verbo transitivo": { "pos": "verb" } +} diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py index 25844eb2..ff196c2f 100644 --- a/src/wiktextract/datautils.py +++ b/src/wiktextract/datautils.py @@ -61,7 +61,8 @@ def data_extend(data: Dict, key: str, values: Iterable) -> None: data_append(data, key, x) -def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=() +def split_at_comma_semi( + text: str, separators=(",", ";", ",", "،"), extra=() ) -> List[str]: """Splits the text at commas and semicolons, unless they are inside parenthesis. 
``separators`` is default separators (setting it eliminates diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index a0927144..1250bedb 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -3,7 +3,7 @@ import logging -from pydantic import BaseModel, Field, model_validator +from pydantic import BaseModel, Field, model_validator, ValidationError from pydantic.json_schema import GenerateJsonSchema from wiktextract.wxr_context import WiktextractContext @@ -28,16 +28,22 @@ class Config: validate_assignment = True def update(self, data: dict): - update = self.dict(exclude_defaults=True, exclude_none=True) - update.update(data) - for k, v in ( - self.validate(update) - .dict(exclude_defaults=True, exclude_none=True) - .items() - ): + for k, v in data.items(): setattr(self, k, v) return self + def get(self, key: str, _=None): + return getattr(self, key) + + def __getitem__(self, item): + return getattr(self, item) + + def __setitem__(self, item, value): + try: + setattr(self, item, value) + except ValidationError: + pass + class LoggingExtraFieldsModel(BaseModelWrap): @model_validator(mode="before") @@ -80,6 +86,7 @@ class WordEntry(LoggingExtraFieldsModel): word: str = Field(description="word string") pos: str = Field(default=None, description="Part of speech type") + pos_title: str = Field(default=None, description="Original POS title") lang_code: str = Field( description="Wiktionary language code", examples=["es"] ) diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index f97808ab..538a94be 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -4,6 +4,8 @@ from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode +from wiktextract.datautils import append_base_data +from wiktextract.extractor.es.pronunciation import extract_pronunciation from wiktextract.extractor.es.models import WordEntry, PydanticLogger from wiktextract.page import clean_node @@ -28,6 +30,58 @@ def parse_section( base_data: Dict, level_node: WikiNode, ) -> None: + # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura + subtitle = clean_node(wxr, page_data[-1], level_node.largs) + wxr.wtp.start_subsection(subtitle) + + pos_template_name = None + for level_node_template in level_node.find_content(NodeKind.TEMPLATE): + pos_template_name = level_node_template.template_name + + if subtitle in wxr.config.OTHER_SUBTITLES["ignored_sections"]: + pass + + elif pos_template_name and pos_template_name in wxr.config.POS_SUBTITLES: + process_pos_block( + wxr, page_data, base_data, level_node, pos_template_name, subtitle + ) + else: + wxr.wtp.debug( + f"Unprocessed section: {subtitle}", + sortid="extractor/es/page/parse_section/48", + ) + + +def process_pos_block( + wxr: WiktextractContext, + page_data: List[Dict], + base_data: Dict, + pos_level_node: WikiNode, + pos_template_name: str, + pos_title: str, +): + pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"] + append_base_data(page_data, "pos", pos_type, base_data) + page_data[-1]["pos_title"] = pos_title + child_nodes = list(pos_level_node.filter_empty_str_child()) + + for child in child_nodes: + if ( + isinstance(child, WikiNode) + and child.kind == NodeKind.TEMPLATE + and ( + "inflect" in child.template_name + or "v.conj" in child.template_name + ) + ): + # XXX: Extract forms + pass + elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST: + # XXX: Extract data 
+ pass + else: + # XXX: Extract data + pass pass @@ -58,6 +112,12 @@ def parse_page( if subtitle_template.template_name == "lengua": categories_and_links = defaultdict(list) lang_code = subtitle_template.template_parameters.get(1) + if ( + wxr.config.capture_language_codes is not None + and lang_code not in wxr.config.capture_language_codes + ): + continue + lang_name = clean_node( wxr, categories_and_links, subtitle_template ) @@ -70,4 +130,22 @@ def parse_page( for level3_node in level2_node.find_child(NodeKind.LEVEL3): parse_section(wxr, page_data, base_data, level3_node) + for not_level3_node in level2_node.invert_find_child( + NodeKind.LEVEL3 + ): + if ( + isinstance(not_level3_node, WikiNode) + and not_level3_node.kind == NodeKind.TEMPLATE + and not_level3_node.template_name == "pron-graf" + ): + if wxr.config.capture_pronunciation: + extract_pronunciation( + wxr, page_data[-1], not_level3_node + ) + else: + wxr.wtp.debug( + f"Found unexpected child in level 2 'lengua' node: {not_level3_node}", + sortid="extractor/es/page/parse_page/80", + ) + return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/es/pronunciation.py b/src/wiktextract/extractor/es/pronunciation.py new file mode 100644 index 00000000..ce60d6ba --- /dev/null +++ b/src/wiktextract/extractor/es/pronunciation.py @@ -0,0 +1,9 @@ +from wiktextract.wxr_context import WiktextractContext +from typing import Dict, List +from wikitextprocessor import WikiNode + + +def extract_pronunciation( + wxr: WiktextractContext, page_data: List[Dict], template_node: WikiNode +) -> None: + pass From aed70aa7322fd5dc2f560bf6bbb9496507867fd8 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Fri, 24 Nov 2023 13:04:45 +0100 Subject: [PATCH 3/7] Extract glosses from Spanish Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. 
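
For context: Spanish Wiktionary writes senses as definition-list items of
the form ";1 {label templates}: gloss text". A sketch of the mapping this
patch implements, with input and output taken from the tests added below:

    # Input wikitext (one sense of "ayudar"), where {{plm|contribuir}}
    # expands to "Contribuir":
    #   ;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la
    #   [[realización]] de algo.
    # Expected extracted sense after template expansion and cleaning:
    expected = {
        "glosses": [
            "Contribuir esfuerzo o recursos para la realización de algo."
        ],
        "senseid": 1,
    }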
---
 src/wiktextract/extractor/es/gloss.py  | 58 +++++++++++++++++
 src/wiktextract/extractor/es/models.py |  3 +
 src/wiktextract/extractor/es/page.py   | 11 +++-
 tests/test_es_gloss.py                 | 88 ++++++++++++++++++++++++
 4 files changed, 157 insertions(+), 3 deletions(-)
 create mode 100644 src/wiktextract/extractor/es/gloss.py
 create mode 100644 tests/test_es_gloss.py

diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py
new file mode 100644
index 00000000..51562d41
--- /dev/null
+++ b/src/wiktextract/extractor/es/gloss.py
@@ -0,0 +1,58 @@
+import re
+from typing import List
+from wiktextract.extractor.es.models import Sense, WordEntry
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+from wikitextprocessor import WikiNode, NodeKind
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+
+def extract_gloss(
+    wxr: WiktextractContext,
+    page_data: List[WordEntry],
+    list_node: WikiNode,
+) -> None:
+    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+        gloss_data = Sense(glosses=[])
+
+        definition: WikiNodeChildrenList = []
+        other: WikiNodeChildrenList = []
+
+        for node in list_item.definition:
+            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                other.append(node)
+            else:
+                definition.append(node)
+
+        gloss = clean_node(wxr, gloss_data, definition)
+        gloss_data.glosses.append(gloss)
+
+        gloss_note = clean_node(wxr, gloss_data, list_item.children)
+
+        match = re.match(r"^(\d+)", gloss_note)
+
+        if match:
+            gloss_data["senseid"] = int(match.group(1))
+            tag_string = gloss_note[len(match.group(1)) :].strip()
+        else:
+            tag_string = gloss_data["tags"] = gloss_note.strip()
+
+        # split tags by comma or "y"
+        tags = re.split(r",|y", tag_string)
+        for tag in tags:
+            tag = (
+                tag.strip()
+                .removesuffix(".")
+                .removesuffix("Main")
+                .removeprefix("Main")
+            )
+            if tag:
+                gloss_data["tags"].append(tag)
+
+        if other:
+            wxr.wtp.debug(
+                f"Found nodes that are not part of definition: {other}",
+                sortid="extractor/es/gloss/extract_gloss/46",
+            )
+
+        page_data[-1].senses.append(gloss_data)
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index 1250bedb..4f695b30 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -79,6 +79,9 @@ class Sense(LoggingExtraFieldsModel):
     subsenses: list["Sense"] = Field(
         default=[], description="List of subsenses"
     )
+    senseid: Optional[int] = Field(
+        default=None, description="Sense number used in Wiktionary"
+    )
 
 
 class WordEntry(LoggingExtraFieldsModel):
diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
index 538a94be..3d764225 100644
--- a/src/wiktextract/extractor/es/page.py
+++ b/src/wiktextract/extractor/es/page.py
@@ -5,6 +5,7 @@
 
 from wikitextprocessor import NodeKind, WikiNode
 from wiktextract.datautils import append_base_data
+from wiktextract.extractor.es.gloss import extract_gloss
 from wiktextract.extractor.es.pronunciation import extract_pronunciation
 from wiktextract.extractor.es.models import WordEntry, PydanticLogger
 
 from wiktextract.page import clean_node
@@ -76,9 +77,13 @@ def process_pos_block(
         ):
             # XXX: Extract forms
             pass
-        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
-            # XXX: Extract data
-            pass
+        elif (
+            isinstance(child, WikiNode)
+            and child.kind == NodeKind.LIST
+            and child.sarg == ";"
+        ):
+            extract_gloss(wxr, page_data, child)
+
         else:
             # XXX: Extract data
             pass
diff --git a/tests/test_es_gloss.py
b/tests/test_es_gloss.py new file mode 100644 index 00000000..00a20915 --- /dev/null +++ b/tests/test_es_gloss.py @@ -0,0 +1,88 @@ +from typing import List +import unittest + +from wikitextprocessor import Wtp +from wiktextract.extractor.es.gloss import extract_gloss +from wiktextract.extractor.es.models import WordEntry + +from wiktextract.config import WiktionaryConfig +from wiktextract.wxr_context import WiktextractContext + + +class TestESGloss(unittest.TestCase): + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="es"), + WiktionaryConfig(dump_file_lang_code="es"), + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + + def get_default_page_data(self) -> List[WordEntry]: + return [WordEntry(word="test", lang_code="es", lang_name="Language")] + + def test_es_extract_glosses(self): + # https://es.wiktionary.org/wiki/ayudar + + self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir") + self.wxr.wtp.start_page("") + + root = self.wxr.wtp.parse( + """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo. +;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]""" + ) + + page_data = self.get_default_page_data() + + extract_gloss(self.wxr, page_data, root.children[0]) + + self.assertEqual( + page_data[0].model_dump(exclude_defaults=True)["senses"], + [ + { + "glosses": [ + "Contribuir esfuerzo o recursos para la realización de algo." + ], + "senseid": 1, + }, + { + "glosses": [ + "Por antonomasia, cooperar a que alguno salga de una situación dificultosa" + ], + "senseid": 2, + }, + ], + ) + + def test_es_extract_gloss_categories(self): + # https://es.wiktionary.org/wiki/amor + self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento") + self.wxr.wtp.add_page( + "Plantilla:sentimientos", + 10, + "Humanidades. [[Categoría:ES:Sentimientos]]", + ) + self.wxr.wtp.start_page("") + + root = self.wxr.wtp.parse( + ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa" + ) + + page_data = self.get_default_page_data() + + extract_gloss(self.wxr, page_data, root.children[0]) + + self.assertEqual( + page_data[0].model_dump(exclude_defaults=True)["senses"], + [ + { + "glosses": [ + "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa" + ], + "senseid": 1, + "tags": ["Humanidades"], + "categories": ["ES:Sentimientos"], + } + ], + ) From b3dbc296c9c40098527db1aa47a6ce37417c1f38 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Tue, 28 Nov 2023 11:29:00 +0100 Subject: [PATCH 4/7] Update deprecated fields in pydantic models of Spanish Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. 
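
The renames involved are the standard pydantic v1 -> v2 migration; a compact
reference (general pydantic 2.x usage, not specific to this codebase):

    from pydantic import BaseModel, ConfigDict

    class Example(BaseModel):
        # v1: class Config: validate_assignment = True
        model_config = ConfigDict(validate_assignment=True)

        name: str = ""

    Example.model_fields                   # v1: Example.__fields__
    Example(name="x").model_dump()         # v1: Example(name="x").dict()
    Example.model_validate({"name": "x"})  # v1: Example.validate(...)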
--- src/wiktextract/extractor/es/models.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index 4f695b30..4f03268c 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -1,9 +1,14 @@ -from typing import Optional import json - import logging +from typing import Optional -from pydantic import BaseModel, Field, model_validator, ValidationError +from pydantic import ( + BaseModel, + ConfigDict, + Field, + ValidationError, + model_validator, +) from pydantic.json_schema import GenerateJsonSchema from wiktextract.wxr_context import WiktextractContext @@ -23,9 +28,7 @@ def debug( class BaseModelWrap(BaseModel): - class Config: - extra = "ignore" - validate_assignment = True + model_config = ConfigDict(validate_assignment=True) def update(self, data: dict): for k, v in data.items(): @@ -48,7 +51,7 @@ def __setitem__(self, item, value): class LoggingExtraFieldsModel(BaseModelWrap): @model_validator(mode="before") def log_extra_fields(cls, values): - all_allowed_field_names = {key for key in cls.__fields__.keys()} + all_allowed_field_names = cls.model_fields.keys() extra_fields = { name: str(value) for name, value in values.items() From 2d10d328f5ac607383913a53c56cdd2847de65d0 Mon Sep 17 00:00:00 2001 From: Empiriker Date: Tue, 28 Nov 2023 12:12:20 +0100 Subject: [PATCH 5/7] Avoid dict-like assignment to pydantic classes in Spanish Wiktionary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. 
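
Since BaseModelWrap sets validate_assignment=True, plain attribute access is
validated by pydantic anyway, so the dict-style indexing adds nothing. A
sketch of the replacement pattern, using the Sense model from this series:

    sense = Sense(glosses=[])
    sense.senseid = 1                 # was: gloss_data["senseid"] = 1
    sense.tags.append("Humanidades")  # was: gloss_data["tags"].append(tag)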
--- src/wiktextract/extractor/es/gloss.py | 12 +++++++----- src/wiktextract/extractor/es/page.py | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py index 51562d41..44f209e7 100644 --- a/src/wiktextract/extractor/es/gloss.py +++ b/src/wiktextract/extractor/es/gloss.py @@ -1,10 +1,12 @@ import re from typing import List + +from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import WikiNodeChildrenList + from wiktextract.extractor.es.models import Sense, WordEntry from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext -from wikitextprocessor import WikiNode, NodeKind -from wikitextprocessor.parser import WikiNodeChildrenList def extract_gloss( @@ -34,10 +36,10 @@ def extract_gloss( match = re.match(r"^(\d+)", gloss_note) if match: - gloss_data["senseid"] = int(match.group(1)) + gloss_data.senseid = int(match.group(1)) tag_string = gloss_note[len(match.group(1)) :].strip() else: - tag_string = gloss_data["tags"] = gloss_note.strip() + tag_string = gloss_data.tags = gloss_note.strip() # split tags by comma or "y" tags = re.split(r",|y", tag_string) @@ -49,7 +51,7 @@ def extract_gloss( .removeprefix("Main") ) if tag: - gloss_data["tags"].append(tag) + gloss_data.tags.append(tag) if other: wxr.wtp.debug( diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index 3d764225..688d70f3 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -4,11 +4,11 @@ from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode + from wiktextract.datautils import append_base_data from wiktextract.extractor.es.gloss import extract_gloss +from wiktextract.extractor.es.models import PydanticLogger, WordEntry from wiktextract.extractor.es.pronunciation import extract_pronunciation -from wiktextract.extractor.es.models import WordEntry, PydanticLogger - from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext @@ -63,7 +63,7 @@ def process_pos_block( ): pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"] append_base_data(page_data, "pos", pos_type, base_data) - page_data[-1]["pos_title"] = pos_title + page_data[-1].pos_title = pos_title child_nodes = list(pos_level_node.filter_empty_str_child()) for child in child_nodes: From cbc62feb1121a8b5a6ed76a8faf0849ac9de900b Mon Sep 17 00:00:00 2001 From: Empiriker Date: Wed, 29 Nov 2023 09:40:13 +0100 Subject: [PATCH 6/7] Remove boilerplate from pydantic models and avoid append_base_data() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France. 
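
The deep copy matters because WordEntry holds mutable list fields (senses,
categories): a shared or shallow-copied base_data would let every POS entry
mutate the same lists. A sketch of the invariant the new code relies on
(example values are hypothetical):

    import copy

    base = WordEntry(word="casa", lang_code="es", lang_name="español")
    entry = copy.deepcopy(base)  # independent copy per POS section
    entry.senses.append(Sense(glosses=["Edificio para habitar."]))
    assert base.senses == []     # the language-level base_data stays pristine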
--- src/wiktextract/extractor/es/models.py | 32 +++++++++++++------------- src/wiktextract/extractor/es/page.py | 17 ++++++-------- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py index 4f03268c..331fce60 100644 --- a/src/wiktextract/extractor/es/models.py +++ b/src/wiktextract/extractor/es/models.py @@ -30,22 +30,22 @@ def debug( class BaseModelWrap(BaseModel): model_config = ConfigDict(validate_assignment=True) - def update(self, data: dict): - for k, v in data.items(): - setattr(self, k, v) - return self - - def get(self, key: str, _=None): - return getattr(self, key) - - def __getitem__(self, item): - return getattr(self, item) - - def __setitem__(self, item, value): - try: - setattr(self, item, value) - except ValidationError: - pass + # def update(self, data: dict): + # for k, v in data.items(): + # setattr(self, k, v) + # return self + + # def get(self, key: str, _=None): + # return getattr(self, key) + + # def __getitem__(self, item): + # return getattr(self, item) + + # def __setitem__(self, item, value): + # try: + # setattr(self, item, value) + # except ValidationError: + # pass class LoggingExtraFieldsModel(BaseModelWrap): diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py index 688d70f3..b612671b 100644 --- a/src/wiktextract/extractor/es/page.py +++ b/src/wiktextract/extractor/es/page.py @@ -1,11 +1,9 @@ import copy import logging -from collections import defaultdict from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode -from wiktextract.datautils import append_base_data from wiktextract.extractor.es.gloss import extract_gloss from wiktextract.extractor.es.models import PydanticLogger, WordEntry from wiktextract.extractor.es.pronunciation import extract_pronunciation @@ -32,7 +30,7 @@ def parse_section( level_node: WikiNode, ) -> None: # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura - subtitle = clean_node(wxr, page_data[-1], level_node.largs) + subtitle = clean_node(wxr, base_data, level_node.largs) wxr.wtp.start_subsection(subtitle) pos_template_name = None @@ -62,7 +60,8 @@ def process_pos_block( pos_title: str, ): pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"] - append_base_data(page_data, "pos", pos_type, base_data) + page_data.append(copy.deepcopy(base_data)) + page_data[-1].pos = pos_type page_data[-1].pos_title = pos_title child_nodes = list(pos_level_node.filter_empty_str_child()) @@ -115,7 +114,7 @@ def parse_page( # https://es.wiktionary.org/wiki/Plantilla:lengua # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma if subtitle_template.template_name == "lengua": - categories_and_links = defaultdict(list) + categories = {"categories": []} lang_code = subtitle_template.template_parameters.get(1) if ( wxr.config.capture_language_codes is not None @@ -123,15 +122,13 @@ def parse_page( ): continue - lang_name = clean_node( - wxr, categories_and_links, subtitle_template - ) + lang_name = clean_node(wxr, categories, subtitle_template) wxr.wtp.start_section(lang_name) base_data = WordEntry( lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title ) - base_data.update(categories_and_links) - page_data.append(copy.deepcopy(base_data)) + base_data.categories.extend(categories["categories"]) + # page_data.append(copy.deepcopy(base_data)) for level3_node in level2_node.find_child(NodeKind.LEVEL3): parse_section(wxr, page_data, base_data, level3_node) From 
88a79b247ac5ca8a8a481bc1890f7fd0a2e93d33 Mon Sep 17 00:00:00 2001
From: Empiriker
Date: Fri, 1 Dec 2023 08:56:23 +0100
Subject: [PATCH 7/7] Update typings and es.json for Spanish Wiktionary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This work is a contribution to the EWOK project, which receives funding
from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of
the "Investissements d'Avenir" program initiated and overseen by the
Agence Nationale de la Recherche (ANR) in France.
---
 json_schema/es.json                    | 19 +++++++++++++++++++
 src/wiktextract/extractor/es/models.py | 25 +------------------------
 src/wiktextract/extractor/es/page.py   | 14 +++++++-------
 3 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/json_schema/es.json b/json_schema/es.json
index 5e44e361..748bdb6a 100644
--- a/json_schema/es.json
+++ b/json_schema/es.json
@@ -19,6 +19,19 @@
           "title": "Glosses",
           "type": "array"
         },
+        "senseid": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Sense number used in Wiktionary",
+          "title": "Senseid"
+        },
         "subsenses": {
           "default": [],
           "description": "List of subsenses",
@@ -80,6 +93,12 @@
       "title": "Pos",
       "type": "string"
     },
+    "pos_title": {
+      "default": null,
+      "description": "Original POS title",
+      "title": "Pos Title",
+      "type": "string"
+    },
     "senses": {
       "anyOf": [
         {
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index 331fce60..7c8911dc 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -2,13 +2,7 @@
 import logging
 from typing import Optional
 
-from pydantic import (
-    BaseModel,
-    ConfigDict,
-    Field,
-    ValidationError,
-    model_validator,
-)
+from pydantic import BaseModel, ConfigDict, Field, model_validator
 from pydantic.json_schema import GenerateJsonSchema
 
 from wiktextract.wxr_context import WiktextractContext
@@ -30,23 +24,6 @@ def debug(
 class BaseModelWrap(BaseModel):
     model_config = ConfigDict(validate_assignment=True)
 
-    # def update(self, data: dict):
-    #     for k, v in data.items():
-    #         setattr(self, k, v)
-    #     return self
-
-    # def get(self, key: str, _=None):
-    #     return getattr(self, key)
-
-    # def __getitem__(self, item):
-    #     return getattr(self, item)
-
-    # def __setitem__(self, item, value):
-    #     try:
-    #         setattr(self, item, value)
-    #     except ValidationError:
-    #         pass
-
 
 class LoggingExtraFieldsModel(BaseModelWrap):
     @model_validator(mode="before")
diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
index b612671b..1ac75dde 100644
--- a/src/wiktextract/extractor/es/page.py
+++ b/src/wiktextract/extractor/es/page.py
@@ -1,6 +1,6 @@
 import copy
 import logging
-from typing import Dict, List
+from typing import Any, Dict, List
 
 from wikitextprocessor import NodeKind, WikiNode
 
@@ -25,8 +25,8 @@
 
 def parse_section(
     wxr: WiktextractContext,
-    page_data: List[Dict],
-    base_data: Dict,
+    page_data: List[WordEntry],
+    base_data: WordEntry,
     level_node: WikiNode,
 ) -> None:
     # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura
@@ -53,13 +53,14 @@
 
 def process_pos_block(
     wxr: WiktextractContext,
-    page_data: List[Dict],
-    base_data: Dict,
+    page_data: List[WordEntry],
+    base_data: WordEntry,
     pos_level_node: WikiNode,
     pos_template_name: str,
     pos_title: str,
 ):
     pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"]
+
     page_data.append(copy.deepcopy(base_data))
     page_data[-1].pos = pos_type
     page_data[-1].pos_title = pos_title
@@ -92,7 +92,7 @@
 
 def parse_page(
     wxr: WiktextractContext, page_title: str, page_text: str
-) -> List[Dict[str, str]]:
+) -> List[Dict[str, Any]]:
     if wxr.config.verbose:
         logging.info(f"Parsing page: {page_title}")
     # Pass the current WiktextractContext to pydantic for better logging
@@ -128,7 +129,6 @@ def parse_page(
                     lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
                 )
                 base_data.categories.extend(categories["categories"])
-                # page_data.append(copy.deepcopy(base_data))
 
                 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                     parse_section(wxr, page_data, base_data, level3_node)
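
Taken together, the series leaves a small pipeline: parse_page() builds one
WordEntry per language and POS section and returns plain dicts via
model_dump(exclude_defaults=True). A usage sketch (assuming the package is
installed and commands run from the repository root, since models.py writes
the relative path json_schema/es.json):

    # Regenerate the published JSON schema after editing the models:
    #   python -m wiktextract.extractor.es.models

    entries = parse_page(wxr, "ayudar", page_text)
    # Each element is a plain dict, for example:
    # {"word": "ayudar", "lang_code": "es", "lang_name": "español",
    #  "pos": "verb", "pos_title": "Verbo transitivo", "senses": [...]}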