diff --git a/json_schema/es.json b/json_schema/es.json new file mode 100644 index 00000000..748bdb6a --- /dev/null +++ b/json_schema/es.json @@ -0,0 +1,130 @@ +{ + "$defs": { + "Sense": { + "properties": { + "categories": { + "default": [], + "description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", + "items": { + "type": "string" + }, + "title": "Categories", + "type": "array" + }, + "glosses": { + "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", + "items": { + "type": "string" + }, + "title": "Glosses", + "type": "array" + }, + "senseid": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Sense number used in Wiktionary", + "title": "Senseid" + }, + "subsenses": { + "default": [], + "description": "List of subsenses", + "items": { + "$ref": "#/$defs/Sense" + }, + "title": "Subsenses", + "type": "array" + }, + "tags": { + "default": [], + "description": "list of gloss strings for the word sense (usually only one). 
This has been cleaned, and should be straightforward text with no tagging.", + "items": { + "type": "string" + }, + "title": "Tags", + "type": "array" + } + }, + "required": [ + "glosses" + ], + "title": "Sense", + "type": "object" + } + }, + "$id": "https://kaikki.org/es.json", + "$schema": "https://json-schema.org/draft/2020-12/schema", + "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", + "properties": { + "categories": { + "default": [], + "description": "list of non-disambiguated categories for the word", + "items": { + "type": "string" + }, + "title": "Categories", + "type": "array" + }, + "lang_code": { + "description": "Wiktionary language code", + "examples": [ + "es" + ], + "title": "Lang Code", + "type": "string" + }, + "lang_name": { + "description": "Localized language name of the word", + "examples": [ + "español" + ], + "title": "Lang Name", + "type": "string" + }, + "pos": { + "default": null, + "description": "Part of speech type", + "title": "Pos", + "type": "string" + }, + "pos_title": { + "default": null, + "description": "Original POS title", + "title": "Pos Title", + "type": "string" + }, + "senses": { + "anyOf": [ + { + "items": { + "$ref": "#/$defs/Sense" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": [], + "title": "Senses" + }, + "word": { + "description": "word string", + "title": "Word", + "type": "string" + } + }, + "required": [ + "word", + "lang_code", + "lang_name" + ], + "title": "Spanish Wiktionary", + "type": "object" +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 545b444d..d596dbfc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "importlib_resources; python_version < '3.10'", "levenshtein", "nltk", + "pydantic", "wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git", ] diff --git a/src/wiktextract/data/es/other_subtitles.json 
b/src/wiktextract/data/es/other_subtitles.json new file mode 100644 index 00000000..e7c1ed26 --- /dev/null +++ b/src/wiktextract/data/es/other_subtitles.json @@ -0,0 +1,5 @@ +{ + "etymology": ["Etimología"], + "pronunciation": ["pronunciación"], + "ignored_sections": ["Véase también"] +} diff --git a/src/wiktextract/data/es/pos_subtitles.json b/src/wiktextract/data/es/pos_subtitles.json new file mode 100644 index 00000000..8da3193c --- /dev/null +++ b/src/wiktextract/data/es/pos_subtitles.json @@ -0,0 +1,90 @@ +{ + "abreviatura": { "pos": "abbrev" }, + "acrónimo": { "pos": "abbrev" }, + "adjetivo": { "pos": "adj" }, + "adjetivo cardinal": { "pos": "num" }, + "adjetivo demostrativo": { "pos": "adj" }, + "adjetivo indefinido": { "pos": "adj" }, + "adjetivo indeterminado": { "pos": "adj" }, + "adjetivo interrogativo": { "pos": "adj" }, + "adjetivo numeral": { "pos": "num" }, + "adjetivo ordinal": { "pos": "num" }, + "adjetivo posesivo": { "pos": "adj" }, + "adjetivo relativo": { "pos": "adj" }, + "adverbio": { "pos": "adv" }, + "adverbio comparativo": { "pos": "adv" }, + "adverbio de afirmación": { "pos": "adv" }, + "adverbio de cantidad": { "pos": "adv" }, + "adverbio de duda": { "pos": "adv" }, + "adverbio de lugar": { "pos": "adv" }, + "adverbio de modo": { "pos": "adv" }, + "adverbio de negación": { "pos": "adv" }, + "adverbio de orden": { "pos": "adv" }, + "adverbio de tiempo": { "pos": "adv" }, + "adverbio demostrativo": { "pos": "adv" }, + "adverbio interrogativo": { "pos": "adv" }, + "adverbio relativo": { "pos": "adv" }, + "afijo": { "pos": "affix" }, + "artículo": { "pos": "article" }, + "artículo determinado": { "pos": "article" }, + "artículo indeterminado": { "pos": "article" }, + "circunfijo": { "pos": "circumfix" }, + "conjunción": { "pos": "conj" }, + "conjunción adversativa": { "pos": "conj" }, + "conjunción ilativa": { "pos": "conj" }, + "dígrafo": { "pos": "character" }, + "expresión": { "pos": "phrase" }, + "forma verbal": { "pos": "verb" }, + 
"interjección": { "pos": "intj" }, + "letra": { "pos": "character" }, + "locución": { "pos": "phrase" }, + "locución adjetiva": { "pos": "phrase" }, + "locución adverbial": { "pos": "phrase" }, + "locución conjuntiva": { "pos": "phrase" }, + "locución interjectiva": { "pos": "phrase" }, + "locución prepositiva": { "pos": "phrase" }, + "locución pronominal": { "pos": "phrase" }, + "locución sustantiva": { "pos": "phrase" }, + "locución verbal": { "pos": "phrase" }, + "onomatopeya": { "pos": "noun" }, + "partícula": { "pos": "particle" }, + "postposición": { "pos": "postp" }, + "prefijo": { "pos": "prefix" }, + "preposición": { "pos": "prep" }, + "preposición de ablativo": { "pos": "prep" }, + "preposición de acusativo": { "pos": "prep" }, + "preposición de acusativo o ablativo": { "pos": "prep" }, + "preposición de genitivo": { "pos": "prep" }, + "pronombre": { "pos": "pron" }, + "pronombre demostrativo": { "pos": "pron" }, + "pronombre indefinido": { "pos": "pron" }, + "pronombre interrogativo": { "pos": "pron" }, + "pronombre personal": { "pos": "pron" }, + "pronombre posesivo": { "pos": "det" }, + "pronombre relativo": { "pos": "pron" }, + "refrán": { "pos": "proverb" }, + "sigla": { "pos": "abbrev" }, + "sufijo": { "pos": "suffix" }, + "sufijo flexivo": { "pos": "suffix" }, + "sustantivo": { "pos": "noun" }, + "sustantivo ambiguo": { "pos": "noun" }, + "sustantivo animado": { "pos": "noun" }, + "sustantivo común": { "pos": "noun" }, + "sustantivo femenino": { "pos": "noun" }, + "sustantivo femenino y masculino": { "pos": "noun" }, + "sustantivo inanimado": { "pos": "noun" }, + "sustantivo masculino": { "pos": "noun" }, + "sustantivo neutro": { "pos": "noun" }, + "sustantivo neutro y masculino": { "pos": "noun" }, + "sustantivo propio": { "pos": "name" }, + "sustantivo propio/pruebas": { "pos": "name" }, + "símbolo": { "pos": "symbol" }, + "verbo": { "pos": "verb" }, + "verbo auxiliar": { "pos": "verb" }, + "verbo impersonal": { "pos": "verb" }, + "verbo 
def _split_tags(tag_string: str) -> List[str]:
    """Split the note that precedes a gloss into individual tag strings.

    The note is split at commas and at the standalone conjunction "y".
    Every piece is stripped of surrounding whitespace, of one trailing "."
    and of a leading/trailing "Main"; pieces that end up empty are dropped.
    """
    tags: List[str] = []
    # ``\by\b`` only matches "y" when it is a word of its own.  A bare "y"
    # alternative would also cut through any tag that merely contains the
    # letter (e.g. "Mayúsculas" -> "Ma", "úsculas").
    for raw_tag in re.split(r",|\by\b", tag_string):
        # NOTE(review): "Main" is presumably residue of a template
        # expansion -- confirm against real pages.
        tag = (
            raw_tag.strip()
            .removesuffix(".")
            .removesuffix("Main")
            .removeprefix("Main")
        )
        if tag:
            tags.append(tag)
    return tags


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    """Turn every item of a ``;`` definition list into a ``Sense``.

    For each list item the definition part becomes the gloss text, while
    the item's children (the part in front of the definition) are parsed
    for a leading sense number and for tags.  The finished sense is
    appended to ``page_data[-1].senses``, so ``page_data`` must already
    contain the entry the senses belong to.

    Nested lists found inside the definition are not extracted yet; they
    are only reported through ``wxr.wtp.debug``.
    """
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # Guard against list items that carry no definition part
        # (assumes ``definition`` is then None/empty -- TODO confirm).
        for node in list_item.definition or []:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        # A leading number is the sense id; whatever follows it (or the
        # whole note when there is no number) is the tag text.
        match = re.match(r"^(\d+)", gloss_note)
        if match:
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[match.end() :].strip()
        else:
            # Do not assign the raw string to ``gloss_data.tags``: the
            # field is ``list[str]`` and the model validates assignments.
            tag_string = gloss_note.strip()

        gloss_data.tags.extend(_split_tags(tag_string))

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
class WordEntry(LoggingExtraFieldsModel):
    """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""

    word: str = Field(description="word string")
    # ``pos`` and ``pos_title`` default to None and are filled in later by
    # assignment.  They are declared Optional so that the annotation (and
    # the generated JSON schema) agrees with the default instead of
    # advertising a plain string whose default is null.
    pos: Optional[str] = Field(default=None, description="Part of speech type")
    pos_title: Optional[str] = Field(
        default=None, description="Original POS title"
    )
    lang_code: str = Field(
        description="Wiktionary language code", examples=["es"]
    )
    lang_name: str = Field(
        description="Localized language name of the word", examples=["español"]
    )
    senses: Optional[list[Sense]] = []
    categories: list[str] = Field(
        default=[],
        description="list of non-disambiguated categories for the word",
    )


if __name__ == "__main__":
    # Running this module as a script regenerates json_schema/es.json from
    # the WordEntry model.

    class JsonSchemaGenerator(GenerateJsonSchema):
        """Schema generator that adds the title, ``$id`` and ``$schema``."""

        def generate(self, schema, mode="validation"):
            json_schema = super().generate(schema, mode=mode)
            json_schema["title"] = "Spanish Wiktionary"
            json_schema["$id"] = "https://kaikki.org/es.json"
            json_schema["$schema"] = self.schema_dialect
            return json_schema

    # ``ensure_ascii=False`` writes non-ASCII characters (e.g. "español")
    # verbatim, so the encoding is pinned to UTF-8 instead of depending on
    # the platform's default locale encoding.
    with open("json_schema/es.json", "w", encoding="utf-8") as f:
        json.dump(
            WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
            f,
            indent=2,
            ensure_ascii=False,
            sort_keys=True,
        )
def parse_section(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Dispatch one section below a language heading.

    The cleaned heading text is registered as the current subsection.
    Sections listed under ``ignored_sections`` are skipped; sections whose
    heading template is a known part-of-speech template are handed to
    ``process_pos_block``; everything else is reported as unprocessed.

    Page structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura
    """
    section_title = clean_node(wxr, base_data, level_node.largs)
    wxr.wtp.start_subsection(section_title)

    # When the heading holds several templates, the last one is used.
    template_name = None
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        template_name = heading_template.template_name

    if section_title in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
        return

    if template_name and template_name in wxr.config.POS_SUBTITLES:
        process_pos_block(
            wxr,
            page_data,
            base_data,
            level_node,
            template_name,
            section_title,
        )
        return

    wxr.wtp.debug(
        f"Unprocessed section: {section_title}",
        sortid="extractor/es/page/parse_section/48",
    )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> List[Dict[str, any]]:
    """Parse one Spanish Wiktionary page into a list of word-entry dicts.

    Every level-2 section whose heading contains a ``lengua`` template is
    treated as a language section: its level-3 children are passed to
    ``parse_section`` and a ``pron-graf`` template found directly below
    the level-2 heading is passed to ``extract_pronunciation``.  Language
    sections whose code is not in ``wxr.config.capture_language_codes``
    (when that is set) are skipped.

    Returns the collected entries dumped with ``exclude_defaults=True``.
    """
    # NOTE(review): the lowercase ``any`` in the return annotation is the
    # builtin function, not ``typing.Any``.  It is left untouched here
    # because this module only imports ``Dict`` and ``List`` from typing.
    if wxr.config.verbose:
        logging.info(f"Parsing page: {page_title}")
    # Pass the current WiktextractContext to pydantic for better logging
    PydanticLogger.wxr = wxr

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        page_text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
    )

    page_data: List[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name != "lengua":
                continue

            categories = {"categories": []}
            lang_code = subtitle_template.template_parameters.get(1)
            if (
                wxr.config.capture_language_codes is not None
                and lang_code not in wxr.config.capture_language_codes
            ):
                continue

            lang_name = clean_node(wxr, categories, subtitle_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
            )
            base_data.categories.extend(categories["categories"])

            # Remember where this language's entries start, so that the
            # pronunciation is never attached to an entry that belongs to
            # a previous language section (or to no entry at all).
            entries_before = len(page_data)
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, base_data, level3_node)
            has_new_entries = len(page_data) > entries_before

            for not_level3_node in level2_node.invert_find_child(
                NodeKind.LEVEL3
            ):
                if (
                    isinstance(not_level3_node, WikiNode)
                    and not_level3_node.kind == NodeKind.TEMPLATE
                    and not_level3_node.template_name == "pron-graf"
                ):
                    if not wxr.config.capture_pronunciation:
                        continue
                    if has_new_entries:
                        extract_pronunciation(
                            wxr, page_data[-1], not_level3_node
                        )
                    else:
                        # Previously this indexed ``page_data[-1]``
                        # unconditionally, which raised IndexError on an
                        # empty list.
                        wxr.wtp.debug(
                            "Found 'pron-graf' template but no word entry "
                            "to attach it to",
                            sortid="extractor/es/page/parse_page/pron_graf_without_entry",
                        )
                else:
                    wxr.wtp.debug(
                        f"Found unexpected child in level 2 'lengua' node: {not_level3_node}",
                        sortid="extractor/es/page/parse_page/80",
                    )

    return [d.model_dump(exclude_defaults=True) for d in page_data]
return [WordEntry(word="test", lang_code="es", lang_name="Language")] + + def test_es_extract_glosses(self): + # https://es.wiktionary.org/wiki/ayudar + + self.wxr.wtp.add_page("Plantilla:plm", 10, "Contribuir") + self.wxr.wtp.start_page("") + + root = self.wxr.wtp.parse( + """;1: {{plm|contribuir}} [[esfuerzo]] o [[recurso]]s para la [[realización]] de algo. +;2: Por antonomasia, [[cooperar]] a que alguno [[salir|salga]] de una [[situación]] [[dificultoso|dificultosa]]""" + ) + + page_data = self.get_default_page_data() + + extract_gloss(self.wxr, page_data, root.children[0]) + + self.assertEqual( + page_data[0].model_dump(exclude_defaults=True)["senses"], + [ + { + "glosses": [ + "Contribuir esfuerzo o recursos para la realización de algo." + ], + "senseid": 1, + }, + { + "glosses": [ + "Por antonomasia, cooperar a que alguno salga de una situación dificultosa" + ], + "senseid": 2, + }, + ], + ) + + def test_es_extract_gloss_categories(self): + # https://es.wiktionary.org/wiki/amor + self.wxr.wtp.add_page("Plantilla:plm", 10, "Sentimiento") + self.wxr.wtp.add_page( + "Plantilla:sentimientos", + 10, + "Humanidades. [[Categoría:ES:Sentimientos]]", + ) + self.wxr.wtp.start_page("") + + root = self.wxr.wtp.parse( + ";1 {{sentimientos}}: {{plm|sentimiento}} [[afectivo]] de [[atracción]], [[unión]] y [[afinidad]] que se experimenta hacia una persona, animal o cosa" + ) + + page_data = self.get_default_page_data() + + extract_gloss(self.wxr, page_data, root.children[0]) + + self.assertEqual( + page_data[0].model_dump(exclude_defaults=True)["senses"], + [ + { + "glosses": [ + "Sentimiento afectivo de atracción, unión y afinidad que se experimenta hacia una persona, animal o cosa" + ], + "senseid": 1, + "tags": ["Humanidades"], + "categories": ["ES:Sentimientos"], + } + ], + )