diff --git a/src/wiktextract/data/es/linkage_subtitles.json b/src/wiktextract/data/es/linkage_subtitles.json
new file mode 100644
index 00000000..b00b03fa
--- /dev/null
+++ b/src/wiktextract/data/es/linkage_subtitles.json
@@ -0,0 +1,9 @@
+{
+  "antónimo": "antonyms",
+  "derivad": "derived",
+  "hipónimo": "hyponyms",
+  "hiperónimo": "hypernyms",
+  "merónimo": "meronyms",
+  "relacionado": "related",
+  "sinónimo": "synonyms"
+}
diff --git a/src/wiktextract/data/es/other_subtitles.json b/src/wiktextract/data/es/other_subtitles.json
index e7c1ed26..c59bca58 100644
--- a/src/wiktextract/data/es/other_subtitles.json
+++ b/src/wiktextract/data/es/other_subtitles.json
@@ -1,5 +1,5 @@
 {
   "etymology": ["Etimología"],
-  "pronunciation": ["pronunciación"],
-  "ignored_sections": ["Véase también"]
+  "ignored_sections": ["Véase también"],
+  "translations": ["Traducciones"]
 }
diff --git a/src/wiktextract/extractor/es/example.py b/src/wiktextract/extractor/es/example.py
new file mode 100644
index 00000000..240863fd
--- /dev/null
+++ b/src/wiktextract/extractor/es/example.py
@@ -0,0 +1,179 @@
+import re
+from typing import Optional, Tuple, Union
+
+from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+from wiktextract.extractor.es.models import Example, Reference, Sense
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+EXAMPLE_TEMPLATE_KEY_MAPPING = {
+    "título": "title",
+    "nombre": "first_name",
+    "apellidos": "last_name",
+    "páginas": "pages",
+    "URL": "url",
+    "año": "year",
+    "capítulo": "chapter",
+    "fecha": "date",
+    "editorial": "journal",
+    "editor": "editor",
+    "ubicación": "place",
+}
+
+
+def clean_text_and_url_from_text_nodes(
+    wxr: WiktextractContext, nodes: WikiNodeChildrenList
+) -> Tuple[str, Optional[str]]:
+    if not nodes:
+        return "", None
+
+    url_node = None
+    text_nodes_without_url = []
+    for n in nodes:
+        if isinstance(n, WikiNode) and n.kind == NodeKind.URL:
+            url_node = n
+        else:
+            text_nodes_without_url.append(n)
+
+    url = None
+    if url_node:
+        url = clean_node(wxr, {}, url_node)
+
+    text = clean_node(wxr, {}, text_nodes_without_url)
+
+    return text, url
+
+
+def add_template_params_to_reference(
+    wxr: WiktextractContext,
+    params: Optional[
+        dict[
+            Union[str, int],
+            Union[str, WikiNode, list[Union[str, WikiNode]]],
+        ]
+    ],
+    reference: Reference,
+):
+    for key in params.keys():
+        if isinstance(key, int):
+            continue
+
+        ref_key = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key, key)
+        if ref_key in reference.model_fields:
+            setattr(reference, ref_key, clean_node(wxr, {}, params.get(key)))
+        else:
+            wxr.wtp.debug(
+                f"Unknown key {key} in example template {params}",
+                sortid="wiktextract/extractor/es/example/add_template_params_to_reference/73",
+            )
+
+
+def process_example_template(
+    wxr: WiktextractContext,
+    sense_data: Sense,
+    template_node: WikiNode,
+    reference: Reference,
+):
+    params = template_node.template_parameters
+    text_nodes = params.get(1)
+
+    # Remove url node before cleaning text nodes
+    text, url = clean_text_and_url_from_text_nodes(wxr, text_nodes)
+
+    if not text:
+        return
+
+    example = Example(text=text)
+
+    if url:
+        example.ref = Reference(url=url)
+
+    if template_node.template_name == "ejemplo_y_trad":
+        example.translation = clean_node(wxr, {}, params.get(2))
+
+    add_template_params_to_reference(wxr, params, reference)
+
+    sense_data.examples.append(example)
+
+
+def extract_example(
+    wxr: WiktextractContext,
+    sense_data: Sense,
+    nodes: WikiNodeChildrenList,
+):
+    rest: WikiNodeChildrenList = []
+
+    reference = Reference()
+    for node in nodes:
+        if isinstance(node, WikiNode) and node.kind == NodeKind.TEMPLATE:
+            if node.template_name in ["ejemplo", "ejemplo_y_trad"]:
+                process_example_template(wxr, sense_data, node, reference)
+            else:
+                rest.append(node)
+        elif isinstance(node, WikiNode) and node.kind == NodeKind.URL:
+            reference.url = clean_node(wxr, {}, node)
+        else:
+            rest.append(node)
+
+    if not sense_data.examples and rest:
+        example = Example(text=clean_node(wxr, {}, rest))
+        sense_data.examples.append(example)
+    elif rest:
+        wxr.wtp.debug(
+            f"Unprocessed nodes from example group: {rest}",
+            sortid="extractor/es/example/extract_example/87",
+        )
+
+    if sense_data.examples and reference.model_dump(exclude_defaults=True):
+        sense_data.examples[-1].ref = reference
+
+
+def process_example_list(
+    wxr: WiktextractContext,
+    sense_data: Sense,
+    list_item: WikiNode,
+):
+    for sub_list_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
+        text_nodes: WikiNodeChildrenList = []
+        template_nodes: list[WikiNode] = []
+        for child in sub_list_item.children:
+            if isinstance(child, WikiNode) and child.kind == NodeKind.TEMPLATE:
+                template_nodes.append(child)
+            else:
+                text_nodes.append(child)
+
+        text, url = clean_text_and_url_from_text_nodes(wxr, text_nodes)
+
+        if not text:
+            continue
+
+        example = Example(text=text)
+        if url:
+            example.ref = Reference(url=url)
+
+        for template_node in template_nodes:
+            reference = Reference()
+            if template_node.template_name == "cita libro":
+                add_template_params_to_reference(
+                    wxr, template_node.template_parameters, reference
+                )
+            if reference.model_dump(exclude_defaults=True):
+                example.ref = reference
+
+        sense_data.examples.append(example)
+
+    # If no example was found in sublists, assume example is in
+    # list_item.children directly.
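+    # The re.sub below strips a leading "Ejemplo:"/"Ejemplos:" label (as in
+    # ":*'''Ejemplo:''' text") so that only the example text itself is kept.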
+    if not sense_data.examples:
+        text, url = clean_text_and_url_from_text_nodes(wxr, list_item.children)
+
+        text = re.sub(r"^(Ejemplos?:?)", "", text).strip()
+
+        if not text:
+            return
+        example = Example(text=text)
+        if url:
+            example.ref = Reference(url=url)
+
+        sense_data.examples.append(example)
diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py
index 44f209e7..38892b10 100644
--- a/src/wiktextract/extractor/es/gloss.py
+++ b/src/wiktextract/extractor/es/gloss.py
@@ -1,17 +1,17 @@
 import re
-from typing import List
 
 from wikitextprocessor import NodeKind, WikiNode
 from wikitextprocessor.parser import WikiNodeChildrenList
 
 from wiktextract.extractor.es.models import Sense, WordEntry
+from wiktextract.extractor.es.sense_data import process_sense_data_list
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
 
 def extract_gloss(
     wxr: WiktextractContext,
-    page_data: List[WordEntry],
+    page_data: list[WordEntry],
     list_node: WikiNode,
 ) -> None:
     for list_item in list_node.find_child(NodeKind.LIST_ITEM):
@@ -20,6 +20,9 @@ def extract_gloss(
         definition: WikiNodeChildrenList = []
         other: WikiNodeChildrenList = []
 
+        if not list_item.definition:
+            continue
+
         for node in list_item.definition:
             if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                 other.append(node)
@@ -53,10 +56,18 @@ def extract_gloss(
             if tag:
                 gloss_data.tags.append(tag)
 
-        if other:
-            wxr.wtp.debug(
-                f"Found nodes that are not part of definition: {other}",
-                sortid="extractor/es/gloss/extract_gloss/46",
-            )
-
         page_data[-1].senses.append(gloss_data)
+
+        if other:
+            for node in other:
+                if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                    process_sense_data_list(
+                        wxr,
+                        page_data[-1].senses[-1],
+                        node,
+                    )
+                else:
+                    wxr.wtp.debug(
+                        f"Found nodes that are not part of definition: {node}",
+                        sortid="extractor/es/gloss/extract_gloss/46",
+                    )
diff --git a/src/wiktextract/extractor/es/linkage.py b/src/wiktextract/extractor/es/linkage.py
new file mode 100644
index 00000000..7286790d
--- /dev/null
+++ b/src/wiktextract/extractor/es/linkage.py
@@ -0,0 +1,20 @@
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+from wiktextract.extractor.es.models import WordEntry
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_linkage(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    nodes: WikiNodeChildrenList,
+):
+    pass
+
+
+def process_linkage_list_children(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    nodes: WikiNodeChildrenList,
+):
+    pass
diff --git a/src/wiktextract/extractor/es/models.py b/src/wiktextract/extractor/es/models.py
index a0e57f48..63d2649f 100644
--- a/src/wiktextract/extractor/es/models.py
+++ b/src/wiktextract/extractor/es/models.py
@@ -1,49 +1,45 @@
 import json
-import logging
-from typing import List, Optional
+from typing import Optional
 
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import BaseModel, ConfigDict, Field
 from pydantic.json_schema import GenerateJsonSchema
 
-from wiktextract.wxr_context import WiktextractContext
 
+class BaseModelWrap(BaseModel):
+    model_config = ConfigDict(validate_assignment=True, extra="forbid")
+
+
+class Reference(BaseModelWrap):
+    url: Optional[str] = Field(default=None, description="A web link")
+    first_name: Optional[str] = Field(
+        default=None, description="Author's first name"
+    )
+    last_name: Optional[str] = Field(
+        default=None, description="Author's last name"
+    )
+    title: Optional[str] = Field(
+        default=None, description="Title of the reference"
+    )
+    pages: Optional[str] = Field(default=None, description="Page numbers")
+    year: Optional[str] = Field(default=None, description="Year of publication")
+    date: Optional[str] = Field(default=None, description="Date of publication")
+    journal: Optional[str] = Field(default=None, description="Name of journal")
+    chapter: Optional[str] = Field(default=None, description="Chapter name")
+    place: Optional[str] = Field(
+        default=None, description="Place of publication"
+    )
+    editor: Optional[str] = Field(default=None, description="Editor")
 
-class PydanticLogger:
-    wxr: Optional[WiktextractContext] = None
-
-    @classmethod
-    def debug(
-        cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted"
-    ):
-        if cls.wxr:
-            cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid)
-        else:
-            logging.debug(msg)
 
+class Example(BaseModelWrap):
+    text: str = Field(description="Example usage sentence")
+    translation: Optional[str] = Field(
+        default=None, description="Spanish translation of the example sentence"
+    )
+    ref: Optional["Reference"] = Field(
+        default=None, description="Source reference of the example"
+    )
 
-class BaseModelWrap(BaseModel):
-    model_config = ConfigDict(validate_assignment=True)
-
-
-class LoggingExtraFieldsModel(BaseModelWrap):
-    @model_validator(mode="before")
-    def log_extra_fields(cls, values):
-        all_allowed_field_names = cls.model_fields.keys()
-        extra_fields = {
-            name: str(value)
-            for name, value in values.items()
-            if name not in all_allowed_field_names
-        }
-        if extra_fields:
-            class_full_name = cls.__name__
-            PydanticLogger.debug(
-                msg=f"Pydantic - Got extra fields in {class_full_name}: {extra_fields}",
-                sortid="wiktextract/extractor/es/pydantic/extra_fields/33",
-            )
-        return values
-
-
-class Sense(LoggingExtraFieldsModel):
+class Sense(BaseModelWrap):
     glosses: list[str] = Field(
         description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging."
     )
@@ -55,7 +51,9 @@ class Sense(LoggingExtraFieldsModel):
         default=[],
         description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
     )
-    # examples: list[SenseExample] = []
+    examples: list["Example"] = Field(
+        default=[], description="List of examples"
+    )
     subsenses: list["Sense"] = Field(
         default=[], description="List of subsenses"
     )
@@ -64,7 +62,7 @@ class Sense(LoggingExtraFieldsModel):
     )
 
 
-class Spelling(LoggingExtraFieldsModel):
+class Spelling(BaseModelWrap):
     alternative: Optional[str] = Field(
         default=None, description="Alternative spelling with same pronunciation"
     )
@@ -77,30 +75,30 @@ class Spelling(LoggingExtraFieldsModel):
     )
 
 
-class Sound(LoggingExtraFieldsModel):
-    ipa: List[str] = Field(
+class Sound(BaseModelWrap):
+    ipa: list[str] = Field(
         default=[], description="International Phonetic Alphabet"
     )
-    phonetic_transcription: List[str] = Field(
+    phonetic_transcription: list[str] = Field(
         default=[], description="Phonetic transcription, less exact than IPA."
     )
-    audio: List[str] = Field(default=[], description="Audio file name")
-    wav_url: List[str] = Field(default=[])
-    ogg_url: List[str] = Field(default=[])
-    mp3_url: List[str] = Field(default=[])
-    flac_url: List[str] = Field(default=[])
-    roman: List[str] = Field(
+    audio: list[str] = Field(default=[], description="Audio file name")
+    wav_url: list[str] = Field(default=[])
+    ogg_url: list[str] = Field(default=[])
+    mp3_url: list[str] = Field(default=[])
+    flac_url: list[str] = Field(default=[])
+    roman: list[str] = Field(
         default=[], description="Transliteration to Roman characters"
     )
-    syllabic: List[str] = Field(
+    syllabic: list[str] = Field(
         default=[], description="Syllabic transcription"
     )
-    tag: List[str] = Field(
+    tag: list[str] = Field(
         default=[], description="Specifying the variant of the pronunciation"
     )
 
 
-class WordEntry(LoggingExtraFieldsModel):
+class WordEntry(BaseModelWrap):
     """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""
 
     word: str = Field(description="word string")
diff --git a/src/wiktextract/extractor/es/page.py b/src/wiktextract/extractor/es/page.py
index 174c0a0b..060e34d2 100644
--- a/src/wiktextract/extractor/es/page.py
+++ b/src/wiktextract/extractor/es/page.py
@@ -1,12 +1,16 @@
 import copy
 import logging
-from typing import Dict, List
+import re
 
 from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import WikiNodeChildrenList
 
+from wiktextract.extractor.es.example import extract_example
 from wiktextract.extractor.es.gloss import extract_gloss
-from wiktextract.extractor.es.models import PydanticLogger, WordEntry
+from wiktextract.extractor.es.linkage import extract_linkage
+from wiktextract.extractor.es.models import WordEntry
 from wiktextract.extractor.es.pronunciation import process_pron_graf_template
+from wiktextract.extractor.es.sense_data import process_sense_data_list
 from wiktextract.page import clean_node
 from wiktextract.wxr_context import WiktextractContext
 
@@ -23,80 +27,276 @@ ADDITIONAL_EXPAND_TEMPLATES = set()
 
 
+def parse_entries(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    base_data: WordEntry,
+    level_node: WikiNode,
+):
+    """
+    Parse entries in a language section (level 2) or etymology section
+    (level 3) and extract data affecting all subsections, e.g. the
+    {{pron-graf}} template.
+
+    A language section may contain multiple entries, usually divided into
+    different POS by level 3 headings,
+    e.g. https://es.wiktionary.org/wiki/agua or
+    https://es.wiktionary.org/wiki/love
+
+    If a word has distinct etymologies, these are separated by level 3
+    headings and subdivided by their POS at level 4 headings,
+    e.g. https://es.wiktionary.org/wiki/churro
+    """
+    next_level_kind = (
+        NodeKind.LEVEL3
+        if level_node.kind == NodeKind.LEVEL2
+        else NodeKind.LEVEL4
+    )
+
+    # Copy base_data so that data collected here does not leak into
+    # entries it should not apply to.
+    base_data_copy = copy.deepcopy(base_data)
+
+    # Parse data affecting all subsections and add to base_data_copy
+    for not_sub_level_node in level_node.invert_find_child(next_level_kind):
+        if (
+            isinstance(not_sub_level_node, WikiNode)
+            and not_sub_level_node.kind == NodeKind.TEMPLATE
+            and not_sub_level_node.template_name == "pron-graf"
+        ):
+            if wxr.config.capture_pronunciation:
+                process_pron_graf_template(
+                    wxr, base_data_copy, not_sub_level_node
+                )
+        else:
+            wxr.wtp.debug(
+                f"Found unexpected child in page_entry for {level_node.largs}: {not_sub_level_node}",
+                sortid="extractor/es/page/parse_entries/69",
+            )
+
+    for sub_level_node in level_node.find_child(next_level_kind):
+        parse_section(wxr, page_data, base_data_copy, sub_level_node)
+
+
 def parse_section(
     wxr: WiktextractContext,
-    page_data: List[WordEntry],
+    page_data: list[WordEntry],
     base_data: WordEntry,
     level_node: WikiNode,
 ) -> None:
-    # Page Structure: https://es.wiktionary.org/wiki/Wikcionario:Estructura
-    subtitle = clean_node(wxr, base_data, level_node.largs)
-    wxr.wtp.start_subsection(subtitle)
+    """
+    Parses individual sibling sections of an entry,
+    e.g. https://es.wiktionary.org/wiki/amor:
+
+    === Etimología ===
+    === {{sustantivo masculino|es}} ===
+    === Locuciones ===
+
+    See also the page structure:
+    https://es.wiktionary.org/wiki/Wikcionario:Estructura
+    """
+
+    section_title = clean_node(wxr, base_data, level_node.largs)
+    wxr.wtp.start_subsection(section_title)
     pos_template_name = None
     for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        pos_template_name = level_node_template.template_name
 
-    if subtitle in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
+    if re.match(r"Etimología \d+", section_title):
+        parse_entries(wxr, page_data, base_data, level_node)
+
+    elif section_title in wxr.config.OTHER_SUBTITLES["ignored_sections"]:
         pass
 
     elif pos_template_name and pos_template_name in wxr.config.POS_SUBTITLES:
+        pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"]
+
+        page_data.append(copy.deepcopy(base_data))
+        page_data[-1].pos = pos_type
+        page_data[-1].pos_title = section_title
+
         process_pos_block(
-            wxr, page_data, base_data, level_node, pos_template_name, subtitle
+            wxr,
+            page_data,
+            level_node,
         )
+    elif section_title in wxr.config.OTHER_SUBTITLES["etymology"]:
+        if wxr.config.capture_etymologies:
+            # XXX: Extract etymology
+            pass
+    elif section_title in wxr.config.OTHER_SUBTITLES["translations"]:
+        if wxr.config.capture_translations:
+            # XXX: Extract translations
+            pass
     else:
         wxr.wtp.debug(
-            f"Unprocessed section: {subtitle}",
+            f"Unprocessed section: {section_title}",
             sortid="extractor/es/page/parse_section/48",
         )
 
 
 def process_pos_block(
     wxr: WiktextractContext,
-    page_data: List[WordEntry],
-    base_data: WordEntry,
+    page_data: list[WordEntry],
     pos_level_node: WikiNode,
-    pos_template_name: str,
-    pos_title: str,
 ):
-    pos_type = wxr.config.POS_SUBTITLES[pos_template_name]["pos"]
+    """
+    Senses are indicated by list nodes with a semicolon as their sarg. They
+    can be followed by multiple nodes that add different kinds of information
+    to the sense. These nodes are collected in sense_children and processed
+    after the next sense is encountered or after the last sense has been
+    processed.
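+
+    A simplified sketch of the wikitext shape handled here (assumed, based on
+    the handling below and the test pages): the ";" list opens a sense, and
+    the ":*" list and {{ejemplo}} template that follow attach data to it:
+
+        ;1: Primera acepción.
+        :*'''Sinónimos:''' primer sinónimo, segundo sinónimo.
+        {{ejemplo|Oración de ejemplo.}}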
+ """ - page_data.append(copy.deepcopy(base_data)) - page_data[-1].pos = pos_type - page_data[-1].pos_title = pos_title child_nodes = list(pos_level_node.filter_empty_str_child()) + sense_children: WikiNodeChildrenList = ( + [] + ) # All non-gloss nodes that add additional information to a sense + for child in child_nodes: if ( - isinstance(child, WikiNode) - and child.kind == NodeKind.TEMPLATE - and ( - "inflect" in child.template_name - or "v.conj" in child.template_name - ) - ): - # XXX: Extract forms - pass - elif ( isinstance(child, WikiNode) and child.kind == NodeKind.LIST and child.sarg == ";" ): + # Consume sense_children of previous sense and extract gloss of new sense + process_sense_children(wxr, page_data, sense_children) + sense_children = [] + extract_gloss(wxr, page_data, child) + elif page_data[-1].senses: + sense_children.append(child) + else: - # XXX: Extract data - pass - pass + # Process nodes before first sense + if ( + isinstance(child, WikiNode) + and child.kind == NodeKind.TEMPLATE + and ( + "inflect" in child.template_name + or "v.conj" in child.template_name + ) + ): + # XXX: Extract forms + pass + + elif ( + isinstance(child, WikiNode) + and child.kind == NodeKind.LINK + and "Categoría" in child.largs[0][0] + ): + clean_node( + wxr, + page_data[-1], + child, + ) + else: + wxr.wtp.debug( + f"Found unexpected node in pos_block: {child}", + sortid="extractor/es/page/process_pos_block/184", + ) + process_sense_children(wxr, page_data, sense_children) + + +def process_sense_children( + wxr: WiktextractContext, + page_data: list[WordEntry], + sense_children: WikiNodeChildrenList, +) -> None: + """In most cases additional information to a sense is given via special + templates or lists. However, sometimes string nodes are used to add + information to a preceeding template or list. + + This function collects the nodes that form a group and calls the relevant + methods for extraction. + """ + + def starts_new_group(child: WikiNode) -> bool: + # Nested function for readibility + return isinstance(child, WikiNode) and ( + child.kind == NodeKind.TEMPLATE + or child.kind == NodeKind.LIST + or child.kind == NodeKind.LINK + ) + + def process_group( + wxr: WiktextractContext, + page_data: list[WordEntry], + group: WikiNodeChildrenList, + ) -> None: + # Nested function for readibility + + if len(group) == 0: + return + elif ( + isinstance(group[0], WikiNode) + and group[0].kind == NodeKind.TEMPLATE + ): + template_name = group[0].template_name + + if template_name == "clear": + return + elif ( + template_name.removesuffix("s") in wxr.config.LINKAGE_SUBTITLES + ): + extract_linkage(wxr, page_data, group) + elif template_name in ["ejemplo", "ejemplos", "ejemplo_y_trad"]: + extract_example(wxr, page_data[-1].senses[-1], group) + elif template_name == "uso": + # XXX: Extract usage note + pass + elif template_name == "ámbito": + # XXX Extract scope note + pass + else: + wxr.wtp.debug( + f"Found unexpected group specifying a sense: {group}, head template {template_name}", + sortid="extractor/es/page/process_group/102", + ) + + elif isinstance(group[0], WikiNode) and group[0].kind == NodeKind.LIST: + list_node = group[ + 0 + ] # List groups seem to not be followed by string nodes. We, therefore, only process the list_node. 
+ process_sense_data_list(wxr, page_data[-1].senses[-1], list_node) + + elif ( + isinstance(child, WikiNode) + and child.kind == NodeKind.LINK + and "Categoría" in child.largs[0][0] + ): + # Extract sense categories + clean_node( + wxr, + page_data[-1].senses[-1], + child, + ) + + else: + wxr.wtp.debug( + f"Found unexpected group specifying a sense: {group}", + sortid="extractor/es/page/process_group/117", + ) + + group: WikiNodeChildrenList = [] + + for child in sense_children: + if starts_new_group(child): + process_group(wxr, page_data, group) + group = [] + group.append(child) + + process_group(wxr, page_data, group) def parse_page( wxr: WiktextractContext, page_title: str, page_text: str -) -> List[Dict[str, any]]: +) -> list[dict[str, any]]: if wxr.config.verbose: logging.info(f"Parsing page: {page_title}") - # Pass current wiktextractcontext to pydantic for more better logging - PydanticLogger.wxr = wxr wxr.config.word = page_title wxr.wtp.start_page(page_title) @@ -109,7 +309,7 @@ def parse_page( additional_expand=ADDITIONAL_EXPAND_TEMPLATES, ) - page_data: List[WordEntry] = [] + page_data: list[WordEntry] = [] for level2_node in tree.find_child(NodeKind.LEVEL2): for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE): # https://es.wiktionary.org/wiki/Plantilla:lengua @@ -129,28 +329,6 @@ def parse_page( lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title ) base_data.categories.extend(categories["categories"]) - for level3_node in level2_node.find_child(NodeKind.LEVEL3): - parse_section(wxr, page_data, base_data, level3_node) - - for not_level3_node in level2_node.invert_find_child( - NodeKind.LEVEL3 - ): - if ( - isinstance(not_level3_node, WikiNode) - and not_level3_node.kind == NodeKind.TEMPLATE - and not_level3_node.template_name == "pron-graf" - ): - if ( - wxr.config.capture_pronunciation - and len(page_data) > 0 - ): - process_pron_graf_template( - wxr, page_data[-1], not_level3_node - ) - else: - wxr.wtp.debug( - f"Found unexpected child in level 2 'lengua' node: {not_level3_node}", - sortid="extractor/es/page/parse_page/80", - ) + parse_entries(wxr, page_data, base_data, level2_node) return [d.model_dump(exclude_defaults=True) for d in page_data] diff --git a/src/wiktextract/extractor/es/sense_data.py b/src/wiktextract/extractor/es/sense_data.py new file mode 100644 index 00000000..21191f3b --- /dev/null +++ b/src/wiktextract/extractor/es/sense_data.py @@ -0,0 +1,57 @@ +from wikitextprocessor import NodeKind, WikiNode + +from wiktextract.extractor.es.example import process_example_list +from wiktextract.extractor.es.linkage import process_linkage_list_children +from wiktextract.extractor.es.models import Sense +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + + +def process_sense_data_list( + wxr: WiktextractContext, + sense_data: Sense, + list_node: WikiNode, +): + list_marker = list_node.sarg + + if list_marker == ":;": + # XXX: Extract subsenses (rare!) 
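+        # A ":;" list nests a sub-gloss under the current sense; extracted
+        # subsenses would go into the "subsenses" field of the Sense model.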
+        pass
+    elif list_marker in [":*"]:
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            children = list(list_item.filter_empty_str_child())
+            # The first child will specify what data is listed
+            list_type = (
+                clean_node(wxr, {}, children[0])
+                .strip()
+                .removesuffix(":")
+                .removesuffix("s")
+                .lower()
+            )
+
+            if list_type == "ejemplo":
+                process_example_list(wxr, sense_data, list_item)
+            elif list_type in wxr.config.LINKAGE_SUBTITLES:
+                process_linkage_list_children(wxr, sense_data, children[1:])
+            elif list_type == "ámbito":
+                # XXX: Extract scope tag
+                pass
+            elif list_type == "uso":
+                # XXX: Extract usage note
+                pass
+            else:
+                wxr.wtp.debug(
+                    f"Found unknown list type '{list_type}' in {list_item}",
+                    sortid="extractor/es/sense_data/process_sense_data_list/46",
+                )
+
+    elif list_marker in ["::", ":::"]:
+        # E.g. https://es.wiktionary.org/wiki/silepsis
+        for list_item in list_node.find_child_recursively(NodeKind.LIST_ITEM):
+            process_example_list(wxr, sense_data, list_item)
+
+    else:
+        wxr.wtp.debug(
+            f"Found unknown list marker {list_marker} in: {list_node}",
+            sortid="extractor/es/sense_data/process_sense_data_list/52",
+        )
diff --git a/tests/test_es_example.py b/tests/test_es_example.py
new file mode 100644
index 00000000..e6ca9f0c
--- /dev/null
+++ b/tests/test_es_example.py
@@ -0,0 +1,157 @@
+import unittest
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.es.example import (
+    extract_example,
+    process_example_list,
+)
+from wiktextract.extractor.es.models import Sense
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestESExample(unittest.TestCase):
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="es"),
+            WiktionaryConfig(dump_file_lang_code="es"),
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+
+    def get_default_sense_data(self) -> Sense:
+        return Sense(glosses=["gloss1"])
+
+    def test_es_extract_example(self):
+        test_cases = [
+            # https://es.wiktionary.org/wiki/coñazo
+            {
+                "input": "{{ejemplo|La conferencia ha sido un ''coñazo''}}",
+                "expected": [{"text": "La conferencia ha sido un coñazo"}],
+            },
+            # https://es.wiktionary.org/wiki/necroporra
+            {
+                "input": "{{ejemplo|Nos gusta lo oscuro, y por eso triunfa la Necroporra, sea ético o no}}[https://www.menzig.es/a/necroporra-fantamorto-porra-famosos-muertos/ ]",
+                "expected": [
+                    {
+                        "text": "Nos gusta lo oscuro, y por eso triunfa la Necroporra, sea ético o no",
+                        "ref": {
+                            "url": "https://www.menzig.es/a/necroporra-fantamorto-porra-famosos-muertos/"
+                        },
+                    }
+                ],
+            },
+            # https://es.wiktionary.org/wiki/ser_más_viejo_que_Matusalén
+            {
+                "input": """{{ejemplo|Papel: más viejo que Matusalén, pero graduado "cum laude" en eficacia publicitaria [https://www.marketingdirecto.com/marketing-general/publicidad/papel-mas-viejo-matusalen-pero-graduado-cum-laude-eficacia-publicitaria]}}""",
+                "expected": [
+                    {
+                        "text": """Papel: más viejo que Matusalén, pero graduado "cum laude" en eficacia publicitaria""",
+                        "ref": {
+                            "url": "https://www.marketingdirecto.com/marketing-general/publicidad/papel-mas-viejo-matusalen-pero-graduado-cum-laude-eficacia-publicitaria"
+                        },
+                    }
+                ],
+            },
+            # https://es.wiktionary.org/wiki/zapotear
+            {
+                "input": "{{ejemplo|Era persona inteligente, culta, que me permitía ''zapotear'' los libros y me hacía comentarios sobre ellos y sus autores|título=Memorias intelectuales|apellidos=Jaramillo Uribe|nombre=Jaime|páginas=19|URL=https://books.google.com.co/books?id=X9MSAQAAIAAJ&q=zapotear|año=2007}}",
+                "expected": [
+                    {
+                        "text": "Era persona inteligente, culta, que me permitía zapotear los libros y me hacía comentarios sobre ellos y sus autores",
+                        "ref": {
+                            "title": "Memorias intelectuales",
+                            "first_name": "Jaime",
+                            "last_name": "Jaramillo Uribe",
+                            "pages": "19",
+                            "url": "https://books.google.com.co/books?id=X9MSAQAAIAAJ&q=zapotear",
+                            "year": "2007",
+                        },
+                    }
+                ],
+            },
+            # https://es.wiktionary.org/wiki/meek
+            {
+                "input": "{{ejemplo_y_trad|Blessed are the '''meek''', For they shall inherit the earth|Bienaventurados los '''mansos''', porque recibirán la tierra por heredad}}",
+                "expected": [
+                    {
+                        "text": "Blessed are the meek, For they shall inherit the earth",
+                        "translation": "Bienaventurados los mansos, porque recibirán la tierra por heredad",
+                    }
+                ],
+            },
+            # https://es.wiktionary.org/wiki/confesar
+            {
+                "input": "{{ejemplo}} El interrogatorio fue efectivo y el detenido ''confesó''.",
+                "expected": [
+                    {
+                        "text": "El interrogatorio fue efectivo y el detenido confesó.",
+                    }
+                ],
+            },
+        ]
+        for case in test_cases:
+            with self.subTest(case=case):
+                self.wxr.wtp.start_page("")
+                sense_data = self.get_default_sense_data()
+
+                root = self.wxr.wtp.parse(case["input"])
+
+                extract_example(self.wxr, sense_data, root.children)
+                self.assertEqual(
+                    sense_data.model_dump(exclude_defaults=True)["examples"],
+                    case["expected"],
+                )
+
+    def test_es_process_example_list(self):
+        test_cases = [
+            {"input": ":*'''Ejemplo:'''\n", "expected": []},
+            # https://es.wiktionary.org/wiki/cerebro
+            {
+                "input": ":*'''Ejemplo:''' Tú serás el cerebro del plan.",
+                "expected": [{"text": "Tú serás el cerebro del plan."}],
+            },
+            # https://es.wiktionary.org/wiki/quicio
+            {
+                "input": """:*'''Ejemplo:'''
+::* «Apoyado contra el ''quicio'' de la puerta, adivina, de pronto, a su marido.» {{cita libro|nombre=María Luisa|apellidos=Bombal}}""",
+                "expected": [
+                    {
+                        "text": "«Apoyado contra el quicio de la puerta, adivina, de pronto, a su marido.»",
+                        "ref": {
+                            "first_name": "María Luisa",
+                            "last_name": "Bombal",
+                        },
+                    }
+                ],
+            },
+            # https://es.wiktionary.org/wiki/silepsis
+            {
+                "input": "::Su [[obra]] comprendió [[esculpir]] un [[busto]], varios [[retrato|retratos]] y uno que otro [[dibujo]] al [[carbón]].",
+                "expected": [
+                    {
+                        "text": "Su obra comprendió esculpir un busto, varios retratos y uno que otro dibujo al carbón."
+                    }
+                ],
+            },
+        ]
+        for case in test_cases:
+            with self.subTest(case=case):
+                self.wxr.wtp.start_page("")
+                sense_data = self.get_default_sense_data()
+
+                root = self.wxr.wtp.parse(case["input"])
+
+                process_example_list(
+                    self.wxr, sense_data, root.children[0].children[0]
+                )
+                examples = [
+                    e.model_dump(exclude_defaults=True)
+                    for e in sense_data.examples
+                ]
+                self.assertEqual(
+                    examples,
+                    case["expected"],
+                )
diff --git a/tests/test_es_page.py b/tests/test_es_page.py
new file mode 100644
index 00000000..5281a587
--- /dev/null
+++ b/tests/test_es_page.py
@@ -0,0 +1,51 @@
+import unittest
+
+from wikitextprocessor import Wtp
+
+from wiktextract.config import WiktionaryConfig
+from wiktextract.extractor.es.models import WordEntry
+from wiktextract.extractor.es.page import parse_entries
+from wiktextract.wxr_context import WiktextractContext
+
+
+class TestESPage(unittest.TestCase):
+    def setUp(self) -> None:
+        self.wxr = WiktextractContext(
+            Wtp(lang_code="es"),
+            WiktionaryConfig(dump_file_lang_code="es"),
+        )
+
+    def tearDown(self) -> None:
+        self.wxr.wtp.close_db_conn()
+
+    def get_default_page_data(self) -> list[WordEntry]:
+        return [WordEntry(word="test", lang_code="es", lang_name="Language")]
+
+    def test_es_parse_entries(self):
+        """
+        Writes data affecting multiple entries to all affected WordEntry
+        objects.
+        """
+        self.wxr.wtp.start_page("love")
+
+        # https://es.wiktionary.org/wiki/love
+        root = self.wxr.wtp.parse(
+            """== {{lengua|en}} ==
+{{pron-graf|leng=en|fone=lʌv}}
+=== {{verbo|en}} ===
+=== {{sustantivo|en}} ===
+"""
+        )
+
+        base_data = self.get_default_page_data()[0]
+        page_data = []
+
+        parse_entries(self.wxr, page_data, base_data, root.children[0])
+
+        self.assertEqual(len(page_data), 2)
+
+        self.assertEqual(page_data[0].sounds, page_data[1].sounds)
+
+        self.assertNotEqual(
+            page_data[0].sounds,
+            base_data.sounds,
+        )
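
Note (not part of the patch): a minimal sketch of how the new example extractor is driven end to end, reusing the fixtures and the coñazo case from test_es_example.py above; the expected output in the final comment comes from that test.

    from wikitextprocessor import Wtp

    from wiktextract.config import WiktionaryConfig
    from wiktextract.extractor.es.example import extract_example
    from wiktextract.extractor.es.models import Sense
    from wiktextract.wxr_context import WiktextractContext

    # Same setup as TestESExample.setUp
    wxr = WiktextractContext(
        Wtp(lang_code="es"), WiktionaryConfig(dump_file_lang_code="es")
    )
    wxr.wtp.start_page("coñazo")
    sense = Sense(glosses=["gloss1"])

    root = wxr.wtp.parse("{{ejemplo|La conferencia ha sido un ''coñazo''}}")
    extract_example(wxr, sense, root.children)

    # Expected, per the coñazo test case:
    # [{'text': 'La conferencia ha sido un coñazo'}]
    print(sense.model_dump(exclude_defaults=True)["examples"])

    wxr.wtp.close_db_conn()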