Skip to content

Commit

Permalink
Merge pull request #414 from empiriker/es
Browse files Browse the repository at this point in the history
Extract examples from Spanish Wiktionary
  • Loading branch information
xxyzz authored Dec 5, 2023
2 parents b0c038f + de483a9 commit 805e5e9
Show file tree
Hide file tree
Showing 10 changed files with 778 additions and 118 deletions.
9 changes: 9 additions & 0 deletions src/wiktextract/data/es/linkage_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"antónimo": "antonyms",
"derivad": "derived",
"hipónimo": "hyponyms",
"hiperónimo": "hypernyms",
"merónimo": "meronyms",
"relacionado": "related",
"sinónimo": "synonyms"
}
4 changes: 2 additions & 2 deletions src/wiktextract/data/es/other_subtitles.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"etymology": ["Etimología"],
"pronunciation": ["pronunciación"],
"ignored_sections": ["Véase también"]
"ignored_sections": ["Véase también"],
"translations": ["Traducciones"]
}
179 changes: 179 additions & 0 deletions src/wiktextract/extractor/es/example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import re
from typing import Optional, Tuple, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import Example, Reference, Sense
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Maps Spanish citation-template parameter names to the English field
# names used on the Reference model. Keys absent from this mapping are
# passed through unchanged (see add_template_params_to_reference).
EXAMPLE_TEMPLATE_KEY_MAPPING = {
    "título": "title",
    "nombre": "first_name",
    "apellidos": "last_name",
    "páginas": "pages",
    "URL": "url",
    "año": "year",
    "capítulo": "chapter",
    "fecha": "date",
    "editorial": "journal",
    "editor": "editor",
    "ubicación": "place",
}


def clean_text_and_url_from_text_nodes(
    wxr: WiktextractContext, nodes: WikiNodeChildrenList
) -> Tuple[str, Optional[str]]:
    """Split *nodes* into example text and an optional URL.

    Returns ``(text, url)``: ``url`` is the cleaned content of the last
    URL node found in *nodes* (or ``None`` if there is none), and
    ``text`` is the cleaned concatenation of all remaining nodes.
    Empty or ``None`` input yields ``("", None)``.
    """
    if not nodes:
        return "", None

    last_url_node = None
    remaining: WikiNodeChildrenList = []
    for node in nodes:
        if isinstance(node, WikiNode) and node.kind == NodeKind.URL:
            # Keep only the last URL node; everything else is text.
            last_url_node = node
        else:
            remaining.append(node)

    url = clean_node(wxr, {}, last_url_node) if last_url_node else None
    return clean_node(wxr, {}, remaining), url


def add_template_params_to_reference(
    wxr: WiktextractContext,
    params: Optional[
        dict[
            Union[str, int],
            Union[str, WikiNode, list[Union[str, WikiNode]]],
        ]
    ],
    reference: Reference,
):
    """Copy named citation-template parameters into *reference*.

    Spanish parameter names are translated to English Reference field
    names via EXAMPLE_TEMPLATE_KEY_MAPPING; positional (integer)
    parameters are skipped. Keys that map to no known Reference field
    are reported through ``wxr.wtp.debug``.
    """
    if not params:
        # The annotation allows None; guard against AttributeError on
        # .keys() (and skip the loop for an empty dict).
        return
    for key in params.keys():
        if isinstance(key, int):
            # Positional parameters carry the example text itself, not
            # citation metadata.
            continue

        ref_key = EXAMPLE_TEMPLATE_KEY_MAPPING.get(key, key)
        if ref_key in reference.model_fields:
            setattr(reference, ref_key, clean_node(wxr, {}, params.get(key)))
        else:
            wxr.wtp.debug(
                f"Unknown key {key} in example template {params}",
                sortid="wiktextract/extractor/es/example/add_template_params_to_reference/73",
            )


def process_example_template(
    wxr: WiktextractContext,
    sense_data: Sense,
    template_node: WikiNode,
    reference: Reference,
):
    """Build an Example from an {{ejemplo}} / {{ejemplo_y_trad}} template.

    The first positional parameter holds the example text (with any
    embedded URL split off into the example's own reference); named
    parameters are citation metadata and are merged into *reference*.
    Nothing is added when the text comes out empty.
    """
    params = template_node.template_parameters

    # Separate the URL node from the text before cleaning.
    text, url = clean_text_and_url_from_text_nodes(wxr, params.get(1))

    if not text:
        return

    example = Example(text=text)
    if url:
        example.ref = Reference(url=url)

    # {{ejemplo_y_trad}} carries a translation in its second parameter.
    if template_node.template_name == "ejemplo_y_trad":
        example.translation = clean_node(wxr, {}, params.get(2))

    add_template_params_to_reference(wxr, params, reference)

    sense_data.examples.append(example)


def extract_example(
    wxr: WiktextractContext,
    sense_data: Sense,
    nodes: WikiNodeChildrenList,
):
    """Extract examples for *sense_data* from a flat node group.

    Recognized example templates are processed directly; a bare URL node
    is treated as the reference URL for the group. If no template
    produced an example, any leftover nodes are cleaned into a single
    plain-text example; otherwise leftovers are reported via debug.
    A non-empty *reference* is attached to the last example found.
    """
    rest: WikiNodeChildrenList = []
    reference = Reference()

    for node in nodes:
        if not isinstance(node, WikiNode):
            rest.append(node)
        elif (
            node.kind == NodeKind.TEMPLATE
            and node.template_name in ["ejemplo", "ejemplo_y_trad"]
        ):
            process_example_template(wxr, sense_data, node, reference)
        elif node.kind == NodeKind.URL:
            reference.url = clean_node(wxr, {}, node)
        else:
            rest.append(node)

    if rest:
        if not sense_data.examples:
            # No template-based example: fall back to the raw text.
            sense_data.examples.append(Example(text=clean_node(wxr, {}, rest)))
        else:
            wxr.wtp.debug(
                f"Unprocessed nodes from example group: {rest}",
                sortid="extractor/es/example/extract_example/87",
            )

    # Attach reference data only if any field was actually set.
    if sense_data.examples and reference.model_dump(exclude_defaults=True):
        sense_data.examples[-1].ref = reference


def process_example_list(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
):
    """Extract examples from a list item and its nested sub-lists.

    Each sub list item is split into text/URL nodes (the example text)
    and template nodes ({{cita libro}} citation metadata). If no sub
    list item yields an example, the text of *list_item* itself is used
    as a fallback, with a leading "Ejemplo(s):" label stripped.
    """
    for sub_list_item in list_item.find_child_recursively(NodeKind.LIST_ITEM):
        # Partition children: templates carry citation data, the rest is
        # the example text (possibly with an embedded URL).
        text_nodes: WikiNodeChildrenList = []
        template_nodes: list[WikiNode] = []
        for child in sub_list_item.children:
            if isinstance(child, WikiNode) and child.kind == NodeKind.TEMPLATE:
                template_nodes.append(child)
            else:
                text_nodes.append(child)

        text, url = clean_text_and_url_from_text_nodes(wxr, text_nodes)

        if not text:
            continue

        example = Example(text=text)
        if url:
            example.ref = Reference(url=url)

        for template_node in template_nodes:
            reference = Reference()
            if template_node.template_name == "cita libro":
                add_template_params_to_reference(
                    wxr, template_node.template_parameters, reference
                )
            # Attach only if the template filled any field; a later
            # template with data overwrites an earlier reference.
            if reference.model_dump(exclude_defaults=True):
                example.ref = reference

        sense_data.examples.append(example)

    # If no example was found in sublists, assume example is in list_item.children directly.
    # NOTE(review): this checks all examples already on the sense, not
    # only ones added by this call — confirm that is intended.
    if not sense_data.examples:
        text, url = clean_text_and_url_from_text_nodes(wxr, list_item.children)

        # Strip a leading "Ejemplo:"/"Ejemplos:" label from the text.
        text = re.sub(r"^(Ejemplos?:?)", "", text).strip()

        if not text:
            return
        example = Example(text=text)
        if url:
            example.ref = Reference(url=url)

        sense_data.examples.append(example)
27 changes: 19 additions & 8 deletions src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import re
from typing import List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.extractor.es.sense_data import process_sense_data_list
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_gloss(
wxr: WiktextractContext,
page_data: List[WordEntry],
page_data: list[WordEntry],
list_node: WikiNode,
) -> None:
for list_item in list_node.find_child(NodeKind.LIST_ITEM):
Expand All @@ -20,6 +20,9 @@ def extract_gloss(
definition: WikiNodeChildrenList = []
other: WikiNodeChildrenList = []

if not list_item.definition:
continue

for node in list_item.definition:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
other.append(node)
Expand Down Expand Up @@ -53,10 +56,18 @@ def extract_gloss(
if tag:
gloss_data.tags.append(tag)

if other:
wxr.wtp.debug(
f"Found nodes that are not part of definition: {other}",
sortid="extractor/es/gloss/extract_gloss/46",
)

page_data[-1].senses.append(gloss_data)

if other:
for node in other:
if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
process_sense_data_list(
wxr,
page_data[-1].senses[-1],
node,
)
else:
wxr.wtp.debug(
f"Found nodes that are not part of definition: {node}",
sortid="extractor/es/gloss/extract_gloss/46",
)
20 changes: 20 additions & 0 deletions src/wiktextract/extractor/es/linkage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import WordEntry
from wiktextract.wxr_context import WiktextractContext


def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: WikiNodeChildrenList,
):
    """Extract linkage data (synonyms, antonyms, ...) from *nodes*.

    TODO: not implemented yet — currently a no-op placeholder so that
    callers can already dispatch linkage sections here.
    """
    pass


def process_linkage_list_children(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: WikiNodeChildrenList,
):
    """Process the children of a linkage list node.

    TODO: not implemented yet — currently a no-op placeholder.
    """
    pass
Loading

0 comments on commit 805e5e9

Please sign in to comment.