Skip to content

Commit

Permalink
Merge pull request #392 from empiriker/es
Browse files Browse the repository at this point in the history
Add pydantic models and parse_page for Spanish Wiktionary
  • Loading branch information
xxyzz authored Dec 1, 2023
2 parents 295f910 + 88a79b2 commit be8a46c
Show file tree
Hide file tree
Showing 10 changed files with 643 additions and 1 deletion.
130 changes: 130 additions & 0 deletions json_schema/es.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"$defs": {
"Sense": {
"properties": {
"categories": {
"default": [],
"description": "list of sense-disambiguated category names extracted from (a subset of) the Category links on the page",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"glosses": {
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
"items": {
"type": "string"
},
"title": "Glosses",
"type": "array"
},
"senseid": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Sense number used in Wiktionary",
"title": "Senseid"
},
"subsenses": {
"default": [],
"description": "List of subsenses",
"items": {
"$ref": "#/$defs/Sense"
},
"title": "Subsenses",
"type": "array"
},
"tags": {
"default": [],
"description": "list of tags associated with the word sense",
"items": {
"type": "string"
},
"title": "Tags",
"type": "array"
}
},
"required": [
"glosses"
],
"title": "Sense",
"type": "object"
}
},
"$id": "https://kaikki.org/es.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
"properties": {
"categories": {
"default": [],
"description": "list of non-disambiguated categories for the word",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"lang_code": {
"description": "Wiktionary language code",
"examples": [
"es"
],
"title": "Lang Code",
"type": "string"
},
"lang_name": {
"description": "Localized language name of the word",
"examples": [
"español"
],
"title": "Lang Name",
"type": "string"
},
"pos": {
"default": null,
"description": "Part of speech type",
"title": "Pos",
"type": "string"
},
"pos_title": {
"default": null,
"description": "Original POS title",
"title": "Pos Title",
"type": "string"
},
"senses": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Sense"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"title": "Senses"
},
"word": {
"description": "word string",
"title": "Word",
"type": "string"
}
},
"required": [
"word",
"lang_code",
"lang_name"
],
"title": "Spanish Wiktionary",
"type": "object"
}
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies = [
"importlib_resources; python_version < '3.10'",
"levenshtein",
"nltk",
"pydantic",
"wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git",
]

Expand Down
5 changes: 5 additions & 0 deletions src/wiktextract/data/es/other_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"etymology": ["Etimología"],
"pronunciation": ["pronunciación"],
"ignored_sections": ["Véase también"]
}
90 changes: 90 additions & 0 deletions src/wiktextract/data/es/pos_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"abreviatura": { "pos": "abbrev" },
"acrónimo": { "pos": "abbrev" },
"adjetivo": { "pos": "adj" },
"adjetivo cardinal": { "pos": "num" },
"adjetivo demostrativo": { "pos": "adj" },
"adjetivo indefinido": { "pos": "adj" },
"adjetivo indeterminado": { "pos": "adj" },
"adjetivo interrogativo": { "pos": "adj" },
"adjetivo numeral": { "pos": "num" },
"adjetivo ordinal": { "pos": "num" },
"adjetivo posesivo": { "pos": "adj" },
"adjetivo relativo": { "pos": "adj" },
"adverbio": { "pos": "adv" },
"adverbio comparativo": { "pos": "adv" },
"adverbio de afirmación": { "pos": "adv" },
"adverbio de cantidad": { "pos": "adv" },
"adverbio de duda": { "pos": "adv" },
"adverbio de lugar": { "pos": "adv" },
"adverbio de modo": { "pos": "adv" },
"adverbio de negación": { "pos": "adv" },
"adverbio de orden": { "pos": "adv" },
"adverbio de tiempo": { "pos": "adv" },
"adverbio demostrativo": { "pos": "adv" },
"adverbio interrogativo": { "pos": "adv" },
"adverbio relativo": { "pos": "adv" },
"afijo": { "pos": "affix" },
"artículo": { "pos": "article" },
"artículo determinado": { "pos": "article" },
"artículo indeterminado": { "pos": "article" },
"circunfijo": { "pos": "circumfix" },
"conjunción": { "pos": "conj" },
"conjunción adversativa": { "pos": "conj" },
"conjunción ilativa": { "pos": "conj" },
"dígrafo": { "pos": "character" },
"expresión": { "pos": "phrase" },
"forma verbal": { "pos": "verb" },
"interjección": { "pos": "intj" },
"letra": { "pos": "character" },
"locución": { "pos": "phrase" },
"locución adjetiva": { "pos": "phrase" },
"locución adverbial": { "pos": "phrase" },
"locución conjuntiva": { "pos": "phrase" },
"locución interjectiva": { "pos": "phrase" },
"locución prepositiva": { "pos": "phrase" },
"locución pronominal": { "pos": "phrase" },
"locución sustantiva": { "pos": "phrase" },
"locución verbal": { "pos": "phrase" },
"onomatopeya": { "pos": "noun" },
"partícula": { "pos": "particle" },
"postposición": { "pos": "postp" },
"prefijo": { "pos": "prefix" },
"preposición": { "pos": "prep" },
"preposición de ablativo": { "pos": "prep" },
"preposición de acusativo": { "pos": "prep" },
"preposición de acusativo o ablativo": { "pos": "prep" },
"preposición de genitivo": { "pos": "prep" },
"pronombre": { "pos": "pron" },
"pronombre demostrativo": { "pos": "pron" },
"pronombre indefinido": { "pos": "pron" },
"pronombre interrogativo": { "pos": "pron" },
"pronombre personal": { "pos": "pron" },
"pronombre posesivo": { "pos": "det" },
"pronombre relativo": { "pos": "pron" },
"refrán": { "pos": "proverb" },
"sigla": { "pos": "abbrev" },
"sufijo": { "pos": "suffix" },
"sufijo flexivo": { "pos": "suffix" },
"sustantivo": { "pos": "noun" },
"sustantivo ambiguo": { "pos": "noun" },
"sustantivo animado": { "pos": "noun" },
"sustantivo común": { "pos": "noun" },
"sustantivo femenino": { "pos": "noun" },
"sustantivo femenino y masculino": { "pos": "noun" },
"sustantivo inanimado": { "pos": "noun" },
"sustantivo masculino": { "pos": "noun" },
"sustantivo neutro": { "pos": "noun" },
"sustantivo neutro y masculino": { "pos": "noun" },
"sustantivo propio": { "pos": "name" },
"sustantivo propio/pruebas": { "pos": "name" },
"símbolo": { "pos": "symbol" },
"verbo": { "pos": "verb" },
"verbo auxiliar": { "pos": "verb" },
"verbo impersonal": { "pos": "verb" },
"verbo intransitivo": { "pos": "verb" },
"verbo modal": { "pos": "verb" },
"verbo perfectivo": { "pos": "verb" },
"verbo pronominal": { "pos": "verb" },
"verbo transitivo": { "pos": "verb" }
}
3 changes: 2 additions & 1 deletion src/wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def data_extend(data: Dict, key: str, values: Iterable) -> None:
data_append(data, key, x)


def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=()
def split_at_comma_semi(
text: str, separators=(",", ";", ",", "،"), extra=()
) -> List[str]:
"""Splits the text at commas and semicolons, unless they are inside
parenthesis. ``separators`` is default separators (setting it eliminates
Expand Down
62 changes: 62 additions & 0 deletions src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import re
from typing import List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    """Build one ``Sense`` per list item and append it to the last entry.

    For every ``LIST_ITEM`` child of ``list_node``:

    * the item's ``definition`` nodes, minus nested lists, are cleaned
      into the gloss text;
    * the item's ``children`` are cleaned into a note. Leading digits in
      the note, if any, become ``senseid``; the remaining text is split
      into tags at commas and at the standalone word "y".

    Nested lists found inside a definition are not processed; they are
    only reported through ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context, passed to ``clean_node`` and used for
            debug logging.
        page_data: Word entries collected so far. Each new sense is
            appended to ``page_data[-1].senses``, so the list must not be
            empty.
        list_node: Node whose list items hold the definitions.
    """
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # NOTE(review): ``definition`` is presumably unset (None) for items
        # without a definition part; iterate an empty list in that case
        # instead of raising -- confirm against wikitextprocessor.
        for node in list_item.definition or []:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"\d+", gloss_note)

        if match:
            gloss_data.senseid = int(match.group(0))
            tag_string = gloss_note[match.end():].strip()
        else:
            # Only the local string is set here. ``gloss_data.tags`` must
            # stay a list so that the ``append`` calls below keep working.
            tag_string = gloss_note.strip()

        # Split tags at commas or at the standalone conjunction "y". The
        # word boundaries keep a "y" inside a word from splitting the tag.
        for tag in re.split(r",|\by\b", tag_string):
            # NOTE(review): "Main" looks like residue left in the cleaned
            # text by template expansion -- confirm.
            tag = (
                tag.strip()
                .removesuffix(".")
                .removesuffix("Main")
                .removeprefix("Main")
            )
            if tag:
                gloss_data.tags.append(tag)

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
Loading

0 comments on commit be8a46c

Please sign in to comment.