Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pydantic models and parse_page for Spanish Wiktionary #392

Merged
merged 7 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions json_schema/es.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
{
"$defs": {
"Sense": {
"properties": {
"categories": {
"default": [],
"description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"glosses": {
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
"items": {
"type": "string"
},
"title": "Glosses",
"type": "array"
},
"senseid": {
"anyOf": [
{
"type": "integer"
},
{
"type": "null"
}
],
"default": null,
"description": "Sense number used in Wiktionary",
"title": "Senseid"
},
"subsenses": {
"default": [],
"description": "List of subsenses",
"items": {
"$ref": "#/$defs/Sense"
},
"title": "Subsenses",
"type": "array"
},
"tags": {
"default": [],
"description": "list of tag strings for the word sense",
"items": {
"type": "string"
},
"title": "Tags",
"type": "array"
}
},
"required": [
"glosses"
],
"title": "Sense",
"type": "object"
}
},
"$id": "https://kaikki.org/es.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
"properties": {
"categories": {
"default": [],
"description": "list of non-disambiguated categories for the word",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"lang_code": {
"description": "Wiktionary language code",
"examples": [
"es"
],
"title": "Lang Code",
"type": "string"
},
"lang_name": {
"description": "Localized language name of the word",
"examples": [
"español"
],
"title": "Lang Name",
"type": "string"
},
"pos": {
"default": null,
"description": "Part of speech type",
"title": "Pos",
"type": "string"
},
"pos_title": {
"default": null,
"description": "Original POS title",
"title": "Pos Title",
"type": "string"
},
"senses": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Sense"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"title": "Senses"
},
"word": {
"description": "word string",
"title": "Word",
"type": "string"
}
},
"required": [
"word",
"lang_code",
"lang_name"
],
"title": "Spanish Wiktionary",
"type": "object"
}
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ dependencies = [
"importlib_resources; python_version < '3.10'",
"levenshtein",
"nltk",
"pydantic",
"wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git",
]

Expand Down
5 changes: 5 additions & 0 deletions src/wiktextract/data/es/other_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
{
"etymology": ["Etimología"],
"pronunciation": ["pronunciación"],
"ignored_sections": ["Véase también"]
}
90 changes: 90 additions & 0 deletions src/wiktextract/data/es/pos_subtitles.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"abreviatura": { "pos": "abbrev" },
"acrónimo": { "pos": "abbrev" },
"adjetivo": { "pos": "adj" },
"adjetivo cardinal": { "pos": "num" },
"adjetivo demostrativo": { "pos": "adj" },
"adjetivo indefinido": { "pos": "adj" },
"adjetivo indeterminado": { "pos": "adj" },
"adjetivo interrogativo": { "pos": "adj" },
"adjetivo numeral": { "pos": "num" },
"adjetivo ordinal": { "pos": "num" },
"adjetivo posesivo": { "pos": "adj" },
"adjetivo relativo": { "pos": "adj" },
"adverbio": { "pos": "adv" },
"adverbio comparativo": { "pos": "adv" },
"adverbio de afirmación": { "pos": "adv" },
"adverbio de cantidad": { "pos": "adv" },
"adverbio de duda": { "pos": "adv" },
"adverbio de lugar": { "pos": "adv" },
"adverbio de modo": { "pos": "adv" },
"adverbio de negación": { "pos": "adv" },
"adverbio de orden": { "pos": "adv" },
"adverbio de tiempo": { "pos": "adv" },
"adverbio demostrativo": { "pos": "adv" },
"adverbio interrogativo": { "pos": "adv" },
"adverbio relativo": { "pos": "adv" },
"afijo": { "pos": "affix" },
"artículo": { "pos": "article" },
"artículo determinado": { "pos": "article" },
"artículo indeterminado": { "pos": "article" },
"circunfijo": { "pos": "circumfix" },
"conjunción": { "pos": "conj" },
"conjunción adversativa": { "pos": "conj" },
"conjunción ilativa": { "pos": "conj" },
"dígrafo": { "pos": "character" },
"expresión": { "pos": "phrase" },
"forma verbal": { "pos": "verb" },
"interjección": { "pos": "intj" },
"letra": { "pos": "character" },
"locución": { "pos": "phrase" },
"locución adjetiva": { "pos": "phrase" },
"locución adverbial": { "pos": "phrase" },
"locución conjuntiva": { "pos": "phrase" },
"locución interjectiva": { "pos": "phrase" },
"locución prepositiva": { "pos": "phrase" },
"locución pronominal": { "pos": "phrase" },
"locución sustantiva": { "pos": "phrase" },
"locución verbal": { "pos": "phrase" },
"onomatopeya": { "pos": "noun" },
"partícula": { "pos": "particle" },
"postposición": { "pos": "postp" },
"prefijo": { "pos": "prefix" },
"preposición": { "pos": "prep" },
"preposición de ablativo": { "pos": "prep" },
"preposición de acusativo": { "pos": "prep" },
"preposición de acusativo o ablativo": { "pos": "prep" },
"preposición de genitivo": { "pos": "prep" },
"pronombre": { "pos": "pron" },
"pronombre demostrativo": { "pos": "pron" },
"pronombre indefinido": { "pos": "pron" },
"pronombre interrogativo": { "pos": "pron" },
"pronombre personal": { "pos": "pron" },
"pronombre posesivo": { "pos": "det" },
"pronombre relativo": { "pos": "pron" },
"refrán": { "pos": "proverb" },
"sigla": { "pos": "abbrev" },
"sufijo": { "pos": "suffix" },
"sufijo flexivo": { "pos": "suffix" },
"sustantivo": { "pos": "noun" },
"sustantivo ambiguo": { "pos": "noun" },
"sustantivo animado": { "pos": "noun" },
"sustantivo común": { "pos": "noun" },
"sustantivo femenino": { "pos": "noun" },
"sustantivo femenino y masculino": { "pos": "noun" },
"sustantivo inanimado": { "pos": "noun" },
"sustantivo masculino": { "pos": "noun" },
"sustantivo neutro": { "pos": "noun" },
"sustantivo neutro y masculino": { "pos": "noun" },
"sustantivo propio": { "pos": "name" },
"sustantivo propio/pruebas": { "pos": "name" },
"símbolo": { "pos": "symbol" },
"verbo": { "pos": "verb" },
"verbo auxiliar": { "pos": "verb" },
"verbo impersonal": { "pos": "verb" },
"verbo intransitivo": { "pos": "verb" },
"verbo modal": { "pos": "verb" },
"verbo perfectivo": { "pos": "verb" },
"verbo pronominal": { "pos": "verb" },
"verbo transitivo": { "pos": "verb" }
}
3 changes: 2 additions & 1 deletion src/wiktextract/datautils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def data_extend(data: Dict, key: str, values: Iterable) -> None:
data_append(data, key, x)


def split_at_comma_semi(text: str, separators=(",", ";", ",", "،"), extra=()
def split_at_comma_semi(
text: str, separators=(",", ";", ",", "،"), extra=()
) -> List[str]:
"""Splits the text at commas and semicolons, unless they are inside
parenthesis. ``separators`` is default separators (setting it eliminates
Expand Down
62 changes: 62 additions & 0 deletions src/wiktextract/extractor/es/gloss.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import re
from typing import List

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import WikiNodeChildrenList

from wiktextract.extractor.es.models import Sense, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext


def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    """Build a ``Sense`` for every list item and add it to the last entry.

    For each ``LIST_ITEM`` child of ``list_node``:

    * the item's ``definition`` nodes (nested lists excluded) are cleaned
      into the gloss text;
    * the item's ``children`` are cleaned into a note; a leading number in
      that note becomes ``senseid`` and the rest is split into tags.

    Args:
        wxr: Extraction context, passed to ``clean_node`` and used for
            debug logging.
        page_data: Word entries collected so far; each new sense is
            appended to ``page_data[-1].senses``.
        list_node: List node whose items are turned into senses.
    """
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # Guard against an item without a definition part (assumed to be
        # None in that case -- TODO confirm against wikitextprocessor).
        for node in list_item.definition or []:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Nested lists are not part of the gloss text itself.
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"^(\d+)", gloss_note)

        if match:
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[match.end() :].strip()
        else:
            # Keep ``gloss_data.tags`` a list; only the local string is
            # needed here, the individual tags are appended below.
            tag_string = gloss_note.strip()

        # Split on commas and on the standalone conjunction "y". The word
        # boundaries keep a "y" inside a word from acting as a separator.
        for raw_tag in re.split(r",|\by\b", tag_string):
            # "Main" is presumably residue of a template expansion --
            # TODO confirm where it originates.
            tag = (
                raw_tag.strip()
                .removesuffix(".")
                .removesuffix("Main")
                .removeprefix("Main")
            )
            if tag:
                gloss_data.tags.append(tag)

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
Loading