-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #392 from empiriker/es
Add pydantic models and parse_page for Spanish Wiktionary
- Loading branch information
Showing
10 changed files
with
643 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
{ | ||
"$defs": { | ||
"Sense": { | ||
"properties": { | ||
"categories": { | ||
"default": [], | ||
"description": "list of sense-disambiguated category names extracted from (a subset of) the Category links on the page", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Categories", | ||
"type": "array" | ||
}, | ||
"glosses": { | ||
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Glosses", | ||
"type": "array" | ||
}, | ||
"senseid": { | ||
"anyOf": [ | ||
{ | ||
"type": "integer" | ||
}, | ||
{ | ||
"type": "null" | ||
} | ||
], | ||
"default": null, | ||
"description": "Sense number used in Wiktionary", | ||
"title": "Senseid" | ||
}, | ||
"subsenses": { | ||
"default": [], | ||
"description": "List of subsenses", | ||
"items": { | ||
"$ref": "#/$defs/Sense" | ||
}, | ||
"title": "Subsenses", | ||
"type": "array" | ||
}, | ||
"tags": { | ||
"default": [], | ||
"description": "list of tag strings describing the word sense (e.g. usage or topic labels), taken from the note that accompanies the sense number.", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Tags", | ||
"type": "array" | ||
} | ||
}, | ||
"required": [ | ||
"glosses" | ||
], | ||
"title": "Sense", | ||
"type": "object" | ||
} | ||
}, | ||
"$id": "https://kaikki.org/es.json", | ||
"$schema": "https://json-schema.org/draft/2020-12/schema", | ||
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", | ||
"properties": { | ||
"categories": { | ||
"default": [], | ||
"description": "list of non-disambiguated categories for the word", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Categories", | ||
"type": "array" | ||
}, | ||
"lang_code": { | ||
"description": "Wiktionary language code", | ||
"examples": [ | ||
"es" | ||
], | ||
"title": "Lang Code", | ||
"type": "string" | ||
}, | ||
"lang_name": { | ||
"description": "Localized language name of the word", | ||
"examples": [ | ||
"español" | ||
], | ||
"title": "Lang Name", | ||
"type": "string" | ||
}, | ||
"pos": { | ||
"default": null, | ||
"description": "Part of speech type", | ||
"title": "Pos", | ||
"type": "string" | ||
}, | ||
"pos_title": { | ||
"default": null, | ||
"description": "Original POS title", | ||
"title": "Pos Title", | ||
"type": "string" | ||
}, | ||
"senses": { | ||
"anyOf": [ | ||
{ | ||
"items": { | ||
"$ref": "#/$defs/Sense" | ||
}, | ||
"type": "array" | ||
}, | ||
{ | ||
"type": "null" | ||
} | ||
], | ||
"default": [], | ||
"title": "Senses" | ||
}, | ||
"word": { | ||
"description": "word string", | ||
"title": "Word", | ||
"type": "string" | ||
} | ||
}, | ||
"required": [ | ||
"word", | ||
"lang_code", | ||
"lang_name" | ||
], | ||
"title": "Spanish Wiktionary", | ||
"type": "object" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"etymology": ["Etimología"], | ||
"pronunciation": ["pronunciación"], | ||
"ignored_sections": ["Véase también"] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
{ | ||
"abreviatura": { "pos": "abbrev" }, | ||
"acrónimo": { "pos": "abbrev" }, | ||
"adjetivo": { "pos": "adj" }, | ||
"adjetivo cardinal": { "pos": "num" }, | ||
"adjetivo demostrativo": { "pos": "adj" }, | ||
"adjetivo indefinido": { "pos": "adj" }, | ||
"adjetivo indeterminado": { "pos": "adj" }, | ||
"adjetivo interrogativo": { "pos": "adj" }, | ||
"adjetivo numeral": { "pos": "num" }, | ||
"adjetivo ordinal": { "pos": "num" }, | ||
"adjetivo posesivo": { "pos": "adj" }, | ||
"adjetivo relativo": { "pos": "adj" }, | ||
"adverbio": { "pos": "adv" }, | ||
"adverbio comparativo": { "pos": "adv" }, | ||
"adverbio de afirmación": { "pos": "adv" }, | ||
"adverbio de cantidad": { "pos": "adv" }, | ||
"adverbio de duda": { "pos": "adv" }, | ||
"adverbio de lugar": { "pos": "adv" }, | ||
"adverbio de modo": { "pos": "adv" }, | ||
"adverbio de negación": { "pos": "adv" }, | ||
"adverbio de orden": { "pos": "adv" }, | ||
"adverbio de tiempo": { "pos": "adv" }, | ||
"adverbio demostrativo": { "pos": "adv" }, | ||
"adverbio interrogativo": { "pos": "adv" }, | ||
"adverbio relativo": { "pos": "adv" }, | ||
"afijo": { "pos": "affix" }, | ||
"artículo": { "pos": "article" }, | ||
"artículo determinado": { "pos": "article" }, | ||
"artículo indeterminado": { "pos": "article" }, | ||
"circunfijo": { "pos": "circumfix" }, | ||
"conjunción": { "pos": "conj" }, | ||
"conjunción adversativa": { "pos": "conj" }, | ||
"conjunción ilativa": { "pos": "conj" }, | ||
"dígrafo": { "pos": "character" }, | ||
"expresión": { "pos": "phrase" }, | ||
"forma verbal": { "pos": "verb" }, | ||
"interjección": { "pos": "intj" }, | ||
"letra": { "pos": "character" }, | ||
"locución": { "pos": "phrase" }, | ||
"locución adjetiva": { "pos": "phrase" }, | ||
"locución adverbial": { "pos": "phrase" }, | ||
"locución conjuntiva": { "pos": "phrase" }, | ||
"locución interjectiva": { "pos": "phrase" }, | ||
"locución prepositiva": { "pos": "phrase" }, | ||
"locución pronominal": { "pos": "phrase" }, | ||
"locución sustantiva": { "pos": "phrase" }, | ||
"locución verbal": { "pos": "phrase" }, | ||
"onomatopeya": { "pos": "noun" }, | ||
"partícula": { "pos": "particle" }, | ||
"postposición": { "pos": "postp" }, | ||
"prefijo": { "pos": "prefix" }, | ||
"preposición": { "pos": "prep" }, | ||
"preposición de ablativo": { "pos": "prep" }, | ||
"preposición de acusativo": { "pos": "prep" }, | ||
"preposición de acusativo o ablativo": { "pos": "prep" }, | ||
"preposición de genitivo": { "pos": "prep" }, | ||
"pronombre": { "pos": "pron" }, | ||
"pronombre demostrativo": { "pos": "pron" }, | ||
"pronombre indefinido": { "pos": "pron" }, | ||
"pronombre interrogativo": { "pos": "pron" }, | ||
"pronombre personal": { "pos": "pron" }, | ||
"pronombre posesivo": { "pos": "det" }, | ||
"pronombre relativo": { "pos": "pron" }, | ||
"refrán": { "pos": "proverb" }, | ||
"sigla": { "pos": "abbrev" }, | ||
"sufijo": { "pos": "suffix" }, | ||
"sufijo flexivo": { "pos": "suffix" }, | ||
"sustantivo": { "pos": "noun" }, | ||
"sustantivo ambiguo": { "pos": "noun" }, | ||
"sustantivo animado": { "pos": "noun" }, | ||
"sustantivo común": { "pos": "noun" }, | ||
"sustantivo femenino": { "pos": "noun" }, | ||
"sustantivo femenino y masculino": { "pos": "noun" }, | ||
"sustantivo inanimado": { "pos": "noun" }, | ||
"sustantivo masculino": { "pos": "noun" }, | ||
"sustantivo neutro": { "pos": "noun" }, | ||
"sustantivo neutro y masculino": { "pos": "noun" }, | ||
"sustantivo propio": { "pos": "name" }, | ||
"sustantivo propio/pruebas": { "pos": "name" }, | ||
"símbolo": { "pos": "symbol" }, | ||
"verbo": { "pos": "verb" }, | ||
"verbo auxiliar": { "pos": "verb" }, | ||
"verbo impersonal": { "pos": "verb" }, | ||
"verbo intransitivo": { "pos": "verb" }, | ||
"verbo modal": { "pos": "verb" }, | ||
"verbo perfectivo": { "pos": "verb" }, | ||
"verbo pronominal": { "pos": "verb" }, | ||
"verbo transitivo": { "pos": "verb" } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import re | ||
from typing import List | ||
|
||
from wikitextprocessor import NodeKind, WikiNode | ||
from wikitextprocessor.parser import WikiNodeChildrenList | ||
|
||
from wiktextract.extractor.es.models import Sense, WordEntry | ||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
def extract_gloss(
    wxr: WiktextractContext,
    page_data: List[WordEntry],
    list_node: WikiNode,
) -> None:
    """Extract one ``Sense`` per list item and append it to the last entry.

    For every ``LIST_ITEM`` child of ``list_node``:

    * the item's ``definition`` nodes (minus nested lists) are cleaned into
      the gloss text;
    * the item's ``children`` are cleaned into a "gloss note"; a leading
      number in that note becomes ``senseid`` and the remaining text is
      split into tags.

    Args:
        wxr: Extraction context, passed to ``clean_node`` and used for
            debug logging.
        page_data: Word entries collected so far; the new senses are
            appended to ``page_data[-1].senses``.
        list_node: The list node whose items hold the glosses.
    """
    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_data = Sense(glosses=[])

        definition: WikiNodeChildrenList = []
        other: WikiNodeChildrenList = []

        # Guard against items without a definition part so that iterating
        # a missing (None) definition cannot raise.
        for node in list_item.definition or []:
            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Nested lists are not part of the gloss text itself.
                other.append(node)
            else:
                definition.append(node)

        gloss = clean_node(wxr, gloss_data, definition)
        gloss_data.glosses.append(gloss)

        gloss_note = clean_node(wxr, gloss_data, list_item.children)

        match = re.match(r"^(\d+)", gloss_note)
        if match:
            gloss_data.senseid = int(match.group(1))
            tag_string = gloss_note[match.end() :].strip()
        else:
            # Keep ``gloss_data.tags`` a list: assigning the raw string here
            # would make the ``append`` calls below fail.
            tag_string = gloss_note.strip()

        # Split tags on commas or on the standalone conjunction "y"; a bare
        # "y" pattern would also cut words that merely contain that letter.
        for raw_tag in re.split(r",|\s+y\s+", tag_string):
            tag = (
                raw_tag.strip()
                .removesuffix(".")
                .removesuffix("Main")
                .removeprefix("Main")
            )
            if tag:
                gloss_data.tags.append(tag)

        if other:
            wxr.wtp.debug(
                f"Found nodes that are not part of definition: {other}",
                sortid="extractor/es/gloss/extract_gloss/46",
            )

        page_data[-1].senses.append(gloss_data)
Oops, something went wrong.