-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add pydantic models and parse_page for Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
- Loading branch information
Showing
4 changed files
with
298 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
{ | ||
"$defs": { | ||
"Sense": { | ||
"properties": { | ||
"categories": { | ||
"default": [], | ||
"description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Categories", | ||
"type": "array" | ||
}, | ||
"glosses": { | ||
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Glosses", | ||
"type": "array" | ||
}, | ||
"subsenses": { | ||
"default": [], | ||
"description": "List of subsenses", | ||
"items": { | ||
"$ref": "#/$defs/Sense" | ||
}, | ||
"title": "Subsenses", | ||
"type": "array" | ||
}, | ||
"tags": { | ||
"default": [], | ||
"description": "list of qualifier and tag strings for the word sense", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Tags", | ||
"type": "array" | ||
} | ||
}, | ||
"required": [ | ||
"glosses" | ||
], | ||
"title": "Sense", | ||
"type": "object" | ||
} | ||
}, | ||
"$id": "https://kaikki.org/es.json", | ||
"$schema": "https://json-schema.org/draft/2020-12/schema", | ||
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.", | ||
"properties": { | ||
"categories": { | ||
"default": [], | ||
"description": "list of non-disambiguated categories for the word", | ||
"items": { | ||
"type": "string" | ||
}, | ||
"title": "Categories", | ||
"type": "array" | ||
}, | ||
"lang_code": { | ||
"description": "Wiktionary language code", | ||
"examples": [ | ||
"es" | ||
], | ||
"title": "Lang Code", | ||
"type": "string" | ||
}, | ||
"lang_name": { | ||
"description": "Localized language name of the word", | ||
"examples": [ | ||
"español" | ||
], | ||
"title": "Lang Name", | ||
"type": "string" | ||
}, | ||
"pos": { | ||
"default": null, | ||
"description": "Part of speech type", | ||
"title": "Pos", | ||
"type": "string" | ||
}, | ||
"senses": { | ||
"anyOf": [ | ||
{ | ||
"items": { | ||
"$ref": "#/$defs/Sense" | ||
}, | ||
"type": "array" | ||
}, | ||
{ | ||
"type": "null" | ||
} | ||
], | ||
"default": [], | ||
"title": "Senses" | ||
}, | ||
"word": { | ||
"description": "word string", | ||
"title": "Word", | ||
"type": "string" | ||
} | ||
}, | ||
"required": [ | ||
"word", | ||
"lang_code", | ||
"lang_name" | ||
], | ||
"title": "Spanish Wiktionary", | ||
"type": "object" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
from typing import Optional | ||
import json | ||
|
||
import logging | ||
|
||
from pydantic import BaseModel, Field, model_validator | ||
from pydantic.json_schema import GenerateJsonSchema | ||
|
||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class PydanticLogger:
    """Routes pydantic debug messages into the wiktextract logging machinery.

    When a ``WiktextractContext`` has been attached to :attr:`wxr`, messages
    are sent through its wtp debug channel; otherwise they fall back to the
    stdlib ``logging`` module.
    """

    wxr: Optional[WiktextractContext] = None

    @classmethod
    def debug(
        cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted"
    ):
        # Fall back to plain logging when no context has been attached yet.
        if cls.wxr is None:
            logging.debug(msg)
        else:
            cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid)
|
||
|
||
class BaseModelWrap(BaseModel):
    """Base model for all extractor models.

    Unknown input fields are ignored, and assignments to attributes are
    re-validated.
    """

    class Config:
        extra = "ignore"
        validate_assignment = True

    def update(self, data: dict) -> "BaseModelWrap":
        """Merge ``data`` into this model in place and return ``self``.

        The current non-default, non-None field values are merged with
        ``data`` (``data`` wins on conflicts), the merged dict is
        re-validated as a whole, and the validated values are written back
        onto this instance.
        """
        # Use the pydantic v2 API: .dict() and .validate() are deprecated
        # v1 aliases, while the rest of this module already relies on v2
        # features (model_validator, model_json_schema).
        merged = self.model_dump(exclude_defaults=True, exclude_none=True)
        merged.update(data)
        validated = self.model_validate(merged)
        for field_name, value in validated.model_dump(
            exclude_defaults=True, exclude_none=True
        ).items():
            setattr(self, field_name, value)
        return self
|
||
|
||
class LoggingExtraFieldsModel(BaseModelWrap):
    """Base model that logs unknown input fields instead of dropping them silently."""

    @model_validator(mode="before")
    def log_extra_fields(cls, values):
        """Report input keys that match no declared model field.

        Runs before validation and returns ``values`` unchanged, so the
        unknown keys are still discarded afterwards by ``extra = "ignore"``.
        """
        # model_fields is the pydantic v2 replacement for the deprecated
        # __fields__ attribute.
        allowed_field_names = set(cls.model_fields)
        extra_fields = {
            name: str(value)
            for name, value in values.items()
            if name not in allowed_field_names
        }
        if extra_fields:
            class_full_name = cls.__name__
            PydanticLogger.debug(
                msg=f"Pydantic - Got extra fields in {class_full_name}: {extra_fields}",
                sortid="wiktextract/extractor/es/pydantic/extra_fields/33",
            )
        return values
|
||
|
||
class Sense(LoggingExtraFieldsModel):
    """A single word sense: its glosses plus associated tags and categories."""

    glosses: list[str] = Field(
        description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging."
    )
    # Fix: the previous description here was copy-pasted from "glosses".
    tags: list[str] = Field(
        default=[],
        description="list of qualifier and tag strings for the word sense",
    )
    categories: list[str] = Field(
        default=[],
        description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
    )
    # TODO: examples: list[SenseExample] = []
    subsenses: list["Sense"] = Field(
        default=[], description="List of subsenses"
    )
|
||
|
||
class WordEntry(LoggingExtraFieldsModel):
    """WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""

    word: str = Field(description="word string")
    # Fix: pos defaults to None, so annotate it Optional[str] instead of the
    # previous plain str (inconsistent under pydantic v2's strict typing).
    pos: Optional[str] = Field(default=None, description="Part of speech type")
    lang_code: str = Field(
        description="Wiktionary language code", examples=["es"]
    )
    lang_name: str = Field(
        description="Localized language name of the word", examples=["español"]
    )
    senses: Optional[list[Sense]] = []
    categories: list[str] = Field(
        default=[],
        description="list of non-disambiguated categories for the word",
    )
|
||
|
||
if __name__ == "__main__":
    # Regenerate the published JSON schema for the Spanish extractor.

    class SpanishSchemaGenerator(GenerateJsonSchema):
        """Injects the Wiktionary-specific metadata into the schema."""

        def generate(self, schema, mode="validation"):
            json_schema = super().generate(schema, mode=mode)
            json_schema["title"] = "Spanish Wiktionary"
            json_schema["$id"] = "https://kaikki.org/es.json"
            json_schema["$schema"] = self.schema_dialect
            return json_schema

    word_entry_schema = WordEntry.model_json_schema(
        schema_generator=SpanishSchemaGenerator
    )
    with open("json_schema/es.json", "w") as f:
        json.dump(
            word_entry_schema,
            f,
            indent=2,
            ensure_ascii=False,
            sort_keys=True,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import copy | ||
import logging | ||
from collections import defaultdict | ||
from typing import Dict, List | ||
|
||
from wikitextprocessor import NodeKind, WikiNode | ||
from wiktextract.extractor.es.models import WordEntry, PydanticLogger | ||
|
||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
# Templates that are used to form panels on pages and that | ||
# should be ignored in various positions | ||
# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES: set = set()

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES: set = set()

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES: set = set()
|
||
|
||
def parse_section(
    wxr: WiktextractContext,
    page_data: List[Dict],
    base_data: Dict,
    level_node: WikiNode,
) -> None:
    """Parse one level-3 section of a word page.

    Placeholder: not implemented yet.  ``parse_page`` already calls it for
    every level-3 node found under a language (level-2) heading.
    """
    pass
|
||
|
||
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> List[Dict[str, str]]:
    """Parse a Spanish Wiktionary page into a list of word entries.

    For each level-2 section whose heading uses the "lengua" language
    template, a ``WordEntry`` is created with the language code and name,
    and every level-3 subsection is delegated to ``parse_section``.
    Returns the entries serialized as dicts with default-valued fields
    omitted.
    """
    if wxr.config.verbose:
        logging.info(f"Parsing page: {page_title}")
    # Pass the current WiktextractContext to pydantic for better logging
    PydanticLogger.wxr = wxr

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        page_text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
    )

    page_data: List[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://es.wiktionary.org/wiki/Plantilla:lengua
            # https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
            if subtitle_template.template_name == "lengua":
                categories_and_links = defaultdict(list)
                lang_code = subtitle_template.template_parameters.get(1)
                # clean_node apparently also fills categories_and_links as a
                # side effect while rendering the heading — TODO confirm.
                lang_name = clean_node(
                    wxr, categories_and_links, subtitle_template
                )
                wxr.wtp.start_section(lang_name)
                base_data = WordEntry(
                    lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
                )
                base_data.update(categories_and_links)
                # Deep copy, presumably so that later mutations of base_data
                # by section parsing do not leak into the appended entry —
                # TODO confirm once parse_section is implemented.
                page_data.append(copy.deepcopy(base_data))
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    parse_section(wxr, page_data, base_data, level3_node)

    return [d.model_dump(exclude_defaults=True) for d in page_data]