Add pydantic models and parse_page for Spanish Wiktionary
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
empiriker committed Oct 31, 2023
1 parent a64e729 commit e110791
Showing 4 changed files with 298 additions and 0 deletions.
111 changes: 111 additions & 0 deletions json_schema/es.json
@@ -0,0 +1,111 @@
{
"$defs": {
"Sense": {
"properties": {
"categories": {
"default": [],
"description": "list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"glosses": {
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
"items": {
"type": "string"
},
"title": "Glosses",
"type": "array"
},
"subsenses": {
"default": [],
"description": "List of subsenses",
"items": {
"$ref": "#/$defs/Sense"
},
"title": "Subsenses",
"type": "array"
},
"tags": {
"default": [],
"description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
"items": {
"type": "string"
},
"title": "Tags",
"type": "array"
}
},
"required": [
"glosses"
],
"title": "Sense",
"type": "object"
}
},
"$id": "https://kaikki.org/es.json",
"$schema": "https://json-schema.org/draft/2020-12/schema",
"description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
"properties": {
"categories": {
"default": [],
"description": "list of non-disambiguated categories for the word",
"items": {
"type": "string"
},
"title": "Categories",
"type": "array"
},
"lang_code": {
"description": "Wiktionary language code",
"examples": [
"es"
],
"title": "Lang Code",
"type": "string"
},
"lang_name": {
"description": "Localized language name of the word",
"examples": [
"español"
],
"title": "Lang Name",
"type": "string"
},
"pos": {
"default": null,
"description": "Part of speech type",
"title": "Pos",
"type": "string"
},
"senses": {
"anyOf": [
{
"items": {
"$ref": "#/$defs/Sense"
},
"type": "array"
},
{
"type": "null"
}
],
"default": [],
"title": "Senses"
},
"word": {
"description": "word string",
"title": "Word",
"type": "string"
}
},
"required": [
"word",
"lang_code",
"lang_name"
],
"title": "Spanish Wiktionary",
"type": "object"
}
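
For a quick sanity check of the generated file, one can validate a sample entry against it. This is a sketch, not part of the commit: it assumes the third-party jsonschema package, and the entry itself is hypothetical.

import json

from jsonschema import validate

with open("json_schema/es.json", encoding="utf-8") as f:
    schema = json.load(f)

# Hypothetical minimal entry; "word", "lang_code" and "lang_name" are
# the required properties.
entry = {
    "word": "perro",
    "lang_code": "es",
    "lang_name": "español",
    "senses": [
        {"glosses": ["Mamífero doméstico de la familia de los cánidos."]}
    ],
}

validate(instance=entry, schema=schema)  # raises ValidationError on failure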
1 change: 1 addition & 0 deletions pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
"importlib_resources; python_version < '3.10'",
"levenshtein",
"nltk",
"pydantic",
"wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git",
]

113 changes: 113 additions & 0 deletions src/wiktextract/extractor/es/models.py
@@ -0,0 +1,113 @@
import json
import logging
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field, model_validator
from pydantic.json_schema import GenerateJsonSchema

from wiktextract.wxr_context import WiktextractContext


class PydanticLogger:
wxr: Optional[WiktextractContext] = None

@classmethod
def debug(
cls, msg: str, trace: Optional[str] = None, sortid: str = "XYZunsorted"
):
if cls.wxr:
cls.wxr.wtp.debug(msg, trace=trace, sortid=sortid)
else:
logging.debug(msg)


class BaseModelWrap(BaseModel):
    model_config = ConfigDict(extra="ignore", validate_assignment=True)

    def update(self, data: dict):
        # Merge `data` into the fields that are currently set, re-validate
        # the merged dict, and write the validated values back.
        update = self.model_dump(exclude_defaults=True, exclude_none=True)
        update.update(data)
        for k, v in (
            self.model_validate(update)
            .model_dump(exclude_defaults=True, exclude_none=True)
            .items()
        ):
            setattr(self, k, v)
        return self


class LoggingExtraFieldsModel(BaseModelWrap):
    @model_validator(mode="before")
    @classmethod
    def log_extra_fields(cls, values):
        # Report input fields that are not declared on the model, so data
        # silently dropped by extra="ignore" still shows up in the logs.
        extra_fields = {
            name: str(value)
            for name, value in values.items()
            if name not in cls.model_fields
        }
        if extra_fields:
            PydanticLogger.debug(
                msg=f"Pydantic - Got extra fields in {cls.__name__}: {extra_fields}",
                sortid="wiktextract/extractor/es/pydantic/extra_fields/33",
            )
        return values


class Sense(LoggingExtraFieldsModel):
glosses: list[str] = Field(
description="list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging."
)
    tags: list[str] = Field(
        default=[],
        description="list of qualifier and tag strings for the word sense",
    )
categories: list[str] = Field(
default=[],
description="list of sense-disambiguated category names extracted from (a subset) of the Category links on the page",
)
# examples: list[SenseExample] = []
subsenses: list["Sense"] = Field(
default=[], description="List of subsenses"
)


class WordEntry(LoggingExtraFieldsModel):
"""WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract."""

word: str = Field(description="word string")
    pos: Optional[str] = Field(
        default=None, description="Part of speech type"
    )
lang_code: str = Field(
description="Wiktionary language code", examples=["es"]
)
lang_name: str = Field(
description="Localized language name of the word", examples=["español"]
)
senses: Optional[list[Sense]] = []
categories: list[str] = Field(
default=[],
description="list of non-disambiguated categories for the word",
)


if __name__ == "__main__":

class JsonSchemaGenerator(GenerateJsonSchema):
def generate(self, schema, mode="validation"):
json_schema = super().generate(schema, mode=mode)
json_schema["title"] = "Spanish Wiktionary"
json_schema["$id"] = "https://kaikki.org/es.json"
json_schema["$schema"] = self.schema_dialect
return json_schema

with open("json_schema/es.json", "w") as f:
json.dump(
WordEntry.model_json_schema(schema_generator=JsonSchemaGenerator),
f,
indent=2,
ensure_ascii=False,
sort_keys=True,
)
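
A usage sketch for these models (illustrative, not part of the commit; all field values are made up). Passing an unknown field exercises the log_extra_fields validator, and update() shows the merge-and-revalidate round trip:

import logging

from wiktextract.extractor.es.models import Sense, WordEntry

logging.basicConfig(level=logging.DEBUG)

# "bogus_field" is ignored (extra="ignore") but reported via
# PydanticLogger, which falls back to logging.debug while no
# WiktextractContext is attached.
sense = Sense(glosses=["ejemplo"], bogus_field="dropped")

entry = WordEntry(word="perro", lang_code="es", lang_name="español")
entry.update({"senses": [sense.model_dump(exclude_defaults=True)]})
print(entry.model_dump(exclude_defaults=True))
# {'word': 'perro', 'lang_code': 'es', 'lang_name': 'español',
#  'senses': [{'glosses': ['ejemplo']}]}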
73 changes: 73 additions & 0 deletions src/wiktextract/extractor/es/page.py
@@ -0,0 +1,73 @@
import copy
import logging
from collections import defaultdict
from typing import Dict, List

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.extractor.es.models import PydanticLogger, WordEntry
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES = set()

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES = set()

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES = set()


def parse_section(
    wxr: WiktextractContext,
    page_data: List[Dict],
    base_data: Dict,
    level_node: WikiNode,
) -> None:
    # Placeholder: language subsections (etymology, part-of-speech
    # headings, etc.) are not extracted yet.
    pass


def parse_page(
wxr: WiktextractContext, page_title: str, page_text: str
) -> List[Dict[str, str]]:
if wxr.config.verbose:
logging.info(f"Parsing page: {page_title}")
    # Pass the current WiktextractContext to pydantic for better logging
PydanticLogger.wxr = wxr

wxr.config.word = page_title
wxr.wtp.start_page(page_title)

# Parse the page, pre-expanding those templates that are likely to
# influence parsing
tree = wxr.wtp.parse(
page_text,
pre_expand=True,
additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
)

page_data: List[WordEntry] = []
for level2_node in tree.find_child(NodeKind.LEVEL2):
for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
# https://es.wiktionary.org/wiki/Plantilla:lengua
# https://es.wiktionary.org/wiki/Apéndice:Códigos_de_idioma
if subtitle_template.template_name == "lengua":
categories_and_links = defaultdict(list)
lang_code = subtitle_template.template_parameters.get(1)
lang_name = clean_node(
wxr, categories_and_links, subtitle_template
)
wxr.wtp.start_section(lang_name)
base_data = WordEntry(
lang_name=lang_name, lang_code=lang_code, word=wxr.wtp.title
)
base_data.update(categories_and_links)
page_data.append(copy.deepcopy(base_data))
for level3_node in level2_node.find_child(NodeKind.LEVEL3):
parse_section(wxr, page_data, base_data, level3_node)

return [d.model_dump(exclude_defaults=True) for d in page_data]
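
To see the intended round trip, a hedged end-to-end sketch, modeled on how the wiktextract test suite typically builds a context; the page title, wikitext, and the "Español" template body are illustrative assumptions, and the exact constructor signatures should be checked against the installed versions:

from wikitextprocessor import Wtp
from wiktextract.config import WiktextractConfig
from wiktextract.extractor.es.page import parse_page
from wiktextract.wxr_context import WiktextractContext

wxr = WiktextractContext(
    Wtp(lang_code="es", project="wiktionary"), WiktextractConfig()
)
# Minimal stand-in for Plantilla:lengua so the heading expands to a name.
wxr.wtp.add_page("Plantilla:lengua", 10, "Español")

print(parse_page(wxr, "perro", "=={{lengua|es}}==\n===Etimología===\n"))
# With parse_section still a stub, roughly:
# [{'word': 'perro', 'lang_code': 'es', 'lang_name': 'Español'}]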
