Merge pull request #392 from empiriker/es

Add pydantic models and parse_page for Spanish Wiktionary
tatuylonen · Dec 1, 2023 · be8a46c · be8a46c
2 parents 295f910 + 88a79b2
commit be8a46c
Show file tree

Hide file tree

Showing 10 changed files with 643 additions and 1 deletion.
diff --git a/json_schema/es.json b/json_schema/es.json
@@ -0,0 +1,130 @@
+{
+  "$defs": {
+    "Sense": {
+      "properties": {
+        "categories": {
+          "default": [],
+          "description": "list of sense-disambiguated category names extracted from (a subset of) the Category links on the page",
+          "items": {
+            "type": "string"
+          },
+          "title": "Categories",
+          "type": "array"
+        },
+        "glosses": {
+          "description": "list of gloss strings for the word sense (usually only one). This has been cleaned, and should be straightforward text with no tagging.",
+          "items": {
+            "type": "string"
+          },
+          "title": "Glosses",
+          "type": "array"
+        },
+        "senseid": {
+          "anyOf": [
+            {
+              "type": "integer"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "default": null,
+          "description": "Sense number used in Wiktionary",
+          "title": "Senseid"
+        },
+        "subsenses": {
+          "default": [],
+          "description": "List of subsenses",
+          "items": {
+            "$ref": "#/$defs/Sense"
+          },
+          "title": "Subsenses",
+          "type": "array"
+        },
+        "tags": {
+          "default": [],
+          "description": "list of tags for the word sense",
+          "items": {
+            "type": "string"
+          },
+          "title": "Tags",
+          "type": "array"
+        }
+      },
+      "required": [
+        "glosses"
+      ],
+      "title": "Sense",
+      "type": "object"
+    }
+  },
+  "$id": "https://kaikki.org/es.json",
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "description": "WordEntry is a dictionary containing lexical information of a single word extracted from Wiktionary with wiktextract.",
+  "properties": {
+    "categories": {
+      "default": [],
+      "description": "list of non-disambiguated categories for the word",
+      "items": {
+        "type": "string"
+      },
+      "title": "Categories",
+      "type": "array"
+    },
+    "lang_code": {
+      "description": "Wiktionary language code",
+      "examples": [
+        "es"
+      ],
+      "title": "Lang Code",
+      "type": "string"
+    },
+    "lang_name": {
+      "description": "Localized language name of the word",
+      "examples": [
+        "español"
+      ],
+      "title": "Lang Name",
+      "type": "string"
+    },
+    "pos": {
+      "default": null,
+      "description": "Part of speech type",
+      "title": "Pos",
+      "type": "string"
+    },
+    "pos_title": {
+      "default": null,
+      "description": "Original POS title",
+      "title": "Pos Title",
+      "type": "string"
+    },
+    "senses": {
+      "anyOf": [
+        {
+          "items": {
+            "$ref": "#/$defs/Sense"
+          },
+          "type": "array"
+        },
+        {
+          "type": "null"
+        }
+      ],
+      "default": [],
+      "title": "Senses"
+    },
+    "word": {
+      "description": "word string",
+      "title": "Word",
+      "type": "string"
+    }
+  },
+  "required": [
+    "word",
+    "lang_code",
+    "lang_name"
+  ],
+  "title": "Spanish Wiktionary",
+  "type": "object"
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -33,6 +33,7 @@ dependencies = [
     "importlib_resources; python_version < '3.10'",
     "levenshtein",
     "nltk",
+    "pydantic",
     "wikitextprocessor @ git+https://github.com/tatuylonen/wikitextprocessor.git",
 ]
 

diff --git a/src/wiktextract/data/es/other_subtitles.json b/src/wiktextract/data/es/other_subtitles.json
@@ -0,0 +1,5 @@
+{
+  "etymology": ["Etimología"],
+  "pronunciation": ["pronunciación"],
+  "ignored_sections": ["Véase también"]
+}
diff --git a/src/wiktextract/data/es/pos_subtitles.json b/src/wiktextract/data/es/pos_subtitles.json
@@ -0,0 +1,90 @@
+{
+  "abreviatura": { "pos": "abbrev" },
+  "acrónimo": { "pos": "abbrev" },
+  "adjetivo": { "pos": "adj" },
+  "adjetivo cardinal": { "pos": "num" },
+  "adjetivo demostrativo": { "pos": "adj" },
+  "adjetivo indefinido": { "pos": "adj" },
+  "adjetivo indeterminado": { "pos": "adj" },
+  "adjetivo interrogativo": { "pos": "adj" },
+  "adjetivo numeral": { "pos": "num" },
+  "adjetivo ordinal": { "pos": "num" },
+  "adjetivo posesivo": { "pos": "adj" },
+  "adjetivo relativo": { "pos": "adj" },
+  "adverbio": { "pos": "adv" },
+  "adverbio comparativo": { "pos": "adv" },
+  "adverbio de afirmación": { "pos": "adv" },
+  "adverbio de cantidad": { "pos": "adv" },
+  "adverbio de duda": { "pos": "adv" },
+  "adverbio de lugar": { "pos": "adv" },
+  "adverbio de modo": { "pos": "adv" },
+  "adverbio de negación": { "pos": "adv" },
+  "adverbio de orden": { "pos": "adv" },
+  "adverbio de tiempo": { "pos": "adv" },
+  "adverbio demostrativo": { "pos": "adv" },
+  "adverbio interrogativo": { "pos": "adv" },
+  "adverbio relativo": { "pos": "adv" },
+  "afijo": { "pos": "affix" },
+  "artículo": { "pos": "article" },
+  "artículo determinado": { "pos": "article" },
+  "artículo indeterminado": { "pos": "article" },
+  "circunfijo": { "pos": "circumfix" },
+  "conjunción": { "pos": "conj" },
+  "conjunción adversativa": { "pos": "conj" },
+  "conjunción ilativa": { "pos": "conj" },
+  "dígrafo": { "pos": "character" },
+  "expresión": { "pos": "phrase" },
+  "forma verbal": { "pos": "verb" },
+  "interjección": { "pos": "intj" },
+  "letra": { "pos": "character" },
+  "locución": { "pos": "phrase" },
+  "locución adjetiva": { "pos": "phrase" },
+  "locución adverbial": { "pos": "phrase" },
+  "locución conjuntiva": { "pos": "phrase" },
+  "locución interjectiva": { "pos": "phrase" },
+  "locución prepositiva": { "pos": "phrase" },
+  "locución pronominal": { "pos": "phrase" },
+  "locución sustantiva": { "pos": "phrase" },
+  "locución verbal": { "pos": "phrase" },
+  "onomatopeya": { "pos": "noun" },
+  "partícula": { "pos": "particle" },
+  "postposición": { "pos": "postp" },
+  "prefijo": { "pos": "prefix" },
+  "preposición": { "pos": "prep" },
+  "preposición de ablativo": { "pos": "prep" },
+  "preposición de acusativo": { "pos": "prep" },
+  "preposición de acusativo o ablativo": { "pos": "prep" },
+  "preposición de genitivo": { "pos": "prep" },
+  "pronombre": { "pos": "pron" },
+  "pronombre demostrativo": { "pos": "pron" },
+  "pronombre indefinido": { "pos": "pron" },
+  "pronombre interrogativo": { "pos": "pron" },
+  "pronombre personal": { "pos": "pron" },
+  "pronombre posesivo": { "pos": "det" },
+  "pronombre relativo": { "pos": "pron" },
+  "refrán": { "pos": "proverb" },
+  "sigla": { "pos": "abbrev" },
+  "sufijo": { "pos": "suffix" },
+  "sufijo flexivo": { "pos": "suffix" },
+  "sustantivo": { "pos": "noun" },
+  "sustantivo ambiguo": { "pos": "noun" },
+  "sustantivo animado": { "pos": "noun" },
+  "sustantivo común": { "pos": "noun" },
+  "sustantivo femenino": { "pos": "noun" },
+  "sustantivo femenino y masculino": { "pos": "noun" },
+  "sustantivo inanimado": { "pos": "noun" },
+  "sustantivo masculino": { "pos": "noun" },
+  "sustantivo neutro": { "pos": "noun" },
+  "sustantivo neutro y masculino": { "pos": "noun" },
+  "sustantivo propio": { "pos": "name" },
+  "sustantivo propio/pruebas": { "pos": "name" },
+  "símbolo": { "pos": "symbol" },
+  "verbo": { "pos": "verb" },
+  "verbo auxiliar": { "pos": "verb" },
+  "verbo impersonal": { "pos": "verb" },
+  "verbo intransitivo": { "pos": "verb" },
+  "verbo modal": { "pos": "verb" },
+  "verbo perfectivo": { "pos": "verb" },
+  "verbo pronominal": { "pos": "verb" },
+  "verbo transitivo": { "pos": "verb" }
+}
diff --git a/src/wiktextract/datautils.py b/src/wiktextract/datautils.py
@@ -61,7 +61,8 @@ def data_extend(data: Dict, key: str, values: Iterable) -> None:
         data_append(data, key, x)
 
 
-def split_at_comma_semi(text: str, separators=(",", ";", "，", "،"), extra=()
+def split_at_comma_semi(
+    text: str, separators=(",", ";", "，", "،"), extra=()
 ) -> List[str]:
     """Splits the text at commas and semicolons, unless they are inside
     parenthesis.  ``separators`` is default separators (setting it eliminates

diff --git a/src/wiktextract/extractor/es/gloss.py b/src/wiktextract/extractor/es/gloss.py
@@ -0,0 +1,62 @@
+import re
+from typing import List
+
+from wikitextprocessor import NodeKind, WikiNode
+from wikitextprocessor.parser import WikiNodeChildrenList
+
+from wiktextract.extractor.es.models import Sense, WordEntry
+from wiktextract.page import clean_node
+from wiktextract.wxr_context import WiktextractContext
+
+
+def extract_gloss(
+    wxr: WiktextractContext,
+    page_data: List[WordEntry],
+    list_node: WikiNode,
+) -> None:
+    for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+        gloss_data = Sense(glosses=[])
+
+        definition: WikiNodeChildrenList = []
+        other: WikiNodeChildrenList = []
+
+        for node in list_item.definition:
+            if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
+                other.append(node)
+            else:
+                definition.append(node)
+
+        list_item.definition
+
+        gloss = clean_node(wxr, gloss_data, definition)
+        gloss_data.glosses.append(gloss)
+
+        gloss_note = clean_node(wxr, gloss_data, list_item.children)
+
+        match = re.match(r"^(\d+)", gloss_note)
+
+        if match:
+            gloss_data.senseid = int(match.group(1))
+            tag_string = gloss_note[len(match.group(1)) :].strip()
+        else:
+            tag_string = gloss_data.tags = gloss_note.strip()
+
+        # split tags by comma or "y"
+        tags = re.split(r",|y", tag_string)
+        for tag in tags:
+            tag = (
+                tag.strip()
+                .removesuffix(".")
+                .removesuffix("Main")
+                .removeprefix("Main")
+            )
+            if tag:
+                gloss_data.tags.append(tag)
+
+        if other:
+            wxr.wtp.debug(
+                f"Found nodes that are not part of definition: {other}",
+                sortid="extractor/es/gloss/extract_gloss/46",
+            )
+
+        page_data[-1].senses.append(gloss_data)