Skip to content

Commit

Permalink
Merge pull request #358 from xxyzz/zh
Browse files Browse the repository at this point in the history
Extract Chinese Wiktionary descendant data
  • Loading branch information
xxyzz authored Oct 12, 2023
2 parents 8057bec + 5ac2c4a commit d7e9505
Show file tree
Hide file tree
Showing 8 changed files with 290 additions and 70 deletions.
46 changes: 46 additions & 0 deletions json_schema/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@
"items": {
"type": "string"
}
},
"descendants": {
"type": "array",
"items": {
"$ref": "#/$defs/descendant"
}
}
},
"$defs": {
Expand Down Expand Up @@ -315,6 +321,46 @@
"enum": ["zh-Hant", "zh-Hans"]
}
}
},
"descendant": {
"type": "object",
"properties": {
"lang_code": {
"description": "ISO 639-1 code",
"type": "string"
},
"lang_name": {
"type": "string"
},
"word": {
"type": "string"
},
"roman": {
"type": "string"
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"descendants": {
"type": "array",
"items": {
"$refs": "#/$defs/descendant"
}
},
"ruby": {
"description": "Japanese Kanji and furigana",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string"
}
}
}
}
}
}
}
117 changes: 117 additions & 0 deletions tests/test_zh_descendant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from collections import defaultdict
from unittest import TestCase
from unittest.mock import Mock

from wikitextprocessor import Wtp

from wiktextract.extractor.zh.descendant import extract_descendants
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class TestDescendant(TestCase):
    """Tests for ``extract_descendants`` from the zh (Chinese Wiktionary)
    extractor, using mocked template pages instead of live expansion."""

    def setUp(self):
        # Fresh zh Wtp database per test; Mock() stands in for the second
        # WiktextractContext argument (presumably the config — it is never
        # inspected by the code under test).
        self.wxr = WiktextractContext(Wtp(lang_code="zh"), Mock())

    def tearDown(self):
        # Close both databases opened via WiktextractContext so each test
        # leaves no open connections behind.
        self.wxr.wtp.close_db_conn()
        close_thesaurus_db(
            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
        )

    def test_ruby(self):
        """A {{ja-r}} descendant yields a (kanji, furigana) ruby pair."""
        # https://zh.wiktionary.org/wiki/你好
        self.wxr.wtp.start_page("你好")
        # {{desc}} renders the borrow arrow plus the language name "日語:".
        self.wxr.wtp.add_page(
            "Template:desc",
            10,
            '<span class="desc-arr" title="借詞">→</span> 日語:',
        )
        # {{ja-r}} renders the word with <ruby> furigana and a "tr" span
        # holding the romanization.
        self.wxr.wtp.add_page(
            "Template:ja-r",
            10,
            '<span class="Jpan" lang="ja">[[你好#日語|-{<ruby>你好<rp>(</rp><rt>ニイハオ</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">nīhao</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
        )
        root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}")
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        self.assertEqual(
            page_data.get("descendants"),
            [
                {
                    "lang_code": "ja",
                    "lang_name": "日語",
                    "roman": "nīhao",
                    "ruby": [("你好", "ニイハオ")],
                    "word": "你好",
                }
            ],
        )

    def test_roman_only_list(self):
        """A qualifier span ("仿譯") after the word ends up in ``tags``."""
        self.wxr.wtp.start_page("你好")
        self.wxr.wtp.add_page(
            "Template:desc",
            10,
            '<span class="desc-arr" title="仿譯詞">→</span> 壯語:<span class="Latn" lang="za">[[mwngz ndei#壯語|-{mwngz ndei}-]]</span> <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content">仿譯</span><span class="ib-brac qualifier-brac">)</span>',
        )
        root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}")
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        self.assertEqual(
            page_data.get("descendants"),
            [
                {
                    "lang_code": "za",
                    "lang_name": "壯語",
                    "tags": ["仿譯"],
                    "word": "mwngz ndei",
                }
            ],
        )

    def test_nested_list(self):
        """Sub-list items become ``descendants`` nested under their parent."""
        # https://zh.wiktionary.org/wiki/オタク
        self.wxr.wtp.start_page("オタク")
        self.wxr.wtp.add_page(
            "Template:desc",
            10,
            '<span class="desc-arr" title="詞形受類比影響或添加了額外詞素">⇒</span> 官話:',
        )
        # Simplified {{zh-l}} mock: word span followed by an <i> wrapping
        # the romanization (here just the word itself again).
        self.wxr.wtp.add_page(
            "Template:zh-l",
            10,
            '<span class="Hani" lang="zh">{{{1}}}</span> (<i><span class="tr Latn" lang="la">{{{1}}}</span></i>',
        )
        # Wikitext list lines must be flush-left; keep this string as-is.
        root = self.wxr.wtp.parse(
            """*: {{desc|cmn|-}} {{zh-l|御宅族}}
*:* {{desc|cmn|-|der=1}} {{zh-l|宅男}}
*:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}"""
        )
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        self.assertEqual(
            page_data.get("descendants"),
            [
                {
                    "descendants": [
                        {
                            "lang_code": "cmn",
                            "lang_name": "官話",
                            "roman": "宅男",
                            "word": "宅男",
                        },
                        {
                            "lang_code": "cmn",
                            "lang_name": "官話",
                            "roman": "宅女",
                            "word": "宅女",
                        },
                    ],
                    "lang_code": "cmn",
                    "lang_name": "官話",
                    "roman": "御宅族",
                    "word": "御宅族",
                }
            ],
        )
3 changes: 1 addition & 2 deletions wiktextract/data/zh/linkage_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@
"派生詞": "derived",
"派生詞彙": "derived",
"派生詞語": "derived",
"派生語彙": "derived",
"派生词": "derived",
"派生词汇": "derived",
"派生词组": "derived",
Expand Down Expand Up @@ -133,4 +132,4 @@
"部分詞": "meronyms",
"關聯詞": "related",
"關聯詞彙": "related"
}
}
5 changes: 4 additions & 1 deletion wiktextract/data/zh/other_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,8 @@
"translations": [
"翻譯",
"翻译"
],
"descendants": [
"派生語彙"
]
}
}
8 changes: 7 additions & 1 deletion wiktextract/extractor/ruby.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List, Optional, Tuple, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down Expand Up @@ -58,7 +59,6 @@ def extract_ruby(
# Otherwise content is WikiNode, and we must recurse into it.
kind = contents.kind
new_node = WikiNode(kind, contents.loc)
new_contents.append(new_node)
if kind in {
NodeKind.LEVEL2,
NodeKind.LEVEL3,
Expand All @@ -68,6 +68,8 @@ def extract_ruby(
NodeKind.LINK,
}:
# Process args and children
if kind != NodeKind.LINK:
new_node = LevelNode(new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
Expand Down Expand Up @@ -108,6 +110,8 @@ def extract_ruby(
NodeKind.URL,
}:
# Process only args
if kind == NodeKind.TEMPLATE:
new_node = TemplateNode(new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
Expand All @@ -116,11 +120,13 @@ def extract_ruby(
new_node.largs = new_args
elif kind == NodeKind.HTML:
# Keep attrs and args as-is, process children
new_node = HTMLNode(new_node.loc)
new_node.attrs = contents.attrs
new_node.sarg = contents.sarg
e1, c1 = extract_ruby(wxr, contents.children)
extracted.extend(e1)
new_node.children = c1
else:
raise RuntimeError(f"extract_ruby: unhandled kind {kind}")
new_contents.append(new_node)
return extracted, new_contents
97 changes: 97 additions & 0 deletions wiktextract/extractor/zh/descendant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from collections import defaultdict
from typing import Dict

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from ..ruby import extract_ruby

DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"])


def extract_descendants(
    wxr: WiktextractContext,
    level_node: WikiNode,
    parent_data: Dict,
) -> None:
    """Collect descendant entries from every list under *level_node*.

    Each list item is handed to ``extract_descendant_list_item``, which
    appends the extracted data to ``parent_data``.
    """
    for child_list in level_node.find_child(NodeKind.LIST):
        for item_node in child_list.find_child(NodeKind.LIST_ITEM):
            extract_descendant_list_item(wxr, item_node, parent_data)


def extract_descendant_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_data: Dict,
) -> None:
    """Extract descendant data from one wiki list item.

    Templates in the item (e.g. ``{{desc}}`` plus a link template such as
    ``{{ja-r}}`` or ``{{zh-l}}``) are expanded, and the resulting HTML is
    scanned for the language name, the descendant word, its romanization,
    ruby (furigana) and qualifier tags. Finished entries are appended to
    ``parent_data["descendants"]``; nested lists recurse under the entry
    created here.
    """
    lang_code = ""
    lang_name = ""
    descendant_data = defaultdict(list)
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        # Expand the template so its generated HTML spans can be inspected.
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template_node), expand_all=True
        )
        if template_node.template_name.lower() in DESCENDANT_TEMPLATES:
            # First positional parameter of {{desc}} is the language code.
            lang_code = template_node.template_parameters.get(1)
            descendant_data["lang_code"] = lang_code
        # Separate ruby (kanji/furigana pairs) from the remaining nodes.
        ruby_data, nodes_without_ruby = extract_ruby(
            wxr, expanded_template.children
        )
        if len(ruby_data) > 0:
            descendant_data["ruby"] = ruby_data
        for child_index, child_node in enumerate(nodes_without_ruby):
            if isinstance(child_node, str) and child_node.endswith(":"):
                # Plain text like "日語:" — the language name preceding
                # the descendant word.
                lang_name = child_node.strip(" :")
                descendant_data["lang_name"] = lang_name
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.HTML
            ):
                if child_node.tag == "span":
                    class_names = child_node.attrs.get("class", "")
                    if (
                        "Latn" in class_names or "tr" in class_names
                    ) and "word" in descendant_data:
                        # template:ja-r
                        # Romanization span; only meaningful once a word
                        # has already been captured.
                        descendant_data["roman"] = clean_node(
                            wxr, None, child_node
                        )
                    elif "lang" in child_node.attrs:
                        if "word" in descendant_data:
                            # A second word in the same item: flush the
                            # finished entry and start a fresh one that
                            # reuses the current language code/name.
                            parent_data["descendants"].append(descendant_data)
                            descendant_data = defaultdict(
                                list,
                                {
                                    "lang_code": lang_code,
                                    "lang_name": lang_name,
                                },
                            )
                            if len(ruby_data) > 0:
                                descendant_data["ruby"] = ruby_data
                        descendant_data["word"] = clean_node(
                            wxr, None, child_node
                        )
                    if "qualifier-content" in class_names:
                        # Qualifier text (e.g. "仿譯") is recorded as a tag.
                        descendant_data["tags"].append(
                            clean_node(wxr, None, child_node)
                        )
                elif child_node.tag == "i":
                    # template:zh-l
                    # zh-l puts the romanization in <i><span class="Latn">.
                    for span_tag in child_node.find_html(
                        "span", attr_name="class", attr_value="Latn"
                    ):
                        descendant_data["roman"] = clean_node(
                            wxr, None, span_tag
                        )

    if "word" in descendant_data:
        parent_data["descendants"].append(descendant_data)

    if list_item_node.contain_node(NodeKind.LIST):
        # Nested lists hold deeper descendants: attach them under the entry
        # created here when one exists, otherwise directly to the parent.
        extract_descendants(
            wxr,
            list_item_node,
            descendant_data if "word" in descendant_data else parent_data,
        )
10 changes: 10 additions & 0 deletions wiktextract/extractor/zh/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
split_chinese_variants,
strip_nodes,
)
from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item


def extract_linkages(
Expand All @@ -34,6 +35,7 @@ def extract_linkages(
append_to = find_similar_gloss(page_data, sense)
elif isinstance(node, WikiNode):
if node.kind == NodeKind.LIST_ITEM:
is_descendant = False
not_term_indexes = set()
filtered_children = list(node.filter_empty_str_child())
linkage_data = defaultdict(list)
Expand All @@ -57,6 +59,14 @@ def extract_linkages(
linkage_data["tags"].append(
clean_node(wxr, None, item_child).strip("()")
)
elif template_name.lower() in DESCENDANT_TEMPLATES:
extract_descendant_list_item(
wxr, node, page_data[-1]
)
is_descendant = True
break
if is_descendant:
continue
# sense template before entry and they are inside the same
# list item
terms = clean_node(
Expand Down
Loading

0 comments on commit d7e9505

Please sign in to comment.