Skip to content

Commit

Permalink
Merge pull request #358 from xxyzz/zh
Browse files Browse the repository at this point in the history
Extract Chinese Wiktionary descendant data
  • Loading branch information
xxyzz authored Oct 12, 2023
2 parents 8057bec + 5ac2c4a commit d7e9505
Show file tree
Hide file tree
Showing 8 changed files with 290 additions and 70 deletions.
46 changes: 46 additions & 0 deletions json_schema/zh.json
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,12 @@
"items": {
"type": "string"
}
},
"descendants": {
"type": "array",
"items": {
"$ref": "#/$defs/descendant"
}
}
},
"$defs": {
Expand Down Expand Up @@ -315,6 +321,46 @@
"enum": ["zh-Hant", "zh-Hans"]
}
}
},
"descendant": {
"type": "object",
"properties": {
"lang_code": {
"description": "ISO 639-1 code",
"type": "string"
},
"lang_name": {
"type": "string"
},
"word": {
"type": "string"
},
"roman": {
"type": "string"
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"descendants": {
"type": "array",
"items": {
"$refs": "#/$defs/descendant"
}
},
"ruby": {
"description": "Japanese Kanji and furigana",
"type": "array",
"items": {
"type": "array",
"items": {
"type": "string"
}
}
}
}
}
}
}
117 changes: 117 additions & 0 deletions tests/test_zh_descendant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from collections import defaultdict
from unittest import TestCase
from unittest.mock import Mock

from wikitextprocessor import Wtp

from wiktextract.extractor.zh.descendant import extract_descendants
from wiktextract.thesaurus import close_thesaurus_db
from wiktextract.wxr_context import WiktextractContext


class TestDescendant(TestCase):
    """Tests for ``extract_descendants`` from the zh (Chinese Wiktionary)
    extractor, using mocked template pages instead of live expansion."""

    def setUp(self):
        # Fresh zh Wtp database per test; Mock() stands in for the second
        # WiktextractContext argument (presumably the config — it is never
        # inspected by the code under test).
        self.wxr = WiktextractContext(Wtp(lang_code="zh"), Mock())

    def tearDown(self):
        # Close both databases opened via WiktextractContext so each test
        # leaves no open connections behind.
        self.wxr.wtp.close_db_conn()
        close_thesaurus_db(
            self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn
        )

    def test_ruby(self):
        """A {{ja-r}} descendant yields a (kanji, furigana) ruby pair."""
        # https://zh.wiktionary.org/wiki/你好
        self.wxr.wtp.start_page("你好")
        # {{desc}} renders the borrow arrow plus the language name "日語:".
        self.wxr.wtp.add_page(
            "Template:desc",
            10,
            '<span class="desc-arr" title="借詞">→</span> 日語:',
        )
        # {{ja-r}} renders the word with <ruby> furigana and a "tr" span
        # holding the romanization.
        self.wxr.wtp.add_page(
            "Template:ja-r",
            10,
            '<span class="Jpan" lang="ja">[[你好#日語|-{<ruby>你好<rp>(</rp><rt>ニイハオ</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">nīhao</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
        )
        root = self.wxr.wtp.parse("* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}")
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        self.assertEqual(
            page_data.get("descendants"),
            [
                {
                    "lang_code": "ja",
                    "lang_name": "日語",
                    "roman": "nīhao",
                    "ruby": [("你好", "ニイハオ")],
                    "word": "你好",
                }
            ],
        )

    def test_roman_only_list(self):
        """A qualifier span ("仿譯") after the word ends up in ``tags``."""
        self.wxr.wtp.start_page("你好")
        self.wxr.wtp.add_page(
            "Template:desc",
            10,
            '<span class="desc-arr" title="仿譯詞">→</span> 壯語:<span class="Latn" lang="za">[[mwngz ndei#壯語|-{mwngz ndei}-]]</span> <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content">仿譯</span><span class="ib-brac qualifier-brac">)</span>',
        )
        root = self.wxr.wtp.parse("* {{desc|za|mwngz ndei|cal=1}}")
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        self.assertEqual(
            page_data.get("descendants"),
            [
                {
                    "lang_code": "za",
                    "lang_name": "壯語",
                    "tags": ["仿譯"],
                    "word": "mwngz ndei",
                }
            ],
        )

    def test_nested_list(self):
        """Sub-list items become ``descendants`` nested under their parent."""
        # https://zh.wiktionary.org/wiki/オタク
        self.wxr.wtp.start_page("オタク")
        self.wxr.wtp.add_page(
            "Template:desc",
            10,
            '<span class="desc-arr" title="詞形受類比影響或添加了額外詞素">⇒</span> 官話:',
        )
        # Simplified {{zh-l}} mock: word span followed by an <i> wrapping
        # the romanization (here just the word itself again).
        self.wxr.wtp.add_page(
            "Template:zh-l",
            10,
            '<span class="Hani" lang="zh">{{{1}}}</span> (<i><span class="tr Latn" lang="la">{{{1}}}</span></i>',
        )
        # Wikitext list lines must be flush-left; keep this string as-is.
        root = self.wxr.wtp.parse(
            """*: {{desc|cmn|-}} {{zh-l|御宅族}}
*:* {{desc|cmn|-|der=1}} {{zh-l|宅男}}
*:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}"""
        )
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        self.assertEqual(
            page_data.get("descendants"),
            [
                {
                    "descendants": [
                        {
                            "lang_code": "cmn",
                            "lang_name": "官話",
                            "roman": "宅男",
                            "word": "宅男",
                        },
                        {
                            "lang_code": "cmn",
                            "lang_name": "官話",
                            "roman": "宅女",
                            "word": "宅女",
                        },
                    ],
                    "lang_code": "cmn",
                    "lang_name": "官話",
                    "roman": "御宅族",
                    "word": "御宅族",
                }
            ],
        )
3 changes: 1 addition & 2 deletions wiktextract/data/zh/linkage_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,6 @@
"派生詞": "derived",
"派生詞彙": "derived",
"派生詞語": "derived",
"派生語彙": "derived",
"派生词": "derived",
"派生词汇": "derived",
"派生词组": "derived",
Expand Down Expand Up @@ -133,4 +132,4 @@
"部分詞": "meronyms",
"關聯詞": "related",
"關聯詞彙": "related"
}
}
5 changes: 4 additions & 1 deletion wiktextract/data/zh/other_subtitles.json
Original file line number Diff line number Diff line change
Expand Up @@ -74,5 +74,8 @@
"translations": [
"翻譯",
"翻译"
],
"descendants": [
"派生語彙"
]
}
}
8 changes: 7 additions & 1 deletion wiktextract/extractor/ruby.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import List, Optional, Tuple, Union

from wikitextprocessor import NodeKind, WikiNode
from wikitextprocessor.parser import HTMLNode, LevelNode, TemplateNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext
Expand Down Expand Up @@ -58,7 +59,6 @@ def extract_ruby(
# Otherwise content is WikiNode, and we must recurse into it.
kind = contents.kind
new_node = WikiNode(kind, contents.loc)
new_contents.append(new_node)
if kind in {
NodeKind.LEVEL2,
NodeKind.LEVEL3,
Expand All @@ -68,6 +68,8 @@ def extract_ruby(
NodeKind.LINK,
}:
# Process args and children
if kind != NodeKind.LINK:
new_node = LevelNode(new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
Expand Down Expand Up @@ -108,6 +110,8 @@ def extract_ruby(
NodeKind.URL,
}:
# Process only args
if kind == NodeKind.TEMPLATE:
new_node = TemplateNode(new_node.loc)
new_args = []
for arg in contents.largs:
e1, c1 = extract_ruby(wxr, arg)
Expand All @@ -116,11 +120,13 @@ def extract_ruby(
new_node.largs = new_args
elif kind == NodeKind.HTML:
# Keep attrs and args as-is, process children
new_node = HTMLNode(new_node.loc)
new_node.attrs = contents.attrs
new_node.sarg = contents.sarg
e1, c1 = extract_ruby(wxr, contents.children)
extracted.extend(e1)
new_node.children = c1
else:
raise RuntimeError(f"extract_ruby: unhandled kind {kind}")
new_contents.append(new_node)
return extracted, new_contents
97 changes: 97 additions & 0 deletions wiktextract/extractor/zh/descendant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from collections import defaultdict
from typing import Dict

from wikitextprocessor import NodeKind, WikiNode

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from ..ruby import extract_ruby

DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"])


def extract_descendants(
    wxr: WiktextractContext,
    level_node: WikiNode,
    parent_data: Dict,
) -> None:
    """Collect descendant entries from every list under *level_node*.

    Each list item is handed to ``extract_descendant_list_item``, which
    appends the extracted data to ``parent_data``.
    """
    for child_list in level_node.find_child(NodeKind.LIST):
        for item_node in child_list.find_child(NodeKind.LIST_ITEM):
            extract_descendant_list_item(wxr, item_node, parent_data)


def extract_descendant_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_data: Dict,
) -> None:
    """Extract descendant data from one wiki list item.

    Templates in the item (e.g. ``{{desc}}`` plus a link template such as
    ``{{ja-r}}`` or ``{{zh-l}}``) are expanded, and the resulting HTML is
    scanned for the language name, the descendant word, its romanization,
    ruby (furigana) and qualifier tags. Finished entries are appended to
    ``parent_data["descendants"]``; nested lists recurse under the entry
    created here.
    """
    lang_code = ""
    lang_name = ""
    descendant_data = defaultdict(list)
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        # Expand the template so its generated HTML spans can be inspected.
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template_node), expand_all=True
        )
        if template_node.template_name.lower() in DESCENDANT_TEMPLATES:
            # First positional parameter of {{desc}} is the language code.
            lang_code = template_node.template_parameters.get(1)
            descendant_data["lang_code"] = lang_code
        # Separate ruby (kanji/furigana pairs) from the remaining nodes.
        ruby_data, nodes_without_ruby = extract_ruby(
            wxr, expanded_template.children
        )
        if len(ruby_data) > 0:
            descendant_data["ruby"] = ruby_data
        for child_index, child_node in enumerate(nodes_without_ruby):
            if isinstance(child_node, str) and child_node.endswith(":"):
                # Plain text like "日語:" — the language name preceding
                # the descendant word.
                lang_name = child_node.strip(" :")
                descendant_data["lang_name"] = lang_name
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.HTML
            ):
                if child_node.tag == "span":
                    class_names = child_node.attrs.get("class", "")
                    if (
                        "Latn" in class_names or "tr" in class_names
                    ) and "word" in descendant_data:
                        # template:ja-r
                        # Romanization span; only meaningful once a word
                        # has already been captured.
                        descendant_data["roman"] = clean_node(
                            wxr, None, child_node
                        )
                    elif "lang" in child_node.attrs:
                        if "word" in descendant_data:
                            # A second word in the same item: flush the
                            # finished entry and start a fresh one that
                            # reuses the current language code/name.
                            parent_data["descendants"].append(descendant_data)
                            descendant_data = defaultdict(
                                list,
                                {
                                    "lang_code": lang_code,
                                    "lang_name": lang_name,
                                },
                            )
                            if len(ruby_data) > 0:
                                descendant_data["ruby"] = ruby_data
                        descendant_data["word"] = clean_node(
                            wxr, None, child_node
                        )
                    if "qualifier-content" in class_names:
                        # Qualifier text (e.g. "仿譯") is recorded as a tag.
                        descendant_data["tags"].append(
                            clean_node(wxr, None, child_node)
                        )
                elif child_node.tag == "i":
                    # template:zh-l
                    # zh-l puts the romanization in <i><span class="Latn">.
                    for span_tag in child_node.find_html(
                        "span", attr_name="class", attr_value="Latn"
                    ):
                        descendant_data["roman"] = clean_node(
                            wxr, None, span_tag
                        )

    if "word" in descendant_data:
        parent_data["descendants"].append(descendant_data)

    if list_item_node.contain_node(NodeKind.LIST):
        # Nested lists hold deeper descendants: attach them under the entry
        # created here when one exists, otherwise directly to the parent.
        extract_descendants(
            wxr,
            list_item_node,
            descendant_data if "word" in descendant_data else parent_data,
        )
10 changes: 10 additions & 0 deletions wiktextract/extractor/zh/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
split_chinese_variants,
strip_nodes,
)
from .descendant import DESCENDANT_TEMPLATES, extract_descendant_list_item


def extract_linkages(
Expand All @@ -34,6 +35,7 @@ def extract_linkages(
append_to = find_similar_gloss(page_data, sense)
elif isinstance(node, WikiNode):
if node.kind == NodeKind.LIST_ITEM:
is_descendant = False
not_term_indexes = set()
filtered_children = list(node.filter_empty_str_child())
linkage_data = defaultdict(list)
Expand All @@ -57,6 +59,14 @@ def extract_linkages(
linkage_data["tags"].append(
clean_node(wxr, None, item_child).strip("()")
)
elif template_name.lower() in DESCENDANT_TEMPLATES:
extract_descendant_list_item(
wxr, node, page_data[-1]
)
is_descendant = True
break
if is_descendant:
continue
# sense template before entry and they are inside the same
# list item
terms = clean_node(
Expand Down
Loading

0 comments on commit d7e9505

Please sign in to comment.