-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #358 from xxyzz/zh
Extract Chinese Wiktionary descendant data
- Loading branch information
Showing 8 changed files with 290 additions and 70 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from collections import defaultdict | ||
from unittest import TestCase | ||
from unittest.mock import Mock | ||
|
||
from wikitextprocessor import Wtp | ||
|
||
from wiktextract.extractor.zh.descendant import extract_descendants | ||
from wiktextract.thesaurus import close_thesaurus_db | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
|
||
class TestDescendant(TestCase):
    """Exercise ``extract_descendants`` against small mocked template pages."""

    def setUp(self):
        # A fresh context per test; the second constructor argument is a Mock.
        wtp = Wtp(lang_code="zh")
        self.wxr = WiktextractContext(wtp, Mock())

    def tearDown(self):
        wxr = self.wxr
        wxr.wtp.close_db_conn()
        close_thesaurus_db(wxr.thesaurus_db_path, wxr.thesaurus_db_conn)

    def _descendants_of(self, page_title, template_pages, wikitext):
        """Register the template pages, parse ``wikitext`` and extract.

        Returns whatever ended up under the ``"descendants"`` key of a fresh
        ``defaultdict(list)`` (``None`` when nothing was extracted).
        """
        self.wxr.wtp.start_page(page_title)
        for template_title, template_body in template_pages.items():
            # 10 is the namespace id passed for every template page here.
            self.wxr.wtp.add_page(template_title, 10, template_body)
        root = self.wxr.wtp.parse(wikitext)
        page_data = defaultdict(list)
        extract_descendants(self.wxr, root, page_data)
        return page_data.get("descendants")

    def test_ruby(self):
        # https://zh.wiktionary.org/wiki/你好
        template_pages = {
            "Template:desc": '<span class="desc-arr" title="借詞">→</span> 日語:',
            "Template:ja-r": '<span class="Jpan" lang="ja">[[你好#日語|-{<ruby>你好<rp>(</rp><rt>ニイハオ</rt><rp>)</rp></ruby>}-]]</span> <span class="mention-gloss-paren annotation-paren">(</span><span class="tr"><span class="mention-tr tr">nīhao</span></span><span class="mention-gloss-paren annotation-paren">)</span>',
        }
        expected = [
            {
                "lang_code": "ja",
                "lang_name": "日語",
                "roman": "nīhao",
                "ruby": [("你好", "ニイハオ")],
                "word": "你好",
            }
        ]
        actual = self._descendants_of(
            "你好",
            template_pages,
            "* {{desc|bor=1|ja|-}} {{ja-r|你好|ニイハオ}}",
        )
        self.assertEqual(actual, expected)

    def test_roman_only_list(self):
        template_pages = {
            "Template:desc": '<span class="desc-arr" title="仿譯詞">→</span> 壯語:<span class="Latn" lang="za">[[mwngz ndei#壯語|-{mwngz ndei}-]]</span> <span class="ib-brac qualifier-brac">(</span><span class="ib-content qualifier-content">仿譯</span><span class="ib-brac qualifier-brac">)</span>',
        }
        expected = [
            {
                "lang_code": "za",
                "lang_name": "壯語",
                "tags": ["仿譯"],
                "word": "mwngz ndei",
            }
        ]
        actual = self._descendants_of(
            "你好",
            template_pages,
            "* {{desc|za|mwngz ndei|cal=1}}",
        )
        self.assertEqual(actual, expected)

    def test_nested_list(self):
        # https://zh.wiktionary.org/wiki/オタク
        template_pages = {
            "Template:desc": '<span class="desc-arr" title="詞形受類比影響或添加了額外詞素">⇒</span> 官話:',
            "Template:zh-l": '<span class="Hani" lang="zh">{{{1}}}</span> (<i><span class="tr Latn" lang="la">{{{1}}}</span></i>',
        }
        # List markers must sit at the start of each wikitext line.
        wikitext = (
            "*: {{desc|cmn|-}} {{zh-l|御宅族}}\n"
            "*:* {{desc|cmn|-|der=1}} {{zh-l|宅男}}\n"
            "*:* {{desc|cmn|-|der=1}} {{zh-l|宅女}}"
        )
        nested = [
            {
                "lang_code": "cmn",
                "lang_name": "官話",
                "roman": "宅男",
                "word": "宅男",
            },
            {
                "lang_code": "cmn",
                "lang_name": "官話",
                "roman": "宅女",
                "word": "宅女",
            },
        ]
        expected = [
            {
                "descendants": nested,
                "lang_code": "cmn",
                "lang_name": "官話",
                "roman": "御宅族",
                "word": "御宅族",
            }
        ]
        actual = self._descendants_of("オタク", template_pages, wikitext)
        self.assertEqual(actual, expected)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -74,5 +74,8 @@ | |
"translations": [ | ||
"翻譯", | ||
"翻译" | ||
], | ||
"descendants": [ | ||
"派生語彙" | ||
] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
from collections import defaultdict | ||
from typing import Dict | ||
|
||
from wikitextprocessor import NodeKind, WikiNode | ||
|
||
from wiktextract.page import clean_node | ||
from wiktextract.wxr_context import WiktextractContext | ||
|
||
from ..ruby import extract_ruby | ||
|
||
DESCENDANT_TEMPLATES = frozenset(["desc", "descendant"]) | ||
|
||
|
||
def extract_descendants(
    wxr: WiktextractContext,
    level_node: WikiNode,
    parent_data: Dict,
) -> None:
    """Process every list item of every list directly under ``level_node``.

    Each ``LIST_ITEM`` child of each ``LIST`` child is handed, in document
    order, to ``extract_descendant_list_item`` together with ``parent_data``.
    """
    # Lazy flattening keeps the same visiting order as two nested loops.
    item_nodes = (
        item_node
        for child_list in level_node.find_child(NodeKind.LIST)
        for item_node in child_list.find_child(NodeKind.LIST_ITEM)
    )
    for item_node in item_nodes:
        extract_descendant_list_item(wxr, item_node, parent_data)
|
||
|
||
def extract_descendant_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    parent_data: Dict,
) -> None:
    """Extract descendant records from one list item.

    Every template that is a direct child of ``list_item_node`` is expanded
    and its top-level nodes are scanned:

    * for templates named in ``DESCENDANT_TEMPLATES`` the first positional
      parameter is stored as ``lang_code``;
    * ruby data returned by ``extract_ruby`` is stored under ``ruby``;
    * a text node ending in ``:`` supplies ``lang_name``;
    * a ``<span>`` carrying a ``lang`` attribute supplies ``word``; when a
      word is already recorded, the current record is appended to
      ``parent_data["descendants"]`` and a new record is started;
    * a ``<span>`` whose class contains ``Latn`` or ``tr`` (once a word is
      known), or a ``Latn`` span inside ``<i>``, supplies ``roman``;
    * a ``<span>`` whose class contains ``qualifier-content`` adds a tag.

    The last record is appended to ``parent_data["descendants"]`` if it has a
    word. Nested lists are then processed via ``extract_descendants``; their
    results go into that last record when it has a word, otherwise into
    ``parent_data``.
    """
    lang_code = ""
    lang_name = ""
    descendant_data = defaultdict(list)
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template_node), expand_all=True
        )
        if template_node.template_name.lower() in DESCENDANT_TEMPLATES:
            lang_code = template_node.template_parameters.get(1)
            descendant_data["lang_code"] = lang_code
        ruby_data, nodes_without_ruby = extract_ruby(
            wxr, expanded_template.children
        )
        if ruby_data:
            descendant_data["ruby"] = ruby_data

        for child_node in nodes_without_ruby:
            if isinstance(child_node, str):
                # Language label emitted by the descendant template.
                if child_node.endswith(":"):
                    lang_name = child_node.strip(" :")
                    descendant_data["lang_name"] = lang_name
                continue
            if (
                not isinstance(child_node, WikiNode)
                or child_node.kind != NodeKind.HTML
            ):
                continue

            if child_node.tag == "i":
                # template:zh-l
                for span_tag in child_node.find_html(
                    "span", attr_name="class", attr_value="Latn"
                ):
                    descendant_data["roman"] = clean_node(wxr, None, span_tag)
                continue
            if child_node.tag != "span":
                continue

            class_names = child_node.attrs.get("class", "")
            has_word = "word" in descendant_data
            if has_word and ("Latn" in class_names or "tr" in class_names):
                # template:ja-r
                descendant_data["roman"] = clean_node(wxr, None, child_node)
            elif "lang" in child_node.attrs:
                if has_word:
                    # A second word in the same item: flush the finished
                    # record and start a new one for the same language.
                    parent_data["descendants"].append(descendant_data)
                    descendant_data = defaultdict(
                        list,
                        {
                            "lang_code": lang_code,
                            "lang_name": lang_name,
                        },
                    )
                    if ruby_data:
                        # Copy so the two records do not share one list.
                        descendant_data["ruby"] = list(ruby_data)
                descendant_data["word"] = clean_node(wxr, None, child_node)
            if "qualifier-content" in class_names:
                descendant_data["tags"].append(
                    clean_node(wxr, None, child_node)
                )

    if "word" in descendant_data:
        parent_data["descendants"].append(descendant_data)

    if list_item_node.contain_node(NodeKind.LIST):
        extract_descendants(
            wxr,
            list_item_node,
            descendant_data if "word" in descendant_data else parent_data,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.