Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Combine separated synonyms tag string and template #347

Merged
merged 1 commit into from
Sep 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions tests/test_fr_linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def tearDown(self) -> None:

def test_tags(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("")
self.wxr.wtp.start_page("bonjour")
self.wxr.wtp.add_page("Modèle:Canada", 10, body="(Canada)")
self.wxr.wtp.add_page("Modèle:Louisiane", 10, body="(Louisiane)")
root = self.wxr.wtp.parse(
Expand All @@ -43,7 +43,7 @@ def test_tags(self):

def test_zh_synonyms(self):
page_data = [defaultdict(list)]
self.wxr.wtp.start_page("")
self.wxr.wtp.start_page("你好")
root = self.wxr.wtp.parse(
"==== {{S|synonymes}} ====\n* {{zh-lien|你们好|nǐmen hǎo|你們好}} — Bonjour (au pluriel)."
)
Expand All @@ -63,3 +63,24 @@ def test_zh_synonyms(self):
}
],
)

def test_template_as_partial_tag(self):
    """Check that a template inside a parenthesized note yields one tag.

    The list item holds a link template, a standalone tag template and
    a parenthesized note whose text is partly produced by a template.
    The expected result is one synonym entry whose tags are the
    standalone tag and the whole note joined into a single tag string.
    """
    wtp = self.wxr.wtp
    collected = [defaultdict(list)]
    wtp.start_page("bonjour")
    # Register stub bodies for the templates used by the wikitext below
    # (namespace id 10 is passed for each of them).
    for title, expansion in (
        ("Modèle:lien", "kwei"),
        ("Modèle:Canada", "(Canada)"),
        ("Modèle:L", "Atikamekw"),
    ):
        wtp.add_page(title, 10, body=expansion)
    wikitext = "==== {{S|synonymes}} ====\n* {{lien|kwei|fr}} {{Canada|nocat=1}} (mot {{L|atj}})"
    section = wtp.parse(wikitext).children[0]
    extract_linkage(self.wxr, collected, section, "synonyms")
    expected_entry = {"word": "kwei", "tags": ["Canada", "mot Atikamekw"]}
    self.assertEqual(collected, [{"synonyms": [expected_entry]}])
2 changes: 1 addition & 1 deletion wiktextract/extractor/fr/form_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def extract_form_line(
):
# it's the location of the previous IPA template
page_data[-1]["sounds"][-1]["tags"].append(tag.strip("()"))
else:
elif len(tag.strip("()")) > 0:
page_data[-1]["tags"].append(tag.strip("()"))

pre_template_name = node.template_name
Expand Down
28 changes: 24 additions & 4 deletions wiktextract/extractor/fr/linkage.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def extract_linkage(
) -> None:
for list_item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
linkage_data = defaultdict(list)
pending_tag = ""
for index, child_node in enumerate(
list_item_node.filter_empty_str_child()
):
Expand All @@ -28,10 +29,29 @@ def extract_linkage(
else:
linkage_data["word"] = clean_node(wxr, None, child_node)
else:
tag = clean_node(wxr, page_data[-1], child_node).strip("()")
tag = (
child_node
if isinstance(child_node, str)
else clean_node(wxr, page_data[-1], child_node)
)
if tag.strip().startswith("(") and not tag.strip().endswith(
")"
):
pending_tag = tag
continue
elif not tag.strip().startswith("(") and tag.strip().endswith(
")"
):
tag = pending_tag + tag
pending_tag = ""
elif len(pending_tag) > 0:
pending_tag += tag
continue

tag = tag.strip("() \n")
if tag.startswith("— "):
linkage_data["translation"] = tag.removeprefix("— ")
else:
elif len(tag) > 0:
linkage_data["tags"].append(tag)

page_data[-1][linkage_type].append(linkage_data)
Expand All @@ -53,8 +73,8 @@ def process_lien_template(
node: TemplateNode,
linkage_data: Dict[str, Union[str, List[str]]],
) -> None:
# https://fr.wiktionary.org/wiki/Modèle:lien
if "dif" in node.template_parameters:
# link word template: https://fr.wiktionary.org/wiki/Modèle:lien
if "dif" in node.template_parameters: # displayed word
word = clean_node(wxr, None, node.template_parameters.get("dif"))
else:
word = clean_node(wxr, None, node.template_parameters.get(1))
Expand Down