Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[de, nl] improve sound section code #939

Merged
merged 3 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def process_noun_table(
form.raw_tags.append(row_header)
if col_index < len(column_headers):
form.raw_tags.append(column_headers[col_index])
if len(form.form) > 0 and form.form != "—":
if form.form not in ["—", "", "?"]:
translate_raw_tags(form)
word_entry.forms.append(form)

Expand Down Expand Up @@ -181,7 +181,7 @@ def process_adj_table(
column_headers.append(cell_text)
else:
for form_text in cell_text.splitlines():
if form_text in ("—", ""):
if form_text in ("—", "", "?"):
continue
form = Form(form=form_text)
if col_index < len(column_headers):
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class Sound(BaseModelWrap):
mp3_url: str = Field(default="")
oga_url: str = Field(default="")
flac_url: str = Field(default="")
opus_url: str = Field(default="")
raw_tags: list[str] = []
tags: list[str] = []
rhymes: str = ""
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,5 @@ def extract_hyphenation_section(
break
else:
word_entry.hyphenation += node.strip()
if word_entry.hyphenation == "?":
word_entry.hyphenation = ""
33 changes: 23 additions & 10 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
Expand All @@ -30,6 +29,18 @@ def extract_section_categories(
clean_node(wxr, word_entry, link_node)


def select_word_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Choose the entry that extracted section data should be written to.

    Returns the last element of ``page_data`` when the list is non-empty
    and that element's ``lang_code`` equals ``base_data.lang_code``;
    otherwise returns ``base_data``.

    This is a function rather than a variable because new data could be
    appended to ``page_data`` after such a variable is created.
    """
    if len(page_data) == 0:
        return base_data
    latest_entry = page_data[-1]
    if latest_entry.lang_code == base_data.lang_code:
        return latest_entry
    return base_data


def parse_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
Expand All @@ -43,6 +54,7 @@ def parse_section(
title_text = re.sub(r"\s+#?\d+:?$", "", title_text)
wxr.wtp.start_subsection(title_text)
etymology_data = []

if title_text in POS_DATA:
last_data_len = len(page_data)
extract_pos_section(
Expand All @@ -57,40 +69,40 @@ def parse_section(
)
elif title_text == "Uitspraak":
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
select_word_entry(page_data, base_data),
level_node,
LINKAGE_SECTIONS[title_text],
)
elif title_text == "Vertalingen":
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Woordafbreking":
extract_hyphenation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Woordherkomst en -opbouw":
etymology_data = extract_etymology_section(wxr, level_node)
elif title_text in ["Schrijfwijzen", "Verdere woordvormen"]:
extract_spelling_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Opmerkingen":
extract_note_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Overerving en ontlening":
extract_descendant_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Vaste voorzetsels":
extract_fixed_preposition_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text in [
"Gangbaarheid",
Expand All @@ -105,7 +117,7 @@ def parse_section(
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, forms_data, next_level)
extract_section_categories(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
is_first_forms_template = True
for t_node in level_node.find_child(NodeKind.TEMPLATE):
Expand All @@ -120,6 +132,7 @@ def parse_section(
page_data[-1]
if title_text.startswith(("Vervoeging", "Verbuiging"))
and len(page_data) > 0
and page_data[-1].lang_code == base_data.lang_code
else forms_data,
t_node,
)
Expand Down
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/nl/sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def extract_audio_template(
) -> None:
# https://nl.wiktionary.org/wiki/Sjabloon:audio
audio_file = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
set_sound_file_url_fields(wxr, audio_file, sound)
clean_node(wxr, word_entry, t_node)
if audio_file not in ["", "..."]:
set_sound_file_url_fields(wxr, audio_file, sound)
clean_node(wxr, word_entry, t_node)


def extract_ipa_template(
Expand Down
17 changes: 17 additions & 0 deletions tests/test_nl_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,20 @@ def test_hyphenation(self):
# stappen""",
)
self.assertEqual(data[0]["hyphenation"], "lo·pen")

def test_sound_section(self):
    # The page has two language sections; the only audio template is in
    # the "Uitspraak" section under "Frans", placed before that language's
    # POS section. The test expects no "sounds" key on the first parsed
    # entry and a "sounds" key on the second.
    page_text = """==Deens==
===Zelfstandig naamwoord===
# [[wijn]]

==Frans==
===Uitspraak===
*{{sound}}: {{audio|Fr-vin.ogg|vin|fra}}
===Zelfstandig naamwoord===
# [[wijn]]"""
    data = parse_page(self.wxr, "vin", page_text)
    self.assertNotIn("sounds", data[0])
    self.assertIn("sounds", data[1])
Loading