Skip to content

Commit

Permalink
Merge pull request #939 from xxyzz/de
Browse files Browse the repository at this point in the history
[de, nl] improve sound section code
  • Loading branch information
xxyzz authored Dec 9, 2024
2 parents bb46d54 + 1d17b6d commit 0a549b6
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 14 deletions.
4 changes: 2 additions & 2 deletions src/wiktextract/extractor/de/inflection.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def process_noun_table(
form.raw_tags.append(row_header)
if col_index < len(column_headers):
form.raw_tags.append(column_headers[col_index])
if len(form.form) > 0 and form.form != "—":
if form.form not in ["—", "", "?"]:
translate_raw_tags(form)
word_entry.forms.append(form)

Expand Down Expand Up @@ -181,7 +181,7 @@ def process_adj_table(
column_headers.append(cell_text)
else:
for form_text in cell_text.splitlines():
if form_text in ("—", ""):
if form_text in ("—", "", "?"):
continue
form = Form(form=form_text)
if col_index < len(column_headers):
Expand Down
1 change: 1 addition & 0 deletions src/wiktextract/extractor/de/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class Sound(BaseModelWrap):
mp3_url: str = Field(default="")
oga_url: str = Field(default="")
flac_url: str = Field(default="")
opus_url: str = Field(default="")
raw_tags: list[str] = []
tags: list[str] = []
rhymes: str = ""
Expand Down
2 changes: 2 additions & 0 deletions src/wiktextract/extractor/de/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,3 +264,5 @@ def extract_hyphenation_section(
break
else:
word_entry.hyphenation += node.strip()
if word_entry.hyphenation == "?":
word_entry.hyphenation = ""
33 changes: 23 additions & 10 deletions src/wiktextract/extractor/nl/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
LEVEL_KIND_FLAGS,
LevelNode,
NodeKind,
WikiNode,
)

from ...page import clean_node
Expand All @@ -30,6 +29,18 @@ def extract_section_categories(
clean_node(wxr, word_entry, link_node)


def select_word_entry(
    page_data: list[WordEntry], base_data: WordEntry
) -> WordEntry:
    """Pick the entry that section data should be attached to.

    Returns the last item of `page_data` when the list is non-empty and
    that item has the same `lang_code` as `base_data`; otherwise returns
    `base_data` itself.
    """
    # This is a function rather than a variable because new data could be
    # appended to `page_data` after such a variable is created.
    if len(page_data) == 0:
        return base_data
    latest_entry = page_data[-1]
    if latest_entry.lang_code == base_data.lang_code:
        return latest_entry
    return base_data


def parse_section(
wxr: WiktextractContext,
page_data: list[WordEntry],
Expand All @@ -43,6 +54,7 @@ def parse_section(
title_text = re.sub(r"\s+#?\d+:?$", "", title_text)
wxr.wtp.start_subsection(title_text)
etymology_data = []

if title_text in POS_DATA:
last_data_len = len(page_data)
extract_pos_section(
Expand All @@ -57,40 +69,40 @@ def parse_section(
)
elif title_text == "Uitspraak":
extract_sound_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text in LINKAGE_SECTIONS:
extract_linkage_section(
wxr,
page_data[-1] if len(page_data) > 0 else base_data,
select_word_entry(page_data, base_data),
level_node,
LINKAGE_SECTIONS[title_text],
)
elif title_text == "Vertalingen":
extract_translation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Woordafbreking":
extract_hyphenation_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Woordherkomst en -opbouw":
etymology_data = extract_etymology_section(wxr, level_node)
elif title_text in ["Schrijfwijzen", "Verdere woordvormen"]:
extract_spelling_form_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Opmerkingen":
extract_note_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Overerving en ontlening":
extract_descendant_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text == "Vaste voorzetsels":
extract_fixed_preposition_section(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
elif title_text in [
"Gangbaarheid",
Expand All @@ -105,7 +117,7 @@ def parse_section(
for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
parse_section(wxr, page_data, base_data, forms_data, next_level)
extract_section_categories(
wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
wxr, select_word_entry(page_data, base_data), level_node
)
is_first_forms_template = True
for t_node in level_node.find_child(NodeKind.TEMPLATE):
Expand All @@ -120,6 +132,7 @@ def parse_section(
page_data[-1]
if title_text.startswith(("Vervoeging", "Verbuiging"))
and len(page_data) > 0
and page_data[-1].lang_code == base_data.lang_code
else forms_data,
t_node,
)
Expand Down
5 changes: 3 additions & 2 deletions src/wiktextract/extractor/nl/sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def extract_audio_template(
) -> None:
# https://nl.wiktionary.org/wiki/Sjabloon:audio
audio_file = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
set_sound_file_url_fields(wxr, audio_file, sound)
clean_node(wxr, word_entry, t_node)
if audio_file not in ["", "..."]:
set_sound_file_url_fields(wxr, audio_file, sound)
clean_node(wxr, word_entry, t_node)


def extract_ipa_template(
Expand Down
17 changes: 17 additions & 0 deletions tests/test_nl_sound.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,20 @@ def test_hyphenation(self):
# stappen""",
)
self.assertEqual(data[0]["hyphenation"], "lo·pen")

def test_sound_section(self):
    """Check which entry receives the audio from an "Uitspraak" section.

    The page has a Danish ("Deens") entry followed by a French ("Frans")
    language section whose "Uitspraak" section comes before its
    part-of-speech section. The first parsed entry must have no "sounds"
    key and the second parsed entry must have one.
    """
    data = parse_page(
        self.wxr,
        "vin",
        """==Deens==
===Zelfstandig naamwoord===
# [[wijn]]
==Frans==
===Uitspraak===
*{{sound}}: {{audio|Fr-vin.ogg|vin|fra}}
===Zelfstandig naamwoord===
# [[wijn]]""",
    )
    # assertIn/assertNotIn report the key and the container on failure,
    # unlike assertTrue on a membership expression.
    self.assertNotIn("sounds", data[0])
    self.assertIn("sounds", data[1])

0 comments on commit 0a549b6

Please sign in to comment.