Merge pull request #936 from xxyzz/pt

[pt] improve linkage and pos section code, extract pronunciation section
tatuylonen · Dec 6, 2024 · bb46d54 · bb46d54
2 parents 7ec39a8 + d2ca145
commit bb46d54
Show file tree

Hide file tree

Showing 11 changed files with 396 additions and 118 deletions.
diff --git a/src/wiktextract/extractor/pt/linkage.py b/src/wiktextract/extractor/pt/linkage.py
@@ -107,29 +107,46 @@ def extract_linkage_list_item(
     linkage_words = []
     raw_tags = []
     for node in list_item.children:
-        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
-            word = clean_node(wxr, None, node)
-            if word != "":
-                linkage_words.append(word)
-        elif isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
-            bold_str = clean_node(wxr, None, node)
-            if re.fullmatch(r"\d+", bold_str):
-                sense_index = int(bold_str)
+        if isinstance(node, TemplateNode):
+            match node.template_name:
+                case "link preto":
+                    word = clean_node(
+                        wxr, None, node.template_parameters.get(1, "")
+                    )
+                    if word != "":
+                        linkage_words.append(word)
+                case "escopo2":
+                    from .pos import extract_escopo2_template
+
+                    raw_tags.extend(extract_escopo2_template(wxr, node))
+        elif isinstance(node, WikiNode):
+            match node.kind:
+                case NodeKind.LINK:
+                    word = clean_node(wxr, None, node)
+                    if word != "" and not word.startswith("Wikisaurus:"):
+                        linkage_words.append(word)
+                case NodeKind.BOLD:
+                    bold_str = clean_node(wxr, None, node)
+                    if re.fullmatch(r"\d+", bold_str):
+                        sense_index = int(bold_str)
+                case NodeKind.ITALIC:
+                    raw_tag = clean_node(wxr, None, node)
+                    if raw_tag != "":
+                        raw_tags.append(raw_tag)
+                case NodeKind.LIST:
+                    for child_list_item in node.find_child(NodeKind.LIST_ITEM):
+                        extract_linkage_list_item(
+                            wxr,
+                            word_entry,
+                            child_list_item,
+                            linkage_type,
+                            sense,
+                            sense_index,
+                        )
         elif isinstance(node, str):
             m = re.search(r"\((.+)\)", node)
             if m is not None:
                 sense = m.group(1)
-        elif (
-            isinstance(node, TemplateNode)
-            and node.template_name == "link preto"
-        ):
-            word = clean_node(wxr, None, node.template_parameters.get(1, ""))
-            if word != "":
-                linkage_words.append(word)
-        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
-            raw_tag = clean_node(wxr, None, node)
-            if raw_tag != "":
-                raw_tags.append(raw_tag)
 
     for word in linkage_words:
         linkage = Linkage(

diff --git a/src/wiktextract/extractor/pt/models.py b/src/wiktextract/extractor/pt/models.py
@@ -52,6 +52,19 @@ class Linkage(PortugueseBaseModel):
     )
 
 
+class Sound(PortugueseBaseModel):
+    # One pronunciation record; WordEntry.sounds holds a list of these.
+    ipa: str = Field(default="", description="International Phonetic Alphabet")
+    audio: str = Field(default="", description="Audio file name")
+    # Per-format audio links. Nothing in this change assigns them;
+    # presumably they are derived from `audio` elsewhere -- TODO confirm.
+    wav_url: str = ""
+    oga_url: str = ""
+    ogg_url: str = ""
+    mp3_url: str = ""
+    opus_url: str = ""
+    flac_url: str = ""
+    # `raw_tags` receives the labels collected by the pronunciation extractor
+    # (section titles and list-item labels). `tags` receives "X-SAMPA" for
+    # X-SAMPA transcriptions; translate_raw_tags() is also run on every sound
+    # and presumably moves recognised raw tags into `tags` -- TODO confirm.
+    tags: list[str] = []
+    raw_tags: list[str] = []
+
+
 class WordEntry(PortugueseBaseModel):
     model_config = ConfigDict(title="Portuguese Wiktionary")
     word: str = Field(description="Word string", min_length=1)
@@ -69,3 +82,4 @@ class WordEntry(PortugueseBaseModel):
     synonyms: list[Linkage] = []
     derived: list[Linkage] = []
     etymology_texts: list[str] = []
+    sounds: list[Sound] = []
diff --git a/src/wiktextract/extractor/pt/page.py b/src/wiktextract/extractor/pt/page.py
@@ -12,6 +12,7 @@
 from .linkage import extract_expression_section, extract_linkage_section
 from .models import Sense, WordEntry
 from .pos import extract_pos_section
+from .pronunciation import extract_pronunciation_section
 from .section_titles import LINKAGE_SECTIONS, POS_DATA
 from .translation import extract_translation_section
 
@@ -23,7 +24,7 @@ def parse_section(
     level_node: LevelNode,
 ) -> None:
     cats = {}
-    title_text = clean_node(wxr, cats, level_node.largs)
+    title_text = clean_node(wxr, cats, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
     if title_text in POS_DATA:
         extract_pos_section(
             wxr,
@@ -50,16 +51,35 @@ def parse_section(
         )
     elif title_text == "Etimologia":
         extract_etymology_section(wxr, page_data, level_node)
+    elif title_text == "Pronúncia":
+        extract_pronunciation_section(wxr, page_data, level_node)
 
+    if title_text not in POS_DATA:
+        save_section_cats(
+            cats.get("categories", []), page_data, level_node, True
+        )
     cats = {}
     for link_node in level_node.find_child(NodeKind.LINK):
         clean_node(wxr, cats, link_node)
-    for data in page_data:
-        if data.lang_code == page_data[-1].lang_code:
-            data.categories.extend(cats.get("categories", []))
+    save_section_cats(cats.get("categories", []), page_data, level_node, False)
+
+    if title_text != "Pronúncia":
+        for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
+            parse_section(wxr, page_data, base_data, next_level)
+
 
-    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
-        parse_section(wxr, page_data, base_data, next_level)
+def save_section_cats(
+    cats: list[str],
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+    from_title: bool,
+) -> None:
+    """Attach section categories to the word entries they belong to.
+
+    Categories that come from a section title below level 2 are added to the
+    last entry only (nothing happens when ``page_data`` is empty).  All other
+    categories -- those not taken from a title, and those from a level-2
+    title -- are added to every entry that has the same ``lang_code`` as the
+    last entry.
+    """
+    only_last_entry = from_title and level_node.kind != NodeKind.LEVEL2
+    if only_last_entry:
+        if page_data:
+            page_data[-1].categories.extend(cats)
+        return
+
+    for entry in page_data:
+        if entry.lang_code == page_data[-1].lang_code:
+            entry.categories.extend(cats)
 
 
 def parse_page(

diff --git a/src/wiktextract/extractor/pt/pos.py b/src/wiktextract/extractor/pt/pos.py
@@ -53,7 +53,7 @@ def extract_gloss_list_item(
             if node.template_name == "escopo":
                 extract_escopo_template(wxr, sense, node)
             elif node.template_name == "escopo2":
-                extract_escopo2_template(wxr, sense, node)
+                sense.raw_tags.extend(extract_escopo2_template(wxr, node))
             else:
                 gloss_nodes.append(node)
         elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
@@ -80,24 +80,25 @@ def extract_escopo_template(
     for arg in range(2, 9):
         if arg not in t_node.template_parameters:
             break
-        sense.raw_tags.append(
-            clean_node(wxr, None, t_node.template_parameters[arg])
-        )
+        raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
+        if raw_tag != "":
+            sense.raw_tags.append(raw_tag)
     clean_node(wxr, sense, t_node)
 
 
 def extract_escopo2_template(
     wxr: WiktextractContext,
-    sense: Sense,
     t_node: TemplateNode,
-) -> None:
+) -> list[str]:
     # https://pt.wiktionary.org/wiki/Predefinição:escopo2
+    raw_tags = []
     for arg in range(1, 4):
         if arg not in t_node.template_parameters:
             break
-        sense.raw_tags.append(
-            clean_node(wxr, None, t_node.template_parameters[arg])
-        )
+        raw_tag = clean_node(wxr, None, t_node.template_parameters[arg])
+        if raw_tag != "":
+            raw_tags.append(raw_tag)
+    return raw_tags
 
 
 def extract_example_list_item(
@@ -106,8 +107,13 @@ def extract_example_list_item(
     list_item: WikiNode,
 ) -> None:
     example = Example()
+    ref_nodes = []
     for node in list_item.children:
-        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
+        if (
+            isinstance(node, WikiNode)
+            and node.kind == NodeKind.ITALIC
+            and example.text == ""
+        ):
             example.text = clean_node(wxr, None, node)
         elif isinstance(node, HTMLNode) and node.tag == "small":
             example.translation = clean_node(wxr, None, node)
@@ -131,5 +137,10 @@ def extract_example_list_item(
                     example.text = clean_node(
                         wxr, sense, node.template_parameters.get(1, "")
                     )
+        else:
+            ref_nodes.append(node)
+
     if example.text != "":
+        if example.ref == "":
+            example.ref = clean_node(wxr, sense, ref_nodes).strip(":() \n")
         sense.examples.append(example)
diff --git a/src/wiktextract/extractor/pt/pronunciation.py b/src/wiktextract/extractor/pt/pronunciation.py
@@ -0,0 +1,73 @@
+from wikitextprocessor.parser import (
+    LEVEL_KIND_FLAGS,
+    LevelNode,
+    NodeKind,
+    WikiNode,
+)
+
+from ...page import clean_node
+from ...wxr_context import WiktextractContext
+from .models import Sound, WordEntry
+from .tags import translate_raw_tags
+
+
+def extract_pronunciation_section(
+    wxr: WiktextractContext,
+    page_data: list[WordEntry],
+    level_node: LevelNode,
+) -> None:
+    """Collect the sounds of a pronunciation section and its subsections.
+
+    A section title other than "Pronúncia" is used as a raw tag for the
+    sounds listed directly under that section.  Child sections are processed
+    recursively, each starting with its own fresh tag list.  The sounds found
+    here are appended to every entry of ``page_data`` whose ``lang_code``
+    equals that of the last entry.
+    """
+    raw_tags = []
+    # The caller in page.py strips superscript digits before comparing the
+    # title with "Pronúncia"; strip them here too so a numbered heading such
+    # as "Pronúncia¹" is not mistaken for a tag.
+    title_text = clean_node(wxr, None, level_node.largs).strip("⁰¹²³⁴⁵⁶⁷⁸⁹")
+    if title_text not in ("", "Pronúncia"):
+        raw_tags.append(title_text)
+
+    sounds = []
+    for list_node in level_node.find_child(NodeKind.LIST):
+        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
+            sounds.extend(
+                extract_pronunciation_list_item(wxr, list_item, raw_tags)
+            )
+
+    # Subsections are handled before this section's own sounds are stored,
+    # so their sounds come first in each entry's list.
+    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
+        extract_pronunciation_section(wxr, page_data, child_level_node)
+
+    if not sounds or not page_data:
+        return
+
+    # Translate each sound once, not once per matching entry.
+    for sound in sounds:
+        translate_raw_tags(sound)
+
+    lang_code = page_data[-1].lang_code
+    for data in page_data:
+        if data.lang_code == lang_code:
+            data.sounds.extend(sounds)
+
+
+def extract_pronunciation_list_item(
+    wxr: WiktextractContext, list_item: WikiNode, raw_tags: list[str]
+) -> list[Sound]:
+    """Return the sounds described by one list item and its nested lists.
+
+    Every text node containing ":" splits the item into a label (the nodes
+    before it) and a value (the text after the first colon plus the
+    following nodes, nested lists excluded).  A non-empty value yields a
+    ``Sound``; an empty value with a non-empty label appends that label to
+    ``raw_tags``.
+
+    NOTE: ``raw_tags`` is the caller's list and is modified in place, so a
+    label picked up here also reaches nested items and every item the
+    caller processes afterwards with the same list.
+    """
+    found = []
+    children = list_item.children
+    for pos, child in enumerate(children):
+        if isinstance(child, str):
+            if ":" not in child:
+                continue
+            label = clean_node(wxr, None, children[:pos])
+            value_nodes = [child.partition(":")[2]]
+            for later in children[pos + 1 :]:
+                if isinstance(later, WikiNode) and later.kind == NodeKind.LIST:
+                    continue
+                value_nodes.append(later)
+            value = clean_node(wxr, None, value_nodes)
+            if value != "":
+                sound = Sound(ipa=value, raw_tags=raw_tags)
+                if label == "X-SAMPA":
+                    sound.tags.append("X-SAMPA")
+                found.append(sound)
+            elif label != "":
+                raw_tags.append(label)
+        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
+            for nested_item in child.find_child(NodeKind.LIST_ITEM):
+                found += extract_pronunciation_list_item(
+                    wxr, nested_item, raw_tags
+                )
+
+    return found
diff --git a/src/wiktextract/extractor/pt/section_titles.py b/src/wiktextract/extractor/pt/section_titles.py
@@ -30,5 +30,7 @@
 LINKAGE_SECTIONS = {
     "Antônimos": "antonyms",
     "Sinônimos": "synonyms",
+    "Sinónimos/Sinônimos": "synonyms",
+    "Sinónimos": "synonyms",
     "Verbetes derivados": "derived",
 }
diff --git a/src/wiktextract/extractor/pt/translation.py b/src/wiktextract/extractor/pt/translation.py
@@ -87,7 +87,7 @@ def extract_translation_list_item(
                         )
                     )
         elif isinstance(node, str) and re.search(r"\(.+\)", node) is not None:
-            roman = node.strip("() ")
+            roman = node.strip("() \n")
             for tr_data in translations:
                 tr_data.roman = roman
         elif (