diff --git a/usertools/de_language_data.py b/languages/get_de_data.py similarity index 100% rename from usertools/de_language_data.py rename to languages/get_de_data.py diff --git a/overrides/de.json b/overrides/de.json new file mode 100644 index 00000000..0ddfe81b --- /dev/null +++ b/overrides/de.json @@ -0,0 +1,202 @@ +{ + "Vorlage:Abkürzungen": { + "body": "==== Abkürzungen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Alternative Schreibweisen": { + "body": "==== Alternative Schreibweisen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Anmerkung": { + "body": "==== Anmerkung ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Aussprache": { + "body": "==== Aussprache ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Bedeutungen": { + "body": "==== Bedeutungen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Beispiele": { + "body": "==== Beispiele ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Bekannte Namensträger": { + "body": "==== Bekannte Namensträger ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Charakteristische Wortkombinationen": { + "body": "==== Charakteristische Wortkombinationen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Entlehnungen": { + "body": "==== Entlehnungen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Gegenwörter": { + "body": "==== Gegenwörter ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Grammatische Merkmale": { + "body": "==== Grammatische Merkmale ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Herkunft": { + "body": "==== Herkunft ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Holonyme": { + "body": "==== Holonyme ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Koseformen": { + "body": "==== Koseformen ====", + "namespace_id": 10, + 
"need_pre_expand": true + }, + "Vorlage:Lesungen": { + "body": "==== Lesungen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Männliche Wortformen": { + "body": "==== Männliche Wortformen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Namensvarianten": { + "body": "==== Namensvarianten ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Nebenformen": { + "body": "==== Nebenformen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Nicht mehr gültige Schreibweisen": { + "body": "==== Nicht mehr gültige Schreibweisen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Oberbegriffe": { + "body": "==== Oberbegriffe ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Quellen": { + "body": "", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:erweitern": { + "body": "", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:QS Herkunft": { + "body": "", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:QS Bedeutungen": { + "body": "", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Redewendungen": { + "body": "==== Redewendungen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Referenzen": { + "body": "==== Referenzen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Sinnverwandte Wörter": { + "body": "==== Sinnverwandte Wörter ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Sprichwörter": { + "body": "==== Sprichwörter ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Symbole": { + "body": "==== Symbole ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Synonyme": { + "body": "==== Synonyme ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Umschrift": { + "body": "==== Umschrift ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Unterbegriffe": { + 
"body": "==== Unterbegriffe ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Verkleinerungsformen": { + "body": "==== Verkleinerungsformen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Vokalisierung": { + "body": "==== Vokalisierung ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Weibliche Wortformen": { + "body": "==== Weibliche Wortformen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Wortbildungen": { + "body": "==== Wortbildungen ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Wortfamilie": { + "body": "==== Wortfamilie ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Worttrennung": { + "body": "==== Worttrennung ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:in kyrillischer Schrift": { + "body": "==== in kyrillischer Schrift ====", + "namespace_id": 10, + "need_pre_expand": true + }, + "Vorlage:Übersetzungen": { + "body": "Übersetzungen", + "namespace_id": 10, + "need_pre_expand": true + } +} diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py index 8d80276c..b2e3c426 100644 --- a/src/wiktextract/config.py +++ b/src/wiktextract/config.py @@ -120,7 +120,7 @@ def __init__( "FORM_OF_TEMPLATES", "form_of_templates.json" ) if dump_file_lang_code == "de": - self.set_attr_from_json("DE_FORM_TABLES", "form_templates.json") + self.set_attr_from_json("DE_FORM_TABLES", "form_tables.json") self.analyze_templates = True # find templates that need pre-expand self.extract_thesaurus_pages = True self.load_edition_settings() diff --git a/src/wiktextract/data/de/form_tables.json b/src/wiktextract/data/de/form_tables.json index 939a2674..6d4afc53 100644 --- a/src/wiktextract/data/de/form_tables.json +++ b/src/wiktextract/data/de/form_tables.json @@ -13,6 +13,7 @@ "Pronomina-Tabelle", "Afrikaans Substantiv Übersicht", "Albanisch Verb Übersicht", + "Altenglisch Substantiv Übersicht", "Altgriechisch 
Adjektiv Übersicht", "Altgriechisch Substantiv Übersicht", "Altirisch Substantiv Übersicht", @@ -40,6 +41,7 @@ "Französisch Substantiv Übersicht", "Französisch Verb Übersicht", "Galicisch Substantiv Übersicht", + "Georgisch Substantiv Übersicht", "Hausa Adjektiv Übersicht", "Hausa Possessiv Übersicht", "Hausa Substantiv Übersicht", @@ -48,11 +50,12 @@ "Irisch Adjektiv Übersicht", "Irisch Substantiv Übersicht", "Isländisch Name Übersicht", + "Isländisch Adjektiv Übersicht", "Isländisch Substantiv Übersicht", "Isländisch Verb Übersicht", "Italienisch Adjektiv Übersicht", "Italienisch Substantiv Übersicht", - "Italienisch Verb Übersicht", + "Italienisch Verb Übersicht", "Katalanisch Adjektiv Übersicht", "Katalanisch Substantiv Übersicht", "Katalanisch Verb Übersicht", @@ -64,12 +67,14 @@ "Latein Adjektiv Übersicht", "Latein Adverb Übersicht", "Latein Substantiv Übersicht", + "Lateinisch Substantiv Übersicht", "Lettisch Substantiv Übersicht", "Lettisch Verb Übersicht", "Mazedonisch Substantiv Übersicht", "Nahuatl Substantiv Übersicht", "Neugriechisch Substantiv Übersicht", "Niederdeutsch Adjektiv Übersicht", + "Niederdeutsch Substantiv Übersicht", "Niederländisch Adjektiv Übersicht", "Niederländisch Substantiv Übersicht", "Niedersorbisch Substantiv Übersicht", @@ -83,6 +88,7 @@ "Polnisch Grundzahl Übersicht", "Polnisch Substantiv Übersicht", "Portugiesisch Substantiv Übersicht", + "Prußisch Substantiv Übersicht", "Rumänisch Numerale Übersicht", "Rumänisch Personalpronomen Übersicht", "Rumänisch Substantiv Übersicht", diff --git a/src/wiktextract/data/de/languages.json b/src/wiktextract/data/de/languages.json index bc0151f2..06dc7b9e 100644 --- a/src/wiktextract/data/de/languages.json +++ b/src/wiktextract/data/de/languages.json @@ -1,5 +1,4 @@ { - "MHA": ["modernes Hocharabisch"], "aa": ["Afar"], "aae": ["Arbëresh"], "ab": ["Abchasisch"], @@ -46,6 +45,7 @@ "aua": ["Asumboa"], "aud": ["Anutisch"], "av": ["Awarisch"], + "avk": ["Kotava"], "ay": ["Aymara"], "az": 
["Aserbaidschanisch"], "azb": ["Südaserbaidschanisch"], @@ -97,10 +97,12 @@ "chr": ["Cherokee"], "chy": ["Cheyenne"], "ciw": ["Chippewa"], + "cjm": ["Östliches Cham"], "ckb": ["Sorani"], "ckt": ["Tschuktschisch"], "co": ["Korsisch"], "com": ["Comanche"], + "cop": ["Koptisch"], "cr": ["Cree"], "crh": ["Krimtatarisch"], "cri": ["Saotomensisches Kreol"], @@ -109,7 +111,7 @@ "cs": ["Tschechisch"], "csb": ["Kaschubisch"], "ctu": ["Tumbalá-Chol"], - "cu": ["Altkirchenslawisch"], + "cu": ["Altkirchenslawisch", "Kirchenslawisch"], "cv": ["Tschuwaschisch"], "cy": ["Walisisch"], "da": ["Dänisch"], @@ -129,9 +131,10 @@ "ee": ["Ewe"], "egl": ["Emilianisch"], "egy": ["Ägyptisch"], - "el": ["Griechisch (Neu-)"], + "el": ["Griechisch (Neu-)", "Neugriechisch"], "ems": ["Alutiiq"], "en": ["Englisch"], + "ENHG": ["Frühneuhochdeutsch"], "enm": ["Mittelenglisch"], "eo": ["Esperanto"], "es": ["Spanisch"], @@ -174,7 +177,7 @@ "gnc": ["Guanche"], "goh": ["Althochdeutsch"], "got": ["Gotisch"], - "grc": ["Altgriechisch"], + "grc": ["Altgriechisch", "Mittelgriechisch"], "gsw": ["Schweizerdeutsch"], "gu": ["Gujarati"], "gv": ["Manx"], @@ -203,12 +206,15 @@ "ik": ["Inupiaq"], "ikt": ["Inuinnaqtun"], "ilo": ["Ilokano"], + "ils": ["International"], + "ims": ["Marsisch"], "inh": ["Inguschisch"], "io": ["Ido"], "is": ["Isländisch"], "it": ["Italienisch"], "iu": ["Inuktitut"], "ja": ["Japanisch"], + "jam": ["Jamaika-Kreolisch"], "jbo": ["Lojban"], "jv": ["Javanisch"], "ka": ["Georgisch"], @@ -229,6 +235,8 @@ "kjj": ["Chinalugisch"], "kk": ["Kasachisch"], "kl": ["Grönländisch"], + "kla": ["Klamath"], + "klb": ["Kiliwa"], "km": ["Kambodschanisch"], "kmr": ["Kurmandschi"], "kn": ["Kannada"], @@ -236,6 +244,8 @@ "koi": ["Komi-Permjakisch"], "kok": ["Konkani"], "kos": ["Kosraeanisch"], + "kpg": ["Kapingamarangi"], + "kr": ["Kanuri"], "krc": ["Karatschai-Balkarisch"], "krl": ["Karelisch"], "ks": ["Kashmiri"], @@ -250,6 +260,7 @@ "la": ["Latein"], "lad": ["Ladino"], "lb": ["Luxemburgisch"], + "ldn": 
["Láadan"], "lep": ["Lepcha"], "lg": ["Luganda"], "li": ["Limburgisch"], @@ -261,15 +272,18 @@ "ln": ["Lingala"], "lo": ["Laotisch"], "lt": ["Litauisch"], + "ltg": ["Lettgallisch"], "lv": ["Lettisch"], "lzz": ["Lasisch"], "mad": ["Maduresisch"], "mak": ["Makassar"], "mas": ["Maa"], "mdf": ["Mokscha"], + "mfe": ["Morisien"], "mg": ["Madagassisch"], "mga": ["Mittelirisch"], "mh": ["Marshallesisch"], + "MHA": ["modernes Hocharabisch"], "mi": ["Maori"], "mia": ["Miami-Illinois"], "mic": ["Micmac"], @@ -281,30 +295,43 @@ "mnc": ["Mandschurisch"], "mns": ["Mansisch"], "moh": ["Mohawk"], + "mpm": ["Yosondúa-Mixtekisch"], "mr": ["Marathi"], + "mrv": ["Mangarevanisch"], "ms": ["Malaiisch"], "mt": ["Maltesisch"], "mus": ["Creek"], + "mwl": ["Mirandés"], "mxi": ["Mozarabisch"], "my": ["Birmanisch"], + "myn": ["Huastekisch"], "myv": ["Ersja"], "na": ["Nauruisch"], - "nah": ["Nahuatl"], + "nah": ["Nahuatl", "Zentral-Nahuatl"], "nan": ["Min Nan"], "nap": ["Neapolitanisch"], "naq": ["Nama"], "nb": ["Bokmål"], "nch": ["Huastekisches Zentral-Nahuatl"], - "nci": ["Klassisches Nahuatl"], + "nci": ["Klassisches Nahuatl", "Klassisches Nahuatl‎"], + "ncx": ["Zentrales Puebla-Nahuatl"], "nd": ["Nord-Ndebele"], + "ndo": ["Oshivambo"], "nds": ["Niederdeutsch"], "ne": ["Nepalesisch"], "new": ["Newari"], "ng": ["Ndonga"], "ngo": ["Ngoni"], + "ngu": ["Guerrero-Nahuatl"], + "nhe": ["Huastekisches Ost-Nahuatl"], + "nhg": ["Tetelcingo-Nahuatl"], + "nhv": ["Temascaltepec-Nahuatl"], + "nhw": ["Huastekisches West-Nahuatl"], "nic": ["Dogon"], + "niu": ["Niueanisch"], "nl": ["Niederländisch"], "nld": ["Flämisch"], + "nlv": ["Orizaba-Nahuatl"], "nmn": ["ǃXóõ"], "nn": ["Nynorsk"], "no": ["Norwegisch"], @@ -314,6 +341,7 @@ "nqo": ["N'Ko"], "nr": ["Süd-Ndebele"], "nrf": ["Altnormannisch"], + "nrn": ["Norn"], "nso": ["Nord-Sotho"], "nup": ["Nupe"], "nv": ["Navajo"], @@ -321,6 +349,7 @@ "obt": ["Altbretonisch"], "oc": ["Okzitanisch"], "oco": ["Altkornisch"], + "odt": ["Altniederländisch"], "ofs": 
["Altfriesisch"], "oge": ["Altgeorgisch"], "oj": ["Ojibwe"], @@ -330,8 +359,11 @@ "orv": ["Altostslawisch"], "os": ["Ossetisch"], "osa": ["Osage"], + "osc": ["Oskisch"], "osx": ["Altsächsisch"], + "ota": ["Osmanisches Türkisch"], "otk": ["Alttürkisch"], + "oto": ["Mezquital-Otomi"], "otw": ["Ottawa"], "owl": ["Altwalisisch"], "pa": ["Pandschabi"], @@ -339,14 +371,18 @@ "pap": ["Papiamentu"], "pau": ["Palauisch"], "paw": ["Pawnee"], + "pcd": ["Pikardisch"], "pdc": ["Pennsylvaniadeutsch"], "pdt": ["Plautdietsch"], "peo": ["Altpersisch"], + "pgn": ["Pälignisch"], "pi": ["Pali"], "pih": ["Pitkern"], "pis": ["Pijin"], + "pkp": ["Pukapuka"], "pl": ["Polnisch"], "pms": ["Piemontesisch"], + "pnb": ["West-Pandschabi"], "pov": ["Guineabissauisches Kreol"], "pox": ["Polabisch"], "pqm": ["Malecite-Passamaquoddy"], @@ -359,6 +395,7 @@ "qka": ["Erzgebirgisch"], "qu": ["Quechua"], "qua": ["Quapaw"], + "que": ["Argentinisches Quechua"], "quz": ["Cusco-Quechua"], "raj": ["Rajasthani"], "rap": ["Rapanui"], @@ -382,6 +419,7 @@ "sco": ["Scots"], "sd": ["Sindhi"], "se": ["Nordsamisch"], + "sei": ["Seri"], "sg": ["Sango"], "sga": ["Altirisch"], "sgs": ["Schemaitisch"], @@ -398,12 +436,14 @@ "smn": ["Inarisamisch"], "sn": ["Shona"], "so": ["Somalisch"], + "sog": ["Sogdisch"], + "spx": ["Südpikenisch"], "sq": ["Albanisch"], "sr": ["Serbisch"], "srn": ["Sranantongo"], "ss": ["Siswati"], "st": ["Sesotho"], - "stq": ["Saterfriesisch"], + "stq": ["Saterfriesisch", "Ostfriesisch"], "su": ["Sundanesisch"], "sux": ["Sumerisch"], "sv": ["Schwedisch"], @@ -422,6 +462,7 @@ "th": ["Thai"], "ti": ["Tigrinya"], "tk": ["Turkmenisch"], + "tkl": ["Tokelauisch"], "tl": ["Tagalog"], "tlh": ["Klingonisch"], "tli": ["Tlingit"], @@ -436,13 +477,19 @@ "trw": ["Torwali"], "ts": ["Xitsonga"], "tsi": ["Tsimshian"], + "tsz": ["Purépecha"], "tt": ["Tatarisch"], "tvl": ["Tuvaluisch"], + "twi": ["Twi"], "ty": ["Tahitianisch"], "tyv": ["Tuwinisch"], + "tzh": ["Tzeltal"], + "tzo": ["Tzotzil"], "udm": ["Udmurtisch"], 
"ug": ["Uigurisch"], + "uga": ["Ugaritisch"], "uk": ["Ukrainisch"], + "umc": ["Marrukinisch"], "umu": ["Munsee"], "unm": ["Unami"], "ur": ["Urdu"], @@ -461,6 +508,7 @@ "wlc": ["shiMwali"], "wni": ["shiNdzuani"], "wo": ["Wolof"], + "xae": ["Äquisch"], "xal": ["Kalmückisch"], "xcl": ["Altarmenisch"], "xfa": ["Faliskisch"], @@ -472,19 +520,25 @@ "xno": ["Anglonormannisch"], "xpq": ["Mohegan-Pequot"], "xtg": ["Gallisch"], + "xum": ["Umbrisch"], "xur": ["Urartäisch"], "xve": ["Venetisch"], + "xvo": ["Volskisch"], + "xvs": ["Vestinisch"], "yak": ["Yakima"], + "yaq": ["Yaqui"], "yi": ["Jiddisch"], "yo": ["Yoruba"], "yua": ["Mayathan"], "yue": ["Kantonesisch"], "za": ["Zhuang"], + "zai": ["Isthmus-Zapotekisch"], "zdj": ["shiNgazidja"], "zea": ["Seeländisch"], - "zh": ["Chinesisch"], "zh-cn": ["Chinesisch (vereinfacht)"], "zh-tw": ["Chinesisch (traditionell)"], + "zh": ["Chinesisch"], + "zlw-ocs": ["Alttschechisch"], "zu": ["isiZulu"], "zza": ["Zazaki"] } diff --git a/src/wiktextract/extractor/de/example.py b/src/wiktextract/extractor/de/example.py new file mode 100644 index 00000000..8a3a97f6 --- /dev/null +++ b/src/wiktextract/extractor/de/example.py @@ -0,0 +1,82 @@ +from collections import defaultdict +from typing import Dict, List + + +from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import LevelNode +from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid + +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + + +def extract_examples( + wxr: WiktextractContext, + page_data: List[Dict], + level_node: LevelNode, +) -> None: + for list_node in level_node.find_child(NodeKind.LIST): + for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): + example_data = defaultdict(str) + + ref_nodes = find_and_remove_child( + list_item_node, + NodeKind.HTML, + lambda html_node: html_node.tag == "ref", + ) + for ref_node in ref_nodes: + extract_reference(wxr, example_data, 
ref_node) + + example_text = clean_node(wxr, {}, list_item_node.children) + + senseid, example_text = match_senseid(example_text) + + if example_text: + example_data["text"] = example_text + + if senseid: + for sense in page_data[-1]["senses"]: + if sense["senseid"] == senseid: + sense["examples"].append(example_data) + + else: + if example_data: + wxr.wtp.debug( + f"Found example data without senseid and text: {example_data}", + sortid="extractor/de/examples/extract_examples/28", + ) + for non_list_node in level_node.invert_find_child(NodeKind.LIST): + wxr.wtp.debug( + f"Found unexpected non-list node in example section: {non_list_node}", + sortid="extractor/de/examples/extract_examples/33", + ) + + +def extract_reference( + wxr: WiktextractContext, example_data: Dict[str, str], ref_node: WikiNode +): + reference_data = defaultdict() + + reference_data["raw_ref"] = clean_node(wxr, {}, ref_node.children) + + template_nodes = list(ref_node.find_child(NodeKind.TEMPLATE)) + + if len(template_nodes) > 1: + wxr.wtp.debug( + f"Found unexpected number of templates in example: {template_nodes}", + sortid="extractor/de/examples/extract_examples/64", + ) + elif len(template_nodes) == 1: + template_node = template_nodes[0] + + # Most reference templates follow the Literatur template and use named + # parameters. We extract them here. + # https://de.wiktionary.org/wiki/Vorlage:Literatur + for key, value in template_node.template_parameters.items(): + if isinstance(key, str): + reference_data[key.lower()] = clean_node(wxr, {}, value) + + # XXX: Treat other templates as well. + # E.g. 
https://de.wiktionary.org/wiki/Vorlage:Ref-OWID + + example_data["ref"] = reference_data diff --git a/src/wiktextract/extractor/de/gloss.py b/src/wiktextract/extractor/de/gloss.py index ffce2c82..ea2761c2 100644 --- a/src/wiktextract/extractor/de/gloss.py +++ b/src/wiktextract/extractor/de/gloss.py @@ -3,54 +3,100 @@ from typing import Dict, List from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import LevelNode +from wiktextract.extractor.de.utils import find_and_remove_child, match_senseid from wiktextract.page import clean_node from wiktextract.wxr_context import WiktextractContext def extract_glosses( + wxr: WiktextractContext, + page_data: List[Dict], + level_node: LevelNode, +) -> None: + for list_node in level_node.find_child(NodeKind.LIST): + process_gloss_list_item(wxr, page_data, list_node) + + for non_list_node in level_node.invert_find_child(NodeKind.LIST): + wxr.wtp.debug( + f"Found unexpected non-list node in pronunciation section: {non_list_node}", + sortid="extractor/de/pronunciation/extract_pronunciation/64", + ) + + +def process_gloss_list_item( wxr: WiktextractContext, page_data: List[Dict], list_node: WikiNode, + parent_senseid: str = "", + parent_gloss_data: defaultdict(list) = None, ) -> None: for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): item_type = list_item_node.sarg if item_type == "*": - wxr.wtp.debug( - f"Skipped a sense modifier in gloss list: {list_item_node}", - sortid="extractor/de/glosses/extract_glosses/19", + handle_sense_modifier(wxr, list_item_node) + + elif item_type in [":", "::"]: + if any( + [ + template_node.template_name + in ["QS Herkunft", "QS Bedeutungen"] + for template_node in list_item_node.find_child_recursively( + NodeKind.TEMPLATE + ) + ] + ): + continue + + gloss_data = ( + defaultdict(list) + if parent_gloss_data is None + else parent_gloss_data.copy() ) - # XXX: We should extract the modifier. However, it seems to affect - # multiple glosses. Needs investigation. 
- pass - elif item_type == ":": - gloss_data = defaultdict(list) - for sub_list_node in list_item_node.find_child(NodeKind.LIST): - wxr.wtp.debug( - f"Skipped a sub-list in gloss list: {sub_list_node}", - sortid="extractor/de/glosses/extract_glosses/27", - ) - # XXX: We should extract the subglosses as subsenses. - pass + + # Extract sub-glosses for later processing + sub_glosses_list_nodes = list( + find_and_remove_child(list_item_node, NodeKind.LIST) + ) + + raw_gloss = clean_node(wxr, {}, list_item_node.children) + gloss_data["raw_glosses"] = [raw_gloss] + + process_K_template(wxr, gloss_data, list_item_node) gloss_text = clean_node(wxr, gloss_data, list_item_node.children) - match = re.match(r"\[(\d+[a-z]?)\]", gloss_text) - if match: - sense_number = match.group(1) - gloss_text = gloss_text[match.end() :].strip() - else: - sense_number = None + senseid, gloss_text = match_senseid(gloss_text) - if not sense_number: + if senseid: + senseid = ( + senseid + if senseid[0].isnumeric() + else parent_senseid + senseid + ) + gloss_data["senseid"] = senseid + else: wxr.wtp.debug( - f"Failed to extract sense number from gloss: {gloss_text}", + f"Failed to extract sense number from gloss node: {list_item_node}", sortid="extractor/de/glosses/extract_glosses/28", ) - gloss_data["glosses"] = [gloss_text] + # XXX: Extract tags from nodes instead using Italic and Template + gloss_text = extract_tags_from_gloss_text(gloss_data, gloss_text) - page_data[-1]["senses"].append(gloss_data) + if gloss_text or not sub_glosses_list_nodes: + gloss_data["glosses"] = [gloss_text] + page_data[-1]["senses"].append(gloss_data) + + for sub_list_node in sub_glosses_list_nodes: + process_gloss_list_item( + wxr, + page_data, + sub_list_node, + senseid, + gloss_data if not gloss_text else None, + ) else: wxr.wtp.debug( @@ -58,3 +104,62 @@ def extract_glosses( sortid="extractor/de/glosses/extract_glosses/29", ) continue + + +def handle_sense_modifier(wxr, list_item_node): + wxr.wtp.debug( + 
f"Skipped a sense modifier in gloss list: {list_item_node}", + sortid="extractor/de/glosses/extract_glosses/19", + ) + # XXX: We should extract the modifier. However, it seems to affect + # multiple glosses. Needs investigation. + pass + + +def process_K_template( + wxr: WiktextractContext, + gloss_data: defaultdict(list), + list_item_node: NodeKind.LIST_ITEM, +) -> None: + for template_node in list_item_node.find_child(NodeKind.TEMPLATE): + if template_node.template_name == "K": + text = clean_node(wxr, gloss_data, template_node).removesuffix(":") + tags = re.split(r";|,", text) + gloss_data["tags"] = [t.strip() for t in tags] + + # Prepositional and case information is sometimes only expanded to + # category links and not present in cleaned node. We still want it + # as a tag. + prep = template_node.template_parameters.get("Prä") + case = template_node.template_parameters.get("Kas") + category = (prep if prep else "") + (" + " + case if case else "") + if category: + gloss_data["tags"].append(category) + + # XXX: Investigate better ways to handle free text in K template + ft = template_node.template_parameters.get("ft") + if ft: + wxr.wtp.debug( + f"Found ft '{ft}' in K template which could be considered part of the gloss. 
Moved to tags for now.", + sortid="extractor/de/glosses/extract_glosses/63", + ) + + # Remove the template_node from the children of list_item_node + list_item_node.children = [ + c for c in list_item_node.children if c != template_node + ] + + +def extract_tags_from_gloss_text( + gloss_data: defaultdict(list), gloss_text: str +) -> None: + parts = gloss_text.split(":", 1) + if len(parts) > 1: + tags_part = parts[0].strip() + + categories = [c.strip() for c in re.split(",", tags_part)] + if all(c.isalnum() for c in categories): + gloss_data["tags"].extend(categories) + return parts[1].strip() + + return gloss_text diff --git a/src/wiktextract/extractor/de/page.py b/src/wiktextract/extractor/de/page.py index 07ba4d88..29b36d0f 100644 --- a/src/wiktextract/extractor/de/page.py +++ b/src/wiktextract/extractor/de/page.py @@ -7,9 +7,11 @@ from wikitextprocessor.parser import LevelNode from wiktextract.datautils import append_base_data +from wiktextract.extractor.de.pronunciation import extract_pronunciation from wiktextract.wxr_context import WiktextractContext from .gloss import extract_glosses +from .example import extract_examples # Templates that are used to form panels on pages and that should be ignored in # various positions @@ -27,108 +29,10 @@ # Templates that should not be pre-expanded DO_NOT_PRE_EXPAND_TEMPLATES = { "Ü-Tabelle", # Translation table - "Quellen", # Can be ignored since we have the tags in the tree + "Übersetzungen umleiten", # Translation table redirect } -def fix_level_hierarchy_of_subsections( - wxr: WiktextractContext, tree: List[WikiNode] -) -> List[WikiNode]: - """ - This function introduces level hierarchy to subsections and their content. - - The German Wiktionary does generally not use level 4 headings but instead - uses templates to define the subsections. These templates are usually - followed by a list of content that belongs to the subsection. Yet, in the - tree the content is on the same level as the subsection template. 
In Gernman - wiktionary, for cosmetic reasons, a level 4 heading is used to introduce the - translation subsection that then also contains other subsections not related - to translations. - - See: - https://de.wiktionary.org/wiki/Hilfe:Formatvorlage#Der_%E2%80%9EEndteil%E2%80%9C - """ - level_nodes: List[WikiNode] = [] - for node in tree: - if isinstance(node, WikiNode): - # A level 4 heading is used to introduce the translation - # section. - if node.kind == NodeKind.LEVEL4: - # Find the index of the first template after the Ü-Tabelle - # template - split_idx = len(node.children) - for idx, child in enumerate(node.children): - if split_idx < len(node.children): - if ( - isinstance(child, WikiNode) - and child.kind == NodeKind.TEMPLATE - ): - break - else: - split_idx = idx + 1 - if ( - isinstance(child, WikiNode) - and child.kind == NodeKind.TEMPLATE - and child.template_name == "Ü-Tabelle" - ): - split_idx = idx + 1 - - children_until_translation_table = node.children[:split_idx] - - children_after_translation_table = node.children[split_idx:] - - node.children = children_until_translation_table - level_nodes.append(node) - - level_nodes.extend( - fix_level_hierarchy_of_subsections( - wxr, children_after_translation_table - ) - ) - - elif node.kind == NodeKind.TEMPLATE: - level_node = LevelNode(NodeKind.LEVEL4, node.loc) - level_node.largs = [[node]] - level_nodes.append(level_node) - - elif node.kind == NodeKind.LIST: - if len(level_nodes) > 0: - level_nodes[-1].children.append(node) - else: - wxr.wtp.debug( - f"Unexpected list while introducing level hierarchy: {node}", - sortid="extractor/de/page/introduce_level_hierarchy/52", - ) - continue - - # Sometimes links are used outside of a section to link the whole - # entry to a category. We treat them here as level 4 headings, - # without any children. - elif node.kind == NodeKind.LINK: - level_node = LevelNode(NodeKind.LEVEL4, node.loc) - level_node.largs = [[node]] - level_nodes.append(level_node) - - # ignore
tags - elif node.kind == NodeKind.HTML and node.sarg == "br": - pass - else: - wxr.wtp.debug( - f"Unexpected WikiNode while introducing level hierarchy: {node}", - sortid="extractor/de/page/introduce_level_hierarchy/55", - ) - else: - if not len(level_nodes): - if not isinstance(node, str) or not node.strip() == "": - wxr.wtp.debug( - f"Unexpected string while introducing level hierarchy: {node}", - sortid="extractor/de/page/introduce_level_hierarchy/61", - ) - continue - level_nodes[-1].children.append(node) - return level_nodes - - def parse_section( wxr: WiktextractContext, page_data: List[Dict], @@ -161,15 +65,17 @@ def parse_section( ) return - # Level 4 headings were introduced by fix_level_hierarchy_of_subsections() - # for subsections that are introduced by templates. + # Level 4 headings were introduced by overriding the default templates. + # See overrides/de.json for details. elif level_node.kind == NodeKind.LEVEL4: - for template_node in level_node.find_content(NodeKind.TEMPLATE): - section_name = template_node.template_name - wxr.wtp.start_subsection(section_name) - if section_name == "Bedeutungen": - for list_node in level_node.find_child(NodeKind.LIST): - extract_glosses(wxr, page_data, list_node) + section_name = level_node.largs[0][0] + wxr.wtp.start_subsection(section_name) + if section_name == "Bedeutungen": + extract_glosses(wxr, page_data, level_node) + if section_name == "Aussprache": + extract_pronunciation(wxr, page_data, level_node) + if section_name == "Beispiele": + extract_examples(wxr, page_data, level_node) FORM_POS = { @@ -292,10 +198,22 @@ def process_pos_section( sortid="extractor/de/page/process_pos_section/31", ) - subsections = fix_level_hierarchy_of_subsections(wxr, level_node.children) + for level_4_node in level_node.find_child(NodeKind.LEVEL4): + parse_section(wxr, page_data, base_data, level_4_node) - for subsection in subsections: - parse_section(wxr, page_data, base_data, subsection) + for non_l4_node in 
level_node.invert_find_child(NodeKind.LEVEL4): + if ( + isinstance(non_l4_node, WikiNode) + and non_l4_node.kind == NodeKind.TEMPLATE + and "Übersicht" in non_l4_node.template_name + ): + # XXX: de: Extract form tables + pass + else: + wxr.wtp.debug( + f"Unexpected node in pos section: {non_l4_node}", + sortid="extractor/de/page/process_pos_section/41", + ) return @@ -310,13 +228,12 @@ def parse_page( # Parse the page, pre-expanding those templates that are likely to # influence parsing + DO_NOT_PRE_EXPAND_TEMPLATES.update(wxr.config.DE_FORM_TABLES) tree = wxr.wtp.parse( page_text, pre_expand=True, additional_expand=ADDITIONAL_EXPAND_TEMPLATES, - do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES.update( - wxr.config.DE_FORM_TABLES - ), + do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, ) page_data = [] diff --git a/src/wiktextract/extractor/de/pronunciation.py b/src/wiktextract/extractor/de/pronunciation.py new file mode 100644 index 00000000..b5fb1d0d --- /dev/null +++ b/src/wiktextract/extractor/de/pronunciation.py @@ -0,0 +1,181 @@ +from collections import defaultdict +from typing import Dict, List, Union + +from wikitextprocessor import NodeKind, WikiNode +from wikitextprocessor.parser import LevelNode +from wiktextract.extractor.share import create_audio_url_dict + +from wiktextract.page import clean_node +from wiktextract.wxr_context import WiktextractContext + + +def extract_pronunciation( + wxr: WiktextractContext, + page_data: List[Dict], + level_node: LevelNode, +): + for list_node in level_node.find_child(NodeKind.LIST): + sound_data = [defaultdict(list)] + + for not_list_item_node in list_node.invert_find_child( + NodeKind.LIST_ITEM + ): + wxr.wtp.debug( + f"Found unexpected non-list-item node in pronunciation section: {not_list_item_node}", + sortid="extractor/de/pronunciation/extract_pronunciation/28", + ) + + for list_item_node in list_node.find_child(NodeKind.LIST_ITEM): + children = list(list_item_node.filter_empty_str_child()) + if len(children) == 0: + 
continue + + head_template, rest = children[0], children[1:] + if ( + not isinstance(head_template, WikiNode) + or head_template.kind != NodeKind.TEMPLATE + or not rest + ): + wxr.wtp.debug( + f"Found unexpected non-template node in pronunciation section: {head_template}", + sortid="extractor/de/pronunciation/extract_pronunciation/37", + ) + continue + if head_template.template_name == "IPA": + process_ipa(wxr, sound_data, rest) + elif head_template.template_name == "Hörbeispiele": + sound_data.append(defaultdict(list)) + process_hoerbeispiele(wxr, sound_data, rest) + elif head_template.template_name == "Reime": + process_rhymes(wxr, sound_data, rest) + else: + wxr.wtp.debug( + f"Found unexpected template in pronunciation section: {head_template} with content {rest}", + sortid="extractor/de/pronunciation/extract_pronunciation/45)", + ) + + # Remove empty entries + sound_data = [entry for entry in sound_data if entry != {}] + if len(sound_data) > 0: + page_data[-1]["sounds"].extend(sound_data) + + for non_list_node in level_node.invert_find_child(NodeKind.LIST): + wxr.wtp.debug( + f"Found unexpected non-list node in pronunciation section: {non_list_node}", + sortid="extractor/de/pronunciation/extract_pronunciation/64", + ) + + +def process_ipa( + wxr: WiktextractContext, + sound_data: List[Dict], + nodes: List[Union[WikiNode, str]], +): + for node in nodes: + if is_template_node_with_name(node, "Lautschrift"): + process_lautschrift_template(wxr, sound_data, node) + elif is_tag_node(node): + append_tag(wxr, sound_data, node) + elif is_new_sound_data_entry_sep(node): + sound_data.append(defaultdict(list)) + else: + wxr.wtp.debug( + f"Found unexpected non-Lautschrift node in IPA section: {node}", + sortid="extractor/de/pronunciation/process_ipa/57", + ) + + +def process_lautschrift_template( + wxr: WiktextractContext, sound_data: List[Dict], node +): + template_parameters = node.template_parameters + + ipa = template_parameters.get(1) + + lang_code = 
template_parameters.get("spr") + if lang_code: + language = wxr.wtp.LANGUAGES_BY_CODE[lang_code] + add_sound_data_without_appending_to_existing_properties( + sound_data, + { + "ipa": [ipa], + "lang_code": lang_code, + "language": language, + }, + ) + else: + sound_data[-1]["ipa"].append(ipa) + + +def process_hoerbeispiele( + wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode] +): + for node in nodes: + if is_template_node_with_name(node, "Audio"): + process_audio_template(wxr, sound_data, node) + elif is_tag_node(node): + append_tag(wxr, sound_data, node) + elif is_new_sound_data_entry_sep(node): + sound_data.append(defaultdict(list)) + else: + wxr.wtp.debug( + f"Found unexpected node in Hoerbeispiele section: {node}", + sortid="extractor/de/pronunciation/process_hoerbeispiele/193", + ) + + +def process_audio_template( + wxr: WiktextractContext, sound_data: List[Dict], node +): + audio_file = node.template_parameters.get(1) + if audio_file: + add_sound_data_without_appending_to_existing_properties( + sound_data, create_audio_url_dict(audio_file) + ) + + +def process_rhymes( + wxr: WiktextractContext, sound_data: List[Dict], nodes: List[WikiNode] +): + # XXX: Extract rhymes from the referenced rhymes page + pass + + +def is_template_node_with_name(node: Union[WikiNode, str], template_name: str): + return ( + isinstance(node, WikiNode) + and node.kind == NodeKind.TEMPLATE + and node.template_name == template_name + ) + + +def add_sound_data_without_appending_to_existing_properties( + sound_data: List[Dict], + new_sound_data: Dict, +): + """Creates a new IPA data entry if properties exist in previous entry.""" + if any([key in sound_data[-1] for key in new_sound_data.keys()]): + sound_data.append(defaultdict(list)) + + for key, value in new_sound_data.items(): + if isinstance(value, str): + sound_data[-1][key] = value + else: + sound_data[-1][key].extend(value) + + +def is_tag_node(node: Union[WikiNode, str]): + return isinstance(node, WikiNode) and 
node.kind in [ + NodeKind.TEMPLATE, + NodeKind.ITALIC, + ] + + +def append_tag(wxr: WiktextractContext, sound_data: Dict, node: WikiNode): + tag = clean_node(wxr, {}, node).strip() + if tag: + sound_data[-1]["tags"].append(tag) + + +def is_new_sound_data_entry_sep(node: Union[WikiNode, str]): + return isinstance(node, str) and node.strip() in [",", ";"] diff --git a/src/wiktextract/extractor/de/utils.py b/src/wiktextract/extractor/de/utils.py new file mode 100644 index 00000000..73416645 --- /dev/null +++ b/src/wiktextract/extractor/de/utils.py @@ -0,0 +1,24 @@ +import re +from wikitextprocessor import NodeKind, WikiNode + + +def match_senseid(node_text: str): + match = re.match(r"\[(\d*[a-z]?)\]", node_text) + + if match: + senseid = match.group(1) + node_text = node_text[match.end() :].strip() + else: + senseid = None + + return senseid, node_text + + +def find_and_remove_child(node: WikiNode, kind: NodeKind, cb=None): + children = [] + for idx, child in reversed(list(node.find_child(kind, with_index=True))): + if cb and not cb(child): + continue + del node.children[idx] + children.append(child) + return reversed(children) diff --git a/tests/test_de_example.py b/tests/test_de_example.py new file mode 100644 index 00000000..980a0be6 --- /dev/null +++ b/tests/test_de_example.py @@ -0,0 +1,140 @@ +import unittest +from collections import defaultdict + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.de.example import extract_examples, extract_reference + +from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.wxr_context import WiktextractContext + + +class TestDEExample(unittest.TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + close_thesaurus_db( + self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn + ) + + 
def test_de_extract_examples(self): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse( + ":[1] example1A \n:[1] example1B\n:[2] example2\n:[3] example3" + ) + + page_data = [defaultdict(list)] + page_data[-1]["senses"] = [ + defaultdict(list, {"senseid": "1"}), + defaultdict(list, {"senseid": "2"}), + ] + + extract_examples(self.wxr, page_data, root) + + self.assertEqual( + page_data, + [ + { + "senses": [ + { + "examples": [ + {"text": "example1A"}, + {"text": "example1B"}, + ], + "senseid": "1", + }, + { + "examples": [{"text": "example2"}], + "senseid": "2", + }, + ] + } + ], + ) + + def test_de_extract_example_with_reference(self): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse(":[1] example1 ref1A") + + page_data = [defaultdict(list)] + page_data[-1]["senses"] = [ + defaultdict(list, {"senseid": "1"}), + ] + + extract_examples(self.wxr, page_data, root) + + self.assertEqual( + page_data, + [ + { + "senses": [ + { + "examples": [ + { + "text": "example1", + "ref": {"raw_ref": "ref1A"}, + }, + ], + "senseid": "1", + }, + ] + } + ], + ) + + def test_de_extract_reference_from_literatur_template(self): + # https://de.wiktionary.org/wiki/Beispiel + self.wxr.wtp.start_page("Beispiel") + self.wxr.wtp.add_page("Vorlage:Literatur", 10, "Expanded template") + root = self.wxr.wtp.parse( + "{{Literatur|Autor=Steffen Möller|Titel=Viva Warszawa|TitelErg=Polen für Fortgeschrittene|Verlag=Piper|Ort=München/Berlin|Jahr=2015}}, Seite 273. ISBN 978-3-89029-459-9." + ) + + example_data = defaultdict(str) + + extract_reference(self.wxr, example_data, root.children[0]) + + self.assertEqual( + example_data, + { + "ref": { + "raw_ref": "Expanded template, Seite 273. 
ISBN 978-3-89029-459-9.", + "titel": "Viva Warszawa", + "autor": "Steffen Möller", + "titelerg": "Polen für Fortgeschrittene", + "verlag": "Piper", + "ort": "München/Berlin", + "jahr": "2015", + } + }, + ) + + def test_de_extract_reference_from_templates_without_named_args(self): + # https://de.wiktionary.org/wiki/Beispiel + # Reference templates not following the Literatur template pattern are + # currently not extracted field by field (e.g. Vorlage:Ref-OWID) + self.wxr.wtp.start_page("Beispiel") + self.wxr.wtp.add_page("Vorlage:Ref-OWID", 10, "Expanded template") + root = self.wxr.wtp.parse( + "{{Ref-OWID|Sprichwörter|401781|Schlechte Beispiele verderben gute Sitten.}}" + ) + + example_data = defaultdict(str) + + extract_reference(self.wxr, example_data, root.children[0]) + + self.assertEqual( + example_data, + { + "ref": { + "raw_ref": "Expanded template", + } + }, + ) diff --git a/tests/test_de_gloss.py b/tests/test_de_gloss.py index 3d19861b..02a41751 100644 --- a/tests/test_de_gloss.py +++ b/tests/test_de_gloss.py @@ -1,18 +1,26 @@ import unittest from collections import defaultdict +from unittest.mock import patch from wikitextprocessor import Wtp from wiktextract.config import WiktionaryConfig -from wiktextract.extractor.de.gloss import extract_glosses +from wiktextract.extractor.de.gloss import ( + extract_glosses, + extract_tags_from_gloss_text, + process_K_template, +) from wiktextract.thesaurus import close_thesaurus_db from wiktextract.wxr_context import WiktextractContext class TestGlossList(unittest.TestCase): + maxDiff = None + def setUp(self) -> None: self.wxr = WiktextractContext( - Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") + Wtp(lang_code="de"), + WiktionaryConfig(dump_file_lang_code="de"), ) def tearDown(self) -> None: @@ -27,7 +35,7 @@ def test_de_extract_glosses(self): page_data = [defaultdict(list)] - extract_glosses(self.wxr, page_data, root.children[0]) + extract_glosses(self.wxr, page_data, root) self.assertEqual( 
page_data, @@ -36,11 +44,243 @@ def test_de_extract_glosses(self): "senses": [ { "glosses": ["gloss1"], + "raw_glosses": ["[1] gloss1"], + "senseid": "1", }, { "glosses": ["gloss2"], + "raw_glosses": ["[2] gloss2"], + "senseid": "2", + }, + ] + } + ], + ) + + def test_de_extract_glosses_with_subglosses(self): + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse( + ":[1] gloss1\n::[a] subglossA\n::[b] subglossB" + ) + + page_data = [defaultdict(list)] + + extract_glosses(self.wxr, page_data, root) + + self.assertEqual( + page_data, + [ + { + "senses": [ + { + "glosses": ["gloss1"], + "raw_glosses": ["[1] gloss1"], + "senseid": "1", + }, + { + "glosses": ["subglossA"], + "raw_glosses": ["[a] subglossA"], + "senseid": "1a", + }, + { + "glosses": ["subglossB"], + "raw_glosses": ["[b] subglossB"], + "senseid": "1b", + }, + ] + } + ], + ) + + def test_de_extract_glosses_with_only_subglosses(self): + self.wxr.wtp.add_page("Vorlage:K", 10, "tag") + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse( + ":[1] {{K|tag}}\n::[a] subglossA\n::[1b] subglossB" + ) + + page_data = [defaultdict(list)] + + extract_glosses(self.wxr, page_data, root) + self.assertEqual( + page_data, + [ + { + "senses": [ + { + "tags": ["tag"], + "glosses": ["subglossA"], + "raw_glosses": ["[a] subglossA"], + "senseid": "1a", + }, + { + "tags": ["tag"], + "glosses": ["subglossB"], + "raw_glosses": ["[1b] subglossB"], + "senseid": "1b", }, ] } ], ) + + def test_process_K_template_removes_K_template_nodes(self): + self.wxr.wtp.add_page("Vorlage:K", 10, "tag1, tag2") + self.wxr.wtp.start_page("") + root = self.wxr.wtp.parse("{{K|tag1|tag2}} gloss1") + + gloss_data = defaultdict(list) + + self.assertEqual(len(root.children), 2) + + process_K_template(self.wxr, gloss_data, root) + + self.assertEqual( + gloss_data, + { + "tags": ["tag1", "tag2"], + }, + ) + + self.assertEqual(len(root.children), 1) + + def get_mock(self, mock_return_value: str): + def generic_mock(*args, **kwargs): + return 
mock_return_value + + return generic_mock + + def test_process_K_template(self): + # Test cases chosen from: + # https://de.wiktionary.org/wiki/Vorlage:K/Doku + test_cases = [ + # https://de.wiktionary.org/wiki/delektieren + # One tag + { + "input": "{{K|refl.}}", + "expected_tags": ["reflexiv"], + "mock_return": "reflexiv:", + }, + # https://de.wiktionary.org/wiki/abbreviare + # With ft and spr args + { + "input": "{{K|trans.|ft=etwas in seinem [[räumlich]]en oder [[zeitlich]]en [[Ausmaß]] verringern|spr=it}}", + "expected_tags": [ + "transitiv", + "etwas in seinem räumlichen oder zeitlichen Ausmaß verringern", + ], + "mock_return": "transitiv, etwas in seinem räumlichen oder zeitlichen Ausmaß verringern:", + }, + # https://de.wiktionary.org/wiki/abbreviare + # With multiple tags + { + "input": "{{K|trans.|Linguistik|Wortbildung|spr=it}}", + "expected_tags": [ + "transitiv", + "Linguistik", + "Wortbildung", + ], + "mock_return": "transitiv, Linguistik, Wortbildung:", + }, + # https://de.wiktionary.org/wiki/almen + # Ideally we would filter out "besonders" but there doesn't seem + # to be a general rule which tags are semmantially relevant + # With multiple tags and t1, t2 args + { + "input": "{{K|trans.|t1=;|besonders|t2=_|bayrisch|österr.}}", + "expected_tags": [ + "transitiv", + "besonders bayrisch", + "österreichisch", + ], + "mock_return": "transitiv, besonders bayrisch, österreichisch", + }, + # https://de.wiktionary.org/wiki/einlaufen + # With two tags and t7 arg + { + "input": "{{K|intrans.|Nautik|t7=_|ft=(von Schiffen)}}", + "expected_tags": ["intransitiv", "Nautik (von Schiffen)"], + "mock_return": "intransitiv, Nautik (von Schiffen):", + }, + # https://de.wiktionary.org/wiki/zählen + # With Prä and Kas args + { + "input": "{{K|intrans.|Prä=auf|Kas=Akk.|ft=(auf jemanden/etwas zählen)}}", + "expected_tags": [ + "intransitiv", + "(auf jemanden/etwas zählen)", + "auf + Akk.", + ], + "mock_return": "intransitiv, (auf jemanden/etwas zählen):", + }, + # 
https://de.wiktionary.org/wiki/bojovat + # With Prä and Kas args and redundant ft arg + { + "input": "{{K|intrans.|Prä=proti|Kas=Dativ||ft=bojovat [[proti]] + [[Dativ]]|spr=cs}}", + "expected_tags": [ + "intransitiv", + "bojovat proti + Dativ", + "proti + Dativ", + ], + "mock_return": "intransitiv, bojovat proti + Dativ:", + }, + ] + + for case in test_cases: + with self.subTest(case=case): + gloss_data = defaultdict(list) + + self.wxr.wtp.start_page("") + + root = self.wxr.wtp.parse(case["input"]) + + with patch( + "wiktextract.extractor.de.gloss.clean_node", + self.get_mock(case["mock_return"]), + ): + process_K_template(self.wxr, gloss_data, root) + self.assertEqual( + gloss_data["tags"], + case["expected_tags"], + ) + + def test_de_extract_tags_from_gloss_text(self): + test_cases = [ + # https://de.wiktionary.org/wiki/Hengst + { + "input": "Zoologie: männliches Tier aus der Familie der Einhufer und Kamele", + "expected_tags": ["Zoologie"], + "expected_gloss": "männliches Tier aus der Familie der Einhufer und Kamele", + }, + # https://de.wiktionary.org/wiki/ARD + { + "input": "umgangssprachlich, Kurzwort, Akronym: für das erste Fernsehprogramm der ARD", + "expected_tags": ["umgangssprachlich", "Kurzwort", "Akronym"], + "expected_gloss": "für das erste Fernsehprogramm der ARD", + }, + # https://de.wiktionary.org/wiki/Endspiel + { + "input": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg", + "expected_tags": None, + "expected_gloss": "Drama von Samuel Beckett: Menschliche Existenz in der Endphase des Verfalls und der vergeblichen Suche nach einem Ausweg", + } + # Add more test cases as needed + ] + for case in test_cases: + with self.subTest(case=case): + gloss_data = defaultdict(list) + + gloss_text = extract_tags_from_gloss_text( + gloss_data, case["input"] + ) + + if case["expected_tags"] is None: + self.assertEqual(gloss_data, {}) + else: + self.assertEqual( + gloss_data, + { + "tags": 
case["expected_tags"], + }, + ) + self.assertEqual(gloss_text, case["expected_gloss"]) diff --git a/tests/test_de_page.py b/tests/test_de_page.py index 47a1aef7..0e2c14c0 100644 --- a/tests/test_de_page.py +++ b/tests/test_de_page.py @@ -7,7 +7,6 @@ from wiktextract.config import WiktionaryConfig from wiktextract.extractor.de.page import ( - fix_level_hierarchy_of_subsections, parse_page, parse_section, ) @@ -28,6 +27,7 @@ def setUp(self): # capture_examples=True, ) self.wxr = WiktextractContext(Wtp(lang_code="de"), conf1) + self.maxDiff = None def tearDown(self) -> None: self.wxr.wtp.close_db_conn() @@ -82,21 +82,11 @@ def test_de_parse_page_skipping_head_templates(self): # The way append_base_data() works requires the presence of a sense # dictionary before starting a new pos section. Therefore, we need to add # at least one sense data point to the test case. + def test_de_parse_section(self): self.wxr.wtp.add_page("Vorlage:Wortart", 10, "") self.wxr.wtp.add_page("Vorlage:Bedeutungen", 10, "") - page_text = """ -=== {{Wortart|Adjektiv|Englisch}}, {{Wortart|Adverb|Englisch}} === -{{Bedeutungen}} -:[1] gloss1 -=== {{Wortart|Verb|Englisch}} === -{{Bedeutungen}} -:[1] gloss2 -=== {{Wortart|Substantiv|Englisch}} === -{{Bedeutungen}} -:[1] gloss3 - -""" + page_text = "=== {{Wortart|Adjektiv|Englisch}}, {{Wortart|Adverb|Englisch}} ===\n====Bedeutungen====\n:[1] gloss1\n=== {{Wortart|Verb|Englisch}} ===\n====Bedeutungen====\n:[1] gloss2\n=== {{Wortart|Substantiv|Englisch}} ===\n====Bedeutungen====\n:[1] gloss3" self.wxr.wtp.start_page("") root = self.wxr.wtp.parse( page_text, @@ -116,6 +106,8 @@ def test_de_parse_section(self): "senses": [ { "glosses": ["gloss1"], + "senseid": "1", + "raw_glosses": ["[1] gloss1"], }, ], }, @@ -125,6 +117,8 @@ def test_de_parse_section(self): "senses": [ { "glosses": ["gloss1"], + "senseid": "1", + "raw_glosses": ["[1] gloss1"], }, ], }, @@ -134,6 +128,8 @@ def test_de_parse_section(self): "senses": [ { "glosses": ["gloss2"], + "senseid": 
"1", + "raw_glosses": ["[1] gloss2"], }, ], }, @@ -143,70 +139,10 @@ def test_de_parse_section(self): "senses": [ { "glosses": ["gloss3"], + "senseid": "1", + "raw_glosses": ["[1] gloss3"], }, ], }, ], ) - - def test_de_fix_level_hierarchy_of_subsections(self): - self.wxr.wtp.add_page("Vorlage:Englisch Substantiv Übersicht", 10, "") - self.wxr.wtp.add_page("Vorlage:Worttrennung", 10, "") - self.wxr.wtp.add_page("Vorlage:Aussprache", 10, "") - self.wxr.wtp.add_page("Vorlage:Übersetzungen", 10, "") - self.wxr.wtp.add_page("Vorlage:Ü-Tabelle", 10, "") - self.wxr.wtp.add_page("Vorlage:Referenzen", 10, "") - - page_text = """ -{{Englisch Substantiv Übersicht -|args=args}} - -{{Worttrennung}} -:item - -{{Aussprache}} -:item - -==== {{Übersetzungen}} ==== -{{Ü-Tabelle|1|G=arg|Ü-Liste= -:item -}} - -{{Referenzen}} -:item -""" - self.wxr.wtp.start_page("") - root = self.wxr.wtp.parse( - page_text, - pre_expand=True, - ) - - subsections = fix_level_hierarchy_of_subsections( - self.wxr, root.children - ) - - target_page_text = """==== {{Englisch Substantiv Übersicht\n|args=args}} ==== - -==== {{Worttrennung}} ==== -:item - -==== {{Aussprache}} ==== -:item - -==== {{Übersetzungen}} ==== -{{Ü-Tabelle|1|G=arg|Ü-Liste= -:item -}} - -==== {{Referenzen}} ==== -:item -""" - root = self.wxr.wtp.parse( - target_page_text, - pre_expand=True, - ) - - self.assertEqual( - [str(s) for s in subsections], - [str(t) for t in root.children], - ) diff --git a/tests/test_de_pronunciation.py b/tests/test_de_pronunciation.py new file mode 100644 index 00000000..6fae64eb --- /dev/null +++ b/tests/test_de_pronunciation.py @@ -0,0 +1,176 @@ +import unittest +from collections import defaultdict + +from wikitextprocessor import Wtp + +from wiktextract.config import WiktionaryConfig +from wiktextract.extractor.de.pronunciation import ( + process_ipa, + process_hoerbeispiele, +) +from wiktextract.thesaurus import close_thesaurus_db +from wiktextract.wxr_context import WiktextractContext + + +class 
TestDEPronunciation(unittest.TestCase): + maxDiff = None + + def setUp(self) -> None: + self.wxr = WiktextractContext( + Wtp(lang_code="de"), WiktionaryConfig(dump_file_lang_code="de") + ) + + def tearDown(self) -> None: + self.wxr.wtp.close_db_conn() + close_thesaurus_db( + self.wxr.thesaurus_db_path, self.wxr.thesaurus_db_conn + ) + + def test_de_process_ipa(self): + test_cases = [ + { + "input": "{{Lautschrift|ipa1}}", + "expected": [ + { + "ipa": ["ipa1"], + } + ], + }, + { + "input": "{{Lautschrift|ipa1|spr=de}}", + "expected": [ + {"ipa": ["ipa1"], "language": "Deutsch", "lang_code": "de"} + ], + }, + { + "input": "{{Lautschrift|ipa1}} {{Lautschrift|ipa2}}{{Lautschrift|ipa3|spr=de}}", + "expected": [ + {"ipa": ["ipa1", "ipa2"]}, + {"ipa": ["ipa3"], "language": "Deutsch", "lang_code": "de"}, + ], + }, + { + "input": "{{Lautschrift|ipa1}}, ''tag1'' {{Lautschrift|ipa2}}", + "expected": [ + {"ipa": ["ipa1"]}, + {"ipa": ["ipa2"], "tags": ["tag1"]}, + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + self.wxr.wtp.add_page("Vorlage:IPA", 10, "") + self.wxr.wtp.add_page("Vorlage:Lautschrift", 10, "(Deutsch)") + + self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch" + + root = self.wxr.wtp.parse(case["input"]) + + sound_data = [defaultdict(list)] + + process_ipa( + self.wxr, sound_data, list(root.filter_empty_str_child()) + ) + + self.assertEqual(sound_data, case["expected"]) + + def test_de_process_hoerbeispiele(self): + # https://de.wiktionary.org/wiki/Beispiel + filename1 = "De-Beispiel.ogg" + # https://de.wiktionary.org/wiki/butineur + filename2 = "LL-Q150 (fra)-WikiLucas00-butineur.wav" + test_cases = [ + { + "input": "{{Audio|" + filename1 + "}}", + "expected": [ + { + "audio": filename1, + "mp3_url": None, # None indicates we don't care about the exact value + "ogg_url": None, + } + ], + }, + { + "input": "{{Audio|" + + filename1 + + "}} {{Audio|" + + filename2 + + "}}", + "expected": [ + { + "audio": filename1, 
+ "mp3_url": None, + "ogg_url": None, + }, + { + "audio": filename2, + "ogg_url": None, + "mp3_url": None, + "wav_url": None, + }, + ], + }, + { + "input": "{{Audio|" + + filename1 + + "}} ''tag1'', ''tag2'' {{Audio|" + + filename2 + + "}}", + "expected": [ + { + "audio": filename1, + "mp3_url": None, + "ogg_url": None, + "tags": ["tag1"], + }, + { + "audio": filename2, + "mp3_url": None, + "ogg_url": None, + "wav_url": None, + "tags": ["tag2"], + }, + ], + }, + ] + + for case in test_cases: + with self.subTest(case=case): + self.wxr.wtp.start_page("") + self.wxr.wtp.add_page("Vorlage:IPA", 10, "") + self.wxr.wtp.add_page("Vorlage:Audio", 10, "") + + self.wxr.wtp.LANGUAGES_BY_CODE["de"] = "Deutsch" + + root = self.wxr.wtp.parse(case["input"]) + + sound_data = [defaultdict(list)] + + process_hoerbeispiele( + self.wxr, sound_data, list(root.filter_empty_str_child()) + ) + + self.assertSoundDataMatchesExpected( + sound_data, case["expected"] + ) + + def assertSoundDataMatchesExpected(self, sound_data, expected): + self.assertEqual( + len(sound_data), + len(expected), + f"Mismatch in number of sound data entries{sound_data}", + ) + + for data, exp in zip(sound_data, expected): + for key, value in exp.items(): + if value is None: + self.assertIn(key, data) + else: + self.assertEqual(data[key], value) + + for key in data: + self.assertIn(key, exp) + if exp[key] is not None: + self.assertEqual(data[key], exp[key])