Skip to content

Commit

Permalink
Keep API aligned between Spanish and German get_data_**.py scripts
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Oct 31, 2023
1 parent 3e3b67e commit a64e729
Showing 1 changed file with 12 additions and 15 deletions.
27 changes: 12 additions & 15 deletions languages/get_de_data.py → languages/get_data_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,28 +5,21 @@
# python get_data_de.py de dewiktionary_dump_file

import argparse
from wikitextprocessor import Wtp
from wiktextract.config import WiktionaryConfig
from wiktextract.wxr_context import WiktextractContext
from wiktextract.page import clean_node
from wikitextprocessor.dumpparser import process_dump
from wikitextprocessor import NodeKind, WikiNode

import json

from wikitextprocessor import NodeKind, WikiNode, Wtp
from wikitextprocessor.dumpparser import process_dump

from wiktextract.config import WiktionaryConfig
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Export Wiktionary language data to JSON"
)
parser.add_argument("lang_code", type=str, help="Dump file language code")
parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
parser.add_argument(
"--languages",
type=str,
default="languages.json",
help="Language data output file path",
)
args = parser.parse_args()
wxr = WiktextractContext(Wtp(lang_code=args.lang_code), WiktionaryConfig())

Expand All @@ -40,7 +33,7 @@
template_ns_id = wxr.wtp.NAMESPACE_DATA["Template"]["id"]
process_dump(wxr.wtp, args.dump, {help_ns_id, template_ns_id})

# The page 'Hilfe:Sprachkürzel seems to be the only central collection of
# The page 'Hilfe:Sprachkürzel' seems to be the only central collection of
# language codes and their German expansions. We will use it until we
# find a more authoritative source.
sprachkuerzel = wxr.wtp.get_page("Hilfe:Sprachkürzel")
Expand Down Expand Up @@ -68,5 +61,9 @@

languages[lang_code] = [clean_node(wxr, None, third_row_content)]

with open(args.languages, "w", encoding="utf-8") as fout:
with open(
f"src/wiktextract/data/{args.lang_code}/languages.json",
"w",
encoding="utf-8",
) as fout:
json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)

0 comments on commit a64e729

Please sign in to comment.