Skip to content

Commit

Permalink
Add languages.json for Spanish Wiktionary
Browse files Browse the repository at this point in the history
This work is a contribution to the EWOK project, which receives funding from LABEX ASLAN (ANR–10–LABX–0081) at the Université de Lyon, as part of the "Investissements d'Avenir" program initiated and overseen by the Agence Nationale de la Recherche (ANR) in France.
  • Loading branch information
empiriker committed Oct 31, 2023
1 parent 28697b9 commit 3e3b67e
Show file tree
Hide file tree
Showing 3 changed files with 3,702 additions and 0 deletions.
60 changes: 60 additions & 0 deletions languages/get_data_es.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Export Spanish Wiktionary language data to JSON.
#
# Usage:
#
# python languages/get_data_es.py es eswiktionary_dump_file

import argparse
import json

from wikitextprocessor import NodeKind, WikiNode, Wtp
from wikitextprocessor.dumpparser import process_dump

from wiktextract.config import WiktionaryConfig
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

if __name__ == "__main__":
    # Script entry point: read the language-code appendix page out of a
    # Wiktionary dump and write a {language code: [language name]} mapping
    # to src/wiktextract/data/<lang_code>/languages.json.
    parser = argparse.ArgumentParser(
        description="Export Wiktionary language data to JSON"
    )
    parser.add_argument("lang_code", type=str, help="Dump file language code")
    parser.add_argument("dump", type=str, help="Wiktionary xml dump file path")
    args = parser.parse_args()

    # Build the context exactly once. The previous version first created a
    # throwaway context with a default Wtp and then immediately replaced it.
    wxr = WiktextractContext(
        Wtp(
            lang_code=args.lang_code, db_path="wikt-db_es_language_data_temp.db"
        ),
        WiktionaryConfig(),
    )

    # Only the Appendix namespace id is handed to process_dump, since the
    # language table lives on an appendix page.
    appendix_ns_id = wxr.wtp.NAMESPACE_DATA["Appendix"]["id"]
    process_dump(wxr.wtp, args.dump, {appendix_ns_id})

    # https://es.wiktionary.org/wiki/Ap%C3%A9ndice:C%C3%B3digos_de_idioma
    page_title = "Apéndice:Códigos de idioma"
    codigos_de_idioma = wxr.wtp.get_page(page_title)
    if codigos_de_idioma is None or codigos_de_idioma.body is None:
        # Fail with a readable message instead of an AttributeError /
        # parse error further down when the page is not in the dump.
        raise SystemExit(
            f"Page {page_title!r} not found in dump {args.dump!r}; "
            "cannot export language data"
        )

    wxr.config.word = codigos_de_idioma.title
    wxr.wtp.start_page(codigos_de_idioma.title)
    tree = wxr.wtp.parse(
        codigos_de_idioma.body,
        pre_expand=True,
    )

    languages = {}
    for table in tree.find_child_recursively(NodeKind.TABLE):
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            cells = list(table_row.find_child(NodeKind.TABLE_CELL))
            # Rows without TABLE_CELL children produce no cells and are
            # skipped, as before. Rows with fewer than two cells, or with an
            # empty code/name cell, are skipped too instead of raising
            # IndexError.
            if len(cells) < 2:
                continue
            code_cell, name_cell = cells[0], cells[1]
            if not code_cell.children or not name_cell.children:
                continue

            # As in the original, only the first child node of each cell is
            # cleaned: first column is the code, second the language name.
            lang_code = clean_node(wxr, None, code_cell.children[0])
            lang_name = clean_node(wxr, None, name_cell.children[0])
            languages[lang_code] = [lang_name]

    with open(
        f"src/wiktextract/data/{args.lang_code}/languages.json",
        "w",
        encoding="utf-8",
    ) as fout:
        json.dump(languages, fout, indent=2, ensure_ascii=False, sort_keys=True)
4 changes: 4 additions & 0 deletions src/wiktextract/data/es/config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"analyze_templates": false,
"extract_thesaurus_pages": false
}
Loading

0 comments on commit 3e3b67e

Please sign in to comment.