From 6edbb3f75b5b367548f800afaa4cd7181733b0ac Mon Sep 17 00:00:00 2001
From: xxyzz <gitpull@protonmail.com>
Date: Tue, 7 Nov 2023 16:37:28 +0800
Subject: [PATCH] Split `--language` option into `--language-code` and
 `--language-name`

---
 README.md                    |  3 ++-
 src/wiktextract/config.py    |  2 +-
 src/wiktextract/wiktwords.py | 45 +++++++++++++++++++++++++-----------
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index bb600b06..70d1babf 100644
--- a/README.md
+++ b/README.md
@@ -408,7 +408,8 @@ The following command-line options can be used to control its operation:
 
 * --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
 * --all-languages: extract words for all available languages
-* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
+* --language-code LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; defaults to the dump file's language code and `mul` (Translingual))
+* --language-name LANGUAGE_NAME: similar to `--language-code`, but takes a language name instead of a language code (this option may be specified multiple times)
 * --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
 * --all: causes all data to be captured for the selected languages
 * --translations: causes translations to be captured
diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
index 41829c13..98d44f58 100644
--- a/src/wiktextract/config.py
+++ b/src/wiktextract/config.py
@@ -58,7 +58,7 @@ class WiktionaryConfig:
     def __init__(
         self,
         dump_file_lang_code="en",
-        capture_language_codes=["en", "mul"],
+        capture_language_codes={"en", "mul"},
         capture_translations=True,
         capture_pronunciation=True,
         capture_linkages=True,
diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py
index 10d8821a..d9423c21 100755
--- a/src/wiktextract/wiktwords.py
+++ b/src/wiktextract/wiktwords.py
@@ -23,7 +23,7 @@
 else:
     from importlib.resources import files
 
-from mediawiki_langcodes import code_to_name
+from mediawiki_langcodes import code_to_name, name_to_code
 from wikitextprocessor import Wtp
 from wikitextprocessor.dumpparser import analyze_and_overwrite_pages
 
@@ -125,12 +125,20 @@ def main():
         help="Language code of the dump file.",
     )
     parser.add_argument(
-        "--language",
+        "--language-code",
         type=str,
         action="append",
         default=[],
         help="Language code to capture (can specify multiple times, defaults "
-        "to English [en] and Translingual [mul])",
+        "to dump file language code and `mul`(Translingual))",
+    )
+    parser.add_argument(
+        "--language-name",
+        type=str,
+        action="append",
+        default=[],
+        help="Language names to capture (can specify multiple times, defaults "
+        "to dump file language and Translingual)",
     )
     parser.add_argument(
         "--all-languages",
@@ -331,19 +339,30 @@ def main():
         args.inflections = True
         args.descendants = True
 
-    # Default to English and Translingual if language not specified.
-    if not args.language:
-        args.language = ["en", "mul"]
-    else:
-        for lang_code in args.language:
-            if code_to_name(lang_code) == "":
+    # Default to dump file language and Translingual if not specified.
+    capture_lang_codes = set()
+    if len(args.language_code) > 0:
+        for lang_code in args.language_code:
+            lang_name = code_to_name(lang_code, args.dump_file_language_code)
+            if lang_name == "":
                 logging.warning(f"Unknown language code: {lang_code}")
+            else:
+                capture_lang_codes.add(lang_code)
+    if len(args.language_name) > 0:
+        for lang_name in args.language_name:
+            lang_code = name_to_code(lang_name, args.dump_file_language_code)
+            if lang_code == "":
+                logging.warning(f"Unknown language name: {lang_name}")
+            else:
+                capture_lang_codes.add(lang_code)
+    if len(capture_lang_codes) == 0:
+        capture_lang_codes = {args.dump_file_language_code, "mul"}
 
     if args.all_languages:
-        args.language = None
-        print("Capturing words for all available languages")
+        capture_lang_codes = None
+        logging.info("Capturing words for all available languages")
     else:
-        print("Capturing words for:", ", ".join(args.language))
+        logging.info(f"Capturing words for: {', '.join(capture_lang_codes)}")
 
     # Open output file.
     out_path = args.out
@@ -363,7 +382,7 @@ def main():
 
     conf1 = WiktionaryConfig(
         dump_file_lang_code=args.dump_file_language_code,
-        capture_language_codes=args.language,
+        capture_language_codes=capture_lang_codes,
         capture_translations=args.translations,
         capture_pronunciation=args.pronunciations,
         capture_linkages=args.linkages,