Split the --language option into --language-code and --language-name

tatuylonen · Nov 7, 2023 · 6edbb3f · 6edbb3f
1 parent 7a765d8
commit 6edbb3f
Show file tree

Hide file tree

Showing 3 changed files with 35 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -408,7 +408,8 @@ The following command-line options can be used to control its operation:
 
 * --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
 * --all-languages: extract words for all available languages
-* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
+* --language-code LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; defaults to the dump file's language code and `mul` (Translingual))
+* --language-name LANGUAGE_NAME: similar to `--language-code`, except that this option accepts a language name instead of a language code (this option may also be specified multiple times)
 * --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
 * --all: causes all data to be captured for the selected languages
 * --translations: causes translations to be captured

diff --git a/src/wiktextract/config.py b/src/wiktextract/config.py
@@ -58,7 +58,7 @@ class WiktionaryConfig:
     def __init__(
         self,
         dump_file_lang_code="en",
-        capture_language_codes=["en", "mul"],
+        capture_language_codes={"en", "mul"},
         capture_translations=True,
         capture_pronunciation=True,
         capture_linkages=True,

diff --git a/src/wiktextract/wiktwords.py b/src/wiktextract/wiktwords.py
@@ -23,7 +23,7 @@
 else:
     from importlib.resources import files
 
-from mediawiki_langcodes import code_to_name
+from mediawiki_langcodes import code_to_name, name_to_code
 from wikitextprocessor import Wtp
 from wikitextprocessor.dumpparser import analyze_and_overwrite_pages
 
@@ -125,12 +125,20 @@ def main():
         help="Language code of the dump file.",
     )
     parser.add_argument(
-        "--language",
+        "--language-code",
         type=str,
         action="append",
         default=[],
         help="Language code to capture (can specify multiple times, defaults "
-        "to English [en] and Translingual [mul])",
+        "to dump file language code and `mul`(Translingual))",
+    )
+    parser.add_argument(
+        "--language-name",
+        type=str,
+        action="append",
+        default=[],
+        help="Language names to capture (can specify multiple times, defaults "
+        "to dump file language and Translingual)",
     )
     parser.add_argument(
         "--all-languages",
@@ -331,19 +339,30 @@ def main():
         args.inflections = True
         args.descendants = True
 
-    # Default to English and Translingual if language not specified.
-    if not args.language:
-        args.language = ["en", "mul"]
-    else:
-        for lang_code in args.language:
-            if code_to_name(lang_code) == "":
+    # Default to dump file language and Translingual if not specified.
+    capture_lang_codes = set()
+    if len(args.language_code) > 0:
+        for lang_code in args.language_code:
+            lang_name = code_to_name(lang_code, args.dump_file_language_code)
+            if lang_name == "":
                 logging.warning(f"Unknown language code: {lang_code}")
+            else:
+                capture_lang_codes.add(lang_code)
+    if len(args.language_name) > 0:
+        for lang_name in args.language_name:
+            lang_code = name_to_code(lang_name, args.dump_file_language_code)
+            if lang_code == "":
+                logging.warning(f"Unknown language name: {lang_name}")
+            else:
+                capture_lang_codes.add(lang_code)
+    if len(capture_lang_codes) == 0:
+        capture_lang_codes = {args.dump_file_language_code, "mul"}
 
     if args.all_languages:
-        args.language = None
-        print("Capturing words for all available languages")
+        capture_lang_codes = None
+        logging.info("Capturing words for all available languages")
     else:
-        print("Capturing words for:", ", ".join(args.language))
+        logging.info(f"Capturing words for: {', '.join(capture_lang_codes)}")
 
     # Open output file.
     out_path = args.out
@@ -363,7 +382,7 @@ def main():
 
     conf1 = WiktionaryConfig(
         dump_file_lang_code=args.dump_file_language_code,
-        capture_language_codes=args.language,
+        capture_language_codes=capture_lang_codes,
         capture_translations=args.translations,
         capture_pronunciation=args.pronunciations,
         capture_linkages=args.linkages,