Skip to content

Commit

Permalink
Break --language option to --language-code and --language-name
Browse files Browse the repository at this point in the history
  • Loading branch information
xxyzz committed Nov 7, 2023
1 parent 7a765d8 commit 6edbb3f
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 15 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,8 @@ The following command-line options can be used to control its operation:

* --out FILE: specifies the name of the file to write (specifying "-" as the file writes to stdout)
* --all-languages: extract words for all available languages
* --language LANGUAGE_CODE: extracts the given language (this option may be specified multiple times; by default, English [en] and Translingual [mul] words are extracted)
* --language-code LANGUAGE_CODE: extracts the language with the given code (this option may be specified multiple times; defaults to the dump file's language code and `mul` (Translingual))
* --language-name LANGUAGE_NAME: similar to `--language-code`, except that this option accepts a language name instead of a language code
* --dump-file-language-code LANGUAGE_CODE: specifies the language code for the Wiktionary edition that the dump file is for (defaults to "en"; "zh" is supported and others are being added)
* --all: causes all data to be captured for the selected languages
* --translations: causes translations to be captured
Expand Down
2 changes: 1 addition & 1 deletion src/wiktextract/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class WiktionaryConfig:
def __init__(
self,
dump_file_lang_code="en",
capture_language_codes=["en", "mul"],
capture_language_codes={"en", "mul"},
capture_translations=True,
capture_pronunciation=True,
capture_linkages=True,
Expand Down
45 changes: 32 additions & 13 deletions src/wiktextract/wiktwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
else:
from importlib.resources import files

from mediawiki_langcodes import code_to_name
from mediawiki_langcodes import code_to_name, name_to_code
from wikitextprocessor import Wtp
from wikitextprocessor.dumpparser import analyze_and_overwrite_pages

Expand Down Expand Up @@ -125,12 +125,20 @@ def main():
help="Language code of the dump file.",
)
parser.add_argument(
"--language",
"--language-code",
type=str,
action="append",
default=[],
help="Language code to capture (can specify multiple times, defaults "
"to English [en] and Translingual [mul])",
"to dump file language code and `mul`(Translingual))",
)
parser.add_argument(
"--language-name",
type=str,
action="append",
default=[],
help="Language names to capture (can specify multiple times, defaults "
"to dump file language and Translingual)",
)
parser.add_argument(
"--all-languages",
Expand Down Expand Up @@ -331,19 +339,30 @@ def main():
args.inflections = True
args.descendants = True

# Default to English and Translingual if language not specified.
if not args.language:
args.language = ["en", "mul"]
else:
for lang_code in args.language:
if code_to_name(lang_code) == "":
# Default to dump file language and Translingual if not specified.
capture_lang_codes = set()
if len(args.language_code) > 0:
for lang_code in args.language_code:
lang_name = code_to_name(lang_code, args.dump_file_language_code)
if lang_name == "":
logging.warning(f"Unknown language code: {lang_code}")
else:
capture_lang_codes.add(lang_code)
if len(args.language_name) > 0:
for lang_name in args.language_name:
lang_code = name_to_code(lang_name, args.dump_file_language_code)
if lang_code == "":
logging.warning(f"Unknown language name: {lang_name}")
else:
capture_lang_codes.add(lang_code)
if len(capture_lang_codes) == 0:
capture_lang_codes = {args.dump_file_language_code, "mul"}

if args.all_languages:
args.language = None
print("Capturing words for all available languages")
capture_lang_codes = None
logging.info("Capturing words for all available languages")
else:
print("Capturing words for:", ", ".join(args.language))
logging.info(f"Capturing words for: {', '.join(capture_lang_codes)}")

# Open output file.
out_path = args.out
Expand All @@ -363,7 +382,7 @@ def main():

conf1 = WiktionaryConfig(
dump_file_lang_code=args.dump_file_language_code,
capture_language_codes=args.language,
capture_language_codes=capture_lang_codes,
capture_translations=args.translations,
capture_pronunciation=args.pronunciations,
capture_linkages=args.linkages,
Expand Down

0 comments on commit 6edbb3f

Please sign in to comment.