Skip to content

Commit

Permalink
Set default values through clap, fix some nonsensical defaults
Browse files Browse the repository at this point in the history
The changed defaults are:

- Context size: 5 -> 10
- Dims: 100 -> 300
- Epochs: 5 -> 15
  • Loading branch information
danieldk committed Jun 13, 2019
1 parent 4a75976 commit 1e84919
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 55 deletions.
106 changes: 61 additions & 45 deletions finalfrontier-utils/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,18 @@ impl SkipGramApp {
Arg::with_name(CONTEXT)
.long("context")
.value_name("CONTEXT_SIZE")
.help("Context size (default: 5)")
.takes_value(true),
.help("Context size")
.takes_value(true)
.default_value("10"),
)
.arg(
Arg::with_name(MODEL)
.long(MODEL)
.value_name("MODEL")
.help("Model: skipgram, structgram or dirgram")
.takes_value(true),
.help("Model")
.takes_value(true)
.possible_values(&["dirgram", "skipgram", "structgram"])
.default_value("skipgram"),
)
.get_matches();
let corpus = matches.value_of(CORPUS).unwrap().into();
Expand Down Expand Up @@ -126,11 +129,11 @@ impl SkipGramApp {
let context_size = matches
.value_of(CONTEXT)
.map(|v| v.parse().or_exit("Cannot parse context size", 1))
.unwrap_or(5);
.unwrap();
let model = matches
.value_of(MODEL)
.map(|v| ModelType::try_from_str(v).or_exit("Cannot parse model type", 1))
.unwrap_or(ModelType::SkipGram);
.unwrap();

SkipGramConfig {
context_size,
Expand Down Expand Up @@ -171,11 +174,11 @@ impl DepembedsApp {
let discard_threshold = matches
.value_of(CONTEXT_DISCARD)
.map(|v| v.parse().or_exit("Cannot parse discard threshold", 1))
.unwrap_or(1e-4);
.unwrap();
let min_count = matches
.value_of(CONTEXT_MINCOUNT)
.map(|v| v.parse().or_exit("Cannot parse mincount", 1))
.unwrap_or(5);
.unwrap();

let output_vocab_config = SimpleVocabConfig {
min_count,
Expand Down Expand Up @@ -233,22 +236,25 @@ impl DepembedsApp {
Arg::with_name(CONTEXT_DISCARD)
.long("context_discard")
.value_name("CONTEXT_THRESHOLD")
.help("Context discard threshold (default: 1e-4)")
.takes_value(true),
.help("Context discard threshold")
.takes_value(true)
.default_value("1e-4"),
)
.arg(
Arg::with_name(CONTEXT_MINCOUNT)
.long("context_mincount")
.value_name("CONTEXT_FREQ")
.help("Context mincount (default: 5)")
.takes_value(true),
.help("Context mincount")
.takes_value(true)
.default_value("5"),
)
.arg(
Arg::with_name(DEPENDENCY_DEPTH)
.long("dependency_depth")
.value_name("DEPENDENCY_DEPTH")
.help("Dependency depth (default: 2)")
.takes_value(true),
.help("Dependency depth")
.takes_value(true)
.default_value("1"),
)
.arg(
Arg::with_name(UNTYPED_DEPS)
Expand Down Expand Up @@ -276,7 +282,7 @@ impl DepembedsApp {
let depth = matches
.value_of(DEPENDENCY_DEPTH)
.map(|v| v.parse().or_exit("Cannot parse dependency depth", 1))
.unwrap_or(1);
.unwrap();
let untyped = matches.is_present(UNTYPED_DEPS);
let normalize = matches.is_present(NORMALIZE_CONTEXT);
let projectivize = matches.is_present(PROJECTIVIZE);
Expand All @@ -298,64 +304,73 @@ fn build_with_common_opts<'a, 'b>(name: &str) -> App<'a, 'b> {
Arg::with_name(BUCKETS)
.long("buckets")
.value_name("EXP")
.help("Number of buckets: 2^EXP (default: 21)")
.takes_value(true),
.help("Number of buckets: 2^EXP")
.takes_value(true)
.default_value("21"),
)
.arg(
Arg::with_name(DIMS)
.long("dims")
.value_name("DIMENSIONS")
.help("Embedding dimensionality (default: 100)")
.takes_value(true),
.help("Embedding dimensionality")
.takes_value(true)
.default_value("300"),
)
.arg(
Arg::with_name(DISCARD)
.long("discard")
.value_name("THRESHOLD")
.help("Discard threshold (default: 1e-4)")
.takes_value(true),
.help("Discard threshold")
.takes_value(true)
.default_value("1e-4"),
)
.arg(
Arg::with_name(EPOCHS)
.long("epochs")
.value_name("N")
.help("Number of epochs (default: 5)")
.takes_value(true),
.help("Number of epochs")
.takes_value(true)
.default_value("15"),
)
.arg(
Arg::with_name(LR)
.long("lr")
.value_name("LEARNING_RATE")
.help("Initial learning rate (default: 0.05)")
.takes_value(true),
.help("Initial learning rate")
.takes_value(true)
.default_value("0.05"),
)
.arg(
Arg::with_name(MINCOUNT)
.long("mincount")
.value_name("FREQ")
.help("Minimum token frequency (default: 5)")
.takes_value(true),
.help("Minimum token frequency")
.takes_value(true)
.default_value("5"),
)
.arg(
Arg::with_name(MINN)
.long("minn")
.value_name("LEN")
.help("Minimum ngram length (default: 3)")
.takes_value(true),
.help("Minimum ngram length")
.takes_value(true)
.default_value("3"),
)
.arg(
Arg::with_name(MAXN)
.long("maxn")
.value_name("LEN")
.help("Maximum ngram length (default: 6)")
.takes_value(true),
.help("Maximum ngram length")
.takes_value(true)
.default_value("6"),
)
.arg(
Arg::with_name(NS)
.long("ns")
.value_name("FREQ")
.help("Negative samples per word (default: 5)")
.takes_value(true),
.help("Negative samples per word")
.takes_value(true)
.default_value("5"),
)
.arg(
Arg::with_name(THREADS)
Expand All @@ -368,8 +383,9 @@ fn build_with_common_opts<'a, 'b>(name: &str) -> App<'a, 'b> {
Arg::with_name(ZIPF_EXPONENT)
.long("zipf")
.value_name("EXP")
.help("Exponent Zipf distribution for negative sampling (default: 0.5)")
.takes_value(true),
.help("Exponent Zipf distribution for negative sampling")
.takes_value(true)
.default_value("0.5"),
)
.arg(
Arg::with_name(CORPUS)
Expand All @@ -390,29 +406,29 @@ fn common_config_from_matches(matches: &ArgMatches) -> CommonConfig {
let dims = matches
.value_of(DIMS)
.map(|v| v.parse().or_exit("Cannot parse dimensionality", 1))
.unwrap_or(100);
.unwrap();
let epochs = matches
.value_of(EPOCHS)
.map(|v| v.parse().or_exit("Cannot parse number of epochs", 1))
.unwrap_or(5);
.unwrap();
let lr = matches
.value_of(LR)
.map(|v| v.parse().or_exit("Cannot parse learning rate", 1))
.unwrap_or(0.05);
.unwrap();
let negative_samples = matches
.value_of(NS)
.map(|v| {
v.parse()
.or_exit("Cannot parse number of negative samples", 1)
})
.unwrap_or(5);
.unwrap();
let zipf_exponent = matches
.value_of(ZIPF_EXPONENT)
.map(|v| {
v.parse()
.or_exit("Cannot parse exponent zipf distribution", 1)
})
.unwrap_or(0.5);
.unwrap();

CommonConfig {
loss: LossType::LogisticNegativeSampling,
Expand All @@ -429,23 +445,23 @@ fn subword_config_from_matches(matches: &ArgMatches) -> SubwordVocabConfig {
let buckets_exp = matches
.value_of(BUCKETS)
.map(|v| v.parse().or_exit("Cannot parse bucket exponent", 1))
.unwrap_or(21);
.unwrap();
let discard_threshold = matches
.value_of(DISCARD)
.map(|v| v.parse().or_exit("Cannot parse discard threshold", 1))
.unwrap_or(1e-4);
.unwrap();
let min_count = matches
.value_of(MINCOUNT)
.map(|v| v.parse().or_exit("Cannot parse mincount", 1))
.unwrap_or(5);
.unwrap();
let min_n = matches
.value_of(MINN)
.map(|v| v.parse().or_exit("Cannot parse minimum n-gram length", 1))
.unwrap_or(3);
.unwrap();
let max_n = matches
.value_of(MAXN)
.map(|v| v.parse().or_exit("Cannot parse maximum n-gram length", 1))
.unwrap_or(6);
.unwrap();

SubwordVocabConfig {
min_n,
Expand Down
8 changes: 4 additions & 4 deletions man/ff-train-deps.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ default minimum count is 5.
`--dims` *DIMS*

: The dimensionality of the trained word embeddings. The default
dimensionality is 100.
dimensionality is 300.

`--dependency_depth` *DEPTH*

Expand All @@ -61,7 +61,7 @@ discarded from training. The default discard threshold is *1e-4*.
`--epochs` *N*

: The number of training epochs. The number of necessary training epochs
typically decreases with the corpus size. The default number of epochs is *5*.
typically decreases with the corpus size. The default number of epochs is *15*.

`--lr` *LEARNING_RATE*

Expand Down Expand Up @@ -123,9 +123,9 @@ parameters:

ff-train-deps dewiki.conll dewiki-deps.bin

Train embeddings with dimensionality 300 on *dewiki.conll* using the dependency
Train embeddings with dimensionality 200 on *dewiki.conll* using the dependency
model from contexts with depth up to 2:

ff-train-deps --depth 2 --normalize --dims 300 \
ff-train-deps --depth 2 --normalize --dims 200 \
dewiki.conll dewiki-deps.bin

12 changes: 6 additions & 6 deletions man/ff-train-skipgram.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ OPTIONS
`--context` *CONTEXT_SIZE*

: Words within the *CONTEXT_SIZE* of a focus word will be used to learn
the representation of the focus word. The default context size is *5*.
the representation of the focus word. The default context size is *10*.

`--dims` *DIMENSIONS*

: The dimensionality of the trained word embeddings. The default
dimensionality is 100.
dimensionality is 300.

`--discard` *THRESHOLD*

Expand All @@ -51,7 +51,7 @@ OPTIONS

: The number of training epochs. The number of necessary training epochs
typically decreases with the corpus size. The default number of epochs
is *5*.
is *15*.

`--lr` *LEARNING_RATE*

Expand Down Expand Up @@ -110,10 +110,10 @@ Train embeddings on *dewiki.txt* using the skip-gram model:

ff-train-skipgram dewiki.txt dewiki-skipgram.bin

Train embeddings with dimensionality 300 on *dewiki.txt* using the
structured skip-gram model with a context window of 10 tokens:
Train embeddings with dimensionality 200 on *dewiki.txt* using the
structured skip-gram model with a context window of 5 tokens:

ff-train-skipgram --model structgram --context 10 --dims 300 \
ff-train-skipgram --model structgram --context 5 --dims 200 \
dewiki.txt dewiki-structgram.bin

SEE ALSO
Expand Down

0 comments on commit 1e84919

Please sign in to comment.