Skip to content

Commit

Permalink
Set default values through clap, fix some nonsensical defaults
Browse files Browse the repository at this point in the history
The changed defaults are:

- Context size: 5 -> 10
- Dims: 100 -> 300
- Epochs: 5 -> 15
  • Loading branch information
danieldk committed Jun 13, 2019
1 parent 4a75976 commit 1e84919
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 55 deletions.
106 changes: 61 additions & 45 deletions finalfrontier-utils/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,18 @@ impl SkipGramApp {
Arg::with_name(CONTEXT)
.long("context")
.value_name("CONTEXT_SIZE")
.help("Context size (default: 5)")
.takes_value(true),
.help("Context size")
.takes_value(true)
.default_value("10"),
)
.arg(
Arg::with_name(MODEL)
.long(MODEL)
.value_name("MODEL")
.help("Model: skipgram, structgram or dirgram")
.takes_value(true),
.help("Model")
.takes_value(true)
.possible_values(&["dirgram", "skipgram", "structgram"])
.default_value("skipgram"),
)
.get_matches();
let corpus = matches.value_of(CORPUS).unwrap().into();
Expand Down Expand Up @@ -126,11 +129,11 @@ impl SkipGramApp {
let context_size = matches
.value_of(CONTEXT)
.map(|v| v.parse().or_exit("Cannot parse context size", 1))
.unwrap_or(5);
.unwrap();
let model = matches
.value_of(MODEL)
.map(|v| ModelType::try_from_str(v).or_exit("Cannot parse model type", 1))
.unwrap_or(ModelType::SkipGram);
.unwrap();

SkipGramConfig {
context_size,
Expand Down Expand Up @@ -171,11 +174,11 @@ impl DepembedsApp {
let discard_threshold = matches
.value_of(CONTEXT_DISCARD)
.map(|v| v.parse().or_exit("Cannot parse discard threshold", 1))
.unwrap_or(1e-4);
.unwrap();
let min_count = matches
.value_of(CONTEXT_MINCOUNT)
.map(|v| v.parse().or_exit("Cannot parse mincount", 1))
.unwrap_or(5);
.unwrap();

let output_vocab_config = SimpleVocabConfig {
min_count,
Expand Down Expand Up @@ -233,22 +236,25 @@ impl DepembedsApp {
Arg::with_name(CONTEXT_DISCARD)
.long("context_discard")
.value_name("CONTEXT_THRESHOLD")
.help("Context discard threshold (default: 1e-4)")
.takes_value(true),
.help("Context discard threshold")
.takes_value(true)
.default_value("1e-4"),
)
.arg(
Arg::with_name(CONTEXT_MINCOUNT)
.long("context_mincount")
.value_name("CONTEXT_FREQ")
.help("Context mincount (default: 5)")
.takes_value(true),
.help("Context mincount")
.takes_value(true)
.default_value("5"),
)
.arg(
Arg::with_name(DEPENDENCY_DEPTH)
.long("dependency_depth")
.value_name("DEPENDENCY_DEPTH")
.help("Dependency depth (default: 2)")
.takes_value(true),
.help("Dependency depth")
.takes_value(true)
.default_value("1"),
)
.arg(
Arg::with_name(UNTYPED_DEPS)
Expand Down Expand Up @@ -276,7 +282,7 @@ impl DepembedsApp {
let depth = matches
.value_of(DEPENDENCY_DEPTH)
.map(|v| v.parse().or_exit("Cannot parse dependency depth", 1))
.unwrap_or(1);
.unwrap();
let untyped = matches.is_present(UNTYPED_DEPS);
let normalize = matches.is_present(NORMALIZE_CONTEXT);
let projectivize = matches.is_present(PROJECTIVIZE);
Expand All @@ -298,64 +304,73 @@ fn build_with_common_opts<'a, 'b>(name: &str) -> App<'a, 'b> {
Arg::with_name(BUCKETS)
.long("buckets")
.value_name("EXP")
.help("Number of buckets: 2^EXP (default: 21)")
.takes_value(true),
.help("Number of buckets: 2^EXP")
.takes_value(true)
.default_value("21"),
)
.arg(
Arg::with_name(DIMS)
.long("dims")
.value_name("DIMENSIONS")
.help("Embedding dimensionality (default: 100)")
.takes_value(true),
.help("Embedding dimensionality")
.takes_value(true)
.default_value("300"),
)
.arg(
Arg::with_name(DISCARD)
.long("discard")
.value_name("THRESHOLD")
.help("Discard threshold (default: 1e-4)")
.takes_value(true),
.help("Discard threshold")
.takes_value(true)
.default_value("1e-4"),
)
.arg(
Arg::with_name(EPOCHS)
.long("epochs")
.value_name("N")
.help("Number of epochs (default: 5)")
.takes_value(true),
.help("Number of epochs")
.takes_value(true)
.default_value("15"),
)
.arg(
Arg::with_name(LR)
.long("lr")
.value_name("LEARNING_RATE")
.help("Initial learning rate (default: 0.05)")
.takes_value(true),
.help("Initial learning rate")
.takes_value(true)
.default_value("0.05"),
)
.arg(
Arg::with_name(MINCOUNT)
.long("mincount")
.value_name("FREQ")
.help("Minimum token frequency (default: 5)")
.takes_value(true),
.help("Minimum token frequency")
.takes_value(true)
.default_value("5"),
)
.arg(
Arg::with_name(MINN)
.long("minn")
.value_name("LEN")
.help("Minimum ngram length (default: 3)")
.takes_value(true),
.help("Minimum ngram length")
.takes_value(true)
.default_value("3"),
)
.arg(
Arg::with_name(MAXN)
.long("maxn")
.value_name("LEN")
.help("Maximum ngram length (default: 6)")
.takes_value(true),
.help("Maximum ngram length")
.takes_value(true)
.default_value("6"),
)
.arg(
Arg::with_name(NS)
.long("ns")
.value_name("FREQ")
.help("Negative samples per word (default: 5)")
.takes_value(true),
.help("Negative samples per word")
.takes_value(true)
.default_value("5"),
)
.arg(
Arg::with_name(THREADS)
Expand All @@ -368,8 +383,9 @@ fn build_with_common_opts<'a, 'b>(name: &str) -> App<'a, 'b> {
Arg::with_name(ZIPF_EXPONENT)
.long("zipf")
.value_name("EXP")
.help("Exponent Zipf distribution for negative sampling (default: 0.5)")
.takes_value(true),
.help("Exponent Zipf distribution for negative sampling")
.takes_value(true)
.default_value("0.5"),
)
.arg(
Arg::with_name(CORPUS)
Expand All @@ -390,29 +406,29 @@ fn common_config_from_matches(matches: &ArgMatches) -> CommonConfig {
let dims = matches
.value_of(DIMS)
.map(|v| v.parse().or_exit("Cannot parse dimensionality", 1))
.unwrap_or(100);
.unwrap();
let epochs = matches
.value_of(EPOCHS)
.map(|v| v.parse().or_exit("Cannot parse number of epochs", 1))
.unwrap_or(5);
.unwrap();
let lr = matches
.value_of(LR)
.map(|v| v.parse().or_exit("Cannot parse learning rate", 1))
.unwrap_or(0.05);
.unwrap();
let negative_samples = matches
.value_of(NS)
.map(|v| {
v.parse()
.or_exit("Cannot parse number of negative samples", 1)
})
.unwrap_or(5);
.unwrap();
let zipf_exponent = matches
.value_of(ZIPF_EXPONENT)
.map(|v| {
v.parse()
.or_exit("Cannot parse exponent zipf distribution", 1)
})
.unwrap_or(0.5);
.unwrap();

CommonConfig {
loss: LossType::LogisticNegativeSampling,
Expand All @@ -429,23 +445,23 @@ fn subword_config_from_matches(matches: &ArgMatches) -> SubwordVocabConfig {
let buckets_exp = matches
.value_of(BUCKETS)
.map(|v| v.parse().or_exit("Cannot parse bucket exponent", 1))
.unwrap_or(21);
.unwrap();
let discard_threshold = matches
.value_of(DISCARD)
.map(|v| v.parse().or_exit("Cannot parse discard threshold", 1))
.unwrap_or(1e-4);
.unwrap();
let min_count = matches
.value_of(MINCOUNT)
.map(|v| v.parse().or_exit("Cannot parse mincount", 1))
.unwrap_or(5);
.unwrap();
let min_n = matches
.value_of(MINN)
.map(|v| v.parse().or_exit("Cannot parse minimum n-gram length", 1))
.unwrap_or(3);
.unwrap();
let max_n = matches
.value_of(MAXN)
.map(|v| v.parse().or_exit("Cannot parse maximum n-gram length", 1))
.unwrap_or(6);
.unwrap();

SubwordVocabConfig {
min_n,
Expand Down
8 changes: 4 additions & 4 deletions man/ff-train-deps.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ default minimum count is 5.
`--dims` *DIMS*

: The dimensionality of the trained word embeddings. The default
dimensionality is 100.
dimensionality is 300.

`--dependency_depth` *DEPTH*

Expand All @@ -61,7 +61,7 @@ discarded from training. The default discard threshold is *1e-4*.
`--epochs` *N*

: The number of training epochs. The number of necessary training epochs
typically decreases with the corpus size. The default number of epochs is *5*.
typically decreases with the corpus size. The default number of epochs is *15*.

`--lr` *LEARNING_RATE*

Expand Down Expand Up @@ -123,9 +123,9 @@ parameters:

ff-train-deps dewiki.conll dewiki-deps.bin

Train embeddings with dimensionality 300 on *dewiki.conll* using the dependency
Train embeddings with dimensionality 200 on *dewiki.conll* using the dependency
model from contexts with depth up to 2:

ff-train-deps --depth 2 --normalize --dims 300 \
ff-train-deps --depth 2 --normalize --dims 200 \
dewiki.conll dewiki-deps.bin

12 changes: 6 additions & 6 deletions man/ff-train-skipgram.1.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,12 @@ OPTIONS
`--context` *CONTEXT_SIZE*

: Words within the *CONTEXT_SIZE* of a focus word will be used to learn
the representation of the focus word. The default context size is *5*.
the representation of the focus word. The default context size is *10*.

`--dims` *DIMENSIONS*

: The dimensionality of the trained word embeddings. The default
dimensionality is 100.
dimensionality is 300.

`--discard` *THRESHOLD*

Expand All @@ -51,7 +51,7 @@ OPTIONS

: The number of training epochs. The number of necessary training epochs
typically decreases with the corpus size. The default number of epochs
is *5*.
is *15*.

`--lr` *LEARNING_RATE*

Expand Down Expand Up @@ -110,10 +110,10 @@ Train embeddings on *dewiki.txt* using the skip-gram model:

ff-train-skipgram dewiki.txt dewiki-skipgram.bin

Train embeddings with dimensionality 300 on *dewiki.txt* using the
structured skip-gram model with a context window of 10 tokens:
Train embeddings with dimensionality 200 on *dewiki.txt* using the
structured skip-gram model with a context window of 5 tokens:

ff-train-skipgram --model structgram --context 10 --dims 300 \
ff-train-skipgram --model structgram --context 5 --dims 200 \
dewiki.txt dewiki-structgram.bin

SEE ALSO
Expand Down

0 comments on commit 1e84919

Please sign in to comment.