diff --git a/README.md b/README.md
index a08a228..41c687d 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 This NCE module if forked from the pytorch/examples repo.
 new arguments:
  - `--nce`: whether to use NCE as approximation
- - `--noise_ratio <10>`: numbers of noise samples per data sample
- - `--norm_term <9>`: the constant normalization term `Ln(z)`
+ - `--noise-ratio <10>`: number of noise samples per data sample
+ - `--norm-term <9>`: the constant log-normalization term `ln(Z)`
  - `--train`: train or just evaluation existing model
  - `--dict `: use vocabulary file if specified, otherwise use the words in train.txt
 
@@ -11,7 +11,7 @@ new arguments:
 
 Run NCE criterion:
 ```bash
-python main.py --cuda --noise_ratio 10 --norm_term 9 --nce --train
+python main.py --cuda --noise-ratio 10 --norm-term 9 --nce --train
 ```
 
 Run conventional CE criterion:
@@ -19,21 +19,19 @@ Run conventional CE criterion:
 python main.py --cuda --train
 ```
 
-# Word-level language modeling RNN
+-----------------
+### Modified README from Pytorch/examples
 
 This example trains a multi-layer RNN (Elman, GRU, or LSTM) on a language modeling task.
 By default, the training script uses the PTB dataset, provided.
 The trained model can then be used by the generate script to generate new text.
 
 ```bash
-python main.py --cuda --epochs 6 # Train a LSTM on PTB with CUDA, reaching perplexity of 117.61
-python main.py --cuda --epochs 6 --tied # Train a tied LSTM on PTB with CUDA, reaching perplexity of 110.44
-python main.py --cuda --tied # Train a tied LSTM on PTB with CUDA for 40 epochs, reaching perplexity of 87.17
-python generate.py # Generate samples from the trained LSTM model.
+python main.py --cuda --epochs 6 # Train an LSTM on PTB with CUDA
 ```
 
-The model uses the `nn.RNN` module (and its sister modules `nn.GRU` and `nn.LSTM`)
-which will automatically use the cuDNN backend if run on CUDA with cuDNN installed.
+The model uses the `nn.LSTM` module, which will automatically use the cuDNN backend if run on CUDA with
+cuDNN installed.
 
 During training, if a keyboard interrupt (Ctrl-C) is received,
 training is stopped and the current model is evaluted against the test dataset.
@@ -44,34 +42,18 @@ The `main.py` script accepts the following arguments:
 optional arguments:
   -h, --help         show this help message and exit
   --data DATA        location of the data corpus
-  --model MODEL      type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)
   --emsize EMSIZE    size of word embeddings
   --nhid NHID        humber of hidden units per layer
   --nlayers NLAYERS  number of layers
   --lr LR            initial learning rate
+  --lr-decay         learning rate decay when no progress is observed on the validation set
+  --weight-decay     weight decay (L2 regularization)
   --clip CLIP        gradient clipping
   --epochs EPOCHS    upper epoch limit
   --batch-size N     batch size
-  --bptt BPTT        sequence length
   --dropout DROPOUT  dropout applied to layers (0 = no dropout)
-  --decay DECAY      learning rate decay per epoch
-  --tied             tie the word embedding and softmax weights
   --seed SEED        random seed
   --cuda             use CUDA
   --log-interval N   report interval
   --save SAVE        path to save the final model
 ```
-
-With these arguments, a variety of models can be tested.
-As an example, the following arguments produce slower but better models:
-
-```bash
-python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 # Test perplexity of 80.97
-python main.py --cuda --emsize 650 --nhid 650 --dropout 0.5 --epochs 40 --tied # Test perplexity of 75.96
-python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 # Test perplexity of 77.42
-python main.py --cuda --emsize 1500 --nhid 1500 --dropout 0.65 --epochs 40 --tied # Test perplexity of 72.30
-```
-
-These perplexities are equal or better than
-[Recurrent Neural Network Regularization (Zaremba et al. 2014)](https://arxiv.org/pdf/1409.2329.pdf)
-and are similar to [Using the Output Embedding to Improve Language Models (Press & Wolf 2016](https://arxiv.org/abs/1608.05859) and [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling (Inan et al. 2016)](https://arxiv.org/pdf/1611.01462.pdf), though both of these papers have improved perplexities by using a form of recurrent dropout [(variational dropout)](http://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks).
diff --git a/main.py b/main.py
index 9942b59..78074b1 100644
--- a/main.py
+++ b/main.py
@@ -30,18 +30,18 @@ def setup_parser():
                         help='number of layers')
     parser.add_argument('--lr', type=float, default=1.0,
                         help='initial learning rate')
+    parser.add_argument('--weight-decay', type=float, default=1e-5,
+                        help='initial weight decay')
+    parser.add_argument('--lr-decay', type=float, default=2,
+                        help='learning rate decay when no progress is observed on the validation set')
     parser.add_argument('--clip', type=float, default=0.25,
                         help='gradient clipping')
     parser.add_argument('--epochs', type=int, default=40,
                         help='upper epoch limit')
-    parser.add_argument('--batch_size', type=int, default=20, metavar='N',
+    parser.add_argument('--batch-size', type=int, default=20, metavar='N',
                         help='batch size')
-    parser.add_argument('--bptt', type=int, default=35,
-                        help='sequence length')
     parser.add_argument('--dropout', type=float, default=0.2,
                         help='dropout applied to layers (0 = no dropout)')
-    parser.add_argument('--tied', action='store_true',
-                        help='tie the word embedding and softmax weights')
     parser.add_argument('--seed', type=int, default=1111,
                         help='random seed')
     parser.add_argument('--cuda', action='store_true',
@@ -52,13 +52,13 @@ def setup_parser():
                         help='path to save the final model')
     parser.add_argument('--nce', action='store_true',
                         help='use NCE as loss function')
-    parser.add_argument('--noise_ratio', type=int, default=10,
+    parser.add_argument('--noise-ratio', type=int, default=10,
                         help='set the noise ratio of NCE sampling')
-    parser.add_argument('--norm_term', type=int, default=9,
+    parser.add_argument('--norm-term', type=int, default=9,
                         help='set the log normalization term of NCE sampling')
     parser.add_argument('--train', action='store_true',
                         help='set train mode, otherwise only evaluation is performed')
-    parser.add_argument('--tb_name', type=str, default=None,
+    parser.add_argument('--tb-name', type=str, default=None,
                         help='the name which would be used in tensorboard record')
     parser.add_argument('--prof', action='store_true',
                         help='Enable profiling mode, will execute only one batch data')
@@ -103,10 +103,10 @@ def setup_parser():
 #################################################################
 # Build the criterion and model
 #################################################################
-# add the representation for padded index
 ntokens = len(corpus.train.dataset.dictionary)
 print('Vocabulary size is {}'.format(ntokens))
 
+# noise distribution for NCE sampling
 noise = build_unigram_noise(
     torch.FloatTensor(corpus.train.dataset.dictionary.idx2count)
 )
@@ -126,10 +126,10 @@ def setup_parser():
     nhidden=args.nhid,
 )
 
-model = RNNModel(ntokens, args.emsize, args.nhid, args.nlayers,
-                 criterion=criterion,
-                 dropout=args.dropout,
-                 tie_weights=args.tied)
+model = RNNModel(
+    ntokens, args.emsize, args.nhid, args.nlayers,
+    criterion=criterion, dropout=args.dropout,
+)
 if args.cuda:
     model.cuda()
 print(model)
@@ -193,16 +193,16 @@ def evaluate(model, data_source, cuda=args.cuda):
 
 
 if __name__ == '__main__':
-    # Loop over epochs.
     lr = args.lr
     best_val_ppl = None
 
-    # At any point you can hit Ctrl + C to break out of training early.
     if args.train:
+        # At any point you can hit Ctrl + C to break out of training early.
         try:
+            # Loop over epochs.
             for epoch in range(1, args.epochs + 1):
                 epoch_start_time = time.time()
-                train(model, corpus.train, lr=lr)
+                train(model, corpus.train, lr=lr, weight_decay=args.weight_decay)
                 if args.prof:
                     break
                 val_ppl = evaluate(model, corpus.valid)
@@ -224,7 +224,7 @@ def evaluate(model, data_source, cuda=args.cuda):
                 else:
                     # Anneal the learning rate if no improvement has been seen in the
                     # validation dataset.
-                    lr /= 2.0
+                    lr /= args.lr_decay
         except KeyboardInterrupt:
             print('-' * 89)
             print('Exiting from training early')
diff --git a/model.py b/model.py
index b7322bc..351740f 100644
--- a/model.py
+++ b/model.py
@@ -7,19 +7,12 @@ class RNNModel(nn.Module):
     """Container module with an encoder, a recurrent module,
     and a criterion (decoder and loss function)."""
 
-    def __init__(self, ntoken, ninp, nhid, nlayers, criterion, dropout=0.5, tie_weights=False):
+    def __init__(self, ntoken, ninp, nhid, nlayers, criterion, dropout=0.5):
         super(RNNModel, self).__init__()
         self.drop = nn.Dropout(dropout)
         self.encoder = nn.Embedding(ntoken, ninp)
         self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
 
-        # Optionally tie weights as in:
-        # "Using the Output Embedding to Improve Language Models" (Press & Wolf 2016)
-        # https://arxiv.org/abs/1608.05859
-        # and
-        # "Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling" (Inan et al. 2016)
-        # https://arxiv.org/abs/1611.01462
-
         self.nhid = nhid
         self.nlayers = nlayers
         self.criterion = criterion
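For reference, `build_unigram_noise` is defined elsewhere in the repository and is not part of this diff. A minimal sketch of what such a helper typically does, turning the raw per-token counts in `idx2count` into a unigram probability distribution for NCE noise sampling, is shown below; this illustrates the idea only and is not the repository's actual implementation:

```python
import torch


def build_unigram_noise(freq):
    """Normalize raw token counts into a unigram noise distribution (sketch only).

    The repository's own helper may differ, for example by smoothing or
    flattening the distribution before returning it.
    """
    noise = freq / freq.sum()  # probability of drawing each token as a noise sample
    return noise


# Example: three tokens with counts 10, 5 and 1
print(build_unigram_noise(torch.FloatTensor([10, 5, 1])))
```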
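Similarly, `train()` itself is not shown in this diff, so how the new hyper-parameters are consumed is only implied: `weight_decay` is forwarded into `train()`, while the main loop divides `lr` by `args.lr_decay` whenever validation perplexity stops improving. The sketch below shows one plausible way `train()` could apply these values with plain SGD; the function body, batch layout, and the assumption that the model returns its own NCE/CE loss are hypothetical, not the repository's code:

```python
import torch
import torch.optim as optim


def train(model, data_loader, lr, weight_decay=0.0, clip=0.25):
    """One training epoch (illustrative sketch only).

    `lr` is the externally annealed learning rate, `weight_decay` is the L2
    penalty applied by the optimizer, and `clip` mirrors the --clip argument.
    """
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    for data, target in data_loader:   # hypothetical batch layout
        optimizer.zero_grad()
        loss = model(data, target)     # assumes the model returns a scalar NCE/CE loss
        loss.backward()
        # Clip gradients to limit exploding gradients in the RNN
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
```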