diff --git a/main.py b/main.py
index e409c26..9942b59 100644
--- a/main.py
+++ b/main.py
@@ -99,11 +99,7 @@ def setup_parser():
         pin_memory=args.cuda,
     )
 
-print(corpus.train.dataset.dictionary.idx2word[0])
-
-
 eval_batch_size = args.batch_size
-
 #################################################################
 # Build the criterion and model
 #################################################################
@@ -122,6 +118,7 @@ def setup_parser():
         noise=noise,
         noise_ratio=args.noise_ratio,
         norm_term=args.norm_term,
+        normed_eval=True,  # evaluate PPL using normalized probability
     )
 else:
     criterion = crossEntropy.CELoss(
@@ -129,28 +126,26 @@ def setup_parser():
         nhidden=args.nhid,
     )
 
-evaluate_criterion = crossEntropy.CELoss(
-    ntokens=ntokens,
-    nhidden=args.nhid,
-    decoder_weight=(criterion.decoder.weight, criterion.decoder.bias),
-)
-
 model = RNNModel(ntokens, args.emsize, args.nhid, args.nlayers,
                  criterion=criterion, dropout=args.dropout, tie_weights=args.tied)
-print(model)
 if args.cuda:
     model.cuda()
+print(model)
 
 
 #################################################################
 # Training code
 #################################################################
 
 
-def train():
+def train(model, data_source, lr=1.0, weight_decay=1e-5, momentum=0.9):
     params = model.parameters()
-    optimizer = optim.SGD(params=params, lr=lr,
-                          momentum=0.9, weight_decay=1e-5)
+    optimizer = optim.SGD(
+        params=params,
+        lr=lr,
+        momentum=momentum,
+        weight_decay=weight_decay
+    )
     # Turn on training mode which enables dropout.
     model.train()
     total_loss = 0
@@ -177,7 +172,6 @@ def train():
                 cur_loss, math.exp(cur_loss)))
             total_loss = 0
             print('-' * 87)
-        num_batch += 1
 
 def evaluate(model, data_source, cuda=args.cuda):
     # Turn on evaluation mode which disables dropout.
@@ -185,7 +179,7 @@ def evaluate(model, data_source, cuda=args.cuda):
     eval_loss = 0
     total_length = 0
 
-    data_source.batch_size = 32
+    data_source.batch_size = eval_batch_size
     for data_batch in data_source:
         data, target, length = process_data(data_batch, cuda=cuda, eval=True)
 
@@ -208,7 +202,7 @@ def evaluate(model, data_source, cuda=args.cuda):
 try:
     for epoch in range(1, args.epochs + 1):
         epoch_start_time = time.time()
-        train()
+        train(model, corpus.train, lr=lr)
         if args.prof:
             break
         val_ppl = evaluate(model, corpus.valid)
@@ -249,4 +243,3 @@ def evaluate(model, data_source, cuda=args.cuda):
 
 if args.tb_name:
     writer.close()
-
diff --git a/nce.py b/nce.py
index 3587b66..ff3a838 100644
--- a/nce.py
+++ b/nce.py
@@ -24,6 +24,7 @@ class NCELoss(nn.Module):
         norm_term: the normalization term (lnZ in paper)
         size_average: average the loss by batch size
         decoder: the decoder matrix
+        normed_eval: whether to use normalized probability during evaluation
 
     Shape:
         - noise: :math:`(V)` where `V = vocabulary size`
@@ -37,8 +38,8 @@ def __init__(self,
                  noise_ratio=10,
                  norm_term=9,
                  size_average=True,
-                 decoder_weight=None,
                  per_word=True,
+                 normed_eval=True,
                  ):
         super(NCELoss, self).__init__()
 
@@ -49,10 +50,10 @@ def __init__(self,
         self.ntokens = ntokens
         self.size_average = size_average
         self.per_word = per_word
+        self.normed_eval = normed_eval
+        if self.normed_eval:
+            self.ce = nn.CrossEntropyLoss(size_average=False)
         self.decoder = IndexLinear(nhidden, ntokens)
-        # Weight tying
-        if decoder_weight:
-            self.decoder.weight = decoder_weight
 
     def forward(self, input, target=None):
         """compute the loss with output and the desired target
@@ -92,6 +93,10 @@ def forward(self, input, target=None):
 
             loss = -1 * torch.sum(rnn_loss + noise_loss)
 
+        elif self.normed_eval:
+            # Fall back to conventional cross entropy
+            out = self.decoder(input)
+            loss = self.ce(out, target)
         else:
             out = self.decoder(input, indices=target.unsqueeze(1))
             nll = out.sub(self.norm_term)
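For reference, a minimal usage sketch of the new `normed_eval` path (not part of the patch). It assumes NCELoss also accepts `ntokens` and `nhidden` keyword arguments (as CELoss does and as `IndexLinear(nhidden, ntokens)` implies), that `forward` picks the NCE branch via `self.training`, and a PyTorch version recent enough that Variable wrapping is unnecessary; the shapes and the uniform noise distribution are placeholders:

# Hypothetical sketch; names and shapes are illustrative, not from the patch.
import math
import torch
from nce import NCELoss

ntokens, nhidden = 10000, 200
noise = torch.ones(ntokens) / ntokens  # uniform stand-in for the unigram noise distribution

criterion = NCELoss(
    ntokens=ntokens,
    nhidden=nhidden,
    noise=noise,
    noise_ratio=10,
    norm_term=9,
    normed_eval=True,  # eval-time loss falls back to full cross entropy
)

hidden = torch.randn(32, nhidden)           # stand-in for RNN hidden states
target = torch.randint(0, ntokens, (32,))   # stand-in for next-word targets

criterion.train()
loss_nce = criterion(hidden, target)  # NCE objective with unnormalized scores

criterion.eval()
loss_ce = criterion(hidden, target)   # normalized cross entropy, so
ppl = math.exp(loss_ce.item() / 32)   # the derived PPL is a true perplexity

The point of the flag is that training keeps the cheap unnormalized NCE objective, while validation/test perplexity is computed from a properly normalized softmax and is therefore comparable to a cross-entropy baseline.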