diff --git a/lxmls/deep_learning/numpy_models/log_linear.py b/lxmls/deep_learning/numpy_models/log_linear.py
index 7714d585..3422d78c 100644
--- a/lxmls/deep_learning/numpy_models/log_linear.py
+++ b/lxmls/deep_learning/numpy_models/log_linear.py
@@ -18,36 +18,36 @@ def __init__(self, **config):
         self.bias = np.zeros((1, config['num_classes']))
         self.learning_rate = config['learning_rate']
 
-    def log_forward(self, input=None):
+    def log_forward(self, X):
         """Forward pass of the computation graph"""
 
         # Linear transformation
-        z = np.dot(input, self.weight.T) + self.bias
+        z = np.dot(X, self.weight.T) + self.bias
 
         # Softmax implemented in log domain
         log_tilde_z = z - logsumexp(z, axis=1, keepdims=True)
 
         return log_tilde_z
 
-    def predict(self, input=None):
+    def predict(self, X):
         """Most probable class index"""
-        return np.argmax(np.exp(self.log_forward(input)), axis=1)
+        return np.argmax(self.log_forward(X), axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """Stochastic Gradient Descent update"""
 
         # Probabilities of each class
-        class_probabilities = np.exp(self.log_forward(input))
+        class_probabilities = np.exp(self.log_forward(X))
         batch_size, num_classes = class_probabilities.shape
 
         # Error derivative at softmax layer
-        I = index2onehot(output, num_classes)
+        I = index2onehot(y, num_classes)
         error = - (I - class_probabilities) / batch_size
 
         # Weight gradient
         gradient_weight = np.zeros(self.weight.shape)
         for l in np.arange(batch_size):
-            gradient_weight += np.outer(error[l, :], input[l, :])
+            gradient_weight += np.outer(error[l, :], X[l, :])
 
         # Bias gradient
         gradient_bias = np.sum(error, axis=0, keepdims=True)
diff --git a/lxmls/deep_learning/numpy_models/mlp.py b/lxmls/deep_learning/numpy_models/mlp.py
index ea19f452..f2de6cd4 100755
--- a/lxmls/deep_learning/numpy_models/mlp.py
+++ b/lxmls/deep_learning/numpy_models/mlp.py
@@ -15,19 +15,19 @@ def __init__(self, **config):
         # self.parameters
         MLP.__init__(self, **config)
 
-    def predict(self, input=None):
+    def predict(self, X):
         """
         Predict model outputs given input
         """
-        log_class_probabilities, _ = self.log_forward(input)
-        return np.argmax(np.exp(log_class_probabilities), axis=1)
+        log_class_probabilities, _ = self.log_forward(X)
+        return np.argmax(log_class_probabilities, axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """
         Update model parameters given batch of data
         """
-        gradients = self.backpropagation(input, output)
+        gradients = self.backpropagation(X, y)
 
         learning_rate = self.config['learning_rate']
         num_parameters = len(self.parameters)
@@ -39,11 +39,11 @@ def update(self, input=None, output=None):
             # Update bias
             self.parameters[m][1] -= learning_rate * gradients[m][1]
 
-    def log_forward(self, input):
+    def log_forward(self, X):
         """Forward pass for sigmoid hidden layers and output softmax"""
 
         # Input
-        tilde_z = input
+        tilde_z = X
         layer_inputs = []
 
         # Hidden layers
@@ -72,17 +72,17 @@ def log_forward(self, input):
 
         return log_tilde_z, layer_inputs
 
-    def cross_entropy_loss(self, input, output):
+    def cross_entropy_loss(self, X, y):
         """Cross entropy loss"""
-        num_examples = input.shape[0]
-        log_probability, _ = self.log_forward(input)
-        return -log_probability[range(num_examples), output].mean()
+        num_examples = X.shape[0]
+        log_probability, _ = self.log_forward(X)
+        return -log_probability[range(num_examples), y].mean()
 
-    def backpropagation(self, input, output):
+    def backpropagation(self, X, y):
         """Gradients for sigmoid hidden layers and output softmax"""
 
         # Run forward and store activations for each layer
-        log_prob_y, layer_inputs = self.log_forward(input)
+        log_prob_y, layer_inputs = self.log_forward(X)
         prob_y = np.exp(log_prob_y)
 
         num_examples, num_clases = prob_y.shape
@@ -97,7 +97,7 @@ def backpropagation(self, input, output):
 
         # Initial error is the cost derivative at the last layer (for cross
         # entropy cost)
-        I = index2onehot(output, num_clases)
+        I = index2onehot(y, num_clases)
         error = - (I - prob_y) / num_examples
         errors.append(error)
 
@@ -105,10 +105,10 @@ def backpropagation(self, input, output):
         for n in reversed(range(num_hidden_layers)):
 
             # Backpropagate through linear layer
-            error = np.dot(error, self.parameters[n+1][0])
+            error = np.dot(error, self.parameters[n + 1][0])
 
             # Backpropagate through sigmoid layer
-            error *= layer_inputs[n+1] * (1-layer_inputs[n+1])
+            error *= layer_inputs[n + 1] * (1 - layer_inputs[n + 1])
 
             # Collect error
             errors.append(error)
diff --git a/lxmls/deep_learning/numpy_models/rnn.py b/lxmls/deep_learning/numpy_models/rnn.py
index f7911e5b..e2e2011e 100644
--- a/lxmls/deep_learning/numpy_models/rnn.py
+++ b/lxmls/deep_learning/numpy_models/rnn.py
@@ -11,18 +11,18 @@ def __init__(self, **config):
         # self.parameters
         RNN.__init__(self, **config)
 
-    def predict(self, input=None):
+    def predict(self, X):
         """
         Predict model outputs given input
         """
-        p_y = np.exp(self.log_forward(input)[0])
-        return np.argmax(p_y, axis=1)
+        log_p_y = self.log_forward(X)[0]
+        return np.argmax(log_p_y, axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """
         Update model parameters given batch of data
         """
-        gradients = self.backpropagation(input, output)
+        gradients = self.backpropagation(X, y)
         learning_rate = self.config['learning_rate']
         # Update each parameter with SGD rule
         num_parameters = len(self.parameters)
@@ -30,15 +30,15 @@ def update(self, input=None, output=None):
             # Update weight
             self.parameters[m] -= learning_rate * gradients[m]
 
-    def log_forward(self, input):
+    def log_forward(self, X):
 
         # Get parameters and sizes
         W_e, W_x, W_h, W_y = self.parameters
         hidden_size = W_h.shape[0]
-        nr_steps = input.shape[0]
+        nr_steps = X.shape[0]
 
         # Embedding layer
-        z_e = W_e[input, :]
+        z_e = W_e[X, :]
 
         # Recurrent layer
         h = np.zeros((nr_steps + 1, hidden_size))
@@ -48,7 +48,7 @@ def log_forward(self, input):
             z_t = W_x.dot(z_e[t, :]) + W_h.dot(h[t, :])
 
             # Non-linear
-            h[t+1, :] = 1.0 / (1 + np.exp(-z_t))
+            h[t + 1, :] = 1.0 / (1 + np.exp(-z_t))
 
         # Output layer
         y = h[1:, :].dot(W_y.T)
@@ -56,25 +56,25 @@ def log_forward(self, input):
         # Softmax
         log_p_y = y - logsumexp(y, axis=1, keepdims=True)
 
-        return log_p_y, y, h, z_e, input
+        return log_p_y, y, h, z_e, X  # input indices are also returned so backpropagation can reuse them
-    def backpropagation(self, input, output):
+    def backpropagation(self, X, y):
         '''
         Compute gradientes, with the back-propagation method
         inputs:
-            x: vector with the (embedding) indicies of the words of a
+            X: vector with the (embedding) indices of the words of a
                 sentence
-            outputs: vector with the indicies of the tags for each word of
+            y: vector with the indices of the tags for each word of
                 the sentence
         outputs:
             gradient_parameters: vector with parameters gradientes
         '''
 
         # Get parameters and sizes
         W_e, W_x, W_h, W_y = self.parameters
-        nr_steps = input.shape[0]
+        nr_steps = X.shape[0]
 
-        log_p_y, y, h, z_e, x = self.log_forward(input)
+        log_p_y, _, h, z_e, x = self.log_forward(X)  # discard pre-softmax scores so the y argument is not shadowed
         p_y = np.exp(log_p_y)
 
         # Initialize gradients with zero entrances
@@ -87,7 +87,7 @@ def backpropagation(self, input, output):
         # Solution to Exercise 6.1
 
         # Gradient of the cost with respect to the last linear model
-        I = index2onehot(output, W_y.shape[0])
+        I = index2onehot(y, W_y.shape[0])
         error = - (I - p_y) / nr_steps
 
         # backward pass, with gradient computation
@@ -119,8 +119,8 @@ def backpropagation(self, input, output):
 
         return gradient_parameters
 
-    def cross_entropy_loss(self, input, output):
+    def cross_entropy_loss(self, X, y):
         """Cross entropy loss"""
-        nr_steps = input.shape[0]
-        log_probability = self.log_forward(input)[0]
-        return -log_probability[range(nr_steps), output].mean()
+        nr_steps = X.shape[0]
+        log_probability = self.log_forward(X)[0]
+        return -log_probability[range(nr_steps), y].mean()
diff --git a/lxmls/deep_learning/pytorch_models/log_linear.py b/lxmls/deep_learning/pytorch_models/log_linear.py
index 721fbd60..79e6d8a5 100644
--- a/lxmls/deep_learning/pytorch_models/log_linear.py
+++ b/lxmls/deep_learning/pytorch_models/log_linear.py
@@ -20,14 +20,14 @@ def __init__(self, **config):
         self.log_softmax = torch.nn.LogSoftmax(dim=1)
         self.loss_function = torch.nn.NLLLoss()
 
-    def _log_forward(self, input=None):
+    def _log_forward(self, X):
         """Forward pass of the computation graph in logarithm domain (pytorch)"""
 
         # IMPORTANT: Cast to pytorch format
-        input = torch.from_numpy(input).float()
+        X = torch.from_numpy(X).float()
 
         # Linear transformation
-        z = torch.matmul(input, torch.t(self.weight)) + self.bias
+        z = torch.matmul(X, torch.t(self.weight)) + self.bias
 
         # Softmax implemented in log domain
         log_tilde_z = self.log_softmax(z)
@@ -35,19 +35,19 @@ def _log_forward(self, input=None):
         # NOTE that this is a pytorch class!
         return log_tilde_z
 
-    def predict(self, input=None):
+    def predict(self, X):
         """Most probable class index"""
-        log_forward = self._log_forward(input).data.numpy()
+        log_forward = self._log_forward(X).data.numpy()
         return np.argmax(log_forward, axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """Stochastic Gradient Descent update"""
 
         # IMPORTANT: Class indices need to be casted to LONG
-        true_class = torch.from_numpy(output).long()
+        true_class = torch.from_numpy(y).long()
 
         # Compute negative log-likelihood loss
-        loss = self.loss_function(self._log_forward(input), true_class)
+        loss = self.loss_function(self._log_forward(X), true_class)
 
         # Use autograd to compute the backward pass.
         loss.backward()
diff --git a/lxmls/deep_learning/pytorch_models/mlp.py b/lxmls/deep_learning/pytorch_models/mlp.py
index 5dce865c..0203fa44 100755
--- a/lxmls/deep_learning/pytorch_models/mlp.py
+++ b/lxmls/deep_learning/pytorch_models/mlp.py
@@ -33,16 +33,15 @@ def __init__(self, **config):
         self.loss_function = torch.nn.NLLLoss()
 
     # TODO: Move these outside fo the class as in the numpy case
-    def _log_forward(self, input):
+    def _log_forward(self, X):
         """
         Forward pass
         """
 
         # Ensure the type matches torch type
-        input = cast_float(input)
+        X = cast_float(X)
 
-        # Input
-        tilde_z = input
+        tilde_z = X
 
         # ----------
         # Solution to Exercise 6.4
@@ -71,15 +70,15 @@ def _log_forward(self, input):
         return log_tilde_z
 
-    def gradients(self, input, output):
+    def gradients(self, X, y):
         """
         Computes the gradients of the network with respect to cross entropy
         error cost
         """
-        true_class = torch.from_numpy(output).long()
+        true_class = torch.from_numpy(y).long()
 
         # Compute negative log-likelihood loss
-        _log_forward = self._log_forward(input)
+        _log_forward = self._log_forward(X)
         loss = self.loss_function(_log_forward, true_class)
 
         # Use autograd to compute the backward pass.
         loss.backward()
@@ -90,18 +89,18 @@ def gradients(self, input, output):
             nabla_parameters.append([weight.grad.data, bias.grad.data])
 
         return nabla_parameters
 
-    def predict(self, input=None):
+    def predict(self, X):
         """
         Predict model outputs given input
         """
-        log_forward = self._log_forward(input).data.numpy()
+        log_forward = self._log_forward(X).data.numpy()
         return np.argmax(log_forward, axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """
         Update model parameters given batch of data
         """
-        gradients = self.gradients(input, output)
+        gradients = self.gradients(X, y)
         learning_rate = self.config['learning_rate']
         # Update each parameter with SGD rule
         for m in range(self.num_layers):
diff --git a/lxmls/deep_learning/pytorch_models/rnn.py b/lxmls/deep_learning/pytorch_models/rnn.py
index edca0f6f..c0720016 100644
--- a/lxmls/deep_learning/pytorch_models/rnn.py
+++ b/lxmls/deep_learning/pytorch_models/rnn.py
@@ -55,18 +55,18 @@ def __init__(self, **config):
             # Get weigths and bias of the layer (even and odd positions)
             self.parameters[index] = cast_float(self.parameters[index])
 
-    def predict(self, input=None):
+    def predict(self, X):
         """
-        Predict model outputs given input
+        Predict model outputs given X
         """
-        log_p_y = self._log_forward(input).data.numpy()
+        log_p_y = self._log_forward(X).data.numpy()
         return np.argmax(log_p_y, axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """
         Update model parameters given batch of data
         """
-        gradients = self.backpropagation(input, output)
+        gradients = self.backpropagation(X, y)
         learning_rate = self.config['learning_rate']
         # Update each parameter with SGD rule
         num_parameters = len(self.parameters)
@@ -74,19 +74,19 @@ def update(self, input=None, output=None):
             # Update weight
             self.parameters[m].data -= learning_rate * gradients[m]
 
-    def _log_forward(self, input):
+    def _log_forward(self, X):
         """
         Forward pass
         """
 
         # Ensure the type matches torch type
-        input = cast_int(input, grad=False)
+        X = cast_int(X, grad=False)
 
         # Get parameters and sizes
         W_e, W_x, W_h, W_y = self.parameters
         embedding_size, vocabulary_size = W_e.shape
         hidden_size = W_h.shape[0]
-        nr_steps = input.shape[0]
+        nr_steps = X.shape[0]
 
         # FORWARD PASS COMPUTATION GRAPH
@@ -94,7 +94,7 @@ def _log_forward(self, input):
         # Solution to Exercise 6.2
 
         # Word Embeddings
-        z_e = self.embedding_layer(input)
+        z_e = self.embedding_layer(X)
 
         # Recurrent layer
         h = torch.zeros(1, hidden_size)
@@ -122,14 +122,14 @@ def _log_forward(self, input):
         return log_p_y
 
-    def backpropagation(self, input, output):
+    def backpropagation(self, X, y):
         """
         Computes the gradients of the network with respect to cross entropy
         error cost
         """
 
         # Ensure the type matches torch type
-        output = cast_int(output, grad=False)
+        y = cast_int(y, grad=False)
 
         # Zero gradients
         for parameter in self.parameters:
             parameter.grad.data.zero_()
 
         # Compute negative log-likelihood loss
-        log_p_y = self._log_forward(input)
-        cost = self.loss(log_p_y, output)
+        log_p_y = self._log_forward(X)
+        cost = self.loss(log_p_y, y)
 
         # Use autograd to compute the backward pass.
         cost.backward()
@@ -196,19 +196,19 @@ def __init__(self, **config):
             [cast_float(self.parameters[-1])]
         )
 
-    def predict(self, input=None):
+    def predict(self, X):
         """
-        Predict model outputs given input
+        Predict model outputs given X
         """
-        log_p_y = self._log_forward(input).data.numpy()
+        log_p_y = self._log_forward(X).data.numpy()
         return np.argmax(log_p_y, axis=1)
 
-    def update(self, input=None, output=None):
+    def update(self, X, y):
         """
         Update model parameters given batch of data
         """
-        gradients = self.backpropagation(input, output)
+        gradients = self.backpropagation(X, y)
         learning_rate = self.config['learning_rate']
         # Update each parameter with SGD rule
         num_parameters = len(self.parameters)
@@ -216,13 +216,13 @@ def update(self, input=None, output=None):
             # Update weight
             self.parameters[m].data -= learning_rate * gradients[m]
 
-    def _log_forward(self, input):
+    def _log_forward(self, X):
         """
         Forward pass
         """
 
         # Ensure the type matches torch type
-        input = cast_int(input)
+        X = cast_int(X)
 
         # Get parameters and sizes
         W_e, W_x, W_h, W_y = self.parameters
@@ -231,7 +231,7 @@ def _log_forward(self, input):
         # FORWARD PASS COMPUTATION GRAPH
 
         # Word Embeddings
-        z_e = self.embedding_layer(input)
+        z_e = self.embedding_layer(X)
 
         # RNN
         h, _ = self.rnn(z_e[:, None, :])
@@ -244,12 +244,12 @@ def _log_forward(self, input):
         return log_p_y
 
-    def backpropagation(self, input, output):
+    def backpropagation(self, X, y):
         """
         Computes the gradients of the network with respect to cross entropy
         error cost
         """
-        output = cast_int(output, grad=False)
+        y = cast_int(y, grad=False)
 
         # Zero gradients
         for parameter in self.parameters:
             parameter.grad.data.zero_()
 
         # Compute negative log-likelihood loss
-        log_p_y = self._log_forward(input)
-        cost = self.loss(log_p_y, output)
+        log_p_y = self._log_forward(X)
+        cost = self.loss(log_p_y, y)
 
         # Use autograd to compute the backward pass.
         cost.backward()
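
A note on the predict() changes above: dropping np.exp before np.argmax in the numpy and pytorch models is safe because exp is strictly increasing, so the most probable class is the same whether it is read off the probabilities or the log-probabilities. A minimal self-contained check (plain NumPy, no repository code involved):

    import numpy as np

    # Log-probabilities for two examples over three classes
    log_p = np.log(np.array([[0.1, 0.7, 0.2],
                             [0.5, 0.3, 0.2]]))

    # exp is monotonically increasing, so the argmax is unchanged in the log domain
    assert np.array_equal(np.argmax(np.exp(log_p), axis=1),
                          np.argmax(log_p, axis=1))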
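
Since the rename touches every public signature, a quick end-to-end check of the new (X, y) interface is worth running. The sketch below does a central finite-difference test against the renamed backpropagation(X, y) and cross_entropy_loss(X, y) methods of the numpy MLP; the class name NumpyMLP and the config keys passed to it are assumptions for illustration and may need to be adjusted to whatever the constructor actually expects:

    import numpy as np
    from lxmls.deep_learning.numpy_models.mlp import NumpyMLP  # assumed class name

    # Assumed config keys; adjust to the real MLP configuration
    model = NumpyMLP(input_size=20, hidden_sizes=[10], num_classes=3,
                     learning_rate=0.1)

    rng = np.random.RandomState(0)
    X = rng.rand(5, 20)               # batch of 5 feature vectors
    y = rng.randint(0, 3, size=5)     # one class index per example

    # Analytic gradient of the first layer's weight at entry (0, 0)
    analytic = model.backpropagation(X, y)[0][0][0, 0]

    # Central finite difference of the loss with respect to the same entry
    epsilon = 1e-6
    weight = model.parameters[0][0]
    weight[0, 0] += epsilon
    loss_plus = model.cross_entropy_loss(X, y)
    weight[0, 0] -= 2 * epsilon
    loss_minus = model.cross_entropy_loss(X, y)
    weight[0, 0] += epsilon           # restore the original value
    numeric = (loss_plus - loss_minus) / (2 * epsilon)

    assert abs(numeric - analytic) < 1e-5

If a few entries pass this check, the renamed interface is wired up consistently end to end; the numpy RNN exposes the same pair of methods and can be tested the same way.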