
Multi-Layer Network

This tutorial walks through the construction of a three-layer network that learns to map a 1000-dimensional data point to a 10-dimensional label. It details the following:

  • A raw implementation of a three-layer network, in this case all fully connected layers, using just numpy
  • Manual gradient computation for backpropagation with gradient descent
  • Weight initialization techniques to counter vanishing gradient issues (a brief sketch follows this list)
  • Use of PyTorch tensors and modules for building and training
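
On the initialization point: a common remedy for vanishing (or exploding) activations in ReLU networks is to scale each weight matrix by the fan-in of its layer, often called He initialization. Below is a minimal sketch, assuming the same layer sizes as the numpy example that follows; it is illustrative and not part of the tutorial's code files.

# He-style initialization sketch (illustrative, not part of numpy_linear_model.py)
import numpy as np

data_dim, H_1, H_2, label_dim = 1000, 100, 100, 10

w_1 = np.random.randn(data_dim, H_1) * np.sqrt(2.0 / data_dim)  # scale by sqrt(2 / fan_in)
w_2 = np.random.randn(H_1, H_2) * np.sqrt(2.0 / H_1)
w_3 = np.random.randn(H_2, label_dim) * np.sqrt(2.0 / H_2)

With this scaling the activations keep a roughly constant variance from layer to layer, which is what keeps the gradients from shrinking toward zero as depth grows.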

Numpy Implementation

# Code in file numpy_linear_model.py
# Example of three-layer network using only numpy

import numpy as np

# Specify the size of your batch, data, hidden layers, and labels
N = 128  # Batch size: How many data points you feed in at a time
data_dim = 1000  # Dimension of the data vector; each data point is a single 1000-dimensional vector
H_1 = 100 # Dimension of first hidden layer
H_2 = 100  # Dimension of second hidden layer
label_dim = 10  # Dimension of the label, the output/answer corresponding to one 1000-dimensional data point
learning_rate = 1e-6

# Create dummy data and labels
x = np.random.randn(N, data_dim)  # Our data in the shape of a 128 X 1000 tensor
y = np.random.randn(N, label_dim)  # Our corresponding labels in the shape of a 128 X 10 tensor

# Now we initialize our weights from a standard normal distribution (the dummy data is also drawn from a standard normal, but that is irrelevant here)
w_1 = np.random.randn(data_dim, H_1)  # First layer weights in the shape of a 1000 X 100 tensor
w_2 = np.random.randn(H_1, H_2)  # Second layer weights in the shape of a 100 X 100 tensor
w_3 = np.random.randn(H_2, label_dim)  # Third layer weights in the shape of a 100 X 10 tensor
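# Note: with unscaled standard-normal weights the activations (and the summed-squares loss) become very large,
# which is why the learning rate above is tiny; the fan-in-scaled initialization sketched in the intro avoids this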

# On to the training
for i in range(1000):
    # Start with the forward pass
    h_1 = x.dot(w_1)  # MatMul between data and weights of the first layer, with shape 128 X 100
    h_1_relu = np.maximum(h_1, 0)  # Non-linear ReLU layer
    h_2 = h_1_relu.dot(w_2)  # Matmul between first hidden layer and second weight layer with shape 128 X 100
    h_2_relu = np.maximum(h_2, 0)  # Non-linear ReLU layer
    y_pred = h_2_relu.dot(w_3)  # Matmul between second hidden layer and third weight layer with shape 128 X 10


    # Use a loss function to see how well it did (in this case we use the residual sum of squares)
    loss = np.square(y_pred - y).sum()  # This is a scalar representing our loss score: the lower, the better
    print(f"Loss is: {loss}")
    print(f"Step is : {i}")

    # Time to backpropagate which will compute the gradients for our weights
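    # Strictly, dloss/dy_pred = 2 * (y_pred - y); the constant factor of 2 is dropped here and effectively absorbed into the learning rate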
    y_pred_gradient = (y_pred - y)  # Derivative of the loss with respect to y_pred: dloss/dy_pred, with shape 128 X 10
    w_3_gradient = h_2_relu.T.dot(y_pred_gradient)  # Chain rule applied through y_pred to get dloss/dw_3, with shape 100 X 10

    h_2_relu_gradient = y_pred_gradient.dot(w_3.T)  # Derivative of the loss with respect to h_2_relu: dloss/dh_2_relu, with shape 128 X 100
    h_2_relu_gradient[h_2 < 0] = 0  # Zero out entries where the ReLU was inactive (derivative of ReLU). shape: 128 X 100
    w_2_gradient = h_1_relu.T.dot(h_2_relu_gradient)  # Chain rule applied to dloss/dh_2_relu to get dloss/dw_2, with shape 100 X 100

    h_1_relu_gradient = h_2_relu_gradient.dot(w_2.T)  # Chain rule continued: dloss/dh_1_relu, with shape 128 X 100
    h_1_relu_gradient[h_1 < 0] = 0  # Zero out entries where the ReLU was inactive (derivative of ReLU). shape: 128 X 100
    w_1_gradient = x.T.dot(h_1_relu_gradient)  # Chain rule applied all the way back to the input to get dloss/dw_1, with shape 1000 X 100

    # Update the weights with a gradient descent step at the specified learning rate
    w_1 -= learning_rate*w_1_gradient
    w_2 -= learning_rate*w_2_gradient
    w_3 -= learning_rate*w_3_gradient


# Helper that reruns the forward pass using the trained weights
def forward_pass():
    h_1 = x.dot(w_1)
    h_1_relu = np.maximum(h_1, 0)
    h_2 = h_1_relu.dot(w_2)
    h_2_relu = np.maximum(h_2, 0)
    y_pred = h_2_relu.dot(w_3)
    return y_pred


print(f"The original label {y[0]}")
print(f"The learned prediction {forward_pass()[0]}")