import os
"CUDA_VISIBLE_DEVICES"] = ""
os.environ[
import random
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.initializers import Constant
Lab: Backpropagation
ACTL3143 & ACTL5111 Deep Learning for Actuaries
Backpropagation performs a backward pass through the network to compute the gradient of the loss with respect to every weight. Combined with gradient descent, these gradients are then used to update the neural network's weights.
Linear Regression via Batch Gradient Descent
Let $\boldsymbol{\beta}^{(t)} = \left(\beta_0^{(t)}, \beta_1^{(t)}\right)$ be the parameter estimates at the $t$-th iteration. Let $\{(x_i, y_i)\}_{i=1}^{n}$ represent the training batch. Let the mean squared error (MSE) be the loss/cost function $L(\boldsymbol{\beta}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \beta_0 - \beta_1 x_i\right)^2$.
Finding the Gradients
- Step 1: Write down $\hat{y}_i = \beta_0 + \beta_1 x_i$ and $L(\boldsymbol{\beta}) = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2$.
- Step 2: Derive $\frac{\partial L}{\partial \beta_0}$ and $\frac{\partial L}{\partial \beta_1}$.
- Step 3: Derive the gradient $\nabla L(\boldsymbol{\beta})$.
Then, we initialise $\boldsymbol{\beta}^{(0)}$ and apply gradient descent for $t = 0, 1, 2, \dots$ using the derivatives derived in Equation 1 and Equation 2:
$$\beta_j^{(t+1)} = \beta_j^{(t)} - \eta \left.\frac{\partial L}{\partial \beta_j}\right|_{\boldsymbol{\beta} = \boldsymbol{\beta}^{(t)}}, \qquad j = 0, 1,$$
where $\eta$ is a chosen learning rate.
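Explicitly, differentiating the MSE loss gives
$$\frac{\partial L}{\partial \beta_0} = -\frac{2}{n}\sum_{i=1}^{n}\left(y_i - \beta_0 - \beta_1 x_i\right), \qquad \frac{\partial L}{\partial \beta_1} = -\frac{2}{n}\sum_{i=1}^{n}\left(y_i - \beta_0 - \beta_1 x_i\right)x_i,$$
which (up to the convention chosen for the $\frac{1}{n}$ factor) is what Equations 1 and 2 refer to; these derivatives are evaluated at $\boldsymbol{\beta} = \boldsymbol{\beta}^{(t)}$ in the update above.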
Exercise
- Use the backpropagation/batch gradient descent algorithm to find the estimates $\hat{\beta}_0$ and $\hat{\beta}_1$, with a chosen learning rate $\eta$ and starting values $\boldsymbol{\beta}^{(0)}$. The dataset is given in the lab notebook; the true model is linear, i.e., of the form $y = \beta_0 + \beta_1 x$. Implement batch gradient descent (a sketch is given below).
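A minimal sketch of this exercise, assuming placeholder data: the actual dataset and true coefficients are in the lab notebook, so the values 2 and 3 below, the learning rate and the iteration count are illustrative assumptions only.

# Batch gradient descent for simple linear regression (illustrative sketch)
import numpy as np

rng = np.random.default_rng(0)
x_lin = rng.normal(size=100)
y_lin = 2.0 + 3.0 * x_lin + rng.normal(scale=0.1, size=100)  # placeholder "true" model

beta0_hat, beta1_hat = 0.0, 0.0  # initial estimates
eta = 0.1  # learning rate
for _ in range(1000):
    y_pred = beta0_hat + beta1_hat * x_lin
    # Derivatives of the MSE loss with respect to each parameter
    d_beta0 = -2 * np.mean(y_lin - y_pred)
    d_beta1 = -2 * np.mean((y_lin - y_pred) * x_lin)
    # Batch gradient descent update
    beta0_hat -= eta * d_beta0
    beta1_hat -= eta * d_beta1

print(beta0_hat, beta1_hat)  # should be close to the placeholder intercept and slope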
Neural Network
For a neural network with $L$ hidden layers:
- $\boldsymbol{a}^{(0)}$ is the input layer (the zeroth hidden layer). $\boldsymbol{a}^{(\ell)}$ represents the $\ell$-th hidden layer for $\ell = 1, \dots, L$. $\boldsymbol{a}^{(L+1)}$ is the output layer (the $(L+1)$-th hidden layer).
- $g^{(\ell)}$ represents the activation function for the $\ell$-th hidden layer, with $\boldsymbol{a}^{(\ell)} = g^{(\ell)}\!\left(\boldsymbol{z}^{(\ell)}\right)$ applied element-wise. $g^{(L+1)}$ represents the activation function for the output layer.
- $\boldsymbol{w}^{(\ell)}_{j}$ represents the weights connecting the activated neurons from the $(\ell-1)$-th hidden layer to the $j$-th neuron in the $\ell$-th hidden layer, where $\ell = 1, \dots, L+1$ and $j = 1, \dots, n_{\ell}$, i.e., $n_{\ell}$ denotes the number of neurons in the $\ell$-th hidden layer. $n_0$ is the number of input features by definition.
- $b^{(\ell)}_{j}$ represents the bias for the $j$-th neuron in the $\ell$-th hidden layer.
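With this notation, the forward pass computes, for $\ell = 1, \dots, L+1$ and $j = 1, \dots, n_{\ell}$,
$$z^{(\ell)}_{j} = \left\langle \boldsymbol{w}^{(\ell)}_{j}, \boldsymbol{a}^{(\ell-1)} \right\rangle + b^{(\ell)}_{j}, \qquad a^{(\ell)}_{j} = g^{(\ell)}\!\left(z^{(\ell)}_{j}\right),$$
which is exactly what the forward-pass code below implements (with the biases set to zero).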
Gradients For the Output Layer
The gradient for $\boldsymbol{w}^{(L+1)}_{1}$, i.e., the weights connecting the neurons in the $L$-th (last) hidden layer to the first neuron of the output layer, is given by:
$$\frac{\partial \text{Loss}}{\partial \boldsymbol{w}^{(L+1)}_{1}} = \delta^{(L+1)}_{1}\,\boldsymbol{a}^{(L)},$$
where
- $\delta^{(L+1)}_{1} = \frac{\partial \text{Loss}}{\partial z^{(L+1)}_{1}}$ with $z^{(L+1)}_{1} = \left\langle \boldsymbol{w}^{(L+1)}_{1}, \boldsymbol{a}^{(L)} \right\rangle + b^{(L+1)}_{1}$.
- $\langle \cdot, \cdot \rangle$ represents the inner product.
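This is just the chain rule: since $z^{(L+1)}_{1}$ is linear in $\boldsymbol{w}^{(L+1)}_{1}$,
$$\frac{\partial \text{Loss}}{\partial \boldsymbol{w}^{(L+1)}_{1}} = \frac{\partial \text{Loss}}{\partial z^{(L+1)}_{1}} \cdot \frac{\partial z^{(L+1)}_{1}}{\partial \boldsymbol{w}^{(L+1)}_{1}} = \delta^{(L+1)}_{1}\,\boldsymbol{a}^{(L)}.$$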
Example 1
- From the input layer $\boldsymbol{x} = \boldsymbol{a}^{(0)}$ to the first hidden layer: $z^{(1)}_{j} = \left\langle \boldsymbol{w}^{(1)}_{j}, \boldsymbol{x} \right\rangle$ and $a^{(1)}_{j} = g^{(1)}\!\left(z^{(1)}_{j}\right)$ for $j = 1, 2$.
- From the first hidden layer to the output layer: $z^{(2)}_{1} = \left\langle \boldsymbol{w}^{(2)}_{1}, \boldsymbol{a}^{(1)} \right\rangle$ and $\hat{y} = a^{(2)}_{1} = g^{(2)}\!\left(z^{(2)}_{1}\right)$.
- $g^{(1)}(z) = \sigma(z) = \frac{1}{1 + e^{-z}}$ (sigmoid function) and $g^{(2)}(z) = e^{z}$ (exponential function).
Let $\hat{\boldsymbol{w}}^{(t)}$ denote the parameter estimates at the $t$-th iteration. For illustration, we assume the bias terms are all zeros.
- For $\hat{\boldsymbol{w}}^{(2)}_{1}$, apply Equation 3.
- For $\hat{\boldsymbol{w}}^{(1)}_{1}$, apply Equation 4.
- For $\hat{\boldsymbol{w}}^{(1)}_{2}$, apply Equation 4. (The resulting expressions are written out below.)
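Written out for this example, and assuming a squared-error loss $\left(\hat{y} - y\right)^2$ per observation (the loss used in the code below), the deltas are
$$\delta^{(2)}_{1} = 2\left(\hat{y} - y\right)e^{z^{(2)}_{1}}, \qquad \delta^{(1)}_{j} = \left(\boldsymbol{w}^{(2)}_{1}\right)_{j}\,\delta^{(2)}_{1}\,\sigma\!\left(z^{(1)}_{j}\right)\left(1 - \sigma\!\left(z^{(1)}_{j}\right)\right), \quad j = 1, 2,$$
where $\left(\boldsymbol{w}^{(2)}_{1}\right)_{j}$ is the $j$-th component of $\boldsymbol{w}^{(2)}_{1}$. The corresponding gradients are $\delta^{(2)}_{1}\,\boldsymbol{a}^{(1)}$ for $\boldsymbol{w}^{(2)}_{1}$ and $\delta^{(1)}_{j}\,\boldsymbol{x}$ for $\boldsymbol{w}^{(1)}_{j}$; these correspond to delta2_1, delta1_1, delta1_2 and d2_1_hat, d1_1_hat, d1_2_hat in the from-scratch code.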
Implementing Backpropagation in Python
See Week_4_Lab_Notebook.ipynb for more details. The required packages/functions are imported at the top of this document. The remaining setup (true weights, synthetic data and the sigmoid activation function) is as follows:
True weights:
w1_1 = np.array([[0.25], [0.5], [0.75]])
w1_2 = np.array([[0.75], [0.5], [0.25]])
w2_1 = np.array([[2.0], [3.0]])
Some synthetic data to work with:
# Generate 10000 random observations of 3 numerical features
np.random.seed(0)
X = np.random.randn(10000, 3)
# Sigmoid activation function
def sigmoid(z):
return(1/(1+np.exp(-z)))
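Note that the derivative of the sigmoid is $\sigma'(z) = \sigma(z)\left(1 - \sigma(z)\right)$; this is the factor that appears as sigmoid(z) * (1 - sigmoid(z)) in the delta computations further below.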
# Hidden Layer 1
z1_1 = X @ w1_1  # The first neuron before activation
z1_2 = X @ w1_2  # The second neuron before activation
a1_1 = sigmoid(z1_1)  # The first neuron after activation
a1_2 = sigmoid(z1_2)  # The second neuron after activation

# Output Layer
z2_1 = np.concatenate((a1_1, a1_2), axis=1) @ w2_1  # Pre-activation of the output
a2_1 = np.exp(z2_1)  # Output

# The actual values
y = a2_1
From Scratch
# Initialised weights
w1_1_hat = np.array([[0.2], [0.6], [1.0]])
w1_2_hat = np.array([[0.4], [0.8], [1.2]])
w2_1_hat = np.array([[1.0], [2.0]])
losses = []
num_iterations = 5000
for _ in range(num_iterations):
    # Compute Forward Passes
    # Hidden Layer 1
    z1_1_hat = X @ w1_1_hat  # The first neuron before activation
    z1_2_hat = X @ w1_2_hat  # The second neuron before activation
    a1_1_hat = sigmoid(z1_1_hat)  # The first neuron after activation
    a1_2_hat = sigmoid(z1_2_hat)  # The second neuron after activation
    a1_hat = np.concatenate((a1_1_hat, a1_2_hat), axis=1)

    # Output Layer
    z2_1_hat = a1_hat @ w2_1_hat  # The output before activation
    y_hat = np.exp(z2_1_hat).reshape(len(y), 1)  # The output

    # Track the Losses
    loss = (y_hat - y)**2
    losses.append(np.mean(loss))

    # Compute Deltas
    delta2_1 = 2 * (y_hat - y) * np.exp(z2_1_hat)
    delta1_1 = w2_1_hat[0] * delta2_1 * sigmoid(z1_1_hat) * (1 - sigmoid(z1_1_hat))
    delta1_2 = w2_1_hat[1] * delta2_1 * sigmoid(z1_2_hat) * (1 - sigmoid(z1_2_hat))

    # Compute Gradients
    d2_1_hat = delta2_1 * a1_hat
    d1_1_hat = delta1_1 * X
    d1_2_hat = delta1_2 * X

    # Learning Rate
    eta = 0.0005

    # Apply Batch Gradient Descent
    w2_1_hat -= eta * np.mean(d2_1_hat, axis=0).reshape(2, 1)
    w1_1_hat -= eta * np.mean(d1_1_hat, axis=0).reshape(3, 1)
    w1_2_hat -= eta * np.mean(d1_2_hat, axis=0).reshape(3, 1)
print(w1_1_hat)
print(w1_2_hat)
print(w2_1_hat)
[[0.24985576]
[0.5000211 ]
[0.75018656]]
[[0.74987578]
[0.49998626]
[0.25009692]]
[[1.99874327]
[3.00125615]]
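The losses list tracked inside the loop can be used to check that training has converged. A possible sketch, assuming matplotlib is installed (it is not among the imports above):

import matplotlib.pyplot as plt

plt.plot(losses)  # MSE recorded at each iteration
plt.xlabel("Iteration")
plt.ylabel("Mean squared error")
plt.show()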
From Keras
# An initialiser for the weights in the neural network
init1 = Constant([[0.2, 0.4], [0.6, 0.8], [1.0, 1.2]])
init2 = Constant([[1.0, 2.0]])
# Build a neural network
# `use_bias` (whether to include bias terms for the neurons or not) is True by default
# `kernel_initializer` adjusts the initialisations of the weights
x = Input(shape=X.shape[1:], name="Inputs")
a1 = Dense(2, "sigmoid", use_bias=False,
           kernel_initializer=init1)(x)
y_hat = Dense(1, "exponential", use_bias=False,
              kernel_initializer=init2)(a1)
model = Model(x, y_hat)
# Choosing the optimiser and the loss function
="adam", loss="mse")
model.compile(optimizer
# Model Training
# We don't implement early stopping to make the results comparable to the previous section
hist = model.fit(X, y, epochs=5000, verbose=0, batch_size=len(y))
# Print out the weights
print(model.get_weights())
[array([[0.3025748 , 0.80548114],
[0.49333417, 0.5067073 ],
[0.6842524 , 0.2076197 ]], dtype=float32), array([[2.5133712, 2.5152776],
[2.4867477, 2.4848893]], dtype=float32)]
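Because Adam and plain batch gradient descent follow different optimisation paths, the fitted Keras weights need not match the true weights (or the from-scratch estimates) even when the fit itself is good, so comparing the final training losses is a fairer check. A possible comparison, using the hist and losses objects created above:

print("From scratch, final MSE:", losses[-1])
print("Keras, final MSE:", hist.history["loss"][-1])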