Multi-layer perceptron

Like pancakes, neural network layers are made to be stacked on top of each other. We can feed the output of one layer as the input of the next layer, called a hidden layer. A hidden layer consists of a linear combination of its inputs to which an activation function is applied. This produces a new hidden vector, which we can in turn pass to the following hidden layer, at each step recombining the outputs of the previous layer with a new set of weights and applying an activation function.
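Before writing any classes, here is a minimal sketch of this idea with plain matrix operations (the names X, W1, W2 and the layer sizes are illustrative assumptions, not part of the implementation that follows):

# Illustrative only: one hidden layer followed by an output layer
set.seed(1)
X  <- matrix(runif(8), ncol = 2)        # 4 examples, 2 input features
W1 <- matrix(runif(2 * 3), ncol = 3)    # input-to-hidden weights (2 x 3)
W2 <- matrix(runif(3), ncol = 1)        # hidden-to-output weights (3 x 1)

h <- 1 / (1 + exp(-(X %*% W1)))         # hidden layer: linear combination + activation
y <- 1 / (1 + exp(-(h %*% W2)))         # output layer: same recipe applied again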

Let's start by introducing the sigmoid function, which will be useful later:

library(R6)

sigmoid <- function(x){
  1/(1 + exp(-x))
}
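As a quick check, sigmoid(0) is 0.5, and the derivative of the sigmoid can be written in terms of the function itself, a property that will reappear in the backward pass. A small sketch comparing it against a numerical estimate (the point x = 0.7 is an arbitrary choice):

sigmoid(0)                                 # 0.5
x <- 0.7
(sigmoid(x + 1e-6) - sigmoid(x)) / 1e-6    # numerical derivative, approximately...
sigmoid(x) * (1 - sigmoid(x))              # ...sigma(x) * (1 - sigma(x))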

The skeleton of the class is now:

MLP <- R6Class("MLP", 
public = list(
dim = NULL,
n_iter = NULL,
learning_rate = NULL,
hidden_layer_size=NULL,
Wih = NULL,
Who = NULL,
a = NULL,
initialize = function(learning_rate = 0.3,
n_iter=NA,
dim=NA,
hidden_layer_size=NA){
#INITIALIZATION CODE
}
, forward = function(x){
#Input: training vector
#Output: Class
}
, backward = function(t,y,X){

# Input: Target, prediction and matrix of training examples
# No output, gradients are modified in the class
`
}

, train = function(X,t){
# Run the training loop: forward and backward propagation for n_iter
}
, predict = function(X){
# Call to the forward function for all training examples
}
)
)

The forward step is usually the simplest; in our case it is:

, forward = function(x){
  h <- as.matrix(x) %*% self$Wih     # Linear combination in the hidden layer
  self$a <- sigmoid(h)               # Hidden activations, stored for backpropagation
  y <- sigmoid(self$a %*% self$Who)  # Output of the network
  return(y)
}
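As a rough sketch of the shapes involved (assuming, for concreteness, four training examples, two input features and four hidden units, which are the values we will use later): x is 4 x 2 and self$Wih is 2 x 4, so the hidden activation self$a is 4 x 4; with self$Who of size 4 x 1, the output y is a 4 x 1 column of values between 0 and 1.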

The backward step is the tricky part here. One error component comes from the derivative of the loss function with respect to the output layer; this is called layer2_delta in the code. The other component comes from propagating that error back through the hidden-to-output weights, and is called layer1_delta. To turn these deltas into weight updates, each one is multiplied by the activations that fed into the corresponding layer: the input matrix X for layer1_delta and the hidden activations a for layer2_delta.
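In formulas (a sketch, assuming the squared-error loss that the update rule below implies; $\odot$ denotes the element-wise product):

$$\delta_2 = (t - y) \odot y(1 - y), \qquad \delta_1 = \bigl(\delta_2\, W_{ho}^{\top}\bigr) \odot a(1 - a)$$

$$W_{ih} \leftarrow W_{ih} + \eta\, X^{\top}\delta_1, \qquad W_{ho} \leftarrow W_{ho} + \eta\, a^{\top}\delta_2$$

where $\eta$ is the learning rate.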

, backward = function(t, y, X){

  # Compute the error in the output layer
  layer2_error <- t - y
  layer2_delta <- layer2_error * (y * (1 - y))

  # Propagate the error back to the hidden layer
  layer1_error <- layer2_delta %*% t(self$Who)
  layer1_delta <- layer1_error * self$a * (1 - self$a)

  # Adjustments of the weights
  layer1_adjustment <- t(X) %*% layer1_delta
  layer2_adjustment <- t(self$a) %*% layer2_delta

  self$Wih <- self$Wih + self$learning_rate * layer1_adjustment
  self$Who <- self$Who + self$learning_rate * layer2_adjustment
}
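Note that the adjustments are added rather than subtracted: since layer2_error is defined as t - y, the minus sign from the gradient of the squared error is already absorbed into the delta, so adding learning_rate times the adjustment is still an ordinary gradient-descent step.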

The other functions are easier to complete. For completeness, the full code is included here:

MLP <- R6Class("MLP", 
public = list(
dim = NULL,
n_iter = NULL,
learning_rate = NULL,
hidden_layer_size=NULL,
Wih = NULL,
Who = NULL,
a = NULL,
initialize = function(learning_rate = 0.3,
n_iter=NA,
dim=NA,
hidden_layer_size=NA){
self$dim <- dim
self$n_iter <- n_iter
self$learning_rate <- learning_rate
self$hidden_layer_size <- hidden_layer_size
self$Wih <- matrix(runif(self$hidden_layer_size*self$dim),
ncol = self$hidden_layer_size)
self$Who <- matrix(runif((self$hidden_layer_size)), ncol = 1)
self$a <- matrix(runif(self$hidden_layer_size*self$dim), ncol = self$dim)
}
, forward = function(x){
h <- as.matrix(x)%*%self$Wih
self$a <- sigmoid(h)
y <- sigmoid(self$a %*% self$Who) #Output of the network
return(y)
}
, backward = function(t,y,X){

# Compute the error in the output layer
layer2_error <- t-y
layer2_delta <- (layer2_error)*(y*(1-y))

#Compute the error in the input layer
layer1_error <- layer2_delta %*% t(self$Who)
layer1_delta <- layer1_error*self$a*(1-self$a)

# Adjustments of the weights
layer1_adjustment <- t(X) %*% layer1_delta
layer2_adjustment <- t(self$a) %*% layer2_delta

self$Wih <- self$Wih+self$learning_rate*layer1_adjustment
self$Who <- self$Who+self$learning_rate*layer2_adjustment

}

, train = function(X,t){
n_examples <- nrow(X)
for(iter in 1:self$n_iter){
preds <- self$forward(X)
self$backward(t,preds, X)
if(iter %% 1000 == 0){
cat("Iteration: ", iter," ")
}

}
}
, predict = function(X){
preds <- self$forward(X)
return(preds)
}
)
)

Let's try our network on the OR function, which, as we saw, is linearly separable:

x1 <- c(0,0,1,1)
x2 <- c(0,1,0,1)
t <- c(0,1,1,1)
X <- as.matrix(data.frame(x1=x1, x2=x2))

Now, let's generate the predicted labels:

clf <- MLP$new(n_iter=5000,dim=ncol(X), hidden_layer_size=4)
clf$train(X,t)
clf$predict(X)
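The raw outputs are values between 0 and 1 rather than hard labels. To compare them against t, one option (a choice on our part, not something the class enforces) is to threshold at 0.5:

round(clf$predict(X))   # should recover 0, 1, 1, 1 for the OR function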

Well, that's all very nice, but certainly not much of an advantage, right? We already knew that this simple training example was correctly solved by the perceptron. 

To really go one step further, let's consider the following data (the XOR function):

xor <- data.frame(x1=c(0,0,1,1), x2=c(0,1,0,1), t = c(0,1,1,0))
clf$train(xor[,1:2],xor[,3])
clf$predict(xor[,1:2])
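Note that clf has already been trained on the OR data, so this call keeps refining the same weights. If we prefer to start from scratch for the XOR problem, a fresh instance can be created first (clf_xor is just an illustrative name, with the same constructor arguments as before):

clf_xor <- MLP$new(n_iter = 5000, dim = 2, hidden_layer_size = 4)
clf_xor$train(as.matrix(xor[, 1:2]), xor[, 3])
clf_xor$predict(as.matrix(xor[, 1:2]))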

We know that this is not linearly separable, so let's visualize what the network has learned:

library(ggplot2)
grid_size <- 1e2
grid <- data.frame(V1=0,V2=0)
base <- seq(0,1,1/grid_size)

Let's generate a grid to which we will apply the decision function:

for(j in 1:grid_size){
  V1 <- rep(base[j], grid_size + 1)
  V2 <- base
  tmp <- data.frame(V1 = V1, V2 = V2)
  grid <- rbind(tmp, grid)
}
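As an aside, base R's expand.grid builds essentially the same grid in one line; the exact set of points and their order differ slightly from the loop above, so treat it as an alternative rather than an addition:

# Alternative to the loop: a full (grid_size + 1)^2 lattice over [0, 1] x [0, 1]
grid <- expand.grid(V1 = base, V2 = base)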

Now, let's finally plot the evaluation of this function on the grid:

grid$z <- as.numeric(with(grid, clf$predict(cbind(V1, V2))))
ggplot(grid,aes(x=V1,y=V2))+geom_tile(aes(fill=z))+theme_bw()

We see that the multi-layer perceptron does a better job on this data, and is able to correctly recover the separation boundary between the two regions.
