Implementation as an R6 class

Let's start with the skeleton of this class. Before defining it, we need to load the R6 package (with `library(R6)`) and create two auxiliary functions:

# Convert a vector (or column matrix) of scores into probabilities.
#
# The maximum score is subtracted before exponentiating so that large scores
# do not overflow to Inf; this shift does not change the result.
#
# x: numeric vector or matrix of scores.
# Returns an object of the same shape as `x` whose entries sum to 1.
softmax <- function(x) {
  xt <- exp(x - max(x))
  xt / sum(xt)
}
# Create a zero matrix with the same dimensions as `M`.
#
# M: a matrix, or a plain vector (treated as a one-column matrix).
# Returns a numeric matrix of zeros with the same number of rows and columns.
zeros_like <- function(M) {
  M <- as.matrix(M)
  matrix(0, nrow = nrow(M), ncol = ncol(M))
}

These will come in handy throughout the code: `softmax` turns a vector of scores into probabilities, and `zeros_like` creates a zero matrix with the same dimensions as its argument. As before, our program needs the following basic functions:

  • Forward propagation
  • Backward propagation
  • Sampling from the obtained probability distribution
  • Training the model

The structure of the class should look like:

# Skeleton of the RNN class: the fields and the method signatures only.
# The method bodies are outlines; the complete implementation is given in
# the full code listing further down.
RNN <- R6::R6Class("RNN",
  public = list(
    hidden_size = NULL,
    vocab_size = NULL,
    learning_rate = NULL,
    seq_length = NULL,
    chars = NULL,
    n_iter = NULL,
    # Weight matrices must be declared as fields: R6 objects are locked, so
    # assigning to an undeclared `self$` member fails.
    U = NULL, # input to hidden
    W = NULL, # hidden to hidden
    V = NULL, # hidden to output

    initialize = function(hidden_size = NA, vocab_size = NA,
                          chars = NA, n_iter = 100, seq_length = NA,
                          learning_rate = 0.01) {
      ## Store the hyperparameters and initialize U, W and V
    },

    forward_step = function(input_sample) {
      ## Takes one column vector and returns the softmax output
    },

    bptt = function(inputs, targets, s_prev) {
      seq_size <- length(inputs) # total length of the sequence
      loss <- 0
      for (idx in seq_len(seq_size)) {
        # Forward pass: update the hidden state and predict
      }
      for (j in rev(seq_len(seq_size))) {
        # Backward pass: updates using the gradient
        ## derivatives of error vs output
      }
      # dU, dW, dV and hs are built by the two loops in the full code.
      # The last hidden state is returned so the next chunk can start from it.
      list("loss" = loss, "dU" = dU, "dW" = dW, "dV" = dV, "hs" = hs[seq_size])
    },

    sample_char = function(h, seed_ix, n) {
      ## generate a sample from the model
      ## given a hidden state and an initial seed
    },

    train = function(text) {
      ## Main training loop with Adagrad updates
    }
  )
)

The most interesting part is the `bptt` function, so let us describe it in more detail. When the inputs are received (in this case, a chunk of text), we loop through every character of the chunk and predict the next character from the current character and the hidden state of the previous step. Once we have traversed the whole chunk, we calculate the gradients by walking through it in reverse order. This is done in the backpropagation part of the function.

We observed better performance with Adagrad than with plain stochastic gradient descent, so Adagrad is what we implement in the final part, the `train` function.

The full code is:

# Convert a vector (or column matrix) of scores into probabilities.
#
# The maximum score is subtracted before exponentiating so that large scores
# do not overflow to Inf; this shift does not change the result.
#
# x: numeric vector or matrix of scores.
# Returns an object of the same shape as `x` whose entries sum to 1.
softmax <- function(x) {
  xt <- exp(x - max(x))
  xt / sum(xt)
}
# Create a zero matrix with the same dimensions as `M`.
#
# M: a matrix, or a plain vector (treated as a one-column matrix).
# Returns a numeric matrix of zeros with the same number of rows and columns.
zeros_like <- function(M) {
  M <- as.matrix(M)
  matrix(0, nrow = nrow(M), ncol = ncol(M))
}
# Character-level recurrent neural network, trained with backpropagation
# through time (BPTT) and Adagrad updates.
#
# Relies on the helper functions `softmax()` and `zeros_like()` defined above.
#
# Fields:
#   hidden_size   number of hidden units
#   vocab_size    number of distinct characters
#   learning_rate Adagrad step size
#   seq_length    number of characters per training chunk
#   chars         the vocabulary; position i is the character with index i
#   n_iter        number of training iterations
#   U, W, V       input-to-hidden, hidden-to-hidden, hidden-to-output weights
#   s             hidden state used by `forward_step()`
RNN <- R6::R6Class("RNN",
  public = list(
    hidden_size = NULL,
    vocab_size = NULL,
    learning_rate = NULL,
    seq_length = NULL,
    chars = NULL,
    n_iter = NULL,
    # The weights and the hidden state must be declared as fields: R6 objects
    # are locked, so assigning to an undeclared `self$` member fails.
    U = NULL,
    W = NULL,
    V = NULL,
    s = NULL,

    # Store the hyperparameters and draw small random initial weights.
    initialize = function(hidden_size = NA, vocab_size = NA, chars = NA,
                          n_iter = 100, seq_length = NA,
                          learning_rate = 0.01) {
      self$hidden_size <- hidden_size
      self$n_iter <- n_iter
      self$learning_rate <- learning_rate
      self$seq_length <- seq_length
      self$vocab_size <- as.numeric(vocab_size)
      self$chars <- chars
      # input to hidden
      self$U <- matrix(rnorm(hidden_size * vocab_size) * 0.01,
                       nrow = self$hidden_size)
      # hidden to hidden
      self$W <- matrix(rnorm(hidden_size * hidden_size) * 0.01,
                       nrow = self$hidden_size)
      # hidden to output
      self$V <- matrix(rnorm(vocab_size * hidden_size) * 0.01,
                       nrow = self$vocab_size)
      self$s <- matrix(0, self$hidden_size, 1)
    },

    # Advance the network by one step.
    #
    # input_sample: one-hot column vector (vocab_size x 1).
    # Updates the stored hidden state `self$s` and returns the probabilities
    # of the next character, computed from the NEW hidden state.
    forward_step = function(input_sample) {
      x <- input_sample
      s <- tanh(self$U %*% x + self$W %*% self$s)
      o <- softmax(self$V %*% s)
      self$s <- s
      o
    },

    # Forward and backward pass over one chunk of text.
    #
    # inputs:  integer indices (into `chars`) of the input characters.
    # targets: integer indices of the characters to predict, same length.
    # s_prev:  hidden state before the first character (vector, column
    #          matrix, or the one-element list returned in `hs`).
    # Returns a list with the cross-entropy `loss`, the gradients `dU`, `dW`,
    # `dV`, and `hs`, a one-element list holding the last hidden state.
    bptt = function(inputs, targets, s_prev) {
      seq_size <- length(inputs) # total length of the sequence
      s_prev <- matrix(unlist(s_prev), ncol = 1)
      xs <- vector("list", seq_size)
      hs <- vector("list", seq_size)
      ys <- vector("list", seq_size)
      ps <- vector("list", seq_size)
      loss <- 0
      for (idx in seq_len(seq_size)) {
        xs[[idx]] <- matrix(0, self$vocab_size, 1)
        xs[[idx]][inputs[[idx]]] <- 1
        ## Update the hidden state; the first step starts from s_prev
        h_before <- if (idx == 1) s_prev else hs[[idx - 1]]
        hs[[idx]] <- tanh(self$U %*% xs[[idx]] + self$W %*% h_before)
        ## calculate the probabilities for the next character
        ys[[idx]] <- self$V %*% hs[[idx]]
        ps[[idx]] <- softmax(ys[[idx]])
        ## Cross-entropy loss
        loss <- loss - log(ps[[idx]][targets[idx], 1])
      }

      # Calculate gradients, walking the sequence backwards
      dU <- zeros_like(self$U)
      dW <- zeros_like(self$W)
      dV <- zeros_like(self$V)
      dhnext <- zeros_like(s_prev)
      for (j in rev(seq_len(seq_size))) {
        ## Gradient of the error vs output
        dy <- ps[[j]]
        dy[targets[j]] <- dy[targets[j]] - 1
        dV <- dV + dy %*% t(hs[[j]])
        dh <- t(self$V) %*% dy + dhnext
        ## backprop through the tanh
        dhraw <- (1 - hs[[j]] * hs[[j]]) * dh
        ## derivative of the error between input and hidden layer
        dU <- dU + dhraw %*% t(xs[[j]])
        h_before <- if (j == 1) s_prev else hs[[j - 1]]
        dW <- dW + dhraw %*% t(h_before)
        dhnext <- t(self$W) %*% dhraw
      }

      # Hand back the LAST hidden state so the next chunk continues from it.
      last_state <- if (seq_size > 0) hs[seq_size] else list(s_prev)
      list("loss" = loss, "dU" = dU, "dW" = dW, "dV" = dV, "hs" = last_state)
    },

    # Generate a sequence of characters given a seed and a hidden state.
    #
    # h:       hidden state to start from (hidden_size x 1).
    # seed_ix: index (into `chars`) of the first input character.
    # n:       number of characters to generate.
    # Returns the `n` sampled elements of `chars`.
    sample_char = function(h, seed_ix, n) {
      x <- matrix(0, self$vocab_size, 1)
      x[seed_ix] <- 1
      picks <- integer(n)
      for (t in seq_len(n)) {
        h <- tanh(self$U %*% x + self$W %*% h)
        y <- self$V %*% h
        p <- softmax(y)
        # Sample an index rather than a value, so the one-hot input for the
        # next step can be built without searching `chars`.
        ix <- sample.int(length(self$chars), size = 1, prob = p)
        x <- matrix(0, self$vocab_size, 1)
        x[ix] <- 1
        picks[t] <- ix
      }
      self$chars[picks]
    },

    # Main training loop with Adagrad updates.
    #
    # text: vector of single characters, all of which must occur in `chars`,
    #       and longer than `seq_length`.
    # Prints a sample and the smoothed loss every 100 iterations and returns
    # the object invisibly.
    train = function(text) {
      if (length(text) <= self$seq_length) {
        stop("`text` must be longer than `seq_length`", call. = FALSE)
      }
      if (!all(text %in% self$chars)) {
        stop("`text` contains characters that are not in `chars`",
             call. = FALSE)
      }

      p <- 1
      # memory variables for Adagrad
      mU <- zeros_like(self$U)
      mW <- zeros_like(self$W)
      mV <- zeros_like(self$V)
      # loss at iteration 0
      smooth_loss <- -log(1.0 / self$vocab_size) * self$seq_length
      ## h_old is the previous hidden state of the RNN
      h_old <- matrix(0, self$hidden_size, 1)

      for (n in seq_len(self$n_iter)) {
        if (p + self$seq_length + 1 >= length(text) || n == 1) {
          # reset RNN memory and go from start of data
          h_old <- matrix(0, self$hidden_size, 1)
          p <- 1
        }
        # seq_length inputs, and the same window shifted by one as targets
        inputs <- match(text[p:(p + self$seq_length - 1)], self$chars)
        targets <- match(text[(p + 1):(p + self$seq_length)], self$chars)

        tmp <- self$bptt(inputs, targets, h_old)
        loss <- tmp$loss
        smooth_loss <- smooth_loss * 0.99 + loss * 0.01

        # See what the model is doing from time to time
        if (n %% 100 == 0) {
          txt <- self$sample_char(h_old, inputs[[1]], 200)
          ## Find the separators and print the pieces between them
          line_breaks <- which(txt == " ")
          for (ix in seq_along(line_breaks)[-1]) {
            first_ix <- line_breaks[ix - 1] + 1
            last_ix <- line_breaks[ix] - 1
            if (first_ix <= last_ix) {
              print(paste(txt[first_ix:last_ix], collapse = ""))
            }
          }
          print("---- sample -----")
          cat("Iteration number: ", n, " ")
          cat("Loss: ", smooth_loss, "\n")
        }

        h_old <- tmp$hs[[1]]
        ## Time to update the Adagrad weights
        mU <- mU + tmp$dU^2
        self$U <- self$U - self$learning_rate * tmp$dU / sqrt(mU + 1e-8)
        mW <- mW + tmp$dW^2
        self$W <- self$W - self$learning_rate * tmp$dW / sqrt(mW + 1e-8)
        mV <- mV + tmp$dV^2
        self$V <- self$V - self$learning_rate * tmp$dV / sqrt(mV + 1e-8)
        p <- p + self$seq_length
      }
      invisible(self)
    }
  )
)

There is certainly room for improvement here. For instance, you can refactor the forward pass inside `bptt` so that it calls the `forward_step` function explicitly. We leave that as an exercise.

To test your code, you can use the lists of male and female names (see the Exercises section). Here is how to do this with the list of female names (after removing the header information manually):

# Example: train the RNN on a list of names.
library(readr)      # read_lines()
library(stringr)    # str_to_lower(), str_c(), %>%
library(tokenizers) # tokenize_characters()

# Read the file, lower-case it and join the lines into one string, then
# split that string into a vector of single characters.
data <- read_lines("./data/female.txt")
text <- data %>%
  str_to_lower() %>%
  str_c(collapse = " ") %>%
  tokenize_characters(strip_non_alphanum = FALSE, simplify = TRUE)

# The vocabulary is the set of distinct characters in the text.
chars <- text %>% unique()

# NOTE(review): n_iter and learning_rate repeat the constructor defaults;
# seq_length = 25 is an assumed value -- adjust to your data.
test <- RNN$new(hidden_size = 100,
                vocab_size = length(chars),
                chars = chars,
                n_iter = 100,
                seq_length = 25,
                learning_rate = 0.01)
test$train(text)

You can use your own data; it simply needs to be a text file. 

While training models with your own data, be careful with what you pass as input, and always remember: garbage in, garbage out. Insert some logs or print statements here and there, so that you can see what your network is doing.
