Implementation without R6

In this section, we include the implementation of the same basic recurrent neural network without using R6 classes. First, we load the required packages and set the random seed:

library(readr)
library(stringr)
library(purrr)
library(tokenizers)
set.seed(1234)

We introduce an auxiliary function that initializes a matrix of zeros with the same shape as a given matrix, M:

zeros_like <- function(M){
  return(matrix(0, dim(as.matrix(M))[1], dim(as.matrix(M))[2]))
}
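
For instance (a quick check, not part of the original code), applied to a 2 x 3 matrix it returns a 2 x 3 matrix of zeros:

A <- matrix(1:6, nrow = 2)
dim(zeros_like(A)) # 2 3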

We also need the softmax function:

softmax <- function(x){
  xt <- exp(x - max(x)) # subtract max(x) for numerical stability
  return(xt/sum(xt))
}
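
As a quick check (not in the original code), the output is a probability vector that sums to one:

round(softmax(c(1, 2, 3)), 3) # 0.090 0.245 0.665
sum(softmax(c(1, 2, 3))) # 1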

We will test this on the female names data (see the Exercises section):

data <- read_lines("./data/female.txt")
head(data)

And do some preprocessing: we lowercase the names, collapse them into a single string, and split that string into characters:

text <- data %>%
  str_to_lower() %>%
  str_c(collapse = " ") %>%
  tokenize_characters(strip_non_alphanum = FALSE, simplify = TRUE)
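
We can take a quick look at the result (this check is not in the original code); text is now a long character vector with one element per character:

head(text, 10)
length(text)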

We set up the characters of our vocabulary. This is required for the one-hot encoding of each input character:

chars <- text %>% unique
chars
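
To make the one-hot encoding concrete, here is a small illustration; the helper one_hot is not part of the original code (lossFun below builds these vectors inline), it just shows what a single encoded character looks like:

one_hot <- function(char){
  v <- matrix(0, length(chars), 1)
  v[which(chars == char)] <- 1
  return(v)
}
which(one_hot("a") == 1) # the position of "a" in chars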

Now we move on to the main part of the program. Besides the initializations, notice that we define a function, lossFun, which includes both the forward pass and backpropagation through time:
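
To recap what lossFun computes, at each time step t the forward pass takes the one-hot encoded input x_t and the previous hidden state h_(t-1), using the weight matrices U, W, V and the biases bh and by defined in the code below (written here in R notation, so it can be matched line by line with the code):

h_t = tanh(U %*% x_t + W %*% h_(t-1) + bh)
y_t = V %*% h_t + by
p_t = softmax(y_t)

The loss is the cross-entropy, loss = -sum_t log(p_t[target_t]), and backpropagation through time accumulates the gradients of this loss with respect to U, W, V, bh, and by by walking backwards over the sequence.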

# hyperparameters
hidden_size = 100 # size of hidden layer of neurons
seq_length = 10 # number of steps to unroll the RNN for
learning_rate = 1e-1
vocab_size = length(chars)


U <- matrix(rnorm(hidden_size*vocab_size)*0.01, nrow=hidden_size) # input to hidden
W <- matrix(rnorm(hidden_size*hidden_size)*0.01, nrow=hidden_size) # hidden to hidden
V <- matrix(rnorm(vocab_size*hidden_size)*0.01, nrow=vocab_size) # hidden to output
bh <- matrix(0,hidden_size, 1) # hidden bias
by <- matrix(0,vocab_size, 1) # output bias


lossFun <- function(inputs, targets, prev_hidden){
  tot <- length(inputs) # total sequence length
  xs <- lapply(vector('list', tot), function(i) matrix(0, vocab_size, 1))
  hs <- lapply(vector('list', tot), function(i) matrix(0, hidden_size, 1))
  ys <- lapply(vector('list', tot), function(i) matrix(0, vocab_size, 1))
  ps <- lapply(vector('list', tot), function(i) matrix(0, vocab_size, 1))
  loss <- 0

  ## Forward pass
  for(idx in 1:tot){
    # One-hot encode the input character
    xs[[idx]] <- matrix(0, vocab_size, 1)
    xs[[idx]][inputs[[idx]]] <- 1

    ## Update the hidden state
    if(idx == 1){
      hs[[idx]] <- tanh(U %*% xs[[idx]] + W %*% prev_hidden + bh)
    } else {
      hs[[idx]] <- tanh(U %*% xs[[idx]] + W %*% hs[[(idx-1)]] + bh)
    }

    ## Get char probabilities
    ys[[idx]] <- V %*% hs[[idx]] + by
    ps[[idx]] <- softmax(ys[[idx]])

    ## Loss function (cross-entropy here)
    loss <- loss - log(ps[[idx]][targets[idx], 1])
  }

  # Initialize the gradients
  dU <- zeros_like(U)
  dW <- zeros_like(W)
  dV <- zeros_like(V)
  dbh <- zeros_like(bh)
  dby <- zeros_like(by)
  dhnext <- zeros_like(prev_hidden)

  # Here comes the backprop loop (backpropagation through time)
  for(j in tot:1){
    # Output versus target: gradient of the cross-entropy with softmax
    dy <- ps[[j]]
    dy[targets[j]] <- dy[targets[j]] - 1
    dV <- dV + dy %*% t(hs[[j]])
    dby <- dby + dy

    ## Hidden layer
    dh <- t(V) %*% dy + dhnext
    dh_raw <- (1 - hs[[j]] * hs[[j]]) * dh # derivative of tanh
    dbh <- dbh + dh_raw

    dU <- dU + dh_raw %*% t(xs[[j]])

    if(j == 1){
      dW <- dW + dh_raw %*% t(prev_hidden)
    } else {
      dW <- dW + dh_raw %*% t(hs[[(j-1)]])
    }
    dhnext <- t(W) %*% dh_raw
  }

  return(list("loss" = loss, "dU" = dU, "dW" = dW, "dV" = dV,
              "dbh" = dbh, "dby" = dby, "hs" = hs[[tot]]))
}
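
## Sanity check (not part of the original listing): run lossFun on a short
## sequence and inspect the returned loss and gradient shapes. Before any
## training, the loss should be roughly the sequence length times log(vocab_size).
# toy_in <- sapply(text[1:5], function(c) which(chars == c))
# toy_out <- sapply(text[2:6], function(c) which(chars == c))
# chk <- lossFun(toy_in, toy_out, matrix(0, hidden_size, 1))
# chk$loss # about 5 * log(vocab_size)
# dim(chk$dU) # hidden_size x vocab_size, the same shape as U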


## Sample n chars given a hidden state and the index of a seed character
sample_char <- function(h, seed_ix, n){
  x <- matrix(0, vocab_size, 1)
  x[seed_ix] <- 1

  ixes <- c()

  for(t in 1:n){
    h <- tanh(U %*% x + W %*% h + bh)
    y <- V %*% h + by
    p <- softmax(y)
    # Draw the next character according to the predicted probabilities
    ix <- sample(chars, size = 1, replace = T, prob = p)
    x <- matrix(0, vocab_size, 1)
    x[which(chars == ix)] <- 1
    ixes[t] <- ix
  }
  return(ixes)
}
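
## Example (not in the original listing): sample 20 characters from the
## untrained network, starting from the character "a" and a zero hidden state.
# sample_char(matrix(0, hidden_size, 1), which(chars == "a"), 20)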


n <- 1 # iteration counter
p <- 1 # current position in the text

# Memory variables for Adagrad
mU <- zeros_like(U)
mW <- zeros_like(W)
mV <- zeros_like(V)
mbh <- zeros_like(bh)
mby <- zeros_like(by)
smooth_loss <- -log(1.0/vocab_size)*seq_length # loss at iteration 0

while(T){
  if(p + seq_length + 1 >= length(text) || n == 1){
    # reset RNN memory
    ## h_old is the hidden state of the RNN
    h_old <- matrix(0, hidden_size, 1)
    # go back to the start of the text
    p <- 1
  }

  inputs <- unlist(sapply(text[p:(p+seq_length)], function(c){which(chars==c)}))
  targets <- unlist(sapply(text[(p+1):(p+seq_length+1)], function(c){which(chars==c)}))

  # Check what the model is doing from time to time
  if(n %% 100 == 0){
    txt <- sample_char(h_old, inputs[[1]], 200)
    ## Spaces separate the generated names
    line_breaks <- which(txt == " ")
    if(length(line_breaks) < 2){
      print(txt)
    } else {
      for(ix in 2:length(line_breaks)){
        first_ix <- line_breaks[ix-1] + 1
        last_ix <- line_breaks[ix] - 1
        print(paste(txt[first_ix:last_ix], collapse=""))
      }
    }
    print('---- sample -----')
    cat("Iteration number: ", n, " ")
    cat("Loss: ", smooth_loss)
  }

  # Forward and backward pass over the current chunk
  tmp <- lossFun(inputs, targets, h_old)
  loss <- tmp$loss
  dU <- tmp$dU
  dW <- tmp$dW
  dV <- tmp$dV
  dbh <- tmp$dbh
  dby <- tmp$dby
  h_old <- tmp$hs
  # Exponentially smoothed loss, for reporting only
  smooth_loss <- smooth_loss*0.99 + loss*0.01

  ## Weight updates for Adagrad
  mU <- mU + dU^2
  U <- U - learning_rate * dU / sqrt(mU + 1e-8)
  mW <- mW + dW^2
  W <- W - learning_rate * dW / sqrt(mW + 1e-8)
  mV <- mV + dV^2
  V <- V - learning_rate * dV / sqrt(mV + 1e-8)
  mbh <- mbh + dbh^2
  bh <- bh - learning_rate * dbh / sqrt(mbh + 1e-8)
  mby <- mby + dby^2
  by <- by - learning_rate * dby / sqrt(mby + 1e-8)

  p <- p + seq_length
  n <- n + 1
}

What does this recurrent neural network produce? In the beginning, we get the following female names:

[1] "iaiaaan"
[1] "aannaeinraaniaraeinareanaeaaraana"
[1] "iainii"
[1] "laeoda"
[1] "arineaeia"
[1] "rdiiaai"
[1] "eiaa"
[1] "irineaaasrnaaaaaalaiiaaiaiiaranaxiaaaannnaiiorieiida"
[1] "naiiaaiaaialiaraaaaannaian"
[1] "aaaaieaiaain"
[1] "nad"
[1] "iiaaeaeaianiaa"
[1] "---- sample -----"
Iteration number: 100

After some time, the network starts to make sense of the data:

[1] "anna"
[1] "annanianinbnatarmadnanannannnablantd"
[1] "antsnamannd"
[1] "iniaina"
[1] "anta"
[1] "alnaenalnalna"
[1] "annilinnina"
[1] "anma"
[1] "bna"
[1] "anganna"
[1] "alnaniannnnna"
[1] "iriannannennandana"
[1] "anyoa"
[1] "annannllynaenpanda"
[1] "anedaannna"
[1] "anna"
[1] "---- sample -----"
Iteration number: 700

Note how remarkable this is: after only 700 iterations, the network starts to reproduce real female names and to generate a few plausible-sounding new ones (like anyoa). All of this is achieved at the character level; the network has no notion of words or of language. It creates these names purely from the statistical dependencies in the input data.

What else can we do? I tried the same code on the LaTeX version of my PhD thesis. After a few thousand iterations, it learns to write the package imports correctly. A bit further along, it starts to produce short English words. You can try it with other data; see the Exercises section for some suggestions.
