How to do it...

The code here is based on the paper Autoencoding Variational Bayes by Kingma and Welling (https://arxiv.org/pdf/1312.6114.pdf), and is adapted from GitHub: https://jmetzen.github.io/2015-11-27/vae.html .

The first step is as always importing the necessary modules. For this recipe we will require Numpy, Matplolib, and TensorFlow:

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

Next, we define the VariationalAutoencoder class. The class __init__ method defines the hyperparameters such as the learning rate, batch size, the placeholders for the input, and the weight and bias variables for the encoder and decoder network. It also builds the computational graph according to the network architecture of the VAE. We initialize the weights using Xavier initialization in this recipe. Instead of defining our own method for Xavier initialization, we use the tf.contrib.layers.xavier_initializer() TensorFlow to do the task. Lastly, we define the loss (generation and latent) and optimizer ops:

class VariationalAutoencoder(object):
  def __init__(self, network_architecture,   transfer_fct=tf.nn.softplus,
learning_rate=0.001, batch_size=100):
      self.network_architecture = network_architecture
      self.transfer_fct = transfer_fct
      self.learning_rate = learning_rate
      self.batch_size = batch_size
      # Place holder for the input
      self.x = tf.placeholder(tf.float32, [None,    network_architecture["n_input"]])
      # Define weights and biases
      network_weights = self._initialize_weights(**self.network_architecture)
      # Create autoencoder network
      # Use Encoder Network to determine mean and
      # (log) variance of Gaussian distribution in latent
      # space
      self.z_mean, self.z_log_sigma_sq = 
      self._encoder_network(network_weights["weights_encoder"],
    network_weights["biases_encoder"])
      # Draw one sample z from Gaussian distribution
      n_z = self.network_architecture["n_z"]
      eps = tf.random_normal((self.batch_size, n_z), 0, 1, dtype=tf.float32)
      # z = mu + sigma*epsilon
      self.z =       tf.add(self.z_mean,tf.multiply(tf.sqrt(tf.exp(self.z_log_sigma_sq)), eps))
  
      # Use Decoder network to determine mean of
      # Bernoulli distribution of reconstructed input
      self.x_reconstr_mean = 
      self._decoder_network(network_weights["weights_decoder"],
   network_weights["biases_decoder"])
      # Define loss function based variational upper-bound and 
      # corresponding optimizer
      # define generation loss
      generation_loss = 
  -tf.reduce_sum(self.x * tf.log(1e-10 + self.x_reconstr_mean)
+ (1-self.x) * tf.log(1e-10 + 1 - self.x_reconstr_mean), 1)
      latent_loss = -0.5 * tf.reduce_sum(1 + self.z_log_sigma_sq
- tf.square(self.z_mean)- tf.exp(self.z_log_sigma_sq), 1)
      self.cost = tf.reduce_mean(generation_loss + latent_loss)       #    average over batch
      # Define the optimizer
      self.optimizer = 
 tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.cost)
      # Initializing the tensor flow variables
      init = tf.global_variables_initializer()
  # Launch the session
      self.sess = tf.InteractiveSession()
      self.sess.run(init)
  
def _initialize_weights(self, n_hidden_recog_1, n_hidden_recog_2,
n_hidden_gener_1, n_hidden_gener_2,
n_input, n_z):
   initializer = tf.contrib.layers.xavier_initializer()
   all_weights = dict()
   all_weights['weights_encoder'] = {
   'h1': tf.Variable(initializer(shape=(n_input, n_hidden_recog_1))),
   'h2': tf.Variable(initializer(shape=(n_hidden_recog_1, n_hidden_recog_2))),
   'out_mean': tf.Variable(initializer(shape=(n_hidden_recog_2, n_z))),
   'out_log_sigma': tf.Variable(initializer(shape=(n_hidden_recog_2, n_z)))}
   all_weights['biases_encoder'] = {
   'b1': tf.Variable(tf.zeros([n_hidden_recog_1], dtype=tf.float32)),
   'b2': tf.Variable(tf.zeros([n_hidden_recog_2], dtype=tf.float32)),
   'out_mean': tf.Variable(tf.zeros([n_z], dtype=tf.float32)),
   'out_log_sigma': tf.Variable(tf.zeros([n_z], dtype=tf.float32))}

   all_weights['weights_decoder'] = {
   'h1': tf.Variable(initializer(shape=(n_z, n_hidden_gener_1))),
   'h2': tf.Variable(initializer(shape=(n_hidden_gener_1, n_hidden_gener_2))),
   'out_mean': tf.Variable(initializer(shape=(n_hidden_gener_2, n_input))),
   'out_log_sigma': tf.Variable(initializer(shape=(n_hidden_gener_2, n_input)))}

    all_weights['biases_decoder'] = {
   'b1': tf.Variable(tf.zeros([n_hidden_gener_1],    dtype=tf.float32)),
   'b2': tf.Variable(tf.zeros([n_hidden_gener_2], dtype=tf.float32)),'out_mean': tf.Variable(tf.zeros([n_input], dtype=tf.float32)),
   'out_log_sigma': tf.Variable(tf.zeros([n_input], dtype=tf.float32))}
   return all_weights

We build the encoder network and the decoder network. The first layer of the Encoder network is taking the input and generating a reduced latent representation of the input. The second layer maps the input to a Gaussian distribution. The network learns these transformations:

def _encoder_network(self, weights, biases):
  # Generate probabilistic encoder (recognition network), which
  # maps inputs onto a normal distribution in latent space.
  # The transformation is parametrized and can be learned.
  layer_1 = self.transfer_fct(tf.add(tf.matmul(self.x,     weights['h1']),
biases['b1']))
  layer_2 = self.transfer_fct(tf.add(tf.matmul(layer_1,   weights['h2']),
biases['b2']))
  z_mean = tf.add(tf.matmul(layer_2, weights['out_mean']),
biases['out_mean'])
  z_log_sigma_sq = 
tf.add(tf.matmul(layer_2, weights['out_log_sigma']),
biases['out_log_sigma'])
  return (z_mean, z_log_sigma_sq)

def _decoder_network(self, weights, biases):
  # Generate probabilistic decoder (decoder network), which
  # maps points in latent space onto a Bernoulli distribution in data space.
  # The transformation is parametrized and can be learned.
  layer_1 = self.transfer_fct(tf.add(tf.matmul(self.z, weights['h1']),
biases['b1']))
  layer_2 = self.transfer_fct(tf.add(tf.matmul(layer_1, weights['h2']),
biases['b2']))
  x_reconstr_mean = 
tf.nn.sigmoid(tf.add(tf.matmul(layer_2, weights['out_mean']),
  biases['out_mean']))
  return x_reconstr_mean

The class VariationalAutoencoder also contains some helper functions to generate and reconstruct data, and to fit the VAE:

def fit(self, X):
  opt, cost = self.sess.run((self.optimizer, self.cost),
  feed_dict={self.x: X})
  return cost

def generate(self, z_mu=None):
""" Generate data by sampling from latent space.
If z_mu is not None, data for this point in latent space is
generated. Otherwise, z_mu is drawn from prior in latent
space.
"""
  if z_mu is None:
    z_mu = np.random.normal(size=self.network_architecture["n_z"])
# Note: This maps to mean of distribution, we could alternatively
# sample from Gaussian distribution
  return self.sess.run(self.x_reconstr_mean,
      feed_dict={self.z: z_mu})

def reconstruct(self, X):
""" Use VAE to reconstruct given data. """
  return self.sess.run(self.x_reconstr_mean,
    feed_dict={self.x: X})

Once the VAE class is done, we define a function train, which uses the VAE class object and trains it for a given data.

def train(network_architecture, learning_rate=0.001,
batch_size=100, training_epochs=10, display_step=5):
  vae = VariationalAutoencoder(network_architecture,
  learning_rate=learning_rate,
  batch_size=batch_size)
  # Training cycle
  for epoch in range(training_epochs):
    avg_cost = 0.
    total_batch = int(n_samples / batch_size)
    # Loop over all batches
    for i in range(total_batch):
      batch_xs, _ = mnist.train.next_batch(batch_size)
      # Fit training using batch data
      cost = vae.fit(batch_xs)
      # Compute average loss
      avg_cost += cost / n_samples * batch_size
      # Display logs per epoch step
     if epoch % display_step == 0:
       print("Epoch:", '%04d' % (epoch+1), 
           "cost=", "{:.9f}".format(avg_cost))
  return vae

Let us now use the VAE class and train function. We use the VAE for our favorite MNIST dataset:

# Load MNIST data in a format suited for tensorflow.
# The script input_data is available under this URL:
#https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/examples/tutorials/mnist/input_data.py

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
n_samples = mnist.train.num_examples

We define the network-architecture and perform training of VAE on MNIST dataset. In this case, we keep the latent dimensions 2 for simplicity.

network_architecture = 
dict(n_hidden_recog_1=500, # 1st layer encoder neurons
n_hidden_recog_2=500, # 2nd layer encoder neurons
n_hidden_gener_1=500, # 1st layer decoder neurons
n_hidden_gener_2=500, # 2nd layer decoder neurons
n_input=784, # MNIST data input (img shape: 28*28)
n_z=2) # dimensionality of latent space
vae = train(network_architecture, training_epochs=75)

Let us now see if the VAE really reconstructs the input or not. The output shows that digits are indeed reconstructed, and since we have used a 2D latent space, there is a significant blurring of the images:

x_sample = mnist.test.next_batch(100)[0]
x_reconstruct = vae.reconstruct(x_sample)
plt.figure(figsize=(8, 12))
for i in range(5):
  plt.subplot(5, 2, 2*i + 1)
  plt.imshow(x_sample[i].reshape(28, 28),  vmin=0, vmax=1, cmap="gray")
  plt.title("Test input")
  plt.colorbar()
  plt.subplot(5, 2, 2*i + 2)
  plt.imshow(x_reconstruct[i].reshape(28, 28), vmin=0, vmax=1, cmap="gray")
  plt.title("Reconstruction")
  plt.colorbar()
  plt.tight_layout()

Following is the output of the preceding code:

An example of MNIST reconstructed characters

The following are the samples of handwritten digits generated using the trained VAE:

nx = ny = 20
x_values = np.linspace(-3, 3, nx)
y_values = np.linspace(-3, 3, ny)
canvas = np.empty((28*ny, 28*nx))
for i, yi in enumerate(x_values):
  for j, xi in enumerate(y_values):
    z_mu = np.array([[xi, yi]]*vae.batch_size)
    x_mean = vae.generate(z_mu)
    canvas[(nx-i-1)*28:(nx-i)*28, j*28:(j+1)*28] = x_mean[0].reshape(28, 28)
plt.figure(figsize=(8, 10))
Xi, Yi = np.meshgrid(x_values, y_values)
plt.imshow(canvas, origin="upper", cmap="gray")
plt.tight_layout()

Following is the range of MNIST like characters generated by autoencoders:

A range of MNIST like characters generated by autoencoders

Table of Contents for How to do it...

Create new playlist

Sign In

Sign Up

Table of Contents for
How to do it...