As discussed in the earlier section, the Glimpse Sensor is a powerful concept. Combined with other techniques, such as RNNs and RL, it is at the heart of improving the performance of visual models.
Let's see this in greater detail here. The code is commented at every line for easy understanding and is self-explanatory:
import tensorflow as tf
# the code is in tensorflow
import numpy as np
def glimpseSensor(image, fixationLocation):
    '''
    Glimpse Sensor for the Recurrent Attention Model (RAM).

    Extracts `depth` concentric square crops ("glimpses") centred on the
    fixation location from every image in the batch, and resizes each crop
    to a fixed (sensorBandwidth x sensorBandwidth) representation.

    :param image: batch of images; reshaped internally to
        (batch_size, height, width, channels)
    :type image: numpy array / tf tensor
    :param fixationLocation: fixation centres l, normalized to [-1, 1]
        with (0, 0) at the image centre; shape (batch_size, 2)
        -- assumed from the normalization arithmetic below, TODO confirm
    :type fixationLocation: tf tensor
    :return: multi-resolution representations from the Glimpse Sensor,
        shape (batch_size, depth, sensorBandwidth, sensorBandwidth)
    :rtype: tf tensor
    '''
    img_size = np.asarray(image).shape[:2]
    # default to a single channel; detect a real channel count only when the
    # image actually has a third dimension (the original test compared the
    # shape of img_size itself, which is always (2,), so it never fired)
    channels = 1
    if len(np.asarray(image).shape) == 3:
        channels = np.asarray(image).shape[-1]
    batch_size = 32
    # fixationLocation is normalized to [-1, 1] wrt the image centre at
    # (0, 0); map it back to integer pixel coordinates
    loc = tf.round(((fixationLocation + 1) / 2.0) * img_size)
    loc = tf.cast(loc, tf.int32)
    image = tf.reshape(image, (batch_size, img_size[0], img_size[1], channels))
    representations = []  # per-image stacks of glimpse representations
    glimpse_images = []   # kept for visualising the raw glimpses in a window
    # smallest glimpse side is 1/10 of the image side; integer division keeps
    # all radii/offsets valid pixel counts (true division broke tf.slice)
    minRadius = img_size[0] // 10
    max_radius = minRadius * 2
    offset = 2 * max_radius  # zero padding so off-image crops stay in bounds
    depth = 3                # number of resolutions per fixation
    sensorBandwidth = 8      # side of every resized glimpse
    # process each image in the batch individually
    for k in range(batch_size):
        imageRepresentations = []
        one_img = image[k, :, :, :]
        # pad with zeros for a consistent size; pad target must be scalar
        # per axis (the original added the whole img_size array)
        one_img = tf.image.pad_to_bounding_box(
            one_img, offset, offset,
            max_radius * 4 + img_size[0], max_radius * 4 + img_size[1])
        for i in range(depth):
            r = int(minRadius * (2 ** i))  # glimpse radius at this level
            d_raw = 2 * r                  # crop side length in pixels
            d = tf.constant(d_raw, shape=[1])
            d = tf.tile(d, [2])            # (d_raw, d_raw) slice size
            loc_k = loc[k, :]
            # top-left corner of the crop in padded-image coordinates
            adjusted_loc = offset + loc_k - r
            one_img2 = tf.reshape(one_img, (one_img.get_shape()[0].value,
                                            one_img.get_shape()[1].value))
            # crop a (d_raw x d_raw) patch around the fixation
            representation = tf.slice(one_img2, adjusted_loc, d)
            # resize the crop to (sensorBandwidth x sensorBandwidth)
            representation = tf.image.resize_bilinear(
                tf.reshape(representation, (1, d_raw, d_raw, 1)),
                (sensorBandwidth, sensorBandwidth))
            representation = tf.reshape(representation,
                                        (sensorBandwidth, sensorBandwidth))
            # append this resolution to the image's representation set
            imageRepresentations.append(representation)
        perImage = tf.stack(imageRepresentations)
        representations.append(perImage)
        glimpse_images.append(perImage)
    # return the full batch of multi-resolution representations (the original
    # returned only the last glimpse of the last image due to a typo between
    # `representaions` and `representations`)
    return tf.stack(representations)