In k-means clustering, the variables are rarely measured on the same scale, so their variances differ widely. This distorts the clusters, making them less spherical: because we use Euclidean distance, variables with a larger magnitude and variance dominate the distance calculation and effectively carry more weight. To remove this bias, we standardize every variable: we subtract the variable's mean from each value and then divide by its standard deviation.
We normalize our data with exactly this calculation:
from operator import add
import numpy as np

def normalize(thedata):
    n = thedata.count()
    avg = thedata.reduce(add) / n
    var = thedata.map(lambda x: (x - avg)**2).reduce(add) / n
    std = np.sqrt(var)
    std[std == 0] = 1  # constant columns: avoid division by zero
    def standardize(val):
        return (val - avg) / std
    return thedata.map(standardize)

normalized = normalize(thedata).cache()
print(normalized.take(2))
print(thedata.take(2))
The output looks as follows:
[array([ -6.68331854e-02, -1.72038228e-03, 6.81884351e-02,
-2.39084686e-03, -1.51391734e-02, -1.10348462e-03,
-2.65207600e-02, -4.39091558e-03, 2.44279187e+00,
-2.09732783e-03, -8.25770840e-03, -4.54646139e-03,
-3.28458917e-03, -9.57233922e-03, -8.50457842e-03,
-2.87561127e-02, 0.00000000e+00, -6.38979005e-04,
-2.89113034e-02, -1.57541507e+00, -1.19624324e+00,
-4.66042614e-01, -4.65755574e-01, -2.48285775e-01,
-2.48130352e-01, 5.39733093e-01, -2.56056520e-01,
-2.01059296e-01, -3.63913926e+00, -1.78651044e+00,
-1.83302273e+00, -2.82939000e-01, -1.25793664e+00,
-1.56668488e-01, -4.66404784e-01, -4.65453641e-01,
-2.50831829e-01, -2.49631966e-01]), array([ -6.68331854e-02, -1.77667956e-03, 5.32451452e-03,
-2.39084686e-03, -1.51391734e-02, -1.10348462e-03,
-2.65207600e-02, -4.39091558e-03, 2.44279187e+00,
-2.09732783e-03, -8.25770840e-03, -4.54646139e-03,
-3.28458917e-03, -9.57233922e-03, -8.50457842e-03,
-2.87561127e-02, 0.00000000e+00, -6.38979005e-04,
-2.89113034e-02, -1.57069789e+00, -1.19217808e+00,
-4.66042614e-01, -4.65755574e-01, -2.48285775e-01,
-2.48130352e-01, 5.39733093e-01, -2.56056520e-01,
-2.01059296e-01, -3.62351937e+00, -1.77706870e+00,
5.98966843e-01, -2.82939000e-01, 8.21118739e-01,
-1.56668488e-01, -4.66404784e-01, -4.65453641e-01,
-2.50831829e-01, -2.49631966e-01])]
[array([ 0.00000000e+00, 2.15000000e+02, 4.50760000e+04,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00]), array([ 0.00000000e+00, 1.62000000e+02, 4.52800000e+03,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 2.00000000e+00, 2.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00])]
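The same z-score calculation can be verified with plain NumPy outside of Spark; the small matrix below is made up purely for illustration:

```python
import numpy as np

# Toy stand-in for the RDD: 4 points, 3 features (made-up values)
X = np.array([[215.0, 45076.0, 1.0],
              [162.0,  4528.0, 1.0],
              [236.0,  1228.0, 2.0],
              [233.0,  2032.0, 2.0]])

avg = X.mean(axis=0)
std = X.std(axis=0)   # population std, matching var = sum((x - avg)**2) / n
std[std == 0] = 1     # guard against constant columns

Z = (X - avg) / std

# Every standardized column now has mean ~0 and std ~1
print(Z.mean(axis=0).round(10))
print(Z.std(axis=0).round(10))
```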
We now build the model again on the normalized data for different values of k, from k = 60 to k = 110 in steps of 10:
k_range = range(60, 111, 10)
k_scores = [clustering_error_Score(normalized, k) for k in k_range]
for kscore in k_scores:
    print(kscore)
plt.plot(k_range, k_scores)
The elbow graph shows a far better pattern:
13428.588817861917
26586.44539596379
18520.0580113469
10282.671313141745
12240.257631897006
12229.312684687848
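A simple way to pick k from these scores is to take the value with the lowest clustering error, which here is k = 90; a quick sketch using the numbers printed above:

```python
k_range = list(range(60, 111, 10))
# Scores copied from the run above, in order k = 60, 70, ..., 110
k_scores = [13428.588817861917, 26586.44539596379, 18520.0580113469,
            10282.671313141745, 12240.257631897006, 12229.312684687848]

best_k = min(zip(k_range, k_scores), key=lambda kv: kv[1])[0]
print(best_k)  # -> 90
```

This is why K_norm is set to 90 in the comparison below.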
Next, we take a small sample of the dataset and perform k-means clustering twice:
- Once with normalization
- Once without normalization
We compare the results of the clusters:
- Before normalization, the result appears as follows:
# Before normalization
K_norm = 90
var = getVariance(thedata)
indices_of_variance = [t[0] for t in sorted(enumerate(var), key=lambda x: x[1])[-3:]]
dataprojected = thedata.randomSplit([1, 999])[0].cache()
kclusters = KMeans.train(thedata, K_norm, maxIterations=10, runs=10, initializationMode="random")
listdataprojected = dataprojected.collect()
projected_data = np.array([[point[i] for i in indices_of_variance] for point in listdataprojected])
klabels = [kclusters.predict(point) for point in listdataprojected]
Maxi = max(projected_data.flatten())
mini = min(projected_data.flatten())
figs = plt.figure(figsize=(8, 8))
pltx = figs.add_subplot(111, projection='3d')
pltx.scatter(projected_data[:, 0], projected_data[:, 1], projected_data[:, 2], c=klabels)
pltx.set_xlim(mini, Maxi)
pltx.set_ylim(mini, Maxi)
pltx.set_zlim(mini, Maxi)
pltx.set_title("Before normalization")
The plot looks like this:
- After normalization, it looks as follows:
# After normalization
kclusters = KMeans.train(normalized, K_norm, maxIterations=10, runs=10, initializationMode="random")
dataprojected_normed = normalize(dataprojected).cache()
dataprojected_normed = dataprojected_normed.collect()
projected_data = np.array([[point[i] for i in indices_of_variance] for point in dataprojected_normed])
klabels = [kclusters.predict(point) for point in dataprojected_normed]
Maxi = max(projected_data.flatten())
mini = min(projected_data.flatten())
figs = plt.figure(figsize=(8, 8))
pltx = figs.add_subplot(111, projection='3d')
pltx.scatter(projected_data[:, 0], projected_data[:, 1], projected_data[:, 2], c=klabels)
pltx.set_xlim(mini, Maxi)
pltx.set_ylim(mini, Maxi)
pltx.set_zlim(mini, Maxi)
pltx.set_title("After normalization")
The plot looks like this:
Before we complete our model, we need one final step: converting categorical variables to numerical ones. We do this with one-hot encoding, which transforms each categorical variable into a binary vector with a single 1 marking the observed category, a form that can be processed for statistical analysis:
featureCol1 = raw_data.map(lambda line: line.split(",")[1]).distinct().collect()
featureCol2 = raw_data.map(lambda line: line.split(",")[2]).distinct().collect()
featureCol3 = raw_data.map(lambda line: line.split(",")[3]).distinct().collect()

def parseWithOneHotEncoding(line):
    column = line.split(',')
    thelabel = column[-1]
    thevector = column[0:-1]
    # One binary vector per categorical column, with a single 1 at the
    # index of the observed category
    col1 = [0] * len(featureCol1)
    col1[featureCol1.index(thevector[1])] = 1
    col2 = [0] * len(featureCol2)
    col2[featureCol2.index(thevector[2])] = 1
    col3 = [0] * len(featureCol3)
    col3[featureCol3.index(thevector[3])] = 1
    thevector = [thevector[0]] + col1 + col2 + col3 + thevector[4:]
    thevector = np.array(thevector, dtype=float)
    return (thelabel, thevector)

labelsAndData = raw_data.map(parseWithOneHotEncoding)
thedata = labelsAndData.values().cache()
normalized = normalize(thedata).cache()
The output is the following:
[ 0.00000000e+00 2.48680000e+04 3.50832000e+05 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
1.01000000e+02 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 7.79000000e+02
1.03300000e+03 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.01000000e+02 0.00000000e+00 5.51000000e+00
7.78300000e+03 2.26050000e+04 1.01000000e+02 0.00000000e+00
9.05000000e+00 3.15000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]
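The expansion performed for each categorical column can be illustrated in isolation; the category values below are hypothetical stand-ins for the distinct() results:

```python
# Hypothetical category list, standing in for one categorical column
categories = ['tcp', 'udp', 'icmp']

def one_hot(value, categories):
    # A vector of zeros with a single 1 at the category's index
    vec = [0] * len(categories)
    vec[categories.index(value)] = 1
    return vec

print(one_hot('udp', categories))  # -> [0, 1, 0]
```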
Finally, the data has been normalized, we have an optimal value for k, and the categorical variables have been taken care of. We perform k-means once more, as shown:
kclusters = KMeans.train(normalized, 100, maxIterations=10, runs=10, initializationMode="random")
# Keep the 100 points that lie farthest from their nearest centroid
anomaly = normalized.map(lambda point: (point, error(kclusters, point))).takeOrdered(100, key=lambda kv: -kv[1])
plt.plot([ano[1] for ano in anomaly])
The output plot consists of several steps, each of which depicts a threshold:
The number of anomalies per threshold will be as follows:
| Threshold | # of anomalies |
|-----------|----------------|
| 75200     | 10             |
| 75900     | 35             |
| 78200     | 65             |
| 78800     | 78             |
| 82800     | 95             |
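Reading the counts off the step plot can be mimicked with a small helper; the error values below are synthetic, chosen only to reproduce the rows of the table, while the real values come from the anomaly list above:

```python
# Synthetic distance errors shaped like a step plot (made-up values)
errors = ([75000.0] * 10 + [75800.0] * 25 + [78000.0] * 30 +
          [78500.0] * 13 + [80000.0] * 17)

def anomalies_below(threshold, errs):
    # Count points whose error falls below the given threshold
    return sum(1 for e in errs if e < threshold)

for t in [75200, 75900, 78200, 78800, 82800]:
    print(t, anomalies_below(t, errors))
```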