Normalizing features

In k-means clustering, the features are not all measured on the same scale, so their variances differ widely. This makes the clusters less spherical, and the uneven variance effectively puts more weight on variables with a larger variance and less on those with a smaller variance.
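To see the effect, consider a toy example (the byte counts below echo the raw values printed later in this section, but the snippet itself is purely illustrative and not part of the pipeline):

import numpy as np

# Two hypothetical records: feature 0 has a large scale (bytes), feature 1 a small one.
a = np.array([45076.0, 1.0])
b = np.array([4528.0, 2.0])

# The Euclidean distance between them is driven almost entirely by the large-scale feature.
print(np.linalg.norm(a - b))   # ~40548.0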

To fix this bias, we need to normalize our data, especially because we use Euclidean distance, which lets variables with a larger magnitude dominate the clustering. We do this by standardizing all the variables: from each value we subtract the variable's mean and then divide by its standard deviation.
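As a quick, hypothetical illustration of this calculation on a single feature (the values are made up for the example):

import numpy as np

# Toy illustration of standardization: subtract the mean and divide by the
# standard deviation, so the feature ends up with mean 0 and standard deviation 1.
x = np.array([215.0, 162.0, 236.0, 233.0, 239.0])
standardized = (x - x.mean()) / x.std()
print(standardized)                               # values now centred around 0
print(standardized.mean(), standardized.std())    # ~0.0 and 1.0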

We normalize our data with the following function, which applies exactly this calculation:

from operator import add
import numpy as np

def normalize(thedata):
    # Per-feature mean and (population) standard deviation across the RDD
    n = thedata.count()
    avg = thedata.reduce(add) / n
    var = thedata.map(lambda x: (x - avg)**2).reduce(add) / n
    std = np.sqrt(var)

    # Avoid dividing by zero for constant features
    std[std == 0] = 1

    def standardize(val):
        return (val - avg) / std

    return thedata.map(standardize)

normalized = normalize(thedata).cache()
print(normalized.take(2))
print(thedata.take(2))

The output looks as follows:

[array([ -6.68331854e-02, -1.72038228e-03, 6.81884351e-02,
-2.39084686e-03, -1.51391734e-02, -1.10348462e-03,
-2.65207600e-02, -4.39091558e-03, 2.44279187e+00,
-2.09732783e-03, -8.25770840e-03, -4.54646139e-03,
-3.28458917e-03, -9.57233922e-03, -8.50457842e-03,
-2.87561127e-02, 0.00000000e+00, -6.38979005e-04,
-2.89113034e-02, -1.57541507e+00, -1.19624324e+00,
-4.66042614e-01, -4.65755574e-01, -2.48285775e-01,
-2.48130352e-01, 5.39733093e-01, -2.56056520e-01,
-2.01059296e-01, -3.63913926e+00, -1.78651044e+00,
-1.83302273e+00, -2.82939000e-01, -1.25793664e+00,
-1.56668488e-01, -4.66404784e-01, -4.65453641e-01,
-2.50831829e-01, -2.49631966e-01]), array([ -6.68331854e-02, -1.77667956e-03, 5.32451452e-03,
-2.39084686e-03, -1.51391734e-02, -1.10348462e-03,
-2.65207600e-02, -4.39091558e-03, 2.44279187e+00,
-2.09732783e-03, -8.25770840e-03, -4.54646139e-03,
-3.28458917e-03, -9.57233922e-03, -8.50457842e-03,
-2.87561127e-02, 0.00000000e+00, -6.38979005e-04,
-2.89113034e-02, -1.57069789e+00, -1.19217808e+00,
-4.66042614e-01, -4.65755574e-01, -2.48285775e-01,
-2.48130352e-01, 5.39733093e-01, -2.56056520e-01,
-2.01059296e-01, -3.62351937e+00, -1.77706870e+00,
5.98966843e-01, -2.82939000e-01, 8.21118739e-01,
-1.56668488e-01, -4.66404784e-01, -4.65453641e-01,
-2.50831829e-01, -2.49631966e-01])]
[array([ 0.00000000e+00, 2.15000000e+02, 4.50760000e+04,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00]), array([ 0.00000000e+00, 1.62000000e+02, 4.52800000e+03,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 2.00000000e+00, 2.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
0.00000000e+00, 0.00000000e+00])]

We now build the model once again, this time with the normalized data, for different values of k. The values of k run from 60 to 110 in steps of 10:

k_range = range(60, 111, 10)

k_scores = [clustering_error_Score(normalized, k) for k in k_range]
for kscore in k_scores:
    print(kscore)

plt.plot(k_range, k_scores)
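The clustering_error_Score helper comes from the earlier elbow-method steps of the chapter. If it is not already in scope, a minimal sketch, assuming it trains a model with k clusters and returns the sum of squared distances of every point to its nearest centre, could look like this:

import numpy as np
from pyspark.mllib.clustering import KMeans

# A minimal sketch of the helper used above: train k-means with k clusters and
# return the total squared distance from each point to its nearest centre.
def clustering_error_Score(thedata, k):
    model = KMeans.train(thedata, k, maxIterations=10, runs=10,
                         initializationMode="random")

    def point_error(point):
        center = model.clusterCenters[model.predict(point)]
        return np.sum((np.array(point) - np.array(center)) ** 2)

    return thedata.map(point_error).reduce(lambda a, b: a + b)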

The elbow graph shows a far better pattern:

13428.588817861917
26586.44539596379
18520.0580113469
10282.671313141745
12240.257631897006
12229.312684687848

Next, we take a small sample of the dataset and perform k-means clustering twice:

  • Once with normalization
  • Once without normalization

We then compare the resulting clusters:

  • Before normalization, the result appears as follows:
# Before normalization
from mpl_toolkits.mplot3d import Axes3D  # enables the 3D projection in older Matplotlib versions

K_norm = 90

# getVariance() was defined earlier in the chapter; it returns the per-feature variance.
# We keep the indices of the three features with the largest variance for the 3D projection.
var = getVariance(thedata)
indices_of_variance = [t[0] for t in sorted(enumerate(var), key=lambda x: x[1])[-3:]]

# Take a small random sample (roughly 0.1% of the data) to plot
dataprojected = thedata.randomSplit([1, 999])[0].cache()

kclusters = KMeans.train(thedata, K_norm, maxIterations=10, runs=10, initializationMode="random")

listdataprojected = dataprojected.collect()
projected_data = np.array([[point[i] for i in indices_of_variance] for point in listdataprojected])
klabels = [kclusters.predict(point) for point in listdataprojected]

Maxi = max(projected_data.flatten())
mini = min(projected_data.flatten())

figs = plt.figure(figsize=(8, 8))
pltx = figs.add_subplot(111, projection='3d')
pltx.scatter(projected_data[:, 0], projected_data[:, 1], projected_data[:, 2], c=klabels)
pltx.set_xlim(mini, Maxi)
pltx.set_ylim(mini, Maxi)
pltx.set_zlim(mini, Maxi)
pltx.set_title("Before normalization")

The plot looks like this:

  • After normalization, it looks as follows:
# After normalization
kclusters = KMeans.train(normalized, K_norm, maxIterations=10, runs=10, initializationMode="random")

# Standardize the projected sample as well (strictly speaking, it should be scaled
# with the mean and standard deviation of the full dataset)
dataprojected_normed = normalize(dataprojected).cache()
dataprojected_normed = dataprojected_normed.collect()
projected_data = np.array([[point[i] for i in indices_of_variance] for point in dataprojected_normed])
klabels = [kclusters.predict(point) for point in dataprojected_normed]

Maxi = max(projected_data.flatten())
mini = min(projected_data.flatten())

figs = plt.figure(figsize=(8, 8))
pltx = figs.add_subplot(111, projection='3d')
pltx.scatter(projected_data[:, 0], projected_data[:, 1], projected_data[:, 2], c=klabels)
pltx.set_xlim(mini, Maxi)
pltx.set_ylim(mini, Maxi)
pltx.set_zlim(mini, Maxi)
pltx.set_title("After normalization")

The plot looks like this:

Before we move on to complete our model, we need to take care of one final thing: converting the categorical variables into numerical ones. We do this with one-hot encoding, which transforms a categorical variable into a form that statistical algorithms can process. A tiny illustration follows, and then the parsing function that applies it to our dataset:
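For instance, a categorical column whose distinct values were, say, tcp, udp, and icmp (hypothetical values used only for illustration) would be encoded like this:

# A toy illustration: one-hot encoding turns one categorical value into a binary
# vector with a 1 in the position of that value and 0 everywhere else.
categories = ['tcp', 'udp', 'icmp']

def one_hot(value, categories):
    encoded = [0] * len(categories)
    encoded[categories.index(value)] = 1
    return encoded

print(one_hot('udp', categories))   # [0, 1, 0]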

# Collect the distinct values of the three categorical columns
featureCol1 = raw_data.map(lambda line: line.split(",")[1]).distinct().collect()
featureCol2 = raw_data.map(lambda line: line.split(",")[2]).distinct().collect()
featureCol3 = raw_data.map(lambda line: line.split(",")[3]).distinct().collect()

def parseWithOneHotEncoding(line):
    column = line.split(',')
    thelabel = column[-1]
    thevector = column[0:-1]

    # One binary vector per categorical column, with a 1 at the observed value
    col1 = [0] * len(featureCol1)
    col1[featureCol1.index(thevector[1])] = 1
    col2 = [0] * len(featureCol2)
    col2[featureCol2.index(thevector[2])] = 1
    col3 = [0] * len(featureCol3)
    col3[featureCol3.index(thevector[3])] = 1

    # Replace the three categorical columns with their one-hot encodings
    thevector = [thevector[0]] + col1 + col2 + col3 + thevector[4:]
    thevector = np.array(thevector, dtype=float)

    return (thelabel, thevector)

labelsAndData = raw_data.map(parseWithOneHotEncoding)

thedata = labelsAndData.values().cache()

normalized = normalize(thedata).cache()

The output is the following:

[ 0.00000000e+00 2.48680000e+04 3.50832000e+05 0.00000000e+00
0.00000000e+00 0.00000000e+00 1.00000000e+00 0.00000000e+00
1.01000000e+02 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00 0.00000000e+00 7.79000000e+02
1.03300000e+03 0.00000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 1.01000000e+02 0.00000000e+00 5.51000000e+00
7.78300000e+03 2.26050000e+04 1.01000000e+02 0.00000000e+00
9.05000000e+00 3.15000000e+00 0.00000000e+00 0.00000000e+00
0.00000000e+00 0.00000000e+00]

Finally, the data has been normalized, we have an optimal value for k, and the categorical variables have been taken care of. We perform k-means one more time:

kclusters = KMeans.train(normalized, 100, maxIterations=10, runs=10, initializationMode="random")

# Keep the 100 points that are furthest from their cluster centre; these are the anomalies
anomaly = normalized.map(lambda point: (point, error(kclusters, point))).takeOrdered(100, key=lambda x: -x[1])
plt.plot([ano[1] for ano in anomaly])
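The error function used here is the distance measure from the earlier steps of the chapter. If it is not already defined, a minimal sketch, assuming it returns the squared Euclidean distance from a point to its nearest cluster centre, is:

import numpy as np

# A minimal sketch of the error helper: squared distance from a point to the
# centre of the cluster that the model assigns it to.
def error(model, point):
    center = model.clusterCenters[model.predict(point)]
    return np.sum((np.array(point) - np.array(center)) ** 2)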

The output plot consists of several steps, each of which depicts a threshold:

The number of anomalies per threshold will be as follows:

Threshold    # of anomalies
75200        10
75900        35
78200        65
78800        78
82800        95
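Given one of these thresholds, the anomalous connections can then be pulled out with a simple filter; a minimal sketch, using 78800 purely as an example value from the table:

# Flag every point whose distance to its nearest cluster centre exceeds the
# chosen threshold (78800 is only an illustrative value taken from the table above).
threshold = 78800
anomalies = normalized.filter(lambda point: error(kclusters, point) > threshold)
print(anomalies.count())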