@data(), 133
@everywhere, 327
@parallel, 327
@time, 72, 312
abs(), 196, 198
accuracy rate, 39, 167, 209, 210
addprocs(n), 326
alphanumeric operator, 76
ANN. See artificial neural network
ANN2vector(), 263
append!(), 65
apply(), 74, 272
apply_forest(), 257, 259
apply_forest_proba(), 257
apply_tree(), 252
apply_tree_proba(), 252
array, 51, 52–55
array(), 196
artificial neural network, 260–65
association rules extraction, 106
Atom software, 15
AUC metric, 216
AUC(), 216
bar plot, 162
basic accuracy, 208
basic random sampling, 38
Bayesian network, 275
BigFloat, 51, 52
BigInt, 51, 52
binarization, 143
binning, 143
Bokeh, 161, 184
Bool, 51, 61, 96, 144, 145, 147, 290
boosted tree, 273
Boltzmann machine, 274
box plot, 162
break, 79
build_forest(), 256, 258
by(), 133
Canopy, 14
centers, 233
Char, 51
chi-square test, 160, 175
classification, 34, 148
classify(), 35, 36
cleaning up data, 137–39
cliques, 293
clustering, 119
CM2EL(), 291
C-means, 231
code
    organizing, 25
    referencing, 26
collect(), 58
colwise(), 131
confusion matrix, 208
connected component, 291
connected_components(), 292
convert(), 140, 141
cor(), 158
correctrate(), 209
counts(), 233
CPU_CORES, 326
curve fitting, 269
custom function, 87–91
cutree(), 240
cycle detection, 288–91
DAG. See directed acyclical graph
data cleansing, 104
data discovery, 110–11, 119
data engineering, 102, 103–10
data exploration, 106–8, 118
data frame, 104, 126–35
data learning, 111–13, 119
data modeling, 102, 110–13
data preparation, 103–6, 117
data product, 101
data product creation, 113, 120
data representation, 108–10, 118
data science pipeline, 101–3
data science workflow, 5–7
data type, 49–52
data-driven application, 101
dataset
    downloading, 30
    loading, 30–32
DBSCAN, 234–37
decision tree, 249–53
deep belief network, 274
deep learning, 112
deep learning system, 274
delete!(), 133
deleterows!(), 133
Density Based Spatial Clustering of Applications with Noise. See DBSCAN
describe(), 129, 157
dictionary, 55–56
DID(), 198
dijkstra_shortest_paths(), 295
Dijkstra algorithm, 294, 302
dimensionality reduction, 119, 187
directed acyclical graph, 289
discernibility-based method, 201
discretization, 143
distance calculation function, 34
distance(), 35, 331
distance-based classifier, 33, 142
eachmatch(), 86
ELM. See extreme learning machine
Emacs, 18
Epicenter, 17, 18, 20, 45
epoch, 261
Euclidean, 34, 47, 219, 229, 230, 231, 242, 245
extreme learning machine, 265–68
F1 metric, 211
factor analysis, 111
false negative, 171, 172
false positive, 171, 172
feature_type(), 93
Fibonacci, 1
find(), 132, 197
Fisher’s Discriminant Ratio, 149, 151, 194
fit(), 193, 267, 270
fitness function, 200, 201
fitness(), 201
Float32, 51
Float64, 51
FN. See false negative
for-loop, 78
formatting data, 139–41
FP. See false positive
GA. See genetic algorithm approach
Gadfly, 154, 160, 161, 162, 163, 170, 184, 185, 319
generalization, 111
genetic algorithm approach, 200
get(), 69
Gini coefficient, 216
GPU. See graphics processing unit
gradient descent, 269
graph
    analysis of, 300
    dataset for, 285–87
    importance of, 282–84
    shortest path in a, 294–96
    statistics of, 287
graphics processing unit, 275
Graphlab, 41, 75, 127, 134, 329
harmonic mean, 212
has_missing_values(), 93
hcat(), 207
hclust(), 240
HDF5, 41, 46
head(), 129
help, 44
help(), 44
hierarchical clustering, 229, 237–40
histogram, 168–70
hypothesis, 106
hypothesis testing, 170–77
HypothesisTests, 154, 173, 177, 181, 184, 185, 186, 319
IDE. See integrated development environment
if-else statement, 72
IJulia, 16–17
importing and exporting data, 135–37
in, 65
include, 26
index, 52
Index of Discernibility, 148, 149, 151, 194, 197, 203
indmin(), 330
information distillation, 102, 113–16
init_network(), 262
insert!(), 68
insight, deliverance, and visualization, 114, 121
Int32, 50, 51, 57
int32(), 50
Int64, 51, 53, 78, 80, 109, 132, 160
Int8, 51, 52, 61, 140, 147
integrated development environment, 1, 14
intersect(), 176
IPython, 16
ismatch(), 85
isna(), 132, 150
Jaccard Coefficient, 159
Jaccard Similarity, 149, 151, 159, 337
Java, 1, 7, 120, 300, 314
join(), 83
Julia
    bridging with Python, 323–24
    bridging with R, 321
    data science community adoption of, 7–8
    interest in, 1
    parallelization in, 325–27
Julia Data Format, 41
Julia Studio, 17
JuliaBox, 17, 18, 45, 329
Juno, 14–16
Jupyter, 16, 23, 24, 45
k Nearest Neighbor, 33–39, 45, 274
Kendall’s tau rank correlation, 159
KFCV. See k-fold cross validation
k-fold cross validation, 220–22
K-means, 231–34
kNN. See k Nearest Neighbor
kNN(), 331
kruskal_minimum_spantree(), 298
Kruskal-Wallis test, 172
K-W. See Kruskal-Wallis test
Leave-One-Out, 220, 225
length(), 46, 71, 89, 330
LG2G(), 299, 303
Light Table, 14
line plot, 163
linspace(), 59
listening to the data, 153
load(), 299
logical operator, 76
logistic model, 248
lowercase(), 146
Magic telescope, 27
magic dataset, 27, 32
main(), 94
map(), 74
match(), 85
matchall(), 86
Matlab, 1, 4, 7, 17, 41, 78, 146
matrix, 50
max(), 47, 331
maximal_cliques(), 293
mean, 104
mean square error, 217, 224
mean(), 64, 131
MergeSort, 69
minimum spanning tree, 296–99
misclassification cost, 212
mode(), 94
Monte-Carlo, 325
MSE. See mean square error
MSE(), 248, 272
MST. See minimum spanning tree
Mutual Information, 149, 151, 194, 203, 337
names(), 127
natural language processing, 104, 109, 309
Neobook, 121
nfoldCV_forest(), 257, 259
nfoldCV_tree(), 252
NLP. See natural language processing
normalization, 142–43, 242
normalize(), 281
notebook
    creating, 21
    exporting, 24
    loading, 24
    renaming, 23
    saving, 22
Notepad++, 18
nprocs(), 326
nrow(), 134
odds ratio, 145
OnlineNewsPopularity dataset, 28
Opus Pro, 121
outlier, 137
package
    finding and selecting, 18–19
    hacking, 21
    installing, 20–21
    using, 21
pair(), 55
partitional clustering, 229
PCA. See Principal Components Analysis
Pearson’s correlation, 159
Plotly, 161, 184
plots, 160–70
pmap, 327
polynomial, 142
pop!(), 66
precision, 211
predict(), 267
principal component, 188
Principal Components Analysis, 188–93
print(), 56
println(), 57
procs(), 327
Programming Praxis, 1
Project Euler, 1
push!(), 67, 133
Python, 14, 74
quickshift(), 238, 244
quickshiftlabels(), 239
quickshiftplot(), 239
QuickSort, 69
R, 75
RAD. See rapid application development
rand(), 60–63
randn(), 60–63
random forest, 255–60
randperm(), 206
rapid application development, 120
RCE. See Reduced Coulomb Energy
Read, Evaluate, Print, Loop, 13, 16, 51
readlines(), 31
readtable(), 136, 150
recall, 211
receiver operating characteristic curve, 113, 213
Reduced Coulomb Energy, 274
regex, 83–84
regression, 148. See also statistical regression
regression tree, 254–55
relative risk transformation, 145
rename!(), 127, 128
rename(), 128
REPL. See Read, Evaluate, Print, Loop
residual variance, 191
ROC curve. See receiver operating characteristic curve
roc(), 215
round(), 59, 267
sample(), 206, 207
sampling, 205–7
SArrays, 41
save(), 42, 299
saving data
    delimited file, 40
    native Julia format, 41
    text file, 43
scatter plot, 164–68
SFrame, 134, 336
SFrames, 41
SGraphs, 41
show(), 58
sig(), 310
signal processing, 109
sil(), 241, 245
silhouette, 240
Silhouette Width, 240
Similarity Index, 151, 159, 337
Simple Matching Coefficient, 159
size(), 71
skewness, 91
skewness(), 157
skewness_type(), 157
sort!(), 69, 133, 134
sort(), 69
Spam Emails dataset, 29
Spearman’s rank correlation, 159
splice!(), 67
split(), 82
sqrt(), 330
SSE. See sum squared error
Stackoverflow, 44, 46
standard deviation, 104
statistical regression, 268–73
StatsBase, 154, 175, 184, 206, 207, 319
stemming, 104
stop word, 104
String, 51
string manipulation, 81–87
string(), 74, 141, 333
subtype, 50, 140
sum squared error, 217, 218
sum(), 63
summarystats(), 157
supervised machine learning, 247–48
support, 44
support vector machine, 273
SVM. See support vector machine
symbol(), 128
tail(), 129
time(), 71
total misclassification cost, 213
train(), 262, 263
Transductive Support Vector Machine, 274
transductive system, 274
transform(), 192, 193
true negative, 172
true positive, 172
t-SNE, 166, 182
TSVM. See Transductive Support Vector Machine
t-test, 173
Tutorials Point, 18
typemax(), 57
typemin(), 57
typeof(), 50
unsupervised learning, 228–31
uppercase(), 146
validation, 112
var(), 131
Variation of Information, 240, 241, 245
vector, 50, 60
vector2ANN(), 262
vectorization, 146
Vega, 161, 184
weighted accuracy, 209
while-loop, 79
Winston, 161, 184
working directory, 26
wrapper function, 25, 35, 37, 46, 325, 326, 330
writecsv(), 40
writedlm(), 40
writetable(), 136, 150
zeros(), 196