<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>KNN</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Machine Learning</h2>
<ol>
<li><a href="machine-learning.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> QL</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquerry</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>K-Nearest Neighbors (KNN): Classification methods</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<figure>
<img src="assets/img/machine-ln/classfication-knn1.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
</div>
</div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Content</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#principle">Principle of KNN</a></li>
<ul>
<li><a href="#distance">Distance metrics for k-nearest neighbours</a></li>
<li><a href="#when">When Do We Use the KNN Algorithm?</a></li>
<li><a href="#steps-to-follow">Steps to Effective K-Nearest Neighbors (KNN) Algorithm Implementation</a></li>
<li><a href="#what-k">What value should you choose for k in k-nearest neighbours</a></li>
<li><a href="why-knn">Why Do We Need the KNN Algorithm?</a></li>
<li><a href="#pros">Pros of using KNN</a></li>
<li><a href="#cons">Cons of Using KNN</a></li>
</ul>
<li><a href="#example">Example</a></li>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section>
<!-------------------- Introduction ---------------------->
<h2 id="introdction">Introduction</h2>
<ul>
<li>K-Nearest Neighbors (KNN) is a simple yet powerful supervised machine learning algorithm used for classification and regression tasks.</li>
<li>It is a non-parametric and instance-based learning algorithm, meaning it doesn't make any underlying assumptions about the distribution of data and learns directly from the training instances.</li>
<li>The main objective of the KNN algorithm is to predict the classification of a new sample point based on data points that are separated into several individual classes. It is used in text mining, agriculture, finance, and healthcare.</li>
</ul>
<p></p>
<!-------------------- Principle ---------------------->
<h2 id="principle">Principle of KNN</h2>
<p>The principle behind KNN is straightforward: objects are classified based on the majority class among their K nearest neighbors. In other words, the class label of an unseen data point is determined by the class labels of its K nearest neighbors in the feature space.</p>
<ul>
<li>The central idea behind KNN is to classify a new data point based on the majority class of its nearest neighbors.</li>
<li>It assumes that similar things are close to each other. Hence, if most of the nearest neighbors of a data point belong to a certain class, the new data point is likely to belong to that class as well.</li>
<li>Let us suppose that we have a set of training observations \((x,y)\) that capture the relationship between \(x\) and \(y\). Our goal is to build a model from these observations and then predict \(y\) for new data.</li>
<li>In the context of \(K\)-Nearest Neighbors (KNN), our goal is to learn a function \(h : X \rightarrow Y\) that can predict the output \(y\) for an unseen observation \(x\) based on the relationships between the input features \(X\) and the corresponding output labels \(Y\) in a labeled dataset.</li>
<li>In the K-Nearest Neighbors (KNN) algorithm, \(K\) represents the number of nearest neighbors to consider when making predictions for a new data point.</li>
<li>When a new data point is to be classified, KNN calculates the distances between that point and all other points in the dataset. It then selects the \(K\) nearest neighbors based on these distances. The class label or value assigned to the new data point is typically determined by the majority class or average value among these \(K\) nearest neighbors.</li>
<li>The choice of \(K\) is crucial in KNN, as it significantly impacts the performance of the algorithm. A smaller \(K\) value may lead to more flexible decision boundaries, potentially capturing noise in the data but being more sensitive to outliers. On the other hand, a larger \(K\) value may provide smoother decision boundaries but might not capture local patterns effectively.</li>
<li>Selecting an appropriate \(K\) value often involves experimentation and validation using techniques such as cross-validation to ensure optimal performance for the specific dataset and problem at hand.</li>
</ul>
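<p>To make this procedure concrete, here is a minimal from-scratch sketch of KNN classification using Euclidean distance and a majority vote (the function name and the toy data below are purely illustrative):</p>
<pre><code class="language-python">
import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_new, k=3):
    """Predict the label of x_new by majority vote among its k nearest neighbours."""
    # Euclidean distance from x_new to every training point
    distances = np.sqrt(((X_train - x_new) ** 2).sum(axis=1))
    # Indices of the k closest training points
    nearest = np.argsort(distances)[:k]
    # Majority vote among their labels
    return Counter(y_train[nearest]).most_common(1)[0][0]

# Illustrative toy data: two clusters with labels 0 and 1
X_train = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 3.0], [6.0, 7.0], [7.0, 8.0]])
y_train = np.array([0, 0, 0, 1, 1])
print(knn_predict(X_train, y_train, np.array([2.5, 3.0]), k=3))  # prints 0
</code></pre>
<p>Libraries such as scikit-learn implement the same idea far more efficiently (see the implementation steps below), but the logic is exactly this: compute distances, pick the \(K\) closest points, and vote.</p>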
<!---------------------------------->
<h3 id="distance">Distance metrics for k-nearest neighbours</h3>
<p>Distance metrics are crucial in the k-Nearest Neighbours (KNN) algorithm, as they determine how similarity or dissimilarity between data points is measured. The choice of distance metric can significantly impact the performance of the KNN algorithm. Here are some standard distance metrics used in KNN:</p>
<ol>
<li><strong>Euclidean Distance: </strong>The most commonly used distance metric in KNN. Calculates the straight-line distance between two points in the feature space. Suitable for continuous numerical features.
<p><strong>Formula: </strong></p>
$$d(\vec{p},\vec{q}) = \sqrt{\sum_{i=1}^n (p_i - q_i)^2}$$
<p>The Euclidean distance measures the straight-line distance between two points in a Euclidean space and the formula calculates the square root of the sum of squared differences between corresponding coordinates of two points.</p>
</li>
<li><strong> Manhattan Distance (Taxicab or City Block Distance): </strong>Calculates the distance between two points by summing the absolute differences between their coordinates along each dimension. Practical when dealing with features measured in different units.
<p><strong>Formula: </strong></p>
$$d(\vec{p},\vec{q}) = \sum_{i=1}^n |p_i - q_i|$$
<p>Manhattan distance calculates the distance between two points by summing the absolute differences of their coordinates. It represents the distance a taxicab would travel to reach the destination by moving along the grid-like city blocks.</p>
</li>
<li><strong>Minkowski Distance: </strong>Minkowski distance is a generalization of both Euclidean and Manhattan distances.
<p><strong>Formula: </strong></p>
$$d(\vec{p},\vec{q}) = \left(\sum_{i=1}^n |p_i - q_i|^{r}\right)^{1/r}$$
<p>where the parameter \(r\) controls the order of the distance metric. When \(r=1\), it reduces to the Manhattan distance, and when \(r=2\), it becomes the Euclidean distance.</p>
</li>
<li><strong>Chebyshev Distance (Maximum Metric): </strong>
Chebyshev distance calculates the maximum absolute difference between the coordinates of two points.
<p><strong>Formula:</strong></p>
$$d(\vec{p},\vec{q}) = \max_{i}(|p_i - q_i|)$$
<p>It represents the distance a king would travel on a chessboard to move between two squares.</p>
</li>
<li><strong>Hamming Distance (for categorical data):</strong> Hamming distance measures the number of positions at which the corresponding symbols are different between two strings of equal length.
<p><strong>Formula:</strong></p>
$$d(\vec{p},\vec{q}) = \sum_{i=1}^n \delta(p_i \neq q_i)$$
It's commonly used for categorical variables or binary data.
</li>
<li><strong>Cosine Similarity: </strong> The Cosine Similarity is a metric used to measure the similarity between two vectors in a multidimensional space. It calculates the cosine of the angle between the two vectors, providing a measure of how closely the vectors align in direction, irrespective of their magnitude. The Cosine Similarity ranges from -1 to 1.
<p><strong>Formula:</strong></p>
$$\cos(\theta) = \frac{\mathbf{a} \cdot \mathbf{b}}{\|\mathbf{a}\| \cdot \|\mathbf{b}\|}$$
where:
<ul>
<li>\(\mathbf{a} \cdot \mathbf{b}\) represents the dot product of the two vectors.</li>
<li>\(\|\mathbf{a}\|\) and \(\|\mathbf{b}\|\) represent the magnitudes (or norms) of the vectors \(\mathbf{a}\) and \(\mathbf{b}\), respectively.</li>
</ul>
<p>The Cosine Similarity is widely used in text analysis, document clustering, recommendation systems, and many other applications where vector representations are used to measure similarity between entities.</p>
</li>
<li><strong>Correlation Distance: </strong>Correlation distance is a measure of dissimilarity between two vectors that takes into account the correlation between their elements. It quantifies how much two vectors deviate from being perfectly positively correlated (correlation coefficient of 1) or perfectly negatively correlated (correlation coefficient of -1). The correlation distance ranges from 0 to 2, where:
<ul>
<li>If the correlation coefficient is 1 (perfect positive correlation), the correlation distance is 0.</li>
<li>If the correlation coefficient is -1 (perfect negative correlation), the correlation distance is 2.</li>
</ul>
<p><strong>Formula:</strong></p> The formula for calculating the correlation distance between two vectors \(\vec{x}\) and \(\vec{y}\) of length \(n\) is given by
$$\text{Correlation Distance}(\vec{x}, \vec{y}) = 1- \text{Correlation Coefficient}(\vec{x}, \vec{y})$$
where:
<ul>
<li>The correlation coefficient Correlation Coefficient(\(x\),\(y\)) measures the strength and direction of the linear relationship between the elements of vectors \(x\) and \(y\).</li>
<li>The correlation distance is calculated as 1 minus the correlation coefficient to convert the correlation coefficient into a distance metric.</li>
</ul>
</li>
</ol>
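<p>For reference, these metrics can be written compactly with NumPy. The sketch below uses illustrative function names; in practice the same metrics are available in <code>scipy.spatial.distance</code> and through the <code>metric</code> parameter of scikit-learn's KNN estimators:</p>
<pre><code class="language-python">
import numpy as np

def euclidean(p, q):
    return np.sqrt(np.sum((p - q) ** 2))

def manhattan(p, q):
    return np.sum(np.abs(p - q))

def minkowski(p, q, r=2):
    # r=1 gives Manhattan distance, r=2 gives Euclidean distance
    return np.sum(np.abs(p - q) ** r) ** (1 / r)

def chebyshev(p, q):
    return np.max(np.abs(p - q))

def hamming(p, q):
    # Number of positions where the entries differ (categorical/binary data)
    return np.sum(p != q)

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def correlation_distance(x, y):
    # 1 minus the Pearson correlation coefficient
    return 1 - np.corrcoef(x, y)[0, 1]

p, q = np.array([1.0, 2.0, 3.0]), np.array([4.0, 6.0, 8.0])
print(euclidean(p, q), manhattan(p, q), chebyshev(p, q))
</code></pre>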
<!---------------------------------->
<h3 id="when">When Do We Use the KNN Algorithm?</h3>
<p>The K-Nearest Neighbors (KNN) algorithm is used in various scenarios where its characteristics align well with the requirements of the problem. Here are some common situations where KNN is often used:</p>
<ul>
<li>Data is labeled</li>
<li>Data is noise-free</li>
<li>Dataset is small, as KNN is a lazy learner</li>
</ul>
<!--------------------------------->
<h3 id="steps-to-follow">Steps to Effective K-Nearest Neighbors (KNN) Algorithm Implementation</h3>
The working of K-NN can be explained with the following steps:
<ul>
<li><strong>Step-1: (Data processing): </strong> Clean and preprocess your dataset to handle missing values, outliers, and irrelevant features. Normalize or standardize the numerical features to ensure that all features contribute equally to the distance calculations.</li>
<pre><code class="language-python">
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Assume X_train and X_test are your feature matrices
# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)
</code></pre>
<li><strong>Step-2: (Split Data): </strong> Split your dataset into training and testing sets. The training set will be used to train the KNN model, while the testing set will be used to evaluate its performance.</li>
<pre><code class="language-python">
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
</code></pre>
<li><strong>Step-3: (Choose \(K\)): </strong>Decide on the value of \(K\) based on cross-validation or experimentation to find the optimal value that minimizes error on the validation set.</li>
<pre><code class="language-python">
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Define KNN classifier
knn = KNeighborsClassifier()
# Define parameter grid
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
# Perform grid search
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Get the best value of K
best_k = grid_search.best_params_['n_neighbors']
</code></pre>
<li><strong>Step-4 (Select Distance Metric): </strong> Choose an appropriate distance metric (e.g., Euclidean, Manhattan, Cosine) based on the nature of your data and problem domain. Different distance metrics may perform differently on different types of data.</li>
<pre><code class="language-python">
# By default, KNeighborsClassifier uses Euclidean distance
# You can specify different distance metrics using the 'metric' parameter
knn = KNeighborsClassifier(metric='manhattan')
</code></pre>
<li><strong>Step-5 (Train Model): </strong> Train the KNN model using the training dataset. The model memorizes the training data and does not require an explicit training step.</li>
<pre><code class="language-python">
# Using the best value of K obtained from grid search
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
</code></pre>
<li><strong>Step-6 (Evaluate the Model): </strong> Use the testing dataset to evaluate the performance of the trained KNN model. Calculate evaluation metrics such as accuracy, precision, recall, F1-score, or others depending on the specific problem.</li>
<pre><code class="language-python">
from sklearn.metrics import accuracy_score
# Make predictions on the testing set
y_pred = knn.predict(X_test_scaled)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
</code></pre>
<li><strong>Step-7 (Tune Hyperparameters): </strong> Fine-tune the hyperparameters of the KNN algorithm, such as the value of \(K\) and the choice of distance metric, based on the evaluation results. Perform grid search or randomized search to find the best hyperparameters.</li>
<pre><code class="language-python">
# Use grid search or randomized search to tune hyperparameters
# An example using grid search is shown in Step 3
</code></pre>
<li><strong>Step-8 (Cross-Validation): </strong> Implement cross-validation techniques such as k-fold cross-validation to ensure the robustness of the model and to assess its generalization performance on unseen data.</li>
<pre><code class="language-python">
from sklearn.model_selection import cross_val_score
# Perform k-fold cross-validation
cv_scores = cross_val_score(knn, X_train_scaled, y_train, cv=5)
</code></pre>
<li><strong>Step-9 (Feature Selection): </strong> Conduct feature selection or dimensionality reduction techniques if necessary to improve the model's performance and reduce computational complexity.</li>
<pre><code class="language-python">
# Use feature selection techniques such as SelectKBest or PCA
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
# Example using SelectKBest
selector = SelectKBest(k=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
# Example using PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
</code></pre>
<li><strong>Step-10 (Optimize Performance): </strong> Optimize the performance of the algorithm by using efficient data structures such as KD-trees or Ball trees for fast nearest neighbor search, especially for large datasets.</li>
<pre><code class="language-python">
# Use efficient data structures for nearest neighbor search
# By default, KNeighborsClassifier uses a brute-force algorithm
# For large datasets, you can use KD-trees or Ball trees for faster search
knn = KNeighborsClassifier(algorithm='kd_tree')
</code></pre>
</ul>
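<p>The preprocessing, scaling, and model-selection steps above can also be chained into a single scikit-learn <code>Pipeline</code>. The following is a minimal sketch, assuming a generic feature matrix <code>X</code> and label vector <code>y</code>; the advantage of the pipeline is that the scaler is re-fitted on the training folds only during cross-validation, which avoids data leakage:</p>
<pre><code class="language-python">
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Assume X and y are already loaded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling and classification in one estimator
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Tune the number of neighbours and the distance metric together
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__metric': ['euclidean', 'manhattan']
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.score(X_test, y_test))
</code></pre>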
<!--------------------------------->
<h3 id="what-k">What value should you choose for k in k-nearest neighbours?</h3>
<p>Choosing the correct value for k in the k-Nearest Neighbours (KNN) algorithm is a critical step, as it can significantly impact the model’s performance. Selecting an appropriate k value involves finding a balance between bias and variance. Here are a few approaches you can use to choose the optimal k value:</p>
<ul>
<li><strong>Cross-Validation: </strong>Split your dataset into multiple folds (e.g., using k-fold cross-validation). For each fold, train the KNN model with different k values and evaluate its performance on the validation set. Calculate the average performance metric (e.g., accuracy) for each k value across all folds. Choose the k value that results in the best average performance.</li>
<li><strong>Odd vs. Even \(K\) values:</strong> Choose odd \(K\) values to avoid ties when classifying data points with an equal number of nearest neighbours from different classes. Using an odd k value prevents indecisiveness in classification.</li>
<li><strong>Elbow Method: </strong>Plot the performance metric (e.g., accuracy) as a function of different k values. Look for the point on the plot where the performance stabilizes or starts to decrease. This point resembles an “elbow.” This method helps you identify a value k that offers a good trade-off between bias and variance.</li>
<li><strong>Grid Search: </strong>Perform a grid search over a predefined range of k values. Train and evaluate the model for each k value in the range. Choose the k value that gives the best performance on a validation set.</li>
<li><strong>Domain Knowledge: </strong> Sometimes, domain knowledge can help you make an informed decision about the k value. For example, if you know that the problem is expected to have specific characteristics, you can choose a k value accordingly.</li>
<li><strong>Use Case-Specific Testing: </strong>Experiment with different k values and assess the model’s performance on a separate test dataset that wasn’t used during training. This approach helps you directly observe how different k values affect real-world predictions.</li>
</ul>
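<p>A simple way to combine the cross-validation and elbow ideas above is to score the classifier over a range of \(K\) values and plot the mean accuracy. A minimal sketch, assuming the scaled training data <code>X_train_scaled</code> and labels <code>y_train</code> from the implementation steps:</p>
<pre><code class="language-python">
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

k_values = range(1, 31, 2)  # odd values to avoid ties
scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                          X_train_scaled, y_train, cv=5).mean()
          for k in k_values]

plt.plot(list(k_values), scores, marker='o')
plt.xlabel('K (number of neighbours)')
plt.ylabel('Mean cross-validated accuracy')
plt.show()

best_k = k_values[scores.index(max(scores))]
print(best_k)
</code></pre>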
<!---------------------------------->
<h3 id="why-knn">Why Do We Need the KNN Algorithm?</h3>
The K-Nearest Neighbors (KNN) algorithm is valuable in data science and analytics for several reasons:
<ul>
<li><strong>Flexibility in Data Patterns:</strong> KNN is effective in recognizing patterns in data that might not follow a linear or parametric model. It can handle complex relationships between features and target classes, making it suitable for a wide range of datasets.</li>
<li><strong>Non-parametric Approach:</strong> Unlike some other algorithms that make assumptions about the underlying data distribution, KNN is non-parametric. It doesn't require the data to be normally distributed or have a specific shape, making it versatile and applicable in various domains.</li>
<li><strong>Handling Multi-Class Classification:</strong> KNN can handle multi-class classification problems with ease. By considering the majority class among the nearest neighbors, it can assign a class label to a new data point based on the distribution of classes in its vicinity.</li>
<li><strong>Adaptability to New Data:</strong> KNN is well-suited for online learning scenarios where new data points need to be incorporated into the existing model without retraining. Since KNN doesn't have a training phase and simply memorizes the training data, it can quickly adapt to changes in the dataset.</li>
<li><strong>Interpretability:</strong> KNN predictions are intuitive and easy to interpret. The class assigned to a new data point is based on the classes of its nearest neighbors, providing insights into why a particular prediction was made.</li>
<li><strong>Robustness to Noisy Data:</strong> KNN can handle noisy data and outliers reasonably well. Since it relies on the majority class among the nearest neighbors, outliers are less likely to significantly impact the classification results compared to some other algorithms.</li>
<li><strong>Simple Implementation:</strong> KNN is relatively easy to implement and understand, making it accessible to beginners in machine learning and pattern recognition. Its simplicity also allows for quick experimentation and prototyping.</li>
</ul>
<!---------------------------------->
<h3 id="pros">Pros of using KNN</h3>
<ul>
<li><strong>Simplicity:</strong> KNN is easy to understand and implement, making it accessible to beginners in machine learning and data science.</li>
<li><strong>No Training Phase:</strong> KNN is a lazy learning algorithm, meaning it doesn't require a training phase. Instead, it memorizes the entire training dataset, making it efficient for online learning scenarios where new data points need to be incorporated without retraining.</li>
<li><strong>Versatility:</strong> KNN can be applied to both classification and regression tasks. It can handle complex relationships between features and target variables, making it suitable for a wide range of datasets.</li>
<li><strong>Non-parametric:</strong> KNN makes no assumptions about the underlying data distribution, making it robust to different types of data. It can capture complex patterns in the data without relying on predefined models.</li>
<li><strong>Interpretability:</strong> Predictions made by KNN are intuitive and easy to interpret. The class label or regression value assigned to a new data point is based on the majority class or average value of its nearest neighbors.</li>
</ul>
<!-------------------------------------->
<h3 id="cons">Cons of Using KNN</h3>
<ol>
<li><strong>Computational Complexity:</strong> As the size of the training dataset grows, the computational cost of KNN increases significantly. Calculating distances between the new data point and all existing data points in the training set can be time-consuming for large datasets.</li>
<li><strong>Memory Usage:</strong> Since KNN memorizes the entire training dataset, it requires storing all training instances in memory. This can be problematic for datasets with a large number of features or a high-dimensional feature space, leading to high memory usage.</li>
<li><strong>Prediction Time:</strong> KNN incurs a high prediction time during inference. For each new data point, KNN needs to calculate distances to all training instances and determine the nearest neighbors, which can be slow for real-time or latency-sensitive applications.</li>
<li><strong>Sensitivity to Irrelevant Features:</strong> KNN considers all features equally when calculating distances between data points. Irrelevant or noisy features can negatively impact the algorithm's performance and lead to suboptimal results.</li>
<li><strong>Need for Optimal K Value:</strong> The choice of the hyperparameter K significantly influences the performance of KNN. Selecting an inappropriate value of K can lead to overfitting or underfitting of the model, requiring careful tuning and validation.</li>
<li><strong>Curse of Dimensionality:</strong> KNN's performance can degrade in high-dimensional feature spaces due to the curse of dimensionality. As the number of dimensions increases, the Euclidean distance between data points becomes less meaningful, making it challenging to define nearest neighbors accurately.</li>
</ol>
<!-------------------------------->
<h2 id="example">Example with the dataset on 'user_data.csv'</h2>
<p>We again consider the 'user_data.csv' dataset used for the <a href="naive-byes.html" target="_blank">Naive-Bayes algorithm</a> (for the full code, see the 'Project-2.3-KNN-classification.ipynb' notebook in the <a href="https://github.com/arunp77/Machine-Learning/tree/main/Projects-ML/Reg-models" target="_blank">GitHub repository</a>).</p>
<ul>
<li><strong>Data loading: </strong></li>
<pre><code class="language-python">
# loading the important libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# loading the dataset and creating a dataframe
df_user = pd.read_csv('User_Data.csv')
# Display the DataFrame
print(df_user.head())
</code></pre>
<figure>
<img src="assets/img/machine-ln/classfication-user-table.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<li><strong>Data cleaning, preprocessing, splitting, and scaling: </strong>
<pre><code class="language-python">
# Importing the dataset
X = df_user.iloc[:, [2, 3]].values # features
y = df_user.iloc[:, 4].values # target
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
</code></pre>
</li>
<li><strong>Fitting K-NN classifier to the Training data: </strong>
<p>In the next step, we fit the K-NN classifier to the training data. To do this, we import the <code>KNeighborsClassifier</code> class from <code>sklearn.neighbors</code> and create a <code>classifier</code> object, which takes several parameters (for more details, see the official documentation: <a href="https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html" target="_blank">sklearn.neighbors.KNeighborsClassifier</a>):</p>
<ul>
<li><code>n_neighbors</code>: defines the required neighbors of the algorithm.</li>
<li><code>metric='minkowski'</code>: the default metric; it determines how the distance between points is computed.</li>
<li><code>p=2</code>: It is equivalent to the standard Euclidean metric.</li>
</ul>
<pre><code class="language-python">
#Fitting K-NN classifier to the training set
from sklearn.neighbors import KNeighborsClassifier
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
</code></pre>
</li>
<li><strong>Predicting the Test Result: </strong>
<pre><code class="language-python">
#Predicting the test set result
y_pred = classifier.predict(X_test)
</code></pre>
</li>
<li><strong>Creating the Confusion Matrix:</strong> Now we will create the <a href="https://arunp77.github.io/logistic-regression.html#con-mat" target="_blank">Confusion matrix</a> for our K-NN model to see the accuracy of the classifier. Below is the code for it:
<pre><code class="language-python">
#Creating the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test, y_pred)
cm
</code></pre>
which returns a NumPy array:
<pre>
array([[64,  4],
       [ 3, 29]], dtype=int64)
</pre>
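<p>The confusion matrix can be summarised with the usual metrics from <code>sklearn.metrics</code> (a quick check, not part of the original notebook): with 64 + 29 correct predictions out of 100 test points, the accuracy is 0.93.</p>
<pre><code class="language-python">
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))        # 0.93 for the matrix above
print(classification_report(y_test, y_pred)) # precision, recall and F1 per class
</code></pre>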
</li>
<li><strong>Visualizing the Training set result: </strong>
Now we will visualize the training and test dataset results for the K-NN model.
<pre><code class="language-python">
# Importing libraries
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Set up the figure with two subplots in one row and two columns
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Visualizing the training set result
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
axes[0].contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(['#87CEEB', '#90EE90']))
axes[0].set_xlim(X1.min(), X1.max())
axes[0].set_ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
axes[0].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(['#0000FF', '#2ca02c'])(i), label=j)
axes[0].set_title('K-NN Algorithm (Training set)')
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Estimated Salary')
axes[0].legend()
# Visualizing the test set result
x_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
axes[1].contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(['#87CEEB', '#90EE90']))
axes[1].set_xlim(X1.min(), X1.max())
axes[1].set_ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
axes[1].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(['#0000FF', '#2ca02c'])(i), label=j)
axes[1].set_title('K-NN Algorithm (Test set)')
axes[1].set_xlabel('Age')
axes[1].set_ylabel('Estimated Salary')
axes[1].legend()
plt.tight_layout()
plt.show()
</code></pre>
<figure>
<img src="assets/img/machine-ln/classfication-knn-plot.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<p>The output graph is different from the graph we obtained with Logistic Regression (for details, see <a href="naive-byes.html">Naive-Bayes classification</a>).</p>
</li>
<p><strong>Explanation for training dataset:</strong></p>
<ul>
<li>The graph shows blue and green points: the green points correspond to users who purchased the SUV (1) and the blue points to users who did not (0).</li>
<li>The decision boundary is irregular rather than a straight line or smooth curve because K-NN classifies each region according to its nearest neighbors.</li>
<li>The graph has classified users into the correct categories: most of the users who didn't buy the SUV lie in the blue region, and most who bought it lie in the green region.</li>
<li>The result is good, but there are still a few green points in the blue region and a few blue points in the green region. This small amount of misclassification is acceptable and helps prevent the model from overfitting.</li>
<li>Hence our model is well trained.</li>
</ul>
<p><strong>Explanation for the test dataset:</strong></p>
<ul>
<li>The right-hand panel shows the result for the test dataset.</li>
<li>As we can see in the graph, the predictions are quite good: most of the blue points lie in the light-blue region and most of the green points lie in the light-green region.</li>
<li>However, a few green points fall in the light-blue region and a few blue points in the light-green region. These are the misclassified observations that appear in the confusion matrix.</li>
</ul>
</ul>
</section>
<!----------- Reference ----------->
<section id="reference">
<h2>References</h2>
<ul>
<li><a href="https://arunp77.github.io/logistic-regression.html#con-mat" target="_blank">Confusion matrix details</a>.</li>
<li>My GitHub repository on <a href="https://github.com/arunp77/Machine-Learning/" target="_blank">Machine Learning</a></li>
<li><a href="https://mlu-explain.github.io/linear-regression/" target="_blank">A Visual Introduction To Linear regression</a> (Best reference for theory and visualization).</li>
<li>Book on Regression model: <a href="https://avehtari.github.io/ROS-Examples/" target="_blank">Regression and Other Stories</a></li>
<li>Book on Statistics: <a href="https://hastie.su.domains/Papers/ESLII.pdf" target="_blank">The Elements of Statistical Learning</a></li>
<li><a href="https://www.javatpoint.com/machine-learning-naive-bayes-classifier" target="_blank">Naïve Bayes Classifier Algorithm, JAVAPoint.com</a></li>
<li><a href="https://www.colorado.edu/amath/sites/default/files/attached-files/ch12_0.pdf">https://www.colorado.edu/amath/sites/default/files/attached-files/ch12_0.pdf</a></li>
<li><a href="https://datahacker.rs/002-machine-learning-linear-regression-model/" target="_blank">One of the best description on Linear regression</a>.</li>
</ul>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">For Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="machine-learning.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
document.addEventListener("DOMContentLoaded", function () {
Prism.highlightAll();
});
</script>
</body>
</html>