<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta content="width=device-width, initial-scale=1.0" name="viewport">
<title>KNN</title>
<meta content="" name="description">
<meta content="" name="keywords">
<!-- Favicons -->
<link href="assets/img/Favicon-1.png" rel="icon">
<link href="assets/img/Favicon-1.png" rel="apple-touch-icon">
<!-- Google Fonts -->
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300,300i,400,400i,600,600i,700,700i|Raleway:300,300i,400,400i,500,500i,600,600i,700,700i|Poppins:300,300i,400,400i,500,500i,600,600i,700,700i" rel="stylesheet">
<!-- Vendor CSS Files -->
<link href="assets/vendor/aos/aos.css" rel="stylesheet">
<link href="assets/vendor/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="assets/vendor/bootstrap-icons/bootstrap-icons.css" rel="stylesheet">
<link href="assets/vendor/boxicons/css/boxicons.min.css" rel="stylesheet">
<link href="assets/vendor/glightbox/css/glightbox.min.css" rel="stylesheet">
<link href="assets/vendor/swiper/swiper-bundle.min.css" rel="stylesheet">
<!-- Creating a python code section-->
<link rel="stylesheet" href="assets/css/prism.css">
<script src="assets/js/prism.js"></script>
<!-- Template Main CSS File -->
<link href="assets/css/style.css" rel="stylesheet">
<!-- To set the icon, visit https://fontawesome.com/account-->
<script src="https://kit.fontawesome.com/5d25c1efd3.js" crossorigin="anonymous"></script>
<!-- end of icon-->
<script type="text/javascript" async
src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML">
</script>
<!-- =======================================================
* Template Name: iPortfolio
* Updated: Sep 18 2023 with Bootstrap v5.3.2
* Template URL: https://bootstrapmade.com/iportfolio-bootstrap-portfolio-websites-template/
* Author: BootstrapMade.com
* License: https://bootstrapmade.com/license/
======================================================== -->
</head>
<body>
<!-- ======= Mobile nav toggle button ======= -->
<i class="bi bi-list mobile-nav-toggle d-xl-none"></i>
<!-- ======= Header ======= -->
<header id="header">
<div class="d-flex flex-column">
<div class="profile">
<img src="assets/img/myphoto.jpeg" alt="" class="img-fluid rounded-circle">
<h1 class="text-light"><a href="index.html">Arun</a></h1>
<div class="social-links mt-3 text-center">
<a href="https://www.linkedin.com/in/arunp77/" target="_blank" class="linkedin"><i class="bx bxl-linkedin"></i></a>
<a href="https://github.com/arunp77" target="_blank" class="github"><i class="bx bxl-github"></i></a>
<a href="https://twitter.com/arunp77_" target="_blank" class="twitter"><i class="bx bxl-twitter"></i></a>
<a href="https://www.instagram.com/arunp77/" target="_blank" class="instagram"><i class="bx bxl-instagram"></i></a>
<a href="https://arunp77.medium.com/" target="_blank" class="medium"><i class="bx bxl-medium"></i></a>
</div>
</div>
<nav id="navbar" class="nav-menu navbar">
<ul>
<li><a href="index.html#hero" class="nav-link scrollto active"><i class="bx bx-home"></i> <span>Home</span></a></li>
<li><a href="index.html#about" class="nav-link scrollto"><i class="bx bx-user"></i> <span>About</span></a></li>
<li><a href="index.html#resume" class="nav-link scrollto"><i class="bx bx-file-blank"></i> <span>Resume</span></a></li>
<li><a href="index.html#portfolio" class="nav-link scrollto"><i class="bx bx-book-content"></i> <span>Portfolio</span></a></li>
<li><a href="index.html#skills-and-tools" class="nav-link scrollto"><i class="bx bx-wrench"></i> <span>Skills and Tools</span></a></li>
<li><a href="index.html#language" class="nav-link scrollto"><i class="bi bi-menu-up"></i> <span>Languages</span></a></li>
<li><a href="index.html#awards" class="nav-link scrollto"><i class="bi bi-award-fill"></i> <span>Awards</span></a></li>
<li><a href="index.html#professionalcourses" class="nav-link scrollto"><i class="bx bx-book-alt"></i> <span>Professional Certification</span></a></li>
<li><a href="index.html#publications" class="nav-link scrollto"><i class="bx bx-news"></i> <span>Publications</span></a></li>
<li><a href="index.html#extra-curricular" class="nav-link scrollto"><i class="bx bx-rocket"></i> <span>Extra-Curricular Activities</span></a></li>
<!-- <li><a href="#contact" class="nav-link scrollto"><i class="bx bx-envelope"></i> <span>Contact</span></a></li> -->
</ul>
</nav><!-- .nav-menu -->
</div>
</header><!-- End Header -->
<main id="main">
<!-- ======= Breadcrumbs ======= -->
<section id="breadcrumbs" class="breadcrumbs">
<div class="container">
<div class="d-flex justify-content-between align-items-center">
<h2>Machine Learning</h2>
<ol>
<li><a href="machine-learning.html" class="clickable-box">Content section</a></li>
<li><a href="index.html#portfolio" class="clickable-box">Portfolio section</a></li>
</ol>
</div>
</div>
</section><!-- End Breadcrumbs -->
<!------ right dropdown menue ------->
<div class="right-side-list">
<div class="dropdown">
<button class="dropbtn"><strong>Shortcuts:</strong></button>
<div class="dropdown-content">
<ul>
<li><a href="cloud-compute.html"><i class="fas fa-cloud"></i> Cloud</a></li>
<li><a href="AWS-GCP.html"><i class="fas fa-cloud"></i> AWS-GCP</a></li>
<li><a href="amazon-s3.html"><i class="fas fa-cloud"></i> AWS S3</a></li>
<li><a href="ec2-confi.html"><i class="fas fa-server"></i> EC2</a></li>
<li><a href="Docker-Container.html"><i class="fab fa-docker" style="color: rgb(29, 27, 27);"></i> Docker</a></li>
<li><a href="Jupyter-nifi.html"><i class="fab fa-python" style="color: rgb(34, 32, 32);"></i> Jupyter-nifi</a></li>
<li><a href="snowflake-task-stream.html"><i class="fas fa-snowflake"></i> Snowflake</a></li>
<li><a href="data-model.html"><i class="fas fa-database"></i> Data modeling</a></li>
<li><a href="sql-basics.html"><i class="fas fa-table"></i> QL</a></li>
<li><a href="sql-basic-details.html"><i class="fas fa-database"></i> SQL</a></li>
<li><a href="Bigquerry-sql.html"><i class="fas fa-database"></i> Bigquerry</a></li>
<li><a href="scd.html"><i class="fas fa-archive"></i> SCD</a></li>
<li><a href="sql-project.html"><i class="fas fa-database"></i> SQL project</a></li>
<!-- Add more subsections as needed -->
</ul>
</div>
</div>
</div>
<!-- ======= Portfolio Details Section ======= -->
<section id="portfolio-details" class="portfolio-details">
<div class="container">
<div class="row gy-4">
<h1>K-Nearest Neighbors (KNN): Classification methods</h1>
<div class="col-lg-8">
<div class="portfolio-details-slider swiper">
<div class="swiper-wrapper align-items-center">
<div class="swiper-slide">
<figure>
<img src="assets/img/machine-ln/classfication-knn1.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
</div>
</div>
</div>
</div>
<div class="col-lg-4 grey-box">
<div class="section-title">
<h3>Content</h3>
<ol>
<li><a href="#introduction">Introduction</a></li>
<li><a href="#principle">Principle of KNN</a></li>
<ul>
<li><a href="#distance">Distance metrics for k-nearest neighbours</a></li>
<li><a href="#when">When Do We Use the KNN Algorithm?</a></li>
<li><a href="#steps-to-follow">Steps to Effective K-Nearest Neighbors (KNN) Algorithm Implementation</a></li>
<li><a href="#what-k">What value should you choose for k in k-nearest neighbours</a></li>
<li><a href="why-knn">Why Do We Need the KNN Algorithm?</a></li>
<li><a href="#pros">Pros of using KNN</a></li>
<li><a href="#cons">Cons of Using KNN</a></li>
</ul>
<li><a href="#example">Example</a></li>
<li><a href="#reference">Reference</a></li>
</ol>
</div>
</div>
</div>
<section>
<!-------------------- Introduction ---------------------->
<h2 id="introdction">Introduction</h2>
<ul>
<li>K-Nearest Neighbors (KNN) is a simple yet powerful supervised machine learning algorithm used for classification and regression tasks.</li>
<li>It is a non-parametric and instance-based learning algorithm, meaning it doesn't make any underlying assumptions about the distribution of data and learns directly from the training instances.</li>
<li>The main objective of the KNN algorithm is to predict the classification of a new sample point based on data points that are separated into several individual classes. It is used in text mining, agriculture, finance, and healthcare.</li>
</ul>
<p></p>
<!-------------------- Principle ---------------------->
<h2 id="principle">Principle of KNN</h2>
<p>The principle behind KNN is straightforward: objects are classified based on the majority class among their K nearest neighbors. In other words, the class label of an unseen data point is determined by the class labels of its K nearest neighbors in the feature space.</p>
<ul>
<li>The central idea behind KNN is to classify a new data point based on the majority class of its nearest neighbors.</li>
<li>It assumes that similar things are close to each other. Hence, if most of the nearest neighbors of a data point belong to a certain class, the new data point is likely to belong to that class as well.</li>
<li>Let us suppose that we have a set of training observations \((x,y)\) that capture the relationship between \(x\) and \(y\). Our goal is to build a model from these observations and then predict \(y\) for new data.</li>
<li>In the context of \(K\)-Nearest Neighbors (KNN), our goal is to learn a function \(h : X \rightarrow Y\) that can predict the output \(y\) for an unseen observation \(x\) based on the relationships between the input features \(X\) and the corresponding output labels \(Y\) in a labeled dataset.</li>
<li>In the K-Nearest Neighbors (KNN) algorithm, \(K\) represents the number of nearest neighbors to consider when making predictions for a new data point.</li>
<li>When a new data point is to be classified, KNN calculates the distances between that point and all other points in the dataset. It then selects the \(K\) nearest neighbors based on these distances. The class label or value assigned to the new data point is typically determined by the majority class or average value among these \(K\) nearest neighbors.</li>
<li>The choice of \(K\) is crucial in KNN, as it significantly impacts the performance of the algorithm. A smaller \(K\) value may lead to more flexible decision boundaries, potentially capturing noise in the data but being more sensitive to outliers. On the other hand, a larger \(K\) value may provide smoother decision boundaries but might not capture local patterns effectively.</li>
<li>Selecting an appropriate \(K\) value often involves experimentation and validation using techniques such as cross-validation to ensure optimal performance for the specific dataset and problem at hand.</li>
</ul>
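<p>To make this procedure concrete, here is a minimal from-scratch sketch of KNN classification using Euclidean distance and a majority vote (the function name and the toy data below are purely illustrative):</p>
<pre><code class="language-python">
import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, x_new, k=3):
    """Predict the label of x_new by majority vote among its k nearest neighbours."""
    # Euclidean distance from x_new to every training point
    distances = np.sqrt(((X_train - x_new) ** 2).sum(axis=1))
    # Indices of the k closest training points
    nearest = np.argsort(distances)[:k]
    # Majority vote among their labels
    return Counter(y_train[nearest]).most_common(1)[0][0]

# Illustrative toy data: two clusters with labels 0 and 1
X_train = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 3.0], [6.0, 7.0], [7.0, 8.0]])
y_train = np.array([0, 0, 0, 1, 1])
print(knn_predict(X_train, y_train, np.array([2.5, 3.0]), k=3))  # prints 0
</code></pre>
<p>Libraries such as scikit-learn implement the same idea far more efficiently (see the implementation steps below), but the logic is exactly this: compute distances, pick the \(K\) closest points, and vote.</p>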
<!---------------------------------->
<h3 id="distance">Distance metrics for k-nearest neighbours</h3>
<p>Distance metrics are crucial in the k-Nearest Neighbours (KNN) algorithm, as they determine how similarity or dissimilarity between data points is measured. The choice of distance metric can significantly impact the performance of the KNN algorithm. Here are some standard distance metrics used in KNN:</p>
<ol>
<li><strong>Euclidean Distance: </strong>The most commonly used distance metric in KNN. Calculates the straight-line distance between two points in the feature space. Suitable for continuous numerical features.
<p><strong>Formula: </strong></p>
$$d(\vec{p},\vec{q}) = \sqrt{\sum_{i=1}^n (p_i - q_i)^2}$$
<p>The Euclidean distance measures the straight-line distance between two points in a Euclidean space and the formula calculates the square root of the sum of squared differences between corresponding coordinates of two points.</p>
</li>
<li><strong> Manhattan Distance (Taxicab or City Block Distance): </strong>Calculates the distance between two points by summing the absolute differences between their coordinates along each dimension. Practical when dealing with features measured in different units.
<p><strong>Formula: </strong></p>
$$d(\vec{p},\vec{q}) = \sum_{i=1}^n |p_i - q_i|$$
<p>Manhattan distance calculates the distance between two points by summing the absolute differences of their coordinates. It represents the distance a taxicab would travel to reach the destination by moving along the grid-like city blocks.</p>
</li>
<li><strong>Minkowski Distance: </strong>Minkowski distance is a generalization of both Euclidean and Manhattan distances.
<p><strong>Formula: </strong></p>
$$d(\vec{p},\vec{q}) = \left(\sum_{i=1}^n |p_i - q_i|^{r}\right)^{1/r}$$
<p>where the parameter \(r\) controls the order of the distance metric. When \(r=1\), it reduces to the Manhattan distance, and when \(r=2\), it becomes the Euclidean distance.</p>
</li>
<li><strong>Chebyshev Distance (Maximum Metric): </strong>
Chebyshev distance calculates the maximum absolute difference between the coordinates of two points.
<p><strong>Formula:</strong></p>
$$d(\vec{p},\vec{q}) = \max_{i}(|p_i - q_i|)$$
<p>It represents the distance a king would travel on a chessboard to move between two squares.</p>
</li>
<li><strong>Hamming Distance (for categorical data):</strong> Hamming distance measures the number of positions at which the corresponding symbols are different between two strings of equal length.
<p><strong>Formula:</strong></p>
$$d(\vec{p},\vec{q}) = \sum_{i=1}^n \delta(p_i \neq q_i)$$
It's commonly used for categorical variables or binary data.
</li>
<li><strong>Cosine Similarity: </strong> The Cosine Similarity is a metric used to measure the similarity between two vectors in a multidimensional space. It calculates the cosine of the angle between the two vectors, providing a measure of how closely the vectors align in direction, irrespective of their magnitude. The Cosine Similarity ranges from -1 to 1.
<p><strong>Formula:</strong></p>
$$\cos(\theta) = \frac{\mathbf{a} \cdot \mathbf{b}}{\|\mathbf{a}\| \cdot \|\mathbf{b}\|}$$
where:
<ul>
<li>\(\mathbf{a} \cdot \mathbf{b}\) represents the dot product of the two vectors.</li>
<li>\(\|\mathbf{a}\|\) and \(\|\mathbf{b}\|\) represent the magnitudes (or norms) of the vectors \(\mathbf{a}\) and \(\mathbf{b}\), respectively.</li>
</ul>
<p>The Cosine Similarity is widely used in text analysis, document clustering, recommendation systems, and many other applications where vector representations are used to measure similarity between entities.</p>
</li>
<li><strong>Correlation Distance: </strong>Correlation distance is a measure of dissimilarity between two vectors that takes into account the correlation between their elements. It quantifies how much two vectors deviate from being perfectly positively correlated (correlation coefficient of 1) or perfectly negatively correlated (correlation coefficient of -1). The correlation distance ranges from 0 to 2, where:
<ul>
<li>If the correlation coefficient is 1 (perfect positive correlation), the correlation distance is 0.</li>
<li>If the correlation coefficient is -1 (perfect negative correlation), the correlation distance is 2.</li>
</ul>
<p><strong>Formula:</strong></p> The formula for calculating the correlation distance between two vectors \(\vec{x}\) and \(\vec{y}\) of length \(n\) is given by
$$\text{Correlation Distance}(\vec{x}, \vec{y}) = 1- \text{Correlation Coefficient}(\vec{x}, \vec{y})$$
where:
<ul>
<li>The correlation coefficient Correlation Coefficient(\(x\),\(y\)) measures the strength and direction of the linear relationship between the elements of vectors \(x\) and \(y\).</li>
<li>The correlation distance is calculated as 1 minus the correlation coefficient to convert the correlation coefficient into a distance metric.</li>
</ul>
</li>
</ol>
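<p>For reference, these metrics can be written compactly with NumPy. The sketch below uses illustrative function names; in practice the same metrics are available in <code>scipy.spatial.distance</code> and through the <code>metric</code> parameter of scikit-learn's KNN estimators:</p>
<pre><code class="language-python">
import numpy as np

def euclidean(p, q):
    return np.sqrt(np.sum((p - q) ** 2))

def manhattan(p, q):
    return np.sum(np.abs(p - q))

def minkowski(p, q, r=2):
    # r=1 gives Manhattan distance, r=2 gives Euclidean distance
    return np.sum(np.abs(p - q) ** r) ** (1 / r)

def chebyshev(p, q):
    return np.max(np.abs(p - q))

def hamming(p, q):
    # Number of positions where the entries differ (categorical/binary data)
    return np.sum(p != q)

def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def correlation_distance(x, y):
    # 1 minus the Pearson correlation coefficient
    return 1 - np.corrcoef(x, y)[0, 1]

p, q = np.array([1.0, 2.0, 3.0]), np.array([4.0, 6.0, 8.0])
print(euclidean(p, q), manhattan(p, q), chebyshev(p, q))
</code></pre>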
<!---------------------------------->
<h3 id="when">When Do We Use the KNN Algorithm?</h3>
<p>The K-Nearest Neighbors (KNN) algorithm is used in various scenarios where its characteristics align well with the requirements of the problem. Here are some common situations where KNN is often used:</p>
<ul>
<li>Data is labeled</li>
<li>Data is noise-free</li>
<li>Dataset is small, as KNN is a lazy learner</li>
</ul>
<!--------------------------------->
<h3 id="steps-to-follow">Steps to Effective K-Nearest Neighbors (KNN) Algorithm Implementation</h3>
The working of K-NN can be explained with the following steps:
<ul>
<li><strong>Step-1: (Data processing): </strong> Clean and preprocess your dataset to handle missing values, outliers, and irrelevant features. Normalize or standardize the numerical features to ensure that all features contribute equally to the distance calculations.</li>
<pre><code class="language-python">
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
# Assume X_train and X_test are your feature matrices
# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)
# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)
</code></pre>
<li><strong>Step-2: (Split Data): </strong> Split your dataset into training and testing sets. The training set will be used to train the KNN model, while the testing set will be used to evaluate its performance.</li>
<pre><code class="language-python">
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
</code></pre>
<li><strong>Step-3: (Choose \(K\)): </strong>Decide on the value of \(K\) based on cross-validation or experimentation to find the optimal value that minimizes error on the validation set.</li>
<pre><code class="language-python">
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Define KNN classifier
knn = KNeighborsClassifier()
# Define parameter grid
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
# Perform grid search
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
# Get the best value of K
best_k = grid_search.best_params_['n_neighbors']
</code></pre>
<li><strong>Step-4 (Select Distance Metric): </strong> Choose an appropriate distance metric (e.g., Euclidean, Manhattan, Cosine) based on the nature of your data and problem domain. Different distance metrics may perform differently on different types of data.</li>
<pre><code class="language-python">
# By default, KNeighborsClassifier uses Euclidean distance
# You can specify different distance metrics using the 'metric' parameter
knn = KNeighborsClassifier(metric='manhattan')
</code></pre>
<li><strong>Step-5 (Train Model): </strong> Train the KNN model using the training dataset. The model memorizes the training data and does not require an explicit training step.</li>
<pre><code class="language-python">
# Using the best value of K obtained from grid search
knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train_scaled, y_train)
</code></pre>
<li><strong>Step-6 (Evaluate the Model): </strong> Use the testing dataset to evaluate the performance of the trained KNN model. Calculate evaluation metrics such as accuracy, precision, recall, F1-score, or others depending on the specific problem.</li>
<pre><code class="language-python">
from sklearn.metrics import accuracy_score
# Make predictions on the testing set
y_pred = knn.predict(X_test_scaled)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
</code></pre>
<li><strong>Step-7 (Tune Hyperparameters): </strong> Fine-tune the hyperparameters of the KNN algorithm, such as the value of \(K\) and the choice of distance metric, based on the evaluation results. Perform grid search or randomized search to find the best hyperparameters.</li>
<pre><code class="language-python">
# Use grid search or randomized search to tune hyperparameters
# An example using grid search is shown in Step 3
</code></pre>
<li><strong>Step-8 (Cross-Validation): </strong> Implement cross-validation techniques such as k-fold cross-validation to ensure the robustness of the model and to assess its generalization performance on unseen data.</li>
<pre><code class="language-python">
from sklearn.model_selection import cross_val_score
# Perform k-fold cross-validation
cv_scores = cross_val_score(knn, X_train_scaled, y_train, cv=5)
</code></pre>
<li><strong>Step-9 (Feature Selection): </strong> Conduct feature selection or dimensionality reduction techniques if necessary to improve the model's performance and reduce computational complexity.</li>
<pre><code class="language-python">
# Use feature selection techniques such as SelectKBest or PCA
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import PCA
# Example using SelectKBest
selector = SelectKBest(k=5)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)
# Example using PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
</code></pre>
<li><strong>Step-10 (Optimize Performance): </strong> Optimize the performance of the algorithm by using efficient data structures such as KD-trees or Ball trees for fast nearest neighbor search, especially for large datasets.</li>
<pre><code class="language-python">
# Use efficient data structures for nearest neighbor search
# By default, KNeighborsClassifier uses a brute-force algorithm
# For large datasets, you can use KD-trees or Ball trees for faster search
knn = KNeighborsClassifier(algorithm='kd_tree')
</code></pre>
</ul>
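<p>The preprocessing, scaling, and model-selection steps above can also be chained into a single scikit-learn <code>Pipeline</code>. The following is a minimal sketch, assuming a generic feature matrix <code>X</code> and label vector <code>y</code>; the advantage of the pipeline is that the scaler is re-fitted on the training folds only during cross-validation, which avoids data leakage:</p>
<pre><code class="language-python">
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Assume X and y are already loaded
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling and classification in one estimator
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

# Tune the number of neighbours and the distance metric together
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11],
    'knn__metric': ['euclidean', 'manhattan']
}
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

print(grid.best_params_)
print(grid.score(X_test, y_test))
</code></pre>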
<!--------------------------------->
<h3 id="what-k">What value should you choose for k in k-nearest neighbours?</h3>
<p>Choosing the correct value for k in the k-Nearest Neighbours (KNN) algorithm is a critical step, as it can significantly impact the model’s performance. Selecting an appropriate k value involves finding a balance between bias and variance. Here are a few approaches you can use to choose the optimal k value:</p>
<ul>
<li><strong>Cross-Validation: </strong>Split your dataset into multiple folds (e.g., using k-fold cross-validation). For each fold, train the KNN model with different k values and evaluate its performance on the validation set. Calculate the average performance metric (e.g., accuracy) for each k value across all folds. Choose the k value that results in the best average performance.</li>
<li><strong>Odd vs. Even \(K\) values:</strong> Choose odd \(K\) values to avoid ties when classifying data points with an equal number of nearest neighbours from different classes. Using an odd k value prevents indecisiveness in classification.</li>
<li><strong>Elbow Method: </strong>Plot the performance metric (e.g., accuracy) as a function of different k values. Look for the point on the plot where the performance stabilizes or starts to decrease. This point resembles an “elbow.” This method helps you identify a value k that offers a good trade-off between bias and variance.</li>
<li><strong>Grid Search: </strong>Perform a grid search over a predefined range of k values. Train and evaluate the model for each k value in the range. Choose the k value that gives the best performance on a validation set.</li>
<li><strong>Domain Knowledge: </strong> Sometimes, domain knowledge can help you make an informed decision about the k value. For example, if you know that the problem is expected to have specific characteristics, you can choose a k value accordingly.</li>
<li><strong>Use Case-Specific Testing: </strong>Experiment with different k values and assess the model’s performance on a separate test dataset that wasn’t used during training. This approach helps you directly observe how different k values affect real-world predictions.</li>
</ul>
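<p>A simple way to combine the cross-validation and elbow ideas above is to score the classifier over a range of \(K\) values and plot the mean accuracy. A minimal sketch, assuming the scaled training data <code>X_train_scaled</code> and labels <code>y_train</code> from the implementation steps:</p>
<pre><code class="language-python">
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

k_values = range(1, 31, 2)  # odd values to avoid ties
scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k),
                          X_train_scaled, y_train, cv=5).mean()
          for k in k_values]

plt.plot(list(k_values), scores, marker='o')
plt.xlabel('K (number of neighbours)')
plt.ylabel('Mean cross-validated accuracy')
plt.show()

best_k = k_values[scores.index(max(scores))]
print(best_k)
</code></pre>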
<!---------------------------------->
<h3 id="why-knn">Why Do We Need the KNN Algorithm?</h3>
The K-Nearest Neighbors (KNN) algorithm is valuable in data science and analytics for several reasons:
<ul>
<li><strong>Flexibility in Data Patterns:</strong> KNN is effective in recognizing patterns in data that might not follow a linear or parametric model. It can handle complex relationships between features and target classes, making it suitable for a wide range of datasets.</li>
<li><strong>Non-parametric Approach:</strong> Unlike some other algorithms that make assumptions about the underlying data distribution, KNN is non-parametric. It doesn't require the data to be normally distributed or have a specific shape, making it versatile and applicable in various domains.</li>
<li><strong>Handling Multi-Class Classification:</strong> KNN can handle multi-class classification problems with ease. By considering the majority class among the nearest neighbors, it can assign a class label to a new data point based on the distribution of classes in its vicinity.</li>
<li><strong>Adaptability to New Data:</strong> KNN is well-suited for online learning scenarios where new data points need to be incorporated into the existing model without retraining. Since KNN doesn't have a training phase and simply memorizes the training data, it can quickly adapt to changes in the dataset.</li>
<li><strong>Interpretability:</strong> KNN predictions are intuitive and easy to interpret. The class assigned to a new data point is based on the classes of its nearest neighbors, providing insights into why a particular prediction was made.</li>
<li><strong>Robustness to Noisy Data:</strong> KNN can handle noisy data and outliers reasonably well. Since it relies on the majority class among the nearest neighbors, outliers are less likely to significantly impact the classification results compared to some other algorithms.</li>
<li><strong>Simple Implementation:</strong> KNN is relatively easy to implement and understand, making it accessible to beginners in machine learning and pattern recognition. Its simplicity also allows for quick experimentation and prototyping.</li>
</ul>
<!---------------------------------->
<h3 id="pros">Pros of using KNN</h3>
<ul>
<li><strong>Simplicity:</strong> KNN is easy to understand and implement, making it accessible to beginners in machine learning and data science.</li>
<li><strong>No Training Phase:</strong> KNN is a lazy learning algorithm, meaning it doesn't require a training phase. Instead, it memorizes the entire training dataset, making it efficient for online learning scenarios where new data points need to be incorporated without retraining.</li>
<li><strong>Versatility:</strong> KNN can be applied to both classification and regression tasks. It can handle complex relationships between features and target variables, making it suitable for a wide range of datasets.</li>
<li><strong>Non-parametric:</strong> KNN makes no assumptions about the underlying data distribution, making it robust to different types of data. It can capture complex patterns in the data without relying on predefined models.</li>
<li><strong>Interpretability:</strong> Predictions made by KNN are intuitive and easy to interpret. The class label or regression value assigned to a new data point is based on the majority class or average value of its nearest neighbors.</li>
</ul>
<!-------------------------------------->
<h3 id="cons">Cons of Using KNN</h3>
<ol>
<li><strong>Computational Complexity:</strong> As the size of the training dataset grows, the computational cost of KNN increases significantly. Calculating distances between the new data point and all existing data points in the training set can be time-consuming for large datasets.</li>
<li><strong>Memory Usage:</strong> Since KNN memorizes the entire training dataset, it requires storing all training instances in memory. This can be problematic for datasets with a large number of features or a high-dimensional feature space, leading to high memory usage.</li>
<li><strong>Prediction Time:</strong> KNN incurs a high prediction time during inference. For each new data point, KNN needs to calculate distances to all training instances and determine the nearest neighbors, which can be slow for real-time or latency-sensitive applications.</li>
<li><strong>Sensitivity to Irrelevant Features:</strong> KNN considers all features equally when calculating distances between data points. Irrelevant or noisy features can negatively impact the algorithm's performance and lead to suboptimal results.</li>
<li><strong>Need for Optimal K Value:</strong> The choice of the hyperparameter K significantly influences the performance of KNN. Selecting an inappropriate value of K can lead to overfitting or underfitting of the model, requiring careful tuning and validation.</li>
<li><strong>Curse of Dimensionality:</strong> KNN's performance can degrade in high-dimensional feature spaces due to the curse of dimensionality. As the number of dimensions increases, the Euclidean distance between data points becomes less meaningful, making it challenging to define nearest neighbors accurately.</li>
</ol>
<!-------------------------------->
<h2 id="example">Example with the dataset on 'user_data.csv'</h2>
<p>We again consider the 'user_data.csv' dataset used for the <a href="naive-byes.html" target="_blank">Naive-Bayes algorithm</a> (for the full code, see the 'Project-2.3-KNN-classification.ipynb' notebook in the <a href="https://github.com/arunp77/Machine-Learning/tree/main/Projects-ML/Reg-models" target="_blank">GitHub repository</a>).</p>
<ul>
<li><strong>Data loading: </strong></li>
<pre><code class="language-python">
# loading the important libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# loading the dataset and creating a dataframe
df_user = pd.read_csv('User_Data.csv')
# Display the DataFrame
print(df_user.head())
</code></pre>
<figure>
<img src="assets/img/machine-ln/classfication-user-table.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<li><strong>Data cleaning, preprocessing, splitting, and scaling: </strong>
<pre><code class="language-python">
# Importing the dataset
X = df_user.iloc[:, [2, 3]].values # features
y = df_user.iloc[:, 4].values # target
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
</code></pre>
</li>
<li><strong>Fitting K-NN classifier to the Training data: </strong>
<p>In the next step, we fit the K-NN classifier to the training data. To do this, we import the <code>KNeighborsClassifier</code> class from <code>sklearn.neighbors</code> and create a <code>classifier</code> object, which takes several parameters (for more details, see the official documentation: <a href="https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html" target="_blank">sklearn.neighbors.KNeighborsClassifier</a>):</p>
<ul>
<li><code>n_neighbors</code>: defines the required neighbors of the algorithm.</li>
<li><code>metric='minkowski'</code>: the default metric; it determines how the distance between points is computed.</li>
<li><code>p=2</code>: It is equivalent to the standard Euclidean metric.</li>
</ul>
<pre><code class="language-python">
#Fitting K-NN classifier to the training set
from sklearn.neighbors import KNeighborsClassifier
classifier= KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
</code></pre>
</li>
<li><strong>Predicting the Test Result: </strong>
<pre><code class="language-python">
#Predicting the test set result
y_pred = classifier.predict(X_test)
</code></pre>
</li>
<li><strong>Creating the Confusion Matrix:</strong> Now we will create the <a href="https://arunp77.github.io/logistic-regression.html#con-mat" target="_blank">Confusion matrix</a> for our K-NN model to see the accuracy of the classifier. Below is the code for it:
<pre><code class="language-python">
#Creating the Confusion matrix
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test, y_pred)
cm
</code></pre>
which returns a NumPy array:
<pre>
array([[64,  4],
       [ 3, 29]], dtype=int64)
</pre>
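<p>The confusion matrix can be summarised with the usual metrics from <code>sklearn.metrics</code> (a quick check, not part of the original notebook): with 64 + 29 correct predictions out of 100 test points, the accuracy is 0.93.</p>
<pre><code class="language-python">
from sklearn.metrics import accuracy_score, classification_report
print(accuracy_score(y_test, y_pred))        # 0.93 for the matrix above
print(classification_report(y_test, y_pred)) # precision, recall and F1 per class
</code></pre>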
</li>
<li><strong>Visualizing the Training set result: </strong>
Now we will visualize the training and test dataset results for the K-NN model.
<pre><code class="language-python">
# Importing libraries
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Set up the figure with two subplots in one row and two columns
fig, axes = plt.subplots(1, 2, figsize=(12, 6))
# Visualizing the training set result
x_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
axes[0].contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(['#87CEEB', '#90EE90']))
axes[0].set_xlim(X1.min(), X1.max())
axes[0].set_ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
axes[0].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(['#0000FF', '#2ca02c'])(i), label=j)
axes[0].set_title('K-NN Algorithm (Training set)')
axes[0].set_xlabel('Age')
axes[0].set_ylabel('Estimated Salary')
axes[0].legend()
# Visualizing the test set result
x_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start=x_set[:, 0].min() - 1, stop=x_set[:, 0].max() + 1, step=0.01),
np.arange(start=x_set[:, 1].min() - 1, stop=x_set[:, 1].max() + 1, step=0.01))
axes[1].contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
alpha=0.75, cmap=ListedColormap(['#87CEEB', '#90EE90']))
axes[1].set_xlim(X1.min(), X1.max())
axes[1].set_ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
axes[1].scatter(x_set[y_set == j, 0], x_set[y_set == j, 1],
c=ListedColormap(['#0000FF', '#2ca02c'])(i), label=j)
axes[1].set_title('K-NN Algorithm (Test set)')
axes[1].set_xlabel('Age')
axes[1].set_ylabel('Estimated Salary')
axes[1].legend()
plt.tight_layout()
plt.show()
</code></pre>
<figure>
<img src="assets/img/machine-ln/classfication-knn-plot.png" alt="" style="max-width: 90%; max-height: auto;">
<figcaption style="text-align: center;"></figcaption>
</figure>
<p>The output graph is different from the graph we obtained with Logistic Regression (for details, see <a href="naive-byes.html">Naive-Bayes classification</a>).</p>
</li>
<p><strong>Explanation for training dataset:</strong></p>
<ul>
<li>The graph shows blue and green points: the green points correspond to users who purchased the SUV (1) and the blue points to users who did not (0).</li>
<li>The decision boundary is irregular rather than a straight line or smooth curve because K-NN classifies each region according to its nearest neighbors.</li>
<li>The graph has classified users into the correct categories: most of the users who didn't buy the SUV lie in the blue region, and most who bought it lie in the green region.</li>
<li>The result is good, but there are still a few green points in the blue region and a few blue points in the green region. This small amount of misclassification is acceptable and helps prevent the model from overfitting.</li>
<li>Hence our model is well trained.</li>
</ul>
<p><strong>Explanation for the test dataset:</strong></p>
<ul>
<li>The right-hand panel shows the result for the test dataset.</li>
<li>As we can see in the graph, the predictions are quite good: most of the blue points lie in the light-blue region and most of the green points lie in the light-green region.</li>
<li>However, a few green points fall in the light-blue region and a few blue points in the light-green region. These are the misclassified observations that appear in the confusion matrix.</li>
</ul>
</ul>
</section>
<!----------- Reference ----------->
<section id="reference">
<h2>References</h2>
<ul>
<li><a href="https://arunp77.github.io/logistic-regression.html#con-mat" target="_blank">Confusion matrix details</a>.</li>
<li>My GitHub repository on <a href="https://github.com/arunp77/Machine-Learning/" target="_blank">Machine Learning</a></li>
<li><a href="https://mlu-explain.github.io/linear-regression/" target="_blank">A Visual Introduction To Linear regression</a> (Best reference for theory and visualization).</li>
<li>Book on Regression model: <a href="https://avehtari.github.io/ROS-Examples/" target="_blank">Regression and Other Stories</a></li>
<li>Book on Statistics: <a href="https://hastie.su.domains/Papers/ESLII.pdf" target="_blank">The Elements of Statistical Learning</a></li>
<li><a href="https://www.javatpoint.com/machine-learning-naive-bayes-classifier" target="_blank">Naïve Bayes Classifier Algorithm, JAVAPoint.com</a></li>
<li><a href="https://www.colorado.edu/amath/sites/default/files/attached-files/ch12_0.pdf">https://www.colorado.edu/amath/sites/default/files/attached-files/ch12_0.pdf</a></li>
<li><a href="https://datahacker.rs/002-machine-learning-linear-regression-model/" target="_blank">One of the best description on Linear regression</a>.</li>
</ul>
</section>
<hr>
<div style="background-color: #f0f0f0; padding: 15px; border-radius: 5px;">
<h3>Some other interesting things to know:</h3>
<ul style="list-style-type: disc; margin-left: 30px;">
<li>Visit my website on <a href="sql-project.html">For Data, Big Data, Data-modeling, Datawarehouse, SQL, cloud-compute.</a></li>
<li>Visit my website on <a href="Data-engineering.html">Data engineering</a></li>
</ul>
</div>
<p></p>
<div class="navigation">
<a href="index.html#portfolio" class="clickable-box">
<span class="arrow-left">Portfolio section</span>
</a>
<a href="machine-learning.html" class="clickable-box">
<span class="arrow-right">Content</span>
</a>
</div>
</div>
</div>
</section><!-- End Portfolio Details Section -->
</main><!-- End #main -->
<!-- ======= Footer ======= -->
<footer id="footer">
<div class="container">
<div class="copyright">
© Copyright <strong><span>Arun</span></strong>
</div>
</div>
</footer><!-- End Footer -->
<a href="#" class="back-to-top d-flex align-items-center justify-content-center"><i class="bi bi-arrow-up-short"></i></a>
<!-- Vendor JS Files -->
<script src="assets/vendor/purecounter/purecounter_vanilla.js"></script>
<script src="assets/vendor/aos/aos.js"></script>
<script src="assets/vendor/bootstrap/js/bootstrap.bundle.min.js"></script>
<script src="assets/vendor/glightbox/js/glightbox.min.js"></script>
<script src="assets/vendor/isotope-layout/isotope.pkgd.min.js"></script>
<script src="assets/vendor/swiper/swiper-bundle.min.js"></script>
<script src="assets/vendor/typed.js/typed.umd.js"></script>
<script src="assets/vendor/waypoints/noframework.waypoints.js"></script>
<script src="assets/vendor/php-email-form/validate.js"></script>
<!-- Template Main JS File -->
<script src="assets/js/main.js"></script>
<script>
document.addEventListener("DOMContentLoaded", function () {
Prism.highlightAll();
});
</script>
</body>
</html>