{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Salary_ML_Project.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LXy6hmTOhed1"
      },
      "source": [
        "Setting up Colab"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PiiiSef_6tle",
        "outputId": "7fedf5f1-1b59-441c-d6ff-a32fd9987468"
      },
      "source": [
        "from google.colab import drive\n",
        "\n",
        "drive.mount('/content/drive', force_remount = True)"
      ],
      "execution_count": 132,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Mounted at /content/drive\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NIraf5LmHxm-"
      },
      "source": [
        "### Read Dataset"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 307
        },
        "id": "6YndzY3swBQj",
        "outputId": "a342b5b8-cc12-4979-ec15-4a09000d3151"
      },
      "source": [
        "import pandas as pd\n",
        "df = pd.read_csv('/content/drive/MyDrive/ML Salary Project /Data/project-final.csv')\n",
        "df.head()"
      ],
      "execution_count": 133,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>SalaryUSD</th>\n",
              "      <th>Country</th>\n",
              "      <th>PrimaryDatabase</th>\n",
              "      <th>YearsWithThisDatabase</th>\n",
              "      <th>EmploymentStatus</th>\n",
              "      <th>JobTitle</th>\n",
              "      <th>ManageStaff</th>\n",
              "      <th>YearsWithThisTypeOfJob</th>\n",
              "      <th>HowManyCompanies</th>\n",
              "      <th>OtherPeopleOnYourTeam</th>\n",
              "      <th>EmploymentSector</th>\n",
              "      <th>CareerPlansThisYear</th>\n",
              "      <th>Gender</th>\n",
              "      <th>DatabaseServers</th>\n",
              "      <th>Education</th>\n",
              "      <th>Certifications</th>\n",
              "      <th>HoursWorkedPerWeek</th>\n",
              "      <th>TelecommuteDaysPerWeek</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>Low</td>\n",
              "      <td>Sweden</td>\n",
              "      <td>M</td>\n",
              "      <td>4</td>\n",
              "      <td>E</td>\n",
              "      <td>D</td>\n",
              "      <td>Y</td>\n",
              "      <td>4</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0</td>\n",
              "      <td>PB</td>\n",
              "      <td>S</td>\n",
              "      <td>M</td>\n",
              "      <td>373.411662</td>\n",
              "      <td>'Bachelors (4 years)'</td>\n",
              "      <td>N</td>\n",
              "      <td>43.16509</td>\n",
              "      <td>zero</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>High</td>\n",
              "      <td>USA</td>\n",
              "      <td>M</td>\n",
              "      <td>15</td>\n",
              "      <td>E</td>\n",
              "      <td>DBAP</td>\n",
              "      <td>N</td>\n",
              "      <td>25</td>\n",
              "      <td>5.0</td>\n",
              "      <td>0</td>\n",
              "      <td>PB</td>\n",
              "      <td>S</td>\n",
              "      <td>M</td>\n",
              "      <td>373.411662</td>\n",
              "      <td>'Bachelors (4 years)'</td>\n",
              "      <td>N</td>\n",
              "      <td>43.16509</td>\n",
              "      <td>zero</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>High-Mid</td>\n",
              "      <td>USA</td>\n",
              "      <td>M</td>\n",
              "      <td>12</td>\n",
              "      <td>E</td>\n",
              "      <td>DBAG</td>\n",
              "      <td>Y</td>\n",
              "      <td>6</td>\n",
              "      <td>4.0</td>\n",
              "      <td>1</td>\n",
              "      <td>PB</td>\n",
              "      <td>S</td>\n",
              "      <td>M</td>\n",
              "      <td>373.411662</td>\n",
              "      <td>'Bachelors (4 years)'</td>\n",
              "      <td>N</td>\n",
              "      <td>43.16509</td>\n",
              "      <td>zero</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>Low</td>\n",
              "      <td>UK</td>\n",
              "      <td>M</td>\n",
              "      <td>10</td>\n",
              "      <td>E</td>\n",
              "      <td>DBAP</td>\n",
              "      <td>N</td>\n",
              "      <td>5</td>\n",
              "      <td>2.0</td>\n",
              "      <td>0</td>\n",
              "      <td>E</td>\n",
              "      <td>S</td>\n",
              "      <td>M</td>\n",
              "      <td>373.411662</td>\n",
              "      <td>'Bachelors (4 years)'</td>\n",
              "      <td>N</td>\n",
              "      <td>43.16509</td>\n",
              "      <td>zero</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>High-Mid</td>\n",
              "      <td>USA</td>\n",
              "      <td>M</td>\n",
              "      <td>5</td>\n",
              "      <td>E</td>\n",
              "      <td>D</td>\n",
              "      <td>N</td>\n",
              "      <td>5</td>\n",
              "      <td>1.0</td>\n",
              "      <td>0</td>\n",
              "      <td>PB</td>\n",
              "      <td>S</td>\n",
              "      <td>M</td>\n",
              "      <td>373.411662</td>\n",
              "      <td>'Bachelors (4 years)'</td>\n",
              "      <td>N</td>\n",
              "      <td>43.16509</td>\n",
              "      <td>zero</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "  SalaryUSD Country  ... HoursWorkedPerWeek  TelecommuteDaysPerWeek\n",
              "0       Low  Sweden  ...           43.16509                    zero\n",
              "1      High     USA  ...           43.16509                    zero\n",
              "2  High-Mid     USA  ...           43.16509                    zero\n",
              "3       Low      UK  ...           43.16509                    zero\n",
              "4  High-Mid     USA  ...           43.16509                    zero\n",
              "\n",
              "[5 rows x 18 columns]"
            ]
          },
          "metadata": {},
          "execution_count": 133
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GnlW4u1wxi7l"
      },
      "source": [
        "categorical_attributes = [\"SalaryUSD\", \"Country\", \"PrimaryDatabase\", \"EmploymentStatus\", \"JobTitle\", \"ManageStaff\", \"EmploymentSector\", \"CareerPlansThisYear\", \"Gender\", \"Education\", \"Certifications\", \"TelecommuteDaysPerWeek\"]\n",
        "num_attributes = [name for name in list(df.columns) if name not in categorical_attributes]"
      ],
      "execution_count": 134,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "j-V2E0q-zagM"
      },
      "source": [
        "from sklearn.pipeline import Pipeline\n",
        "from sklearn.preprocessing import OrdinalEncoder\n",
        "from sklearn.compose import ColumnTransformer\n",
        "from sklearn.preprocessing import MinMaxScaler\n",
        "\n",
        "#normalizes numerical attributes\n",
        "num_pipeline = Pipeline([('min_max_scaler', MinMaxScaler())]) \n",
        "\n",
        "#converts categories into numbers\n",
        "cat_pipeline = Pipeline([\n",
        "        ('number_converter', OrdinalEncoder()),\n",
        "    ])\n",
        "\n",
        "#combining both pipelines\n",
        "full_pipeline = ColumnTransformer([\n",
        "        (\"cat\", cat_pipeline, categorical_attributes),\n",
        "        (\"num\", num_pipeline, num_attributes)\n",
        "        ]) \n",
        "\n",
        "# fit_transform calculates the standard deviation of the whole training set\n",
        "df_prep = full_pipeline.fit_transform(df) \n",
        "df_prep = pd.DataFrame(df_prep, columns=categorical_attributes+num_attributes)\n",
        "df_prep.head()\n",
        "\n",
        "# Split data to X and Y\n",
        "X = df_prep.drop(['SalaryUSD'], axis=1, inplace=False)\n",
        "Y = df_prep['SalaryUSD']"
      ],
      "execution_count": 135,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AZCzlB6KcNgd"
      },
      "source": [
        "### Attribute Selection Algorithim\n",
        "\n",
        "---\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 417
        },
        "id": "TqM0BzEp0Vu1",
        "outputId": "8914f90a-0885-43a2-b40d-f2ef7807678d"
      },
      "source": [
        "from sklearn.feature_selection import SelectKBest, SelectFromModel\n",
        "from sklearn.feature_selection import chi2, RFE, VarianceThreshold\n",
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.decomposition import PCA\n",
        "\n",
        "selector = None\n",
        "\n",
        "# selector = VarianceThreshold(threshold=(.1))\n",
        "# selector = SelectKBest(chi2, k=10)\n",
        "# selector = RFE(estimator=RandomForestClassifier())\n",
        "selector = PCA(n_components=4)\n",
        "\n",
        "selector.fit(X, Y)\n",
        "\n",
        "X_selected = X if selector == None else pd.DataFrame(selector.transform(X))\n",
        "\n",
        "if type(selector) != PCA:\n",
        "  features = selector.get_support(indices=True)\n",
        "  features = [column for column in X.columns[features]]\n",
        "  X_selected.columns = features\n",
        "\n",
        "\n",
        "# X_selected = X\n",
        "\n",
        "X = X_selected\n",
        "\n",
        "X_selected"
      ],
      "execution_count": 136,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>1</th>\n",
              "      <th>2</th>\n",
              "      <th>3</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>-4.138638</td>\n",
              "      <td>-4.141160</td>\n",
              "      <td>-1.060148</td>\n",
              "      <td>-0.344151</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>-12.162499</td>\n",
              "      <td>1.815431</td>\n",
              "      <td>-0.994463</td>\n",
              "      <td>-0.261570</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>-12.157918</td>\n",
              "      <td>0.826832</td>\n",
              "      <td>-1.003102</td>\n",
              "      <td>-0.267811</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>-11.176152</td>\n",
              "      <td>1.820139</td>\n",
              "      <td>-1.147436</td>\n",
              "      <td>0.215920</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>-12.138983</td>\n",
              "      <td>-4.182848</td>\n",
              "      <td>-1.095431</td>\n",
              "      <td>-0.348160</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10334</th>\n",
              "      <td>-11.146052</td>\n",
              "      <td>-2.188204</td>\n",
              "      <td>-0.909104</td>\n",
              "      <td>-0.143865</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10335</th>\n",
              "      <td>3.841024</td>\n",
              "      <td>-2.151027</td>\n",
              "      <td>1.931727</td>\n",
              "      <td>-0.344638</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10336</th>\n",
              "      <td>17.828090</td>\n",
              "      <td>0.889711</td>\n",
              "      <td>2.020452</td>\n",
              "      <td>-0.317144</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10337</th>\n",
              "      <td>-12.161796</td>\n",
              "      <td>-2.293752</td>\n",
              "      <td>4.563015</td>\n",
              "      <td>0.136997</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>10338</th>\n",
              "      <td>8.880702</td>\n",
              "      <td>-8.122048</td>\n",
              "      <td>-0.633003</td>\n",
              "      <td>-0.221737</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>10339 rows × 4 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "               0         1         2         3\n",
              "0      -4.138638 -4.141160 -1.060148 -0.344151\n",
              "1     -12.162499  1.815431 -0.994463 -0.261570\n",
              "2     -12.157918  0.826832 -1.003102 -0.267811\n",
              "3     -11.176152  1.820139 -1.147436  0.215920\n",
              "4     -12.138983 -4.182848 -1.095431 -0.348160\n",
              "...          ...       ...       ...       ...\n",
              "10334 -11.146052 -2.188204 -0.909104 -0.143865\n",
              "10335   3.841024 -2.151027  1.931727 -0.344638\n",
              "10336  17.828090  0.889711  2.020452 -0.317144\n",
              "10337 -12.161796 -2.293752  4.563015  0.136997\n",
              "10338   8.880702 -8.122048 -0.633003 -0.221737\n",
              "\n",
              "[10339 rows x 4 columns]"
            ]
          },
          "metadata": {},
          "execution_count": 136
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "grd5N391HlHf"
      },
      "source": [
        "### Split into training and Testing "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ik3lh2A4zQme"
      },
      "source": [
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
        "X_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-train.csv')\n",
        "X_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-test.csv')\n",
        "Y_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-train.csv')\n",
        "Y_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-test.csv')"
      ],
      "execution_count": 137,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2Wthf8O2JAGW"
      },
      "source": [
        "### Trying Different Classifiers"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "v5um6ThxKPUO"
      },
      "source": [
        "from sklearn.metrics import accuracy_score, confusion_matrix\n",
        "def get_train_test_acc(model, X_train, y_train, X_test, y_test):\n",
        "  train_preds = model.predict(X_train)\n",
        "  print(f'Training Accuracy: {accuracy_score(y_train, train_preds)*100}%')\n",
        "  print('Confusion Matrix(training): \\n', confusion_matrix(y_train, train_preds))\n",
        "  test_preds = model.predict(X_test)\n",
        "  print(f'Testing Accuracy: {accuracy_score(y_test, test_preds)*100}%')\n",
        "  print('Confusion Matrix(testing): \\n', confusion_matrix(y_test, test_preds))"
      ],
      "execution_count": 138,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "c-RSkJF5Msg0"
      },
      "source": [
        "###Decision Trees"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ygsKlYkIZICQ"
      },
      "source": [
        "from sklearn.tree import DecisionTreeClassifier\n",
        "clf = DecisionTreeClassifier(criterion=\"entropy\", random_state=4, max_depth = 8, min_samples_split=7)\n",
        "clf = clf.fit(X_train, Y_train)"
      ],
      "execution_count": 139,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ITsJedWSZisU",
        "outputId": "8986a4c4-b8ea-4e45-d77f-013bb93708ad"
      },
      "source": [
        "y_pred = clf.predict(X_train)\n",
        "get_train_test_acc(clf, X_train, Y_train, X_test, Y_test)"
      ],
      "execution_count": 140,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Training Accuracy: 51.23926973763753%\n",
            "Confusion Matrix(training): \n",
            " [[ 917 1008   80   98]\n",
            " [ 417 1344  125  145]\n",
            " [  43  306 1450  259]\n",
            " [ 199  934  419  527]]\n",
            "Testing Accuracy: 45.84139264990329%\n",
            "Confusion Matrix(testing): \n",
            " [[184 253  26  19]\n",
            " [123 321  34  44]\n",
            " [ 18  92 343  74]\n",
            " [ 61 258 118 100]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "WfnkIwWUoTRw"
      },
      "source": [
        "### Random **Forest**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "yvFL2YFxoWKD",
        "outputId": "e6e84b16-f7c7-4568-cb78-0c347c92ebe4"
      },
      "source": [
        "from sklearn.ensemble import RandomForestClassifier\n",
        "from sklearn.metrics import accuracy_score, confusion_matrix\n",
        "rfc = RandomForestClassifier(max_depth = 12)\n",
        "rfc.fit(X_train, Y_train)"
      ],
      "execution_count": 141,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
              "                       criterion='gini', max_depth=12, max_features='auto',\n",
              "                       max_leaf_nodes=None, max_samples=None,\n",
              "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
              "                       min_samples_leaf=1, min_samples_split=2,\n",
              "                       min_weight_fraction_leaf=0.0, n_estimators=100,\n",
              "                       n_jobs=None, oob_score=False, random_state=None,\n",
              "                       verbose=0, warm_start=False)"
            ]
          },
          "metadata": {},
          "execution_count": 141
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5-8gDS1QouhZ",
        "outputId": "0d6f8b91-a371-4f16-b450-2a6ec1362982"
      },
      "source": [
        "get_train_test_acc(rfc, X_train, Y_train, X_test, Y_test)"
      ],
      "execution_count": 142,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Training Accuracy: 71.38193688792165%\n",
            "Confusion Matrix(training): \n",
            " [[1460  476   93   74]\n",
            " [ 265 1579  103   84]\n",
            " [  43  210 1696  109]\n",
            " [ 172  492  246 1169]]\n",
            "Testing Accuracy: 47.87234042553192%\n",
            "Confusion Matrix(testing): \n",
            " [[237 183  28  34]\n",
            " [161 253  47  61]\n",
            " [ 20  63 366  78]\n",
            " [ 77 191 135 134]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "7leaD--ksHuQ"
      },
      "source": [
        "###Naive Bayes "
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "teGGylJBsX3H",
        "outputId": "28ed7dd4-1cf5-4007-d594-f291294d7a88"
      },
      "source": [
        "from sklearn.naive_bayes import GaussianNB\n",
        "nbc = GaussianNB()\n",
        "nbc.fit(X_train, Y_train)"
      ],
      "execution_count": 143,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "GaussianNB(priors=None, var_smoothing=1e-09)"
            ]
          },
          "metadata": {},
          "execution_count": 143
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "l4WhrBZ4suHP",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "47826a95-0b38-46a2-abfb-0fcdf08c77bf"
      },
      "source": [
        "get_train_test_acc(nbc, X_train, Y_train, X_test, Y_test)"
      ],
      "execution_count": 144,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Training Accuracy: 36.065771974368275%\n",
            "Confusion Matrix(training): \n",
            " [[1045  936  111   11]\n",
            " [ 678 1189  143   21]\n",
            " [ 409  875  725   49]\n",
            " [ 609 1022  424   24]]\n",
            "Testing Accuracy: 35.686653771760156%\n",
            "Confusion Matrix(testing): \n",
            " [[247 212  20   3]\n",
            " [192 290  38   2]\n",
            " [105 212 195  15]\n",
            " [141 274 116   6]]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "jXB9OE_MudDQ"
      },
      "source": [
        "###KNeighbors Classifer"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vYG2FQlGuyaP",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "81394f38-ebac-4d46-89a0-fca92f8a6fd0"
      },
      "source": [
        "from sklearn.neighbors import KNeighborsClassifier\n",
        "kfc = KNeighborsClassifier(algorithm='auto')\n",
        "kfc.fit(X_train, Y_train)"
      ],
      "execution_count": 145,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
              "                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
              "                     weights='uniform')"
            ]
          },
          "metadata": {},
          "execution_count": 145
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MPMgHDGLvFHu",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "38a9d0cf-e939-4620-d964-7eab781fc086"
      },
      "source": [
        "get_train_test_acc(kfc, X_train, Y_train, X_test, Y_test)"
      ],
      "execution_count": 146,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Training Accuracy: 63.075807036634025%\n",
            "Confusion Matrix(training): \n",
            " [[1586  322   73  122]\n",
            " [ 583 1132  101  215]\n",
            " [ 119  166 1515  258]\n",
            " [ 363  405  327  984]]\n",
            "Testing Accuracy: 43.32688588007737%\n",
            "Confusion Matrix(testing): \n",
            " [[266 156  21  39]\n",
            " [228 165  38  91]\n",
            " [ 46  55 325 101]\n",
            " [126 153 118 140]]\n"
          ]
        }
      ]
    }
  ]
}