mirror of
https://github.com/Rushilwiz/ml.git
synced 2025-04-09 15:00:17 -04:00
816 lines
26 KiB
Plaintext
816 lines
26 KiB
Plaintext
{
|
||
"nbformat": 4,
|
||
"nbformat_minor": 0,
|
||
"metadata": {
|
||
"colab": {
|
||
"name": "Salary_ML_Project.ipynb",
|
||
"provenance": [],
|
||
"collapsed_sections": []
|
||
},
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"name": "python"
|
||
}
|
||
},
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "LXy6hmTOhed1"
|
||
},
|
||
"source": [
|
||
"### Setting up Colab"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "PiiiSef_6tle",
|
||
"outputId": "7fedf5f1-1b59-441c-d6ff-a32fd9987468"
|
||
},
|
||
"source": [
|
||
"from google.colab import drive\n",
|
||
"\n",
|
||
"drive.mount('/content/drive', force_remount = True)"
|
||
],
|
||
"execution_count": 132,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Mounted at /content/drive\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "NIraf5LmHxm-"
|
||
},
|
||
"source": [
|
||
"### Read Dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 307
|
||
},
|
||
"id": "6YndzY3swBQj",
|
||
"outputId": "a342b5b8-cc12-4979-ec15-4a09000d3151"
|
||
},
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"df = pd.read_csv('/content/drive/MyDrive/ML Salary Project /Data/project-final.csv')\n",
|
||
"df.head()"
|
||
],
|
||
"execution_count": 133,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>SalaryUSD</th>\n",
|
||
" <th>Country</th>\n",
|
||
" <th>PrimaryDatabase</th>\n",
|
||
" <th>YearsWithThisDatabase</th>\n",
|
||
" <th>EmploymentStatus</th>\n",
|
||
" <th>JobTitle</th>\n",
|
||
" <th>ManageStaff</th>\n",
|
||
" <th>YearsWithThisTypeOfJob</th>\n",
|
||
" <th>HowManyCompanies</th>\n",
|
||
" <th>OtherPeopleOnYourTeam</th>\n",
|
||
" <th>EmploymentSector</th>\n",
|
||
" <th>CareerPlansThisYear</th>\n",
|
||
" <th>Gender</th>\n",
|
||
" <th>DatabaseServers</th>\n",
|
||
" <th>Education</th>\n",
|
||
" <th>Certifications</th>\n",
|
||
" <th>HoursWorkedPerWeek</th>\n",
|
||
" <th>TelecommuteDaysPerWeek</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Low</td>\n",
|
||
" <td>Sweden</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>E</td>\n",
|
||
" <td>D</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>PB</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>373.411662</td>\n",
|
||
" <td>'Bachelors (4 years)'</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>43.16509</td>\n",
|
||
" <td>zero</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>High</td>\n",
|
||
" <td>USA</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>15</td>\n",
|
||
" <td>E</td>\n",
|
||
" <td>DBAP</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>25</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>PB</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>373.411662</td>\n",
|
||
" <td>'Bachelors (4 years)'</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>43.16509</td>\n",
|
||
" <td>zero</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>High-Mid</td>\n",
|
||
" <td>USA</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>E</td>\n",
|
||
" <td>DBAG</td>\n",
|
||
" <td>Y</td>\n",
|
||
" <td>6</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>PB</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>373.411662</td>\n",
|
||
" <td>'Bachelors (4 years)'</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>43.16509</td>\n",
|
||
" <td>zero</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Low</td>\n",
|
||
" <td>UK</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>10</td>\n",
|
||
" <td>E</td>\n",
|
||
" <td>DBAP</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>2.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>E</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>373.411662</td>\n",
|
||
" <td>'Bachelors (4 years)'</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>43.16509</td>\n",
|
||
" <td>zero</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>High-Mid</td>\n",
|
||
" <td>USA</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>E</td>\n",
|
||
" <td>D</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>1.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>PB</td>\n",
|
||
" <td>S</td>\n",
|
||
" <td>M</td>\n",
|
||
" <td>373.411662</td>\n",
|
||
" <td>'Bachelors (4 years)'</td>\n",
|
||
" <td>N</td>\n",
|
||
" <td>43.16509</td>\n",
|
||
" <td>zero</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" SalaryUSD Country ... HoursWorkedPerWeek TelecommuteDaysPerWeek\n",
|
||
"0 Low Sweden ... 43.16509 zero\n",
|
||
"1 High USA ... 43.16509 zero\n",
|
||
"2 High-Mid USA ... 43.16509 zero\n",
|
||
"3 Low UK ... 43.16509 zero\n",
|
||
"4 High-Mid USA ... 43.16509 zero\n",
|
||
"\n",
|
||
"[5 rows x 18 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 133
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "GnlW4u1wxi7l"
|
||
},
|
||
"source": [
|
||
"categorical_attributes = [\"SalaryUSD\", \"Country\", \"PrimaryDatabase\", \"EmploymentStatus\", \"JobTitle\", \"ManageStaff\", \"EmploymentSector\", \"CareerPlansThisYear\", \"Gender\", \"Education\", \"Certifications\", \"TelecommuteDaysPerWeek\"]\n",
|
||
"num_attributes = [name for name in list(df.columns) if name not in categorical_attributes]"
|
||
],
|
||
"execution_count": 134,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "j-V2E0q-zagM"
|
||
},
|
||
"source": [
|
||
"from sklearn.pipeline import Pipeline\n",
|
||
"from sklearn.preprocessing import OrdinalEncoder\n",
|
||
"from sklearn.compose import ColumnTransformer\n",
|
||
"from sklearn.preprocessing import MinMaxScaler\n",
|
||
"\n",
|
||
"# rescales each numerical attribute to the [0, 1] range\n",
|
||
"num_pipeline = Pipeline([('min_max_scaler', MinMaxScaler())]) \n",
|
||
"\n",
|
||
"#converts categories into numbers\n",
|
||
"cat_pipeline = Pipeline([\n",
|
||
" ('number_converter', OrdinalEncoder()),\n",
|
||
" ])\n",
|
||
"\n",
|
||
"#combining both pipelines\n",
|
||
"full_pipeline = ColumnTransformer([\n",
|
||
" (\"cat\", cat_pipeline, categorical_attributes),\n",
|
||
" (\"num\", num_pipeline, num_attributes)\n",
|
||
" ]) \n",
|
||
"\n",
|
||
"# fit_transform learns the category codes and the per-column min/max from the entire dataset (this runs before the train/test split), then applies them\n",
|
||
"df_prep = full_pipeline.fit_transform(df) \n",
|
||
"df_prep = pd.DataFrame(df_prep, columns=categorical_attributes+num_attributes)\n",
|
||
"df_prep.head()\n",
|
||
"\n",
|
||
"# Split data to X and Y\n",
|
||
"X = df_prep.drop(['SalaryUSD'], axis=1, inplace=False)\n",
|
||
"Y = df_prep['SalaryUSD']"
|
||
],
|
||
"execution_count": 135,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "AZCzlB6KcNgd"
|
||
},
|
||
"source": [
|
||
"### Attribute Selection Algorithm\n",
|
||
"\n",
|
||
"---\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/",
|
||
"height": 417
|
||
},
|
||
"id": "TqM0BzEp0Vu1",
|
||
"outputId": "8914f90a-0885-43a2-b40d-f2ef7807678d"
|
||
},
|
||
"source": [
|
||
"from sklearn.feature_selection import SelectKBest, SelectFromModel\n",
|
||
"from sklearn.feature_selection import chi2, RFE, VarianceThreshold\n",
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"from sklearn.decomposition import PCA\n",
|
||
"\n",
|
||
"selector = None\n",
|
||
"\n",
|
||
"# selector = VarianceThreshold(threshold=(.1))\n",
|
||
"# selector = SelectKBest(chi2, k=10)\n",
|
||
"# selector = RFE(estimator=RandomForestClassifier())\n",
|
||
"selector = PCA(n_components=4)\n",
|
||
"\n",
|
||
"selector.fit(X, Y)\n",
|
||
"\n",
|
||
"X_selected = X if selector == None else pd.DataFrame(selector.transform(X))\n",
|
||
"\n",
|
||
"if type(selector) != PCA:\n",
|
||
" features = selector.get_support(indices=True)\n",
|
||
" features = [column for column in X.columns[features]]\n",
|
||
" X_selected.columns = features\n",
|
||
"\n",
|
||
"\n",
|
||
"# X_selected = X\n",
|
||
"\n",
|
||
"X = X_selected\n",
|
||
"\n",
|
||
"X_selected"
|
||
],
|
||
"execution_count": 136,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>0</th>\n",
|
||
" <th>1</th>\n",
|
||
" <th>2</th>\n",
|
||
" <th>3</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>-4.138638</td>\n",
|
||
" <td>-4.141160</td>\n",
|
||
" <td>-1.060148</td>\n",
|
||
" <td>-0.344151</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>-12.162499</td>\n",
|
||
" <td>1.815431</td>\n",
|
||
" <td>-0.994463</td>\n",
|
||
" <td>-0.261570</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>-12.157918</td>\n",
|
||
" <td>0.826832</td>\n",
|
||
" <td>-1.003102</td>\n",
|
||
" <td>-0.267811</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>-11.176152</td>\n",
|
||
" <td>1.820139</td>\n",
|
||
" <td>-1.147436</td>\n",
|
||
" <td>0.215920</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>-12.138983</td>\n",
|
||
" <td>-4.182848</td>\n",
|
||
" <td>-1.095431</td>\n",
|
||
" <td>-0.348160</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10334</th>\n",
|
||
" <td>-11.146052</td>\n",
|
||
" <td>-2.188204</td>\n",
|
||
" <td>-0.909104</td>\n",
|
||
" <td>-0.143865</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10335</th>\n",
|
||
" <td>3.841024</td>\n",
|
||
" <td>-2.151027</td>\n",
|
||
" <td>1.931727</td>\n",
|
||
" <td>-0.344638</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10336</th>\n",
|
||
" <td>17.828090</td>\n",
|
||
" <td>0.889711</td>\n",
|
||
" <td>2.020452</td>\n",
|
||
" <td>-0.317144</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10337</th>\n",
|
||
" <td>-12.161796</td>\n",
|
||
" <td>-2.293752</td>\n",
|
||
" <td>4.563015</td>\n",
|
||
" <td>0.136997</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10338</th>\n",
|
||
" <td>8.880702</td>\n",
|
||
" <td>-8.122048</td>\n",
|
||
" <td>-0.633003</td>\n",
|
||
" <td>-0.221737</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>10339 rows × 4 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" 0 1 2 3\n",
|
||
"0 -4.138638 -4.141160 -1.060148 -0.344151\n",
|
||
"1 -12.162499 1.815431 -0.994463 -0.261570\n",
|
||
"2 -12.157918 0.826832 -1.003102 -0.267811\n",
|
||
"3 -11.176152 1.820139 -1.147436 0.215920\n",
|
||
"4 -12.138983 -4.182848 -1.095431 -0.348160\n",
|
||
"... ... ... ... ...\n",
|
||
"10334 -11.146052 -2.188204 -0.909104 -0.143865\n",
|
||
"10335 3.841024 -2.151027 1.931727 -0.344638\n",
|
||
"10336 17.828090 0.889711 2.020452 -0.317144\n",
|
||
"10337 -12.161796 -2.293752 4.563015 0.136997\n",
|
||
"10338 8.880702 -8.122048 -0.633003 -0.221737\n",
|
||
"\n",
|
||
"[10339 rows x 4 columns]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 136
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "grd5N391HlHf"
|
||
},
|
||
"source": [
|
||
"### Split into Training and Testing"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "Ik3lh2A4zQme"
|
||
},
|
||
"source": [
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"\n",
|
||
"X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n",
|
||
"X_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-train.csv')\n",
|
||
"X_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-test.csv')\n",
|
||
"Y_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-train.csv')\n",
|
||
"Y_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-test.csv')"
|
||
],
|
||
"execution_count": 137,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "2Wthf8O2JAGW"
|
||
},
|
||
"source": [
|
||
"### Trying Different Classifiers"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "v5um6ThxKPUO"
|
||
},
|
||
"source": [
|
||
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
|
||
"def get_train_test_acc(model, X_train, y_train, X_test, y_test):\n",
|
||
" train_preds = model.predict(X_train)\n",
|
||
" print(f'Training Accuracy: {accuracy_score(y_train, train_preds)*100}%')\n",
|
||
" print('Confusion Matrix(training): \\n', confusion_matrix(y_train, train_preds))\n",
|
||
" test_preds = model.predict(X_test)\n",
|
||
" print(f'Testing Accuracy: {accuracy_score(y_test, test_preds)*100}%')\n",
|
||
" print('Confusion Matrix(testing): \\n', confusion_matrix(y_test, test_preds))"
|
||
],
|
||
"execution_count": 138,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "c-RSkJF5Msg0"
|
||
},
|
||
"source": [
|
||
"### Decision Trees"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "ygsKlYkIZICQ"
|
||
},
|
||
"source": [
|
||
"from sklearn.tree import DecisionTreeClassifier\n",
|
||
"clf = DecisionTreeClassifier(criterion=\"entropy\", random_state=4, max_depth = 8, min_samples_split=7)\n",
|
||
"clf = clf.fit(X_train, Y_train)"
|
||
],
|
||
"execution_count": 139,
|
||
"outputs": []
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "ITsJedWSZisU",
|
||
"outputId": "8986a4c4-b8ea-4e45-d77f-013bb93708ad"
|
||
},
|
||
"source": [
|
||
"y_pred = clf.predict(X_train)\n",
|
||
"get_train_test_acc(clf, X_train, Y_train, X_test, Y_test)"
|
||
],
|
||
"execution_count": 140,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Training Accuracy: 51.23926973763753%\n",
|
||
"Confusion Matrix(training): \n",
|
||
" [[ 917 1008 80 98]\n",
|
||
" [ 417 1344 125 145]\n",
|
||
" [ 43 306 1450 259]\n",
|
||
" [ 199 934 419 527]]\n",
|
||
"Testing Accuracy: 45.84139264990329%\n",
|
||
"Confusion Matrix(testing): \n",
|
||
" [[184 253 26 19]\n",
|
||
" [123 321 34 44]\n",
|
||
" [ 18 92 343 74]\n",
|
||
" [ 61 258 118 100]]\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "WfnkIwWUoTRw"
|
||
},
|
||
"source": [
|
||
"### Random Forest"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "yvFL2YFxoWKD",
|
||
"outputId": "e6e84b16-f7c7-4568-cb78-0c347c92ebe4"
|
||
},
|
||
"source": [
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
|
||
"rfc = RandomForestClassifier(max_depth = 12)\n",
|
||
"rfc.fit(X_train, Y_train)"
|
||
],
|
||
"execution_count": 141,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
|
||
" criterion='gini', max_depth=12, max_features='auto',\n",
|
||
" max_leaf_nodes=None, max_samples=None,\n",
|
||
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
||
" min_samples_leaf=1, min_samples_split=2,\n",
|
||
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
|
||
" n_jobs=None, oob_score=False, random_state=None,\n",
|
||
" verbose=0, warm_start=False)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 141
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "5-8gDS1QouhZ",
|
||
"outputId": "0d6f8b91-a371-4f16-b450-2a6ec1362982"
|
||
},
|
||
"source": [
|
||
"get_train_test_acc(rfc, X_train, Y_train, X_test, Y_test)"
|
||
],
|
||
"execution_count": 142,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Training Accuracy: 71.38193688792165%\n",
|
||
"Confusion Matrix(training): \n",
|
||
" [[1460 476 93 74]\n",
|
||
" [ 265 1579 103 84]\n",
|
||
" [ 43 210 1696 109]\n",
|
||
" [ 172 492 246 1169]]\n",
|
||
"Testing Accuracy: 47.87234042553192%\n",
|
||
"Confusion Matrix(testing): \n",
|
||
" [[237 183 28 34]\n",
|
||
" [161 253 47 61]\n",
|
||
" [ 20 63 366 78]\n",
|
||
" [ 77 191 135 134]]\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "7leaD--ksHuQ"
|
||
},
|
||
"source": [
|
||
"### Naive Bayes"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"id": "teGGylJBsX3H",
|
||
"outputId": "28ed7dd4-1cf5-4007-d594-f291294d7a88"
|
||
},
|
||
"source": [
|
||
"from sklearn.naive_bayes import GaussianNB\n",
|
||
"nbc = GaussianNB()\n",
|
||
"nbc.fit(X_train, Y_train)"
|
||
],
|
||
"execution_count": 143,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
"GaussianNB(priors=None, var_smoothing=1e-09)"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 143
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "l4WhrBZ4suHP",
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"outputId": "47826a95-0b38-46a2-abfb-0fcdf08c77bf"
|
||
},
|
||
"source": [
|
||
"get_train_test_acc(nbc, X_train, Y_train, X_test, Y_test)"
|
||
],
|
||
"execution_count": 144,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Training Accuracy: 36.065771974368275%\n",
|
||
"Confusion Matrix(training): \n",
|
||
" [[1045 936 111 11]\n",
|
||
" [ 678 1189 143 21]\n",
|
||
" [ 409 875 725 49]\n",
|
||
" [ 609 1022 424 24]]\n",
|
||
"Testing Accuracy: 35.686653771760156%\n",
|
||
"Confusion Matrix(testing): \n",
|
||
" [[247 212 20 3]\n",
|
||
" [192 290 38 2]\n",
|
||
" [105 212 195 15]\n",
|
||
" [141 274 116 6]]\n"
|
||
]
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"id": "jXB9OE_MudDQ"
|
||
},
|
||
"source": [
|
||
"### KNeighbors Classifier"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "vYG2FQlGuyaP",
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"outputId": "81394f38-ebac-4d46-89a0-fca92f8a6fd0"
|
||
},
|
||
"source": [
|
||
"from sklearn.neighbors import KNeighborsClassifier\n",
|
||
"kfc = KNeighborsClassifier(algorithm='auto')\n",
|
||
"kfc.fit(X_train, Y_train)"
|
||
],
|
||
"execution_count": 145,
|
||
"outputs": [
|
||
{
|
||
"output_type": "execute_result",
|
||
"data": {
|
||
"text/plain": [
|
||
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
|
||
" metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
|
||
" weights='uniform')"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"execution_count": 145
|
||
}
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"metadata": {
|
||
"id": "MPMgHDGLvFHu",
|
||
"colab": {
|
||
"base_uri": "https://localhost:8080/"
|
||
},
|
||
"outputId": "38a9d0cf-e939-4620-d964-7eab781fc086"
|
||
},
|
||
"source": [
|
||
"get_train_test_acc(kfc, X_train, Y_train, X_test, Y_test)"
|
||
],
|
||
"execution_count": 146,
|
||
"outputs": [
|
||
{
|
||
"output_type": "stream",
|
||
"name": "stdout",
|
||
"text": [
|
||
"Training Accuracy: 63.075807036634025%\n",
|
||
"Confusion Matrix(training): \n",
|
||
" [[1586 322 73 122]\n",
|
||
" [ 583 1132 101 215]\n",
|
||
" [ 119 166 1515 258]\n",
|
||
" [ 363 405 327 984]]\n",
|
||
"Testing Accuracy: 43.32688588007737%\n",
|
||
"Confusion Matrix(testing): \n",
|
||
" [[266 156 21 39]\n",
|
||
" [228 165 38 91]\n",
|
||
" [ 46 55 325 101]\n",
|
||
" [126 153 118 140]]\n"
|
||
]
|
||
}
|
||
]
|
||
}
|
||
]
|
||
} |