ml/ML/ML Project Salary/SKLearn Code/Salary_ML_Project.ipynb

816 lines
26 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Salary_ML_Project.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "LXy6hmTOhed1"
},
"source": [
"Setting up Colab"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "PiiiSef_6tle",
"outputId": "7fedf5f1-1b59-441c-d6ff-a32fd9987468"
},
"source": [
"# Mount Google Drive at /content/drive; force_remount re-mounts it even if it is already mounted.\n",
"from google.colab import drive\n",
"\n",
"drive.mount('/content/drive', force_remount=True)"
],
"execution_count": 132,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NIraf5LmHxm-"
},
"source": [
"### Read Dataset"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 307
},
"id": "6YndzY3swBQj",
"outputId": "a342b5b8-cc12-4979-ec15-4a09000d3151"
},
"source": [
"import pandas as pd\n",
"\n",
"# NOTE: the folder name 'ML Salary Project ' ends with a space; it must match the Drive folder exactly.\n",
"dataset_path = '/content/drive/MyDrive/ML Salary Project /Data/project-final.csv'\n",
"\n",
"# Load the project dataset and preview its first rows.\n",
"df = pd.read_csv(dataset_path)\n",
"df.head()"
],
"execution_count": 133,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SalaryUSD</th>\n",
" <th>Country</th>\n",
" <th>PrimaryDatabase</th>\n",
" <th>YearsWithThisDatabase</th>\n",
" <th>EmploymentStatus</th>\n",
" <th>JobTitle</th>\n",
" <th>ManageStaff</th>\n",
" <th>YearsWithThisTypeOfJob</th>\n",
" <th>HowManyCompanies</th>\n",
" <th>OtherPeopleOnYourTeam</th>\n",
" <th>EmploymentSector</th>\n",
" <th>CareerPlansThisYear</th>\n",
" <th>Gender</th>\n",
" <th>DatabaseServers</th>\n",
" <th>Education</th>\n",
" <th>Certifications</th>\n",
" <th>HoursWorkedPerWeek</th>\n",
" <th>TelecommuteDaysPerWeek</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Low</td>\n",
" <td>Sweden</td>\n",
" <td>M</td>\n",
" <td>4</td>\n",
" <td>E</td>\n",
" <td>D</td>\n",
" <td>Y</td>\n",
" <td>4</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>PB</td>\n",
" <td>S</td>\n",
" <td>M</td>\n",
" <td>373.411662</td>\n",
" <td>'Bachelors (4 years)'</td>\n",
" <td>N</td>\n",
" <td>43.16509</td>\n",
" <td>zero</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>High</td>\n",
" <td>USA</td>\n",
" <td>M</td>\n",
" <td>15</td>\n",
" <td>E</td>\n",
" <td>DBAP</td>\n",
" <td>N</td>\n",
" <td>25</td>\n",
" <td>5.0</td>\n",
" <td>0</td>\n",
" <td>PB</td>\n",
" <td>S</td>\n",
" <td>M</td>\n",
" <td>373.411662</td>\n",
" <td>'Bachelors (4 years)'</td>\n",
" <td>N</td>\n",
" <td>43.16509</td>\n",
" <td>zero</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>High-Mid</td>\n",
" <td>USA</td>\n",
" <td>M</td>\n",
" <td>12</td>\n",
" <td>E</td>\n",
" <td>DBAG</td>\n",
" <td>Y</td>\n",
" <td>6</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>PB</td>\n",
" <td>S</td>\n",
" <td>M</td>\n",
" <td>373.411662</td>\n",
" <td>'Bachelors (4 years)'</td>\n",
" <td>N</td>\n",
" <td>43.16509</td>\n",
" <td>zero</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Low</td>\n",
" <td>UK</td>\n",
" <td>M</td>\n",
" <td>10</td>\n",
" <td>E</td>\n",
" <td>DBAP</td>\n",
" <td>N</td>\n",
" <td>5</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>E</td>\n",
" <td>S</td>\n",
" <td>M</td>\n",
" <td>373.411662</td>\n",
" <td>'Bachelors (4 years)'</td>\n",
" <td>N</td>\n",
" <td>43.16509</td>\n",
" <td>zero</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>High-Mid</td>\n",
" <td>USA</td>\n",
" <td>M</td>\n",
" <td>5</td>\n",
" <td>E</td>\n",
" <td>D</td>\n",
" <td>N</td>\n",
" <td>5</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>PB</td>\n",
" <td>S</td>\n",
" <td>M</td>\n",
" <td>373.411662</td>\n",
" <td>'Bachelors (4 years)'</td>\n",
" <td>N</td>\n",
" <td>43.16509</td>\n",
" <td>zero</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" SalaryUSD Country ... HoursWorkedPerWeek TelecommuteDaysPerWeek\n",
"0 Low Sweden ... 43.16509 zero\n",
"1 High USA ... 43.16509 zero\n",
"2 High-Mid USA ... 43.16509 zero\n",
"3 Low UK ... 43.16509 zero\n",
"4 High-Mid USA ... 43.16509 zero\n",
"\n",
"[5 rows x 18 columns]"
]
},
"metadata": {},
"execution_count": 133
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "GnlW4u1wxi7l"
},
"source": [
"# Columns that hold category labels (this list also contains SalaryUSD, the prediction target).\n",
"categorical_attributes = [\n",
"    \"SalaryUSD\",\n",
"    \"Country\",\n",
"    \"PrimaryDatabase\",\n",
"    \"EmploymentStatus\",\n",
"    \"JobTitle\",\n",
"    \"ManageStaff\",\n",
"    \"EmploymentSector\",\n",
"    \"CareerPlansThisYear\",\n",
"    \"Gender\",\n",
"    \"Education\",\n",
"    \"Certifications\",\n",
"    \"TelecommuteDaysPerWeek\",\n",
"]\n",
"\n",
"# Every other column of df is treated as numerical.\n",
"num_attributes = [column for column in df.columns if column not in categorical_attributes]"
],
"execution_count": 134,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "j-V2E0q-zagM"
},
"source": [
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import OrdinalEncoder\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Min-max scales the numerical attributes\n",
"num_pipeline = Pipeline([('min_max_scaler', MinMaxScaler())])\n",
"\n",
"# Replaces each category label with a numeric code\n",
"cat_pipeline = Pipeline([\n",
"    ('number_converter', OrdinalEncoder()),\n",
"])\n",
"\n",
"# Applies each pipeline to its own group of columns\n",
"full_pipeline = ColumnTransformer([\n",
"    (\"cat\", cat_pipeline, categorical_attributes),\n",
"    (\"num\", num_pipeline, num_attributes)\n",
"])\n",
"\n",
"# NOTE: fit_transform learns the category codes and the min/max of each numerical\n",
"# column from the WHOLE dataset (df), i.e. before any train/test split is made.\n",
"# The transformed columns come out in transformer order (\"cat\" first, then \"num\"),\n",
"# which is why the column labels are concatenated in that same order.\n",
"df_prep = pd.DataFrame(\n",
"    full_pipeline.fit_transform(df),\n",
"    columns=categorical_attributes + num_attributes,\n",
")\n",
"\n",
"# Split data into features (X) and target (Y)\n",
"X = df_prep.drop(columns=['SalaryUSD'])\n",
"Y = df_prep['SalaryUSD']"
],
"execution_count": 135,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "AZCzlB6KcNgd"
},
"source": [
"### Attribute Selection Algorithm\n",
"\n",
"---\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 417
},
"id": "TqM0BzEp0Vu1",
"outputId": "8914f90a-0885-43a2-b40d-f2ef7807678d"
},
"source": [
"from sklearn.feature_selection import SelectKBest, SelectFromModel\n",
"from sklearn.feature_selection import chi2, RFE, VarianceThreshold\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.decomposition import PCA\n",
"\n",
"# Pick ONE selector below, or use `selector = None` to keep every feature.\n",
"# selector = None\n",
"# selector = VarianceThreshold(threshold=(.1))\n",
"# selector = SelectKBest(chi2, k=10)\n",
"# selector = RFE(estimator=RandomForestClassifier())\n",
"# random_state pins PCA's randomized SVD solver (when it is used) so reruns match.\n",
"selector = PCA(n_components=4, random_state=42)\n",
"\n",
"# Always start from the full prepared feature table, so that re-running this\n",
"# cell never applies the selector to its own previous output.\n",
"X_full = df_prep.drop(columns=[Y.name])\n",
"\n",
"if selector is None:\n",
"    # No selection requested: pass the features through unchanged.\n",
"    X_selected = X_full\n",
"else:\n",
"    selector.fit(X_full, Y)\n",
"    # Keep the original row index so the features stay aligned with Y.\n",
"    X_selected = pd.DataFrame(selector.transform(X_full), index=X_full.index)\n",
"    # Selectors that keep a subset of the input columns expose get_support();\n",
"    # PCA builds new components instead, so its columns stay numbered 0..n-1.\n",
"    if hasattr(selector, 'get_support'):\n",
"        kept = selector.get_support(indices=True)\n",
"        X_selected.columns = list(X_full.columns[kept])\n",
"\n",
"X = X_selected\n",
"\n",
"X_selected"
],
"execution_count": 136,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-4.138638</td>\n",
" <td>-4.141160</td>\n",
" <td>-1.060148</td>\n",
" <td>-0.344151</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>-12.162499</td>\n",
" <td>1.815431</td>\n",
" <td>-0.994463</td>\n",
" <td>-0.261570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-12.157918</td>\n",
" <td>0.826832</td>\n",
" <td>-1.003102</td>\n",
" <td>-0.267811</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-11.176152</td>\n",
" <td>1.820139</td>\n",
" <td>-1.147436</td>\n",
" <td>0.215920</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-12.138983</td>\n",
" <td>-4.182848</td>\n",
" <td>-1.095431</td>\n",
" <td>-0.348160</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10334</th>\n",
" <td>-11.146052</td>\n",
" <td>-2.188204</td>\n",
" <td>-0.909104</td>\n",
" <td>-0.143865</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10335</th>\n",
" <td>3.841024</td>\n",
" <td>-2.151027</td>\n",
" <td>1.931727</td>\n",
" <td>-0.344638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10336</th>\n",
" <td>17.828090</td>\n",
" <td>0.889711</td>\n",
" <td>2.020452</td>\n",
" <td>-0.317144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10337</th>\n",
" <td>-12.161796</td>\n",
" <td>-2.293752</td>\n",
" <td>4.563015</td>\n",
" <td>0.136997</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10338</th>\n",
" <td>8.880702</td>\n",
" <td>-8.122048</td>\n",
" <td>-0.633003</td>\n",
" <td>-0.221737</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10339 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3\n",
"0 -4.138638 -4.141160 -1.060148 -0.344151\n",
"1 -12.162499 1.815431 -0.994463 -0.261570\n",
"2 -12.157918 0.826832 -1.003102 -0.267811\n",
"3 -11.176152 1.820139 -1.147436 0.215920\n",
"4 -12.138983 -4.182848 -1.095431 -0.348160\n",
"... ... ... ... ...\n",
"10334 -11.146052 -2.188204 -0.909104 -0.143865\n",
"10335 3.841024 -2.151027 1.931727 -0.344638\n",
"10336 17.828090 0.889711 2.020452 -0.317144\n",
"10337 -12.161796 -2.293752 4.563015 0.136997\n",
"10338 8.880702 -8.122048 -0.633003 -0.221737\n",
"\n",
"[10339 rows x 4 columns]"
]
},
"metadata": {},
"execution_count": 136
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "grd5N391HlHf"
},
"source": [
"### Split into Training and Testing Sets"
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ik3lh2A4zQme"
},
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.\n",
"X_train, X_test, Y_train, Y_test = train_test_split(\n",
"    X, Y, test_size=0.2, random_state=42\n",
")\n",
"\n",
"# Save every split next to the source dataset on Drive.\n",
"splits_to_save = [\n",
"    (X_train, '/content/drive/MyDrive/ML Salary Project /Data/x-train.csv'),\n",
"    (X_test, '/content/drive/MyDrive/ML Salary Project /Data/x-test.csv'),\n",
"    (Y_train, '/content/drive/MyDrive/ML Salary Project /Data/Y-train.csv'),\n",
"    (Y_test, '/content/drive/MyDrive/ML Salary Project /Data/Y-test.csv'),\n",
"]\n",
"for split, csv_path in splits_to_save:\n",
"    split.to_csv(csv_path)"
],
"execution_count": 137,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "2Wthf8O2JAGW"
},
"source": [
"### Trying Different Classifiers"
]
},
{
"cell_type": "code",
"metadata": {
"id": "v5um6ThxKPUO"
},
"source": [
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
"\n",
"\n",
"def get_train_test_acc(model, X_train, y_train, X_test, y_test):\n",
"    \"\"\"Print accuracy (in percent) and the confusion matrix of a fitted model.\n",
"\n",
"    The report is printed for the training split first, then for the testing\n",
"    split. `model` is only required to offer `predict`; nothing is returned.\n",
"    \"\"\"\n",
"    # Training split\n",
"    train_preds = model.predict(X_train)\n",
"    train_accuracy = accuracy_score(y_train, train_preds) * 100\n",
"    print(f'Training Accuracy: {train_accuracy}%')\n",
"    train_matrix = confusion_matrix(y_train, train_preds)\n",
"    print('Confusion Matrix(training): \\n', train_matrix)\n",
"\n",
"    # Testing split\n",
"    test_preds = model.predict(X_test)\n",
"    test_accuracy = accuracy_score(y_test, test_preds) * 100\n",
"    print(f'Testing Accuracy: {test_accuracy}%')\n",
"    test_matrix = confusion_matrix(y_test, test_preds)\n",
"    print('Confusion Matrix(testing): \\n', test_matrix)"
],
"execution_count": 138,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "c-RSkJF5Msg0"
},
"source": [
"### Decision Trees"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ygsKlYkIZICQ"
},
"source": [
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"# Entropy-based tree, limited in depth and in the minimum samples needed to split a node.\n",
"clf = DecisionTreeClassifier(\n",
"    criterion=\"entropy\",\n",
"    random_state=4,\n",
"    max_depth=8,\n",
"    min_samples_split=7,\n",
").fit(X_train, Y_train)"
],
"execution_count": 139,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ITsJedWSZisU",
"outputId": "8986a4c4-b8ea-4e45-d77f-013bb93708ad"
},
"source": [
"# Report train/test accuracy and confusion matrices for the decision tree.\n",
"# (The helper computes its own predictions, so no separate predict call is needed.)\n",
"get_train_test_acc(clf, X_train, Y_train, X_test, Y_test)"
],
"execution_count": 140,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training Accuracy: 51.23926973763753%\n",
"Confusion Matrix(training): \n",
" [[ 917 1008 80 98]\n",
" [ 417 1344 125 145]\n",
" [ 43 306 1450 259]\n",
" [ 199 934 419 527]]\n",
"Testing Accuracy: 45.84139264990329%\n",
"Confusion Matrix(testing): \n",
" [[184 253 26 19]\n",
" [123 321 34 44]\n",
" [ 18 92 343 74]\n",
" [ 61 258 118 100]]\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WfnkIwWUoTRw"
},
"source": [
"### Random Forest"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yvFL2YFxoWKD",
"outputId": "e6e84b16-f7c7-4568-cb78-0c347c92ebe4"
},
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import accuracy_score, confusion_matrix\n",
"\n",
"# A fixed random_state makes the forest's random choices repeatable, so the\n",
"# accuracies reported below can be reproduced on a fresh run.\n",
"rfc = RandomForestClassifier(max_depth=12, random_state=42)\n",
"rfc.fit(X_train, Y_train)"
],
"execution_count": 141,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
" criterion='gini', max_depth=12, max_features='auto',\n",
" max_leaf_nodes=None, max_samples=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=100,\n",
" n_jobs=None, oob_score=False, random_state=None,\n",
" verbose=0, warm_start=False)"
]
},
"metadata": {},
"execution_count": 141
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5-8gDS1QouhZ",
"outputId": "0d6f8b91-a371-4f16-b450-2a6ec1362982"
},
"source": [
"# Report train/test accuracy and confusion matrices for the random forest.\n",
"get_train_test_acc(\n",
"    model=rfc,\n",
"    X_train=X_train,\n",
"    y_train=Y_train,\n",
"    X_test=X_test,\n",
"    y_test=Y_test,\n",
")"
],
"execution_count": 142,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training Accuracy: 71.38193688792165%\n",
"Confusion Matrix(training): \n",
" [[1460 476 93 74]\n",
" [ 265 1579 103 84]\n",
" [ 43 210 1696 109]\n",
" [ 172 492 246 1169]]\n",
"Testing Accuracy: 47.87234042553192%\n",
"Confusion Matrix(testing): \n",
" [[237 183 28 34]\n",
" [161 253 47 61]\n",
" [ 20 63 366 78]\n",
" [ 77 191 135 134]]\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "7leaD--ksHuQ"
},
"source": [
"### Naive Bayes"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "teGGylJBsX3H",
"outputId": "28ed7dd4-1cf5-4007-d594-f291294d7a88"
},
"source": [
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"# Gaussian naive Bayes with its default settings, fitted on the training split.\n",
"nbc = GaussianNB()\n",
"nbc.fit(X=X_train, y=Y_train)"
],
"execution_count": 143,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"GaussianNB(priors=None, var_smoothing=1e-09)"
]
},
"metadata": {},
"execution_count": 143
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "l4WhrBZ4suHP",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "47826a95-0b38-46a2-abfb-0fcdf08c77bf"
},
"source": [
"# Report train/test accuracy and confusion matrices for naive Bayes.\n",
"get_train_test_acc(\n",
"    model=nbc,\n",
"    X_train=X_train,\n",
"    y_train=Y_train,\n",
"    X_test=X_test,\n",
"    y_test=Y_test,\n",
")"
],
"execution_count": 144,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training Accuracy: 36.065771974368275%\n",
"Confusion Matrix(training): \n",
" [[1045 936 111 11]\n",
" [ 678 1189 143 21]\n",
" [ 409 875 725 49]\n",
" [ 609 1022 424 24]]\n",
"Testing Accuracy: 35.686653771760156%\n",
"Confusion Matrix(testing): \n",
" [[247 212 20 3]\n",
" [192 290 38 2]\n",
" [105 212 195 15]\n",
" [141 274 116 6]]\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "jXB9OE_MudDQ"
},
"source": [
"### KNeighbors Classifier"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vYG2FQlGuyaP",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "81394f38-ebac-4d46-89a0-fca92f8a6fd0"
},
"source": [
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"# k-nearest-neighbours classifier; algorithm='auto' leaves the neighbour-search method to scikit-learn.\n",
"kfc = KNeighborsClassifier(algorithm='auto')\n",
"kfc.fit(X=X_train, y=Y_train)"
],
"execution_count": 145,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
" metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n",
" weights='uniform')"
]
},
"metadata": {},
"execution_count": 145
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "MPMgHDGLvFHu",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "38a9d0cf-e939-4620-d964-7eab781fc086"
},
"source": [
"# Report train/test accuracy and confusion matrices for k-nearest neighbours.\n",
"get_train_test_acc(\n",
"    model=kfc,\n",
"    X_train=X_train,\n",
"    y_train=Y_train,\n",
"    X_test=X_test,\n",
"    y_test=Y_test,\n",
")"
],
"execution_count": 146,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Training Accuracy: 63.075807036634025%\n",
"Confusion Matrix(training): \n",
" [[1586 322 73 122]\n",
" [ 583 1132 101 215]\n",
" [ 119 166 1515 258]\n",
" [ 363 405 327 984]]\n",
"Testing Accuracy: 43.32688588007737%\n",
"Confusion Matrix(testing): \n",
" [[266 156 21 39]\n",
" [228 165 38 91]\n",
" [ 46 55 325 101]\n",
" [126 153 118 140]]\n"
]
}
]
}
]
}