{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Salary_ML_Project.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "LXy6hmTOhed1" }, "source": [ "Setting up Colab" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PiiiSef_6tle", "outputId": "7fedf5f1-1b59-441c-d6ff-a32fd9987468" }, "source": [ "from google.colab import drive\n", "\n", "drive.mount('/content/drive', force_remount = True)" ], "execution_count": 132, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "NIraf5LmHxm-" }, "source": [ "### Read Dataset" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 307 }, "id": "6YndzY3swBQj", "outputId": "a342b5b8-cc12-4979-ec15-4a09000d3151" }, "source": [ "import pandas as pd\n", "df = pd.read_csv('/content/drive/MyDrive/ML Salary Project /Data/project-final.csv')\n", "df.head()" ], "execution_count": 133, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SalaryUSDCountryPrimaryDatabaseYearsWithThisDatabaseEmploymentStatusJobTitleManageStaffYearsWithThisTypeOfJobHowManyCompaniesOtherPeopleOnYourTeamEmploymentSectorCareerPlansThisYearGenderDatabaseServersEducationCertificationsHoursWorkedPerWeekTelecommuteDaysPerWeek
0LowSwedenM4EDY41.00PBSM373.411662'Bachelors (4 years)'N43.16509zero
1HighUSAM15EDBAPN255.00PBSM373.411662'Bachelors (4 years)'N43.16509zero
2High-MidUSAM12EDBAGY64.01PBSM373.411662'Bachelors (4 years)'N43.16509zero
3LowUKM10EDBAPN52.00ESM373.411662'Bachelors (4 years)'N43.16509zero
4High-MidUSAM5EDN51.00PBSM373.411662'Bachelors (4 years)'N43.16509zero
\n", "
" ], "text/plain": [ " SalaryUSD Country ... HoursWorkedPerWeek TelecommuteDaysPerWeek\n", "0 Low Sweden ... 43.16509 zero\n", "1 High USA ... 43.16509 zero\n", "2 High-Mid USA ... 43.16509 zero\n", "3 Low UK ... 43.16509 zero\n", "4 High-Mid USA ... 43.16509 zero\n", "\n", "[5 rows x 18 columns]" ] }, "metadata": {}, "execution_count": 133 } ] }, { "cell_type": "code", "metadata": { "id": "GnlW4u1wxi7l" }, "source": [ "categorical_attributes = [\"SalaryUSD\", \"Country\", \"PrimaryDatabase\", \"EmploymentStatus\", \"JobTitle\", \"ManageStaff\", \"EmploymentSector\", \"CareerPlansThisYear\", \"Gender\", \"Education\", \"Certifications\", \"TelecommuteDaysPerWeek\"]\n", "num_attributes = [name for name in list(df.columns) if name not in categorical_attributes]" ], "execution_count": 134, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "j-V2E0q-zagM" }, "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import OrdinalEncoder\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.preprocessing import MinMaxScaler\n", "\n", "#normalizes numerical attributes\n", "num_pipeline = Pipeline([('min_max_scaler', MinMaxScaler())]) \n", "\n", "#converts categories into numbers\n", "cat_pipeline = Pipeline([\n", " ('number_converter', OrdinalEncoder()),\n", " ])\n", "\n", "#combining both pipelines\n", "full_pipeline = ColumnTransformer([\n", " (\"cat\", cat_pipeline, categorical_attributes),\n", " (\"num\", num_pipeline, num_attributes)\n", " ]) \n", "\n", "# fit_transform calculates the standard deviation of the whole training set\n", "df_prep = full_pipeline.fit_transform(df) \n", "df_prep = pd.DataFrame(df_prep, columns=categorical_attributes+num_attributes)\n", "df_prep.head()\n", "\n", "# Split data to X and Y\n", "X = df_prep.drop(['SalaryUSD'], axis=1, inplace=False)\n", "Y = df_prep['SalaryUSD']" ], "execution_count": 135, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "AZCzlB6KcNgd" }, "source": 
[ "### Attribute Selection Algorithim\n", "\n", "---\n", "\n" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 417 }, "id": "TqM0BzEp0Vu1", "outputId": "8914f90a-0885-43a2-b40d-f2ef7807678d" }, "source": [ "from sklearn.feature_selection import SelectKBest, SelectFromModel\n", "from sklearn.feature_selection import chi2, RFE, VarianceThreshold\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.decomposition import PCA\n", "\n", "selector = None\n", "\n", "# selector = VarianceThreshold(threshold=(.1))\n", "# selector = SelectKBest(chi2, k=10)\n", "# selector = RFE(estimator=RandomForestClassifier())\n", "selector = PCA(n_components=4)\n", "\n", "selector.fit(X, Y)\n", "\n", "X_selected = X if selector == None else pd.DataFrame(selector.transform(X))\n", "\n", "if type(selector) != PCA:\n", " features = selector.get_support(indices=True)\n", " features = [column for column in X.columns[features]]\n", " X_selected.columns = features\n", "\n", "\n", "# X_selected = X\n", "\n", "X = X_selected\n", "\n", "X_selected" ], "execution_count": 136, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123
0-4.138638-4.141160-1.060148-0.344151
1-12.1624991.815431-0.994463-0.261570
2-12.1579180.826832-1.003102-0.267811
3-11.1761521.820139-1.1474360.215920
4-12.138983-4.182848-1.095431-0.348160
...............
10334-11.146052-2.188204-0.909104-0.143865
103353.841024-2.1510271.931727-0.344638
1033617.8280900.8897112.020452-0.317144
10337-12.161796-2.2937524.5630150.136997
103388.880702-8.122048-0.633003-0.221737
\n", "

10339 rows × 4 columns

\n", "
" ], "text/plain": [ " 0 1 2 3\n", "0 -4.138638 -4.141160 -1.060148 -0.344151\n", "1 -12.162499 1.815431 -0.994463 -0.261570\n", "2 -12.157918 0.826832 -1.003102 -0.267811\n", "3 -11.176152 1.820139 -1.147436 0.215920\n", "4 -12.138983 -4.182848 -1.095431 -0.348160\n", "... ... ... ... ...\n", "10334 -11.146052 -2.188204 -0.909104 -0.143865\n", "10335 3.841024 -2.151027 1.931727 -0.344638\n", "10336 17.828090 0.889711 2.020452 -0.317144\n", "10337 -12.161796 -2.293752 4.563015 0.136997\n", "10338 8.880702 -8.122048 -0.633003 -0.221737\n", "\n", "[10339 rows x 4 columns]" ] }, "metadata": {}, "execution_count": 136 } ] }, { "cell_type": "markdown", "metadata": { "id": "grd5N391HlHf" }, "source": [ "### Split into training and Testing " ] }, { "cell_type": "code", "metadata": { "id": "Ik3lh2A4zQme" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)\n", "X_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-train.csv')\n", "X_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-test.csv')\n", "Y_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-train.csv')\n", "Y_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-test.csv')" ], "execution_count": 137, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "2Wthf8O2JAGW" }, "source": [ "### Trying Different Classifiers" ] }, { "cell_type": "code", "metadata": { "id": "v5um6ThxKPUO" }, "source": [ "from sklearn.metrics import accuracy_score, confusion_matrix\n", "def get_train_test_acc(model, X_train, y_train, X_test, y_test):\n", " train_preds = model.predict(X_train)\n", " print(f'Training Accuracy: {accuracy_score(y_train, train_preds)*100}%')\n", " print('Confusion Matrix(training): \\n', confusion_matrix(y_train, train_preds))\n", " test_preds = model.predict(X_test)\n", " print(f'Testing Accuracy: {accuracy_score(y_test, test_preds)*100}%')\n", 
" print('Confusion Matrix(testing): \\n', confusion_matrix(y_test, test_preds))" ], "execution_count": 138, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "c-RSkJF5Msg0" }, "source": [ "###Decision Trees" ] }, { "cell_type": "code", "metadata": { "id": "ygsKlYkIZICQ" }, "source": [ "from sklearn.tree import DecisionTreeClassifier\n", "clf = DecisionTreeClassifier(criterion=\"entropy\", random_state=4, max_depth = 8, min_samples_split=7)\n", "clf = clf.fit(X_train, Y_train)" ], "execution_count": 139, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ITsJedWSZisU", "outputId": "8986a4c4-b8ea-4e45-d77f-013bb93708ad" }, "source": [ "y_pred = clf.predict(X_train)\n", "get_train_test_acc(clf, X_train, Y_train, X_test, Y_test)" ], "execution_count": 140, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 51.23926973763753%\n", "Confusion Matrix(training): \n", " [[ 917 1008 80 98]\n", " [ 417 1344 125 145]\n", " [ 43 306 1450 259]\n", " [ 199 934 419 527]]\n", "Testing Accuracy: 45.84139264990329%\n", "Confusion Matrix(testing): \n", " [[184 253 26 19]\n", " [123 321 34 44]\n", " [ 18 92 343 74]\n", " [ 61 258 118 100]]\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "WfnkIwWUoTRw" }, "source": [ "### Random **Forest**" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yvFL2YFxoWKD", "outputId": "e6e84b16-f7c7-4568-cb78-0c347c92ebe4" }, "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, confusion_matrix\n", "rfc = RandomForestClassifier(max_depth = 12)\n", "rfc.fit(X_train, Y_train)" ], "execution_count": 141, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", " criterion='gini', max_depth=12, max_features='auto',\n", " 
max_leaf_nodes=None, max_samples=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=100,\n", " n_jobs=None, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "metadata": {}, "execution_count": 141 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5-8gDS1QouhZ", "outputId": "0d6f8b91-a371-4f16-b450-2a6ec1362982" }, "source": [ "get_train_test_acc(rfc, X_train, Y_train, X_test, Y_test)" ], "execution_count": 142, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 71.38193688792165%\n", "Confusion Matrix(training): \n", " [[1460 476 93 74]\n", " [ 265 1579 103 84]\n", " [ 43 210 1696 109]\n", " [ 172 492 246 1169]]\n", "Testing Accuracy: 47.87234042553192%\n", "Confusion Matrix(testing): \n", " [[237 183 28 34]\n", " [161 253 47 61]\n", " [ 20 63 366 78]\n", " [ 77 191 135 134]]\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "7leaD--ksHuQ" }, "source": [ "### Naive Bayes" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "teGGylJBsX3H", "outputId": "28ed7dd4-1cf5-4007-d594-f291294d7a88" }, "source": [ "from sklearn.naive_bayes import GaussianNB\n", "nbc = GaussianNB()\n", "nbc.fit(X_train, Y_train)" ], "execution_count": 143, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "GaussianNB(priors=None, var_smoothing=1e-09)" ] }, "metadata": {}, "execution_count": 143 } ] }, { "cell_type": "code", "metadata": { "id": "l4WhrBZ4suHP", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "47826a95-0b38-46a2-abfb-0fcdf08c77bf" }, "source": [ "get_train_test_acc(nbc, X_train, Y_train, X_test, Y_test)" ], "execution_count": 144, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 36.065771974368275%\n", "Confusion 
Matrix(training): \n", " [[1045 936 111 11]\n", " [ 678 1189 143 21]\n", " [ 409 875 725 49]\n", " [ 609 1022 424 24]]\n", "Testing Accuracy: 35.686653771760156%\n", "Confusion Matrix(testing): \n", " [[247 212 20 3]\n", " [192 290 38 2]\n", " [105 212 195 15]\n", " [141 274 116 6]]\n" ] } ] }, { "cell_type": "markdown", "metadata": { "id": "jXB9OE_MudDQ" }, "source": [ "### KNeighbors Classifier" ] }, { "cell_type": "code", "metadata": { "id": "vYG2FQlGuyaP", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "81394f38-ebac-4d46-89a0-fca92f8a6fd0" }, "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "kfc = KNeighborsClassifier(algorithm='auto')\n", "kfc.fit(X_train, Y_train)" ], "execution_count": 145, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " metric_params=None, n_jobs=None, n_neighbors=5, p=2,\n", " weights='uniform')" ] }, "metadata": {}, "execution_count": 145 } ] }, { "cell_type": "code", "metadata": { "id": "MPMgHDGLvFHu", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "38a9d0cf-e939-4620-d964-7eab781fc086" }, "source": [ "get_train_test_acc(kfc, X_train, Y_train, X_test, Y_test)" ], "execution_count": 146, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Training Accuracy: 63.075807036634025%\n", "Confusion Matrix(training): \n", " [[1586 322 73 122]\n", " [ 583 1132 101 215]\n", " [ 119 166 1515 258]\n", " [ 363 405 327 984]]\n", "Testing Accuracy: 43.32688588007737%\n", "Confusion Matrix(testing): \n", " [[266 156 21 39]\n", " [228 165 38 91]\n", " [ 46 55 325 101]\n", " [126 153 118 140]]\n" ] } ] } ] }