Setting up Colab

In [132]:
from google.colab import drive

drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


### Read Dataset

In [133]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/ML Salary Project /Data/project-final.csv')
df.head()

Unnamed: 0,SalaryUSD,Country,PrimaryDatabase,YearsWithThisDatabase,EmploymentStatus,JobTitle,ManageStaff,YearsWithThisTypeOfJob,HowManyCompanies,OtherPeopleOnYourTeam,EmploymentSector,CareerPlansThisYear,Gender,DatabaseServers,Education,Certifications,HoursWorkedPerWeek,TelecommuteDaysPerWeek
0,Low,Sweden,M,4,E,D,Y,4,1.0,0,PB,S,M,373.411662,'Bachelors (4 years)',N,43.16509,zero
1,High,USA,M,15,E,DBAP,N,25,5.0,0,PB,S,M,373.411662,'Bachelors (4 years)',N,43.16509,zero
2,High-Mid,USA,M,12,E,DBAG,Y,6,4.0,1,PB,S,M,373.411662,'Bachelors (4 years)',N,43.16509,zero
3,Low,UK,M,10,E,DBAP,N,5,2.0,0,E,S,M,373.411662,'Bachelors (4 years)',N,43.16509,zero
4,High-Mid,USA,M,5,E,D,N,5,1.0,0,PB,S,M,373.411662,'Bachelors (4 years)',N,43.16509,zero


In [134]:
categorical_attributes = ["SalaryUSD", "Country", "PrimaryDatabase", "EmploymentStatus", "JobTitle", "ManageStaff", "EmploymentSector", "CareerPlansThisYear", "Gender", "Education", "Certifications", "TelecommuteDaysPerWeek"]
num_attributes = [name for name in list(df.columns) if name not in categorical_attributes]

In [135]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

#normalizes numerical attributes
num_pipeline = Pipeline([('min_max_scaler', MinMaxScaler())]) 

#converts categories into numbers
cat_pipeline = Pipeline([
        ('number_converter', OrdinalEncoder()),
    ])

#combining both pipelines
full_pipeline = ColumnTransformer([
        ("cat", cat_pipeline, categorical_attributes),
        ("num", num_pipeline, num_attributes)
        ]) 

# fit_transform calculates the standard deviation of the whole training set
df_prep = full_pipeline.fit_transform(df) 
df_prep = pd.DataFrame(df_prep, columns=categorical_attributes+num_attributes)
df_prep.head()

# Split data to X and Y
X = df_prep.drop(['SalaryUSD'], axis=1, inplace=False)
Y = df_prep['SalaryUSD']

### Attribute Selection Algorithim

---



In [136]:
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2, RFE, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

selector = None

# selector = VarianceThreshold(threshold=(.1))
# selector = SelectKBest(chi2, k=10)
# selector = RFE(estimator=RandomForestClassifier())
selector = PCA(n_components=4)

selector.fit(X, Y)

X_selected = X if selector == None else pd.DataFrame(selector.transform(X))

if type(selector) != PCA:
  features = selector.get_support(indices=True)
  features = [column for column in X.columns[features]]
  X_selected.columns = features


# X_selected = X

X = X_selected

X_selected

Unnamed: 0,0,1,2,3
0,-4.138638,-4.141160,-1.060148,-0.344151
1,-12.162499,1.815431,-0.994463,-0.261570
2,-12.157918,0.826832,-1.003102,-0.267811
3,-11.176152,1.820139,-1.147436,0.215920
4,-12.138983,-4.182848,-1.095431,-0.348160
...,...,...,...,...
10334,-11.146052,-2.188204,-0.909104,-0.143865
10335,3.841024,-2.151027,1.931727,-0.344638
10336,17.828090,0.889711,2.020452,-0.317144
10337,-12.161796,-2.293752,4.563015,0.136997


### Split into training and Testing 

In [137]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-train.csv')
X_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/x-test.csv')
Y_train.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-train.csv')
Y_test.to_csv('/content/drive/MyDrive/ML Salary Project /Data/Y-test.csv')

### Trying Different Classifiers

In [138]:
from sklearn.metrics import accuracy_score, confusion_matrix
def get_train_test_acc(model, X_train, y_train, X_test, y_test):
  train_preds = model.predict(X_train)
  print(f'Training Accuracy: {accuracy_score(y_train, train_preds)*100}%')
  print('Confusion Matrix(training): \n', confusion_matrix(y_train, train_preds))
  test_preds = model.predict(X_test)
  print(f'Testing Accuracy: {accuracy_score(y_test, test_preds)*100}%')
  print('Confusion Matrix(testing): \n', confusion_matrix(y_test, test_preds))

###Decision Trees

In [139]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(criterion="entropy", random_state=4, max_depth = 8, min_samples_split=7)
clf = clf.fit(X_train, Y_train)

In [140]:
y_pred = clf.predict(X_train)
get_train_test_acc(clf, X_train, Y_train, X_test, Y_test)

Training Accuracy: 51.23926973763753%
Confusion Matrix(training): 
 [[ 917 1008   80   98]
 [ 417 1344  125  145]
 [  43  306 1450  259]
 [ 199  934  419  527]]
Testing Accuracy: 45.84139264990329%
Confusion Matrix(testing): 
 [[184 253  26  19]
 [123 321  34  44]
 [ 18  92 343  74]
 [ 61 258 118 100]]


### Random **Forest**

In [141]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
rfc = RandomForestClassifier(max_depth = 12)
rfc.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [142]:
get_train_test_acc(rfc, X_train, Y_train, X_test, Y_test)

Training Accuracy: 71.38193688792165%
Confusion Matrix(training): 
 [[1460  476   93   74]
 [ 265 1579  103   84]
 [  43  210 1696  109]
 [ 172  492  246 1169]]
Testing Accuracy: 47.87234042553192%
Confusion Matrix(testing): 
 [[237 183  28  34]
 [161 253  47  61]
 [ 20  63 366  78]
 [ 77 191 135 134]]


###Naive Bayes 

In [143]:
from sklearn.naive_bayes import GaussianNB
nbc = GaussianNB()
nbc.fit(X_train, Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [144]:
get_train_test_acc(nbc, X_train, Y_train, X_test, Y_test)

Training Accuracy: 36.065771974368275%
Confusion Matrix(training): 
 [[1045  936  111   11]
 [ 678 1189  143   21]
 [ 409  875  725   49]
 [ 609 1022  424   24]]
Testing Accuracy: 35.686653771760156%
Confusion Matrix(testing): 
 [[247 212  20   3]
 [192 290  38   2]
 [105 212 195  15]
 [141 274 116   6]]


###KNeighbors Classifer

In [145]:
from sklearn.neighbors import KNeighborsClassifier
kfc = KNeighborsClassifier(algorithm='auto')
kfc.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [146]:
get_train_test_acc(kfc, X_train, Y_train, X_test, Y_test)

Training Accuracy: 63.075807036634025%
Confusion Matrix(training): 
 [[1586  322   73  122]
 [ 583 1132  101  215]
 [ 119  166 1515  258]
 [ 363  405  327  984]]
Testing Accuracy: 43.32688588007737%
Confusion Matrix(testing): 
 [[266 156  21  39]
 [228 165  38  91]
 [ 46  55 325 101]
 [126 153 118 140]]
