mirror of
https://github.com/Rushilwiz/ml.git
synced 2025-04-09 15:00:17 -04:00
242 lines
8.8 KiB
Plaintext
242 lines
8.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Naive Bayes examined 100 samples\n",
|
|
"---\n",
|
|
"Confusion Matrix\n",
|
|
"{'Iris-versicolor': 16, 'Iris-virginica': 3, 'Iris-setosa': 0}\n",
|
|
"{'Iris-versicolor': 1, 'Iris-virginica': 15, 'Iris-setosa': 0}\n",
|
|
"{'Iris-versicolor': 0, 'Iris-virginica': 0, 'Iris-setosa': 15}\n",
|
|
"\n",
|
|
"Accuracy: 92.0%\n",
|
|
"skLearn accuracy: 96.0%\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import math\n",
|
|
"import pandas as pd\n",
|
|
"from sklearn import preprocessing, tree, model_selection\n",
|
|
"from sklearn.naive_bayes import GaussianNB\n",
|
|
"from sklearn.datasets import load_iris\n",
|
|
"\n",
|
|
"filename = 'iris.csv'\n",
|
|
"needs_discretized = True\n",
|
|
"class_attr = 'class'\n",
|
|
"split = .67\n",
|
|
"classifier = 3\n",
|
|
"\n",
|
|
"def main():\n",
|
|
" # Read CSV\n",
|
|
" df = pd.read_csv(filename)\n",
|
|
" \n",
|
|
" # Randomize Order\n",
|
|
" df = df.sample(frac=1)\n",
|
|
"\n",
|
|
" # Discretize\n",
|
|
" if needs_discretized:\n",
|
|
" for col in df:\n",
|
|
" if col != class_attr:\n",
|
|
" df[col] = pd.qcut(df[col], q=5)\n",
|
|
" \n",
|
|
" # Split Data\n",
|
|
" if split != 1:\n",
|
|
" testing = df.head(-math.floor(len(df)*split))\n",
|
|
" data = df.head(math.floor(len(df)*split))\n",
|
|
" else:\n",
|
|
" testing = data = df\n",
|
|
" \n",
|
|
" # Choose Classifier\n",
|
|
" if classifier == 1:\n",
|
|
" r1(data, testing)\n",
|
|
" elif classifier == 2:\n",
|
|
" decision_tree(data, testing)\n",
|
|
" else:\n",
|
|
" naive_bayes(data, testing)\n",
|
|
" \n",
|
|
"def r1(data, testing):\n",
|
|
" # Set up big dictionary\n",
|
|
" rules = dict()\n",
|
|
" \n",
|
|
" for attr in data:\n",
|
|
" if attr != class_attr:\n",
|
|
" rules[attr] = dict()\n",
|
|
"\n",
|
|
" # Loop thru data\n",
|
|
" for attr in data:\n",
|
|
" if attr != class_attr:\n",
|
|
" freq = {v:{c:0 for c in data[class_attr].unique()} for v in data[attr].unique()}\n",
|
|
" for i, sample in data.iterrows():\n",
|
|
" freq[sample[attr]][sample[class_attr]] += 1\n",
|
|
" \n",
|
|
" attr_rule = dict()\n",
|
|
" error = 0\n",
|
|
" for (k,v) in freq.items():\n",
|
|
" rule = max(v, key=v.get)\n",
|
|
" for c in v:\n",
|
|
" if c != rule:\n",
|
|
" error += v[c]\n",
|
|
" attr_rule[k] = rule\n",
|
|
" error /= len(data)\n",
|
|
" rules[attr] = (attr_rule, error)\n",
|
|
" \n",
|
|
" # Select best attr\n",
|
|
" best_attr = min(rules, key=lambda x: rules[x][1])\n",
|
|
" rule = rules[best_attr][0]\n",
|
|
" print(f'R1 chose {best_attr}')\n",
|
|
" print(print_tree(rule))\n",
|
|
" print('---')\n",
|
|
" \n",
|
|
" confusion = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}\n",
|
|
" \n",
|
|
" correct = 0\n",
|
|
" for i, row in testing.iterrows():\n",
|
|
" confusion[row[class_attr]][rule[row[best_attr]]] += 1\n",
|
|
" if row[class_attr] == rule[row[best_attr]]: correct += 1\n",
|
|
" \n",
|
|
" print(\"Confusion Matrix\")\n",
|
|
" \n",
|
|
" for (actual,guess) in confusion.items():\n",
|
|
" print(guess)\n",
|
|
" print()\n",
|
|
" print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n",
|
|
"\n",
|
|
"\n",
|
|
"def decision_tree(data, testing):\n",
|
|
" print(f'Decision Tree examined {len(data)} samples and built the following tree:', end='')\n",
|
|
" rules = recur_tree(data)\n",
|
|
" print_tree(rules)\n",
|
|
" print('\\n---')\n",
|
|
" print(\"Confusion Matrix\")\n",
|
|
" confusion, correct = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}, 0\n",
|
|
" \n",
|
|
" for i, row in testing.iterrows():\n",
|
|
" guess = test_tree(row, rules)\n",
|
|
" confusion[row[class_attr]][guess] += 1\n",
|
|
" if row[class_attr] == guess: correct += 1 \n",
|
|
" \n",
|
|
" for (actual,guess) in confusion.items():\n",
|
|
" print(guess)\n",
|
|
"\n",
|
|
" print()\n",
|
|
" print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n",
|
|
" \n",
|
|
" # Test with sklearn tree\n",
|
|
" dtc = tree.DecisionTreeClassifier()\n",
|
|
" x,y = load_iris(return_X_y=True)\n",
|
|
" x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=(1-split), random_state=0)\n",
|
|
" y_pred = dtc.fit(x_train, y_train).predict(x_test)\n",
|
|
" print(f'skLearn accuracy: {sum(y_pred == y_test)*100/len(y_pred)}%')\n",
|
|
"\n",
|
|
"def recur_tree(data):\n",
|
|
" rules = {}\n",
|
|
" \n",
|
|
" # Find info gain per attrT\n",
|
|
" info = calc_info(data)\n",
|
|
" if info == 0:\n",
|
|
" return data[class_attr].unique()[0]\n",
|
|
" \n",
|
|
" # gain = {attr:sum([info - calc_info(data[data[attr] == v]) for v in data[attr].unique()]) for attr in data if attr != class_attr}\n",
|
|
" gain = {attr:0 for attr in data if attr != class_attr}\n",
|
|
" for attr in gain:\n",
|
|
" for v in data[attr].unique():\n",
|
|
" gain[attr] += info - calc_info(data[data[attr] == v])\n",
|
|
" \n",
|
|
" # Choose highest info gain\n",
|
|
" attr = max(gain, key=gain.get)\n",
|
|
" if (gain[attr] == 0): \n",
|
|
" return data[class_attr].unique()[0]\n",
|
|
" \n",
|
|
" # Split data based on values of attr and recur\n",
|
|
" rules[attr] = {}\n",
|
|
" for v in data[attr].unique():\n",
|
|
" rules[attr][v] = recur_tree(data[data[attr] == v])\n",
|
|
"\n",
|
|
" return rules\n",
|
|
" \n",
|
|
"def calc_info(data):\n",
|
|
" return abs(sum([(count/len(data))*math.log((count/len(data)), 2) for count in data[class_attr].value_counts()]))\n",
|
|
" \n",
|
|
"def print_tree(rules, indent=0):\n",
|
|
" if type(rules) != dict: return rules\n",
|
|
" \n",
|
|
" for key in rules.keys():\n",
|
|
" print('\\n'+' '*3*indent + f'* {key}', end='')\n",
|
|
" s = print_tree(rules[key], indent + 1)\n",
|
|
" if s: print(f' --> {s}', end='')\n",
|
|
" \n",
|
|
" return None\n",
|
|
"\n",
|
|
"def test_tree(row, rules):\n",
|
|
" if type(rules) != dict: return rules\n",
|
|
" \n",
|
|
" attr = list(rules.keys())[0]\n",
|
|
" return test_tree(row, rules[attr][row[attr]])\n",
|
|
"\n",
|
|
"def naive_bayes(data, testing):\n",
|
|
" confusion, correct = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}, 0\n",
|
|
" class_freq = {c:(len(data[data[class_attr] == c])) for c in data[class_attr].unique()}\n",
|
|
" for i, row in testing.iterrows():\n",
|
|
" probs = {c:(len(data[data[class_attr] == c]))/len(data) for c in data[class_attr].unique()}\n",
|
|
" \n",
|
|
" for attr in data:\n",
|
|
" if attr != class_attr:\n",
|
|
" same_value = data[data[attr] == row[attr]]\n",
|
|
" for c in class_freq.keys():\n",
|
|
" probs[c] *= len(same_value[same_value[class_attr] == c])/class_freq[c]\n",
|
|
" \n",
|
|
" guess = max(probs, key=probs.get)\n",
|
|
" confusion[row[class_attr]][guess] += 1\n",
|
|
" if row[class_attr] == guess: correct += 1\n",
|
|
" \n",
|
|
" print(f'Naive Bayes examined {len(data)} samples')\n",
|
|
" print('---')\n",
|
|
" print(\"Confusion Matrix\")\n",
|
|
" for (actual,guess) in confusion.items():\n",
|
|
" print(guess)\n",
|
|
" print()\n",
|
|
" print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n",
|
|
" \n",
|
|
" # Test with sklearn GaussianNaiveBayes\n",
|
|
" nb = GaussianNB()\n",
|
|
" x,y = load_iris(return_X_y=True)\n",
|
|
" x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=(1-split), random_state=0)\n",
|
|
" y_pred = nb.fit(x_train, y_train).predict(x_test)\n",
|
|
" print(f'skLearn accuracy: {sum(y_pred == y_test)*100/len(y_pred)}%')\n",
|
|
" \n",
|
|
"if __name__ == '__main__':\n",
|
|
" main()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python (default)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|