{ "cells": [ { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Naive Bayes examined 100 samples\n", "---\n", "Confusion Matrix\n", "{'Iris-versicolor': 16, 'Iris-virginica': 3, 'Iris-setosa': 0}\n", "{'Iris-versicolor': 1, 'Iris-virginica': 15, 'Iris-setosa': 0}\n", "{'Iris-versicolor': 0, 'Iris-virginica': 0, 'Iris-setosa': 15}\n", "\n", "Accuracy: 92.0%\n", "skLearn accuracy: 96.0%\n" ] } ], "source": [ "import math\n", "import pandas as pd\n", "from sklearn import preprocessing, tree, model_selection\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.datasets import load_iris\n", "\n", "filename = 'iris.csv'\n", "needs_discretized = True\n", "class_attr = 'class'\n", "split = .67\n", "classifier = 3\n", "\n", "def main():\n", " # Read CSV\n", " df = pd.read_csv(filename)\n", " \n", " # Randomize Order\n", " df = df.sample(frac=1)\n", "\n", " # Discretize\n", " if needs_discretized:\n", " for col in df:\n", " if col != class_attr:\n", " df[col] = pd.qcut(df[col], q=5)\n", " \n", " # Split Data\n", " if split != 1:\n", " testing = df.head(-math.floor(len(df)*split))\n", " data = df.head(math.floor(len(df)*split))\n", " else:\n", " testing = data = df\n", " \n", " # Choose Classifier\n", " if classifier == 1:\n", " r1(data, testing)\n", " elif classifier == 2:\n", " decision_tree(data, testing)\n", " else:\n", " naive_bayes(data, testing)\n", " \n", "def r1(data, testing):\n", " # Set up big dictionary\n", " rules = dict()\n", " \n", " for attr in data:\n", " if attr != class_attr:\n", " rules[attr] = dict()\n", "\n", " # Loop thru data\n", " for attr in data:\n", " if attr != class_attr:\n", " freq = {v:{c:0 for c in data[class_attr].unique()} for v in data[attr].unique()}\n", " for i, sample in data.iterrows():\n", " freq[sample[attr]][sample[class_attr]] += 1\n", " \n", " attr_rule = dict()\n", " error = 0\n", " for (k,v) in freq.items():\n", " rule = max(v, key=v.get)\n", " for c in v:\n", " if c != rule:\n", " error += v[c]\n", " attr_rule[k] = rule\n", " error /= len(data)\n", " rules[attr] = (attr_rule, error)\n", " \n", " # Select best attr\n", " best_attr = min(rules, key=lambda x: rules[x][1])\n", " rule = rules[best_attr][0]\n", " print(f'R1 chose {best_attr}')\n", " print(print_tree(rule))\n", " print('---')\n", " \n", " confusion = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}\n", " \n", " correct = 0\n", " for i, row in testing.iterrows():\n", " confusion[row[class_attr]][rule[row[best_attr]]] += 1\n", " if row[class_attr] == rule[row[best_attr]]: correct += 1\n", " \n", " print(\"Confusion Matrix\")\n", " \n", " for (actual,guess) in confusion.items():\n", " print(guess)\n", " print()\n", " print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n", "\n", "\n", "def decision_tree(data, testing):\n", " print(f'Decision Tree examined {len(data)} samples and built the following tree:', end='')\n", " rules = recur_tree(data)\n", " print_tree(rules)\n", " print('\\n---')\n", " print(\"Confusion Matrix\")\n", " confusion, correct = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}, 0\n", " \n", " for i, row in testing.iterrows():\n", " guess = test_tree(row, rules)\n", " confusion[row[class_attr]][guess] += 1\n", " if row[class_attr] == guess: correct += 1 \n", " \n", " for (actual,guess) in confusion.items():\n", " print(guess)\n", "\n", " print()\n", " print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n", " 
\n", " # Test with sklearn tree\n", " dtc = tree.DecisionTreeClassifier()\n", " x,y = load_iris(return_X_y=True)\n", " x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=(1-split), random_state=0)\n", " y_pred = dtc.fit(x_train, y_train).predict(x_test)\n", " print(f'skLearn accuracy: {sum(y_pred == y_test)*100/len(y_pred)}%')\n", "\n", "def recur_tree(data):\n", " rules = {}\n", " \n", " # Find info gain per attrT\n", " info = calc_info(data)\n", " if info == 0:\n", " return data[class_attr].unique()[0]\n", " \n", " # gain = {attr:sum([info - calc_info(data[data[attr] == v]) for v in data[attr].unique()]) for attr in data if attr != class_attr}\n", " gain = {attr:0 for attr in data if attr != class_attr}\n", " for attr in gain:\n", " for v in data[attr].unique():\n", " gain[attr] += info - calc_info(data[data[attr] == v])\n", " \n", " # Choose highest info gain\n", " attr = max(gain, key=gain.get)\n", " if (gain[attr] == 0): \n", " return data[class_attr].unique()[0]\n", " \n", " # Split data based on values of attr and recur\n", " rules[attr] = {}\n", " for v in data[attr].unique():\n", " rules[attr][v] = recur_tree(data[data[attr] == v])\n", "\n", " return rules\n", " \n", "def calc_info(data):\n", " return abs(sum([(count/len(data))*math.log((count/len(data)), 2) for count in data[class_attr].value_counts()]))\n", " \n", "def print_tree(rules, indent=0):\n", " if type(rules) != dict: return rules\n", " \n", " for key in rules.keys():\n", " print('\\n'+' '*3*indent + f'* {key}', end='')\n", " s = print_tree(rules[key], indent + 1)\n", " if s: print(f' --> {s}', end='')\n", " \n", " return None\n", "\n", "def test_tree(row, rules):\n", " if type(rules) != dict: return rules\n", " \n", " attr = list(rules.keys())[0]\n", " return test_tree(row, rules[attr][row[attr]])\n", "\n", "def naive_bayes(data, testing):\n", " confusion, correct = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}, 0\n", " class_freq = {c:(len(data[data[class_attr] == c])) for c in data[class_attr].unique()}\n", " for i, row in testing.iterrows():\n", " probs = {c:(len(data[data[class_attr] == c]))/len(data) for c in data[class_attr].unique()}\n", " \n", " for attr in data:\n", " if attr != class_attr:\n", " same_value = data[data[attr] == row[attr]]\n", " for c in class_freq.keys():\n", " probs[c] *= len(same_value[same_value[class_attr] == c])/class_freq[c]\n", " \n", " guess = max(probs, key=probs.get)\n", " confusion[row[class_attr]][guess] += 1\n", " if row[class_attr] == guess: correct += 1\n", " \n", " print(f'Naive Bayes examined {len(data)} samples')\n", " print('---')\n", " print(\"Confusion Matrix\")\n", " for (actual,guess) in confusion.items():\n", " print(guess)\n", " print()\n", " print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n", " \n", " # Test with sklearn GaussianNaiveBayes\n", " nb = GaussianNB()\n", " x,y = load_iris(return_X_y=True)\n", " x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=(1-split), random_state=0)\n", " y_pred = nb.fit(x_train, y_train).predict(x_test)\n", " print(f'skLearn accuracy: {sum(y_pred == y_test)*100/len(y_pred)}%')\n", " \n", "if __name__ == '__main__':\n", " main()" ] } ], "metadata": { "kernelspec": { "display_name": "Python (default)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", 
"pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }