ml/w5/lab6.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Naive Bayes examined 100 samples\n",
"---\n",
"Confusion Matrix\n",
"{'Iris-versicolor': 16, 'Iris-virginica': 3, 'Iris-setosa': 0}\n",
"{'Iris-versicolor': 1, 'Iris-virginica': 15, 'Iris-setosa': 0}\n",
"{'Iris-versicolor': 0, 'Iris-virginica': 0, 'Iris-setosa': 15}\n",
"\n",
"Accuracy: 92.0%\n",
"skLearn accuracy: 96.0%\n"
]
}
],
"source": [
"import math\n",
"import pandas as pd\n",
"from sklearn import preprocessing, tree, model_selection\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.datasets import load_iris\n",
"\n",
"filename = 'iris.csv'\n",
"needs_discretized = True\n",
"class_attr = 'class'\n",
"split = .67\n",
"classifier = 3\n",
"\n",
"def main():\n",
" # Read CSV\n",
" df = pd.read_csv(filename)\n",
" \n",
" # Randomize Order\n",
" df = df.sample(frac=1)\n",
"\n",
" # Discretize\n",
" if needs_discretized:\n",
" for col in df:\n",
" if col != class_attr:\n",
" df[col] = pd.qcut(df[col], q=5)\n",
" \n",
" # Split Data\n",
" if split != 1:\n",
" testing = df.head(-math.floor(len(df)*split))\n",
" data = df.head(math.floor(len(df)*split))\n",
" else:\n",
" testing = data = df\n",
" \n",
" # Choose Classifier\n",
" if classifier == 1:\n",
" r1(data, testing)\n",
" elif classifier == 2:\n",
" decision_tree(data, testing)\n",
" else:\n",
" naive_bayes(data, testing)\n",
" \n",
"def r1(data, testing):\n",
" # Set up big dictionary\n",
" rules = dict()\n",
" \n",
" for attr in data:\n",
" if attr != class_attr:\n",
" rules[attr] = dict()\n",
"\n",
" # Loop thru data\n",
" for attr in data:\n",
" if attr != class_attr:\n",
" freq = {v:{c:0 for c in data[class_attr].unique()} for v in data[attr].unique()}\n",
" for i, sample in data.iterrows():\n",
" freq[sample[attr]][sample[class_attr]] += 1\n",
" \n",
" attr_rule = dict()\n",
" error = 0\n",
" for (k,v) in freq.items():\n",
" rule = max(v, key=v.get)\n",
" for c in v:\n",
" if c != rule:\n",
" error += v[c]\n",
" attr_rule[k] = rule\n",
" error /= len(data)\n",
" rules[attr] = (attr_rule, error)\n",
" \n",
" # Select best attr\n",
" best_attr = min(rules, key=lambda x: rules[x][1])\n",
" rule = rules[best_attr][0]\n",
" print(f'R1 chose {best_attr}')\n",
" print(print_tree(rule))\n",
" print('---')\n",
" \n",
" confusion = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}\n",
" \n",
" correct = 0\n",
" for i, row in testing.iterrows():\n",
" confusion[row[class_attr]][rule[row[best_attr]]] += 1\n",
" if row[class_attr] == rule[row[best_attr]]: correct += 1\n",
" \n",
" print(\"Confusion Matrix\")\n",
" \n",
" for (actual,guess) in confusion.items():\n",
" print(guess)\n",
" print()\n",
" print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n",
"\n",
"\n",
"def decision_tree(data, testing):\n",
" print(f'Decision Tree examined {len(data)} samples and built the following tree:', end='')\n",
" rules = recur_tree(data)\n",
" print_tree(rules)\n",
" print('\\n---')\n",
" print(\"Confusion Matrix\")\n",
" confusion, correct = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}, 0\n",
" \n",
" for i, row in testing.iterrows():\n",
" guess = test_tree(row, rules)\n",
" confusion[row[class_attr]][guess] += 1\n",
" if row[class_attr] == guess: correct += 1 \n",
" \n",
" for (actual,guess) in confusion.items():\n",
" print(guess)\n",
"\n",
" print()\n",
" print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n",
" \n",
" # Test with sklearn tree\n",
" dtc = tree.DecisionTreeClassifier()\n",
" x,y = load_iris(return_X_y=True)\n",
" x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=(1-split), random_state=0)\n",
" y_pred = dtc.fit(x_train, y_train).predict(x_test)\n",
" print(f'skLearn accuracy: {sum(y_pred == y_test)*100/len(y_pred)}%')\n",
"\n",
"def recur_tree(data):\n",
" rules = {}\n",
" \n",
" # Find info gain per attrT\n",
" info = calc_info(data)\n",
" if info == 0:\n",
" return data[class_attr].unique()[0]\n",
" \n",
" # gain = {attr:sum([info - calc_info(data[data[attr] == v]) for v in data[attr].unique()]) for attr in data if attr != class_attr}\n",
" gain = {attr:0 for attr in data if attr != class_attr}\n",
" for attr in gain:\n",
" for v in data[attr].unique():\n",
" gain[attr] += info - calc_info(data[data[attr] == v])\n",
" \n",
" # Choose highest info gain\n",
" attr = max(gain, key=gain.get)\n",
" if (gain[attr] == 0): \n",
" return data[class_attr].unique()[0]\n",
" \n",
" # Split data based on values of attr and recur\n",
" rules[attr] = {}\n",
" for v in data[attr].unique():\n",
" rules[attr][v] = recur_tree(data[data[attr] == v])\n",
"\n",
" return rules\n",
" \n",
"def calc_info(data):\n",
" return abs(sum([(count/len(data))*math.log((count/len(data)), 2) for count in data[class_attr].value_counts()]))\n",
" \n",
"def print_tree(rules, indent=0):\n",
" if type(rules) != dict: return rules\n",
" \n",
" for key in rules.keys():\n",
" print('\\n'+' '*3*indent + f'* {key}', end='')\n",
" s = print_tree(rules[key], indent + 1)\n",
" if s: print(f' --> {s}', end='')\n",
" \n",
" return None\n",
"\n",
"def test_tree(row, rules):\n",
" if type(rules) != dict: return rules\n",
" \n",
" attr = list(rules.keys())[0]\n",
" return test_tree(row, rules[attr][row[attr]])\n",
"\n",
"def naive_bayes(data, testing):\n",
" confusion, correct = {v:{c:0 for c in data[class_attr].unique()} for v in data[class_attr].unique()}, 0\n",
" class_freq = {c:(len(data[data[class_attr] == c])) for c in data[class_attr].unique()}\n",
" for i, row in testing.iterrows():\n",
" probs = {c:(len(data[data[class_attr] == c]))/len(data) for c in data[class_attr].unique()}\n",
" \n",
" for attr in data:\n",
" if attr != class_attr:\n",
" same_value = data[data[attr] == row[attr]]\n",
" for c in class_freq.keys():\n",
" probs[c] *= len(same_value[same_value[class_attr] == c])/class_freq[c]\n",
" \n",
" guess = max(probs, key=probs.get)\n",
" confusion[row[class_attr]][guess] += 1\n",
" if row[class_attr] == guess: correct += 1\n",
" \n",
" print(f'Naive Bayes examined {len(data)} samples')\n",
" print('---')\n",
" print(\"Confusion Matrix\")\n",
" for (actual,guess) in confusion.items():\n",
" print(guess)\n",
" print()\n",
" print(f'Accuracy: {round((correct/len(testing))*100, 3)}%')\n",
" \n",
" # Test with sklearn GaussianNaiveBayes\n",
" nb = GaussianNB()\n",
" x,y = load_iris(return_X_y=True)\n",
" x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=(1-split), random_state=0)\n",
" y_pred = nb.fit(x_train, y_train).predict(x_test)\n",
" print(f'skLearn accuracy: {sum(y_pred == y_test)*100/len(y_pred)}%')\n",
" \n",
"if __name__ == '__main__':\n",
" main()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (default)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}