import pandas as pd
import numpy as np
import math
import os
os.chdir("d:/WORK/Dropbox/__ZZD__/ZZD/")
Loading the mushroom dataset (UCI ML Repository):
columns = ["edible", "cap-shape", "cap-surface", "cap-color", "bruises?",
"odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color",
"stalk-shape", "stalk-root", "stalk-surface-above-ring",
"stalk-surface-below-ring", "stalk-color-above-ring",
"stalk-color-below-ring", "veil-type", "veil-color", "ring-number",
"ring-type", "spore-print-color", "population", "habitat"
]
dataset = pd.read_csv("mushroom.data",
names=columns, index_col=None)
X = dataset.drop("edible", axis=1)
y = dataset["edible"]
X.head()
We will need to find the unique values in a column:
col = "cap-shape"
attr_values = X[col].unique()
attr_values
And the counts of the individual values:
X[col].value_counts()
As the criterion for choosing the attribute on which to split the data, we will use a measure of uncertainty: Shannon entropy, H(y) = −Σᵢ pᵢ·log₂(pᵢ), where pᵢ is the relative frequency of class i in y:
def H(y):
    # Shannon entropy of a series of class labels
    total = len(y)
    result = 0
    for count in y.value_counts():
        if count == 0:
            continue
        p = count / total
        result -= p * math.log2(p)
    return result
H(y)
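A quick sanity check on toy data (our addition, not from the dataset): a perfectly balanced binary series has entropy of exactly 1 bit, a pure one 0 bits:
print(H(pd.Series(["e", "p", "e", "p"])))  # 1.0
print(H(pd.Series(["e", "e", "e", "e"])))  # 0.0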
More precisely, we will pick the attribute that yields the largest reduction of impurity (information gain): H(y) − Σⱼ (|yⱼ|/|y|)·H(yⱼ), where the yⱼ are the groups produced by the split:
def reduction_of_impurity(y, y_splits, impurity=H):
    # impurity of the whole set minus the weighted
    # average impurity of the splits
    result = impurity(y)
    total = len(y)
    for y_split in y_splits:
        result -= len(y_split) / total * impurity(y_split)
    return result
Intended usage:
attr_values = X[col].unique()
indices = [(X[col] == value) for value in attr_values]
reduction_of_impurity(y,[y[index] for index in indices])
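As a sanity check (our addition): a degenerate "split" that keeps all of y in one group reduces the impurity by exactly zero, which is the condition the tree induction below will use to stop:
print(reduction_of_impurity(y, [y]))  # 0.0: H(y) - 1 * H(y)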
We can wrap this in a function:
def split_by_values(X, y, col):
    # partition y according to the values of X[col]
    attr_values = X[col].unique()
    indices = [(X[col] == value) for value in attr_values]
    return [y[index] for index in indices]
reduction_of_impurity(y,split_by_values(X,y,col))
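With this helper we can also rank all attributes at once (an illustrative snippet, our addition; the variable gains is hypothetical):
# rank attributes by their impurity reduction (information gain)
gains = {c: reduction_of_impurity(y, split_by_values(X, y, c))
         for c in X.columns}
sorted(gains.items(), key=lambda kv: kv[1], reverse=True)[:5]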
(everything is explained in the comments)
class node:
    def __init__(self):
        # flag: False for an internal node,
        # True for a leaf (decision) node
        self.decisionNode = False
        # attribute used for the split
        # in an internal node
        self.attribute = None
        # value of the target attribute (majority class) --
        # we keep it not only in leaf nodes; internal nodes
        # use it as a fallback during prediction
        self.decision = None
        # counts of the training cases that were
        # used when inducing this node
        self.decisionValues = None
        # children and parent
        self.childs = {}
        self.parent = None
class decision_tree:
    def __init__(self):
        self.root = None

    def stopping_criterion(self, X, y, lvl):
        # X -- data in the node
        # y -- their classes
        # lvl -- current depth
        return y.value_counts().size == 1

    def splitting_criterion(self, y, y_splits):
        return reduction_of_impurity(y, y_splits)

    def predict(self, X):
        decisions = []
        for _, row in X.iterrows():
            # descend from the root to a decision node
            actualNode = self.root
            while not actualNode.decisionNode:
                value = row[actualNode.attribute]
                nextnode = actualNode.childs.get(value)
                # if there is no matching subtree,
                # fall back to this node's decision
                if nextnode is None:
                    decisions.append(actualNode.decision)
                    break
                actualNode = nextnode
            else:
                decisions.append(actualNode.decision)
        return pd.Series(decisions, dtype="category")

    def fit(self, X, y):
        self.root = self.induce_tree(X, y, 0)
        return self

    def induce_tree(self, X, y, lvl=0):
        result = node()
        result.decisionValues = y.value_counts()
        result.decision = result.decisionValues.index[0]
        # if the stopping criterion is met,
        # end the induction (return a leaf node)
        if self.stopping_criterion(X, y, lvl):
            result.decisionNode = True
            return result
        # otherwise look for the attribute with the
        # maximal splitting criterion
        max_crit = -1
        max_attribute = None
        for col in X.columns:
            # prepare the splits and
            # compute the splitting criterion
            crit = self.splitting_criterion(y, split_by_values(X, y, col))
            if crit > max_crit:
                max_crit = crit
                max_attribute = col
        # OK, we have the best attribute to split on;
        # if even this attribute does not reduce the impurity
        # (criterion is 0), end the induction (create a leaf node)
        if max_crit <= 0.0000001:
            result.decisionNode = True
            return result
        # generate the children
        max_attr_values = X[max_attribute].unique()
        max_indices = [(X[max_attribute] == value) for value in max_attr_values]
        childs = {value: self.induce_tree(X[index], y[index], lvl + 1)
                  for value, index in zip(max_attr_values, max_indices)}
        result.decisionNode = False
        result.childs = childs
        result.attribute = max_attribute
        return result
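To inspect the induced tree, a minimal textual printer can help; print_tree below is our sketch, not part of the original class:
def print_tree(n, indent=""):
    # leaf: print the decision
    if n.decisionNode:
        print(indent + "-> " + str(n.decision))
        return
    # internal node: one branch per attribute value
    for value, child in n.childs.items():
        print(indent + str(n.attribute) + " = " + str(value))
        print_tree(child, indent + "    ")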
We can try it out:
DT = decision_tree()
DT.fit(X,y)
DT.predict(X)
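On the training data itself the tree should be (nearly) perfect; a quick resubstitution check (our addition):
# fraction of training rows predicted correctly
(DT.predict(X).values == y.values).mean()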
and evaluate it using k-fold cross-validation:
# k-fold cross-validation, k = 10
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
k = 10
kf = StratifiedKFold(n_splits=k, shuffle=True)
acc = 0
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    DT.fit(X_train, y_train)
    y_pred = DT.predict(X_test)
    acc += accuracy_score(y_test, y_pred)
print(acc / k)
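classification_report is imported above but never used; it can, for example, summarize per-class precision and recall on the last fold left over from the loop (our addition):
# per-class metrics for the last fold
print(classification_report(y_test, y_pred))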