""" prpy module trees.py Jason Corso (jcorso@acm.org) This module has been programmed to support teaching an introduction to pattern recognition course. Contains tree classifiers """ # local imports import datatools # global imports import numpy as np kDT_MaxDepth = 5 # maximum depth we'll grow a decision tree kDT_LenRanges = 100 # how many thresholds to consider when selecting AxisAligned query at a node kVerbose = True class DTNode(object): ''' A Decision Tree Node class. Links to children via reference list. ''' def __init__(self,q=None,left=None,right=None): self.q = q self.child = [left,right] def isLeaf(self): return False def walk(self,sample): if self.q.query(sample)==0: return self.child[0] else: return self.child[1] class DTLeafNode(DTNode): ''' A Leaf Node of the decision tree that stores an empirical density. ''' def __init__(self,density=None): self.density = density if (self.density): self.computeMode_() def computeDensity(self,X,Y): ''' Compute and store the empirical density at this leaf. ''' U = np.unique(Y) self.density = np.zeros(U.max()+1) for c in U: yc = np.nonzero(Y==c)[0] # like matlab's find() if (len(yc)==0): continue self.density[c] = len(yc)/np.double(len(Y)) self.computeMode_() def computeMode_(self): ''' Compute and store the mode of the empirical density at this leaf. ''' m = 0.0 self.mode = None for i in range(len(self.density)): if (self.density[i] > m): m = self.density[i] self.mode = i if kVerbose: print "Density is " , self.density print "Mode of the leaf is %d\n" % self.mode def isLeaf(self): return True def walk(self,sample): return self class DTQuery(object): ''' An empty parent class to store a query for the decision tree. ''' pass class DTQuery_AxisAligned(DTQuery): ''' An axis-aligned query (basically thresholds on one coordinate in the value array. ''' def __init__(self,query_index=None,tau=None): self.qi = query_index self.tau = tau def __repr__(self): return "Query_AxisAligned: index: %d, tau %f\n" % (self.qi,self.tau) def query(self,value): ''' A simple query response tester on the value (a vector) index query_index. If the value is less then tau, a 0 is returned and if it is greater than tau a 1 is returned. ''' if value[self.qi] < self.tau: return 0 else: return 1 class DTree(object): ''' A Decision Tree class. ''' def __init__(self,root=None): self.root = root def classify(self,sample): ''' Classify the sample using the tree. ''' leaf = self.findLeaf(sample) return leaf.mode def findLeaf(self,sample): ''' Find the leaf node in the tree for a given sample. ''' node = self.root; while (node.isLeaf() is not True): node = node.walk(sample) return node def probability(self,sample): ''' Return the empirical distribution over the classes for the sample. ''' leaf = self.findLeaf(sample) return leaf.density def impurity_entropy(Y): ''' Calculate the entropy impurity for a data set (Y). ''' U = np.unique(Y) H = 0.0; for c in U: yc = np.nonzero(Y==c)[0] # like matlab's find() if (len(yc)==0): continue cportion = len(yc)/np.double(len(Y)) H += cportion * np.log(cportion) return -1.0 * H def trainDTree_prepRanges(X): ''' Prepare a ranges list for training in the trainDTree_AxisAligned. The ranges list just basically goes through the data-range of each dimension in X and gathers a set of possible thresholds for it. ''' R = [] for d in range(X.shape[1]): dmin = np.min(X[:,d]) dmax = np.max(X[:,d]) dstep = np.abs((dmax-dmin)/np.double(kDT_LenRanges)) if (dstep < 0.00001): R.append(np.zeros(kDT_LenRanges)) else: R.append(np.arange(dmin,dmax,dstep)) return R def trainDTree_AxisAligned(X,Y,impurity,depth=0): ''' Train a decision tree using data set (X,Y). Convention on (X,Y) is that each row is a sample in the data set, X has the values of the data and Y has the class label. impurity is a function that takes X,Y and returns the impurity of the data set ***Specific to an axis-aligned query case, which is a fairly general case... ***Code flow for clarity rather than speed... ''' # could assume there is some impurity coming in, but just recompute it I = impurity(Y) if depth>kDT_MaxDepth or I==0: ''' We're at a leaf-node and need to return as such. ''' leaf = DTLeafNode() leaf.computeDensity(X,Y) return leaf ranges = trainDTree_prepRanges(X) dim = len(ranges) assert(dim == X.shape[1]) num = X.shape[0] bestIg = 0 Q = DTQuery_AxisAligned() bestQ = DTQuery_AxisAligned() bestL = None # store the splits for simplicity bestR = None # cycle through the data indices and possible thresholds to find the best one (the arg max) for qi in range(dim): Q.qi = qi for t in ranges[qi]: Q.tau = t # calculate the splits L = [] R = [] for i in range(num): if Q.query(X[i,:])==0: L.append(i) else: R.append(i) # calculate the impurity gradient Lportion = len(L) / np.double(num) Limpurity = impurity(Y[L]) Rportion = len(R) / np.double(num) Rimpurity = impurity(Y[R]) Ig = I - Lportion*Limpurity - Rportion*Rimpurity if (Ig > bestIg): bestIg = Ig bestQ.qi = qi bestQ.tau = t bestL = L bestR = R # If we cannot find a split at this node, just turn it into a leaf # BUGFIX 2/17/12 jcorso if (bestL is None) or (bestR is None): leaf = DTLeafNode() leaf.computeDensity(X,Y) return leaf # bestQ is the arg max_queries of impurity gradient leftChild = trainDTree_AxisAligned(X[bestL,:],Y[bestL],impurity,depth+1) rightChild = trainDTree_AxisAligned(X[bestR,:],Y[bestR],impurity,depth+1) if (depth>0): return DTNode(bestQ,leftChild,rightChild) else: return DTree(DTNode(bestQ,leftChild,rightChild))