机器学习 | 决策树

jiezi

6 年前

由于近期学业繁重 QAQ，所以我就不说废话了，直接上代码~
运行效果

代码
from math import log
import operator
import matplotlib.pyplot as plt

# Text-box and arrow styles used by the tree-drawing helpers.
# (The original literals used typographic quotes, which are not valid Python.)
decisionNode = dict(boxstyle="sawtooth", fc="0.8")  # box style for decision nodes
leafNode = dict(boxstyle="round4", fc="0.8")  # box style for leaf nodes
arrow_args = dict(arrowstyle="<-")  # passed as ``arrowprops`` when annotating

# 画树

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one tree node as a boxed text annotation with an arrow.

    Args:
        nodeTxt: Text displayed inside the node's box.
        centerPt: (x, y) position of the text box, in axes-fraction units.
        parentPt: (x, y) position the annotation is anchored to (the parent
            node), in axes-fraction units.
        nodeType: Box-style dict passed as ``bbox`` (``decisionNode`` or
            ``leafNode`` in this file).

    Draws onto ``createPlot.ax1``, so ``createPlot`` must have assigned that
    attribute before this function is called.
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords="axes fraction",
        xytext=centerPt,
        textcoords="axes fraction",
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )

def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` at the midpoint between a child and its parent.

    Args:
        cntrPt: (x, y) position of the child node.
        parentPt: (x, y) position of the parent node.
        txtString: Text to place on the connecting edge.

    Draws onto ``createPlot.ax1``, so ``createPlot`` must have assigned that
    attribute before this function is called.
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below ``parentPt``.

    Args:
        myTree: Nested dict of the form ``{feature_label: {value: subtree}}``
            where each subtree is either another such dict or a leaf label.
        parentPt: (x, y) position of the parent node.
        nodeTxt: Text written on the edge from the parent to this subtree.

    Layout state lives in function attributes that ``createPlot`` initialises:
    ``plotTree.totalW`` / ``plotTree.totalD`` scale the horizontal / vertical
    steps, and ``plotTree.xOff`` / ``plotTree.yOff`` hold the x of the most
    recently placed leaf and the y of the current level.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    # Centre this decision node over the horizontal span its leaves will use.
    cntrPt = (
        plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
        plotTree.yOff,
    )
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step down one level before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key, child in secondDict.items():
        if isinstance(child, dict):
            # A nested dict is another decision node: recurse.
            plotTree(child, cntrPt, str(key))
        else:
            # Anything else is a leaf: advance one leaf slot to the right.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            leafPt = (plotTree.xOff, plotTree.yOff)
            plotNode(child, leafPt, cntrPt, leafNode)
            plotMidText(leafPt, cntrPt, str(key))
    # Restore the level so siblings of this subtree are drawn at the right y.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    """Create a figure, draw the decision tree ``inTree`` on it and show it.

    Args:
        inTree: Nested dict tree as produced by ``createTree``.

    Side effects: clears and reuses matplotlib figure 1, stores the axes on
    ``createPlot.ax1``, initialises the layout attributes on ``plotTree``
    and finally calls ``plt.show()``.
    """
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    # Vertical step is 1/totalD, so it must come from the tree depth; the
    # original used the leaf count here, which compressed the levels.
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot to the left so the first leaf lands mid-slot.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    # The root has no incoming edge, so its edge label is the empty string.
    plotTree(inTree, (0.5, 1.0), "")
    plt.show()

def createDataSet():
    """Return the small sample data set and its feature labels.

    Returns:
        A ``(dataSet, labels)`` tuple. Each row of ``dataSet`` holds two
        feature values followed by the class label (``'yes'`` or ``'no'``);
        ``labels`` names the two feature columns in order.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels

def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in ``dataSet``.

    The higher the entropy, the more mixed the class labels are.

    Args:
        dataSet: Sequence of rows; the last element of each row is taken as
            the class label.

    Returns:
        The entropy as a float. An empty ``dataSet`` yields ``0.0``.
    """
    numEntries = len(dataSet)
    # Count how many rows carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        # Logarithm to base 2.
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    Args:
        dataSet: Sequence of rows (lists), e.g. ``[1, 1, 'yes']``.
        axis: Index of the feature column to split on.
        value: Feature value a row must have in column ``axis`` to be kept.

    Returns:
        A new list containing, for every matching row, a new row made of the
        elements before and after column ``axis``. ``dataSet`` and its rows
        are not modified.
    """
    return [
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]

def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split gives the largest information gain.

    Args:
        dataSet: Non-empty sequence of rows; every column except the last is
            a feature and the last column is the class label.

    Returns:
        The index of the best feature column, or ``-1`` when no feature
        yields a strictly positive information gain.
    """
    numFeatures = len(dataSet[0]) - 1
    numRows = float(len(dataSet))
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Distinct values that appear in column i.
        uniqueVals = {example[i] for example in dataSet}
        # Entropy after splitting on column i, weighted by subset size.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / numRows
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        # Strict comparison: on ties the earliest column wins.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``.

    Args:
        classList: Non-empty sequence of class labels.

    Returns:
        The most frequent label. When several labels share the highest
        count, the one that appears first in ``classList`` is returned.

    Raises:
        IndexError: If ``classList`` is empty.
    """
    # Map each distinct label to the number of times it occurs.
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Sort (label, count) pairs by count, largest first. The original wrote
    # ``classCount,iteritems()``, which raises NameError under Python 3.
    sortedClassCount = sorted(
        classCount.items(), key=operator.itemgetter(1), reverse=True
    )
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    """Recursively build a decision tree from ``dataSet``.

    Args:
        dataSet: Non-empty sequence of rows; the last element of each row is
            the class label and the others are feature values.
        labels: Names of the feature columns, in column order. This list is
            not modified.

    Returns:
        Either a class label (for a leaf) or a nested dict of the form
        ``{feature_label: {feature_value: subtree, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Stop 1: every row has the same class, so this is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop 2: all features are used up but the classes are still mixed;
    # fall back to the most common class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature gives any information gain. Using it as an index
    # would split on the class column, so fall back to the majority class.
    if bestFeat == -1:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Labels left after removing the chosen feature. Built as a new list so
    # the caller's ``labels`` is left intact (the original deleted in place).
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = {example[bestFeat] for example in dataSet}
    for value in uniqueVals:
        # One branch per distinct value of the chosen feature, e.g.
        # myTree['no surfacing'][0], myTree['no surfacing'][1], ...
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )
    return myTree

def getNumLeafs(myTree):
    """Count the leaf nodes of a nested-dict decision tree.

    Args:
        myTree: Dict of the form ``{feature_label: {value: subtree}}``; only
            its first key is inspected. A subtree that is a dict is treated
            as another decision node, anything else as a leaf.

    Returns:
        The number of leaves as an int.
    """
    firstStr = list(myTree)[0]
    numLeafs = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            # Decision node: add the leaves found below it.
            numLeafs += getNumLeafs(child)
        else:
            # Leaf node.
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    """Return the number of decision levels in a nested-dict decision tree.

    Args:
        myTree: Dict of the form ``{feature_label: {value: subtree}}``; only
            its first key is inspected. A subtree that is a dict is treated
            as another decision node, anything else as a leaf.

    Returns:
        The depth as an int: 1 for a tree whose children are all leaves,
        plus 1 for every further nested decision node on the longest path.
    """
    firstStr = list(myTree)[0]
    maxDepth = 0
    for child in myTree[firstStr].values():
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def main():
    """Build the decision tree for the sample data, print it and plot it.

    The individual helpers can be tried on the same data in the same way,
    e.g. ``calcShannonEnt(dataSet)``, ``splitDataSet(dataSet, 0, 1)``,
    ``chooseBestFeatureToSplit(dataSet)``, ``getNumLeafs(myTree)`` and
    ``getTreeDepth(myTree)``. Changing one class label to a third value
    such as ``'maybe'`` makes ``calcShannonEnt`` return a larger entropy.
    """
    dataSet, labels = createDataSet()
    # Expected tree:
    # {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    myTree = createTree(dataSet, labels)
    print('myTree:')
    print(myTree)
    createPlot(myTree)

# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()