Schoolwork has been heavy lately (QAQ), so I'll skip the preamble and go straight to the code~
A simple decision tree example

Running result: the original post showed a screenshot of the Matplotlib figure produced by createPlot; for the sample data the tree splits first on "no surfacing" and then on "flippers".

Code:

```python
from math import log
import operator
import matplotlib.pyplot as plt

# Define box and arrow styles for the tree plot
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

# Draw a tree node as a text annotation with an arrow to its parent
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,
                            xycoords="axes fraction",
                            xytext=centerPt, textcoords="axes fraction",
                            va="center", ha="center", bbox=nodeType,
                            arrowprops=arrow_args)

# Fill in text on the edge between a parent node and a child node
def plotMidText(cntrPt, parentPt, txtString):
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString)

def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    depth = getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == "dict":
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))   # total width: number of leaves
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth of the tree
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), "")
    plt.show()

# Create the sample data set
def createDataSet():
    dataSet = [[1, 1, "yes"],
               [1, 1, "yes"],
               [1, 0, "no"],
               [0, 1, "no"],
               [0, 1, "no"]]
    labels = ["no surfacing", "flippers"]
    return dataSet, labels

# Compute the Shannon entropy of a data set.
# The higher the entropy, the more mixed and disordered the data;
# adding more classes to the data set raises the entropy.
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    # count how often each class label ("yes" or "no", the last
    # column of each record) occurs
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        # logarithm base 2
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

# Split the data set on a given feature.
# Arguments: the data set to split, the feature to split on
# (a column index), and the feature value to match.
# Returns every row whose value in that column equals `value`,
# with that column removed.
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    for featVec in dataSet:  # e.g. featVec: [1, 1, "yes"]
        if featVec[axis] == value:
            # keep the data before this column...
            reducedFeatVec = featVec[:axis]
            # ...and the data after it
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

# Choose the best feature to split the data set on
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1  # 2 for the sample data
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # all values in column i
        featList = [example[i] for example in dataSet]
        # a set removes the duplicates
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            # split on column i where the value equals `value`
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            # entropy after the split, weighted by subset size
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        # keep the split with the largest information gain
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    # index of the column that gives the best split
    return bestFeature

# Given a list of class labels, return the name of the
# class that occurs most often
def majorityCnt(classList):
    # dictionary keyed by the unique labels in classList,
    # valued by how often each label occurs
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # sort by count, descending
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

# Build the tree from the data set and the list of feature labels
def createTree(dataSet, labels):
    # the class labels, e.g. ["yes", "yes", "no", "no", "no"]
    classList = [example[-1] for example in dataSet]
    # first stopping condition: all class labels are identical
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # second stopping condition: all features are used up but the
    # set still holds more than one class; since no single class
    # label applies, return the majority class label
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # index of the column giving the best split
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # name of that feature in `labels`
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # remove the used feature label
    del(labels[bestFeat])
    # distinct values of the chosen column
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:
        # copy the remaining labels for the recursive call
        subLabels = labels[:]
        # e.g. myTree["no surfacing"][0], myTree["no surfacing"][1], ...
        myTree[bestFeatLabel][value] = createTree(
            # split on column bestFeat where the value equals `value`
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# Count the leaf nodes
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        # a dict child is itself a decision node: recurse
        if type(secondDict[key]).__name__ == "dict":
            numLeafs += getNumLeafs(secondDict[key])
        # otherwise it is a leaf
        else:
            numLeafs += 1
    return numLeafs

# Compute the depth of the tree
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == "dict":
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth

def main():
    dataSet, labels = createDataSet()
    # {"no surfacing": {0: "no", 1: {"flippers": {0: "no", 1: "yes"}}}}
    myTree = createTree(dataSet, labels)
    print("myTree:")
    print(myTree)
    createPlot(myTree)

if __name__ == "__main__":
    main()
```
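As a quick check of calcShannonEnt, the entropy of the sample data set (two "yes" and three "no" labels out of five records) can be worked out by hand; the sketch below also reproduces the experiment from the original post of relabeling one record "maybe", which shows that a third class makes the set more mixed and raises the entropy.

```python
from math import log

# 2 "yes" and 3 "no" out of 5 records
ent = -(2 / 5) * log(2 / 5, 2) - (3 / 5) * log(3 / 5, 2)
print(ent)  # 0.9709505944546686, matching calcShannonEnt(dataSet)

# relabel one "yes" record as "maybe": 1 maybe, 1 yes, 3 no
ent3 = -2 * (1 / 5) * log(1 / 5, 2) - (3 / 5) * log(3 / 5, 2)
print(ent3)  # ~1.371: three classes are more disordered than two
```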
Predicting contact-lens type

The second listing extends the code above with a classification function, pickle-based persistence, and a demo that builds a tree from the lenses data set. The plotting and tree-building functions (plotNode through getTreeDepth) are identical to the first listing and are not repeated here.

```python
# Classify a test vector with the decision tree
def classify(inputTree, featLabels, testVec):
    classLabel = None  # stays None if the test value is missing from the tree
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    # translate the feature label into a column index
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == "dict":
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel

# Persist a decision tree with the pickle module
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, "wb")
    pickle.dump(inputTree, fw)
    fw.close()

# Load a pickled decision tree
def grabTree(filename):
    import pickle
    fr = open(filename, "rb")
    return pickle.load(fr)

# Predict contact-lens type with a decision tree
def predictTypes():
    fr = open("lenses.txt")
    # lenses.txt is tab-separated; splitting on a single space would
    # break multi-word values such as "no lenses"
    # -> [["young", "myope", "no", "reduced", "no lenses"], ...]
    lenses = [inst.strip().split("\t") for inst in fr.readlines()]
    lensesLabels = ["age", "prescript", "astigmatic", "tearRate"]
    # build, print, and plot the decision tree
    lensesTree = createTree(lenses, lensesLabels)
    print(lensesTree)
    createPlot(lensesTree)

def main():
    predictTypes()

if __name__ == "__main__":
    main()
```
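A minimal usage sketch for these helpers with the toy data set from the first listing: note that createTree deletes entries from the label list it is given, so pass it a copy and keep the original for classify.

```python
dataSet, labels = createDataSet()
myTree = createTree(dataSet, labels[:])   # pass a copy: createTree mutates labels
print(classify(myTree, labels, [1, 0]))   # "no": has no flippers
print(classify(myTree, labels, [1, 1]))   # "yes"
storeTree(myTree, "classifierStorage.txt")
print(grabTree("classifierStorage.txt"))  # the same dict, restored from disk
```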
Data

The lenses.txt file contains the records below, one per line, with the five fields separated by tabs:
young myope no reduced no lenses
young myope no normal soft
young myope yes reduced no lenses
young myope yes normal hard
young hyper no reduced no lenses
young hyper no normal soft
young hyper yes reduced no lenses
young hyper yes normal hard
pre myope no reduced no lenses
pre myope no normal soft
pre myope yes reduced no lenses
pre myope yes normal hard
pre hyper no reduced no lenses
pre hyper no normal soft
pre hyper yes reduced no lenses
pre hyper yes normal no lenses
presbyopic myope no reduced no lenses
presbyopic myope no normal no lenses
presbyopic myope yes reduced no lenses
presbyopic myope yes normal hard
presbyopic hyper no reduced no lenses
presbyopic hyper no normal soft
presbyopic hyper yes reduced no lenses
presbyopic hyper yes normal no lenses
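One sanity check worth running (a sketch that assumes lenses.txt sits in the working directory): every record with tearRate "reduced" is labeled "no lenses", so that branch is pure and the tearRate column should give the largest information gain, making tearRate the root of lensesTree.

```python
lenses = [line.strip().split("\t") for line in open("lenses.txt")]
print(chooseBestFeatureToSplit(lenses))  # expected: 3, the tearRate column
```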