机器学习 | Logistic回归

# Blog intro (translated): coursework has been heavy lately, so no chatter --
# straight to the code.
# Sections of the original post:
#   * Classification with logistic regression (and a plot of the result)
#   * Predicting the mortality of sick horses with logistic regression
#     (and the prediction result)
#   * Full code (below)
from numpy import *  # noqa: F401,F403 -- the code below uses bare NumPy names

try:
    import matplotlib.pyplot as plt
except ImportError:  # plotting is optional; the numeric code works without it
    plt = None


# Batch gradient ascent: find the regression coefficients that best fit the
# logistic regression model.
def loadDataSet(fileName='testSet.txt'):
    """Load the two-feature sample set.

    Each non-blank line holds whitespace-separated values: the two features
    followed by an integer class label.

    Args:
        fileName: path of the text file to read (defaults to 'testSet.txt').

    Returns:
        A tuple ``(dataMat, labelMat)``: ``dataMat`` is a list of
        ``[1.0, x1, x2]`` rows and ``labelMat`` is the list of int labels.
    """
    dataMat = []
    labelMat = []
    # 'with' guarantees the handle is closed even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            lineArr = line.strip().split()
            if not lineArr:
                continue  # tolerate blank lines (e.g. a trailing newline)
            # x0 is fixed to 1.0 so the intercept is learned like any weight.
            dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelMat.append(int(lineArr[2]))
    return dataMat, labelMat


def sigmoid(inX):
    """Return the logistic function 1 / (1 + e^-inX), element-wise."""
    return 1.0 / (1 + exp(-inX))


def gradAscent(dataMatIn, classLabels, alpha=0.001, maxCycles=500):
    """Fit logistic-regression weights with batch gradient ascent.

    Args:
        dataMatIn: 2-D array-like, one row per training sample and one column
            per feature (including the constant x0 column).
        classLabels: sequence of class labels, one per row of ``dataMatIn``.
        alpha: step size of each update.
        maxCycles: number of full passes over the data.

    Returns:
        An ``(n, 1)`` NumPy matrix holding the regression coefficients.
    """
    # asmatrix() replaces mat(), which was removed in NumPy 2.0.
    dataMatrix = asmatrix(dataMatIn)
    # Labels arrive as a row; turn them into a column to line up with h.
    labelMat = asmatrix(classLabels).transpose()
    n = shape(dataMatrix)[1]
    weights = asmatrix(ones((n, 1)))
    for _ in range(maxCycles):
        # Matrix product: h is a column with one prediction per sample.
        h = sigmoid(dataMatrix * weights)
        # Difference between the true and the predicted class.
        error = labelMat - h
        # Move the coefficients along the gradient.
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights


def plotBestFit(weights):
    """Plot the sample set and the decision boundary given by ``weights``.

    Args:
        weights: the three coefficients (w0, w1, w2); a flat array, an
            ``(3, 1)`` array such as ``gradAscent(...).getA()``, or a matrix.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError('matplotlib is required for plotBestFit')
    # Flatten so weights[i] is a scalar whatever shape the caller passed.
    weights = asarray(weights).ravel()
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    xcord1 = []
    ycord1 = []
    xcord2 = []
    ycord2 = []
    # Split the points by class so each class gets its own marker.
    for row, label in zip(dataArr, labelMat):
        if int(label) == 1:
            xcord1.append(row[1])
            ycord1.append(row[2])
        else:
            xcord2.append(row[1])
            ycord2.append(row[2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # Boundary: w0 + w1*x1 + w2*x2 = 0, solved for x2.
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1')
    plt.ylabel('X2')
    plt.show()
# Batch gradient ascent walks the whole data set for every update of the
# coefficients. Stochastic gradient ascent instead updates them from one
# sample at a time, so the classifier can be updated incrementally as new
# samples arrive: it is an online learning algorithm. Processing all the data
# at once is, by contrast, called batch processing.
def stocGradAscent0(dataMatrix, classLabels):
    """Fit logistic-regression weights with one stochastic pass over the data.

    Args:
        dataMatrix: 2-D NumPy array, one row per sample.
        classLabels: sequence of class labels, one per row.

    Returns:
        A 1-D array with one coefficient per column of ``dataMatrix``.
    """
    m, n = shape(dataMatrix)
    alpha = 0.01  # step size
    weights = ones(n)
    for i in range(m):
        # Unlike the batch version, h and error are scalars here.
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights


# Testing showed that over many iterations some coefficients converge slowly
# and keep small periodic fluctuations, hence the improved version below.
def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    """Improved stochastic gradient ascent.

    Args:
        dataMatrix: 2-D NumPy array, one row per sample.
        classLabels: sequence of class labels, one per row.
        numIter: number of passes over the data set.

    Returns:
        A 1-D array with one coefficient per column of ``dataMatrix``.
    """
    m, n = shape(dataMatrix)
    weights = ones(n)
    for j in range(numIter):
        dataIndex = list(range(m))
        for i in range(m):
            # Improvement 1: the step size shrinks as iterations go on, which
            # damps fluctuations; the constant keeps it from reaching zero.
            alpha = 4 / (1.0 + j + i) + 0.01
            # Improvement 2: pick the sample at random to reduce periodic
            # fluctuations.
            randIndex = int(random.uniform(0, len(dataIndex)))
            # Map the drawn position to the actual sample index and remove it,
            # so every sample is used exactly once per pass. (Using randIndex
            # directly as the sample index would not sample without
            # replacement.)
            sampleIndex = dataIndex.pop(randIndex)
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
    return weights


# Predicting the mortality of sick horses with logistic regression.
# Missing feature values are replaced by 0 because that leaves the
# corresponding weight untouched during an update; records whose label is
# missing are discarded.
def classifyVector(inX, weights):
    """Classify one feature vector with trained coefficients.

    The features are multiplied by the coefficients, summed and passed
    through the sigmoid.

    Returns:
        1.0 if the sigmoid value is greater than 0.5, otherwise 0.0.
    """
    prob = sigmoid(sum(inX * weights))
    return 1.0 if prob > 0.5 else 0.0


def _loadColicFile(fileName, numFeatures=21):
    """Read a tab-separated colic file into (feature rows, float labels)."""
    featureRows = []
    labels = []
    with open(fileName) as fr:
        for line in fr:
            currLine = line.strip().split('\t')
            if currLine == ['']:
                continue  # tolerate blank lines
            featureRows.append([float(v) for v in currLine[:numFeatures]])
            # The column right after the features is the class label.
            labels.append(float(currLine[numFeatures]))
    return featureRows, labels


def colicTest():
    """Train on the colic training set and report the test-set error rate.

    Reads 'horseColicTraining.txt' and 'horseColicTest.txt' from the current
    directory, prints the error rate and returns it.
    """
    trainingSet, trainingLabels = _loadColicFile('horseColicTraining.txt')
    testSet, testLabels = _loadColicFile('horseColicTest.txt')
    # Compute the regression coefficient vector.
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 500)
    errorCount = 0
    for featureRow, label in zip(testSet, testLabels):
        # Labels are parsed as floats first so "1.000000" is accepted too.
        if int(classifyVector(array(featureRow), trainWeights)) != int(label):
            errorCount += 1
    numTestVec = float(len(testSet))
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is: %f" % errorRate)
    return errorRate


def multiTest():
    """Run colicTest 10 times and print the average error rate."""
    numTests = 10
    errorSum = 0.0
    for _ in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f"
          % (numTests, errorSum / float(numTests)))


def main():
    """Entry point: run the horse-colic experiment."""
    # Alternative demos from the original post (plot the decision boundary):
    # dataArr, labelMat = loadDataSet()
    # weights = gradAscent(dataArr, labelMat)
    # print(weights)
    # plotBestFit(weights.getA())
    # weights = stocGradAscent0(array(dataArr), labelMat)
    # weights = stocGradAscent1(array(dataArr), labelMat)
    # plotBestFit(weights)
    multiTest()


if __name__ == '__main__':
    main()