#coding:utf-8
import numpy as np
import matplotlib.pyplot as plt

'''
np.linalg: core linear algebra tools
xx.T  transpose of a matrix
xx.I  inverse of a matrix
m     number of samples
n     number of features
'''

def loadDataSet(datafile):
    featData = []
    labelData = []
    with open(datafile, 'r') as fr_file:
        for eachLine in fr_file:
            oneLine = eachLine.split('\t')
            tempArr = []
            for i in range(len(oneLine)-1):
                tempArr.append(float(oneLine[i]))
            featData.append(tempArr)
            labelData.append(float(oneLine[-1].strip()))  # continuous float target
    featData = np.array(featData)    # convert to array
    labelData = np.array(labelData)  # convert to array
    return featData, labelData
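
# Note: loadDataSet assumes the MLiA-style layout: tab-separated floats, one
# sample per line, with the last column as the target. A hypothetical line of
# ex.txt might read "1.000000\t0.067732\t3.176513" (x0 = 1 bias, x1, y).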

def rssError(yArr, yHat):
    return np.sum((yArr-yHat)**2)

def showRegres(xArr, yArr, yHat):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xArr[:, 1], yArr)
    '''
    The data assumes x0 = 1, so yHat = ws[0] + ws[1]*x1; plot yHat against x1
    to inspect the linear fit.
    '''
    srtInd = xArr[:, 1].argsort(0)
    ax.plot(xArr[srtInd, 1], yHat[srtInd])  # sort the points in ascending order before drawing the fitted line
    plt.show()

'''Standard linear regression: ordinary least squares (minimizes squared error); requires m >= n'''
def standRegres(xMat, yMat):
    xTx = xMat.T*xMat  # n*n
    if np.linalg.det(xTx) == 0.0:
        print('This matrix is singular, cannot do inverse')
        return
    ## Method 1: explicit inverse
    ws = xTx.I*(xMat.T*yMat)  # n*1
    ## Method 2: solve the normal equations directly
    # ws = np.linalg.solve(xTx, xMat.T*yMat)  # n*1
    # yHat = xMat*ws  # m*1
    return ws
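
# A minimal sanity-check sketch (not part of the original script; names are
# illustrative): on synthetic, well-conditioned data the normal-equation
# solution above should agree with numpy's least-squares solver.
def _standRegresCheck():
    rng = np.random.RandomState(0)
    X = np.hstack([np.ones((20, 1)), rng.rand(20, 2)])  # x0 = 1 bias column
    y = X.dot(np.array([1.0, 2.0, 3.0])) + 0.01*rng.randn(20)
    ws = standRegres(np.matrix(X), np.matrix(y).T)
    wsLstsq = np.linalg.lstsq(X, y, rcond=None)[0]
    assert np.allclose(np.array(ws).ravel(), wsLstsq, atol=1e-6)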

def standRegresTest(xArr, yArr):
    xMat = np.matrix(xArr)     # m*n
    yMat = np.matrix(yArr).T   # m*1
    ws = standRegres(xMat, yMat)  # n*1
    yHat = xMat*ws  # m*1
    yHat = np.asarray(yHat).ravel()  # flatten the m*1 matrix [[x1], [x2], ...] into the 1-D array [x1, x2, ...]
    return yHat

'''Locally weighted linear regression; requires m >= n'''
def lwlr(testPoint, xMat, yMat, k=1.0):
    m = np.shape(xMat)[0]
    weights = np.matrix(np.eye(m))  # diagonal weight matrix
    for j in range(m):
        diffMat = testPoint-xMat[j, :]
        weights[j, j] = np.exp(diffMat*diffMat.T/(-2.0*k**2))  # Gaussian kernel
    xTx = xMat.T*(weights*xMat)
    if np.linalg.det(xTx) == 0.0:
        print('This matrix is singular, cannot do inverse')
        return
    ws = xTx.I*(xMat.T*(weights*yMat))
    return testPoint*ws

def lwlrTest(testArr, xArr, yArr, k=1.0):
    xMat = np.matrix(xArr)    # m*n
    yMat = np.matrix(yArr).T  # m*1
    m = np.shape(testArr)[0]
    yHat = np.zeros(m)
    for i in range(m):
        yHat[i] = lwlr(testArr[i], xMat, yMat, k)
    return yHat
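
# Bandwidth intuition, as a small illustrative sketch (synthetic distances, not
# from the dataset): the kernel weight exp(-d^2/(2*k^2)) stays near 1 for all
# points when k is large (lwlr then approaches standard regression) and decays
# sharply when k is small (only close neighbours influence the local fit).
def _lwlrWeightDemo():
    dists = np.array([0.0, 0.1, 0.5, 1.0])
    for k in (0.1, 1.0):
        print(k, np.exp(-dists**2/(2.0*k**2)))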

'''Ridge regression; works for m >= n and also m < n'''
def ridgeRegres(xMat, yMat, lam=0.2):
    xTx = xMat.T*xMat
    denom = xTx+np.eye(np.shape(xMat)[1])*lam
    if np.linalg.det(denom) == 0.0:
        print('This matrix is singular, cannot do inverse')
        return
    ws = denom.I*(xMat.T*yMat)
    return ws

def ridgeTest(xArr, yArr):
    xMat = np.matrix(xArr)
    yMat = np.matrix(yArr).T
    '''standardize X and Y'''
    ## regularize Y's
    yMean = np.mean(yMat, 0)
    yMat = yMat-yMean  # to eliminate X0 take mean off of Y
    ## regularize X's
    xMeans = np.mean(xMat, 0)  # calc mean then subtract it off
    xVar = np.var(xMat, 0)     # calc variance of Xi then divide by it
    xMat = (xMat-xMeans)/xVar
    '''compute wMat over a range of lam values'''
    numTestPts = 30
    wMat = np.matrix(np.zeros((numTestPts, np.shape(xMat)[1])))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i-10))  # lam ranges from e^-10 to e^19
        wMat[i, :] = ws.T
    return wMat
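
# Shrinkage sketch on synthetic data (illustrative values, not from the book):
# as lam grows, the coefficients returned by ridgeRegres shrink toward 0; with
# a tiny lam they stay close to the ordinary least-squares solution.
def _ridgeShrinkDemo():
    rng = np.random.RandomState(1)
    X = rng.randn(50, 3)
    y = X.dot(np.array([2.0, -1.0, 0.5])) + 0.1*rng.randn(50)
    xMat, yMat = np.matrix(X), np.matrix(y).T
    for lam in (1e-6, 1.0, 1e3):
        print(lam, np.array(ridgeRegres(xMat, yMat, lam)).ravel())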

if __name__ == '__main__':
    ####################################################################################
    ## Standard linear regression
    xArr, yArr = loadDataSet('ex.txt')
    yHat = standRegresTest(xArr, yArr)
    print(yHat)
    showRegres(xArr, yArr, yHat)
    coef = np.corrcoef(yArr, yHat)
    print(coef)
    print((coef[0, 1]+coef[1, 0])/2.0)  # corrcoef is symmetric, so this equals coef[0, 1]
    print(rssError(yArr, yHat))
    ####################################################################################
    ## Locally weighted linear regression
    xArr, yArr = loadDataSet('ex.txt')
    yHat = lwlrTest(xArr, xArr, yArr, k=0.01)
    print(yHat)
    showRegres(xArr, yArr, yHat)
    coef = np.corrcoef(yArr, yHat)
    print(coef)
    print((coef[0, 1]+coef[1, 0])/2.0)
    print(rssError(yArr, yHat))
    # '''Find the k that maximizes the correlation coefficient'''
    # max_k = 0
    # max_coef = 0
    # for k in range(1, 100):
    #     k /= 1000.0
    #     yHat = lwlrTest(xArr, xArr, yArr, k)
    #     coef = np.corrcoef(yArr, yHat)
    #     temp_coef = (coef[0, 1]+coef[1, 0])/2.0
    #     if temp_coef > max_coef:
    #         max_coef = temp_coef
    #         max_k = k
    # print(max_k, max_coef)
    # '''Find the k that minimizes the squared error'''
    # min_k = 0
    # min_error = np.inf
    # for k in range(1, 100):
    #     k /= 1000.0
    #     yHat = lwlrTest(xArr, xArr, yArr, k)
    #     temp_error = rssError(yArr, yHat)
    #     if temp_error < min_error:
    #         min_error = temp_error
    #         min_k = k
    # print(min_k, min_error)
    ####################################################################################
    ## Ridge regression
    xArr, yArr = loadDataSet('abalone.txt')
    wMat = ridgeTest(xArr, yArr)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(wMat)  # ridge trace: regression coefficients versus log(lam)
    plt.show()
    '''
    At the far left, lam = np.exp(0-10) = e^-10, effectively 0, so the coefficients
    keep their original (unshrunk) values and match standard linear regression.
    At the far right, lam = np.exp(29-10) = e^19, and all coefficients are shrunk to 0.
    Somewhere in between lies the lam with the best predictive performance: it shrinks
    unimportant coefficients away, and a coefficient's magnitude reflects its importance.
    '''
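    # Optional sketch (assumed convention: row i of wMat corresponds to
    # lam = exp(i-10), per ridgeTest above). Replacing the ax.plot(wMat) call
    # with the two lines below puts log(lam) on the x-axis explicitly:
    # ax.plot(np.arange(-10, 20), wMat)
    # ax.set_xlabel('log(lambda)')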