Commit 21889b2

committed

test

1 parent a19ffd4 commit 21889b2Copy full SHA for 21889b2

File tree

7 files changed

+4616

-15

lines changed

NB&LR
- NaiveBayes_vs_LogisticRegression.py
README.md
Regression

7 files changed

+4616

-15

lines changed

`‎NB&LR/NaiveBayes_vs_LogisticRegression.py‎`

Lines changed: 56 additions & 15 deletions

Original file line number	Diff line number	Diff line change
`@@ -51,26 +51,44 @@ def fit(self, train_x, train_y):`
`51`	`51`	`param = p0, p1, p0Vec, p1Vec`
`52`	`52`	`return vocabList, param`
`53`	`53`
`54`		`- def predict(self, test_X, test_y, vocabList, param):`
	`54`	`+ def predict(self, test_X, vocabList, param):`
`55`	`55`	`p0, p1, p0Vec, p1Vec = param`
`56`	`56`	`testMat = []`
`57`	`57`	`for wordList in test_X:`
`58`	`58`	`testMat.append(self.listOfWords2Vec(vocabList, wordList))`
`59`	`59`	`testMatrix = np.array(testMat) ## array`
`60`		`- testLabel = np.array(test_y) ## array`
`61`	`60`	`predict_y = []`
`62`	`61`	`for vec in testMatrix:`
`63`	`62`	`prob_y0 = sum(vec*p0Vec)+np.log(p0) # 对应p(w1\|c0)p(w2\|c0)...p(c0),log(a*b) = log(a)+log(b)`
`64`	`63`	`prob_y1 = sum(vec*p1Vec)+np.log(p1) # 对应p(w1\|c1)p(w2\|c1)...p(c1),log(a*b) = log(a)+log(b)`
`65`		`- if prob_y0 < prob_y1:`
	`64`	`+ if prob_y0 < prob_y1:## 对应0/1分类,但是NaiveBayes可以修改成多分类`
`66`	`65`	`predict_y.append(1)`
`67`	`66`	`else:`
`68`	`67`	`predict_y.append(0)`
`69`	`68`	`predictLabel = np.array(predict_y) ## array`
`70`		`- print 'accuracy:', sum(testLabel==predictLabel)/float(len(testLabel))`
`71`	`69`	`return predictLabel`
`72`	`70`
`73`		`-class LogisticRegression(): # 二分类`
	`71`	`+ def predict1(self, test_X, test_y, vocabList, param):`
	`72`	`+ p0, p1, p0Vec, p1Vec = param`
	`73`	`+ testMat = []`
	`74`	`+ for wordList in test_X:`
	`75`	`+ testMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`76`	`+ testMatrix = np.array(testMat) ## array`
	`77`	`+ m = testMatrix.shape[0]`
	`78`	`+ predict_y = []`
	`79`	`+ for vec in testMatrix:`
	`80`	`+ prob_y0 = sum(vec*p0Vec)+np.log(p0) # 对应p(w1\|c0)p(w2\|c0)...p(c0),log(a*b) = log(a)+log(b)`
	`81`	`+ prob_y1 = sum(vec*p1Vec)+np.log(p1) # 对应p(w1\|c1)p(w2\|c1)...p(c1),log(a*b) = log(a)+log(b)`
	`82`	`+ if prob_y0 < prob_y1: ## 对应0/1分类,但是NaiveBayes可以修改成多分类`
	`83`	`+ predict_y.append(1)`
	`84`	`+ else:`
	`85`	`+ predict_y.append(0)`
	`86`	`+ testLabel = np.array(test_y) ## array`
	`87`	`+ predictLabel = np.array(predict_y) ## array`
	`88`	`+ print 'accuracy:', sum(testLabel==predictLabel)/float(m)`
	`89`	`+ return predictLabel`
	`90`	`+`
	`91`	`+class LogisticRegression(): # 二分类,0/1分类`
`74`	`92`	`def __init__(self):`
`75`	`93`	`pass`
`76`	`94`
`@@ -102,29 +120,48 @@ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):`
`102`	`120`	`trainMatrix = np.matrix(trainMat) ## matrix是二维的 # size: m*n`
`103`	`121`	`trainLabel = np.matrix(train_y).T ## matrix是二维的 # size: m*1`
`104`	`122`	`m, n = trainMatrix.shape`
`105`		`- weigh = np.ones((n, 1)) # size: n*1`
	`123`	`+ weigh = np.matrix(np.ones((n, 1))) # size: n*1`
`106`	`124`	`for i in range(maxCycles):`
`107`		`- hx = self.sigmoid(trainMatrix*weigh) # size: m*1`
	`125`	`+ hx = self.sigmoid(trainMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率`
`108`	`126`	`error = trainLabel-hx # size: m*1`
`109`	`127`	`weigh += alpha*trainMatrix.T*error # size: n*1`
`110`	`128`	`return vocabList, weigh`
`111`	`129`
`112`	`130`	`# 使用学习得到的参数进行分类`
`113`		`- def predict(self, test_X, test_y, vocabList, weigh):`
	`131`	`+ def predict(self, test_X, vocabList, weigh):`
`114`	`132`	`testMat = []`
`115`	`133`	`for wordList in test_X:`
`116`	`134`	`testMat.append(self.listOfWords2Vec(vocabList, wordList))`
`117`	`135`	`testMatrix = np.matrix(testMat) ## matrix是二维的`
`118`		`- testLabel = np.array(test_y) ## array`
`119`		`- hx = self.sigmoid(testMatrix*weigh) # size: m*1`
	`136`	`+ m = testMatrix.shape[0]`
	`137`	`+ hx = self.sigmoid(testMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率`
	`138`	`+ predict_y = []`
	`139`	`+ for i in range(m): ## 对应0/1分类`
	`140`	`+ if hx[i][0] > 0.5:`
	`141`	`+ predict_y.append(1)`
	`142`	`+ else:`
	`143`	`+ predict_y.append(0)`
	`144`	`+ predictLabel = np.array(predict_y) ## array`
	`145`	`+ # predictLabel = np.matrix(predict_y).T ## matrix`
	`146`	`+ return predictLabel`
	`147`	`+`
	`148`	`+ # 使用学习得到的参数进行分类`
	`149`	`+ def predict1(self, test_X, test_y, vocabList, weigh):`
	`150`	`+ testMat = []`
	`151`	`+ for wordList in test_X:`
	`152`	`+ testMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`153`	`+ testMatrix = np.matrix(testMat) ## matrix是二维的`
	`154`	`+ m = testMatrix.shape[0]`
	`155`	`+ hx = self.sigmoid(testMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率`
`120`	`156`	`predict_y = []`
`121`		`- for i in range(len(testLabel)):`
	`157`	`+ for i in range(m): ## 对应0/1分类`
`122`	`158`	`if hx[i][0] > 0.5:`
`123`	`159`	`predict_y.append(1)`
`124`	`160`	`else:`
`125`	`161`	`predict_y.append(0)`
	`162`	`+ testLabel = np.array(test_y) ## array`
`126`	`163`	`predictLabel = np.array(predict_y) ## array`
`127`		`- print 'accuracy:', sum(testLabel==predictLabel)/float(len(testLabel))`
	`164`	`+ print 'accuracy:', sum(testLabel==predictLabel)/float(m)`
`128`	`165`	`return predictLabel`
`129`	`166`
`130`	`167`	`def loadTrainDataSet():`
`@@ -150,9 +187,13 @@ def loadTestDataSet():`
`150`	`187`	`test_X, test_y = loadTestDataSet()`
`151`	`188`	`clf = NaiveBayes()`
`152`	`189`	`vocabList, param = clf.fit(train_X, train_y)`
`153`		`- results = clf.predict(test_X, test_y, vocabList, param)`
	`190`	`+ results = clf.predict(test_X, vocabList, param)`
`154`	`191`	`print results`
	`192`	`+ results1 = clf.predict1(test_X, test_y, vocabList, param)`
	`193`	`+ print results1`
`155`	`194`	`clf = LogisticRegression()`
`156`	`195`	`vocabList, weigh = clf.fit(train_X, train_y)`
`157`		`- results = clf.predict(test_X, test_y, vocabList, weigh)`
`158`		`- print results`
	`196`	`+ results = clf.predict(test_X, vocabList, weigh)`
	`197`	`+ print results`
	`198`	`+ results1 = clf.predict1(test_X, test_y, vocabList, weigh)`
	`199`	`+ print results1`

`‎README.md‎`

Lines changed: 9 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -12,4 +12,13 @@`
`12`	`12`
`13`	`13`	`* 针对文本分类的 LogisticRegression 算法`
`14`	`14`
	`15`	`+* 回归算法:`
	`16`	`+ * 标准的线性回归`
	`17`	`+ * 局部加权线性回归`
	`18`	`+ * 岭回归`
	`19`	`+`
	`20`	`+结果示例:`
	`21`	`+![image](./Regression/standRegresResults.png)`
	`22`	`+![image](./Regression/lwlrResults.png)`
	`23`	`+`
`15`	`24`	`参照:《机器学习实战》`

`‎Regression/RegressionTest.py‎`

Lines changed: 174 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,174 @@`
	`1`	`+#coding:utf-8`
	`2`	`+import numpy as np`
	`3`	`+import matplotlib.pyplot as plt`
	`4`	`+`
	`5`	`+'''`
	`6`	`+np.linalg Core Linear Algebra Tools`
	`7`	`+xx.T 矩阵的转置`
	`8`	`+xx.I 矩阵的逆`
	`9`	`+m 样本点数`
	`10`	`+n 特征维数`
	`11`	`+'''`
	`12`	`+`
	`13`	`+def loadDataSet(datafile):`
	`14`	`+ featData = []`
	`15`	`+ labelData = []`
	`16`	`+ with open(datafile, 'r') as fr_file:`
	`17`	`+ for eachLine in fr_file:`
	`18`	`+ oneLine = eachLine.split('\t')`
	`19`	`+ tempArr = []`
	`20`	`+ for i in range(len(oneLine)-1):`
	`21`	`+ tempArr.append(float(oneLine[i]))`
	`22`	`+ featData.append(tempArr)`
	`23`	`+ labelData.append(float(oneLine[-1].strip())) # float型连续变量`
	`24`	`+ featData = np.array(featData) # 转换为array`
	`25`	`+ labelData = np.array(labelData) # 转换为array`
	`26`	`+ return featData, labelData`
	`27`	`+`
	`28`	`+def rssError(yArr, yHat):`
	`29`	`+ return np.sum((yArr-yHat)**2)`
	`30`	`+`
	`31`	`+def showRegres(xArr, yArr, yHat):`
	`32`	`+ fig = plt.figure()`
	`33`	`+ ax = fig.add_subplot(111)`
	`34`	`+ ax.scatter(xArr[:, 1], yArr)`
	`35`	`+ '''`
	`36`	`+ 因为数据假定了x0=1,因此yHat=ws[0]+ws[1]*x1,看yHat与x1之间的线性关系`
	`37`	`+ '''`
	`38`	`+ srtInd = xArr[:, 1].argsort(0)`
	`39`	`+ # print srtInd`
	`40`	`+ ax.plot(xArr[srtInd, 1], yHat[srtInd]) # 拟合前需要将点升序排列`
	`41`	`+ plt.show()`
	`42`	`+`
	`43`	`+'''标准的线性回归:最小二乘法(平方误差最小),适用于m>=n情况'''`
	`44`	`+def standRegres(xMat, yMat):`
	`45`	`+ xTx = xMat.TxMat # nn`
	`46`	`+ if np.linalg.det(xTx) == 0.0:`
	`47`	`+ print 'This matrix is singular, cannot do inverse'`
	`48`	`+ return`
	`49`	`+ ## 方法1`
	`50`	`+ ws = xTx.I(xMat.TyMat) # n*1`
	`51`	`+ ## 方法2`
	`52`	`+ # ws = np.linalg.solve(xTx, xMat.TyMat) # n1`
	`53`	`+ # yHat = xMatws # m1`
	`54`	`+ return ws`
	`55`	`+`
	`56`	`+def standRegresTest(xArr, yArr):`
	`57`	`+ xMat = np.matrix(xArr) # m*n`
	`58`	`+ yMat = np.matrix(yArr).T # m*1`
	`59`	`+ ws = standRegres(xMat, yMat) # n*1`
	`60`	`+ # print ws`
	`61`	`+ yHat = xMatws # m1`
	`62`	`+ yHat = np.array(yHat).reshape(1, -1)[0] ## [[xx1][xx2]]二维matrix为[xx1, xx2]一维array`
	`63`	`+ return yHat`
	`64`	`+`
	`65`	`+'''局部加权线性回归,适用于m>=n情况'''`
	`66`	`+def lwlr(testPoint, xMat, yMat, k=1.0):`
	`67`	`+ m = np.shape(xMat)[0]`
	`68`	`+ weights = np.matrix(np.eye(m)) # 创建对角矩阵`
	`69`	`+ for j in range(m):`
	`70`	`+ diffMat = testPoint-xMat[j, :]`
	`71`	`+ weights[j, j] = np.exp(diffMatdiffMat.T/(-2.0k**2)) # 高斯核`
	`72`	`+ print weights`
	`73`	`+ xTx = xMat.T(weightsxMat)`
	`74`	`+ if np.linalg.det(xTx) == 0.0:`
	`75`	`+ print 'This matrix is singular, cannot do inverse'`
	`76`	`+ return`
	`77`	`+ ws = xTx.I(xMat.T(weights*yMat))`
	`78`	`+ return testPoint*ws`
	`79`	`+`
	`80`	`+def lwlrTest(testArr, xArr, yArr, k=1.0):`
	`81`	`+ xMat = np.matrix(xArr) # m*n`
	`82`	`+ yMat = np.matrix(yArr).T # m*1`
	`83`	`+ m = np.shape(testArr)[0]`
	`84`	`+ yHat = np.zeros(m)`
	`85`	`+ for i in range(m):`
	`86`	`+ yHat[i] = lwlr(testArr[i], xMat, yMat, k)`
	`87`	`+ return yHat`
	`88`	`+`
	`89`	`+'''岭回归,适用于m>=n及m<n情况'''`
	`90`	`+def ridgeRegres(xMat, yMat, lam=0.2):`
	`91`	`+ xTx = xMat.T*xMat`
	`92`	`+ denom = xTx+np.eye(np.shape(xMat)[1])*lam`
	`93`	`+ if np.linalg.det(denom) == 0.0:`
	`94`	`+ print 'This matrix is singular, cannot do inverse'`
	`95`	`+ return`
	`96`	`+ ws = denom.I(xMat.TyMat)`
	`97`	`+ return ws`
	`98`	`+`
	`99`	`+def ridgeTest(xArr, yArr):`
	`100`	`+ xMat = np.matrix(xArr)`
	`101`	`+ yMat = np.matrix(yArr).T`
	`102`	`+ '''标准化XY'''`
	`103`	`+ ## regularize Y's`
	`104`	`+ yMean = np.mean(yMat, 0)`
	`105`	`+ yMat = yMat-yMean # to eliminate X0 take mean off of Y`
	`106`	`+ ## regularize X's`
	`107`	`+ xMeans = np.mean(xMat, 0) # calc mean then subtract it off`
	`108`	`+ xVar = np.var(xMat, 0) # calc variance of Xi then divide by it`
	`109`	`+ xMat = (xMat-xMeans)/xVar`
	`110`	`+ '''计算wMat'''`
	`111`	`+ numTestPts = 30`
	`112`	`+ wMat = np.matrix(np.zeros((numTestPts, np.shape(xMat)[1])))`
	`113`	`+ for i in range(numTestPts):`
	`114`	`+ ws = ridgeRegres(xMat, yMat, np.exp(i-10))`
	`115`	`+ wMat[i, :] = ws.T`
	`116`	`+ return wMat`
	`117`	`+`
	`118`	`+if __name__ == '__main__':`
	`119`	`+ ####################################################################################`
	`120`	`+ ## 标准的线性回归`
	`121`	`+ xArr, yArr = loadDataSet('ex.txt')`
	`122`	`+ yHat = standRegresTest(xArr, yArr)`
	`123`	`+ print yHat`
	`124`	`+ showRegres(xArr, yArr, yHat)`
	`125`	`+ coef = np.corrcoef(yArr, yHat)`
	`126`	`+ print coef`
	`127`	`+ print (coef[0, 1]+coef[1, 0])/2.0`
	`128`	`+ print rssError(yArr, yHat)`
	`129`	`+ ####################################################################################`
	`130`	`+ ## 局部加权线性回归`
	`131`	`+ xArr, yArr = loadDataSet('ex.txt')`
	`132`	`+ yHat = lwlrTest(xArr, xArr, yArr, k=0.01)`
	`133`	`+ print yHat`
	`134`	`+ showRegres(xArr, yArr, yHat)`
	`135`	`+ coef = np.corrcoef(yArr, yHat)`
	`136`	`+ print coef`
	`137`	`+ print (coef[0, 1]+coef[1, 0])/2.0`
	`138`	`+ print rssError(yArr, yHat)`
	`139`	`+ # '''寻找使相关系数最大的k'''`
	`140`	`+ # max_k = 0`
	`141`	`+ # max_coef = 0`
	`142`	`+ # for k in range(1, 100):`
	`143`	`+ # k /= 1000.0`
	`144`	`+ # yHat = lwlrTest(xArr, xArr, yArr, k)`
	`145`	`+ # coef = np.corrcoef(yArr, yHat)`
	`146`	`+ # temp_coef = (coef[0, 1]+coef[1, 0])/2.0`
	`147`	`+ # if temp_coef > max_coef:`
	`148`	`+ # max_coef = temp_coef`
	`149`	`+ # max_k = k`
	`150`	`+ # print max_k, max_coef`
	`151`	`+ # '''寻找使平方误差最小的k'''`
	`152`	`+ # min_k = 0`
	`153`	`+ # min_error = np.inf`
	`154`	`+ # for k in range(1, 100):`
	`155`	`+ # k /= 1000.0`
	`156`	`+ # yHat = lwlrTest(xArr, xArr, yArr, k)`
	`157`	`+ # temp_error = rssError(yArr, yHat)`
	`158`	`+ # if temp_error < min_error:`
	`159`	`+ # min_error = temp_error`
	`160`	`+ # min_k = k`
	`161`	`+ # print min_k, min_error`
	`162`	`+ ####################################################################################`
	`163`	`+ ## 岭回归`
	`164`	`+ xArr, yArr = loadDataSet('abalone.txt')`
	`165`	`+ wMat = ridgeTest(xArr, yArr)`
	`166`	`+ fig = plt.figure()`
	`167`	`+ ax = fig.add_subplot(111)`
	`168`	`+ ax.plot(wMat) # 描述回归系数与log(lam)的关系`
	`169`	`+ plt.show()`
	`170`	`+ '''`
	`171`	`+ 在最左边时,lam为np.exp(0-10)=0,回归系数为原始值(即不缩减),跟标准的线性回归一致`
	`172`	`+ 在最右边时,lam为np.exp(20-10)=e^10,回归系数全部缩减为0`
	`173`	`+ 因此,在中间的某部分取值,lam能得到最好的预测效果,去掉不重要回归参数,参数的大小表示其重要性`
	`174`	`+ '''`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 21889b2

File tree

7 files changed

7 files changed

`‎NB&LR/NaiveBayes_vs_LogisticRegression.py‎`

`‎README.md‎`

`‎Regression/RegressionTest.py‎`

0 commit comments