Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 21889b2

Browse files
test
1 parent a19ffd4 commit 21889b2

File tree

7 files changed

+4616
-15
lines changed

7 files changed

+4616
-15
lines changed

‎NB&LR/NaiveBayes_vs_LogisticRegression.py‎

Lines changed: 56 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -51,26 +51,44 @@ def fit(self, train_x, train_y):
5151
param = p0, p1, p0Vec, p1Vec
5252
return vocabList, param
5353

54-
def predict(self, test_X, test_y, vocabList, param):
54+
def predict(self, test_X, vocabList, param):
5555
p0, p1, p0Vec, p1Vec = param
5656
testMat = []
5757
for wordList in test_X:
5858
testMat.append(self.listOfWords2Vec(vocabList, wordList))
5959
testMatrix = np.array(testMat) ## array
60-
testLabel = np.array(test_y) ## array
6160
predict_y = []
6261
for vec in testMatrix:
6362
prob_y0 = sum(vec*p0Vec)+np.log(p0) # 对应p(w1|c0)*p(w2|c0)*...*p(c0),log(a*b) = log(a)+log(b)
6463
prob_y1 = sum(vec*p1Vec)+np.log(p1) # 对应p(w1|c1)*p(w2|c1)*...*p(c1),log(a*b) = log(a)+log(b)
65-
if prob_y0 < prob_y1:
64+
if prob_y0 < prob_y1:## 对应0/1分类,但是NaiveBayes可以修改成多分类
6665
predict_y.append(1)
6766
else:
6867
predict_y.append(0)
6968
predictLabel = np.array(predict_y) ## array
70-
print 'accuracy:', sum(testLabel==predictLabel)/float(len(testLabel))
7169
return predictLabel
7270

73-
class LogisticRegression(): # 二分类
71+
def predict1(self, test_X, test_y, vocabList, param):
72+
p0, p1, p0Vec, p1Vec = param
73+
testMat = []
74+
for wordList in test_X:
75+
testMat.append(self.listOfWords2Vec(vocabList, wordList))
76+
testMatrix = np.array(testMat) ## array
77+
m = testMatrix.shape[0]
78+
predict_y = []
79+
for vec in testMatrix:
80+
prob_y0 = sum(vec*p0Vec)+np.log(p0) # 对应p(w1|c0)*p(w2|c0)*...*p(c0),log(a*b) = log(a)+log(b)
81+
prob_y1 = sum(vec*p1Vec)+np.log(p1) # 对应p(w1|c1)*p(w2|c1)*...*p(c1),log(a*b) = log(a)+log(b)
82+
if prob_y0 < prob_y1: ## 对应0/1分类,但是NaiveBayes可以修改成多分类
83+
predict_y.append(1)
84+
else:
85+
predict_y.append(0)
86+
testLabel = np.array(test_y) ## array
87+
predictLabel = np.array(predict_y) ## array
88+
print 'accuracy:', sum(testLabel==predictLabel)/float(m)
89+
return predictLabel
90+
91+
class LogisticRegression(): # 二分类,0/1分类
7492
def __init__(self):
7593
pass
7694

@@ -102,29 +120,48 @@ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
102120
trainMatrix = np.matrix(trainMat) ## matrix是二维的 # size: m*n
103121
trainLabel = np.matrix(train_y).T ## matrix是二维的 # size: m*1
104122
m, n = trainMatrix.shape
105-
weigh = np.ones((n, 1)) # size: n*1
123+
weigh = np.matrix(np.ones((n, 1))) # size: n*1
106124
for i in range(maxCycles):
107-
hx = self.sigmoid(trainMatrix*weigh) # size: m*1
125+
hx = self.sigmoid(trainMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率
108126
error = trainLabel-hx # size: m*1
109127
weigh += alpha*trainMatrix.T*error # size: n*1
110128
return vocabList, weigh
111129

112130
# 使用学习得到的参数进行分类
113-
def predict(self, test_X, test_y, vocabList, weigh):
131+
def predict(self, test_X, vocabList, weigh):
114132
testMat = []
115133
for wordList in test_X:
116134
testMat.append(self.listOfWords2Vec(vocabList, wordList))
117135
testMatrix = np.matrix(testMat) ## matrix是二维的
118-
testLabel = np.array(test_y) ## array
119-
hx = self.sigmoid(testMatrix*weigh) # size: m*1
136+
m = testMatrix.shape[0]
137+
hx = self.sigmoid(testMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率
138+
predict_y = []
139+
for i in range(m): ## 对应0/1分类
140+
if hx[i][0] > 0.5:
141+
predict_y.append(1)
142+
else:
143+
predict_y.append(0)
144+
predictLabel = np.array(predict_y) ## array
145+
# predictLabel = np.matrix(predict_y).T ## matrix
146+
return predictLabel
147+
148+
# 使用学习得到的参数进行分类
149+
def predict1(self, test_X, test_y, vocabList, weigh):
150+
testMat = []
151+
for wordList in test_X:
152+
testMat.append(self.listOfWords2Vec(vocabList, wordList))
153+
testMatrix = np.matrix(testMat) ## matrix是二维的
154+
m = testMatrix.shape[0]
155+
hx = self.sigmoid(testMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率
120156
predict_y = []
121-
for i in range(len(testLabel)):
157+
for i in range(m): ## 对应0/1分类
122158
if hx[i][0] > 0.5:
123159
predict_y.append(1)
124160
else:
125161
predict_y.append(0)
162+
testLabel = np.array(test_y) ## array
126163
predictLabel = np.array(predict_y) ## array
127-
print 'accuracy:', sum(testLabel==predictLabel)/float(len(testLabel))
164+
print 'accuracy:', sum(testLabel==predictLabel)/float(m)
128165
return predictLabel
129166

130167
def loadTrainDataSet():
@@ -150,9 +187,13 @@ def loadTestDataSet():
150187
test_X, test_y = loadTestDataSet()
151188
clf = NaiveBayes()
152189
vocabList, param = clf.fit(train_X, train_y)
153-
results = clf.predict(test_X, test_y, vocabList, param)
190+
results = clf.predict(test_X, vocabList, param)
154191
print results
192+
results1 = clf.predict1(test_X, test_y, vocabList, param)
193+
print results1
155194
clf = LogisticRegression()
156195
vocabList, weigh = clf.fit(train_X, train_y)
157-
results = clf.predict(test_X, test_y, vocabList, weigh)
158-
print results
196+
results = clf.predict(test_X, vocabList, weigh)
197+
print results
198+
results1 = clf.predict1(test_X, test_y, vocabList, weigh)
199+
print results1

‎README.md‎

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,13 @@
1212

1313
* 针对文本分类的 LogisticRegression 算法
1414

15+
* 回归算法:
16+
* 标准的线性回归
17+
* 局部加权线性回归
18+
* 岭回归
19+
20+
结果示例:
21+
![image](./Regression/standRegresResults.png)
22+
![image](./Regression/lwlrResults.png)
23+
1524
参照:《机器学习实战》

‎Regression/RegressionTest.py‎

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#coding:utf-8
2+
import numpy as np
3+
import matplotlib.pyplot as plt
4+
5+
'''
6+
np.linalg Core Linear Algebra Tools
7+
xx.T 矩阵的转置
8+
xx.I 矩阵的逆
9+
m 样本点数
10+
n 特征维数
11+
'''
12+
13+
def loadDataSet(datafile):
14+
featData = []
15+
labelData = []
16+
with open(datafile, 'r') as fr_file:
17+
for eachLine in fr_file:
18+
oneLine = eachLine.split('\t')
19+
tempArr = []
20+
for i in range(len(oneLine)-1):
21+
tempArr.append(float(oneLine[i]))
22+
featData.append(tempArr)
23+
labelData.append(float(oneLine[-1].strip())) # float型连续变量
24+
featData = np.array(featData) # 转换为array
25+
labelData = np.array(labelData) # 转换为array
26+
return featData, labelData
27+
28+
def rssError(yArr, yHat):
29+
return np.sum((yArr-yHat)**2)
30+
31+
def showRegres(xArr, yArr, yHat):
32+
fig = plt.figure()
33+
ax = fig.add_subplot(111)
34+
ax.scatter(xArr[:, 1], yArr)
35+
'''
36+
因为数据假定了x0=1,因此yHat=ws[0]+ws[1]*x1,看yHat与x1之间的线性关系
37+
'''
38+
srtInd = xArr[:, 1].argsort(0)
39+
# print srtInd
40+
ax.plot(xArr[srtInd, 1], yHat[srtInd]) # 拟合前需要将点升序排列
41+
plt.show()
42+
43+
'''标准的线性回归:最小二乘法(平方误差最小),适用于m>=n情况'''
44+
def standRegres(xMat, yMat):
45+
xTx = xMat.T*xMat # n*n
46+
if np.linalg.det(xTx) == 0.0:
47+
print 'This matrix is singular, cannot do inverse'
48+
return
49+
## 方法1
50+
ws = xTx.I*(xMat.T*yMat) # n*1
51+
## 方法2
52+
# ws = np.linalg.solve(xTx, xMat.T*yMat) # n*1
53+
# yHat = xMat*ws # m*1
54+
return ws
55+
56+
def standRegresTest(xArr, yArr):
57+
xMat = np.matrix(xArr) # m*n
58+
yMat = np.matrix(yArr).T # m*1
59+
ws = standRegres(xMat, yMat) # n*1
60+
# print ws
61+
yHat = xMat*ws # m*1
62+
yHat = np.array(yHat).reshape(1, -1)[0] ## [[xx1][xx2]]二维matrix为[xx1, xx2]一维array
63+
return yHat
64+
65+
'''局部加权线性回归,适用于m>=n情况'''
66+
def lwlr(testPoint, xMat, yMat, k=1.0):
67+
m = np.shape(xMat)[0]
68+
weights = np.matrix(np.eye(m)) # 创建对角矩阵
69+
for j in range(m):
70+
diffMat = testPoint-xMat[j, :]
71+
weights[j, j] = np.exp(diffMat*diffMat.T/(-2.0*k**2)) # 高斯核
72+
print weights
73+
xTx = xMat.T*(weights*xMat)
74+
if np.linalg.det(xTx) == 0.0:
75+
print 'This matrix is singular, cannot do inverse'
76+
return
77+
ws = xTx.I*(xMat.T*(weights*yMat))
78+
return testPoint*ws
79+
80+
def lwlrTest(testArr, xArr, yArr, k=1.0):
81+
xMat = np.matrix(xArr) # m*n
82+
yMat = np.matrix(yArr).T # m*1
83+
m = np.shape(testArr)[0]
84+
yHat = np.zeros(m)
85+
for i in range(m):
86+
yHat[i] = lwlr(testArr[i], xMat, yMat, k)
87+
return yHat
88+
89+
'''岭回归,适用于m>=n及m<n情况'''
90+
def ridgeRegres(xMat, yMat, lam=0.2):
91+
xTx = xMat.T*xMat
92+
denom = xTx+np.eye(np.shape(xMat)[1])*lam
93+
if np.linalg.det(denom) == 0.0:
94+
print 'This matrix is singular, cannot do inverse'
95+
return
96+
ws = denom.I*(xMat.T*yMat)
97+
return ws
98+
99+
def ridgeTest(xArr, yArr):
100+
xMat = np.matrix(xArr)
101+
yMat = np.matrix(yArr).T
102+
'''标准化XY'''
103+
## regularize Y's
104+
yMean = np.mean(yMat, 0)
105+
yMat = yMat-yMean # to eliminate X0 take mean off of Y
106+
## regularize X's
107+
xMeans = np.mean(xMat, 0) # calc mean then subtract it off
108+
xVar = np.var(xMat, 0) # calc variance of Xi then divide by it
109+
xMat = (xMat-xMeans)/xVar
110+
'''计算wMat'''
111+
numTestPts = 30
112+
wMat = np.matrix(np.zeros((numTestPts, np.shape(xMat)[1])))
113+
for i in range(numTestPts):
114+
ws = ridgeRegres(xMat, yMat, np.exp(i-10))
115+
wMat[i, :] = ws.T
116+
return wMat
117+
118+
if __name__ == '__main__':
119+
####################################################################################
120+
## 标准的线性回归
121+
xArr, yArr = loadDataSet('ex.txt')
122+
yHat = standRegresTest(xArr, yArr)
123+
print yHat
124+
showRegres(xArr, yArr, yHat)
125+
coef = np.corrcoef(yArr, yHat)
126+
print coef
127+
print (coef[0, 1]+coef[1, 0])/2.0
128+
print rssError(yArr, yHat)
129+
####################################################################################
130+
## 局部加权线性回归
131+
xArr, yArr = loadDataSet('ex.txt')
132+
yHat = lwlrTest(xArr, xArr, yArr, k=0.01)
133+
print yHat
134+
showRegres(xArr, yArr, yHat)
135+
coef = np.corrcoef(yArr, yHat)
136+
print coef
137+
print (coef[0, 1]+coef[1, 0])/2.0
138+
print rssError(yArr, yHat)
139+
# '''寻找使相关系数最大的k'''
140+
# max_k = 0
141+
# max_coef = 0
142+
# for k in range(1, 100):
143+
# k /= 1000.0
144+
# yHat = lwlrTest(xArr, xArr, yArr, k)
145+
# coef = np.corrcoef(yArr, yHat)
146+
# temp_coef = (coef[0, 1]+coef[1, 0])/2.0
147+
# if temp_coef > max_coef:
148+
# max_coef = temp_coef
149+
# max_k = k
150+
# print max_k, max_coef
151+
# '''寻找使平方误差最小的k'''
152+
# min_k = 0
153+
# min_error = np.inf
154+
# for k in range(1, 100):
155+
# k /= 1000.0
156+
# yHat = lwlrTest(xArr, xArr, yArr, k)
157+
# temp_error = rssError(yArr, yHat)
158+
# if temp_error < min_error:
159+
# min_error = temp_error
160+
# min_k = k
161+
# print min_k, min_error
162+
####################################################################################
163+
## 岭回归
164+
xArr, yArr = loadDataSet('abalone.txt')
165+
wMat = ridgeTest(xArr, yArr)
166+
fig = plt.figure()
167+
ax = fig.add_subplot(111)
168+
ax.plot(wMat) # 描述回归系数与log(lam)的关系
169+
plt.show()
170+
'''
171+
在最左边时,lam为np.exp(0-10)=0,回归系数为原始值(即不缩减),跟标准的线性回归一致
172+
在最右边时,lam为np.exp(20-10)=e^10,回归系数全部缩减为0
173+
因此,在中间的某部分取值,lam能得到最好的预测效果,去掉不重要回归参数,参数的大小表示其重要性
174+
'''

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /