Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 0b2b686

Browse files
test
1 parent 551a58e commit 0b2b686

File tree

1 file changed

+210
-10
lines changed

1 file changed

+210
-10
lines changed

‎NB&LR/NaiveBayes_vs_LogisticRegression.py‎

Lines changed: 210 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,9 @@ def fit(self, train_x, train_y):
3030
trainLabel = np.array(train_y) ## array
3131
numTrainDocs = len(trainMatrix) # 统计样本个数
3232
numWords = len(trainMatrix[0]) # 统计特征个数,理论上是词库的长度
33-
## 计算p(c0),p(c1)
34-
p1 = sum(trainLabel)/float(numTrainDocs) # 对应p(c1)
35-
p0 = 1-p1 # 对应p(c0)
33+
## 计算p(c0),p(c1) ## 对应0/1分类
34+
p0 = sum(trainLabel==0)/float(numTrainDocs) # 对应p(c0)
35+
p1 = sum(trainLabel==1)/float(numTrainDocs) # 对应p(c1)
3636
## 计算p(wi|c0),p(wi|c1)
3737
p0Num = np.ones(numWords) # 初始样本个数为1,防止条件概率为0,影响结果
3838
p1Num = np.ones(numWords)
@@ -61,7 +61,7 @@ def predict(self, test_X, vocabList, param):
6161
for vec in testMatrix:
6262
prob_y0 = sum(vec*p0Vec)+np.log(p0) # 对应p(w1|c0)*p(w2|c0)*...*p(c0),log(a*b) = log(a)+log(b)
6363
prob_y1 = sum(vec*p1Vec)+np.log(p1) # 对应p(w1|c1)*p(w2|c1)*...*p(c1),log(a*b) = log(a)+log(b)
64-
if prob_y0 < prob_y1: ## 对应0/1分类,但是NaiveBayes可以修改成多分类
64+
if prob_y0 < prob_y1: ## 对应0/1分类
6565
predict_y.append(1)
6666
else:
6767
predict_y.append(0)
@@ -79,7 +79,7 @@ def predict1(self, test_X, test_y, vocabList, param):
7979
for vec in testMatrix:
8080
prob_y0 = sum(vec*p0Vec)+np.log(p0) # 对应p(w1|c0)*p(w2|c0)*...*p(c0),log(a*b) = log(a)+log(b)
8181
prob_y1 = sum(vec*p1Vec)+np.log(p1) # 对应p(w1|c1)*p(w2|c1)*...*p(c1),log(a*b) = log(a)+log(b)
82-
if prob_y0 < prob_y1: ## 对应0/1分类,但是NaiveBayes可以修改成多分类
82+
if prob_y0 < prob_y1: ## 对应0/1分类
8383
predict_y.append(1)
8484
else:
8585
predict_y.append(0)
@@ -123,7 +123,7 @@ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
123123
weigh = np.matrix(np.ones((n, 1))) # size: n*1
124124
for i in range(maxCycles):
125125
hx = self.sigmoid(trainMatrix*weigh) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率
126-
error = trainLabel-hx # size: m*1
126+
error = np.abs(trainLabel-hx) # size: m*1
127127
weigh += alpha*trainMatrix.T*error # size: n*1
128128
return vocabList, weigh
129129

@@ -142,7 +142,6 @@ def predict(self, test_X, vocabList, weigh):
142142
else:
143143
predict_y.append(0)
144144
predictLabel = np.array(predict_y) ## array
145-
# predictLabel = np.matrix(predict_y).T ## matrix
146145
return predictLabel
147146

148147
# 使用学习得到的参数进行分类
@@ -159,8 +158,8 @@ def predict1(self, test_X, test_y, vocabList, weigh):
159158
predict_y.append(1)
160159
else:
161160
predict_y.append(0)
162-
testLabel = np.array(test_y) ## array
163161
predictLabel = np.array(predict_y) ## array
162+
testLabel = np.array(test_y) ## array
164163
print 'accuracy:', sum(testLabel==predictLabel)/float(m)
165164
return predictLabel
166165

@@ -171,18 +170,204 @@ def loadTrainDataSet():
171170
['stop', 'posting', 'stupid', 'worthless', 'garbage'],
172171
['mr', 'licks','ate','my', 'steak', 'how', 'to', 'stop', 'him'],
173172
['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
174-
train_y = [0,1,0,1,0,1]# 0:good; 1: bad
173+
train_y = [0,1,0,1,0,1]
175174
return train_x, train_y
176175

177176
def loadTestDataSet():
178177
test_X = [['love', 'my', 'girl', 'friend'],
179178
['stupid', 'garbage'],
180179
['Haha', 'I', 'really', "Love", "You"],
181180
['This', 'is', "my", "dog"]]
182-
test_y = [0,1,0,0]# 0:good; 1: bad
181+
test_y = [0,1,0,0]
183182
return test_X, test_y
184183

184+
185+
class NaiveBayes_neg1_pos1():
    """Binary naive Bayes text classifier with labels in {-1, +1}.

    Uses a set-of-words feature model and Laplace (add-one) smoothing;
    class scores are compared in log space to avoid floating-point
    underflow from multiplying many small probabilities.
    """

    def __init__(self):
        pass

    def createVocabList(self, train_x):
        """Return the vocabulary: the union of all words seen in train_x."""
        vocabSet = set([])
        for wordList in train_x:
            vocabSet = vocabSet | set(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        """Convert a document (word list) to a 0/1 presence vector.

        Set-of-words model: each vocabulary slot is 1 if the word occurs
        at least once (use += 1 instead for a bag-of-words model).
        Words outside the vocabulary are silently ignored.
        """
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            if word in vocabList:
                wordsVec[vocabList.index(word)] = 1
        return wordsVec

    def fit(self, train_x, train_y):
        """Estimate the model from documents train_x and -1/+1 labels train_y.

        Returns (vocabList, param) where param = (p0, p1, p0Vec, p1Vec):
        the class priors p(c=-1), p(c=+1) and the per-word log conditional
        probabilities log p(wi|c) for each class.
        """
        vocabList = self.createVocabList(train_x)
        trainMat = []
        for wordList in train_x:
            trainMat.append(self.listOfWords2Vec(vocabList, wordList))
        trainMatrix = np.array(trainMat)
        trainLabel = np.array(train_y)
        numTrainDocs = len(trainMatrix)   # number of training documents
        numWords = len(trainMatrix[0])    # feature count = vocabulary size
        # Class priors for the -1/+1 label encoding.
        p0 = sum(trainLabel == -1) / float(numTrainDocs)  # p(c=-1)
        p1 = sum(trainLabel == 1) / float(numTrainDocs)   # p(c=+1)
        # Laplace smoothing: start every word count at 1 so no conditional
        # probability is exactly zero (which would veto a whole document).
        p0Num = np.ones(numWords)
        p1Num = np.ones(numWords)
        p0InAll = 2.0  # two classes, so the denominators start at 2
        p1InAll = 2.0
        for i in range(numTrainDocs):
            if trainLabel[i] == 1:
                p1Num += trainMatrix[i]
                p1InAll += sum(trainMatrix[i])
            else:
                p0Num += trainMatrix[i]
                p0InAll += sum(trainMatrix[i])
        p0Vec = np.log(p0Num / p0InAll)  # log p(wi | c=-1)
        p1Vec = np.log(p1Num / p1InAll)  # log p(wi | c=+1)
        ## bundle the parameters
        param = p0, p1, p0Vec, p1Vec
        return vocabList, param

    def _classify(self, testMatrix, param):
        """Return the -1/+1 label array for a 0/1 feature matrix.

        Shared by predict() and predict1() — the original duplicated this
        scoring loop in both methods.
        """
        p0, p1, p0Vec, p1Vec = param
        predict_y = []
        for vec in testMatrix:
            # log p(w1|c)+...+log p(wk|c)+log p(c): products become sums
            # in log space (log(a*b) = log(a)+log(b)).
            prob_y0 = sum(vec * p0Vec) + np.log(p0)
            prob_y1 = sum(vec * p1Vec) + np.log(p1)
            predict_y.append(1 if prob_y0 < prob_y1 else -1)
        return np.array(predict_y)

    def predict(self, test_X, vocabList, param):
        """Predict -1/+1 labels for the documents in test_X."""
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        return self._classify(np.array(testMat), param)

    def predict1(self, test_X, test_y, vocabList, param):
        """Predict labels and print accuracy against the true labels test_y."""
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        testMatrix = np.array(testMat)
        m = testMatrix.shape[0]
        predictLabel = self._classify(testMatrix, param)
        testLabel = np.array(test_y)
        # print() call form: the original Python 2 print statement is a
        # syntax error under Python 3; this form works on both.
        print('accuracy: ' + str(sum(testLabel == predictLabel) / float(m)))
        return predictLabel
271+
272+
class LogisticRegression_neg1_pos1():  # binary classification with -1/+1 labels
    """Binary logistic regression over word-set features, labels in {-1, +1}.

    The hypothesis is rescaled to h = 2*sigmoid(x.w) - 1 so its range
    (-1, 1) matches the label encoding; the decision threshold is 0
    (equivalent to sigmoid > 0.5).
    """

    def __init__(self):
        pass

    def createVocabList(self, train_x):
        """Return the vocabulary: the union of all words seen in train_x."""
        vocabSet = set([])
        for wordList in train_x:
            vocabSet = vocabSet | set(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        """Convert a document (word list) to a 0/1 presence vector.

        Set-of-words model: each vocabulary slot is 1 if the word occurs
        at least once (use += 1 instead for a bag-of-words model).
        Words outside the vocabulary are silently ignored.
        """
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            if word in vocabList:
                wordsVec[vocabList.index(word)] = 1
        return wordsVec

    def sigmoid(self, inX):
        """Logistic function, mapping any real input into (0, 1)."""
        return 1.0 / (1 + np.exp(-inX))

    # Train with gradient ascent; alpha is the step size (learning rate),
    # maxCycles the number of iterations.
    def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
        """Learn the weight vector; returns (vocabList, weigh) with weigh n*1."""
        vocabList = self.createVocabList(train_x)
        trainMat = []
        for wordList in train_x:
            trainMat.append(self.listOfWords2Vec(vocabList, wordList))
        trainMatrix = np.matrix(trainMat)       # size: m*n
        trainLabel = np.matrix(train_y).T       # size: m*1
        m, n = trainMatrix.shape
        weigh = np.matrix(np.ones((n, 1)))      # size: n*1
        for i in range(maxCycles):
            # Rescaled hypothesis in (-1, 1) to match the -1/+1 labels.
            hx = self.sigmoid(trainMatrix * weigh) * 2 - 1  # size: m*1
            # BUG FIX: the error must keep its sign so each update moves the
            # weights toward the labels. The previous np.abs(trainLabel - hx)
            # made every update non-negative, so the weights could only grow
            # and the negative class was never learned.
            error = trainLabel - hx             # size: m*1
            weigh += alpha * trainMatrix.T * error  # size: n*1
        return vocabList, weigh

    # Classify using the learned parameters.
    def predict(self, test_X, vocabList, weigh):
        """Predict -1/+1 labels for the documents in test_X."""
        testMat = []
        for wordList in test_X:
            testMat.append(self.listOfWords2Vec(vocabList, wordList))
        testMatrix = np.matrix(testMat)
        m = testMatrix.shape[0]
        hx = self.sigmoid(testMatrix * weigh) * 2 - 1
        predict_y = []
        for i in range(m):  # -1/+1 decision
            # h > 0 corresponds to sigmoid > 0.5, i.e. the positive class.
            if hx[i, 0] > 0.0:
                predict_y.append(1)
            else:
                predict_y.append(-1)
        predictLabel = np.array(predict_y)
        return predictLabel

    # Classify using the learned parameters and report accuracy.
    def predict1(self, test_X, test_y, vocabList, weigh):
        """Predict labels and print accuracy against the true labels test_y."""
        predictLabel = self.predict(test_X, vocabList, weigh)
        testLabel = np.array(test_y)
        m = len(predictLabel)
        # print() call form: the original Python 2 print statement is a
        # syntax error under Python 3; this form works on both.
        print('accuracy: ' + str(sum(testLabel == predictLabel) / float(m)))
        return predictLabel
349+
350+
def loadTrainDataSet_neg1_pos1():
    """Return (documents, labels) for the toy training set; labels are -1/+1."""
    docs = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [-1, 1, -1, 1, -1, 1]
    return docs, labels
359+
360+
def loadTestDataSet_neg1_pos1():
    """Return (documents, labels) for the toy test set; labels are -1/+1."""
    docs = [
        ['love', 'my', 'girl', 'friend'],
        ['stupid', 'garbage'],
        ['Haha', 'I', 'really', 'Love', 'You'],
        ['This', 'is', 'my', 'dog'],
    ]
    labels = [-1, 1, -1, -1]
    return docs, labels
367+
368+
185369
if __name__ == '__main__':
370+
186371
train_X, train_y = loadTrainDataSet()
187372
test_X, test_y = loadTestDataSet()
188373
clf = NaiveBayes()
@@ -196,4 +381,19 @@ def loadTestDataSet():
196381
results = clf.predict(test_X, vocabList, weigh)
197382
print results
198383
results1 = clf.predict1(test_X, test_y, vocabList, weigh)
384+
print results1
385+
386+
train_X, train_y = loadTrainDataSet_neg1_pos1()
387+
test_X, test_y = loadTestDataSet_neg1_pos1()
388+
clf = NaiveBayes_neg1_pos1()
389+
vocabList, param = clf.fit(train_X, train_y)
390+
results = clf.predict(test_X, vocabList, param)
391+
print results
392+
results1 = clf.predict1(test_X, test_y, vocabList, param)
393+
print results1
394+
clf = LogisticRegression_neg1_pos1()
395+
vocabList, weigh = clf.fit(train_X, train_y)
396+
results = clf.predict(test_X, vocabList, weigh)
397+
print results
398+
results1 = clf.predict1(test_X, test_y, vocabList, weigh)
199399
print results1

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /