@@ -30,9 +30,9 @@ def fit(self, train_x, train_y):
30
30
trainLabel = np .array (train_y ) ## array
31
31
numTrainDocs = len (trainMatrix ) # 统计样本个数
32
32
numWords = len (trainMatrix [0 ]) # 统计特征个数,理论上是词库的长度
33
- ## 计算p(c0),p(c1)
34
- p1 = sum (trainLabel )/ float (numTrainDocs ) # 对应p(c1 )
35
- p0 = 1 - p1 # 对应p(c0 )
33
+ ## 计算p(c0),p(c1) ## 对应0/1分类
34
+ p0 = sum (trainLabel == 0 )/ float (numTrainDocs ) # 对应p(c0 )
35
+ p1 = sum ( trainLabel == 1 ) / float ( numTrainDocs ) # 对应p(c1 )
36
36
## 计算p(wi|c0),p(wi|c1)
37
37
p0Num = np .ones (numWords ) # 初始样本个数为1,防止条件概率为0,影响结果
38
38
p1Num = np .ones (numWords )
@@ -61,7 +61,7 @@ def predict(self, test_X, vocabList, param):
61
61
for vec in testMatrix :
62
62
prob_y0 = sum (vec * p0Vec )+ np .log (p0 ) # 对应p(w1|c0)*p(w2|c0)*...*p(c0),log(a*b) = log(a)+log(b)
63
63
prob_y1 = sum (vec * p1Vec )+ np .log (p1 ) # 对应p(w1|c1)*p(w2|c1)*...*p(c1),log(a*b) = log(a)+log(b)
64
- if prob_y0 < prob_y1 : ## 对应0/1分类,但是NaiveBayes可以修改成多分类
64
+ if prob_y0 < prob_y1 : ## 对应0/1分类
65
65
predict_y .append (1 )
66
66
else :
67
67
predict_y .append (0 )
@@ -79,7 +79,7 @@ def predict1(self, test_X, test_y, vocabList, param):
79
79
for vec in testMatrix :
80
80
prob_y0 = sum (vec * p0Vec )+ np .log (p0 ) # 对应p(w1|c0)*p(w2|c0)*...*p(c0),log(a*b) = log(a)+log(b)
81
81
prob_y1 = sum (vec * p1Vec )+ np .log (p1 ) # 对应p(w1|c1)*p(w2|c1)*...*p(c1),log(a*b) = log(a)+log(b)
82
- if prob_y0 < prob_y1 : ## 对应0/1分类,但是NaiveBayes可以修改成多分类
82
+ if prob_y0 < prob_y1 : ## 对应0/1分类
83
83
predict_y .append (1 )
84
84
else :
85
85
predict_y .append (0 )
@@ -123,7 +123,7 @@ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
123
123
weigh = np .matrix (np .ones ((n , 1 ))) # size: n*1
124
124
for i in range (maxCycles ):
125
125
hx = self .sigmoid (trainMatrix * weigh ) # size: m*1 sigmoid把线性回归转换到[0,1]之间,对应概率
126
- error = trainLabel - hx # size: m*1
126
+ error = np . abs ( trainLabel - hx ) # size: m*1
127
127
weigh += alpha * trainMatrix .T * error # size: n*1
128
128
return vocabList , weigh
129
129
@@ -142,7 +142,6 @@ def predict(self, test_X, vocabList, weigh):
142
142
else :
143
143
predict_y .append (0 )
144
144
predictLabel = np .array (predict_y ) ## array
145
- # predictLabel = np.matrix(predict_y).T ## matrix
146
145
return predictLabel
147
146
148
147
# 使用学习得到的参数进行分类
@@ -159,8 +158,8 @@ def predict1(self, test_X, test_y, vocabList, weigh):
159
158
predict_y .append (1 )
160
159
else :
161
160
predict_y .append (0 )
162
- testLabel = np .array (test_y ) ## array
163
161
predictLabel = np .array (predict_y ) ## array
162
+ testLabel = np .array (test_y ) ## array
164
163
print 'accuracy:' , sum (testLabel == predictLabel )/ float (m )
165
164
return predictLabel
166
165
@@ -171,18 +170,204 @@ def loadTrainDataSet():
171
170
['stop' , 'posting' , 'stupid' , 'worthless' , 'garbage' ],
172
171
['mr' , 'licks' ,'ate' ,'my' , 'steak' , 'how' , 'to' , 'stop' , 'him' ],
173
172
['quit' , 'buying' , 'worthless' , 'dog' , 'food' , 'stupid' ]]
174
- train_y = [0 ,1 ,0 ,1 ,0 ,1 ]# 0:good; 1: bad
173
+ train_y = [0 ,1 ,0 ,1 ,0 ,1 ]
175
174
return train_x , train_y
176
175
177
176
def loadTestDataSet():
    """Return the toy evaluation set used with the 0/1 classifiers.

    Returns:
        (test_X, test_y): ``test_X`` is a list of tokenised documents
        (lists of words) and ``test_y`` the matching list of 0/1 labels.
    """
    # Each entry pairs a label with its document so the two cannot drift apart.
    labelled_docs = [
        (0, ['love', 'my', 'girl', 'friend']),
        (1, ['stupid', 'garbage']),
        (0, ['Haha', 'I', 'really', "Love", "You"]),
        (0, ['This', 'is', "my", "dog"]),
    ]
    test_X = [words for _, words in labelled_docs]
    test_y = [label for label, _ in labelled_docs]
    return test_X, test_y
184
183
184
+
185
class NaiveBayes_neg1_pos1():
    """Naive Bayes text classifier for the label convention -1 / +1.

    Documents are lists of words. They are encoded as set-of-words vectors
    (1 if a vocabulary word is present, else 0) and scored in log space.
    """

    def __init__(self):
        pass

    def createVocabList(self, train_x):
        """Return the distinct words of all documents as a list.

        The order of the returned list is whatever ``set`` iteration yields.
        """
        vocabSet = set()
        for wordList in train_x:
            vocabSet.update(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        """Encode ``wordList`` as a 0/1 presence vector over ``vocabList``.

        Words that are not in the vocabulary are ignored.
        """
        # One dict lookup per word instead of `in` + `.index()` on the list.
        # setdefault keeps the FIRST position of a duplicated vocabulary word,
        # which is what list.index() would have returned.
        position = {}
        for idx, word in enumerate(vocabList):
            position.setdefault(word, idx)
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            idx = position.get(word)
            if idx is not None:
                wordsVec[idx] = 1  # set-of-words model (presence only)
        return wordsVec

    def fit(self, train_x, train_y):
        """Estimate class priors and per-word log-likelihoods.

        Args:
            train_x: list of documents, each a list of words.
            train_y: list of labels; +1 selects class 1, -1 is counted for
                the class-0 prior.

        Returns:
            (vocabList, param) where ``param`` is the tuple
            ``(p0, p1, p0Vec, p1Vec)``: the two priors and the log of the
            smoothed word probabilities for class 0 and class 1.

        Raises:
            IndexError: if ``train_x`` is empty.
        """
        vocabList = self.createVocabList(train_x)
        trainMatrix = np.array(
            [self.listOfWords2Vec(vocabList, wordList) for wordList in train_x])
        trainLabel = np.array(train_y)
        numTrainDocs = len(trainMatrix)   # number of samples
        numWords = len(trainMatrix[0])    # number of features (vocabulary size)

        # Class priors p(c0), p(c1) for the -1 / +1 labelling.
        p0 = np.sum(trainLabel == -1) / float(numTrainDocs)
        p1 = np.sum(trainLabel == 1) / float(numTrainDocs)

        # Word counts per class. Every row whose label is not +1 goes to
        # class 0, mirroring the if/else split of the row-by-row formulation.
        isPos = trainLabel == 1
        posRows = trainMatrix[isPos]
        negRows = trainMatrix[~isPos]
        # Counts start at 1 and totals at 2.0 so no conditional probability
        # is ever zero (which would make the log undefined).
        p1Num = np.ones(numWords) + posRows.sum(axis=0)
        p0Num = np.ones(numWords) + negRows.sum(axis=0)
        p1InAll = 2.0 + posRows.sum()
        p0InAll = 2.0 + negRows.sum()

        p0Vec = np.log(p0Num / p0InAll)   # log p(wi | c0)
        p1Vec = np.log(p1Num / p1InAll)   # log p(wi | c1)

        param = p0, p1, p0Vec, p1Vec
        return vocabList, param

    def predict(self, test_X, vocabList, param):
        """Classify each document of ``test_X`` as -1 or +1.

        Args:
            test_X: list of documents, each a list of words.
            vocabList: vocabulary returned by ``fit``.
            param: ``(p0, p1, p0Vec, p1Vec)`` returned by ``fit``.

        Returns:
            numpy array with one label per document; +1 when the class-1
            log score is strictly greater than the class-0 score, else -1.
        """
        p0, p1, p0Vec, p1Vec = param
        testMat = [self.listOfWords2Vec(vocabList, wordList) for wordList in test_X]
        # reshape keeps the matrix 2-D even when test_X is empty.
        testMatrix = np.array(testMat).reshape(len(testMat), len(vocabList))
        # log(a*b) = log(a) + log(b): sum the log-likelihoods of the words
        # that are present and add the log prior.
        score0 = testMatrix.dot(p0Vec) + np.log(p0)
        score1 = testMatrix.dot(p1Vec) + np.log(p1)
        return np.where(score0 < score1, 1, -1)

    def predict1(self, test_X, test_y, vocabList, param):
        """Classify ``test_X``, print the accuracy against ``test_y``.

        Returns:
            The same label array ``predict`` returns.
        """
        predictLabel = self.predict(test_X, vocabList, param)
        testLabel = np.array(test_y)
        m = len(predictLabel)
        accuracy = int(np.sum(testLabel == predictLabel)) / float(m)
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('accuracy: %s' % accuracy)
        return predictLabel
271
+
272
class LogisticRegression_neg1_pos1():
    """Binary logistic-regression text classifier for labels -1 / +1.

    Documents are lists of words encoded as set-of-words (0/1 presence)
    vectors. The sigmoid output is rescaled from (0, 1) to (-1, 1) so it can
    be compared directly with the -1 / +1 labels.
    """

    def __init__(self):
        pass

    def createVocabList(self, train_x):
        """Return the distinct words of all documents as a list.

        The order of the returned list is whatever ``set`` iteration yields.
        """
        vocabSet = set()
        for wordList in train_x:
            vocabSet.update(wordList)
        return list(vocabSet)

    def listOfWords2Vec(self, vocabList, wordList):
        """Encode ``wordList`` as a 0/1 presence vector over ``vocabList``.

        Words that are not in the vocabulary are ignored.
        """
        # One dict lookup per word instead of `in` + `.index()` on the list.
        # setdefault keeps the FIRST position of a duplicated vocabulary word,
        # which is what list.index() would have returned.
        position = {}
        for idx, word in enumerate(vocabList):
            position.setdefault(word, idx)
        wordsVec = [0] * len(vocabList)
        for word in wordList:
            idx = position.get(word)
            if idx is not None:
                wordsVec[idx] = 1  # set-of-words model (presence only)
        return wordsVec

    def sigmoid(self, inX):
        """Return the element-wise logistic function 1 / (1 + exp(-inX))."""
        return 1.0 / (1 + np.exp(-inX))

    def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):
        """Train the weights with batch gradient steps.

        Args:
            train_x: list of documents, each a list of words.
            train_y: list of -1 / +1 labels, one per document.
            alpha: step size (learning rate).
            maxCycles: number of full-batch update iterations.

        Returns:
            (vocabList, weigh) where ``weigh`` is an n x 1 ``np.matrix`` of
            weights, n being the vocabulary size.
        """
        vocabList = self.createVocabList(train_x)
        trainMat = [self.listOfWords2Vec(vocabList, wordList) for wordList in train_x]
        trainMatrix = np.matrix(trainMat)      # size: m*n
        trainLabel = np.matrix(train_y).T      # size: m*1
        n = trainMatrix.shape[1]
        weigh = np.matrix(np.ones((n, 1)))     # size: n*1
        for _ in range(maxCycles):
            # Rescale the sigmoid from (0, 1) to (-1, 1) to match the labels.
            hx = self.sigmoid(trainMatrix * weigh) * 2 - 1   # size: m*1
            # The residual must keep its sign: a negative residual lowers the
            # weights of the words in that sample. Taking the absolute value
            # would push every weight upward regardless of the label, so the
            # -1 class could never be learned.
            error = trainLabel - hx                          # size: m*1
            weigh += alpha * trainMatrix.T * error           # size: n*1
        return vocabList, weigh

    def predict(self, test_X, vocabList, weigh):
        """Classify each document of ``test_X`` as -1 or +1.

        Args:
            test_X: list of documents, each a list of words.
            vocabList: vocabulary returned by ``fit``.
            weigh: n x 1 weight matrix returned by ``fit``.

        Returns:
            numpy array with one label per document; +1 when the rescaled
            sigmoid output is strictly positive, else -1.
        """
        testMat = [self.listOfWords2Vec(vocabList, wordList) for wordList in test_X]
        testMatrix = np.matrix(testMat)
        hx = self.sigmoid(testMatrix * weigh) * 2 - 1
        # Flatten the m x 1 matrix so the threshold is applied per sample.
        scores = np.asarray(hx).ravel()
        return np.where(scores > 0.0, 1, -1)

    def predict1(self, test_X, test_y, vocabList, weigh):
        """Classify ``test_X``, print the accuracy against ``test_y``.

        Returns:
            The same label array ``predict`` returns.
        """
        predictLabel = self.predict(test_X, vocabList, weigh)
        testLabel = np.array(test_y)
        m = len(predictLabel)
        accuracy = int(np.sum(testLabel == predictLabel)) / float(m)
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('accuracy: %s' % accuracy)
        return predictLabel
349
+
350
def loadTrainDataSet_neg1_pos1():
    """Return the toy training set labelled with -1 / +1.

    Returns:
        (train_x, train_y): ``train_x`` is a list of tokenised documents
        (lists of words) and ``train_y`` the matching list of -1/+1 labels.
    """
    # Each entry pairs a label with its document so the two cannot drift apart.
    labelled_docs = [
        (-1, ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please']),
        (1, ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid']),
        (-1, ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him']),
        (1, ['stop', 'posting', 'stupid', 'worthless', 'garbage']),
        (-1, ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him']),
        (1, ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']),
    ]
    train_x = [words for _, words in labelled_docs]
    train_y = [label for label, _ in labelled_docs]
    return train_x, train_y
359
+
360
def loadTestDataSet_neg1_pos1():
    """Return the toy evaluation set labelled with -1 / +1.

    Returns:
        (test_X, test_y): ``test_X`` is a list of tokenised documents
        (lists of words) and ``test_y`` the matching list of -1/+1 labels.
    """
    # Each entry pairs a label with its document so the two cannot drift apart.
    labelled_docs = [
        (-1, ['love', 'my', 'girl', 'friend']),
        (1, ['stupid', 'garbage']),
        (-1, ['Haha', 'I', 'really', "Love", "You"]),
        (-1, ['This', 'is', "my", "dog"]),
    ]
    test_X = [words for _, words in labelled_docs]
    test_y = [label for label, _ in labelled_docs]
    return test_X, test_y
367
+
368
+
185
369
if __name__ == '__main__' :
370
+
186
371
train_X , train_y = loadTrainDataSet ()
187
372
test_X , test_y = loadTestDataSet ()
188
373
clf = NaiveBayes ()
@@ -196,4 +381,19 @@ def loadTestDataSet():
196
381
results = clf .predict (test_X , vocabList , weigh )
197
382
print results
198
383
results1 = clf .predict1 (test_X , test_y , vocabList , weigh )
384
+ print results1
385
+
386
+ train_X , train_y = loadTrainDataSet_neg1_pos1 ()
387
+ test_X , test_y = loadTestDataSet_neg1_pos1 ()
388
+ clf = NaiveBayes_neg1_pos1 ()
389
+ vocabList , param = clf .fit (train_X , train_y )
390
+ results = clf .predict (test_X , vocabList , param )
391
+ print results
392
+ results1 = clf .predict1 (test_X , test_y , vocabList , param )
393
+ print results1
394
+ clf = LogisticRegression_neg1_pos1 ()
395
+ vocabList , weigh = clf .fit (train_X , train_y )
396
+ results = clf .predict (test_X , vocabList , weigh )
397
+ print results
398
+ results1 = clf .predict1 (test_X , test_y , vocabList , weigh )
199
399
print results1
0 commit comments