Commit 0b2b686

committed

test

1 parent 551a58e commit 0b2b686Copy full SHA for 0b2b686

File tree

1 file changed

+210

-10

lines changed

NB&LR
- NaiveBayes_vs_LogisticRegression.py

1 file changed

+210

-10

lines changed

`‎NB&LR/NaiveBayes_vs_LogisticRegression.py‎`

Lines changed: 210 additions & 10 deletions

Original file line number	Diff line number	Diff line change
`@@ -30,9 +30,9 @@ def fit(self, train_x, train_y):`
`30`	`30`	`trainLabel = np.array(train_y) ## array`
`31`	`31`	`numTrainDocs = len(trainMatrix) # 统计样本个数`
`32`	`32`	`numWords = len(trainMatrix[0]) # 统计特征个数,理论上是词库的长度`
`33`		`- ## 计算p(c0),p(c1)`
`34`		`- p1 = sum(trainLabel)/float(numTrainDocs) # 对应p(c1)`
`35`		`- p0 = 1-p1 # 对应p(c0)`
	`33`	`+ ## 计算p(c0),p(c1) ## 对应0/1分类`
	`34`	`+ p0 = sum(trainLabel==0)/float(numTrainDocs) # 对应p(c0)`
	`35`	`+ p1 = sum(trainLabel==1)/float(numTrainDocs) # 对应p(c1)`
`36`	`36`	`## 计算p(wi\|c0),p(wi\|c1)`
`37`	`37`	`p0Num = np.ones(numWords) # 初始样本个数为1,防止条件概率为0,影响结果`
`38`	`38`	`p1Num = np.ones(numWords)`
`@@ -61,7 +61,7 @@ def predict(self, test_X, vocabList, param):`
`61`	`61`	`for vec in testMatrix:`
`62`	`62`	`prob_y0 = sum(vecp0Vec)+np.log(p0) # 对应p(w1\|c0)p(w2\|c0)...p(c0),log(a*b) = log(a)+log(b)`
`63`	`63`	`prob_y1 = sum(vecp1Vec)+np.log(p1) # 对应p(w1\|c1)p(w2\|c1)...p(c1),log(a*b) = log(a)+log(b)`
`64`		`- if prob_y0 < prob_y1: ## 对应0/1分类,但是NaiveBayes可以修改成多分类`
	`64`	`+ if prob_y0 < prob_y1: ## 对应0/1分类`
`65`	`65`	`predict_y.append(1)`
`66`	`66`	`else:`
`67`	`67`	`predict_y.append(0)`
`@@ -79,7 +79,7 @@ def predict1(self, test_X, test_y, vocabList, param):`
`79`	`79`	`for vec in testMatrix:`
`80`	`80`	`prob_y0 = sum(vecp0Vec)+np.log(p0) # 对应p(w1\|c0)p(w2\|c0)...p(c0),log(a*b) = log(a)+log(b)`
`81`	`81`	`prob_y1 = sum(vecp1Vec)+np.log(p1) # 对应p(w1\|c1)p(w2\|c1)...p(c1),log(a*b) = log(a)+log(b)`
`82`		`- if prob_y0 < prob_y1: ## 对应0/1分类,但是NaiveBayes可以修改成多分类`
	`82`	`+ if prob_y0 < prob_y1: ## 对应0/1分类`
`83`	`83`	`predict_y.append(1)`
`84`	`84`	`else:`
`85`	`85`	`predict_y.append(0)`
`@@ -123,7 +123,7 @@ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):`
`123`	`123`	`weigh = np.matrix(np.ones((n, 1))) # size: n*1`
`124`	`124`	`for i in range(maxCycles):`
`125`	`125`	`hx = self.sigmoid(trainMatrixweigh) # size: m1 sigmoid把线性回归转换到[0,1]之间,对应概率`
`126`		`- error = trainLabel-hx # size: m*1`
	`126`	`+ error = np.abs(trainLabel-hx) # size: m*1`
`127`	`127`	`weigh += alphatrainMatrix.Terror # size: n*1`
`128`	`128`	`return vocabList, weigh`
`129`	`129`
`@@ -142,7 +142,6 @@ def predict(self, test_X, vocabList, weigh):`
`142`	`142`	`else:`
`143`	`143`	`predict_y.append(0)`
`144`	`144`	`predictLabel = np.array(predict_y) ## array`
`145`		`- # predictLabel = np.matrix(predict_y).T ## matrix`
`146`	`145`	`return predictLabel`
`147`	`146`
`148`	`147`	`# 使用学习得到的参数进行分类`
`@@ -159,8 +158,8 @@ def predict1(self, test_X, test_y, vocabList, weigh):`
`159`	`158`	`predict_y.append(1)`
`160`	`159`	`else:`
`161`	`160`	`predict_y.append(0)`
`162`		`- testLabel = np.array(test_y) ## array`
`163`	`161`	`predictLabel = np.array(predict_y) ## array`
	`162`	`+ testLabel = np.array(test_y) ## array`
`164`	`163`	`print 'accuracy:', sum(testLabel==predictLabel)/float(m)`
`165`	`164`	`return predictLabel`
`166`	`165`
`@@ -171,18 +170,204 @@ def loadTrainDataSet():`
`171`	`170`	`['stop', 'posting', 'stupid', 'worthless', 'garbage'],`
`172`	`171`	`['mr', 'licks','ate','my', 'steak', 'how', 'to', 'stop', 'him'],`
`173`	`172`	`['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]`
`174`		`- train_y = [0,1,0,1,0,1]# 0:good; 1: bad`
	`173`	`+ train_y = [0,1,0,1,0,1]`
`175`	`174`	`return train_x, train_y`
`176`	`175`
def loadTestDataSet():
    """Return the small hand-written test corpus with 0/1 labels.

    Returns:
        (test_X, test_y): ``test_X`` is a list of documents, each a list of
        word tokens; ``test_y`` holds one label per document
        (0: good, 1: bad).
    """
    labelled_docs = (
        (0, ['love', 'my', 'girl', 'friend']),
        (1, ['stupid', 'garbage']),
        (0, ['Haha', 'I', 'really', 'Love', 'You']),
        (0, ['This', 'is', 'my', 'dog']),
    )
    test_X = [words for _, words in labelled_docs]
    test_y = [label for label, _ in labelled_docs]
    return test_X, test_y
`184`	`183`
	`184`	`+`
	`185`	`+class NaiveBayes_neg1_pos1():`
	`186`	`+ def __init__(self):`
	`187`	`+ pass`
	`188`	`+`
	`189`	`+ def createVocabList(self, train_x):`
	`190`	`+ vocabSet = set([])`
	`191`	`+ for wordList in train_x:`
	`192`	`+ vocabSet = vocabSet \| set(wordList)`
	`193`	`+ return list(vocabSet)`
	`194`	`+`
	`195`	`+ def listOfWords2Vec(self, vocabList, wordList):`
	`196`	`+ wordsVec = [0] * len(vocabList)`
	`197`	`+ for word in wordList:`
	`198`	`+ if word in vocabList:`
	`199`	`+ wordsVec[vocabList.index(word)] = 1 # 词集模型`
	`200`	`+ # wordsVec[vocabList.index(word)] += 1 # 词袋模型`
	`201`	`+ # else:`
	`202`	`+ # print "the word:%s is not in my vocabulary!" % word`
	`203`	`+ return wordsVec`
	`204`	`+`
	`205`	`+ def fit(self, train_x, train_y):`
	`206`	`+ vocabList = self.createVocabList(train_x)`
	`207`	`+ trainMat = []`
	`208`	`+ for wordList in train_x:`
	`209`	`+ trainMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`210`	`+ trainMatrix = np.array(trainMat) ## array`
	`211`	`+ trainLabel = np.array(train_y) ## array`
	`212`	`+ numTrainDocs = len(trainMatrix) # 统计样本个数`
	`213`	`+ numWords = len(trainMatrix[0]) # 统计特征个数,理论上是词库的长度`
	`214`	`+ ## 计算p(c0),p(c1) ## 对应-1/1分类`
	`215`	`+ p0 = sum(trainLabel==-1)/float(numTrainDocs) # 对应p(c0)`
	`216`	`+ p1 = sum(trainLabel==1)/float(numTrainDocs) # 对应p(c1)`
	`217`	`+ ## 计算p(wi\|c0),p(wi\|c1)`
	`218`	`+ p0Num = np.ones(numWords) # 初始样本个数为1,防止条件概率为0,影响结果`
	`219`	`+ p1Num = np.ones(numWords)`
	`220`	`+ p0InAll = 2.0 # 词库中只有两类,所以此处初始化为2`
	`221`	`+ p1InAll = 2.0`
	`222`	`+ for i in range(numTrainDocs):`
	`223`	`+ if trainLabel[i] == 1:`
	`224`	`+ p1Num += trainMatrix[i]`
	`225`	`+ p1InAll += sum(trainMatrix[i])`
	`226`	`+ else:`
	`227`	`+ p0Num += trainMatrix[i]`
	`228`	`+ p0InAll += sum(trainMatrix[i])`
	`229`	`+ p0Vec = np.log(p0Num/p0InAll) # 对应p(wi\|c0)`
	`230`	`+ p1Vec = np.log(p1Num/p1InAll) # 对应p(wi\|c1)`
	`231`	`+ ## 整合参数`
	`232`	`+ param = p0, p1, p0Vec, p1Vec`
	`233`	`+ return vocabList, param`
	`234`	`+`
	`235`	`+ def predict(self, test_X, vocabList, param):`
	`236`	`+ p0, p1, p0Vec, p1Vec = param`
	`237`	`+ testMat = []`
	`238`	`+ for wordList in test_X:`
	`239`	`+ testMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`240`	`+ testMatrix = np.array(testMat) ## array`
	`241`	`+ predict_y = []`
	`242`	`+ for vec in testMatrix:`
	`243`	`+ prob_y0 = sum(vecp0Vec)+np.log(p0) # 对应p(w1\|c0)p(w2\|c0)...p(c0),log(a*b) = log(a)+log(b)`
	`244`	`+ prob_y1 = sum(vecp1Vec)+np.log(p1) # 对应p(w1\|c1)p(w2\|c1)...p(c1),log(a*b) = log(a)+log(b)`
	`245`	`+ if prob_y0 < prob_y1: ## 对应-1/1分类,但是NaiveBayes可以修改成多分类`
	`246`	`+ predict_y.append(1)`
	`247`	`+ else:`
	`248`	`+ predict_y.append(-1)`
	`249`	`+ predictLabel = np.array(predict_y) ## array`
	`250`	`+ return predictLabel`
	`251`	`+`
	`252`	`+ def predict1(self, test_X, test_y, vocabList, param):`
	`253`	`+ p0, p1, p0Vec, p1Vec = param`
	`254`	`+ testMat = []`
	`255`	`+ for wordList in test_X:`
	`256`	`+ testMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`257`	`+ testMatrix = np.array(testMat) ## array`
	`258`	`+ m = testMatrix.shape[0]`
	`259`	`+ predict_y = []`
	`260`	`+ for vec in testMatrix:`
	`261`	`+ prob_y0 = sum(vecp0Vec)+np.log(p0) # 对应p(w1\|c0)p(w2\|c0)...p(c0),log(a*b) = log(a)+log(b)`
	`262`	`+ prob_y1 = sum(vecp1Vec)+np.log(p1) # 对应p(w1\|c1)p(w2\|c1)...p(c1),log(a*b) = log(a)+log(b)`
	`263`	`+ if prob_y0 < prob_y1: ## 对应-1/1分类,但是NaiveBayes可以修改成多分类`
	`264`	`+ predict_y.append(1)`
	`265`	`+ else:`
	`266`	`+ predict_y.append(-1)`
	`267`	`+ testLabel = np.array(test_y) ## array`
	`268`	`+ predictLabel = np.array(predict_y) ## array`
	`269`	`+ print 'accuracy:', sum(testLabel==predictLabel)/float(m)`
	`270`	`+ return predictLabel`
	`271`	`+`
	`272`	`+class LogisticRegression_neg1_pos1(): # 二分类,-1/1分类`
	`273`	`+ def __init__(self):`
	`274`	`+ pass`
	`275`	`+`
	`276`	`+ def createVocabList(self, train_x):`
	`277`	`+ vocabSet = set([])`
	`278`	`+ for wordList in train_x:`
	`279`	`+ vocabSet = vocabSet \| set(wordList)`
	`280`	`+ return list(vocabSet)`
	`281`	`+`
	`282`	`+ def listOfWords2Vec(self, vocabList, wordList):`
	`283`	`+ wordsVec = [0] * len(vocabList)`
	`284`	`+ for word in wordList:`
	`285`	`+ if word in vocabList:`
	`286`	`+ wordsVec[vocabList.index(word)] = 1 # 词集模型`
	`287`	`+ # wordsVec[vocabList.index(word)] += 1 # 词袋模型`
	`288`	`+ # else:`
	`289`	`+ # print "the word:%s is not in my vocabulary!" % word`
	`290`	`+ return wordsVec`
	`291`	`+`
	`292`	`+ def sigmoid(self, inX):`
	`293`	`+ return 1.0/(1 + np.exp(-inX))`
	`294`	`+`
	`295`	`+ # 使用梯度下降方法训练模型,alpha为步长(学习率),maxCycles最大迭代次数`
	`296`	`+ def fit(self, train_x, train_y, alpha=0.01, maxCycles=100):`
	`297`	`+ vocabList = self.createVocabList(train_x)`
	`298`	`+ trainMat = []`
	`299`	`+ for wordList in train_x:`
	`300`	`+ trainMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`301`	`+ trainMatrix = np.matrix(trainMat) ## matrix是二维的 # size: m*n`
	`302`	`+ trainLabel = np.matrix(train_y).T ## matrix是二维的 # size: m*1`
	`303`	`+ m, n = trainMatrix.shape`
	`304`	`+ weigh = np.matrix(np.ones((n, 1))) # size: n*1`
	`305`	`+ for i in range(maxCycles):`
	`306`	`+ # hx = self.sigmoid(trainMatrixweigh) # size: m1 sigmoid把线性回归转换到[0,1]之间,对应概率`
	`307`	`+ hx = self.sigmoid(trainMatrixweigh)2-1`
	`308`	`+ error = np.abs(trainLabel-hx) # size: m*1`
	`309`	`+ weigh += alphatrainMatrix.Terror # size: n*1`
	`310`	`+ return vocabList, weigh`
	`311`	`+`
	`312`	`+ # 使用学习得到的参数进行分类`
	`313`	`+ def predict(self, test_X, vocabList, weigh):`
	`314`	`+ testMat = []`
	`315`	`+ for wordList in test_X:`
	`316`	`+ testMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`317`	`+ testMatrix = np.matrix(testMat) ## matrix是二维的`
	`318`	`+ m = testMatrix.shape[0]`
	`319`	`+ # hx = self.sigmoid(testMatrixweigh) # size: m1 sigmoid把线性回归转换到[0,1]之间,对应概率`
	`320`	`+ hx = self.sigmoid(testMatrixweigh)2-1`
	`321`	`+ predict_y = []`
	`322`	`+ for i in range(m): ## 对应-1/1分类`
	`323`	`+ if hx[i][0] > 0.0:`
	`324`	`+ predict_y.append(1)`
	`325`	`+ else:`
	`326`	`+ predict_y.append(-1)`
	`327`	`+ predictLabel = np.array(predict_y) ## array`
	`328`	`+ return predictLabel`
	`329`	`+`
	`330`	`+ # 使用学习得到的参数进行分类`
	`331`	`+ def predict1(self, test_X, test_y, vocabList, weigh):`
	`332`	`+ testMat = []`
	`333`	`+ for wordList in test_X:`
	`334`	`+ testMat.append(self.listOfWords2Vec(vocabList, wordList))`
	`335`	`+ testMatrix = np.matrix(testMat) ## matrix是二维的`
	`336`	`+ m = testMatrix.shape[0]`
	`337`	`+ # hx = self.sigmoid(testMatrixweigh) # size: m1 sigmoid把线性回归转换到[0,1]之间,对应概率`
	`338`	`+ hx = self.sigmoid(testMatrixweigh)2-1`
	`339`	`+ predict_y = []`
	`340`	`+ for i in range(m): ## 对应-1/1分类`
	`341`	`+ if hx[i][0] > 0.0:`
	`342`	`+ predict_y.append(1)`
	`343`	`+ else:`
	`344`	`+ predict_y.append(-1)`
	`345`	`+ predictLabel = np.array(predict_y) ## array`
	`346`	`+ testLabel = np.array(test_y) ## array`
	`347`	`+ print 'accuracy:', sum(testLabel==predictLabel)/float(m)`
	`348`	`+ return predictLabel`
	`349`	`+`
	`350`	`+def loadTrainDataSet_neg1_pos1():`
	`351`	`+ train_x = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],`
	`352`	`+ ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],`
	`353`	`+ ['my', 'dalmation', 'is', 'so', 'cute', ' and', 'I', 'love', 'him'],`
	`354`	`+ ['stop', 'posting', 'stupid', 'worthless', 'garbage'],`
	`355`	`+ ['mr', 'licks','ate','my', 'steak', 'how', 'to', 'stop', 'him'],`
	`356`	`+ ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]`
	`357`	`+ train_y = [-1,1,-1,1,-1,1]`
	`358`	`+ return train_x, train_y`
	`359`	`+`
	`360`	`+def loadTestDataSet_neg1_pos1():`
	`361`	`+ test_X = [['love', 'my', 'girl', 'friend'],`
	`362`	`+ ['stupid', 'garbage'],`
	`363`	`+ ['Haha', 'I', 'really', "Love", "You"],`
	`364`	`+ ['This', 'is', "my", "dog"]]`
	`365`	`+ test_y = [-1,1,-1,-1]`
	`366`	`+ return test_X, test_y`
	`367`	`+`
	`368`	`+`
`185`	`369`	`if __name__ == '__main__':`
	`370`	`+`
`186`	`371`	`train_X, train_y = loadTrainDataSet()`
`187`	`372`	`test_X, test_y = loadTestDataSet()`
`188`	`373`	`clf = NaiveBayes()`
`@@ -196,4 +381,19 @@ def loadTestDataSet():`
`196`	`381`	`results = clf.predict(test_X, vocabList, weigh)`
`197`	`382`	`print results`
`198`	`383`	`results1 = clf.predict1(test_X, test_y, vocabList, weigh)`
	`384`	`+ print results1`
	`385`	`+`
	`386`	`+ train_X, train_y = loadTrainDataSet_neg1_pos1()`
	`387`	`+ test_X, test_y = loadTestDataSet_neg1_pos1()`
	`388`	`+ clf = NaiveBayes_neg1_pos1()`
	`389`	`+ vocabList, param = clf.fit(train_X, train_y)`
	`390`	`+ results = clf.predict(test_X, vocabList, param)`
	`391`	`+ print results`
	`392`	`+ results1 = clf.predict1(test_X, test_y, vocabList, param)`
	`393`	`+ print results1`
	`394`	`+ clf = LogisticRegression_neg1_pos1()`
	`395`	`+ vocabList, weigh = clf.fit(train_X, train_y)`
	`396`	`+ results = clf.predict(test_X, vocabList, weigh)`
	`397`	`+ print results`
	`398`	`+ results1 = clf.predict1(test_X, test_y, vocabList, weigh)`
`199`	`399`	`print results1`

0 commit comments

Comments

(0)

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 0b2b686

File tree

1 file changed

1 file changed

`‎NB&LR/NaiveBayes_vs_LogisticRegression.py‎`

0 commit comments