Commit a19ffd4: test
File tree: 5 files changed, +530 −0 lines changed

Apriori/AprioriTest.py

Lines changed: 128 additions & 0 deletions
# coding: utf-8


class Apriori():
    def __init__(self):
        pass

    '''
    Association analysis has two goals: finding frequent itemsets and
    finding association rules.
    '''

    '''
    Frequent itemsets: {}
    A dataset containing N distinct items has 2^N - 1 possible itemset combinations.
    Support:
        the support of an itemset is the fraction of records in the dataset
        that contain that itemset.
    Apriori principle: if an itemset is frequent, then all of its subsets are
    frequent; equivalently, if an itemset is infrequent, then all of its
    supersets are infrequent.
    '''
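
    # Worked example (hand-computed from the demo data in loadDataSet() below):
    # {2, 5} appears in 3 of the 4 transactions, so support({2, 5}) = 3/4 = 0.75,
    # while {4} appears in only 1 of 4, so support({4}) = 0.25 and, with
    # minSupport = 0.5, every superset of {4} can be pruned without counting.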

    def _createC1(self, dataSet):
        C1 = []
        for transaction in dataSet:
            for item in transaction:
                if [item] not in C1:
                    C1.append([item])
        C1.sort()
        return [frozenset(c) for c in C1]  # frozensets can be used as dict keys

    def _scanD(self, D, Ck, minSupport=0.5):
        ssCnt = {}
        for tid in D:
            for can in Ck:
                if can.issubset(tid):
                    if can in ssCnt:
                        ssCnt[can] += 1
                    else:
                        ssCnt[can] = 1
        numItems = len(D)
        retList = []
        supportK = {}
        for key in ssCnt:
            support = ssCnt[key] / float(numItems)  # compute the support
            if support >= minSupport:
                retList.append(key)
            supportK[key] = support  # record the support of every candidate
        return retList, supportK

    def aprioriGen(self, Lk, k):  # k >= 2
        retList = []
        lenLk = len(Lk)
        for i in range(lenLk):
            for j in range(i + 1, lenLk):
                L1 = sorted(Lk[i])[:k - 2]  # sort before slicing so the first k-2 elements are well-defined
                L2 = sorted(Lk[j])[:k - 2]
                if L1 == L2:  # if the first k-2 elements are equal; when k is 3, {0,1} U {0,2} -> {0,1,2}
                    retList.append(Lk[i] | Lk[j])
        return retList

    def apriori(self, dataSet, minSupport=0.5):  # minSupport: minimum support threshold
        D = [set(t) for t in dataSet]  # convert the transactions to sets
        C1 = self._createC1(dataSet)  # create C1 as a list of frozensets
        L1, supp1 = self._scanD(D, C1, minSupport)  # build L1 from C1 and minSupport
        L = []
        supportData = {}
        L.append(L1)
        supportData.update(supp1)
        k = 2
        while len(L[k - 2]) > 1:
            Ck = self.aprioriGen(L[k - 2], k)  # create Ck
            Lk, suppK = self._scanD(D, Ck, minSupport)  # build Lk from Ck and minSupport
            L.append(Lk)
            supportData.update(suppK)
            k += 1
        return L, supportData

    '''
    Association rules: X -> Y
    Confidence:
        confidence(diapers -> wine) = support({diapers, wine}) / support({diapers})
    If a rule does not meet the minimum confidence requirement, then neither do
    any of the rules derived from it by moving more items into the consequent.
    '''
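
    # Worked example (hand-computed from the demo data in loadDataSet() below):
    # support({2, 5}) = 0.75 and support({2}) = 0.75, so
    # confidence({2} -> {5}) = 0.75 / 0.75 = 1.0: every transaction that
    # contains item 2 also contains item 5.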

    def _calcConf(self, freqSet, H, supportData, brl, minConf=0.7):  # H: list of candidate consequents, e.g. {0}, {1}
        prunedH = []
        for conseq in H:
            conf = supportData[freqSet] / supportData[freqSet - conseq]  # compute the confidence
            if conf >= minConf:
                print(freqSet - conseq, '-->', conseq, 'conf:', conf)
                brl.append((freqSet - conseq, conseq, conf))
                prunedH.append(conseq)
        return prunedH

    def _rulesFromConseq(self, freqSet, H, supportData, brl, minConf=0.7):  # H: list of candidate consequents, e.g. {0}, {1}
        m = len(H[0])
        if len(freqSet) > (m + 1):
            Hmp1 = self.aprioriGen(H, m + 1)  # merge the consequents
            Hmp = self._calcConf(freqSet, Hmp1, supportData, brl, minConf)  # Hmp: merged consequents that survived pruning, e.g. {0,1}
            if len(Hmp) > 1:  # if more than one consequent remains, merge further
                self._rulesFromConseq(freqSet, Hmp, supportData, brl, minConf)

    def generateRules(self, L, supportData, minConf=0.7):  # minConf: minimum confidence threshold
        bigRuleList = []
        for i in range(1, len(L)):  # build rules only from itemsets with two or more items
            for freqSet in L[i]:
                H1 = [frozenset([item]) for item in freqSet]  # single-item consequents, e.g. {0}, {1}
                if i > 1:
                    self._rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)  # generate candidate rules
                else:
                    self._calcConf(freqSet, H1, supportData, bigRuleList, minConf)  # evaluate the rules
        return bigRuleList


def loadDataSet():
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]


if __name__ == '__main__':
    dataSet = loadDataSet()
    ap = Apriori()
    L, suppData = ap.apriori(dataSet, minSupport=0.5)
    print(L)
    print(suppData)
    rules = ap.generateRules(L, suppData, minConf=0.6)
    print(rules)
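
For reference, a quick hand-check of the demo run above (a sketch: the module name AprioriTest, imported from the same directory, is an assumption):

from AprioriTest import Apriori, loadDataSet

ap = Apriori()
L, supp = ap.apriori(loadDataSet(), minSupport=0.5)
assert supp[frozenset([2, 5])] == 0.75  # {2, 5} occurs in 3 of the 4 transactions
assert frozenset([2, 3, 5]) in L[2]     # the only frequent 3-itemset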

DesicionTree/DesicionTreeTest.py

Lines changed: 128 additions & 0 deletions
# coding: utf-8

import math


class DesicionTree():
    def __init__(self):
        pass

    def _calcShannonEnt(self, dataSet):  # compute the Shannon entropy of the dataset
        numEntries = len(dataSet)
        classCounts = {}
        for data in dataSet:
            currentLabel = data[-1]
            if currentLabel not in classCounts:
                classCounts[currentLabel] = 1
            else:
                classCounts[currentLabel] += 1
        '''
        Information of an outcome: -log2(pi)
        Entropy: the expected information, sum(-pi * log2(pi))
        '''
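        # Worked example (hand-computed from the demo data in loadDataSet() below):
        # 2 of the 5 records are 'yes' and 3 of 5 are 'no', so the entropy is
        # -(2/5)*log2(2/5) - (3/5)*log2(3/5) ~= 0.9710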
        shannonEnt = 0.0
        for key in classCounts:
            prob = classCounts[key] / float(numEntries)
            shannonEnt -= prob * math.log(prob, 2)  # log base 2
        return shannonEnt

    def _splitDataSet(self, dataSet, axis, value):
        retDataSet = []
        for data in dataSet:
            if data[axis] == value:
                reduceddata = data[:axis]  # drop the column used for the split
                reduceddata.extend(data[axis + 1:])
                retDataSet.append(reduceddata)
        return retDataSet

    def _chooseBestFeatureToSplit(self, dataSet):
        numFeatures = len(dataSet[0]) - 1  # the last column is the class label
        baseEntropy = self._calcShannonEnt(dataSet)
        bestInfoGain = 0
        bestFeature = -1
        for i in range(numFeatures):  # iterate over every feature in turn
            featList = [data[i] for data in dataSet]
            values = set(featList)
            '''
            Conditional entropy: sum(pj * entropy of the j-th subset)
            '''
            # compute the conditional entropy of the dataset given this feature
            newEntropy = 0.0
            for value in values:
                subDataSet = self._splitDataSet(dataSet, i, value)
                prob = len(subDataSet) / float(len(dataSet))
                newEntropy += prob * self._calcShannonEnt(subDataSet)
            '''
            Information gain = entropy - conditional entropy
            '''
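            # Worked example (hand-computed for feature 0, 'no surfacing', on the
            # demo data): the split gives a 3-record subset with entropy ~0.9183
            # and a 2-record subset with entropy 0, so the conditional entropy is
            # (3/5)*0.9183 + (2/5)*0 ~= 0.5510 and the information gain is
            # 0.9710 - 0.5510 ~= 0.4200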
            infoGain = baseEntropy - newEntropy
            if infoGain > bestInfoGain:
                bestInfoGain = infoGain
                bestFeature = i
        return bestFeature

    def _majorityCnt(self, classList):
        classCount = {}
        for vote in classList:
            if vote not in classCount:
                classCount[vote] = 1
            else:
                classCount[vote] += 1
        sortedClassCount = sorted(classCount.items(), key=lambda xx: xx[1], reverse=True)
        return sortedClassCount[0][0]

    def fit(self, dataSet, featLabels):
        classList = [data[-1] for data in dataSet]
        if classList.count(classList[0]) == len(classList):
            return classList[0]  # all class labels are identical, so return that label
        if len(dataSet[0]) == 1:  # labels still differ but all features are used up, so return the majority label
            return self._majorityCnt(classList)
        bestFeat = self._chooseBestFeatureToSplit(dataSet)
        bestFeatLabel = featLabels[bestFeat]
        tree = {bestFeatLabel: {}}
        featLabels_copy = featLabels[:]  # copy so the caller's featLabels is not mutated
        featLabels_copy.remove(bestFeatLabel)
        featList = [data[bestFeat] for data in dataSet]
        values = set(featList)
        for value in values:
            subfeatLabels_copy = featLabels_copy[:]  # copy the list rather than aliasing it
            tree[bestFeatLabel][value] = self.fit(self._splitDataSet(dataSet, bestFeat, value), subfeatLabels_copy)
        return tree

    def predict(self, tree, featLabels, testVec):
        firstStr = next(iter(tree))  # the feature tested at this node
        secondDict = tree[firstStr]
        featIndex = featLabels.index(firstStr)
        key = testVec[featIndex]
        valueOfFeat = secondDict[key]
        if isinstance(valueOfFeat, dict):
            classLabel = self.predict(valueOfFeat, featLabels, testVec)  # descend into the subtree
        else:
            classLabel = valueOfFeat  # reached a leaf
        return classLabel


def loadDataSet():
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    featLabels = ['no surfacing', 'flippers']  # feature names
    return dataSet, featLabels


if __name__ == '__main__':
    myDataSet, myFeatLabels = loadDataSet()
    print(myDataSet, myFeatLabels)
    dt = DesicionTree()
    myTree = dt.fit(myDataSet, myFeatLabels)
    print(myTree)
    results = dt.predict(myTree, myFeatLabels, [1, 1])
    print(results)
    results = dt.predict(myTree, myFeatLabels, [0, 1])
    print(results)
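
As a quick sanity check of the demo above (a sketch: the module name DesicionTreeTest, imported from the same directory, is an assumption), 'no surfacing' should become the root because its information gain of about 0.42 beats the roughly 0.17 of 'flippers':

from DesicionTreeTest import DesicionTree, loadDataSet

dataSet, featLabels = loadDataSet()
dt = DesicionTree()
tree = dt.fit(dataSet, featLabels)
assert tree == {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
assert dt.predict(tree, featLabels, [1, 1]) == 'yes'
assert dt.predict(tree, featLabels, [0, 1]) == 'no'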

HMM/ViterbiTest.py

Lines changed: 101 additions & 0 deletions
# -*- coding: utf-8 -*-
'''
An HMM (hidden Markov model) is a statistical model for describing hidden,
unknown parameters.
A classic example:
A friend in Tokyo decides each day's activity, one of {walk in the park, shop,
clean the room}, based on that day's weather, one of {rainy, sunny}.
All I see is her daily tweet on twitter: "Ah, I walked in the park the day
before yesterday, shopped yesterday, and cleaned the room today!"
From her tweets I can infer the most likely weather in Tokyo over those three days.
In this example the activities are the observed states and the weather is the
hidden state.
Finding the most likely hidden-state sequence is one of the three canonical HMM
problems, and it is usually solved with the Viterbi algorithm.
The Viterbi algorithm finds the shortest path (in -log(prob), i.e. the
maximum-probability path) through the HMM trellis.
'''

# HMM description: lambda = (states, observations, start_probability, transition_probability, emission_probability)
states = ('Rainy', 'Sunny')

observations = ('walk', 'shop', 'clean')

start_probability = {'Rainy': 0.6, 'Sunny': 0.4}

transition_probability = {
    'Rainy': {'Rainy': 0.7, 'Sunny': 0.3},
    'Sunny': {'Rainy': 0.4, 'Sunny': 0.6},
}

emission_probability = {
    'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
    'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
}

# print the path-probability table
def print_dptable(V):
    print('', end=' ')
    for t in range(len(V)):
        print("%7d" % t, end=' ')
    print('')
    for y in V[0].keys():
        print("%.5s:" % y, end=' ')
        for t in range(len(V)):
            print("%.7s" % ("%f" % V[t][y]), end=' ')
        print('')

def viterbi(stas, obs, start_p, trans_p, emit_p):
    '''
    :param stas: hidden states
    :param obs: observation sequence
    :param start_p: initial probabilities (over hidden states)
    :param trans_p: transition probabilities (between hidden states)
    :param emit_p: emission probabilities (probability of a hidden state producing an observation)
    :return:
    Idea:
    Define V[time][today's weather] = probability, where the probability means:
    with the most probable weather fixed for all earlier days, the probability
    that today's weather is X; it is a running product of probabilities.
    Since my friend went for a walk on day one, the probability that day one was
    rainy is V[day 1][Rainy] = start_p[Rainy] * emit_p[Rainy][walk]
    = 0.6 * 0.1 = 0.06, and likewise V[day 1][Sunny] = 0.24. Intuitively,
    because she went out and she prefers walking when it is sunny, sunny is the
    more likely weather for day one; the numbers agree with the intuition.
    From day two on, each weather Y gets (probability the previous day was X)
    * (probability of X transitioning to Y) * (probability of that day's
    activity under weather Y). Since the previous day's weather X has two
    possibilities, Y has two candidate probabilities; keep the larger one as
    V[day 2][Y] and append that day's weather to the result sequence.
    Finally, compare V[last day][Rainy] with V[last day][Sunny] and take the
    sequence behind the larger one; that is the answer.
    '''
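
    # Hand-computed trellis for the demo observations (walk, shop, clean):
    #   t=0: V[Rainy] = 0.6*0.1 = 0.06          V[Sunny] = 0.4*0.6 = 0.24
    #   t=1: V[Rainy] = 0.24*0.4*0.4 = 0.0384   V[Sunny] = 0.24*0.6*0.3 = 0.0432
    #   t=2: V[Rainy] = 0.0384*0.7*0.5 = 0.01344   V[Sunny] = 0.0432*0.6*0.1 = 0.002592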

    # path-probability table: V[time][hidden state] = probability
    V = [{}]
    # path maps each current hidden state to the best state sequence ending in it
    path = {}

    # initialization (t == 0)
    for y in stas:
        V[0][y] = start_p[y] * emit_p[y][obs[0]]
        path[y] = [y]  # the initial path for state y
    print(V)
    print(path)

    # run the Viterbi recursion (t > 0)
    for t in range(1, len(obs)):
        V.append({})

        new_path = {}
        for y in stas:
            '''state probability = (probability the previous state was y0) * (probability of y0 transitioning to y) * (probability of y emitting the current observation)'''
            # the maximum probability of reaching y, and the best previous state sta
            (prob, sta) = max([(V[t - 1][y0] * trans_p[y0][y] * emit_p[y][obs[t]], y0) for y0 in stas])
            # record the maximum state probability
            V[t][y] = prob
            # record the path
            new_path[y] = path[sta] + [y]  # extend the best path ending in sta with y
        print(V)
        print(new_path)

        # the old paths are no longer needed
        path = new_path

    print_dptable(V)

    # find the most probable final state
    (prob, sta) = max([(V[len(obs) - 1][y], y) for y in stas])
    return prob, path[sta]


def example():
    return viterbi(states,
                   observations,
                   start_probability,
                   transition_probability,
                   emission_probability)


print(example())
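
Run as a script, the demo should finish by printing (0.01344, ['Sunny', 'Rainy', 'Rainy']): the most likely weather is sunny on the walking day and rainy on the two days after, matching the hand-computed trellis in the comments above.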
