[Python-checkins] python/nondist/sandbox/spambayes Tester.py,1.1,1.2 classifier.py,1.1,1.2
tim_one@users.sourceforge.net
tim_one@users.sourceforge.net
28 Aug 2002 14:04:58 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv27338
Modified Files:
Tester.py classifier.py
Log Message:
Tester.py: Repaired a comment. The false_{positive,negative}_rate()
functions return a percentage now (e.g., 1.0 instead of 0.01 -- it's
too hard to get motivated to reduce 0.01 <0.1 wink>).
classifier.py: GrahamBayes.spamprob: New optional bool argument
(evidence); when true, a list of
the 15 strongest (word, probability) pairs is returned as well as the
overall probability (this is how to find out why a message scored as it
did).
Index: Tester.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/Tester.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** Tester.py 27 Aug 2002 22:10:04 -0000 1.1
--- Tester.py 28 Aug 2002 21:04:56 -0000 1.2
***************
*** 49,54 ****
# is entirely composed of spam (is_spam True), or of ham (is_spam False).
# Note that mispredictions are saved, and can be retrieved later via
! # false_negatives (ham mistakenly called spam) and false_positives (spam
! # mistakenly called ham). For this reason, you may wish to wrap examples
# in a little class that identifies the example in a useful way, and whose
# __iter__ produces a token stream for the classifier.
--- 49,54 ----
# is entirely composed of spam (is_spam True), or of ham (is_spam False).
# Note that mispredictions are saved, and can be retrieved later via
! # false_negatives (spam mistakenly called ham) and false_positives (ham
! # mistakenly called spam). For this reason, you may wish to wrap examples
# in a little class that identifies the example in a useful way, and whose
# __iter__ produces a token stream for the classifier.
***************
*** 77,84 ****
def false_positive_rate(self):
! return float(self.nham_wrong) / self.nham_tested
def false_negative_rate(self):
! return float(self.nspam_wrong) / self.nspam_tested
def false_positives(self):
--- 77,86 ----
def false_positive_rate(self):
! """Percentage of ham mistakenly identified as spam, in 0.0..100.0."""
! return self.nham_wrong * 1e2 / self.nham_tested
def false_negative_rate(self):
! """Percentage of spam mistakenly identified as ham, in 0.0..100.0."""
! return self.nspam_wrong * 1e2 / self.nspam_tested
def false_positives(self):
***************
*** 120,124 ****
(1, 1)
>>> t.false_positive_rate()
! 0.5
>>> [e.name for e in t.false_positives()]
['badham']
--- 122,126 ----
(1, 1)
>>> t.false_positive_rate()
! 50.0
>>> [e.name for e in t.false_positives()]
['badham']
***************
*** 129,133 ****
(1, 3)
>>> t.false_negative_rate()
! 0.75
>>> [e.name for e in t.false_negatives()]
['badspam1', 'badspam2', 'badspam3']
--- 131,135 ----
(1, 3)
>>> t.false_negative_rate()
! 75.0
>>> [e.name for e in t.false_negatives()]
['badspam1', 'badspam2', 'badspam3']
Index: classifier.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/classifier.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** classifier.py 23 Aug 2002 15:42:48 -0000 1.1
--- classifier.py 28 Aug 2002 21:04:56 -0000 1.2
***************
*** 87,95 ****
self.wordinfo, self.nspam, self.nham = t[1:]
! def spamprob(self, wordstream):
"""Return best-guess probability that wordstream is spam.
wordstream is an iterable object producing words.
The return value is a float in [0.0, 1.0].
"""
--- 87,99 ----
self.wordinfo, self.nspam, self.nham = t[1:]
! def spamprob(self, wordstream, evidence=False):
"""Return best-guess probability that wordstream is spam.
wordstream is an iterable object producing words.
The return value is a float in [0.0, 1.0].
+
+ If optional arg evidence is True, the return value is a pair
+ probability, evidence
+ where evidence is a list of (word, probability) pairs.
"""
***************
*** 139,142 ****
--- 143,148 ----
# to tend in part to cancel out distortions introduced earlier by
# HAMBIAS. Experiments will decide the issue.
+ if evidence:
+ clues = []
prob_product = inverse_prob_product = 1.0
for distance, prob, word, record in nbest:
***************
*** 145,148 ****
--- 151,156 ----
if record is not None: # else wordinfo doesn't know about it
record.killcount += 1
+ if evidence:
+ clues.append((word, prob))
if self.DEBUG:
print 'nbest P(%r) = %g' % (word, prob)
***************
*** 150,154 ****
inverse_prob_product *= 1.0 - prob
! return prob_product / (prob_product + inverse_prob_product)
def learn(self, wordstream, is_spam, update_probabilities=True):
--- 158,166 ----
inverse_prob_product *= 1.0 - prob
! prob = prob_product / (prob_product + inverse_prob_product)
! if evidence:
! return prob, clues
! else:
! return prob
def learn(self, wordstream, is_spam, update_probabilities=True):