[Python-checkins] python/nondist/sandbox/spambayes Tester.py,1.1,1.2 classifier.py,1.1,1.2

2002年8月28日 14:04:58 -0700

Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv27338
Modified Files:
	Tester.py classifier.py 
Log Message:
Tester.py: Repaired a comment. The false_{positive,negative}_rate()
functions return a percentage now (e.g., 1.0 instead of 0.01 -- it's
too hard to get motivated to reduce 0.01 <0.1 wink>).
GrahamBayes.spamprob: New optional bool argument; when true, a list of
the 15 strongest (word, probability) pairs is returned as well as the
overall probability (this is how to find out why a message scored as it
did).
Index: Tester.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/Tester.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** Tester.py	27 Aug 2002 22:10:04 -0000	1.1
--- Tester.py	28 Aug 2002 21:04:56 -0000	1.2
***************
*** 49,54 ****
 # is entirely composed of spam (is_spam True), or of ham (is_spam False).
 # Note that mispredictions are saved, and can be retrieved later via
! # false_negatives (ham mistakenly called spam) and false_positives (spam
! # mistakenly called ham). For this reason, you may wish to wrap examples
 # in a little class that identifies the example in a useful way, and whose
 # __iter__ produces a token stream for the classifier.
--- 49,54 ----
 # is entirely composed of spam (is_spam True), or of ham (is_spam False).
 # Note that mispredictions are saved, and can be retrieved later via
! # false_negatives (spam mistakenly called ham) and false_positives (ham
! # mistakenly called spam). For this reason, you may wish to wrap examples
 # in a little class that identifies the example in a useful way, and whose
 # __iter__ produces a token stream for the classifier.
***************
*** 77,84 ****

 def false_positive_rate(self):
! return float(self.nham_wrong) / self.nham_tested

 def false_negative_rate(self):
! return float(self.nspam_wrong) / self.nspam_tested

 def false_positives(self):
--- 77,86 ----

 def false_positive_rate(self):
! """Percentage of ham mistakenly identified as spam, in 0.0..100.0."""
! return self.nham_wrong * 1e2 / self.nham_tested

 def false_negative_rate(self):
! """Percentage of spam mistakenly identified as ham, in 0.0..100.0."""
! return self.nspam_wrong * 1e2 / self.nspam_tested

 def false_positives(self):
***************
*** 120,124 ****
 (1, 1)
 >>> t.false_positive_rate()
! 0.5
 >>> [e.name for e in t.false_positives()]
 ['badham']
--- 122,126 ----
 (1, 1)
 >>> t.false_positive_rate()
! 50.0
 >>> [e.name for e in t.false_positives()]
 ['badham']
***************
*** 129,133 ****
 (1, 3)
 >>> t.false_negative_rate()
! 0.75
 >>> [e.name for e in t.false_negatives()]
 ['badspam1', 'badspam2', 'badspam3']
--- 131,135 ----
 (1, 3)
 >>> t.false_negative_rate()
! 75.0
 >>> [e.name for e in t.false_negatives()]
 ['badspam1', 'badspam2', 'badspam3']
Index: classifier.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/classifier.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** classifier.py	23 Aug 2002 15:42:48 -0000	1.1
--- classifier.py	28 Aug 2002 21:04:56 -0000	1.2
***************
*** 87,95 ****
 self.wordinfo, self.nspam, self.nham = t[1:]

! def spamprob(self, wordstream):
 """Return best-guess probability that wordstream is spam.

 wordstream is an iterable object producing words.
 The return value is a float in [0.0, 1.0].
 """

--- 87,99 ----
 self.wordinfo, self.nspam, self.nham = t[1:]

! def spamprob(self, wordstream, evidence=False):
 """Return best-guess probability that wordstream is spam.

 wordstream is an iterable object producing words.
 The return value is a float in [0.0, 1.0].
+ 
+ If optional arg evidence is True, the return value is a pair
+ probability, evidence
+ where evidence is a list of (word, probability) pairs.
 """

***************
*** 139,142 ****
--- 143,148 ----
 # to tend in part to cancel out distortions introduced earlier by
 # HAMBIAS. Experiments will decide the issue.
+ if evidence:
+ clues = []
 prob_product = inverse_prob_product = 1.0
 for distance, prob, word, record in nbest:
***************
*** 145,148 ****
--- 151,156 ----
 if record is not None: # else wordinfo doesn't know about it
 record.killcount += 1
+ if evidence:
+ clues.append((word, prob))
 if self.DEBUG:
 print 'nbest P(%r) = %g' % (word, prob)
***************
*** 150,154 ****
 inverse_prob_product *= 1.0 - prob

! return prob_product / (prob_product + inverse_prob_product)

 def learn(self, wordstream, is_spam, update_probabilities=True):
--- 158,166 ----
 inverse_prob_product *= 1.0 - prob

! prob = prob_product / (prob_product + inverse_prob_product)
! if evidence:
! return prob, clues
! else:
! return prob

 def learn(self, wordstream, is_spam, update_probabilities=True):