[Python-checkins] python/nondist/sandbox/spambayes classifier.py,NONE,1.1 GBayes.py,1.11,1.12
tim_one@users.sourceforge.net
Fri, 23 Aug 2002 08:42:50 -0700
Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv2797
Modified Files:
GBayes.py
Added Files:
classifier.py
Log Message:
Moved all the interesting code that was in the *original* GBayes.py into
a new classifier.py. It was designed to have a very clean interface,
and there's no reason to keep slamming everything into one file. The
ever-growing tokenizer stuff should probably also be split out, leaving
GBayes.py a pure driver.
Also repaired _test() (Skip's checkin left it without a binding for
the tokenize function).
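
For the curious, the new module's interface boils down to learn() and
spamprob(). A minimal usage sketch (the word lists here are made-up
stand-ins for tokenized messages):

    from classifier import GrahamBayes

    b = GrahamBayes()
    b.learn(['free', 'cable', 'offer'], True, False)  # spam; defer prob update
    b.learn(['lunch', 'meeting', 'notes'], False)     # ham; updates all probs
    print b.spamprob(['free', 'lunch'])               # a float in [0.0, 1.0]
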
--- NEW FILE: classifier.py ---
# This is an implementation of the Bayes-like spam classifier sketched
# by Paul Graham at <http://www.paulgraham.com/spam.html>. We say
# "Bayes-like" because there are many ad hoc deviations from a
# "normal" Bayesian classifier.
#
# This implementation is due to Tim Peters et alia.
import time
from heapq import heapreplace
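# Graham's scheme counts occurrences in the nonspam corpus double, to bias
# against false positives; HAMBIAS and SPAMBIAS generalize that doubling
# into tunable multipliers on the raw ham and spam counts.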
HAMBIAS = 2.0
SPAMBIAS = 1.0
# "And then there is the question of what probability to assign to words
# that occur in one corpus but not the other. Again by trial and error I
# chose .01 and .99.". However, the code snippet clamps *all* probabilities
# into this range.
MIN_SPAMPROB = 0.01
MAX_SPAMPROB = 0.99
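# The probability assigned to words not in the database (and, via
# update_probabilities(), to words seen fewer than MINCOUNT times).
# Graham's essay uses 0.4 for never-seen words; 0.2 is this
# implementation's choice.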
UNKNOWN_SPAMPROB = 0.20
# "I only consider words that occur more than five times in total".
# But the code snippet considers words that appear at least five times.
# This implementation follows the code rather than the explanation.
# (In addition, the count compared is after multiplying it with the
# appropriate bias factor.)
MINCOUNT = 5.0
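# Graham scores a message using only its 15 "most interesting" words --
# the ones whose spamprob is farthest from the neutral 0.5.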
MAX_DISCRIMINATORS = 15
PICKLE_VERSION = 1
class WordInfo(object):
__slots__ = ('atime', # when this record was last used by scoring(*)
'spamcount', # # of times word appears in spam
'hamcount', # # of times word appears in non-spam
'killcount', # # of times this made it to spamprob()'s nbest
'spamprob', # prob(spam | msg contains this word)
)
# (*)atime is the last access time, a UTC time.time() value. It's the
# most recent time this word was used by scoring (i.e., by spamprob(),
# not by training via learn()); or, if the word has never been used by
# scoring, the time the word record was created (i.e., by learn()).
# One good criterion for identifying junk (word records that have no
# value) is to delete words that haven't been used for a long time.
# Perhaps they were typos, or unique identifiers, or relevant to a
# once-hot topic or scam that's fallen out of favor. Whatever, if
# a word is no longer being used, it's just wasting space.
def __init__(self, atime):
self.atime = atime
self.spamcount = self.hamcount = self.killcount = 0
self.spamprob = None
def __repr__(self):
return "WordInfo%r" % repr((self.atime, self.spamcount,
self.hamcount, self.killcount,
self.spamprob))
def __getstate__(self):
return (self.atime, self.spamcount, self.hamcount, self.killcount,
self.spamprob)
def __setstate__(self, t):
(self.atime, self.spamcount, self.hamcount, self.killcount,
self.spamprob) = t
class GrahamBayes(object):
__slots__ = ('wordinfo', # map word to WordInfo record
'nspam', # number of spam messages learn() has seen
'nham', # number of non-spam messages learn() has seen
)
DEBUG = False
def __init__(self):
self.wordinfo = {}
self.nspam = self.nham = 0
def __getstate__(self):
return PICKLE_VERSION, self.wordinfo, self.nspam, self.nham
def __setstate__(self, t):
if t[0] != PICKLE_VERSION:
raise ValueError("Can't unpickle -- version %s unknown" % t[0])
self.wordinfo, self.nspam, self.nham = t[1:]
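    # Pickling sketch (hypothetical filename; the GBayes.py driver keeps
    # its database this way via cPickle):
    #     pickle.dump(bayes, open('spamdb.pck', 'wb'), 1)
    #     bayes = pickle.load(open('spamdb.pck', 'rb'))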
def spamprob(self, wordstream):
"""Return best-guess probability that wordstream is spam.
wordstream is an iterable object producing words.
The return value is a float in [0.0, 1.0].
"""
if self.DEBUG:
print "spamprob(%r)" % wordstream
if not wordstream:
raise ValueError("non-empty wordstream required")
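        # Note: the emptiness check above works only for sequences; a
        # generator is always true, so an empty iterator slips through
        # and scores as 0.5.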
wordinfoget = self.wordinfo.get
now = time.time()
# A priority queue to remember the MAX_DISCRIMINATORS best
# probabilities, where "best" means largest distance from 0.5.
# The tuples are (distance, prob, word, wordinfo[word]).
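        # heapreplace() pops the smallest item and pushes the new one, so
        # nbest always holds the largest distances seen so far, with the
        # weakest survivor at nbest[0].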
nbest = [(-1.0, None, None, None)] * MAX_DISCRIMINATORS
smallest_best = -1.0
for word in wordstream:
record = wordinfoget(word)
if record is None:
prob = UNKNOWN_SPAMPROB
else:
record.atime = now
prob = record.spamprob
distance = abs(prob - 0.5)
if distance > smallest_best:
                # Subtle: using ">" instead of ">=" isn't just to save
# calls to heapreplace(). The real intent is that if
# there are many equally strong indicators throughout the
# message, we want to favor the ones that appear earliest:
# it's expected that spam headers will often have smoking
# guns, and, even when not, spam has to grab your attention
# early (& note that when spammers generate large blocks of
# random gibberish to throw off exact-match filters, it's
# always at the end of the msg -- if they put it at the
# start, *nobody* would read the msg).
heapreplace(nbest, (distance, prob, word, record))
smallest_best = nbest[0][0]
# Compute the probability. Note: This is what Graham's code did,
# but it's dubious for reasons explained in great detail on Python-
# Dev: it's missing P(spam) and P(not-spam) adjustments that
# straightforward Bayesian analysis says should be here. It's
# unclear how much it matters, though, as the omissions here seem
# to tend in part to cancel out distortions introduced earlier by
# HAMBIAS. Experiments will decide the issue.
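        # Concretely, with the n best probabilities p1..pn, this computes
        #     P = (p1*...*pn) / ((p1*...*pn) + ((1-p1)*...*(1-pn)))
        # A full Bayesian treatment would also fold the priors P(spam)
        # and P(ham) into the two products.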
prob_product = inverse_prob_product = 1.0
for distance, prob, word, record in nbest:
if prob is None: # it's one of the dummies nbest started with
continue
if record is not None: # else wordinfo doesn't know about it
record.killcount += 1
if self.DEBUG:
print 'nbest P(%r) = %g' % (word, prob)
prob_product *= prob
inverse_prob_product *= 1.0 - prob
return prob_product / (prob_product + inverse_prob_product)
def learn(self, wordstream, is_spam, update_probabilities=True):
"""Teach the classifier by example.
wordstream is a word stream representing a message. If is_spam is
True, you're telling the classifier this message is definitely spam,
else that it's definitely not spam.
If optional arg update_probabilities is False (the default is True),
don't update word probabilities. Updating them is expensive, and if
you're going to pass many messages to learn(), it's more efficient
to pass False here and call update_probabilities() once when you're
done -- or to call learn() with update_probabilities=True when
passing the last new example. The important thing is that the
probabilities get updated before calling spamprob() again.
"""
self._add_msg(wordstream, is_spam)
if update_probabilities:
self.update_probabilities()
def unlearn(self, wordstream, is_spam, update_probabilities=True):
"""In case of pilot error, call unlearn ASAP after screwing up.
Pass the same arguments you passed to learn().
"""
self._remove_msg(wordstream, is_spam)
if update_probabilities:
self.update_probabilities()
def update_probabilities(self):
"""Update the word probabilities in the spam database.
This computes a new probability for every word in the database,
so can be expensive. learn() and unlearn() update the probabilities
        each time by default. They have an optional argument that lets you
        skip this step when feeding in many messages; in that case
you should call update_probabilities() after feeding the last
message and before calling spamprob().
"""
nham = float(self.nham or 1)
nspam = float(self.nspam or 1)
for record in self.wordinfo.itervalues():
# Compute prob(msg is spam | msg contains word).
hamcount = HAMBIAS * record.hamcount
spamcount = SPAMBIAS * record.spamcount
if hamcount + spamcount < MINCOUNT:
prob = UNKNOWN_SPAMPROB
else:
hamratio = min(1.0, hamcount / nham)
spamratio = min(1.0, spamcount / nspam)
prob = spamratio / (hamratio + spamratio)
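                # Worked example with hypothetical counts: a word seen in
                # 3 of 10 spams and 1 of 20 hams gives spamratio 3/10 = 0.3
                # and hamratio (2*1)/20 = 0.1, so prob = 0.3/0.4 = 0.75.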
if prob < MIN_SPAMPROB:
prob = MIN_SPAMPROB
elif prob > MAX_SPAMPROB:
prob = MAX_SPAMPROB
record.spamprob = prob
if self.DEBUG:
print 'New probabilities:'
for w, r in self.wordinfo.iteritems():
print "P(%r) = %g" % (w, r.spamprob)
def clearjunk(self, oldesttime, mincount=MINCOUNT):
"""Forget useless wordinfo records. This can shrink the database size.
A record for a word will be retained only if the word was accessed
at or after oldesttime, or appeared at least mincount times in
messages passed to learn(). mincount is optional, and defaults
to the value an internal algorithm uses to decide that a word is so
rare that it has no predictive value.
"""
wordinfo = self.wordinfo
mincount = float(mincount)
tonuke = [w for w, r in wordinfo.iteritems()
if r.atime < oldesttime and
SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount]
for w in tonuke:
if self.DEBUG:
print "clearjunk removing word %r: %r" % (w, r)
del wordinfo[w]
def _add_msg(self, wordstream, is_spam):
if self.DEBUG:
print "_add_msg(%r, %r)" % (wordstream, is_spam)
if is_spam:
self.nspam += 1
else:
self.nham += 1
wordinfo = self.wordinfo
wordinfoget = wordinfo.get
now = time.time()
for word in wordstream:
record = wordinfoget(word)
if record is None:
record = wordinfo[word] = WordInfo(now)
if is_spam:
record.spamcount += 1
else:
record.hamcount += 1
if self.DEBUG:
print "new count for %r = %d" % (word,
is_spam and record.spamcount or record.hamcount)
def _remove_msg(self, wordstream, is_spam):
if self.DEBUG:
print "_remove_msg(%r, %r)" % (wordstream, is_spam)
if is_spam:
if self.nspam <= 0:
raise ValueError("spam count would go negative!")
self.nspam -= 1
else:
if self.nham <= 0:
raise ValueError("non-spam count would go negative!")
self.nham -= 1
wordinfoget = self.wordinfo.get
for word in wordstream:
record = wordinfoget(word)
if record is not None:
if is_spam:
if record.spamcount > 0:
record.spamcount -= 1
else:
if record.hamcount > 0:
record.hamcount -= 1
Index: GBayes.py
===================================================================
RCS file: /cvsroot/python/python/nondist/sandbox/spambayes/GBayes.py,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** GBayes.py 23 Aug 2002 14:25:39 -0000 1.11
--- GBayes.py 23 Aug 2002 15:42:48 -0000 1.12
***************
*** 1,13 ****
#!/usr/bin/env python
! # This is an implementation of the Bayes-like spam classifier sketched
! # by Paul Graham at <http://www.paulgraham.com/spam.html>. We say
! # "Bayes-like" because there are many ad hoc deviations from a
! # "normal" Bayesian classifier.
! #
! # Tim Peters wrote the algorithmic part of the code.
! #
! # Barry Warsaw added integration infrastructure like command line
! # options and a pickled database.
"""Usage: %(program)s [options]
--- 1,5 ----
#!/usr/bin/env python
! # A driver for the classifier module. Barry Warsaw is the primary author.
"""Usage: %(program)s [options]
***************
*** 44,49 ****
import sys
import getopt
- import time
- from heapq import heapreplace
import cPickle as pickle
import mailbox
--- 36,39 ----
***************
*** 51,337 ****
import errno
! program = sys.argv[0]
!
! HAMBIAS = 2.0
! SPAMBIAS = 1.0
!
! # "And then there is the question of what probability to assign to words
! # that occur in one corpus but not the other. Again by trial and error I
! # chose .01 and .99.". However, the code snippet clamps *all* probabilities
! # into this range.
! MIN_SPAMPROB = 0.01
! MAX_SPAMPROB = 0.99
!
! UNKNOWN_SPAMPROB = 0.20
!
! # "I only consider words that occur more than five times in total".
! # But the code snippet considers words that appear at least five times.
! # This implementation follows the code rather than the explanation.
! # (In addition, the count compared is after multiplying it with the
! # appropriate bias factor.)
! MINCOUNT = 5.0
!
! MAX_DISCRIMINATORS = 15
!
! PICKLE_VERSION = 1
!
! class WordInfo(object):
! __slots__ = ('atime', # when this record was last used by scoring(*)
! 'spamcount', # # of times word appears in spam
! 'hamcount', # # of times word appears in non-spam
! 'killcount', # # of times this made it to spamprob()'s nbest
! 'spamprob', # prob(spam | msg contains this word)
! )
! # (*)atime is the last access time, a UTC time.time() value. It's the
! # most recent time this word was used by scoring (i.e., by spamprob(),
! # not by training via learn()); or, if the word has never been used by
! # scoring, the time the word record was created (i.e., by learn()).
! # One good criterion for identifying junk (word records that have no
! # value) is to delete words that haven't been used for a long time.
! # Perhaps they were typos, or unique identifiers, or relevant to a
! # once-hot topic or scam that's fallen out of favor. Whatever, if
! # a word is no longer being used, it's just wasting space.
!
! def __init__(self, atime):
! self.atime = atime
! self.spamcount = self.hamcount = self.killcount = 0
! self.spamprob = None
!
! def __repr__(self):
! return "WordInfo%r" % repr((self.atime, self.spamcount,
! self.hamcount, self.killcount,
! self.spamprob))
!
! def __getstate__(self):
! return (self.atime, self.spamcount, self.hamcount, self.killcount,
! self.spamprob)
!
! def __setstate__(self, t):
! (self.atime, self.spamcount, self.hamcount, self.killcount,
! self.spamprob) = t
!
! class GrahamBayes(object):
! __slots__ = ('wordinfo', # map word to WordInfo record
! 'nspam', # number of spam messages learn() has seen
! 'nham', # number of non-spam messages learn() has seen
! )
!
! DEBUG = False
!
! def __init__(self):
! self.wordinfo = {}
! self.nspam = self.nham = 0
!
! def __getstate__(self):
! return PICKLE_VERSION, self.wordinfo, self.nspam, self.nham
!
! def __setstate__(self, t):
! if t[0] != PICKLE_VERSION:
! raise ValueError("Can't unpickle -- version %s unknown" % t[0])
! self.wordinfo, self.nspam, self.nham = t[1:]
!
! def spamprob(self, wordstream):
! """Return best-guess probability that wordstream is spam.
!
! wordstream is an iterable object producing words.
! The return value is a float in [0.0, 1.0].
! """
!
! if self.DEBUG:
! print "spamprob(%r)" % wordstream
!
! if not wordstream:
! raise ValueError("non-empty wordstream required")
!
! wordinfoget = self.wordinfo.get
! now = time.time()
!
! # A priority queue to remember the MAX_DISCRIMINATORS best
! # probabilities, where "best" means largest distance from 0.5.
! # The tuples are (distance, prob, word, wordinfo[word]).
! nbest = [(-1.0, None, None, None)] * MAX_DISCRIMINATORS
! smallest_best = -1.0
!
! for word in wordstream:
! record = wordinfoget(word)
! if record is None:
! prob = UNKNOWN_SPAMPROB
! else:
! record.atime = now
! prob = record.spamprob
!
! distance = abs(prob - 0.5)
! if distance > smallest_best:
!                 # Subtle: using ">" instead of ">=" isn't just to save
! # calls to heapreplace(). The real intent is that if
! # there are many equally strong indicators throughout the
! # message, we want to favor the ones that appear earliest:
! # it's expected that spam headers will often have smoking
! # guns, and, even when not, spam has to grab your attention
! # early (& note that when spammers generate large blocks of
! # random gibberish to throw off exact-match filters, it's
! # always at the end of the msg -- if they put it at the
! # start, *nobody* would read the msg).
! heapreplace(nbest, (distance, prob, word, record))
! smallest_best = nbest[0][0]
!
! # Compute the probability. Note: This is what Graham's code did,
! # but it's dubious for reasons explained in great detail on Python-
! # Dev: it's missing P(spam) and P(not-spam) adjustments that
! # straightforward Bayesian analysis says should be here. It's
! # unclear how much it matters, though, as the omissions here seem
! # to tend in part to cancel out distortions introduced earlier by
! # HAMBIAS. Experiments will decide the issue.
! prob_product = inverse_prob_product = 1.0
! for distance, prob, word, record in nbest:
! if prob is None: # it's one of the dummies nbest started with
! continue
! if record is not None: # else wordinfo doesn't know about it
! record.killcount += 1
! if self.DEBUG:
! print 'nbest P(%r) = %g' % (word, prob)
! prob_product *= prob
! inverse_prob_product *= 1.0 - prob
!
! return prob_product / (prob_product + inverse_prob_product)
!
! def learn(self, wordstream, is_spam, update_probabilities=True):
! """Teach the classifier by example.
!
! wordstream is a word stream representing a message. If is_spam is
! True, you're telling the classifier this message is definitely spam,
! else that it's definitely not spam.
!
! If optional arg update_probabilities is False (the default is True),
! don't update word probabilities. Updating them is expensive, and if
! you're going to pass many messages to learn(), it's more efficient
! to pass False here and call update_probabilities() once when you're
! done -- or to call learn() with update_probabilities=True when
! passing the last new example. The important thing is that the
! probabilities get updated before calling spamprob() again.
! """
!
! self._add_msg(wordstream, is_spam)
! if update_probabilities:
! self.update_probabilities()
!
! def unlearn(self, wordstream, is_spam, update_probabilities=True):
! """In case of pilot error, call unlearn ASAP after screwing up.
!
! Pass the same arguments you passed to learn().
! """
!
! self._remove_msg(wordstream, is_spam)
! if update_probabilities:
! self.update_probabilities()
!
! def update_probabilities(self):
! """Update the word probabilities in the spam database.
!
! This computes a new probability for every word in the database,
! so can be expensive. learn() and unlearn() update the probabilities
!         each time by default. They have an optional argument that lets you
!         skip this step when feeding in many messages; in that case
! you should call update_probabilities() after feeding the last
! message and before calling spamprob().
! """
!
! nham = float(self.nham or 1)
! nspam = float(self.nspam or 1)
! for record in self.wordinfo.itervalues():
! # Compute prob(msg is spam | msg contains word).
! hamcount = HAMBIAS * record.hamcount
! spamcount = SPAMBIAS * record.spamcount
! if hamcount + spamcount < MINCOUNT:
! prob = UNKNOWN_SPAMPROB
! else:
! hamratio = min(1.0, hamcount / nham)
! spamratio = min(1.0, spamcount / nspam)
!
! prob = spamratio / (hamratio + spamratio)
! if prob < MIN_SPAMPROB:
! prob = MIN_SPAMPROB
! elif prob > MAX_SPAMPROB:
! prob = MAX_SPAMPROB
!
! record.spamprob = prob
!
! if self.DEBUG:
! print 'New probabilities:'
! for w, r in self.wordinfo.iteritems():
! print "P(%r) = %g" % (w, r.spamprob)
!
! def clearjunk(self, oldesttime, mincount=MINCOUNT):
! """Forget useless wordinfo records. This can shrink the database size.
!
! A record for a word will be retained only if the word was accessed
! at or after oldesttime, or appeared at least mincount times in
! messages passed to learn(). mincount is optional, and defaults
! to the value an internal algorithm uses to decide that a word is so
! rare that it has no predictive value.
! """
!
! wordinfo = self.wordinfo
! mincount = float(mincount)
! tonuke = [w for w, r in wordinfo.iteritems()
! if r.atime < oldesttime and
! SPAMBIAS*r.spamcount + HAMBIAS*r.hamcount < mincount]
! for w in tonuke:
! if self.DEBUG:
! print "clearjunk removing word %r: %r" % (w, r)
! del wordinfo[w]
!
! def _add_msg(self, wordstream, is_spam):
! if self.DEBUG:
! print "_add_msg(%r, %r)" % (wordstream, is_spam)
!
! if is_spam:
! self.nspam += 1
! else:
! self.nham += 1
!
! wordinfo = self.wordinfo
! wordinfoget = wordinfo.get
! now = time.time()
! for word in wordstream:
! record = wordinfoget(word)
! if record is None:
! record = wordinfo[word] = WordInfo(now)
!
! if is_spam:
! record.spamcount += 1
! else:
! record.hamcount += 1
!
! if self.DEBUG:
! print "new count for %r = %d" % (word,
! is_spam and record.spamcount or record.hamcount)
!
! def _remove_msg(self, wordstream, is_spam):
! if self.DEBUG:
! print "_remove_msg(%r, %r)" % (wordstream, is_spam)
!
! if is_spam:
! if self.nspam <= 0:
! raise ValueError("spam count would go negative!")
! self.nspam -= 1
! else:
! if self.nham <= 0:
! raise ValueError("non-spam count would go negative!")
! self.nham -= 1
!
! wordinfoget = self.wordinfo.get
! for word in wordstream:
! record = wordinfoget(word)
! if record is not None:
! if is_spam:
! if record.spamcount > 0:
! record.spamcount -= 1
! else:
! if record.hamcount > 0:
! record.hamcount -= 1
! #############################################################################
! # The rest of this is just for playing around and testing.
# For the heck of it, a simple tokenizer to create word streams.
--- 41,47 ----
import errno
! from classifier import GrahamBayes
! program = sys.argv[0]
# For the heck of it, a simple tokenizer to create word streams.
***************
*** 615,618 ****
--- 325,329 ----
def _test():
b = GrahamBayes()
+ tokenize = tokenize_words_foldcase
b.learn(tokenize(spam1), True)
b.learn(tokenize(spam2), True)
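
For reference, the word streams fed to GrahamBayes come from tokenizers of
roughly this shape (a hypothetical sketch, not the exact body of
tokenize_words_foldcase in GBayes.py):

    import re

    _word_re = re.compile(r"[\w$'-]+")

    def tokenize_words_foldcase(text):
        # Split out word-like runs and fold case; the resulting list is a
        # word stream suitable for learn() and spamprob().
        return [w.lower() for w in _word_re.findall(text)]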