[Python-checkins] python/nondist/sandbox/spambayes cleanarch,NONE,1.1

bwarsaw@users.sourceforge.net bwarsaw@users.sourceforge.net
2002年8月20日 12:08:48 -0700


Update of /cvsroot/python/python/nondist/sandbox/spambayes
In directory usw-pr-cvs1:/tmp/cvs-serv16802
Added Files:
	cleanarch 
Log Message:
An archive cleaner, adapted from the Mailman 2.1b3 version, but
de-Mailman-ified.
--- NEW FILE: cleanarch ---
#! /usr/bin/env python
# Copyright (C) 2001,2002 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software 
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""Clean up an .mbox archive file.
The archiver looks for Unix-From lines separating messages in an mbox archive
file. For compatibility, it specifically looks for lines that start with
"From " -- i.e. the letters capital-F, lowercase-r, o, m, space, ignoring
everything else on the line.
Normally, any lines that start "From " in the body of a message should be
escaped such that a > character is actually the first on a line. It is
possible though that body lines are not actually escaped. This script
attempts to fix these by doing a stricter test of the Unix-From lines. Any
lines that start "From " but do not pass this stricter test are escaped with a
> character.

Usage: cleanarch [options] < inputfile > outputfile
Options:
    -s n
    --status=n
        Print a # character every n lines processed

    -q / --quiet
        Don't print changed line information to standard error.

    -n / --dry-run
        Don't actually output anything.

    -h / --help
        Print this message and exit
"""
import sys
import re
import getopt
import mailbox
# Compiled pattern used by main() as the strict test for a Unix-From line.
# NOTE(review): _fromlinepattern is a private attribute of
# mailbox.UnixMailbox -- confirm it exists in the Python version this script
# is run with.
cre = re.compile(mailbox.UnixMailbox._fromlinepattern)
# From RFC 2822, a header field name must contain only characters from 33-126
# inclusive, excluding colon.  I.e. from oct 41 to oct 176 less oct 072.  Must
# use re.match() so that it's anchored at the beginning of the line.
#
# The octal escapes are written with exactly three digits: a longer form such
# as \0176 is read by the re module as \017 followed by a literal '6', which
# is not the intended upper bound of the range.
fre = re.compile(r'[\041-\071\073-\176]+')

def usage(code, msg=''):
    """Print the module help text to standard error and exit.

    code -- exit status handed to sys.exit().
    msg  -- optional extra message (a string or an exception object) that
            is printed on its own line after the help text; nothing extra
            is printed when it is empty.

    This function does not return; sys.exit() raises SystemExit.
    """
    # sys.stderr.write() is used instead of the "print >>" chevron so the
    # function parses on every Python version; the output is identical.
    sys.stderr.write('%s\n' % (__doc__,))
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(code)

def escape_line(line, lineno, quiet, output):
    """Emit `line' escaped with a leading '>' and report the change.

    line   -- the offending line, as read from the input.
    lineno -- line number shown in the report.
    quiet  -- when true, the report on standard error is suppressed.
    output -- when false (dry run), nothing is written to standard output.
    """
    if output:
        sys.stdout.write('>' + line)
    if not quiet:
        # Drop only a real trailing newline.  The last line of the input may
        # not have one, and blindly slicing off the final character would
        # then truncate the reported text.
        if line.endswith('\n'):
            shown = line[:-1]
        else:
            shown = line
        sys.stderr.write('[%d] %s\n' % (lineno, shown))

def main():
    """Copy an mbox archive from stdin to stdout, escaping bogus From lines.

    Command-line options are parsed with getopt (see the module docstring
    for the list).  Every input line that starts with "From " is checked:
    it is kept as a message separator only if it matches the strict
    Unix-From pattern `cre' AND the line after it looks like a header
    (contains a colon and starts with a valid field-name character per
    `fre').  Any other "From " line is passed to escape_line().  All other
    lines are copied through unchanged unless --dry-run was given.

    Progress marks and the final message count go to standard error.
    """
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hqns:',
            ['help', 'quiet', 'dry-run', 'status='])
    except getopt.error:
        # Fetch the exception through sys.exc_info() so that neither the
        # old "except X, e" nor the newer "except X as e" spelling is
        # needed; this form parses on every Python version.
        usage(1, sys.exc_info()[1])

    quiet = 0
    output = 1
    status = -1

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-q', '--quiet'):
            quiet = 1
        elif opt in ('-n', '--dry-run'):
            output = 0
        elif opt in ('-s', '--status'):
            try:
                status = int(arg)
            except ValueError:
                usage(1, 'Bad status number: %s' % arg)

    if args:
        usage(1)

    lineno = 0
    statuscnt = 0
    messages = 0
    while 1:
        lineno += 1
        line = sys.stdin.readline()
        if not line:
            break
        if line.startswith('From '):
            if cre.match(line):
                # This is a real Unix-From line.  But it could be a message
                # /about/ Unix-From lines, so as a second order test, make
                # sure there's at least one RFC 2822 header following.
                #
                # Remember where the From line was: lineno is advanced for
                # the look-ahead line below, and any report about the From
                # line must carry its own number, not the next line's.
                fromlineno = lineno
                nextline = sys.stdin.readline()
                lineno += 1
                if not nextline:
                    # It was the last line of the mbox, so it couldn't have
                    # been a Unix-From
                    escape_line(line, fromlineno, quiet, output)
                    break
                fieldname = nextline.split(':', 1)
                if len(fieldname) < 2 or not fre.match(nextline):
                    # The following line was not a header, so this wasn't a
                    # valid Unix-From
                    escape_line(line, fromlineno, quiet, output)
                    if output:
                        sys.stdout.write(nextline)
                else:
                    # It's a valid Unix-From line
                    messages += 1
                    if output:
                        sys.stdout.write(line)
                        sys.stdout.write(nextline)
            else:
                # This is a bogus Unix-From line
                escape_line(line, lineno, quiet, output)
        elif output:
            # Any old line
            sys.stdout.write(line)
        # NOTE(review): lineno can advance by two in one iteration (the
        # look-ahead above), so a multiple of `status' may be stepped over
        # and its progress mark skipped.
        if status > 0 and (lineno % status) == 0:
            sys.stderr.write('#')
            statuscnt += 1
            if statuscnt > 50:
                # Wrap the row of progress marks.
                sys.stderr.write('\n')
                statuscnt = 0
    # We've found `messages' separators, so the number of messages is +1
    # NOTE(review): every accepted Unix-From line already starts a message,
    # so this +1 looks like it over-counts by one for an mbox that begins
    # with a From line -- confirm before changing the reported figure.
    messages += 1
    sys.stderr.write('%d messages found\n' % messages)

# Run the cleaner only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

AltStyle によって変換されたページ (->オリジナル) /