The program has two states when reading in an article: one for the header
and one for the body. This prevents non-header information from being
deleted. It also allows you to specify what 'necessary' header information
is; the default is the subject, the sender, and the date.
Enjoy,
Steve
-- begin strip.py --
#!/u/smiale/Python/python
# strip.py - steven miale
# Strips header information out of saved USENET postings
# except fields in 'allowed'. It will also attempt to strip
# signatures if sigstrip is set to 1. In that case, any line
# exactly equal to a string in sigstart will delete all lines
# from that point to the beginning of the next message.
import sys
import string
# Header fields that survive stripping; every other field is dropped.
allowed = ["From", "Subject", "Date"]

# Lines that mark the start of a signature (\012 is the newline that
# readline() leaves on the end of each line).
sigstart = [
    '--\012',
    '---\012',
    '-- \012',
]

# 1 = a line found in sigstart ends the article body, 0 = it does not.
sigstrip = 1

# Input and output files are named by the first two command-line arguments.
infile = open(sys.argv[1], 'r')
outfile = open(sys.argv[2], 'w')
def process_header(line):
    """Copy one header line to outfile if it is wanted.

    line is one raw line as returned by readline(): it still carries its
    trailing newline, and the empty string means end of file. The field
    name is the text before the first ":". A line is written to the
    module-level outfile when its field name is listed in the
    module-level allowed list, or when it is the blank line that ends
    the header; every other line is dropped.

    Returns 1 once the header is finished (blank line or end of file)
    and 0 while more header lines are expected.
    """
    if not line:
        # End of file: report the header as finished, otherwise a caller
        # that loops until a true result would spin forever.
        return 1
    if line == "\n":
        # The blank separator line is kept so the output stays readable.
        # Testing the line itself (rather than the length of the field
        # name) keeps single-letter fields such as "X: ..." from being
        # mistaken for the end of the header.
        outfile.write(line)
        return 1
    # str.split stands in for string.splitfields, which no longer exists
    # in Python 3.
    header = line.split(":", 1)[0]
    if header in allowed:
        outfile.write(line)
    return 0
def end_of_body(line):
    """Tell whether line terminates the body of the current article.

    Returns 1 when line is empty (end of file), when it starts with
    "Article", or - provided the module-level sigstrip equals 1 - when
    it is one of the entries of the module-level sigstart list.
    Returns 0 for every other line.
    """
    # End of input always ends the body.
    if not line:
        return 1
    # A line beginning with "Article" is taken as the start of the next
    # saved posting.
    if line[:7] == 'Article':
        return 1
    # Signature delimiters only count when signature stripping is on.
    if sigstrip == 1 and line in sigstart:
        return 1
    return 0
# Main loop: alternate between header mode and body mode, one pass per
# article, until the input is exhausted. The files are closed even if
# reading or writing fails part-way through.
try:
    while 1:
        # Header mode: feed lines to process_header until it reports the
        # end of the header. End of file is checked here as well, so that
        # input ending inside a header (or right after a stripped
        # signature) stops the program instead of looping forever.
        line = infile.readline()
        while line and not process_header(line):
            line = infile.readline()
        if not line:
            break

        # Body mode: copy lines through until end_of_body says stop
        # (end of file, the next article, or a signature delimiter).
        line = infile.readline()
        while not end_of_body(line):
            outfile.write(line)
            line = infile.readline()
        if not line:
            # The body ended because the input ran out.
            break
finally:
    try:
        infile.close()
    finally:
        outfile.close()
-- end strip.py --