Re: e-mail database

Steven D. Majewski (sdm7g@elvis.med.virginia.edu)
Fri, 24 Jun 1994 16:49:08 -0400

On Jun 24, 10:24, Budi Rahardjo wrote:
>
> [Sorry for cross-post to several newsgroups.]
> I would like to database e-mails from a mailing list.
> (To date I already have around 30,000 e-mails)
> What I would like to do is to have WWW-browsers as the
> front end. Users should be able to search the database
> based on "From:", "Subject:", and the contents.
>
>
> I am looking for the database program to do that in
> either perl, icon, python, C, or whatever.
> Should I use WAIS instead ? (I thought WAIS cannot handle
> fielded text. Is there a trick to do this ?)
> An automatic mechanism to add new mails should be available.
>
> Does such beast exist?
> Reply through e-mail is preferred. Will summarize if needed.
>

Well - I have an incomplete python IMAP module which I intend
to turn into a WWW to IMAP gateway. (someday!)

IMAP ( Interactive Mail Access Protocol ) allows remote access
to mailboxes with searching done by the server.
( with implicit AND or explicit OR of field specifiers )

However: the existing IMAP servers I'm aware of don't do any
pre-indexing of the mailboxes ( although, once freed of a
particular mailbox format by the protocol, that would be a
logical direction: mail stores in indexed files. ) so searches
thru a large mailbox may take a significantly long time.

imap servers and protocol specifications are available
from ftp://ftp.cac.washington.edu/

Here are the (VERY) rudimentary + experimental Python classes.
If you have a server, you can try function test(), which
will return a list of messages from possibly several
mailboxes, and will print out URL's ( almost - I intend
to use either message-id's or IMAP unique-id's rather
than sequence number, which may not be the same between
sessions. )

The headers and body of the first message returned
would be test()[0].head() and test()[0].body() respectively.

[ I haven't yet added parsing of the returned objects,
like envelopes, for example. That is next. ]

- Steve Majewski (804-982-0831) <sdm7g@Virginia.EDU>
- UVA Department of Molecular Physiology and Biological Physics

#
# Classes are defined for
# IMAP Sessions : ( host, user, password )
# basic socket and session oriented methods
# login, logout, select, examine, find, ...
# IMAP Mailboxes : ( Session, mailbox-name )
# search ( also select(self), examine(self) )
# IMAP Messages : ( Mailbox, seq-number )
# head(), body, etc.
#
# All classes have a url() method to return a url string.
#
#
import string
import sys
from socket import gethostbyname,getservbyname,socket,AF_INET,SOCK_STREAM

try:
    from rand import choice
except ImportError:
    # The old `rand` module is not shipped with current interpreters;
    # random.choice is the drop-in equivalent.
    from random import choice

# generate an increasing sequence of symbols
# all elements of sequence share a common "random" prefix
#
class GenSym:
    """Generate an increasing sequence of symbols sharing one random prefix.

    Each symbol is the two-letter prefix (one lowercase letter followed by
    one uppercase letter, picked once per instance) plus a zero-padded
    four-digit counter, e.g. ``qM0001``.
    """

    # Fixed ASCII alphabets: string.lowercase / string.uppercase depend on
    # the locale (and do not exist on newer interpreters), so they could
    # put non-ASCII letters into the prefix.
    _LOWER = 'abcdefghijklmnopqrstuvwxyz'
    _UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def __init__(self):
        self._prefix = choice(self._LOWER) + choice(self._UPPER)
        self._last = 0

    def last(self):
        """Return the most recently generated symbol without advancing."""
        return self._prefix + '%04d' % self._last

    def next(self):
        """Advance the counter by one and return the new symbol."""
        self._last = self._last + 1
        return self.last()

_BUFSIZ = 8192                # maximum bytes returned by one raw Session.recv()
_S_CONNECTED = 'Connected'    # Session.state marker: the socket is open
_S_LOGIN = 'Login'            # Session.state marker: LOGIN was answered OK


class InvalidState(Exception):
    """A command was issued in the wrong session state, or a reply line did
    not carry the tag of the last command sent."""


class ImapError(Exception):
    """The server answered a command with a non-OK status, or the
    connection ended before the tagged reply arrived."""


ReadOnlyMode = 'RO'
ReadWriteMode = 'RW'
DefaultMode = ReadOnlyMode

# TODO: change session to require host,user,password args
# and make an anonSession class that supplies defaults.


class Session:
    """One connection to an IMAP server.

    Session()                          -- unconnected session
    Session(host)                      -- connect only
    Session(host, user, passwd)        -- connect and log in
    Session(host, user, passwd, mbox)  -- ... and EXAMINE mbox

    Attributes set along the way: ``state`` (list of state markers),
    ``status`` ([tag, status-word, rest-of-line] of the last tagged reply),
    ``mbox`` (name of the mailbox last opened successfully, else None),
    ``log`` (callable given reply text, or a false value to disable).
    """

    def __init__(self, *args):
        # can be another file write method or [].append
        self.log = sys.stderr.write
        self.state = []
        self.mbox = None    # nothing opened yet; Mailbox.search() reads this
        self._gensym = GenSym()
        self.nextseq = self._gensym.next
        self.lastseq = self._gensym.last
        if args:
            self.connect(args[0])
        if args[1:]:
            self.login(*args[1:3])
        if args[3:]:
            self.examine(args[3])

    def connect(self, *hostname):
        """Open the connection and read the server banner; return self.

        With no argument the host name 'loghost' is used.  Raises
        InvalidState if already connected and ValueError if more than one
        host name is given.
        """
        if _S_CONNECTED in self.state:
            raise InvalidState('Already Connected - logout first.')
        if not hostname:
            hostname = 'loghost'
        elif len(hostname) == 1:
            hostname = hostname[0]
        else:
            raise ValueError('connect() takes at most one hostname')
        self.hostname = hostname
        self.host = gethostbyname(hostname)
        self.port = getservbyname('imap', 'tcp')
        self.sock = socket(AF_INET, SOCK_STREAM)
        self.sock.connect((self.host, self.port))
        self.file = self.sock.makefile('rw')
        self.state.append(_S_CONNECTED)
        # Until login() succeeds the url carries no credentials.
        self._url = 'imap://' + self.hostname + '/'
        self.banner = self.readline()
        if self.log:
            self.log(self.banner)
        return self

    def login(self, user, passwd):
        """Send LOGIN and record whether the server accepted it.

        Raises InvalidState when the session is not connected.  A rejected
        login does not raise; inspect ``self.status`` / ``self.state``.
        """
        self.user = user
        self.passwd = passwd
        if _S_CONNECTED not in self.state:
            raise InvalidState('Not connected')
        self.send('LOGIN ' + user + ' ' + passwd)
        reply = self.readlines()    # also updates self.status
        if self.log:
            for line in reply:
                self.log(line[:-1])
        if self.status[1] == 'OK' and _S_LOGIN not in self.state:
            self.state.append(_S_LOGIN)
        # NOTE(review): the url embeds the clear-text password.
        self._url = ('imap://' + self.user + ':' + self.passwd
                     + '@' + self.hostname + '/')

    def url(self):
        """Return the url string for this session."""
        return self._url

    # low level routines:
    def send(self, cmd):
        """Send one command line, prefixed with a freshly generated tag."""
        data = self.nextseq() + ' ' + cmd + '\r\n'
        if not isinstance(data, bytes):
            data = data.encode('latin-1')
        # sendall: a plain send() may transmit only part of the command.
        self.sock.sendall(data)

    def recv(self):
        """Raw socket read of at most _BUFSIZ bytes (bypasses self.file)."""
        return self.sock.recv(_BUFSIZ)

    def readline(self):
        """Return the next reply line from the buffered socket file."""
        return self.file.readline()

    def readlines(self):
        """Collect reply lines up to and including the line that starts with
        the tag of the last command, record it via set_status() and return
        the list.  Raises ImapError if the connection ends first."""
        tag = self.lastseq()
        width = len(tag)
        lines = []
        while 1:
            line = self.readline()
            if not line:
                # End of stream: without this check the loop never ends.
                raise ImapError('connection closed before reply to ' + tag)
            lines.append(line)
            if line[:width] == tag:
                break
        self.set_status(line)
        return lines

    def set_status(self, line):
        """Split a tagged reply line into self.status and return its status
        word.  Raises InvalidState if the tag is not the last one sent."""
        fields = line.split()
        if not fields or fields[0] != self.lastseq():
            raise InvalidState('message sequence out of order')
        self.status = fields[:2]
        self.status.append(' '.join(fields[2:]))
        return self.status[1]

    def logout(self):
        """Send LOGOUT, log the reply and drop the logged-in marker."""
        self.send('LOGOUT')
        try:
            # Read through the buffered file like every other command; a
            # raw recv() could miss data the file object already holds.
            reply = self.readlines()
        except ImapError:
            reply = []    # server hung up without a tagged reply
        if self.log:
            for line in reply:
                self.log(line)
        if _S_LOGIN in self.state:
            self.state.remove(_S_LOGIN)

    def close(self):
        """Log out if connected, then release the socket and reset state."""
        if _S_CONNECTED not in self.state:
            self.state = []
            return
        try:
            self.logout()
        finally:
            self.file.close()
            self.sock.close()
            self.state = []

    def __del__(self):
        # A destructor must never raise; the object may be half-built or
        # the connection already gone.
        try:
            self.close()
        except Exception:
            pass

    def _open_mailbox(self, command, mbox):
        # Shared body of select()/examine(): remember mbox only on OK.
        self.send(command + ' ' + mbox)
        reply = self.readlines()
        if self.status[1] == 'OK':
            self.mbox = mbox
        return reply

    def select(self, mbox):
        """Send SELECT for mbox and return the reply lines."""
        return self._open_mailbox('SELECT', mbox)

    def examine(self, mbox):
        """Send EXAMINE for mbox and return the reply lines."""
        return self._open_mailbox('EXAMINE', mbox)

    def find(self, match):
        """Send FIND ALL.MAILBOXES and return the list of names reported on
        reply lines whose second word is MAILBOX."""
        self.send('FIND ALL.MAILBOXES ' + match)
        found = []
        for line in self.readlines():
            fields = line.split()
            if len(fields) > 2 and fields[1] == 'MAILBOX':
                found.append(fields[-1])
        return found

    def Mailbox(self, mbox):
        """Return a Mailbox object for mbox on this session."""
        return Mailbox(self, mbox)


ImapSession = Session # qualified name synonym, in case of "from imap import *"


class Mailbox:
    """A named mailbox on a Session: Mailbox(session, mailbox_name).

    Supports len(), zero-based indexing and slicing (yielding Message
    objects) and search().  ``mode`` is ReadOnlyMode or ReadWriteMode and
    ``choose`` is the matching method used to (re)open the mailbox.
    """

    def __init__(self, *args):
        if len(args) != 2:
            raise TypeError('bad argument list')
        self.mode = DefaultMode
        self.choose = self.examine
        self.session = args[0]
        self.mbox = args[1]
        # isinstance accepts subclasses and needs no throwaway Session().
        if not isinstance(self.session, Session):
            raise TypeError(' arg must be instance of ' + repr(Session))
        self.len = 0
        self.examine()

    def _update_len(self, reply):
        # Pick the message count out of any "<n> EXISTS" reply line.
        for line in reply:
            fields = line.split()
            if (len(fields) >= 2 and fields[-1] == 'EXISTS'
                    and fields[-2].isdigit()):
                self.len = int(fields[-2])

    def __len__(self):
        """Re-open the mailbox in its current mode, send CHECK and return
        the message count.  Raises ImapError if CHECK is not answered OK."""
        self.choose()
        self.session.send('CHECK')
        reply = self.session.readlines()
        if self.session.status[1] != 'OK':
            raise ImapError(' '.join(self.session.status))
        self._update_len(reply)
        return self.len

    def RO(self):
        """Switch to read-only mode and re-open with EXAMINE."""
        self.mode = ReadOnlyMode
        self.choose = self.examine
        self.choose()

    def RW(self):
        """Switch to read-write mode and re-open with SELECT."""
        self.mode = ReadWriteMode
        self.choose = self.select
        self.choose()

    def examine(self):
        """EXAMINE this mailbox; refresh the count and return the reply."""
        reply = self.session.examine(self.mbox)
        self._update_len(reply)
        return reply

    def select(self):
        """SELECT this mailbox; refresh the count and return the reply."""
        reply = self.session.select(self.mbox)
        self._update_len(reply)
        return reply

    def __getitem__(self, i):
        """Return the Message at zero-based index i (sequence number i+1),
        or a list of Messages for a slice.  Bounds come from the last known
        message count, so iteration terminates with IndexError."""
        if isinstance(i, slice):
            return [self.Message(n + 1)
                    for n in range(*i.indices(self.len))]
        if i < 0:
            i = i + self.len
        if not 0 <= i < self.len:
            raise IndexError('message index out of range')
        return self.Message(i + 1)

    def Message(self, i):
        """Return the Message with sequence number i in this mailbox."""
        return Message(self, i)

    def __getslice__(self, i, j):
        # Old-style slicing hook; clamping happens in __getitem__ so an
        # open-ended slice does not run past the mailbox.
        return self[slice(i, j)]

    def search(self, match):
        """Send SEARCH with the given criteria and return a list of Message
        objects, or None when the server does not answer OK."""
        if self.session.mbox != self.mbox:
            self.choose()
        self.session.send('SEARCH ' + match)
        reply = self.session.readlines()
        if self.session.status[1] != 'OK':
            return None
        hits = []
        for line in reply[:-1]:
            fields = line.split()
            if fields[:2] == ['*', 'SEARCH']:
                hits.extend([int(num) for num in fields[2:]])
        return [self.Message(num) for num in hits]

    def url(self):
        """Return the session url followed by the mailbox name."""
        return self.session.url() + self.mbox

class Message:
    """One message of a Mailbox, addressed by its sequence number.

    mbox must provide ``session``, ``choose()`` and ``url()``; the session
    must provide ``send()`` and ``readlines()``.
    """

    def __init__(self, mbox, seqn):
        self.mbox = mbox
        self.seqn = seqn
        self.session = self.mbox.session

    def _fetch(self, item):
        # Re-open the owning mailbox, then FETCH one data item for this
        # message and return the raw reply lines.
        self.mbox.choose()
        self.session.send('FETCH ' + self.id() + ' ' + item)
        return self.session.readlines()

    def body(self):
        """Return the reply lines of FETCH ... RFC822.TEXT."""
        return self._fetch('RFC822.TEXT')

    def head(self):
        """Return the reply lines of FETCH ... RFC822.HEADER."""
        return self._fetch('RFC822.HEADER')

    def uid(self):
        """Return the reply lines of FETCH ... UID."""
        # NOTE: FETCH uid and some other FETCH commands in IMAP2bis don't
        # work on my server.
        return self._fetch('UID')

    def id(self):
        """Return the sequence number as a string."""
        # str(), not repr(): repr of a long integer can carry a trailing
        # 'L' (and a string would gain quotes), corrupting the command.
        return str(self.seqn)    # TEMP: should be uid or message id

    def url(self):
        """Return the mailbox url plus '#' and this message's id."""
        return self.mbox.url() + '#' + self.id()

def test():
    """Interactive smoke test.

    Prompts for host, user, password, a mailbox wildcard and a search
    string; logs in, searches every matching mailbox, prints one url per
    hit and returns the list of Message objects.  Returns None as soon as
    a required prompt is answered with an empty line.
    """
    import getpass

    try:
        ask = raw_input
    except NameError:       # interpreters where input() reads a plain line
        ask = input

    host = ask('Enter imap server hostname: ')
    if not host:
        return None
    user = ask('Enter user_id: ')
    if not user:
        return None
    # getpass keeps the password from being echoed to the terminal.
    passwd = getpass.getpass('Enter password: ')
    if not passwd:
        return None
    imap = Session(host, user, passwd)
    names = ask('Enter mailbox wildcard name: ')
    if not names:
        return None     # No match : bail out!

    mboxes = imap.find(names)
    print('Mailboxes found: ')
    for mb in mboxes:
        print(mb)
    print('enter a selection string to be used on mailboxes')
    match = ask('selection string: ')

    msglist = []
    for mb in mboxes:
        found = Mailbox(imap, mb).search(match)
        # search() returns None when the server rejects the criteria;
        # skip that mailbox instead of failing on the concatenation.
        if found:
            msglist.extend(found)

    # NOTE(review): Session builds its url with the login password in it,
    # so these printed urls expose the password.
    for m in msglist:
        print('<' + m.url() + '>')
    return msglist