#! /usr/bin/env python

import os, sys, pickle, string, re
from Mailman.Utils import mkdir, open_ex

# TBD: ugly, ugly, ugly -baw
open = open_ex

__version__ = '0.05 (Mailman edition)'
VERSION = __version__
CACHESIZE = 100                  # Number of slots in the article cache

msgid_pat = re.compile(r'(<.*>)')

def strip_separators(s):
    "Remove quotes or parenthesization from a Message-ID string"
    if s==None or s=="":
        return ""
    if s[0] in '"<([' and s[-1] in '">)]':
        s = s[1:-1]
    return s

smallNameParts = ['van', 'von', 'der', 'de']

def fixAuthor(author):
    "Canonicalize a name into Last, First format"
    # If there's a comma, guess that it's already in "Last, First" format
    if ',' in author:
        return author
    L = string.split(author)
    i = len(L) - 1
    if i==0:
        return author               # The string's one word--forget it
    if string.upper(author)==author or string.lower(author)==author:
        # Damn, the name is all upper- or lower-case.
        while i>0 and string.lower(L[i-1]) in smallNameParts:
            i = i - 1
    else:
        # Mixed case; assume that small parts of the last name will be
        # in lowercase, and check them against the list.
        while i>0 and (L[i-1][0] in string.lowercase or
                       string.lower(L[i-1]) in smallNameParts):
            i = i - 1
    author = string.join(L[-1:]+L[i:-1], ' ')+', '+string.join(L[:i], ' ')
    return author


# Abstract class for databases
class Database:
    def __init__(self): pass
    def close(self): pass
    def getArticle(self, archive, msgid): pass
    def hasArticle(self, archive, msgid): pass
    def addArticle(self, archive, article, subjectkey, authorkey): pass
    def firstdate(self, archive): pass
    def lastdate(self, archive): pass
    def first(self, archive, index): pass
    def next(self, archive, index): pass
    def numArticles(self, archive): pass
    def newArchive(self, archive): pass
    def setThreadKey(self, archive, key, msgid): pass
    # Takes an archive argument, matching the BSDDBdatabase
    # implementation and its caller in add_article below.
    def getOldestArticle(self, archive, subject): pass


# The Article class encapsulates a single posting.  The attributes are:
#
#  sequence    : Sequence number, unique for each article in a set of
#                archives
#  subject     : Subject
#  datestr     : The posting date, in human-readable format
#  date        : The posting date, in purely numeric format
#  headers     : Any other headers of interest
#  author      : The author's name (and possibly organization)
#  email       : The author's e-mail address
#  msgid       : A unique message ID
#  in_reply_to : If != "", this is the msgid of the article being
#                replied to
#  references  : A (possibly empty) list of msgid's of earlier articles
#                in the thread
#  body        : A list of strings making up the message body

class Article:
    import time
    __last_article_time = time.time()

    def __init__(self, message=None, sequence=0, keepHeaders=[]):
        import time
        if message==None:
            return
        self.sequence = sequence
        self.parentID = None
        self.threadKey = None
        # Use the Message-ID header if the message has one;
        # otherwise the current sequence number is used.
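        # (Illustrative, hypothetical value: a message arriving with no
        # Message-Id header and sequence number 42 gets msgid == '42'.)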
        id = strip_separators(message.getheader('Message-Id'))
        if id=="":
            self.msgid = str(self.sequence)
        else:
            self.msgid = id
        if message.has_key('Subject'):
            self.subject = str(message['Subject'])
        else:
            self.subject = 'No subject'
        if self.subject=="":
            self.subject = 'No subject'
        if message.has_key('Date'):
            self.datestr = str(message['Date'])
            date = message.getdate_tz('Date')
        else:
            self.datestr = 'None'
            date = None
        if date!=None:
            date, tzoffset = date[:9], date[-1]
            # A parseable Date: header may still lack a timezone
            if tzoffset==None:
                tzoffset = 0
            date = time.mktime(date) - tzoffset
        else:
            date = self.__last_article_time + 1
            print 'Article without date:', self.msgid
        # Assign through the class so the timestamp is shared across
        # articles; assigning through self would only create an instance
        # attribute, and every later date-less article would get the
        # same fallback time.
        Article.__last_article_time = date
        self.date = '%011i' % (date,)

        # Figure out the e-mail address and poster's name
        self.author, self.email = message.getaddr('From')
        e = message.getheader('Reply-To')
        if e!=None:
            self.email = e
        self.email = strip_separators(self.email)
        self.author = strip_separators(self.author)
        if self.author=="":
            self.author = self.email

        # Save the 'In-Reply-To:' and 'References:' lines
        i_r_t = message.getheader('In-Reply-To')
        if i_r_t==None:
            self.in_reply_to = ''
        else:
            match = msgid_pat.search(i_r_t)
            if match==None:
                self.in_reply_to = ''
            else:
                self.in_reply_to = strip_separators(match.group(1))
        references = message.getheader('References')
        if references==None:
            self.references = []
        else:
            self.references = map(strip_separators,
                                  string.split(references))

        # Save any other interesting headers
        self.headers = {}
        for i in keepHeaders:
            if message.has_key(i):
                self.headers[i] = message[i]

        # Read the message body
        self.body = []
        message.rewindbody()
        while (1):
            line = message.fp.readline()
            if line=="":
                break
            self.body.append(line)

    def __repr__(self):
        return '<Article ID = '+repr(self.msgid)+'>'
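
# Illustrative only (hypothetical header values): for a posting whose
# headers include
#     From: "John Smith" <jsmith@example.com>
#     Message-Id: <19980522.1530@example.com>
# the constructor above yields roughly
#     article.msgid  == '19980522.1530@example.com'
#     article.author == 'John Smith'
#     article.email  == 'jsmith@example.com'
#     article.date   == an 11-digit zero-padded string, so that plain
#                       string comparison sorts articles chronologically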

# Pipermail formatter class

class T:
    DIRMODE = 0755              # Mode to give to created directories
    FILEMODE = 0644             # Mode to give to created files
    INDEX_EXT = ".html"         # Extension for indexes

    def __init__(self, basedir=None, reload=1, database=None):
        # If basedir isn't provided, assume the current directory
        if basedir==None:
            self.basedir = os.getcwd()
        else:
            basedir = os.path.expanduser(basedir)
            self.basedir = basedir
        self.database = database

        # If the directory doesn't exist, create it
        try:
            os.stat(self.basedir)
        except os.error, errdata:
            errno, errmsg = errdata
            if errno!=2:                # Anything but ENOENT is fatal
                raise os.error, errdata
            else:
                self.message('Creating archive directory '+self.basedir)
                mkdir(self.basedir, self.DIRMODE)

        # Try to load previously pickled state
        try:
            if not reload:
                raise IOError
            f = open(os.path.join(self.basedir, 'pipermail.pck'), 'r')
            self.message('Reloading pickled archive state')
            d = pickle.load(f)
            f.close()
            for key, value in d.items():
                setattr(self, key, value)
        except IOError:
            # No pickled version, so initialize various attributes
            self.archives = []          # Archives
            self._dirty_archives = []   # Archives that will have to be
                                        # updated
            self.sequence = 0           # Sequence variable used for
                                        # numbering articles
            self.update_TOC = 0         # Does the TOC need updating?
        #
        # Make the basedir variable work when passed in as an __init__ arg
        # and different from the one in the pickle.  Let the one passed in
        # as an __init__ arg take precedence if it's stated.  This way, an
        # archive can be moved from one place to another and still work.
        #
        if basedir!=None and basedir != self.basedir:
            self.basedir = basedir

    def close(self):
        "Close an archive, saving its state and updating any changed archives."
        self.update_dirty_archives()    # Update all changed archives
        # If required, update the table of contents
        # (the 'or 1' currently forces a rewrite on every close)
        if self.update_TOC or 1:
            self.update_TOC = 0
            self.write_TOC()
        # Save the collective state
        self.message('Pickling archive state into '
                     + os.path.join(self.basedir, 'pipermail.pck'))
        self.database.close()
        del self.database
        f = open(os.path.join(self.basedir, 'pipermail.pck'), 'w')
        pickle.dump(self.__dict__, f)
        f.close()

    #
    # Private methods
    #
    # These will be neither overridden nor called by custom archivers.
    #

    # Set various parameters (as instance attributes) that will be used
    # by the write_index_{header,footer} functions
    def __set_parameters(self, archive):
        import time
        # Determine the earliest and latest date in the archive
        firstdate = self.database.firstdate(archive)
        lastdate = self.database.lastdate(archive)
        # Get the current time
        now = time.asctime(time.localtime(time.time()))
        self.firstdate = firstdate
        self.lastdate = lastdate
        self.archivedate = now
        self.size = self.database.numArticles(archive)
        self.archive = archive
        self.version = __version__

    # Find the message ID of an article's parent, or return None
    # if no parent can be found.
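    # The search order is: (1) the In-Reply-To header, (2) the newest
    # article named in the References header that is actually in the
    # archive, and (3) failing those, the oldest article with the same
    # subject.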
    def __findParent(self, article, children=[]):
        parentID = None
        if article.in_reply_to!='':
            parentID = article.in_reply_to
        elif article.references!=[]:
            # Remove article IDs that aren't in the archive
            refs = filter(self.articleIndex.has_key, article.references)
            if len(refs):
                refs = map(lambda x, s=self:
                           s.database.getArticle(s.archive, x), refs)
                maxdate = refs[0]
                for i in refs[1:]:
                    if i.date > maxdate.date:
                        maxdate = i
                parentID = maxdate.msgid
        else:
            # Look for the oldest matching subject
            try:
                key, tempid = self.subjectIndex.set_location(article.subject)
                print key, tempid
                self.subjectIndex.next()
                [subject, date] = string.split(key, '\0')
                print article.subject, subject, date
                if subject==article.subject and tempid not in children:
                    parentID = tempid
            except KeyError:
                pass
        return parentID

    # Update the threaded index completely
    def updateThreadedIndex(self):
        # Erase the threaded index
        self.database.clearIndex(self.archive, 'thread')

        # Loop over all the articles
        msgid = self.database.first(self.archive, 'date')
        while msgid != None:
            article = self.database.getArticle(self.archive, msgid)
            if article.parentID==None or \
               not self.database.hasArticle(self.archive, article.parentID):
                # No parent in the archive: the article starts a thread
                # of its own (mirroring the behavior of add_article)
                article.threadKey = article.date+'-'
            else:
                parent = self.database.getArticle(self.archive,
                                                  article.parentID)
                article.threadKey = parent.threadKey+article.date+'-'
            self.database.setThreadKey(self.archive,
                article.threadKey+'\000'+article.msgid, msgid)
            msgid = self.database.next(self.archive, 'date')

##        L1=[] ; L2=[]
##        while (1):
##            article=self.database.getArticle(self.archive, msgid)
##            L1.append('') ; L2.append(msgid)
##            L1=map(lambda x, d=article.date: d+'-'+x, L1)
##            parentID=self.__findParent(article, L2)
##            if parentID==None or not self.database.hasArticle(parentID):
##                break
##            else: msgid=parentID
##        for i in range(0, len(L1)):
##            self.database.setThreadKey(self.archive, L1[i], '\000'+L2[i])
##            self.database.setThreadKey(self.archive, '\000'+L2[i], L1[i])

    #
    # Public methods:
    #
    # These are part of the public interface of the T class, but will
    # never be overridden (unless you're trying to do something very new).
    #

    # Update a single archive's indices, whether the archive's been
    # dirtied or not.
    def update_archive(self, archive):
        self.archive = archive
        self.message("Updating index files for archive ["+archive+']')
        arcdir = os.path.join(self.basedir, archive)
        parameters = self.__set_parameters(archive)

        # Handle the 3 simple indices first
        for i in ['Date', 'Subject', 'Author']:
            self.message(" "+i)
            self.type = i
            i = string.lower(i)         # Get the right index

            # Redirect sys.stdout to the index file
            f = open(os.path.join(arcdir, i+self.INDEX_EXT), 'w')
##            os.chmod(f.name, self.FILEMODE)
            temp_stdout, sys.stdout = sys.stdout, f
            self.write_index_header()
            count = 0
            # Loop over the index entries
            msgid = self.database.first(archive, i)
            while msgid != None:
                article = self.database.getArticle(self.archive, msgid)
                count = count + 1
                self.write_index_entry(article)
                msgid = self.database.next(archive, i)
            # Finish up this index
            self.write_index_footer()
            sys.stdout = temp_stdout
            f.close()

        # Print the threaded index
        self.message(" Thread")
        temp_stdout, sys.stdout = sys.stdout, open(
            os.path.join(arcdir, 'thread'+self.INDEX_EXT), 'w')
##        os.chmod(os.path.join(arcdir, 'thread'+self.INDEX_EXT),
##                 self.FILEMODE)
        self.type = 'Thread'
        self.write_index_header()

        # To handle the prev./next in thread pointers, we need to
        # track articles 5 at a time.
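        # The list L acts as a sliding window over the thread index:
        #   L[0] : two entries back (may be None)
        #   L[1] : previous article in thread order
        #   L[2] : the article whose entry is being written
        #   L[3] : next article in thread order
        #   L[4] : two entries ahead (may be None)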
        # Get the first 5 articles
        L = [None]*5
        i = 2
        msgid = self.database.first(self.archive, 'thread')
        while msgid!=None and i<5:
            L[i] = self.database.getArticle(self.archive, msgid)
            i = i + 1
            msgid = self.database.next(self.archive, 'thread')

        while L[2]!=None:
            article = L[2]
            artkey = None
            if article!=None:
                artkey = article.threadKey
            if artkey!=None:
                self.write_threadindex_entry(article,
                                             string.count(artkey, '-')-1)
                if self.database.changed.has_key((archive, article.msgid)):
                    a1 = L[1]
                    a3 = L[3]
                    self.update_article(arcdir, article, a1, a3)
                    if a3!=None:
                        self.database.changed[(archive, a3.msgid)] = None
                    if a1!=None:
                        if not self.database.changed.has_key(
                                (archive, a1.msgid)):
                            self.update_article(arcdir, a1, L[0], L[2])
                        else:
                            del self.database.changed[(archive, a1.msgid)]
            L = L[1:]                   # Rotate the list
            if msgid==None:
                L.append(msgid)
            else:
                L.append(self.database.getArticle(self.archive, msgid))
            msgid = self.database.next(self.archive, 'thread')

        self.write_index_footer()
        sys.stdout = temp_stdout

    # Update only archives that have been marked as "changed".
    def update_dirty_archives(self):
        for i in self._dirty_archives:
            self.update_archive(i)
        self._dirty_archives = []

    # Read a Unix mailbox file from the file object 'input', and create
    # a series of Article objects.  Each article object will then be
    # archived.
    def processUnixMailbox(self, input, articleClass=Article):
        import mailbox
        mbox = mailbox.UnixMailbox(input)
        while (1):
            m = mbox.next()
            if not m:
                break                   # End of file reached
            a = articleClass(m, self.sequence)  # Create an article object
            self.sequence = self.sequence + 1   # Increment the archive's
                                                # sequence number
            self.add_article(a)         # Add the article

    # Archive an Article object.
    def add_article(self, article):
        # Determine into what archives the article should be placed
        archives = self.get_archives(article)
        if archives==None:
            archives = []       # If no value was returned, ignore it
        if type(archives)==type(''):
            archives = [archives]       # If a string was returned,
                                        # convert to a list
        if archives==[]:
            return                      # Ignore the article

        # Add the article to each archive in turn
        article.filename = filename = self.get_filename(article)
        temp = self.format_article(article)     # Reformat the article
        self.message("Processing article #"+str(article.sequence)
                     +' into archives '+str(archives))
        for i in archives:
            self.archive = i
            archivedir = os.path.join(self.basedir, i)
            # If it's a new archive, create it
            if i not in self.archives:
                self.archives.append(i)
                self.update_TOC = 1
                self.database.newArchive(i)
                # If the archive directory doesn't exist, create it
                try:
                    os.stat(archivedir)
                except os.error, errdata:
                    errno, errmsg = errdata
                    if errno==2:
                        mkdir(archivedir, self.DIRMODE)
                    else:
                        raise os.error, errdata
                self.open_new_archive(i, archivedir)

            # Write the HTML-ized article
            f = open(os.path.join(archivedir, filename), 'w')
##            os.chmod(os.path.join(archivedir, filename), self.FILEMODE)
            temp_stdout, sys.stdout = sys.stdout, f
            self.write_article_header(temp)
            sys.stdout.writelines(temp.body)
            self.write_article_footer(temp)
            sys.stdout = temp_stdout
            f.close()

            authorkey = fixAuthor(article.author)+'\000'+article.date
            subjectkey = string.lower(article.subject)+'\000'+article.date

            # Update parenting info
            parentID = None
            if article.in_reply_to!='':
                parentID = article.in_reply_to
            elif article.references!=[]:
                # Remove article IDs that aren't in the archive
                refs = filter(lambda x, self=self:
                              self.database.hasArticle(self.archive, x),
                              article.references)
                if len(refs):
                    refs = map(lambda x, s=self:
                               s.database.getArticle(s.archive, x), refs)
                    maxdate = refs[0]
                    for ref in refs[1:]:
                        if ref.date > maxdate.date:
                            maxdate = ref
                    parentID = maxdate.msgid
            else:
                # Get the oldest article with a matching subject, and
                # assume this is a follow-up to that article
                parentID = self.database.getOldestArticle(self.archive,
                                                          article.subject)

            if parentID!=None and not self.database.hasArticle(
                    self.archive, parentID):
                parentID = None
            article.parentID = parentID
            if parentID!=None:
                parent = self.database.getArticle(self.archive, parentID)
                article.threadKey = parent.threadKey+article.date+'-'
            else:
                article.threadKey = article.date+'-'
            self.database.setThreadKey(self.archive,
                article.threadKey+'\000'+article.msgid, article.msgid)
            self.database.addArticle(i, temp, subjectkey, authorkey)

            if i not in self._dirty_archives:
                self._dirty_archives.append(i)
        del temp

    # Abstract methods: these will need to be overridden by subclasses
    # before anything useful can be done.

    def get_filename(self, article): pass
    def get_archives(self, article):
        """Return a list of indexes where the article should be filed.
        A string can be returned if the list only contains one entry,
        and the empty list is legal."""
        pass
    def format_article(self, article): pass
    def write_index_header(self): pass
    def write_index_footer(self): pass
    def write_index_entry(self, article): pass
    def write_threadindex_entry(self, article, depth): pass
    def write_article_header(self, article): pass
    def write_article_footer(self, article): pass
    def write_article_entry(self, article): pass
    def update_article(self, archivedir, article, prev, next): pass
    def write_TOC(self): pass
    def open_new_archive(self, archive, dir): pass
    def message(self, msg): pass


class BSDDBdatabase(Database):
    def __init__(self, basedir):
        self.__cachekeys = []
        self.__cachedict = {}
        self.__currentOpenArchive = None   # The currently open indices
        self.basedir = os.path.expanduser(basedir)
        self.changed = {}       # Recently added articles, indexed only
                                # by message ID

    def firstdate(self, archive):
        import time
        self.__openIndices(archive)
        date = 'None'
        try:
            date, msgid = self.dateIndex.first()
            date = time.asctime(time.localtime(string.atof(date)))
        except KeyError:
            pass
        return date

    def lastdate(self, archive):
        import time
        self.__openIndices(archive)
        date = 'None'
        try:
            date, msgid = self.dateIndex.last()
            date = time.asctime(time.localtime(string.atof(date)))
        except KeyError:
            pass
        return date

    def numArticles(self, archive):
        self.__openIndices(archive)
        return len(self.dateIndex)

    # Add a single article to the internal indexes for an archive.
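    # The date, author and subject B-trees map sort keys to the message
    # ID; the article itself is pickled (minus its body) into the
    # article index, and the thread key (the parent's key plus this
    # article's date) is computed and stored as well.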
    def addArticle(self, archive, article, subjectkey, authorkey):
        import pickle
        self.__openIndices(archive)

        # Add the new article
        self.dateIndex[article.date] = article.msgid
        self.authorIndex[authorkey] = article.msgid
        self.subjectIndex[subjectkey] = article.msgid
        # Set the 'body' attribute to empty, to avoid storing the
        # whole message
        temp = article.body
        article.body = []
        self.articleIndex[article.msgid] = pickle.dumps(article)
        article.body = temp
        self.changed[archive, article.msgid] = None

        parentID = article.parentID
        if parentID!=None and self.articleIndex.has_key(parentID):
            parent = self.getArticle(archive, parentID)
            myThreadKey = parent.threadKey+article.date+'-'
        else:
            myThreadKey = article.date+'-'
        article.threadKey = myThreadKey
        self.setThreadKey(archive, myThreadKey+'\000'+article.msgid,
                          article.msgid)

    # Open the BSDDB files that are being used as indices
    # (dateIndex, authorIndex, subjectIndex, articleIndex)
    def __openIndices(self, archive):
        if self.__currentOpenArchive==archive:
            return
        import bsddb
        self.__closeIndices()
        # print 'opening indices for [%s]' % (repr(archive),)
        arcdir = os.path.join(self.basedir, 'database')
        try:
            mkdir(arcdir)
        except os.error:
            pass
        for i in ['date', 'author', 'subject', 'article', 'thread']:
            t = bsddb.btopen(os.path.join(arcdir, archive+'-'+i), 'c')
            setattr(self, i+'Index', t)
        self.__currentOpenArchive = archive

    # Close the BSDDB files that are being used as indices (if they're
    # open--this is safe to call if they're already closed)
    def __closeIndices(self):
        if self.__currentOpenArchive!=None:
            pass
            # print 'closing indices for [%s]' % \
            #       (repr(self.__currentOpenArchive),)
        for i in ['date', 'author', 'subject', 'thread', 'article']:
            attr = i+'Index'
            if hasattr(self, attr):
                index = getattr(self, attr)
                if i=='article':
                    if not hasattr(self, 'archive_length'):
                        self.archive_length = {}
                    self.archive_length[self.__currentOpenArchive] = \
                        len(index)
                index.close()
                delattr(self, attr)
        self.__currentOpenArchive = None

    def close(self):
        self.__closeIndices()

    def hasArticle(self, archive, msgid):
        self.__openIndices(archive)
        return self.articleIndex.has_key(msgid)

    def setThreadKey(self, archive, key, msgid):
        self.__openIndices(archive)
        self.threadIndex[key] = msgid

    def getArticle(self, archive, msgid):
        self.__openIndices(archive)
        if self.__cachedict.has_key(msgid):
            # Cache hit: move the ID to the end of the list, so the
            # least recently used entry is evicted first
            self.__cachekeys.remove(msgid)
            self.__cachekeys.append(msgid)
            return self.__cachedict[msgid]
        if len(self.__cachekeys)==CACHESIZE:
            delkey, self.__cachekeys = (self.__cachekeys[0],
                                        self.__cachekeys[1:])
            del self.__cachedict[delkey]
        s = self.articleIndex[msgid]
        article = pickle.loads(s)
        self.__cachekeys.append(msgid)
        self.__cachedict[msgid] = article
        return article

    def first(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        try:
            key, msgid = index.first()
            return msgid
        except KeyError:
            return None

    def next(self, archive, index):
        self.__openIndices(archive)
        index = getattr(self, index+'Index')
        try:
            key, msgid = index.next()
            return msgid
        except KeyError:
            return None

    def getOldestArticle(self, archive, subject):
        self.__openIndices(archive)
        subject = string.lower(subject)
        try:
            key, tempid = self.subjectIndex.set_location(subject)
            self.subjectIndex.next()
            [subject2, date] = string.split(key, '\0')
            if subject!=subject2:
                return None
            return tempid
        except KeyError:
            return None

    def newArchive(self, archive):
        pass

    def clearIndex(self, archive, index):
        self.__openIndices(archive)
        # Delete every key in the requested index
        index = getattr(self, index+'Index')
        finished = 0
        try:
            key, msgid = index.first()
        except KeyError:
            finished = 1
        while not finished:
            del index[key]
            try:
                key, msgid = index.next()
            except KeyError:
                finished = 1
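
# A minimal, hypothetical self-test sketch (not part of the pipermail
# API): it exercises the two pure helper functions at the top of this
# module when the file is run directly; importing the module is
# unaffected.  The expected values follow directly from the code above.
if __name__=='__main__':
    assert strip_separators('<123@example.com>')=='123@example.com'
    assert strip_separators('"quoted"')=='quoted'
    assert strip_separators(None)==''
    assert fixAuthor('John Smith')=='Smith, John'
    assert fixAuthor('Smith, John')=='Smith, John'
    print 'pipermail helper self-test passed'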