# Copyright (C) 1998 by the Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """HyperArch: Pipermail archiving for MailMan - The Dragon De Monsyne TODO: - The templates should be be files in Mailman's Template dir, instead of static strings. - Each list should be able to have it's own templates. Also, it should automatically fall back to default template in case of error in list specific template. - Should be able to force all HTML to be regenerated next time the archive is run, incase a template is changed. - Run a command to generate tarball of html archives for downloading (prolly in the 'update_dirty_archives' method ) """ import sys import re, cgi, urllib, string import time, pickle, os, posixfile import HyperDatabase import pipermail from Mailman import mm_cfg from Mailman.Utils import mkdir, open_ex # TBD: ugly, ugly, ugly -baw open = open_ex try: import gzip except ImportError: gzip = None def html_quote(s): repls = ( ('&', '&'), ("<", '<'), (">", '>'), ('"', '"')) for thing, repl in repls: s = string.replace(s, thing, repl) return s def url_quote(s): return urllib.quote(s) article_text_template="""\ From %(email)s %(datestr)s Date: %(datestr)s From: %(author)s %(email)s Subject: %(subject)s %(body)s """ article_template='''\ %(subject_html)s %(prev)s %(next)s

%(subject_html)s

%(author_html)s %(email_html)s
%(datestr_html)s


%(body)s

''' def CGIescape(arg): s=cgi.escape(str(arg)) s=re.sub('"', '"', s) return s # Parenthesized human name paren_name_pat=re.compile(r'([(].*[)])') # Subject lines preceded with 'Re:' REpat=re.compile( r"\s*RE\s*:\s*", re.IGNORECASE) # E-mail addresses and URLs in text emailpat=re.compile(r'([-+,.\w]+@[-+.\w]+)') # Argh! This pattern is buggy, and will choke on URLs with GET parameters. urlpat=re.compile(r'(\w+://[^>)\s]+)') # URLs in text # Blank lines blankpat=re.compile(r'^\s*$') # # Starting directive htmlpat=re.compile(r'^\s*\s*$', re.IGNORECASE) # Ending directive nohtmlpat=re.compile(r'^\s*\s*$', re.IGNORECASE) # Match quoted text quotedpat=re.compile(r'^([>|:]|>)+') # Note: I'm overriding most, if not all of the pipermail Article class # here -ddm # The Article class encapsulates a single posting. The attributes are: # # sequence : Sequence number, unique for each article in a set of archives # subject : Subject # datestr : The posting date, in human-readable format # date : The posting date, in purely numeric format # headers : Any other headers of interest # author : The author's name (and possibly organization) # email : The author's e-mail address # msgid : A unique message ID # in_reply_to : If !="", this is the msgid of the article being replied to # references: A (possibly empty) list of msgid's of earlier articles in # the thread # body : A list of strings making up the message body class Article(pipermail.Article): __last_article_time=time.time() html_tmpl=article_template text_tmpl=article_text_template def as_html(self): d = self.__dict__.copy() if self.prev: d["prev"] = ('' % (url_quote(self.prev.filename))) d["prev_wsubj"] = ('
  • Previous message:' ' %s
  • ' % (url_quote(self.prev.filename), html_quote(self.prev.subject))) else: d["prev"] = d["prev_wsubj"] = "" if self.next: d["next"] = ('' % (html_quote(self.next.filename))) d["next_wsubj"] = ('
  • Next message: %s
  • ' % (url_quote(self.next.filename), html_quote(self.next.subject))) else: d["next"] = d["next_wsubj"] = "" d["email_html"] = html_quote(self.email) d["subject_html"] = html_quote(self.subject) d["author_html"] = html_quote(self.author) d["email_url"] = url_quote(self.email) d["datestr_html"] = html_quote(self.datestr) d["body"] = string.join(self.body, "") return self.html_tmpl % d def as_text(self): d = self.__dict__.copy() d["body"] = string.join(self.body, "") return self.text_tmpl % d def __init__(self, message=None, sequence=0, keepHeaders=[]): import time if message==None: return self.sequence=sequence self.parentID = None self.threadKey = None self.prev=None self.next=None # otherwise the current sequence number is used. id=pipermail.strip_separators(message.getheader('Message-Id')) if id=="": self.msgid=str(self.sequence) else: self.msgid=id if message.has_key('Subject'): self.subject=str(message['Subject']) else: self.subject='No subject' i=0 while (i!=-1): result=REpat.match(self.subject) if result: i = result.end(0) self.subject=self.subject[i:] else: i=-1 if self.subject=="": self.subject='No subject' if message.has_key('Date'): self.datestr=str(message['Date']) date=message.getdate_tz('Date') else: self.datestr='None' date=None if date!=None: date, tzoffset=date[:9], date[-1] if not tzoffset: tzoffset = 0 date=time.mktime(date)-tzoffset else: date=self.__last_article_time+1 self.__last_article_time=date self.date='%011i' % (date,) # Figure out the e-mail address and poster's name self.author, self.email=message.getaddr('From') self.email=pipermail.strip_separators(self.email) self.author=pipermail.strip_separators(self.author) if self.author=="": self.author=self.email # Save the 'In-Reply-To:' and 'References:' lines i_r_t=message.getheader('In-Reply-To') if i_r_t==None: self.in_reply_to='' else: match=pipermail.msgid_pat.search(i_r_t) if match==None: self.in_reply_to='' else: self.in_reply_to=pipermail.strip_separators(match.group(1)) references=message.getheader('References') if references==None: self.references=[] else: self.references = map(pipermail.strip_separators, string.split(references)) # Save any other interesting headers self.headers={} for i in keepHeaders: if message.has_key(i): self.headers[i]=message[i] # Read the message body self.body=[] message.rewindbody() while (1): line=message.fp.readline() if line=="": break self.body.append(line) def loadbody_fromHTML(self,fileobj): self.body=[] begin=0 while(1): line=fileobj.readline() if not line: break if (not begin) and string.strip(line)=='': begin=1 continue if string.strip(line)=='': break if begin: self.body.append(line) def __getstate__(self): d={} for each in self.__dict__.keys(): if each in ['maillist','prev','next','body']: d[each] = None else: d[each] = self.__dict__[each] d['body']=[] return d # # Archive class specific stuff # index_header_template=''' The %(listname)s %(archive)s Archive by %(archtype)s

    %(archive)s Archives by %(archtype)s

    Starting: %(firstdate)s
    Ending: %(lastdate)s
    Messages: %(size)s

    Last message date: %(lastdate)s
    Archived on: %(archivedate)s


    This archive was generated by Pipermail %(version)s. ''' TOC_template='''\ The %(listname)s Archives

    The %(listname)s Archives

    More info on this list...

    %(noarchive_msg)s %(archive_listing_start)s %(archive_listing)s %(archive_listing_end)s ''' TOC_entry_template = '''\ %(archive)s: [ Thread ] [ Subject ] [ Author ] [ Date ] %(textlink)s ''' arch_listing_start = '''\ ''' arch_listing_end = '''\
    Archive View by: Downloadable version
    ''' class HyperArchive(pipermail.T): # some defaults DIRMODE=02775 FILEMODE=0660 VERBOSE=0 DEFAULTINDEX='thread' ARCHIVE_PERIOD='month' THREADLAZY=0 THREADLEVELS=3 ALLOWHTML=1 # "Lines between " handled as is. SHOWHTML=0 # Eg, nuke leading whitespace in html manner. IQUOTES=1 # Italicize quoted text. SHOWBR=0 # Add
    onto every line html_hdr_tmpl=index_header_template html_foot_tmpl=index_footer_template html_TOC_tmpl=TOC_template TOC_entry_tmpl = TOC_entry_template arch_listing_start = arch_listing_start arch_listing_end = arch_listing_end def html_foot(self): d = {"lastdate": html_quote(self.lastdate), "archivedate": html_quote(self.archivedate), "listinfo": self.maillist.GetAbsoluteScriptURL('listinfo'), "version": self.version} for t in ("thread", "subject", "author", "date"): cap = string.upper(t[0]) + t[1:] if self.type == cap: d["%s_ref" % (t)] = "" else: d["%s_ref" % (t)] = ('[ %s ]' % (t, t)) return self.html_foot_tmpl % d def html_head(self): d = {"listname": html_quote(self.maillist.real_name), "archtype": self.type, "archive": self.archive, "listinfo": self.maillist.GetAbsoluteScriptURL('listinfo'), "firstdate": html_quote(self.firstdate), "lastdate": html_quote(self.lastdate), "size": self.size, } for t in ("thread", "subject", "author", "date"): cap = string.upper(t[0]) + t[1:] if self.type == cap: d["%s_ref" % (t)] = "" else: d["%s_ref" % (t)] = ('[ %s ]' % (t, t)) return self.html_hdr_tmpl % d def html_TOC(self): d = {"listname": self.maillist.real_name, "listinfo": self.maillist.GetAbsoluteScriptURL('listinfo') } listing = "" if not self.archives: d["noarchive_msg"] = '

    Currently, there are no archives.

    ' d["archive_listing_start"] = "" d["archive_listing_end"] = "" d["archive_listing"] = "" else: d["noarchive_msg"] = "" d["archive_listing_start"] = self.arch_listing_start d["archive_listing_end"] = self.arch_listing_end for a in self.archives: # Check to see if the archive is gzip'd or not txtfile = os.path.join(mm_cfg.PREFIX, 'archives/private', self.maillist._internal_name, a + '.txt') gzfile = txtfile + '.gz' templ = '[ %(fmt)sText%(sz)s]' # which exists? .txt.gz first, then .txt if os.path.exists(gzfile): file = gzfile url = a + '.txt.gz' fmt = "Gzip'd " elif os.path.exists(txtfile): file = txtfile url = a + '.txt' fmt = '' else: # neither found? file = None # in Python 1.5.2 we have an easy way to get the size if file: try: size = os.path.getsize(file) except AttributeError: # getsize() probably does this anyway ;-) size = os.stat(file)[6] if size < 1000: sz = ' %d bytes ' % size elif size < 1000000: sz = ' %d KB ' % (size / 1000) else: sz = ' %d MB ' % (size / 1000000) # GB?? :-) textlink = templ % {'url': url, 'fmt': fmt, 'sz' : sz} else: # there's no archive file at all... hmmm. textlink = '' listing = listing + self.TOC_entry_tmpl % \ {'archive' : a, 'textlink': textlink} d["archive_listing"] = listing return self.html_TOC_tmpl % d def __init__(self, maillist,unlock=1): self.maillist=maillist self._unlocklist=unlock self._lock_file=None # # can't init the database while other # processes are writing to it! # XXX TODO- implement native locking # with mailman's flock module for HyperDatabase.HyperDatabase # pipermail.T.__init__( self, maillist.archive_dir(), reload=1, database=HyperDatabase.HyperDatabase(maillist.archive_dir())) if hasattr(self.maillist,'archive_volume_frequency'): if self.maillist.archive_volume_frequency == 0: self.ARCHIVE_PERIOD='year' elif self.maillist.archive_volume_frequency == 2: self.ARCHIVE_PERIOD='quarter' elif self.maillist.archive_volume_frequency == 3: self.ARCHIVE_PERIOD='week' elif self.maillist.archive_volume_frequency == 4: self.ARCHIVE_PERIOD='day' else: self.ARCHIVE_PERIOD='month' def GetArchLock(self): if self._lock_file: return 1 # TBD: This needs to be rewritten to use the generalized locking # mechanism (when that exists). -baw ou = os.umask(0) try: self._lock_file = posixfile.open( os.path.join(mm_cfg.LOCK_DIR, '%s@arch.lock' % self.maillist._internal_name), 'a+') finally: os.umask(ou) # minor race condition here, there is no way to atomicly # check & get a lock. That shouldn't matter here tho' -ddm if not self._lock_file.lock('w?', 1): self._lock_file.lock('w|', 1) else: return 0 return 1 def DropArchLock(self): if self._lock_file: self._lock_file.lock('u') self._lock_file.close() self._lock_file = None def processListArch(self): name = self.maillist.ArchiveFileName() wname= name+'.working' ename= name+'.err_unarchived' try: os.stat(name) except (IOError,os.error): #no archive file, nothin to do -ddm return #see if arch is locked here -ddm if not self.GetArchLock(): #another archiver is running, nothing to do. -ddm return #if the working file is still here, the archiver may have # crashed during archiving. Save it, log an error, and move on. try: wf=open(wname,'r') self.maillist.LogMsg("error","Archive working file %s present. " "Check %s for possibly unarchived msgs" % (wname,ename)) ef=open(ename, 'a+') ef.seek(1,2) if ef.read(1) <> '\n': ef.write('\n') ef.write(wf.read()) ef.close() wf.close() os.unlink(wname) except IOError: pass os.rename(name,wname) if self._unlocklist: self.maillist.Unlock() archfile=open(wname,'r') self.processUnixMailbox(archfile, Article) archfile.close() os.unlink(wname) self.DropArchLock() def get_filename(self, article): return '%06i.html' % (article.sequence,) def get_archives(self, article): """Return a list of indexes where the article should be filed. A string can be returned if the list only contains one entry, and the empty list is legal.""" if article.subject in ['subscribe', 'unsubscribe']: return None res = self.dateToVolName(string.atof(article.date)) self.message("figuring article archives\n") self.message(res + "\n") return res # The following two methods should be inverses of each other. -ddm def dateToVolName(self,date): datetuple=time.gmtime(date) if self.ARCHIVE_PERIOD=='year': return time.strftime("%Y",datetuple) elif self.ARCHIVE_PERIOD=='quarter': if datetuple[1] in [1,2,3]: return time.strftime("%Yq1",datetuple) elif datetuple[1] in [4,5,6]: return time.strftime("%Yq2",datetuple) elif datetuple[1] in [7,8,9]: return time.strftime("%Yq3",datetuple) else: return time.strftime("%Yq4",datetuple) elif self.ARCHIVE_PERIOD == 'day': return time.strftime("%Y%m%d", datetuple) elif self.ARCHIVE_PERIOD == 'week': datetuple = list(datetuple) datetuple[2] = datetuple[2] - datetuple[6] # subtract week day # # even if the the day of the month counter is negative, # we still get the right thing from strftime! -scott # return time.strftime("Week-of-Mon-%Y%m%d", tuple(datetuple)) # month. -ddm else: return time.strftime("%Y-%B",datetuple) def volNameToDate(self,volname): volname=string.strip(volname) volre= { 'year' : r'^(?P[0-9]{4,4})$', 'quarter' : r'^(?P[0-9]{4,4})q(?P[1234])$', 'month' : r'^(?P[0-9]{4,4})-(?P[a-zA-Z]+)$', 'week': r'^Week-of-Mon-(?P[0-9]{4,4})(?P[01][0-9])(?P[0123][0-9])', 'day': r'^(?P[0-9]{4,4})(?P[01][0-9])(?P[0123][0-9])$'} for each in volre.keys(): match=re.match(volre[each],volname) if match: year=string.atoi(match.group('year')) month=1 day = 1 if each == 'quarter': q=string.atoi(match.group('quarter')) month=(q*3)-2 elif each == 'month': monthstr=string.lower(match.group('month')) m=[] for i in range(1,13): m.append(string.lower( time.strftime("%B",(1999,i,1,0,0,0,0,1,0)))) try: month=m.index(monthstr)+1 except ValueError: pass elif each == 'week' or each == 'day': month = string.atoi(match.group("month")) day = string.atoi(match.group("day")) return time.mktime((year,month,1,0,0,0,0,1,-1)) return 0.0 def sortarchives(self): def sf(a,b,s=self): al=s.volNameToDate(a) bl=s.volNameToDate(b) if al>bl: return 1 elif al' print self.html_foot() def write_index_entry(self, article): print ('
  • %s %s' % (urllib.quote(article.filename), CGIescape(article.subject), article.sequence, CGIescape(article.author))) def write_threadindex_entry(self, article, depth): if depth<0: self.message('depth<0') depth=0 if depth>self.THREADLEVELS: depth=self.THREADLEVELS if depth' elif depth>self.depth: for i in range(depth-self.depth): print '
      ' print '' % (depth, article.threadKey) self.depth=depth print ('
    • %s %s' % (CGIescape(urllib.quote(article.filename)), CGIescape(article.subject), article.sequence+910, CGIescape(article.author))) def write_TOC(self): self.sortarchives() toc=open(os.path.join(self.basedir, 'index.html'), 'w') toc.write(self.html_TOC()) toc.close() # Archive an Article object. def add_article(self, article): # Determine into what archives the article should be placed archives=self.get_archives(article) # If no value was returned, ignore it: if archives==None: archives=[] # If a string was returned, convert to a list: if type(archives)==type(''): archives=[archives] if archives==[]: return # Ignore the article # Add the article to each archive in turn article.filename=filename=self.get_filename(article) article_text=article.as_text() temp=self.format_article(article) # Reformat the article self.message("Processing article #" + str(article.sequence) + ' into archives ' + str(archives)) for i in archives: self.archive=i archivedir=os.path.join(self.basedir, i) # If it's a new archive, create it if i not in self.archives: self.archives.append(i) self.update_TOC=1 self.database.newArchive(i) # If the archive directory doesn't exist, create it try: os.stat(archivedir) except os.error, errdata: errno, errmsg=errdata if errno==2: mkdir(archivedir) else: raise os.error, errdata self.open_new_archive(i, archivedir) # Write the HTML-ized article to the html archive. f=open(os.path.join(archivedir, filename), 'w') f.write(temp.as_html()) f.close() # Write the text article to the text archive. archivetextfile=os.path.join(self.basedir,"%s.txt" % i) f=open(archivetextfile, 'a+') f.write(article_text) f.close() authorkey=pipermail.fixAuthor(article.author)+'\000'+article.date subjectkey=string.lower(article.subject)+'\000'+article.date # Update parenting info parentID=None if article.in_reply_to!='': parentID=article.in_reply_to elif article.references!=[]: # Remove article IDs that aren't in the archive refs=filter( lambda x, self=self: self.database.hasArticle(self.archive, x), article.references) if len(refs): refs=map( lambda x, s=self: s.database.getArticle(s.archive, x), refs) maxdate=refs[0] for ref in refs[1:]: if ref.date>maxdate.date: maxdate=ref parentID=maxdate.msgid else: # Get the oldest article with a matching subject, and assume # this is a follow-up to that article parentID=self.database.getOldestArticle(self.archive, article.subject) if parentID!=None and not self.database.hasArticle(self.archive, parentID): parentID=None article.parentID=parentID if parentID!=None: parent=self.database.getArticle(self.archive, parentID) article.threadKey=parent.threadKey+article.date+'-' else: article.threadKey=article.date+'-' self.database.setThreadKey(self.archive, article.threadKey + '\000' + article.msgid, article.msgid) self.database.addArticle(i, temp, subjectkey, authorkey) if i not in self._dirty_archives: self._dirty_archives.append(i) del temp # Update only archives that have been marked as "changed". def update_dirty_archives(self): for i in self._dirty_archives: self.update_archive(i) archz=None archt=None # only do this if the gzip module was imported globally if gzip: try: txtfile = os.path.join(self.basedir, '%s.txt' % i) gzipfile = os.path.join(self.basedir, '%s.txt.gz' % i) oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % i) archt = open(txtfile, 'r') try: os.rename(gzipfile, oldgzip) archz = gzip.open(oldgzip) except (IOError, RuntimeError, os.error): pass newz = gzip.open(gzipfile, 'w') if archz : newz.write(archz.read()) archz.close() os.unlink(oldgzip) newz.write(archt.read()) newz.close() archt.close() os.unlink(txtfile) except IOError: pass self._dirty_archives=[] def close(self): "Close an archive, saving its state and updating any changed archives." self.update_dirty_archives()# Update all changed archives # If required, update the table of contents if self.update_TOC or 1: self.update_TOC=0 self.write_TOC() # Save the collective state self.message('Pickling archive state into ' + os.path.join(self.basedir, 'pipermail.pck')) self.database.close() del self.database f=open(os.path.join(self.basedir, 'pipermail.pck'), 'w') pickle.dump(self.__getstate__(), f) f.close() def __getstate__(self): d={} for each in self.__dict__.keys(): if not (each in ['maillist','_lock_file','_unlocklist'] or string.upper(each) == each): d[each] = self.__dict__[each] return d # Add tags around URLs and e-mail addresses. def __processbody_URLquote(self, source, dest): body2=[] last_line_was_quoted=0 for i in xrange(0, len(source)): Lorig=L=source[i] ; prefix=suffix="" if L==None: continue # Italicise quoted text if self.IQUOTES: quoted=quotedpat.match(L) if quoted==None: last_line_was_quoted=0 else: quoted = quoted.end(0) prefix=CGIescape(L[:quoted]) + '' suffix='' if self.SHOWHTML: suffix=suffix+'
      ' if not last_line_was_quoted: prefix='
      '+prefix L= L[quoted:] last_line_was_quoted=1 # Check for an e-mail address L2="" ; jr=emailpat.search(L) ; kr=urlpat.search(L) while jr!=None or kr!=None: if jr==None: j=-1 else: j = jr.start(0) if kr==None: k=-1 else: k = kr.start(0) if j!=-1 and (jk or j==-1): text=URL=kr.group(1) ; pos=k else: # j==k raise ValueError, "j==k: This can't happen!" length=len(text) #self.message("URL: %s %s %s \n" # % (CGIescape(L[:pos]), URL, CGIescape(text))) L2 = L2 + ('%s
      %s' % (CGIescape(L[:pos]), URL, CGIescape(text))) L=L[pos+length:] jr=emailpat.search(L) ; kr=urlpat.search(L) if jr==None and kr==None: L=CGIescape(L) L=prefix+L2+L+suffix if L!=Lorig: source[i], dest[i]=None, L # Escape all special characters def __processbody_CGIescape(self, source, dest): import cgi for i in xrange(0, len(source)): if source[i]!=None: dest[i]=cgi.escape(source[i]) ; source[i]=None # Perform Hypermail-style processing of directives # in message bodies. Lines between and will be written # out precisely as they are; other lines will be passed to func2 # for further processing . def __processbody_HTML(self, source, dest): l=len(source) ; i=0 while i directives if self.ALLOWHTML: self.__processbody_HTML(source, dest) self.__processbody_URLquote(source, dest) if not self.SHOWHTML: # Do simple formatting here:
      ..
      for i in range(0, len(source)): s=source[i] if s==None: continue dest[i]=CGIescape(s) ; source[i]=None if len(dest) > 0: dest.insert(0, '
      ')
                      dest.append('
      ') else: # Do fancy formatting here if self.SHOWBR: # Add
      onto every line for i in range(0, len(source)): s=source[i] if s==None: continue s=CGIescape(s) +'
      ' dest[i]=s ; source[i]=None else: for i in range(0, len(source)): s=source[i] if s==None: continue s=CGIescape(s) if s[0:1] in ' \t\n': s='

      '+s dest[i]=s ; source[i]=None article.body=filter(lambda x: x!=None, dest) return article def update_article(self, arcdir, article, prev, next): import os self.message('Updating HTML for article '+str(article.sequence)) try: f=open(os.path.join(arcdir, article.filename), 'r') article.loadbody_fromHTML(f) f.close() except IOError: self.message("article file %s is missing!" % os.path.join(arcdir, article.filename)) article.prev=prev article.next=next f=open(os.path.join(arcdir, article.filename), 'w') f.write(article.as_html()) f.close()