# Copyright (C) 1998,1999,2000,2001 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

"""HyperArch: Pipermail archiving for Mailman

     - The Dragon De Monsyne

   TODO:
     - The templates should be files in Mailman's Template dir, instead
       of static strings.
     - Each list should be able to have its own templates.  Also, it should
       automatically fall back to the default template in case of an error
       in a list-specific template.
     - Should be able to force all HTML to be regenerated next time the
       archive is run, in case a template is changed.
     - Run a command to generate a tarball of html archives for downloading
       (probably in the 'update_dirty_archives' method).
"""

import sys
import re
import cgi
import urllib
import time
import os
import posixfile

import HyperDatabase
import pipermail

from Mailman import mm_cfg
from Mailman import Utils
from Mailman import EncWord
from Mailman.Logging.Syslog import syslog

gzip = None
if mm_cfg.GZIP_ARCHIVE_TXT_FILES:
    try:
        import gzip
    except ImportError:
        pass

EMPTYSTRING = ''
NL = '\n'



def html_quote(s):
    repls = ( ('&', '&amp;'),
              ('<', '&lt;'),
              ('>', '&gt;'),
              ('"', '&quot;'))
    for thing, repl in repls:
        s = s.replace(thing, repl)
    return s

def url_quote(s):
    return urllib.quote(s)

def null_to_space(s):
    return s.replace('\000', ' ')

def sizeof(filename):
    size = os.path.getsize(filename)
    if size < 1000:
        return ' %d bytes ' % size
    elif size < 1000000:
        return ' %d KB ' % (size / 1000)
    # GB?? :-)
    return ' %d MB ' % (size / 1000000)


html_charset = '<META http-equiv="Content-Type" content="text/html; charset=%s">'

def CGIescape(arg):
    s = cgi.escape(str(arg))
    return s.replace('"', '&quot;')

# Parenthesized human name
paren_name_pat = re.compile(r'([(].*[)])')

# Subject lines preceded with 'Re:'
REpat = re.compile(r"\s*RE\s*(\[\d+\]\s*)?:\s*", re.IGNORECASE)

# E-mail addresses and URLs in text
emailpat = re.compile(r'([-+,.\w]+@[-+.\w]+)')

# Argh!  This pattern is buggy, and will choke on URLs with GET parameters.
urlpat = re.compile(r'(\w+://[^>)\s]+)') # URLs in text

# Blank lines
blankpat = re.compile(r'^\s*$')

# Starting <html> directive
htmlpat = re.compile(r'^\s*<HTML>\s*$', re.IGNORECASE)
# Ending </html> directive
nohtmlpat = re.compile(r'^\s*</HTML>\s*$', re.IGNORECASE)
# Match quoted text
quotedpat = re.compile(r'^([>|:]|&gt;)+')
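

# Illustrative examples only (not from the original source): how the quoting
# helpers above behave.  html_quote() replaces '&' first, so the entities
# introduced by the later replacements are not double-escaped:
#
#     >>> html_quote('<b>"AT&T"</b>')
#     '&lt;b&gt;&quot;AT&amp;T&quot;&lt;/b&gt;'
#
#     >>> REpat.match('Re[2]: hello').end(0)  # Article.__init__ strips this
#     7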
# Note: I'm overriding most, if not all of the pipermail Article class
# here -ddm
# The Article class encapsulates a single posting.  The attributes are:
#
#  sequence : Sequence number, unique for each article in a set of archives
#  subject  : Subject
#  datestr  : The posting date, in human-readable format
#  date     : The posting date, in purely numeric format
#  fromdate : The posting date, in `unixfrom' format
#  headers  : Any other headers of interest
#  author   : The author's name (and possibly organization)
#  email    : The author's e-mail address
#  msgid    : A unique message ID
#  in_reply_to : If !="", this is the msgid of the article being replied to
#  references  : A (possibly empty) list of msgid's of earlier articles in
#                the thread
#  body     : A list of strings making up the message body


class Article(pipermail.Article):
    __super_init = pipermail.Article.__init__
    __super_set_date = pipermail.Article._set_date

    _last_article_time = time.time()

    # For compatibility with old archives loaded via pickle
    x, charset = mm_cfg.LC_DESCRIPTIONS[mm_cfg.DEFAULT_SERVER_LANGUAGE]
    cenc = None
    decoded = {}

    def __init__(self, message=None, sequence=0, keepHeaders=[]):
        self.__super_init(message, sequence, keepHeaders)
        self.prev = None
        self.next = None
        # Trim Re: from the subject line
        i = 0
        while i != -1:
            result = REpat.match(self.subject)
            if result:
                i = result.end(0)
                self.subject = self.subject[i:]
            else:
                i = -1
        if mm_cfg.ARCHIVER_OBSCURES_EMAILADDRS:
            self.email = re.sub('@', ' at ', self.email)
        # Snag the content-* headers.  RFC 1521 states that their values are
        # case insensitive.
        ctype = message.get('Content-Type', 'text/plain')
        cenc = message.get('Content-Transfer-Encoding', '')
        self.ctype = ctype.lower()
        self.cenc = cenc.lower()
        self.decoded = {}
        charset = message.get_param('charset')
        if charset:
            self.check_header_charsets(charset)
        else:
            self.check_header_charsets()
        if self.charset and self.charset in mm_cfg.VERBATIM_ENCODING:
            self.quote = lambda x: x

    def quote(self, buf):
        return html_quote(buf)
    def check_header_charsets(self, msg_charset=None):
        """Check From and Subject for encoded-words

        If the email, subject, or author attributes contain non-ASCII
        characters using the encoded-word syntax of RFC 2047, decoded
        versions of those attributes are placed in self.decoded (a
        dictionary).

        If the charsets used by these headers differ from each other or
        from the charset specified by the message's Content-Type header,
        then an arbitrary charset is chosen.  Only those values that match
        the chosen charset are decoded.
        """
        self.charset = msg_charset
        author, a_charset = self.decode_charset(self.author)
        subject, s_charset = self.decode_charset(self.subject)
        if author is not None or subject is not None:
            # Both charsets should be the same.  If they aren't, we
            # can only handle one way.
            if msg_charset is None:
                self.charset = a_charset or s_charset
            else:
                self.charset = msg_charset
        if author and self.charset == a_charset:
            self.decoded['author'] = author
            email, e_charset = self.decode_charset(self.email)
            if email:
                self.decoded['email'] = email
        if subject and self.charset == s_charset:
            self.decoded['subject'] = subject

    def decode_charset(self, field):
        if field.find("=?") == -1:
            return None, None
        try:
            s, c = EncWord.decode(field)
        except ValueError:
            return None, None
        return s, c.lower()

    def as_html(self):
        d = self.__dict__.copy()
        d["prev"], d["prev_wsubj"] = self._get_prev()
        d["next"], d["next_wsubj"] = self._get_next()
        d["email_html"] = self.quote(self.email)
        d["title"] = self.quote(self.subject)
        d["subject_html"] = self.quote(self.subject)
        d["author_html"] = self.quote(self.author)
        d["email_url"] = url_quote(self.email)
        d["datestr_html"] = self.quote(self.datestr)
        d["body"] = self._get_body()
        if self.charset is not None:
            d["encoding"] = html_charset % self.charset
        else:
            d["encoding"] = ""
        self._add_decoded(d)
        return Utils.maketext('article.html', d, raw=1)

    def _get_prev(self):
        """Return the href and subject for the previous message"""
        if self.prev:
            subject = self._get_subject_enc(self.prev)
            prev = ('<LINK REL="Previous" HREF="%s">'
                    % (url_quote(self.prev.filename)))
            prev_wsubj = ('<LI>Previous message:'
                          ' <A HREF="%s">%s\n</A></LI>'
                          % (url_quote(self.prev.filename),
                             self.quote(subject)))
        else:
            prev = prev_wsubj = ""
        return prev, prev_wsubj

    def _get_subject_enc(self, art):
        """Return the subject of art, decoded if possible.

        If the charset of the current message and art match and the
        article's subject is encoded, decode it.
        """
        if self.charset and art.charset \
               and self.charset == art.charset \
               and art.decoded.has_key('subject'):
            return art.decoded['subject']
        return art.subject

    def _get_next(self):
        """Return the href and subject for the next message"""
        if self.next:
            subject = self._get_subject_enc(self.next)
            next = ('<LINK REL="Next" HREF="%s">'
                    % (url_quote(self.next.filename)))
            next_wsubj = ('<LI>Next message:'
                          ' <A HREF="%s">%s\n</A></LI>'
                          % (url_quote(self.next.filename),
                             self.quote(subject)))
        else:
            next = next_wsubj = ""
        return next, next_wsubj

    _rx_quote = re.compile('=([A-F0-9][A-F0-9])')
    _rx_softline = re.compile('=[ \t]*$')

    def _get_body(self):
        """Return the message body ready for HTML, decoded if necessary"""
        try:
            body = self.html_body
        except AttributeError:
            body = self.body
        if self.charset is None or self.cenc != "quoted-printable":
            return null_to_space(EMPTYSTRING.join(body))
        # The charset is specified and the body is quoted-printable:
        # first get rid of soft line breaks, then decode literals.
        lines = []
        rx = self._rx_softline
        for line in body:
            mo = rx.search(line)
            if mo:
                i = line.rfind("=")
                line = line[:i]
            lines.append(line)
        buf = EMPTYSTRING.join(lines)
        chunks = []
        offset = 0
        rx = self._rx_quote
        while 1:
            mo = rx.search(buf, offset)
            if not mo:
                chunks.append(buf[offset:])
                break
            i = mo.start()
            chunks.append(buf[offset:i])
            offset = i + 3
            chunks.append(chr(int(mo.group(1), 16)))
        return null_to_space(EMPTYSTRING.join(chunks))

    def _add_decoded(self, d):
        """Add encoded-word keys to HTML output"""
        for src, dst in (('author', 'author_html'),
                         ('email', 'email_html'),
                         ('subject', 'subject_html')):
            if self.decoded.has_key(src):
                d[dst] = self.decoded[src]

    def as_text(self):
        d = self.__dict__.copy()
        # We need to guarantee a valid From_ line, even if there are
        # bogosities in the headers.
        if not d.get('fromdate', '').strip():
            d['fromdate'] = time.ctime(time.time())
        if not d.get('email', '').strip():
            d['email'] = 'bogus@does.not.exist.com'
        if not d.get('datestr', '').strip():
            d['datestr'] = time.ctime(time.time())
        #
        headers = ['From %(email)s  %(fromdate)s',
                   'From: %(email)s (%(author)s)',
                   'Date: %(datestr)s',
                   'Subject: %(subject)s']
        if d['_in_reply_to']:
            headers.append('In-Reply-To: %(_in_reply_to)s')
        if d['_references']:
            headers.append('References: %(_references)s')
        if d['_message_id']:
            headers.append('Message-ID: %(_message_id)s')
        return NL.join(headers) % d + '\n\n' + EMPTYSTRING.join(self.body)

    def _set_date(self, message):
        self.__super_set_date(message)
        self.fromdate = time.ctime(int(self.date))

    def loadbody_fromHTML(self, fileobj):
        self.body = []
        begin = 0
        while 1:
            line = fileobj.readline()
            if not line:
                break
            if not begin:
                if line.strip() == '<!--beginarticle-->':
                    begin = 1
                continue
            if line.strip() == '<!--endarticle-->':
                break
            self.body.append(line)

    def __getstate__(self):
        d = {}
        for each in self.__dict__.keys():
            if each == "quote":
                continue
            if each in ['maillist', 'prev', 'next', 'body']:
                d[each] = None
            else:
                d[each] = self.__dict__[each]
        d['body'] = []
        return d
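
# Illustrative example only (not from the original source): what
# Article._get_body() does with a quoted-printable body.  A trailing '='
# is a soft line break and is removed together with the break; '=XX' hex
# literals become single characters:
#
#     body = ['Gr=FC=\n', '=DFe\n']
#
# joins to 'Gr=FC=DFe\n' after soft-break removal, then decodes to
# 'Gr\xfc\xdfe\n' ("Gruesse" in latin-1).
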
#
# Archive class specific stuff
#
index_header_template = '''<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
  <HEAD>
     <title>The %(listname)s %(archive)s Archive by %(archtype)s</title>
     %(encoding)s
  </HEAD>
  <BODY BGCOLOR="#ffffff">
      <a name="start"></A>
      <h1>%(archive)s Archives by %(archtype)s</h1>
      <ul>
         <li> <b>Messages sorted by:</b>
              %(thread_ref)s
              %(subject_ref)s
              %(author_ref)s
              %(date_ref)s
         <li><b><a href="%(listinfo)s">More info on this list...
             </a></b></li>
      </ul>
      <p><b>Starting:</b> <i>%(firstdate)s</i><br>
         <b>Ending:</b> <i>%(lastdate)s</i><br>
         <b>Messages:</b> %(size)s<p>
      <ul>
'''

index_footer_template = '''\
    </ul>
    <p>
        <a name="end"><b>Last message date:</b></a>
        <i>%(lastdate)s</i><br>
        <b>Archived on:</b> <i>%(archivedate)s</i>
    <p>
    <ul>
        <li> <b>Messages sorted by:</b>
             %(thread_ref)s
             %(subject_ref)s
             %(author_ref)s
             %(date_ref)s
        <li><b><a href="%(listinfo)s">More info on this list...
            </a></b></li>
     </ul>
     <p>
     <hr>
     <i>This archive was generated by
     Pipermail %(version)s.</i>
  </BODY>
</HTML>
'''

index_entry_template = '<LI><A HREF="%s">%s</A><A NAME="%i">&nbsp;</A><I>%s</I>'

TOC_template = '''\
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
<HTML>
  <HEAD>
     <title>The %(listname)s Archives</title>
     %(encoding)s
  </HEAD>
  <BODY BGCOLOR="#ffffff">
     <h1>The %(listname)s Archives</h1>
     <p>
      You can get <a href="%(listinfo)s">more information about this list</a>
      or you can <a href="%(fullarch)s">download the full raw archive</a>
      (%(size)s).
     </p>
     %(noarchive_msg)s
     %(archive_listing_start)s
     %(archive_listing)s
     %(archive_listing_end)s
  </BODY>
</HTML>
'''

TOC_entry_template = '''\

	    <tr>
            <td>%(archive)s:</td>
            <td>
              <A href="%(archive)s/thread.html">[ Thread ]</a>
              <A href="%(archive)s/subject.html">[ Subject ]</a>
              <A href="%(archive)s/author.html">[ Author ]</a>
              <A href="%(archive)s/date.html">[ Date ]</a>
            </td>
            %(textlink)s
            </tr>

'''

arch_listing_start = '''\
	  <table border=3>
            <tr><td>Archive</td>
            <td>View by:</td>
            <td>Downloadable version</td></tr>
'''

arch_listing_end = '''\
	  </table>
'''
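
# Illustrative example only (not from the original source): a TOC row as
# html_TOC_entry() renders it for a monthly volume named '2001-June' whose
# gzipped text archive exists (the 14 KB size is made up for the example):
#
#	    <tr>
#            <td>2001-June:</td>
#            <td>
#              <A href="2001-June/thread.html">[ Thread ]</a>
#              <A href="2001-June/subject.html">[ Subject ]</a>
#              <A href="2001-June/author.html">[ Author ]</a>
#              <A href="2001-June/date.html">[ Date ]</a>
#            </td>
#            <td><A href="2001-June.txt.gz">[ Gzip'd Text 14 KB ]</a></td>
#            </tr>
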
class HyperArchive(pipermail.T):
    __super_init = pipermail.T.__init__
    __super_update_archive = pipermail.T.update_archive
    __super_update_dirty_archives = pipermail.T.update_dirty_archives
    __super_add_article = pipermail.T.add_article

    # Some defaults
    DIRMODE = 02775
    FILEMODE = 0660

    VERBOSE = 0
    DEFAULTINDEX = 'thread'
    ARCHIVE_PERIOD = 'month'

    THREADLAZY = 0
    THREADLEVELS = 3

    ALLOWHTML = 1       # "Lines between <html></html>" handled as is.
    SHOWHTML = 0        # Eg, nuke leading whitespace in html manner.
    IQUOTES = 1         # Italicize quoted text.
    SHOWBR = 0          # Add <br> onto every line
    def __init__(self, maillist, unlock=1):
        # Can't init the database while other processes are writing to it!
        # XXX TODO- implement native locking
        # with mailman's LockFile module for HyperDatabase.HyperDatabase
        #
        dir = maillist.archive_dir()
        db = HyperDatabase.HyperDatabase(dir)
        self.__super_init(dir, reload=1, database=db)

        self.maillist = maillist
        self._unlocklist = unlock
        self._lock_file = None
        self._charsets = {}
        x, self.charset = mm_cfg.LC_DESCRIPTIONS.get(
            maillist.preferred_language,
            mm_cfg.LC_DESCRIPTIONS[mm_cfg.DEFAULT_SERVER_LANGUAGE])

        if hasattr(self.maillist, 'archive_volume_frequency'):
            if self.maillist.archive_volume_frequency == 0:
                self.ARCHIVE_PERIOD = 'year'
            elif self.maillist.archive_volume_frequency == 2:
                self.ARCHIVE_PERIOD = 'quarter'
            elif self.maillist.archive_volume_frequency == 3:
                self.ARCHIVE_PERIOD = 'week'
            elif self.maillist.archive_volume_frequency == 4:
                self.ARCHIVE_PERIOD = 'day'
            else:
                self.ARCHIVE_PERIOD = 'month'

    html_hdr_tmpl = index_header_template
    html_foot_tmpl = index_footer_template
    html_TOC_tmpl = TOC_template
    TOC_entry_tmpl = TOC_entry_template
    arch_listing_start = arch_listing_start
    arch_listing_end = arch_listing_end

    def html_foot(self):
        d = {"lastdate": html_quote(self.lastdate),
             "archivedate": html_quote(self.archivedate),
             "listinfo": self.maillist.GetScriptURL('listinfo', absolute=1),
             "version": self.version}
        for t in ("thread", "subject", "author", "date"):
            cap = t[0].upper() + t[1:]
            if self.type == cap:
                d["%s_ref" % (t)] = ""
            else:
                d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>'
                                     % (t, t))
        return self.html_foot_tmpl % d

    def html_head(self):
        d = {"listname": html_quote(self.maillist.real_name),
             "archtype": self.type,
             "archive": self.archive,
             "listinfo": self.maillist.GetScriptURL('listinfo', absolute=1),
             "firstdate": html_quote(self.firstdate),
             "lastdate": html_quote(self.lastdate),
             "size": self.size,
             }
        for t in ("thread", "subject", "author", "date"):
            cap = t[0].upper() + t[1:]
            if self.type == cap:
                d["%s_ref" % (t)] = ""
            else:
                d["%s_ref" % (t)] = ('<a href="%s.html#start">[ %s ]</a>'
                                     % (t, t))
        if self.charset:
            d["encoding"] = html_charset % self.charset
        else:
            d["encoding"] = ""
        return self.html_hdr_tmpl % d

    def html_TOC(self):
        listname = self.maillist.internal_name()
        mbox = os.path.join(self.maillist.archive_dir() + '.mbox',
                            listname + '.mbox')
        d = {"listname": self.maillist.real_name,
             "listinfo": self.maillist.GetScriptURL('listinfo', absolute=1),
             "fullarch": '../%s.mbox/%s.mbox' % (listname, listname),
             "size": sizeof(mbox),
             }
        if not self.archives:
            d["noarchive_msg"] = '<P>Currently, there are no archives.</P>'
            d["archive_listing_start"] = ""
            d["archive_listing_end"] = ""
            d["archive_listing"] = ""
        else:
            d["noarchive_msg"] = ""
            d["archive_listing_start"] = self.arch_listing_start
            d["archive_listing_end"] = self.arch_listing_end
            accum = []
            for a in self.archives:
                accum.append(self.html_TOC_entry(a))
            d["archive_listing"] = EMPTYSTRING.join(accum)
        if not d.has_key("encoding"):
            d["encoding"] = ""
        return self.html_TOC_tmpl % d

    def html_TOC_entry(self, arch):
        # Check to see if the archive is gzip'd or not
        txtfile = os.path.join(mm_cfg.PRIVATE_ARCHIVE_FILE_DIR,
                               self.maillist.internal_name(),
                               arch + '.txt')
        gzfile = txtfile + '.gz'
        templ = '<td><A href="%(url)s">[ %(fmt)sText%(sz)s]</a></td>'
        # which exists?  .txt.gz first, then .txt
        if os.path.exists(gzfile):
            file = gzfile
            url = arch + '.txt.gz'
            fmt = "Gzip'd "
        elif os.path.exists(txtfile):
            file = txtfile
            url = arch + '.txt'
            fmt = ''
        else:
            # neither found?
            file = None
        # in Python 1.5.2 we have an easy way to get the size
        if file:
            textlink = templ % {'url': url,
                                'fmt': fmt,
                                'sz': sizeof(file),
                                }
        else:
            # there's no archive file at all... hmmm.
            textlink = ''
        return self.TOC_entry_tmpl % {'archive': arch,
                                      'textlink': textlink}

    def GetArchLock(self):
        if self._lock_file:
            return 1
        # TBD: This needs to be rewritten to use the generalized locking
        # mechanism (when that exists). -baw
        ou = os.umask(0)
        try:
            self._lock_file = posixfile.open(
                os.path.join(mm_cfg.LOCK_DIR,
                             '%s@arch.lock' % self.maillist.internal_name()),
                'a+')
        finally:
            os.umask(ou)
        # Minor race condition here: there is no way to atomically
        # check & get a lock.  That shouldn't matter here tho' -ddm
        if not self._lock_file.lock('w?', 1):
            self._lock_file.lock('w|', 1)
        else:
            return 0
        return 1

    def DropArchLock(self):
        if self._lock_file:
            self._lock_file.lock('u')
            self._lock_file.close()
            self._lock_file = None

    def processListArch(self):
        name = self.maillist.ArchiveFileName()
        wname = name + '.working'
        ename = name + '.err_unarchived'
        try:
            os.stat(name)
        except (IOError, os.error):
            # no archive file, nothing to do -ddm
            return
        # see if arch is locked here -ddm
        if not self.GetArchLock():
            # another archiver is running, nothing to do. -ddm
            return
        # If the working file is still here, the archiver may have
        # crashed during archiving.  Save it, log an error, and move on.
        try:
            wf = open(wname)
            syslog('error',
                   'Archive working file %s present.  '
                   'Check %s for possibly unarchived msgs',
                   wname, ename)
            omask = os.umask(007)
            try:
                ef = open(ename, 'a+')
            finally:
                os.umask(omask)
            ef.seek(1, 2)
            if ef.read(1) <> '\n':
                ef.write('\n')
            ef.write(wf.read())
            ef.close()
            wf.close()
            os.unlink(wname)
        except IOError:
            pass
        os.rename(name, wname)
        if self._unlocklist:
            self.maillist.Unlock()
        archfile = open(wname)
        self.processUnixMailbox(archfile, Article)
        archfile.close()
        os.unlink(wname)
        self.DropArchLock()

    def get_filename(self, article):
        return '%06i.html' % (article.sequence,)

    def get_archives(self, article):
        """Return a list of indexes where the article should be filed.

        A string can be returned if the list only contains one entry,
        and the empty list is legal.
        """
        res = self.dateToVolName(float(article.date))
        self.message("figuring article archives\n")
        self.message(res + "\n")
        return res
    # The following two methods should be inverses of each other. -ddm

    def dateToVolName(self, date):
        datetuple = time.localtime(date)
        if self.ARCHIVE_PERIOD == 'year':
            return time.strftime("%Y", datetuple)
        elif self.ARCHIVE_PERIOD == 'quarter':
            if datetuple[1] in [1, 2, 3]:
                return time.strftime("%Yq1", datetuple)
            elif datetuple[1] in [4, 5, 6]:
                return time.strftime("%Yq2", datetuple)
            elif datetuple[1] in [7, 8, 9]:
                return time.strftime("%Yq3", datetuple)
            else:
                return time.strftime("%Yq4", datetuple)
        elif self.ARCHIVE_PERIOD == 'day':
            return time.strftime("%Y%m%d", datetuple)
        elif self.ARCHIVE_PERIOD == 'week':
            # Reconstruct "seconds since epoch", and subtract weekday
            # multiplied by the number of seconds in a day.
            monday = time.mktime(datetuple) - datetuple[6] * 24 * 60 * 60
            # Build a new datetuple from this "seconds since epoch" value
            datetuple = time.localtime(monday)
            return time.strftime("Week-of-Mon-%Y%m%d", datetuple)
        # month. -ddm
        else:
            return time.strftime("%Y-%B", datetuple)

    def volNameToDate(self, volname):
        volname = volname.strip()
        volre = {
            'year':    r'^(?P<year>[0-9]{4,4})$',
            'quarter': r'^(?P<year>[0-9]{4,4})q(?P<quarter>[1234])$',
            'month':   r'^(?P<year>[0-9]{4,4})-(?P<month>[a-zA-Z]+)$',
            'week':    r'^Week-of-Mon-(?P<year>[0-9]{4,4})'
                       r'(?P<month>[01][0-9])(?P<day>[0123][0-9])',
            'day':     r'^(?P<year>[0-9]{4,4})'
                       r'(?P<month>[01][0-9])(?P<day>[0123][0-9])$'}
        for each in volre.keys():
            match = re.match(volre[each], volname)
            if match:
                year = int(match.group('year'))
                month = 1
                day = 1
                if each == 'quarter':
                    q = int(match.group('quarter'))
                    month = (q * 3) - 2
                elif each == 'month':
                    monthstr = match.group('month').lower()
                    m = []
                    for i in range(1, 13):
                        m.append(time.strftime(
                            "%B", (1999, i, 1, 0, 0, 0, 0, 1, 0)).lower())
                    try:
                        month = m.index(monthstr) + 1
                    except ValueError:
                        pass
                elif each == 'week' or each == 'day':
                    month = int(match.group("month"))
                    day = int(match.group("day"))
                return time.mktime((year, month, day, 0, 0, 0, 0, 1, -1))
        return 0.0
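
    # Illustrative round trip (not from the original source), with the
    # default monthly ARCHIVE_PERIOD and a HyperArchive instance `archiver`
    # (both names assumed for the example):
    #
    #     >>> stamp = time.mktime((2001, 6, 15, 0, 0, 0, 0, 1, -1))
    #     >>> archiver.dateToVolName(stamp)
    #     '2001-June'
    #     >>> archiver.volNameToDate('2001-June')  # June 1st, 00:00
    #
    # The inverse is lossy by design: it only recovers the start of the
    # volume, not the original article date.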
    def sortarchives(self):
        def sf(a, b, s=self):
            al = s.volNameToDate(a)
            bl = s.volNameToDate(b)
            if al > bl:
                return 1
            elif al < bl:
                return -1
            else:
                return 0
        if self.ARCHIVE_PERIOD in ('month', 'year', 'quarter'):
            self.archives.sort(sf)
        else:
            self.archives.sort()
        self.archives.reverse()

    def message(self, msg):
        if self.VERBOSE:
            f = sys.stderr
            f.write(msg)
            if msg[-1:] != '\n':
                f.write('\n')
            f.flush()

    def open_new_archive(self, archive, archivedir):
        index_html = os.path.join(archivedir, 'index.html')
        try:
            os.unlink(index_html)
        except:
            pass
        os.symlink(self.DEFAULTINDEX + '.html', index_html)

    def write_index_header(self):
        self.depth = 0
        print self.html_head()
        if not self.THREADLAZY and self.type == 'Thread':
            # Update the threaded index
            self.message("Computing threaded index\n")
            self.updateThreadedIndex()

    def write_index_footer(self):
        for i in range(self.depth):
            print '</UL>'
        print self.html_foot()

    def write_index_entry(self, article):
        if article.charset == self.charset:
            d = article.decoded
            subject = d.get("subject", article.subject)
            author = d.get("author", article.author)
        else:
            subject = CGIescape(article.subject)
            author = CGIescape(article.author)
        print index_entry_template % (urllib.quote(article.filename),
                                      subject, article.sequence, author)

    def write_threadindex_entry(self, article, depth):
        if depth < 0:
            self.message('depth<0')
            depth = 0
        if depth > self.THREADLEVELS:
            depth = self.THREADLEVELS
        if depth < self.depth:
            for i in range(self.depth - depth):
                print '</UL>'
        elif depth > self.depth:
            for i in range(depth - self.depth):
                print '<UL>'
        print '<!--%i %s -->' % (depth, article.threadKey)
        self.depth = depth
        self.write_index_entry(article)

    def write_TOC(self):
        self.sortarchives()
        omask = os.umask(002)
        try:
            toc = open(os.path.join(self.basedir, 'index.html'), 'w')
        finally:
            os.umask(omask)
        toc.write(self.html_TOC())
        toc.close()

    def write_article(self, index, article, path):
        # called by add_article
        omask = os.umask(002)
        try:
            f = open(path, 'w')
        finally:
            os.umask(omask)
        f.write(article.as_html())
        f.close()
        # Write the text article to the text archive.
        path = os.path.join(self.basedir, "%s.txt" % index)
        omask = os.umask(002)
        try:
            f = open(path, 'a+')
        finally:
            os.umask(omask)
        f.write(article.as_text())
        f.close()

    def add_article(self, article):
        self.__super_add_article(article)
        if article.charset:
            cs = article.charset
            self._charsets[cs] = self._charsets.get(cs, 0) + 1

    def choose_charset(self):
        """Pick a charset for the index files

        This method chooses the most frequently occurring charset in the
        individual messages.

        XXX There should be an option to set a default charset.
        """
        if not self._charsets:
            return
        l = map(lambda p: (p[1], p[0]), self._charsets.items())
        l.sort()                        # largest last
        self.charset = l[-1][1]

    def update_dirty_archives(self):
        self.choose_charset()
        self.__super_update_dirty_archives()

    def update_archive(self, archive):
        self.__super_update_archive(archive)
        # Only do this if the gzip module was imported globally, and
        # gzip'ing was enabled via mm_cfg.GZIP_ARCHIVE_TXT_FILES.  See
        # above.
        if gzip:
            archz = None
            archt = None
            txtfile = os.path.join(self.basedir, '%s.txt' % archive)
            gzipfile = os.path.join(self.basedir, '%s.txt.gz' % archive)
            oldgzip = os.path.join(self.basedir, '%s.old.txt.gz' % archive)
            try:
                # open the plain text file
                archt = open(txtfile)
            except IOError:
                return
            try:
                os.rename(gzipfile, oldgzip)
                archz = gzip.open(oldgzip)
            except (IOError, RuntimeError, os.error):
                pass
            try:
                ou = os.umask(002)
                newz = gzip.open(gzipfile, 'w')
            finally:
                # XXX why is this a finally?
                os.umask(ou)
            if archz:
                newz.write(archz.read())
                archz.close()
                os.unlink(oldgzip)
            # XXX do we really need all this in a try/except?
            try:
                newz.write(archt.read())
                newz.close()
                archt.close()
            except IOError:
                pass
            os.unlink(txtfile)

    _skip_attrs = ('maillist', '_lock_file', '_unlocklist', 'charset')

    def getstate(self):
        d = {}
        for each in self.__dict__.keys():
            if not (each in self._skip_attrs or each.upper() == each):
                d[each] = self.__dict__[each]
        return d
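
    # Illustrative example only (not from the original source): how
    # choose_charset() picks the charset seen most often across the
    # archived articles.  Given
    #
    #     self._charsets == {'us-ascii': 3, 'iso-2022-jp': 12}
    #
    # the swapped pairs sort as [(3, 'us-ascii'), (12, 'iso-2022-jp')], so
    # self.charset becomes 'iso-2022-jp' for the generated index pages.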
    # Add <A HREF="..."> tags around URLs and e-mail addresses.

    def __processbody_URLquote(self, lines):
        # XXX a lot to do here:
        # 1. use lines directly, rather than source and dest
        # 2. make it clearer
        # 3. make it faster
        source = lines[:]
        dest = lines
        last_line_was_quoted = 0
        for i in xrange(0, len(source)):
            Lorig = L = source[i]
            prefix = suffix = ""
            if L == None:
                continue
            # Italicise quoted text
            if self.IQUOTES:
                quoted = quotedpat.match(L)
                if quoted == None:
                    last_line_was_quoted = 0
                else:
                    quoted = quoted.end(0)
                    prefix = CGIescape(L[:quoted]) + '<i>'
                    suffix = '</I>'
                    if self.SHOWHTML:
                        suffix = suffix + '<BR>'
                        if not last_line_was_quoted:
                            prefix = '<BR>' + prefix
                    L = L[quoted:]
                    last_line_was_quoted = 1
            # Check for an e-mail address
            L2 = ""
            jr = emailpat.search(L)
            kr = urlpat.search(L)
            while jr != None or kr != None:
                if jr == None:
                    j = -1
                else:
                    j = jr.start(0)
                if kr == None:
                    k = -1
                else:
                    k = kr.start(0)
                if j != -1 and (j < k or k == -1):
                    text = jr.group(1)
                    URL = 'mailto:' + text
                    pos = j
                elif k != -1 and (k < j or j == -1):
                    text = URL = kr.group(1)
                    pos = k
                else:               # j==k
                    raise ValueError, "j==k: This can't happen!"
                length = len(text)
                #self.message("URL: %s %s %s \n"
                #             % (CGIescape(L[:pos]), URL, CGIescape(text)))
                L2 = L2 + ('%s<A HREF="%s">%s</A>'
                           % (CGIescape(L[:pos]), URL, CGIescape(text)))
                L = L[pos+length:]
                jr = emailpat.search(L)
                kr = urlpat.search(L)
            if jr == None and kr == None:
                L = CGIescape(L)
            L = prefix + L2 + L + suffix
            if L != Lorig:
                source[i], dest[i] = None, L
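
    # Illustrative example only (not from the original source): given the
    # unquoted body line
    #
    #     'see http://www.list.org or mail mailman@python.org'
    #
    # __processbody_URLquote() rewrites it to
    #
    #     'see <A HREF="http://www.list.org">http://www.list.org</A> or '
    #     'mail <A HREF="mailto:mailman@python.org">mailman@python.org</A>'
    #
    # with any remaining plain text passed through CGIescape().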
    # Perform Hypermail-style processing of <HTML></HTML> directives
    # in message bodies.  Lines between <HTML> and </HTML> will be written
    # out precisely as they are; other lines will be passed to func2
    # for further processing.

    def __processbody_HTML(self, lines):
        # XXX need to make this method modify in place
        source = lines[:]
        dest = lines
        l = len(source)
        i = 0
        while i < l:
            while i < l and htmlpat.match(source[i]) is None:
                i = i + 1
            if i < l:
                source[i] = None
                i = i + 1
            while i < l and nohtmlpat.match(source[i]) is None:
                dest[i], source[i] = source[i], None
                i = i + 1
            if i < l:
                source[i] = None
                i = i + 1

    def format_article(self, article):
        # Called from add_article.
        # TBD: Why do the HTML formatting here and keep it in the
        # pipermail database?  It makes more sense to do the html
        # formatting as the article is being written as html and toss
        # the data after it has been written to the archive file.
        lines = filter(None, article.body)
        # Handle <HTML> directives
        if self.ALLOWHTML:
            self.__processbody_HTML(lines)
        self.__processbody_URLquote(lines)
        if not self.SHOWHTML and lines:
            lines.insert(0, '<PRE>')
            lines.append('</PRE>')
        else:
            # Do fancy formatting here
            if self.SHOWBR:
                lines = map(lambda x: x + "<BR>", lines)
            else:
                for i in range(0, len(lines)):
                    s = lines[i]
                    if s[0:1] in ' \t\n':
                        lines[i] = '<P>' + s
        article.html_body = lines
        return article

    def update_article(self, arcdir, article, prev, next):
        self.message('Updating HTML for article ' + str(article.sequence))
        try:
            f = open(os.path.join(arcdir, article.filename))
            article.loadbody_fromHTML(f)
            f.close()
        except IOError:
            self.message("article file %s is missing!"
                         % os.path.join(arcdir, article.filename))
        article.prev = prev
        article.next = next
        omask = os.umask(002)
        try:
            f = open(os.path.join(arcdir, article.filename), 'w')
        finally:
            os.umask(omask)
        f.write(article.as_html())
        f.close()
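

# Minimal usage sketch (not part of this module; details vary by Mailman
# version -- see bin/arch in the distribution for the real driver).  The
# names `mlist` and `fp` are assumed here to be a locked MailList object
# and an open Unix mbox file:
#
#     archiver = HyperArchive(mlist)
#     archiver.VERBOSE = 1
#     archiver.processUnixMailbox(fp, Article)
#     archiver.close()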