# Copyright (C) 1998,1999,2000 by the Free Software Foundation, Inc. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """Decode encoded-words as defined by RFC 2047""" import sys import string import base64 class DecodeError(ValueError): __super_init = ValueError.__init__ def __init__(self, msg): self.__super_init('invalid encoded-word: %s' % msg) class Decoder: """Decode mail header encoded-word format defined by RFC 2047""" offset = 0 def decode(self, s): """Decode an encoded-word. Returns the charset of the encoded-word, the decoded text, and the position of the first character following the encoded-word. The first position of the input string must by the first character of the encoded-word. """ if s[:2] <> '=?': raise DecodeError('must start with "=?", not %s' % repr(s[:2])) charset = self._get_charset(s) encoding = self._get_encoding(s) _text = self._get_text(s) # encoding must be either 'q' or 'b', ensured by _get_encoding() if encoding == 'q': text = self._decode_q(_text) else: text = self._decode_b(_text) return charset, text, self.offset # TBD: Technically the charset and encoding can't contain SPACE, CTLs, or # especials; do not currently check this. def _get_charset(self, s): i = string.find(s, '?', 2) if i == -1: raise DecodeError("can't find end of charset") self.offset = i + 1 return s[2:i] _valid_encodings = ('q', 'b') def _get_encoding(self, s): i = string.find(s, '?', self.offset) if i < 0: raise DecodeError("can't find encoding") enc = string.lower(s[self.offset:i]) self.offset = i + 1 if enc not in Decoder._valid_encodings: raise DecodeError('not a valid encoding: %s' % enc) return enc def _get_text(self, s): i = string.find(s, '?=', self.offset) if i < 0: raise DecodeError("can't find end of encoded text") text = s[self.offset:i] self.offset = i + 2 return text SPACE = chr(0x20) def _decode_q(self, s): """Q encoding defined by RFC 2047""" chunks = [] offset = 0 end = len(s) while offset < end: i = string.find(s, '=', offset) j = string.find(s, '_', offset) if i < 0 and j < 0: chunks.append(s[offset:]) break if (j < i and j >= 0) or i < 0: chunks.append(s[offset:j]) chunks.append(Decoder.SPACE) offset = j + 1 else: chunks.append(s[offset:i]) hexdig = s[i+1:i+3] chunks.append(chr(string.atoi(hexdig, 16))) offset = i + 3 return string.join(chunks, '') def _decode_b(self, s): """B encoding == base64 encoding defined by RFC 2045""" return base64.decodestring(s) def decode(s): """Decode a string containing encoded words""" _decode = Decoder().decode chunks = [] offset = 0 charset = None while 1: i = string.find(s, '=?', offset) if i < 0: chunks.append(s[offset:]) break chunks.append(s[offset:i]) _charset, text, offset = _decode(s[i:]) offset = offset + i if charset is None: charset = _charset elif charset <> _charset: raise ValueError("can't decode string with multiple charsets") chunks.append(text) return string.join(chunks, ''), charset def test(): examples = [ # valid '=?US-ASCII?Q?Keith_Moore?= ', '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ', '=?ISO-8859-1?Q?Andr=E9_?= Pirard ', '=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=', '=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=', '=?US-ASCII?Q?.._cool!?=', '=?ISO-8859-1?Q?Olle_J=E4rnefors?= ', '(=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=)', # invalid 'abc', '=?abc', '=?abc?abc', '=?ISO-8859-1?abc?text', ] for s in examples: try: text, charset = decode(s) except ValueError, msg: print "error:", msg else: print text, charset if __name__ == "__main__": test()