# Mailman/EncWord.py

# Copyright (C) 1998,1999,2000 by the Free Software Foundation, Inc.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software 
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

"""Decode encoded-words as defined by RFC 2047"""

import sys
import string
import base64


class DecodeError(ValueError):
    """Raised when a string cannot be parsed as an encoded-word.

    The message handed to the constructor is prefixed with
    'invalid encoded-word: ' before being stored on the exception.
    """

    def __init__(self, msg):
        # Build the full message first, then hand it to the base class.
        full_message = 'invalid encoded-word: %s' % msg
        ValueError.__init__(self, full_message)


class Decoder:
    """Decode mail header encoded-word format defined by RFC 2047.

    An encoded-word has the form "=?charset?encoding?encoded-text?=".  The
    `offset' attribute records how far into the input the parser has
    advanced.  Each _get_*() helper reads it and moves it forward, so the
    helpers must be called in the order charset, encoding, text.
    """

    # Index of the next unparsed character of the string given to decode().
    offset = 0

    def decode(self, s):
        """Decode an encoded-word.

        Returns the charset of the encoded-word, the decoded text, and the
        position of the first character following the encoded-word.

        The first position of the input string must be the first character of
        the encoded-word.

        Raises DecodeError if the string does not start with "=?", if any of
        the "?" delimiters or the closing "?=" is missing, if the encoding is
        neither Q nor B, or if the encoded text itself is malformed.
        """
        if s[:2] != '=?':
            raise DecodeError('must start with "=?", not %s' % repr(s[:2]))
        charset = self._get_charset(s)
        encoding = self._get_encoding(s)
        _text = self._get_text(s)
        # encoding must be either 'q' or 'b', ensured by _get_encoding()
        if encoding == 'q':
            text = self._decode_q(_text)
        else:
            text = self._decode_b(_text)
        return charset, text, self.offset

    # TBD: Technically the charset and encoding can't contain SPACE, CTLs, or
    # especials; do not currently check this.

    def _get_charset(self, s):
        """Return the charset field and advance offset past its "?"."""
        # Skip the leading "=?"; the charset runs up to the next "?".
        i = s.find('?', 2)
        if i == -1:
            raise DecodeError("can't find end of charset")
        self.offset = i + 1
        return s[2:i]

    _valid_encodings = ('q', 'b')

    def _get_encoding(self, s):
        """Return the lower-cased encoding field ('q' or 'b').

        Advances offset past the "?" that terminates the field.  Raises
        DecodeError if the field is unterminated or is not a known encoding.
        """
        i = s.find('?', self.offset)
        if i < 0:
            raise DecodeError("can't find encoding")
        enc = s[self.offset:i].lower()
        self.offset = i + 1
        if enc not in Decoder._valid_encodings:
            raise DecodeError('not a valid encoding: %s' % enc)
        return enc

    def _get_text(self, s):
        """Return the still-encoded text and advance offset past "?="."""
        i = s.find('?=', self.offset)
        if i < 0:
            raise DecodeError("can't find end of encoded text")
        text = s[self.offset:i]
        self.offset = i + 2
        return text

    SPACE = chr(0x20)

    def _decode_q(self, s):
        """Q encoding defined by RFC 2047.

        "_" stands for a space and "=XX" for the character whose code is the
        two hex digits XX; everything else is copied through unchanged.

        Raises DecodeError if "=" is not followed by exactly two hex digits.
        """
        chunks = []
        offset = 0
        end = len(s)
        while offset < end:
            i = s.find('=', offset)
            j = s.find('_', offset)
            if i < 0 and j < 0:
                # No more special characters; the rest is literal text.
                chunks.append(s[offset:])
                break
            if i < 0 or 0 <= j < i:
                # The underscore comes first: literal run, then a space.
                chunks.append(s[offset:j])
                chunks.append(Decoder.SPACE)
                offset = j + 1
            else:
                # The escape comes first: literal run, then the hex pair.
                chunks.append(s[offset:i])
                hexdig = s[i+1:i+3]
                # Check both characters explicitly.  A plain base-16
                # conversion would also accept a truncated, signed or
                # space-padded pair and decode it to the wrong character.
                if (len(hexdig) != 2
                        or hexdig[0] not in string.hexdigits
                        or hexdig[1] not in string.hexdigits):
                    raise DecodeError('bad hex escape in Q text: %s'
                                      % repr('=' + hexdig))
                chunks.append(chr(int(hexdig, 16)))
                offset = i + 3
        return ''.join(chunks)

    def _decode_b(self, s):
        """B encoding == base64 encoding defined by RFC 2045.

        Raises DecodeError if the text is not valid base64.
        """
        # Use binascii directly so that its error can be caught here and
        # reported as a DecodeError like every other parse failure.
        import binascii
        try:
            return binascii.a2b_base64(s)
        except (binascii.Error, ValueError):
            raise DecodeError('bad base64 text: %s' % repr(s))


def decode(s):
    """Decode a string containing encoded words.

    Every encoded-word ("=?charset?encoding?text?=") found in `s' is replaced
    by its decoded text; everything between encoded-words is copied through
    unchanged, including whitespace.

    Returns a tuple (text, charset).  charset is the charset named by the
    first encoded-word, spelled as it appeared there, or None if `s'
    contained no encoded-words.

    Raises ValueError if the encoded-words name different charsets, and
    DecodeError (a ValueError subclass) if an encoded-word is malformed.
    """
    _decode = Decoder().decode

    chunks = []
    offset = 0
    charset = None
    while 1:
        i = s.find('=?', offset)
        if i < 0:
            # No further encoded-words; keep the trailing plain text.
            chunks.append(s[offset:])
            break
        chunks.append(s[offset:i])
        # The decoder reports the end position relative to the slice it was
        # given, so shift it back into the coordinates of `s'.
        _charset, text, consumed = _decode(s[i:])
        offset = i + consumed
        if charset is None:
            charset = _charset
        elif charset.lower() != _charset.lower():
            # Charset names are case-independent (RFC 2047, section 2), so
            # only a genuinely different charset is an error.
            raise ValueError("can't decode string with multiple charsets")
        chunks.append(text)
    return ''.join(chunks), charset


def test():
    examples = [
        # valid
        '=?US-ASCII?Q?Keith_Moore?= <moore@cs.utk.edu>',
        '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
        '=?ISO-8859-1?Q?Andr=E9_?= Pirard <PIRARD@vm1.ulg.ac.be>',
        '=?ISO-8859-1?B?SWYgeW91IGNhbiByZWFkIHRoaXMgeW8=?=',
        '=?ISO-8859-2?B?dSB1bmRlcnN0YW5kIHRoZSBleGFtcGxlLg==?=',
        '=?US-ASCII?Q?.._cool!?=',
        '=?ISO-8859-1?Q?Olle_J=E4rnefors?= <ojarnef@admin.kth.se>',
        '(=?iso-8859-8?b?7eXs+SDv4SDp7Oj08A==?=)',
        # invalid
        'abc',
        '=?abc',
        '=?abc?abc',
        '=?ISO-8859-1?abc?text',
        ]
    for s in examples:
        try:
            text, charset = decode(s)
        except ValueError, msg:
            print "error:", msg
        else:
            print text, charset


# Run the sample decodings in test() when this file is executed as a script.
if __name__ == "__main__":
    test()