src/mailman/utilities/string.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140

# Copyright (C) 2009-2011 by the Free Software Foundation, Inc.
#
# This file is part of GNU Mailman.
#
# GNU Mailman is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# GNU Mailman is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Mailman.  If not, see <http://www.gnu.org/licenses/>.

"""String utilities."""

from __future__ import absolute_import, unicode_literals

__metaclass__ = type
__all__ = [
    'expand',
    'oneline',
    'uncanonstr',
    'websafe',
    ]


import cgi
import logging

from email.errors import HeaderParseError
from email.header import decode_header, make_header
from string import Template
from zope.component import getUtility

from mailman.interfaces.languages import ILanguageManager


EMPTYSTRING = ''
UEMPTYSTRING = u''

log = logging.getLogger('mailman.error')


def expand(template, substitutions, template_class=Template):
    """Expand string template with substitutions.

    :param template: A PEP 292 $-string template.
    :type template: string
    :param substitutions: The substitutions dictionary.
    :type substitutions: dict
    :param template_class: The template class to use.
    :type template_class: class
    :return: The substituted string.
    :rtype: string
    """
    # Python 2.6 requires ** dictionaries to have str, not unicode keys, so
    # convert as necessary.  Note that string.Template uses **.  For our
    # purposes, keys should always be ascii.  Values though can be anything.
    cooked = substitutions.__class__()
    for key in substitutions:
        if isinstance(key, unicode):
            key = key.encode('ascii')
        cooked[key] = substitutions[key]
    try:
        return template_class(template).safe_substitute(cooked)
    except (TypeError, ValueError):
        # The template is really screwed up.
        log.exception('broken template: %s', template)


def oneline(s, cset='us-ascii', in_unicode=False):
    """Decode a header string in one line and convert into specified charset.

    :param s: The header string
    :type s: string
    :param cset: The character set (encoding) to use.
    :type cset: string
    :param in_unicode: Flag specifying whether to return the converted string
        as a unicode (True) or an 8-bit string (False, the default).
    :type in_unicode: bool
    :return: The decoded header string.  If an error occurs while converting
        the input string, return the string undecoded, as an 8-bit string.
    :rtype: string
    """
    try:
        h = make_header(decode_header(s))
        ustr = h.__unicode__()
        line = UEMPTYSTRING.join(ustr.splitlines())
        if in_unicode:
            return line
        else:
            return line.encode(cset, 'replace')
    except (LookupError, UnicodeError, ValueError, HeaderParseError):
        # possibly charset problem. return with undecoded string in one line.
        return EMPTYSTRING.join(s.splitlines())


def websafe(s):
    return cgi.escape(s, quote=True)


# The opposite of canonstr() -- sorta.  I.e. it attempts to encode s in the
# charset of the given language, which is the character set that the page will
# be rendered in, and failing that, replaces non-ASCII characters with their
# html references.  It always returns a byte string.
def uncanonstr(s, lang=None):
    if s is None:
        s = u''
    if lang is None:
        charset = 'us-ascii'
    else:
        charset = getUtility(ILanguageManager)[lang].charset
    # See if the string contains characters only in the desired character
    # set.  If so, return it unchanged, except for coercing it to a byte
    # string.
    try:
        if isinstance(s, unicode):
            return s.encode(charset)
        else:
            unicode(s, charset)
            return s
    except UnicodeError:
        # Nope, it contains funny characters, so html-ref it
        a = []
        for c in s:
            o = ord(c)
            if o > 127:
                a.append('&#%3d;' % o)
            else:
                a.append(c)
        # Join characters together and coerce to byte string
        return str(EMPTYSTRING.join(a))