summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Mailman/Utils.py22
1 files changed, 17 insertions, 5 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index e5be24c0c..267f85f71 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -721,7 +721,11 @@ def canonstr(s, lang=None):
break
ref = parts.pop(0)
if ref.startswith('#'):
- appchr(int(ref[1:]))
+ try:
+ appchr(int(ref[1:]))
+ except ValueError:
+ # Non-convertible, stick with what we got
+ newparts.append('&'+ref+';')
else:
c = htmlentitydefs.entitydefs.get(ref, '?')
if c.startswith('#') and c.endswith(';'):
@@ -746,8 +750,9 @@ def canonstr(s, lang=None):
# The opposite of canonstr() -- sorta. I.e. it attempts to encode s in the
-# charset of the given language, and failing that, replaces non-ASCII
-# characters with their html references.
+# charset of the given language, which is the character set that the page will
+# be rendered in, and failing that, replaces non-ASCII characters with their
+# html references. It always returns a byte string.
def uncanonstr(s, lang=None):
if s is None:
s = u''
@@ -755,10 +760,17 @@ def uncanonstr(s, lang=None):
charset = 'us-ascii'
else:
charset = GetCharSet(lang)
- # BAW should change this to a type types of s
+ # See if the string contains characters only in the desired character
+ # set. If so, return it unchanged, except for coercing it to a byte
+ # string.
try:
- return s.encode(charset, 'strict')
+ if isinstance(s, UnicodeType):
+ return s.encode(charset)
+ else:
+ u = unicode(s, charset)
+ return s
except UnicodeError:
+ # Nope, it contains funny characters, so html-ref it
a = []
for c in s:
o = ord(c)