summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorbwarsaw2002-10-04 20:45:53 +0000
committerbwarsaw2002-10-04 20:45:53 +0000
commit9447ff68aa82ea696844e3a73f5ae79373dd5a9d (patch)
tree4d1f09040a2bfa3901a894ac61683548539f44c2
parentb1f0a5b8f5108fea2466ed04a4de9286b05198b9 (diff)
downloadmailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.gz
mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.tar.zst
mailman-9447ff68aa82ea696844e3a73f5ae79373dd5a9d.zip
uncanonstr(): We always want this to return a byte string, but it
needs to be "safe" for the given charset, which is both the charset for the list's language and the charset the page will be rendered in. If it's a Unicode string and can be encoded in the desired charset, fine, just return the encoded byte string. If it's already a byte string encoded in the charset, again that's fine, just return the string. Otherwise, html-ify the string and return it as a byte string.
-rw-r--r--Mailman/Utils.py22
1 files changed, 17 insertions, 5 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index e5be24c0c..267f85f71 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -721,7 +721,11 @@ def canonstr(s, lang=None):
break
ref = parts.pop(0)
if ref.startswith('#'):
- appchr(int(ref[1:]))
+ try:
+ appchr(int(ref[1:]))
+ except ValueError:
+ # Non-convertable, stick with what we got
+ newparts.append('&'+ref+';')
else:
c = htmlentitydefs.entitydefs.get(ref, '?')
if c.startswith('#') and c.endswith(';'):
@@ -746,8 +750,9 @@ def canonstr(s, lang=None):
# The opposite of canonstr() -- sorta. I.e. it attempts to encode s in the
-# charset of the given language, and failing that, replaces non-ASCII
-# characters with their html references.
+# charset of the given language, which is the character set that the page will
+# be rendered in, and failing that, replaces non-ASCII characters with their
+# html references. It always returns a byte string.
def uncanonstr(s, lang=None):
if s is None:
s = u''
@@ -755,10 +760,17 @@ def uncanonstr(s, lang=None):
charset = 'us-ascii'
else:
charset = GetCharSet(lang)
- # BAW should change this to a type types of s
+ # See if the string contains characters only in the desired character
+ # set. If so, return it unchanged, except for coercing it to a byte
+ # string.
try:
- return s.encode(charset, 'strict')
+ if isinstance(s, UnicodeType):
+ return s.encode(charset)
+ else:
+ u = unicode(s, charset)
+ return s
except UnicodeError:
+ # Nope, it contains funny characters, so html-ref it
a = []
for c in s:
o = ord(c)