1 files changed, 58 insertions, 0 deletions
diff --git a/Mailman/Utils.py b/Mailman/Utils.py
index 0001c612b..12141c2c8 100644
--- a/Mailman/Utils.py
+++ b/Mailman/Utils.py
@@ -31,6 +31,7 @@ import sha
 import errno
 import time
 import cgi
+import htmlentitydefs
 import email.Iterators
 from string import whitespace, digits
 try:
@@ -694,3 +695,60 @@ def percent_identifiers(s):
     for name in cre.findall(s):
         d[name] = 1
     return d
+
+
+
+# Utilities to canonicalize a string, which means un-HTML-ifying the string to
+# produce a Unicode string or an 8-bit string if all the characters are ASCII.
+def canonstr(s, lang=None):
+    newparts = []
+    parts = re.split(r'&(?P<ref>[^;]+);', s)
+    while 1:
+        newparts.append(parts.pop(0))
+        if not parts:
+            break
+        ref = parts.pop(0)
+        if ref.startswith('#'):
+            newparts.append(chr(int(ref[1:])))
+        else:
+            c = htmlentitydefs.entitydefs.get(ref, '?')
+            if c.startswith('#') and c.endswith(';'):
+                newparts.append(chr(ref[1:-1]))
+            else:
+                newparts.append(c)
+    newstr = EMPTYSTRING.join(newparts)
+    # We want the default fallback to be iso-8859-1 even if the language is
+    # English (us-ascii).  This seems like a practical compromise so that
+    # non-ASCII characters in names can be used in English lists w/o having to
+    # change the global charset for English from us-ascii (which I
+    # superstitiously think my have unintended consequences).
+    if lang is None:
+        charset = 'iso-8859-1'
+    else:
+        charset = GetCharSet(lang)
+        if charset == 'us-ascii':
+            charset = 'iso-8859-1'
+    return unicode(newstr, charset, 'replace')
+
+
+# The opposite of canonstr() -- sorta.  I.e. it attempts to encode s in the
+# charset of the given language, and failing that, replaces non-ASCII
+# characters with their html references.
+def uncanonstr(s, lang=None):
+    if s is None:
+        s = ''
+    if lang is None:
+        charset = 'us-ascii'
+    else:
+        charset = GetCharSet(lang)
+    try:
+        return s.encode(charset, 'strict')
+    except UnicodeError:
+        a = []
+        for c in s:
+            o = ord(c)
+            if o > 127:
+                a.append('&#%3d;' % o)
+            else:
+                a.append(c)
+        return EMPTYSTRING.join(a)