diff options
| -rw-r--r-- | src/mailman/interfaces/mta.py | 8 | ||||
| -rw-r--r-- | src/mailman/mta/bulk.py | 67 | ||||
| -rw-r--r-- | src/mailman/mta/docs/bulk.txt | 118 |
3 files changed, 189 insertions, 4 deletions
diff --git a/src/mailman/interfaces/mta.py b/src/mailman/interfaces/mta.py index 3dd1793e5..f8dcb64b3 100644 --- a/src/mailman/interfaces/mta.py +++ b/src/mailman/interfaces/mta.py @@ -53,7 +53,13 @@ class IMailTransportAgentDelivery(Interface): Ordinarily the mailing list is consulted for delivery specifics, however the message metadata dictionary can contain additional directions to control delivery. Specifics are left to the - implementation. + implementation, but there are a few common keys: + + * envelope_sender - the email address of the RFC 2821 envelope sender; + * decorated - a flag indicating whether the message has been decorated + with headers and footers yet; + * recipients - the set of all recipients who should receive this + message, as a set of email addresses; :param mlist: The mailing list being delivered to. :type mlist: `IMailingList` diff --git a/src/mailman/mta/bulk.py b/src/mailman/mta/bulk.py index 12f94cc23..5c6d8257f 100644 --- a/src/mailman/mta/bulk.py +++ b/src/mailman/mta/bulk.py @@ -25,16 +25,83 @@ __all__ = [ ] +from itertools import chain + from zope.interface import implements from mailman.interfaces.mta import IMailTransportAgentDelivery +# A mapping of top-level domains to bucket numbers. The zeroth bucket is +# reserved for everything else. At one time, these were the most common +# domains. +CHUNKMAP = dict( + com=1, + net=2, + org=2, + edu=3, + us=3, + ca=3, + ) + + class BulkDelivery: """Deliver messages to the MTA in as few sessions as possible.""" implements(IMailTransportAgentDelivery) + def __init__(self, max_recipients): + """Create a bulk deliverer. + + :param max_recipients: The maximum number of recipients per delivery + chunk. Zero or less means to group all recipients into one + chunk. 
+ :type max_recipients: integer + """ + self._max_recipients = max_recipients + def deliver(self, mlist, msg, msgdata): """See `IMailTransportAgentDelivery`.""" + + def chunkify(self, recipients): + """Split a set of recipients into chunks. + + The `max_recipients` argument given to the constructor specifies the + maximum number of recipients in each chunk. + + :param recipients: The set of recipient email addresses + :type recipients: sequence of email address strings + :return: A list of chunks, where each chunk is a set containing no + more than `max_recipients` number of addresses. The chunk can + contain fewer, and no packing is guaranteed. + :rtype: list of sets of strings + """ + if self._max_recipients <= 0: + yield set(recipients) + return + # This algorithm was originally suggested by Chuq Von Rospach. Start + # by splitting the recipient addresses into top-level domain buckets, + # using the "most common" domains. Everything else ends up in the + # zeroth bucket. + by_bucket = {} + for address in recipients: + localpart, at, domain = address.partition('@') + domain_parts = domain.split('.') + bucket_number = CHUNKMAP.get(domain_parts[-1], 0) + by_bucket.setdefault(bucket_number, set()).add(address) + # Fill chunks by sorting the tld values by length. + chunk = set() + for tld_chunk in sorted(by_bucket.values(), key=len, reverse=True): + while tld_chunk: + chunk.add(tld_chunk.pop()) + if len(chunk) == self._max_recipients: + yield chunk + chunk = set() + # Every tld bucket starts a new chunk, but only if non-empty + if len(chunk) > 0: + yield chunk + chunk = set() + # Be sure to include the last chunk, but only if it's non-empty. + if len(chunk) > 0: + yield chunk diff --git a/src/mailman/mta/docs/bulk.txt b/src/mailman/mta/docs/bulk.txt index 4f564173d..0943b276f 100644 --- a/src/mailman/mta/docs/bulk.txt +++ b/src/mailman/mta/docs/bulk.txt @@ -14,11 +14,123 @@ cannot be personalized. See `verp.txt`_ for an alternative strategy. 
>>> from mailman.mta.bulk import BulkDelivery -Delivery strategies must implement the proper interface. +The standard bulk deliverer takes as an argument the maximum number of +recipients per session. The default is to deliver the message in one chunk, +containing all recipients. + + >>> bulk = BulkDelivery(0) - >>> bulk = BulkDelivery() +Delivery strategies must implement the proper interface. >>> from mailman.interfaces.mta import IMailTransportAgentDelivery >>> from zope.interface.verify import verifyObject - >>> verifyObject(IMailTransportAgentDelivery, bulk) + >>> verifyObject(IMailTransportAgentDelivery, bulk) + True + + +Chunking recipients +=================== + +The set of final recipients is contained in the 'recipients' key in the +message metadata. When `max_recipients` is specified as zero, then the bulk +deliverer puts all recipients into one big chunk. + + >>> from string import ascii_letters + >>> recipients = set(letter + 'person@example.com' + ... for letter in ascii_letters) + + >>> chunks = list(bulk.chunkify(recipients)) + >>> len(chunks) + 1 + >>> len(chunks[0]) + 52 + +Let's say the maximum number of recipients allowed is 4, then no chunk will have +more than 4 recipients, though they can have fewer (but still not zero). + + >>> bulk = BulkDelivery(4) + >>> chunks = list(bulk.chunkify(recipients)) + >>> len(chunks) + 13 + >>> all(0 < len(chunk) <= 4 for chunk in chunks) True + +The chunking algorithm buckets recipients by top-level domain, largest first. + + >>> recipients = set([ + ... 'anne@example.com', + ... 'bart@example.org', + ... 'cate@example.net', + ... 'dave@example.com', + ... 'elle@example.org', + ... 'fred@example.net', + ... 'gwen@example.com', + ... 'herb@example.us', + ... 'ione@example.net', + ... 'john@example.com', + ... 'kate@example.com', + ... 'liam@example.ca', + ... 'mary@example.us', + ... 'neil@example.net', + ... 'ocho@example.org', + ... 'paco@example.xx', + ... 'quaq@example.zz', + ... 
]) + + >>> bulk = BulkDelivery(4) + >>> chunks = list(bulk.chunkify(recipients)) + >>> len(chunks) + 6 + +We can't make any guarantees about sorting within each chunk, but we can tell +a few things. For example, the first two chunks will be composed of .net (4) +and .org (3) domains (for a total of 7). + + >>> len(chunks[0]) + 4 + >>> len(chunks[1]) + 3 + + >>> for address in sorted(chunks[0].union(chunks[1])): + ... print address + bart@example.org + cate@example.net + elle@example.org + fred@example.net + ione@example.net + neil@example.net + ocho@example.org + +We also know that the next two chunks will contain .com (5) addresses. + + >>> len(chunks[2]) + 4 + >>> len(chunks[3]) + 1 + + >>> for address in sorted(chunks[2].union(chunks[3])): + ... print address + anne@example.com + dave@example.com + gwen@example.com + john@example.com + kate@example.com + +The next chunk will contain the .us (2) and .ca (1) domains. + + >>> len(chunks[4]) + 3 + >>> for address in sorted(chunks[4]): + ... print address + herb@example.us + liam@example.ca + mary@example.us + +The final chunk will contain the outliers, .xx (1) and .zz (1). + + >>> len(chunks[5]) + 2 + >>> for address in sorted(chunks[5]): + ... print address + paco@example.xx + quaq@example.zz |
