summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorBarry Warsaw2009-10-19 07:56:20 -0400
committerBarry Warsaw2009-10-19 07:56:20 -0400
commit8b2b3712c3b9df2e7cc63e63799d830b12e12e00 (patch)
tree85d1d06f728c383b21a8f8ab5f2c4db19240a2ac /src
parent1bd649921b181c1f61083a04df0a9224d7164d0d (diff)
downloadmailman-8b2b3712c3b9df2e7cc63e63799d830b12e12e00.tar.gz
mailman-8b2b3712c3b9df2e7cc63e63799d830b12e12e00.tar.zst
mailman-8b2b3712c3b9df2e7cc63e63799d830b12e12e00.zip
Diffstat (limited to 'src')
-rw-r--r--src/mailman/interfaces/mta.py8
-rw-r--r--src/mailman/mta/bulk.py67
-rw-r--r--src/mailman/mta/docs/bulk.txt118
3 files changed, 189 insertions, 4 deletions
diff --git a/src/mailman/interfaces/mta.py b/src/mailman/interfaces/mta.py
index 3dd1793e5..f8dcb64b3 100644
--- a/src/mailman/interfaces/mta.py
+++ b/src/mailman/interfaces/mta.py
@@ -53,7 +53,13 @@ class IMailTransportAgentDelivery(Interface):
Ordinarily the mailing list is consulted for delivery specifics,
however the message metadata dictionary can contain additional
directions to control delivery. Specifics are left to the
- implementation.
+ implementation, but there are a few common keys:
+
+ * envelope_sender - the email address of the RFC 2821 envelope sender;
+ * decorated - a flag indicating whether the message has been decorated
+ with headers and footers yet;
+ * recipients - the set of all recipients who should receive this
+ message, as a set of email addresses;
:param mlist: The mailing list being delivered to.
:type mlist: `IMailingList`
diff --git a/src/mailman/mta/bulk.py b/src/mailman/mta/bulk.py
index 12f94cc23..5c6d8257f 100644
--- a/src/mailman/mta/bulk.py
+++ b/src/mailman/mta/bulk.py
@@ -25,16 +25,83 @@ __all__ = [
]
+from itertools import chain
+
from zope.interface import implements
from mailman.interfaces.mta import IMailTransportAgentDelivery
+# Bucket assignments used when chunking recipients, keyed by top-level
+# domain.  Several domains may share one bucket number.  Bucket zero is never
+# assigned here; it is reserved for every domain that is not listed.  The
+# listed domains were, at one time, the most common ones.
+CHUNKMAP = dict(
+    com=1,
+    net=2, org=2,
+    edu=3, us=3, ca=3,
+    )
+
+
class BulkDelivery:
     """Deliver messages to the MTA in as few sessions as possible."""
     implements(IMailTransportAgentDelivery)
+    def __init__(self, max_recipients):
+        """Create a bulk deliverer.
+
+        :param max_recipients: The maximum number of recipients per delivery
+            chunk.  Zero or less means to group all recipients into one
+            chunk.
+        :type max_recipients: integer
+        """
+        self._max_recipients = max_recipients
+
     def deliver(self, mlist, msg, msgdata):
         """See `IMailTransportAgentDelivery`."""
+
+    def chunkify(self, recipients):
+        """Split a set of recipients into chunks.
+
+        The `max_recipients` argument given to the constructor specifies the
+        maximum number of recipients in each chunk.  Addresses are first
+        grouped into the top-level domain buckets defined by `CHUNKMAP`
+        (anything unlisted lands in bucket zero).  The buckets are then
+        emptied from largest to smallest, and a chunk never mixes addresses
+        from two different buckets.
+
+        This is a generator: chunks are produced lazily, so wrap the call in
+        `list()` when an actual list is needed.
+
+        :param recipients: The recipient email addresses.
+        :type recipients: iterable of email address strings
+        :return: An iterator over chunks, where each chunk is a set
+            containing no more than `max_recipients` addresses.  A chunk can
+            contain fewer, and no packing is guaranteed.  When
+            `max_recipients` is zero or less, exactly one chunk holding every
+            recipient is produced, even if `recipients` is empty.
+        :rtype: generator of sets of strings
+        """
+        if self._max_recipients <= 0:
+            yield set(recipients)
+            return
+        # This algorithm was originally suggested by Chuq Von Rospach.  Start
+        # by splitting the recipient addresses into top-level domain buckets,
+        # using the "most common" domains.  Everything else ends up in the
+        # zeroth bucket.
+        by_bucket = {}
+        for address in recipients:
+            domain = address.partition('@')[2]
+            # Domain names are case-insensitive, so normalize the top-level
+            # domain before the lookup; otherwise 'EXAMPLE.COM' would fall
+            # into the catch-all bucket.
+            tld = domain.rpartition('.')[2].lower()
+            by_bucket.setdefault(CHUNKMAP.get(tld, 0), set()).add(address)
+        # Empty the buckets from the largest to the smallest.
+        for bucket in sorted(by_bucket.values(), key=len, reverse=True):
+            chunk = set()
+            while bucket:
+                chunk.add(bucket.pop())
+                if len(chunk) == self._max_recipients:
+                    yield chunk
+                    chunk = set()
+            # Flush the partially filled remainder so that the next bucket
+            # starts a fresh chunk.  Nothing can be left over once the last
+            # bucket has been processed, so no flush is needed after the loop.
+            if chunk:
+                yield chunk
diff --git a/src/mailman/mta/docs/bulk.txt b/src/mailman/mta/docs/bulk.txt
index 4f564173d..0943b276f 100644
--- a/src/mailman/mta/docs/bulk.txt
+++ b/src/mailman/mta/docs/bulk.txt
@@ -14,11 +14,123 @@ cannot be personalized. See `verp.txt`_ for an alternative strategy.
>>> from mailman.mta.bulk import BulkDelivery
-Delivery strategies must implement the proper interface.
+The standard bulk deliverer takes as an argument the maximum number of
+recipients per session.  Passing zero (or a negative number) delivers the
+message in one chunk, containing all recipients.
+
+ >>> bulk = BulkDelivery(0)
- >>> bulk = BulkDelivery()
+Delivery strategies must implement the proper interface.
>>> from mailman.interfaces.mta import IMailTransportAgentDelivery
>>> from zope.interface.verify import verifyObject
- >>> verifyObject(IMailTransportAgentDelivery, bulk)
+ >>> verifyObject(IMailTransportAgentDelivery, bulk)
+ True
+
+
+Chunking recipients
+===================
+
+The set of final recipients is contained in the 'recipients' key in the
+message metadata. When `max_recipients` is specified as zero, then the bulk
+deliverer puts all recipients into one big chunk.
+
+ >>> from string import ascii_letters
+ >>> recipients = set(letter + 'person@example.com'
+ ... for letter in ascii_letters)
+
+ >>> chunks = list(bulk.chunkify(recipients))
+ >>> len(chunks)
+ 1
+ >>> len(chunks[0])
+ 52
+
+Let's say the maximum number of recipients allowed is 4; then no chunk will
+have more than 4 recipients, though they can have fewer (but still not zero).
+
+ >>> bulk = BulkDelivery(4)
+ >>> chunks = list(bulk.chunkify(recipients))
+ >>> len(chunks)
+ 13
+ >>> all(0 < len(chunk) <= 4 for chunk in chunks)
True
+
+The chunking algorithm groups recipients into buckets by top-level domain and
+then fills chunks one bucket at a time, from the largest bucket to the smallest.
+
+ >>> recipients = set([
+ ... 'anne@example.com',
+ ... 'bart@example.org',
+ ... 'cate@example.net',
+ ... 'dave@example.com',
+ ... 'elle@example.org',
+ ... 'fred@example.net',
+ ... 'gwen@example.com',
+ ... 'herb@example.us',
+ ... 'ione@example.net',
+ ... 'john@example.com',
+ ... 'kate@example.com',
+ ... 'liam@example.ca',
+ ... 'mary@example.us',
+ ... 'neil@example.net',
+ ... 'ocho@example.org',
+ ... 'paco@example.xx',
+ ... 'quaq@example.zz',
+ ... ])
+
+ >>> bulk = BulkDelivery(4)
+ >>> chunks = list(bulk.chunkify(recipients))
+ >>> len(chunks)
+ 6
+
+We can't make any guarantees about sorting within each chunk, but we can tell
+a few things. For example, the first two chunks will be composed of .net (4)
+and .org (3) domains (for a total of 7).
+
+ >>> len(chunks[0])
+ 4
+ >>> len(chunks[1])
+ 3
+
+ >>> for address in sorted(chunks[0].union(chunks[1])):
+ ... print address
+ bart@example.org
+ cate@example.net
+ elle@example.org
+ fred@example.net
+ ione@example.net
+ neil@example.net
+ ocho@example.org
+
+We also know that the next two chunks will contain .com (5) addresses.
+
+ >>> len(chunks[2])
+ 4
+ >>> len(chunks[3])
+ 1
+
+ >>> for address in sorted(chunks[2].union(chunks[3])):
+ ... print address
+ anne@example.com
+ dave@example.com
+ gwen@example.com
+ john@example.com
+ kate@example.com
+
+The next chunk will contain the .us (2) and .ca (1) domains.
+
+ >>> len(chunks[4])
+ 3
+ >>> for address in sorted(chunks[4]):
+ ... print address
+ herb@example.us
+ liam@example.ca
+ mary@example.us
+
+The final chunk will contain the outliers, .xx (1) and .zz (1).
+
+ >>> len(chunks[5])
+ 2
+ >>> for address in sorted(chunks[5]):
+ ... print address
+ paco@example.xx
+ quaq@example.zz