diff options
| author | bwarsaw | 1999-11-24 19:37:53 +0000 |
|---|---|---|
| committer | bwarsaw | 1999-11-24 19:37:53 +0000 |
| commit | 94ac9f8ed9bb490b4e86a8562dbf98ff61067c9c (patch) | |
| tree | ab21a9bb8d2b03cdaf565db05a254613070cdf8b | |
| parent | 2aac6c761efcaa26912ecd2ae18b21ba027bddf5 (diff) | |
| download | mailman-94ac9f8ed9bb490b4e86a8562dbf98ff61067c9c.tar.gz mailman-94ac9f8ed9bb490b4e86a8562dbf98ff61067c9c.tar.zst mailman-94ac9f8ed9bb490b4e86a8562dbf98ff61067c9c.zip | |
A serious rewrite to make it more robust, and to add a somewhat useful
command line interface. This latter allows you to manually poll a
specific list's newsgroup for a range of message numbers, sending them
to the list. Using this interface does /not/ update the watermark
file.
We also don't update the watermark entry for a specific list until
that list's child process (which does the actual gating of the
messages from Usenet to the mailing list) exits with a zero status. If it
exits with a non-zero status, it is taken as a complete failure of the gating
process and the watermark isn't updated. This could cause
duplicates, so a really anal approach would be to create a pipe
between child and parent and write the new watermark after each
message is successfully gated.
| -rwxr-xr-x | cron/gate_news | 285 |
1 files changed, 200 insertions, 85 deletions
diff --git a/cron/gate_news b/cron/gate_news index bb8e3c79a..b259c954e 100755 --- a/cron/gate_news +++ b/cron/gate_news @@ -17,8 +17,29 @@ # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. """Poll the NNTP servers for messages to be gatewayed to mailing lists. + +Usage is either + + gate_news + +or + + gate_news listname first last + +In the former case, the watermark file is used to find the first message to +post, and all messages up to the last available on the newsgroup are gated. +The watermark file is then updated for the next run. This is the way the +script should be run from cron. + +In the latter case, only the specified mailing list's newsgroup is polled, and +only the message from first to last (inclusive) are gated to the mailing +list. In this case, the watermark file is /not/ updated. + +Either form can be run from the command line. + """ +import sys import os import string import time @@ -26,11 +47,14 @@ import marshal import paths import nntplib import errno +import traceback from Mailman import MailList from Mailman import mm_cfg from Mailman import Utils from Mailman import LockFile +from Mailman import Message +from Mailman.Logging.Utils import LogStdErr # Work around known problems with some RedHat cron daemons @@ -41,21 +65,46 @@ signal.signal(signal.SIGCHLD, signal.SIG_DFL) WATERMARK_FILE = os.path.join(mm_cfg.DATA_DIR, 'gate_watermarks') LIST_LOCK_FILE = os.path.join(mm_cfg.LOCK_DIR, 'gate_lock.') +LogStdErr('fromusenet', 'gate_news') + def main(): - names = Utils.list_names() - try: - fp = open(WATERMARK_FILE) - watermarks = marshal.load(fp) - fp.close() - except IOError, (code, msg): - if code <> errno.ENOENT: - Utils.reraise() - watermarks = {} - # marshal or open could raise other exceptions, namely EOFError, - # ValueError or TypeError. TBD: should we zap the watermarks file if that - # happens? 
+ # check command line options + names = None + updatewatermarks = 1 + if len(sys.argv) == 1: + names = Utils.list_names() + first = -1 + last = -1 + elif len(sys.argv) == 4: + names = [sys.argv[1]] + try: + first = int(sys.argv[2]) + last = int(sys.argv[3]) + except ValueError: + names = None + updatewatermarks = 0 + if not names: + print __doc__ + sys.exit(1) + # try to open the watermarks file + if updatewatermarks: + try: + fp = open(WATERMARK_FILE) + watermarks = marshal.load(fp) + fp.close() + # TBD: marshal or open could raise other exceptions, namely EOFError, + # ValueError or TypeError. Should we zap the watermarks file if that + # happens? + except IOError, (code, msg): + if code <> errno.ENOENT: + raise + watermarks = {} + # We need to reap the child processes, which will tell us the last article + # number actually gated + children = {} + # cruise through lists for name in names: # check to see if the list is gating news to mail. If not, skip the # list. If so, then we have to poll the newsgroup and gate any @@ -63,71 +112,108 @@ def main(): mlist = MailList.MailList(name, lock=0) if not mlist.gateway_to_mail: continue + # open up a connection to the gated newsgroup. we want to get the + # watermark for the group in the parent process so that we can safely + # update the gate_watermarks file. we'll actually do the gating in a + # child process + conn = nntplib.NNTP(mlist.nntp_host) + r,c,f,l,n = conn.group(mlist.linked_newsgroup) + if not updatewatermarks: + # just post the specified messages and be done with it + poll_newsgroup(mlist, conn, first, last+1) + return + # Otherwise, let's figure out what needs to be done + first = int(f) + last = int(l) + wm = watermarks.get(name, 0) + if wm == 0: + # This is the first time we've tried to gate this newsgroup. We + # essentially do a mass catch-up, otherwise we'd flood the mailing + # list. If you want to post all earlier messages, do this + # manually using the comand line interface. 
+ watermarks[name] = last + continue # try to get a per-list lock because it makes no sense to have more # than one process gating a newsgroup. if we can't get the lock, just # ignore the list for now... 5 minutes (usually how cron invokes this) - # later we'll try again anyway. We don't need to be anal about giving - # up the lock because we're setting a hung_time out of 4 minutes. - # This means that if we crashed, the next time the cron job runs, - # it'll just wax the lock and try again. + # later we'll try again anyway. lock = LockFile.LockFile(LIST_LOCK_FILE + name, lifetime=240) try: - lock.lock(timeout=0.001) + lock.lock(timeout=0.5) except LockFile.TimeOutError: + # TBD: It is possible that some other process has laid claim to + # the gate lock for this list, but that said process has exited + # uncleanly. If that's the case, and it leaves it's lock claim on + # disk, we will never be able to gate from usenet to the list + # again, until the stale lock is removed. For now, we just log + # this potentially deadlocked situation, but this should really be + # fixed (probably in LockFile.py though). + sys.stderr.write('Could not acquire gate_news lock for %s\n' % + name) # someone else is gating this list already continue - # open up a connection to the gated newsgroup. we want to get the - # watermark for the group in the parent process so that we can safely - # update the gate_watermarks file. we'll actually do the gating in a - # child process - conn = nntplib.NNTP(mlist.nntp_host) - r,c,first,last,n = conn.group(mlist.linked_newsgroup) - first = int(first) - last = int(last) - wm = watermarks.get(name, 0) - watermarks[name] = last - if wm <> 0: - # TBD: Essentially this does a mass catch-up on the newsgroup. - # The first time this script is run, no messages will be will be - # forwarded. We *could* have an option to control this, but who - # wants that? ;-) - if not os.fork(): - # in the child. 
- # - # steal the lock from the parent because we're going to manage - # it from here on, and we have a different PID than our - # parent. we't want to minimize any race conditions where - # someone else can steal the lock from us. I think there's - # still a race condition during the time we've actually got - # the file open for writing and when we're done writing it - # (during the steal()), but that should be very small. + # Fork a child to do the actual gating. + # + # TBD: There are several issues here, revolving around finding out + # from the child exactly which messages were successfully gated. + # Let's say the child is going to gate messages 125-175. If we were + # really anal, we'd open a pipe and let the child tell us the last + # message it successfully gatewayed. Can't use an exit status here + # because message numbers can easily be > 255. But managing all those + # child pipes means pipes and selects, which is probably overkill. + # + # Instead what we do is just get the exit status of the child. If the + # child completes successfully, we assume it gated all the requested + # messages. If it exits with a non-zero status, we assume it gated + # none of them. This is probably good enough, although some + # duplicates are theoretically possible. + pid = os.fork() + if pid: + # in the parent. record the pid of the child, the child's list + # name, and last message number. when the child successfully + # exits, we'll update it's watermark + children[pid] = (name, last) + else: + # in the child. + # + # Steal the list's gateway lock from the parent because we're + # going to manage it from here on, and we have a different PID + # than our parent. We want to minimize any race conditions where + # someone else can steal the lock from us. I think there's still + # a race condition during the time we've actually got the file + # open for writing and when we're done writing it (during the + # steal()), but that should be very small. 
+ try: lock.steal() - poll_newsgroup(mlist, conn, wm, first, last) - lock.unlock() + poll_newsgroup(mlist, conn, max(wm+1, first), last+1) + try: + lock.unlock() + except LockFile.NotLockedError: + # I think it's okay to ignore these specific exceptions + pass os._exit(0) - # Save the new watermarks after every newsgroup gating has - # started, so in case of a system crash we reduce the number of - # multiply gated messages. it might be better to save after every - # post, but that is harder to coordinate safely between the - # subprocesses, and would probably be *much* slower - omask = os.umask(002) - try: - fp = open(WATERMARK_FILE + '.tmp', 'w') - marshal.dump(watermarks, fp) - fp.close() - os.rename(WATERMARK_FILE + '.tmp', WATERMARK_FILE) - finally: - os.umask(omask) + except: + # if anything else bad happens, log the exception to stderr. + # TBD: we should probably generalize scripts/driver to handle + # this situation + traceback.print_exc() + os._exit(1) + # wait on at least one child + reap(children, watermarks) + # we're done forking off all the gating children, now just wait for them + # all to exit, and then we're done + while children: + reap(children, watermarks) # XXX: Bogus, but might as we do it `legally' QuickEscape = 'QuickEscape' -def poll_newsgroup(mlist, conn, wm, first, last): +def poll_newsgroup(mlist, conn, first, last): # NEWNEWS is not portable and has synchronization issues... Use a # watermark system instead. - for num in range(max(wm+1, first), last+1): + for num in range(first, last): try: headers = conn.head(`num`)[3] found_to = 0 @@ -141,42 +227,71 @@ def poll_newsgroup(mlist, conn, wm, first, last): if header[i:] == ': %s' % mlist.GetListEmail(): raise QuickEscape body = conn.body(`num`)[3] - # Create the pipe to the Mail posting script. Note that it is not - # installed executable, so we'll tack on the path to Python we - # discovered when we configured Mailman. 
The extra argument to - # `post' informs the system that the message is originating from - # Usenet and so should not get posted back to Usenet. I think - # this is mostly redundant with the X-BeenThere header, but I'm a - # little afraid to muck with that. - # - # TBD: This should just be injected directly into the message - # delivery pipeline. - cmd = '%s %s %s fromusenet' % ( - mm_cfg.PYTHON, - os.path.join(mm_cfg.SCRIPTS_DIR, 'post'), - mlist.internal_name()) - file = os.popen(cmd, 'w') # Usenet originated messages will not have a Unix envelope # (i.e. "From " header). This breaks Pipermail archiving, so # we will synthesize one. Be sure to use the format searched # for by mailbox.UnixMailbox._isrealfromline() timehdr = time.asctime(time.localtime(time.time())) - envhdr = 'From ' + mlist.GetAdminEmail() + ' ' + timehdr - file.write(envhdr + '\n') - file.write(string.join(headers,'\n')) - # If there wasn't already a TO: header, add one. + lines = ['From ' + mlist.GetAdminEmail() + ' ' + timehdr] + lines.extend(headers) + lines.append('') + lines.extend(body) + lines.append('') + msg = Message.OutgoingMessage(string.join(lines, '\n')) + msg.fromusenet = 1 if not found_to: - file.write("\nTo: %s" % mlist.GetListEmail()) - file.write('\n\n') - file.write(string.join(body,'\n')) - file.write('\n') - file.close() - except nntplib.error_temp: - pass # Probably canceled, etc... + msg['To'] = mlist.GetListEmail() + # the list must be locked during posting + lockflag = mlist.Locked() + try: + try: + mlist.Lock() + except Locked.AlreadyLockedError: + pass + mlist.Post(msg) + finally: + mlist.Save() + if not lockflag: + mlist.Unlock() + sys.stderr.write('%s: gated msg id %d\n' % + (mlist.internal_name(), num)) + except nntplib.error_temp, msg: + sys.stderr.write('%s: NNTP error: %s\n' % + (mlist.internal_name(), msg)) + pass # Probably canceled, etc... except QuickEscape: pass # We gated this TO news, don't repost it! 
+def reap(children, watermarks): + if not children: + return + # see if any children have exited yet + pid, status = os.waitpid(-1, os.WNOHANG) + if pid == 0: + # nope, none are ready + return + name, last = children[pid] + del children[pid] + if not status: + # successful exit + watermarks[name] = last + # Save the new watermarks after every newsgroup gating has started, so in + # case of a system crash we reduce the number of multiply gated messages. + # it might be better to save after every post, but that is harder to + # coordinate safely between the subprocesses, and would probably be *much* + # slower. + omask = os.umask(002) + try: + fp = open(WATERMARK_FILE + '.tmp', 'w') + marshal.dump(watermarks, fp) + fp.close() + os.rename(WATERMARK_FILE + '.tmp', WATERMARK_FILE) + finally: + os.umask(omask) + + + if __name__ == '__main__': main() |
