2 files changed, 91 insertions, 364 deletions
diff --git a/Mailman/bin/mailmanctl.py b/Mailman/bin/mailmanctl.py
index 2dc1905da..4dd7bd587 100644
--- a/Mailman/bin/mailmanctl.py
+++ b/Mailman/bin/mailmanctl.py
@@ -15,48 +15,36 @@
 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
 # USA.
 
+"""Mailman start/stop script."""
+
+from __future__ import with_statement
+
 import os
 import grp
 import pwd
 import sys
 import errno
 import signal
-import socket
 import logging
-import optparse
 
-from datetime import timedelta
-from munepy import Enum
-from locknix import lockfile
+from optparse import OptionParser
 
-from Mailman import Defaults
 from Mailman import Version
-from Mailman import loginit
 from Mailman.configuration import config
 from Mailman.i18n import _
 from Mailman.initialize import initialize
 
 
 COMMASPACE = ', '
-DOT = '.'
-# Calculate this here and now, because we're going to do a chdir later on, and
-# if the path is relative, the qrunner script won't be found.
-BIN_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))
 
-# Since we wake up once per day and refresh the lock, the LOCK_LIFETIME
-# needn't be (much) longer than SNOOZE.  We pad it 6 hours just to be safe.
-LOCK_LIFETIME = Defaults.days(1) + Defaults.hours(6)
-SNOOZE = Defaults.days(1)
-
-elog = None
-qlog = None
-opts = None
+log = None
+parser = None
 
 
 
 def parseargs():
-    parser = optparse.OptionParser(version=Version.MAILMAN_VERSION,
-                                   usage=_("""\
+    parser = OptionParser(version=Version.MAILMAN_VERSION,
+                          usage=_("""\
 Primary start-up and shutdown script for Mailman's qrunner daemon.
 
 This script starts, stops, and restarts the main Mailman queue runners, making
@@ -94,7 +82,7 @@ Commands:
 
 Usage: %prog [options] [ start | stop | restart | reopen ]"""))
     parser.add_option('-u', '--run-as-user',
-                      dest='checkprivs', default=True, action='store_false',
+                      default=True, action='store_false',
                       help=_("""\
 Normally, this script will refuse to run if the user id and group id are not
 set to the `mailman' user and group (as defined when you configured Mailman).
@@ -109,15 +97,15 @@ for normal production environments.
 Note though, that if you run with -u and are not in the mailman group, you may
 have permission problems, such as begin unable to delete a list's archives
 through the web.  Tough luck!"""))
-    parser.add_option('-s', '--stale-lock-cleanup',
-                      dest='force', default=False, action='store_true',
+    parser.add_option('-f', '--force',
+                      default=False, action='store_true',
                       help=_("""\
-If mailmanctl finds an existing master lock, it will normally exit with an
-error message.  With this option, mailmanctl will perform an extra level of
-checking.  If a process matching the host/pid described in the lock file is
-running, mailmanctl will still exit, but if no matching process is found,
-mailmanctl will remove the apparently stale lock and make another attempt to
-claim the master lock."""))
+If the master watcher finds an existing master lock, it will normally exit
+with an error message.  With this option, the master will perform an extra
+level of checking.  If a process matching the host/pid described in the lock
+file is running, the master will still exit, requiring you to manually clean
+up the lock.  But if no matching process is found, the master will remove the
+apparently stale lock and make another attempt to claim the master lock."""))
     parser.add_option('-q', '--quiet',
                       default=False, action='store_true',
                       help=_("""\
@@ -125,36 +113,31 @@ Don't print status messages.  Error messages are still printed to standard
 error."""))
     parser.add_option('-C', '--config',
                       help=_('Alternative configuration file to use'))
-    opts, args = parser.parse_args()
-    if not args:
-        parser.print_help()
-        print >> sys.stderr, _('No command given.')
-        sys.exit(1)
-    if len(args) > 1:
-        parser.print_help()
-        commands = COMMASPACE.join(args)
-        print >> sys.stderr, _('Bad command: $commands')
-        sys.exit(1)
-    return parser, opts, args
+    options, arguments = parser.parse_args()
+    if not arguments:
+        parser.error(_('No command given.'))
+    if len(arguments) > 1:
+        commands = COMMASPACE.join(arguments)
+        parser.error(_('Bad command: $commands'))
+    parser.options = options
+    parser.arguments = arguments
+    return parser
 
 
 
 def kill_watcher(sig):
     try:
-        fp = open(config.PIDFILE)
-        pidstr = fp.read()
-        fp.close()
-        pid = int(pidstr.strip())
+        with open(config.PIDFILE) as f:
+            pid = int(f.read().strip())
     except (IOError, ValueError), e:
         # For i18n convenience
-        pidfile = config.PIDFILE
-        print >> sys.stderr, _('PID unreadable in: $pidfile')
+        print >> sys.stderr, _('PID unreadable in: $config.PIDFILE')
         print >> sys.stderr, e
         print >> sys.stderr, _('Is qrunner even running?')
         return
     try:
         os.kill(pid, sig)
     except OSError, e:
         if e.errno <> errno.ESRCH:
             raise
         print >> sys.stderr, _('No child with pid: $pid')
@@ -164,130 +147,17 @@ def kill_watcher(sig):
 
 
 
-def get_lock_data():
-    # Return the hostname, pid, and tempfile
-    fp = open(config.LOCK_FILE)
-    try:
-        filename = os.path.split(fp.read().strip())[1]
-    finally:
-        fp.close()
-    parts = filename.split('.')
-    hostname = DOT.join(parts[1:-1])
-    pid = int(parts[-1])
-    return hostname, int(pid), filename
-
-
-def qrunner_state():
-    # 1 if proc exists on host (but is it qrunner? ;)
-    # 0 if host matches but no proc
-    # hostname if hostname doesn't match
-    hostname, pid, tempfile = get_lock_data()
-    if hostname <> socket.gethostname():
-        return hostname
-    # Find out if the process exists by calling kill with a signal 0.
-    try:
-        os.kill(pid, 0)
-    except OSError, e:
-        if e.errno <> errno.ESRCH:
-            raise
-        return 0
-    return 1
-
-
-def acquire_lock_1(force):
-    # Be sure we can acquire the master qrunner lock.  If not, it means some
-    # other master qrunner daemon is already going.
-    lock = lockfile.Lock(config.LOCK_FILE, LOCK_LIFETIME)
-    try:
-        lock.lock(timedelta(seconds=0.1))
-        return lock
-    except lockfile.TimeOutError:
-        if not force:
-            raise
-        # Force removal of lock first
-        lock.disown()
-        hostname, pid, tempfile = get_lock_data()
-        os.unlink(config.LOCK_FILE)
-        os.unlink(os.path.join(config.LOCK_DIR, tempfile))
-        return acquire_lock_1(force=False)
-
-
-def acquire_lock(force):
-    try:
-        lock = acquire_lock_1(force)
-        return lock
-    except lockfile.TimeOutError:
-        status = qrunner_state()
-        if status == 1:
-            # host matches and proc exists
-            print >> sys.stderr, _("""\
-The master qrunner lock could not be acquired because it appears as if another
-master qrunner is already running.
-""")
-        elif status == 0:
-            # host matches but no proc
-            print >> sys.stderr, _("""\
-The master qrunner lock could not be acquired.  It appears as though there is
-a stale master qrunner lock.  Try re-running mailmanctl with the -s flag.
-""")
-        else:
-            # host doesn't even match
-            print >> sys.stderr, _("""\
-The master qrunner lock could not be acquired, because it appears as if some
-process on some other host may have acquired it.  We can't test for stale
-locks across host boundaries, so you'll have to do this manually.  Or, if you
-know the lock is stale, re-run mailmanctl with the -s flag.
-
-Lock file: $config.LOCK_FILE
-Lock host: $status
-
-Exiting.""")
-
-
-
-def start_runner(qrname, slice, count):
-    pid = os.fork()
-    if pid:
-        # parent
-        return pid
-    # child
-    #
-    # Craft the command line arguments for the exec() call.
-    rswitch = '--runner=%s:%d:%d' % (qrname, slice, count)
-    # Wherever mailmanctl lives, so too must live the qrunner script.
-    exe = os.path.join(BIN_DIR, 'qrunner')
-    # config.PYTHON, which is the absolute path to the Python interpreter,
-    # must be given as argv[0] due to Python's library search algorithm.
-    args = [sys.executable, sys.executable, exe, rswitch, '-s']
-    if opts.config:
-        args.extend(['-C', opts.config])
-    os.execl(*args)
-    # Should never get here
-    raise RuntimeError('os.execl() failed')
-
-
-def start_all_runners():
-    kids = {}
-    for qrname, count in config.qrunners.items():
-        for slice in range(count):
-            # queue runner name, slice, numslices, restart count
-            info = (qrname, slice, count, 0)
-            pid = start_runner(qrname, slice, count)
-            kids[pid] = info
-    return kids
-
-
-
-def check_privs(parser):
+def check_privileges():
     # If we're running as root (uid == 0), coerce the uid and gid to that
     # which Mailman was configured for, and refuse to run if we didn't coerce
     # the uid/gid.
-    gid = grp.getgrnam(config.MAILMAN_GROUP)[2]
-    uid = pwd.getpwnam(config.MAILMAN_USER)[2]
+    gid = grp.getgrnam(config.MAILMAN_GROUP).gr_gid
+    uid = pwd.getpwnam(config.MAILMAN_USER).pw_uid
     myuid = os.getuid()
     if myuid == 0:
         # Set the process's supplimental groups.
-        groups = [x[2] for x in grp.getgrall() if config.MAILMAN_USER in x[3]]
+        groups = [group.gr_gid for group in grp.getgrall()
+                  if config.MAILMAN_USER in group.gr_mem]
         groups.append(gid)
         os.setgroups(groups)
         os.setgid(gid)
@@ -300,208 +170,63 @@ def check_privs(parser):
 
 
 def main():
-    global elog, qlog, opts
+    global log, parser
 
-    parser, opts, args = parseargs()
-    initialize(opts.config)
+    parser = parseargs()
+    initialize(parser.options.config)
 
-    elog = logging.getLogger('mailman.error')
-    qlog = logging.getLogger('mailman.qrunner')
+    log = logging.getLogger('mailman.qrunner')
 
-    if opts.checkprivs:
-        check_privs(parser)
+    if parser.options.run_as_user:  # True unless -u was given.
+        check_privileges()
     else:
-        print _('Warning!  You may encounter permission problems.')
+        if not parser.options.quiet:
+            print _('Warning!  You may encounter permission problems.')
 
     # Handle the commands
-    command = args[0].lower()
+    command = parser.arguments[0].lower()
     if command == 'stop':
-        if not opts.quiet:
+        if not parser.options.quiet:
             print _("Shutting down Mailman's master qrunner")
         kill_watcher(signal.SIGTERM)
     elif command == 'restart':
-        if not opts.quiet:
+        if not parser.options.quiet:
             print _("Restarting Mailman's master qrunner")
         kill_watcher(signal.SIGUSR1)
     elif command == 'reopen':
-        if not opts.quiet:
+        if not parser.options.quiet:
             print _('Re-opening all log files')
         kill_watcher(signal.SIGHUP)
     elif command == 'start':
-        # Here's the scoop on the processes we're about to create.  We'll need
-        # one for each qrunner, and one for a master child process watcher /
-        # lock refresher process.
-        #
-        # The child watcher process simply waits on the pids of the children
-        # qrunners.  Unless explicitly disabled by a mailmanctl switch (or the
-        # children are killed with SIGTERM instead of SIGINT), the watcher
-        # will automatically restart any child process that exits.  This
-        # allows us to be more robust, and also to implement restart by simply
-        # SIGINT'ing the qrunner children, and letting the watcher restart
-        # them.
+        # Start the master qrunner watcher process.
         #
-        # Under normal operation, we have a child per queue.  This lets us get
-        # the most out of the available resources, since a qrunner with no
-        # files in its queue directory is pretty cheap, but having a separate
-        # runner process per queue allows for a very responsive system.  Some
-        # people want a more traditional (i.e. MM2.0.x) cron-invoked qrunner.
-        # No problem, but using mailmanctl isn't the answer.  So while
-        # mailmanctl hard codes some things, others, such as the number of
-        # qrunners per queue, are configurable.
-        #
-        # First, acquire the master mailmanctl lock
-        lock = acquire_lock(opts.force)
-        if not lock:
-            return
         # Daemon process startup according to Stevens, Advanced Programming in
         # the UNIX Environment, Chapter 13.
         pid = os.fork()
         if pid:
             # parent
-            if not opts.quiet:
+            if not parser.options.quiet:
                 print _("Starting Mailman's master qrunner.")
-            # Give up the lock "ownership".  This just means the foreground
-            # process won't close/unlock the lock when it finalizes this lock
-            # instance.  We'll let the mater watcher subproc own the lock.
-            lock.transfer_to(pid)
             return
         # child
-        lock.take_possession()
-        # Save our pid in a file for "mailmanctl stop" rendezvous.
-        fp = open(config.PIDFILE, 'w')
-        try:
-            print >> fp, os.getpid()
-        finally:
-            fp.close()
+        #
         # Create a new session and become the session leader, but since we
         # won't be opening any terminal devices, don't do the ultra-paranoid
         # suggestion of doing a second fork after the setsid() call.
         os.setsid()
         # Instead of cd'ing to root, cd to the Mailman runtime directory.
         os.chdir(config.VAR_DIR)
-        # I don't think we have any unneeded file descriptors.
-        #
-        # Now start all the qrunners.  This returns a dictionary where the
-        # keys are qrunner pids and the values are tuples of the following
-        # form: (qrname, slice, count).  This does its own fork and exec, and
-        # sets up its own signal handlers.
-        kids = start_all_runners()
-        # Set up a SIGALRM handler to refresh the lock once per day.  The lock
-        # lifetime is 1day+6hours so this should be plenty.
-        def sigalrm_handler(signum, frame):
-            lock.refresh()
-            signal.alarm(Defaults.days(1))
-        signal.signal(signal.SIGALRM, sigalrm_handler)
-        signal.alarm(int(Defaults.days(1)))
-        # Set up a SIGHUP handler so that if we get one, we'll pass it along
-        # to all the qrunner children.  This will tell them to close and
-        # reopen their log files
-        def sighup_handler(signum, frame):
-            loginit.reopen()
-            for pid in kids.keys():
-                os.kill(pid, signal.SIGHUP)
-            # And just to tweak things...
-            qlog.info('Master watcher caught SIGHUP.  Re-opening log files.')
-        signal.signal(signal.SIGHUP, sighup_handler)
-        # We also need to install a SIGTERM handler because that's what init
-        # will kill this process with when changing run levels.  It's also the
-        # signal 'mailmanctl stop' uses.
-        def sigterm_handler(signum, frame):
-            # Make sure we never try to restart our children, no matter why
-            # the child exited.
-            opts.restart = False
-            qlog.info('I AM NEVER RESTARTING AGAIN: %d', pid)
-            for pid in kids.keys():
-                try:
-                    os.kill(pid, signal.SIGTERM)
-                except OSError, e:
-                    if e.errno <> errno.ESRCH:
-                        raise
-            qlog.info('Master watcher caught SIGTERM.  Exiting.')
-        signal.signal(signal.SIGTERM, sigterm_handler)
-        # Finally, we need a SIGINT handler which will cause the sub-qrunners
-        # to exit, but the master will restart SIGINT'd sub-processes unless
-        # the -n flag was given.
-        def sigint_handler(signum, frame):
-            for pid in kids.keys():
-                os.kill(pid, signal.SIGINT)
-            qlog.info('Master watcher caught SIGINT.  Restarting.')
-        signal.signal(signal.SIGINT, sigint_handler)
-        # Now we're ready to simply do our wait/restart loop.  This is the
-        # master qrunner watcher.
-        try:
-            while True:
-                try:
-                    pid, status = os.wait()
-                except OSError, e:
-                    # No children?  We're done
-                    if e.errno == errno.ECHILD:
-                        break
-                    # If the system call got interrupted, just restart it.
-                    elif e.errno <> errno.EINTR:
-                        raise
-                    continue
-                killsig = exitstatus = None
-                if os.WIFSIGNALED(status):
-                    killsig = os.WTERMSIG(status)
-                if os.WIFEXITED(status):
-                    exitstatus = os.WEXITSTATUS(status)
-                # We'll restart the process unless we were given the
-                # "no-restart" switch, or if the process was SIGTERM'd or
-                # exitted with a SIGTERM exit status.  This lets us better
-                # handle runaway restarts (say, if the subproc had a syntax
-                # error!)
-                restarting = ''
-                if opts.restart:
-                    if ((exitstatus is None and killsig <> signal.SIGTERM) or
-                        (killsig is None and exitstatus <> signal.SIGTERM)):
-                        # Then
-                        restarting = '[restarting]'
-                qrname, slice, count, restarts = kids[pid]
-                del kids[pid]
-                qlog.info("""\
-Master qrunner detected subprocess exit
-(pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""",
-                       pid, killsig, exitstatus, qrname,
-                       slice+1, count, restarting)
-                # See if we've reached the maximum number of allowable restarts
-                if exitstatus <> signal.SIGINT:
-                    restarts += 1
-                if restarts > config.MAX_RESTARTS:
-                    qlog.info("""\
-Qrunner %s reached maximum restart limit of %d, not restarting.""",
-                           qrname, config.MAX_RESTARTS)
-                    restarting = ''
-                # Now perhaps restart the process unless it exited with a
-                # SIGTERM or we aren't restarting.
-                if restarting:
-                    newpid = start_runner(qrname, slice, count)
-                    kids[newpid] = (qrname, slice, count, restarts)
-        finally:
-            # Should we leave the main loop for any reason, we want to be sure
-            # all of our children are exited cleanly.  Send SIGTERMs to all
-            # the child processes and wait for them all to exit.
-            for pid in kids.keys():
-                try:
-                    os.kill(pid, signal.SIGTERM)
-                except OSError, e:
-                    if e.errno == errno.ESRCH:
-                        # The child has already exited
-                        qlog.info('ESRCH on pid: %d', pid)
-                        del kids[pid]
-            # Wait for all the children to go away
-            while True:
-                try:
-                    pid, status = os.wait()
-                except OSError, e:
-                    if e.errno == errno.ECHILD:
-                        break
-                    elif e.errno <> errno.EINTR:
-                        raise
-                    continue
-        # Finally, give up the lock
-        lock.unlock(unconditionally=True)
-        os._exit(0)
+        # Exec the master watcher.
+        args = [sys.executable, sys.executable,
+                os.path.join(config.BIN_DIR, 'master')]
+        if parser.options.force:
+            args.append('--force')
+        if parser.options.config:
+            args.extend(['-C', parser.options.config])
+        log.debug('starting: %s', args)
+        os.execl(*args)
+        # We should never get here.
+        raise RuntimeError('os.execl() failed')
 
 
 
diff --git a/Mailman/bin/master.py b/Mailman/bin/master.py
index 6e7c5408d..e2a80934f 100644
--- a/Mailman/bin/master.py
+++ b/Mailman/bin/master.py
@@ -18,8 +18,6 @@
 from __future__ import with_statement
 
 import os
-import grp
-import pwd
 import sys
 import errno
 import signal
@@ -39,16 +37,8 @@ from Mailman.i18n import _
 from Mailman.initialize import initialize
 
 
-COMMASPACE = ', '
 DOT = '.'
-# Calculate this here and now, because we're going to do a chdir later on, and
-# if the path is relative, the qrunner script won't be found.
-BIN_DIR = os.path.abspath(os.path.dirname(sys.argv[0]))
-
-# Since we wake up once per day and refresh the lock, the LOCK_LIFETIME
-# needn't be (much) longer than SNOOZE.  We pad it 6 hours just to be safe.
 LOCK_LIFETIME = Defaults.days(1) + Defaults.hours(6)
-SNOOZE = Defaults.days(1)
 
 log = None
 parser = None
@@ -84,6 +74,15 @@ Usage: %prog [options]"""))
                       help=_("""\
 Don't restart the qrunners when they exit because of an error or a SIGUSR1.
 Use this only for debugging."""))
+    parser.add_option('-f', '--force',
+                      default=False, action='store_true',
+                      help=_("""\
+If the master watcher finds an existing master lock, it will normally exit
+with an error message.  With this option, the master will perform an extra
+level of checking.  If a process matching the host/pid described in the lock
+file is running, the master will still exit, requiring you to manually clean
+up the lock.  But if no matching process is found, the master will remove the
+apparently stale lock and make another attempt to claim the master lock."""))
     parser.add_option('-C', '--config',
                       help=_('Alternative configuration file to use'))
     options, arguments = parser.parse_args()
@@ -104,8 +103,8 @@ def get_lock_data():
     with open(config.LOCK_FILE) as fp:
         filename = os.path.split(fp.read().strip())[1]
     parts = filename.split('.')
-    hostname = DOT.join(parts[1:-1])
-    pid = int(parts[-1])
+    hostname = DOT.join(parts[1:-2])
+    pid = int(parts[-2])
     return hostname, int(pid), filename
 
 
@@ -164,28 +163,27 @@ def acquire_lock_1(force):
         return acquire_lock_1(force=False)
 
 
-def acquire_lock(force):
+def acquire_lock():
     """Acquire the master queue runner lock.
 
-    :param force: Flag that controls whether to force acquisition of the lock.
-    :return: The master queue runner lock or None if the lock couldn't be
-        acquired.  In that case, an error messages is also printed to standard
-        error.
+    :return: The master queue runner lock.  If the lock can't be acquired,
+        the reason is passed to parser.error(), which prints it to standard
+        error and exits.
     """
     try:
-        lock = acquire_lock_1(force)
+        lock = acquire_lock_1(parser.options.force)
         return lock
     except lockfile.TimeOutError:
         status = master_state()
         if status == WatcherState.conflict:
             # Hostname matches and process exists.
-            print >> sys.stderr, _("""\
-The master qrunner lock could not be acquired because it appears as if another
-master qrunner is already running.
+            message = _("""\
+The master qrunner lock could not be acquired because it appears
+as though another master qrunner is already running.
 """)
         elif status == WatcherState.stale_lock:
             # Hostname matches but the process does not exist.
-            print >> sys.stderr, _("""\
+            message = _("""\
 The master qrunner lock could not be acquired.  It appears as though there is
-a stale master qrunner lock.  Try re-running mailmanctl with the -s flag.
+a stale master qrunner lock.  Try re-running mailmanctl with the -f flag.
 """)
@@ -193,17 +191,17 @@ a stale master qrunner lock.  Try re-running mailmanctl with the -s flag.
             assert status == WatcherState.host_mismatch, (
                 'Invalid enum value: %s' % status)
             # Hostname doesn't even match.
-            print >> sys.stderr, _("""\
+            hostname, pid, tempfile = get_lock_data()
+            message = _("""\
 The master qrunner lock could not be acquired, because it appears as if some
 process on some other host may have acquired it.  We can't test for stale
-locks across host boundaries, so you'll have to do this manually.  Or, if you
-know the lock is stale, re-run mailmanctl with the -s flag.
+locks across host boundaries, so you'll have to clean this up manually.
 
 Lock file: $config.LOCK_FILE
-Lock host: $status
+Lock host: $hostname
 
 Exiting.""")
-        return None
+        parser.error(message)
 
 
 
@@ -226,7 +224,7 @@ def start_runner(qrname, slice, count):
     # Craft the command line arguments for the exec() call.
     rswitch = '--runner=%s:%d:%d' % (qrname, slice, count)
     # Wherever mailmanctl lives, so too must live the qrunner script.
-    exe = os.path.join(BIN_DIR, 'qrunner')
+    exe = os.path.join(config.BIN_DIR, 'qrunner')
     # config.PYTHON, which is the absolute path to the Python interpreter,
     # must be given as argv[0] due to Python's library search algorithm.
     args = [sys.executable, sys.executable, exe, rswitch, '-s']
@@ -375,14 +373,18 @@ def main():
     log = logging.getLogger('mailman.qrunner')
 
     # Acquire the master lock, exiting if we can't acquire it.  We'll let the
-    # caller handle any clean up or lock breaking.
-    with lockfile.Lock(config.LOCK_FILE, LOCK_LIFETIME) as lock:
+    # caller handle any clean up or lock breaking.  No with statement here
+    # because Lock's constructor doesn't support a timeout.
+    lock = acquire_lock()
+    try:
         with open(config.PIDFILE, 'w') as fp:
             print >> fp, os.getpid()
         try:
             control_loop(lock)
         finally:
             os.remove(config.PIDFILE)
+    finally:
+        lock.unlock()