diff options
Diffstat (limited to 'Mailman/bin')
| -rw-r--r-- | Mailman/bin/mailmanctl.py | 399 | ||||
| -rw-r--r-- | Mailman/bin/master.py | 56 |
2 files changed, 91 insertions, 364 deletions
diff --git a/Mailman/bin/mailmanctl.py b/Mailman/bin/mailmanctl.py index 2dc1905da..4dd7bd587 100644 --- a/Mailman/bin/mailmanctl.py +++ b/Mailman/bin/mailmanctl.py @@ -15,48 +15,36 @@ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, # USA. +"""Mailman start/stop script.""" + +from __future__ import with_statement + import os import grp import pwd import sys import errno import signal -import socket import logging -import optparse -from datetime import timedelta -from munepy import Enum -from locknix import lockfile +from optparse import OptionParser -from Mailman import Defaults from Mailman import Version -from Mailman import loginit from Mailman.configuration import config from Mailman.i18n import _ from Mailman.initialize import initialize COMMASPACE = ', ' -DOT = '.' -# Calculate this here and now, because we're going to do a chdir later on, and -# if the path is relative, the qrunner script won't be found. -BIN_DIR = os.path.abspath(os.path.dirname(sys.argv[0])) -# Since we wake up once per day and refresh the lock, the LOCK_LIFETIME -# needn't be (much) longer than SNOOZE. We pad it 6 hours just to be safe. -LOCK_LIFETIME = Defaults.days(1) + Defaults.hours(6) -SNOOZE = Defaults.days(1) - -elog = None -qlog = None -opts = None +log = None +parser = None def parseargs(): - parser = optparse.OptionParser(version=Version.MAILMAN_VERSION, - usage=_("""\ + parser = OptionParser(version=Version.MAILMAN_VERSION, + usage=_("""\ Primary start-up and shutdown script for Mailman's qrunner daemon. This script starts, stops, and restarts the main Mailman queue runners, making @@ -94,7 +82,7 @@ Commands: Usage: %prog [options] [ start | stop | restart | reopen ]""")) parser.add_option('-u', '--run-as-user', - dest='checkprivs', default=True, action='store_false', + default=True, action='store_false', help=_("""\ Normally, this script will refuse to run if the user id and group id are not set to the `mailman' user and group (as defined when you configured Mailman). @@ -109,15 +97,15 @@ for normal production environments. Note though, that if you run with -u and are not in the mailman group, you may have permission problems, such as begin unable to delete a list's archives through the web. Tough luck!""")) - parser.add_option('-s', '--stale-lock-cleanup', - dest='force', default=False, action='store_true', + parser.add_option('-f', '--force', + default=False, action='store_true', help=_("""\ -If mailmanctl finds an existing master lock, it will normally exit with an -error message. With this option, mailmanctl will perform an extra level of -checking. If a process matching the host/pid described in the lock file is -running, mailmanctl will still exit, but if no matching process is found, -mailmanctl will remove the apparently stale lock and make another attempt to -claim the master lock.""")) +If the master watcher finds an existing master lock, it will normally exit +with an error message. With this option,the master will perform an extra +level of checking. If a process matching the host/pid described in the lock +file is running, the master will still exit, requiring you to manually clean +up the lock. But if no matching process is found, the master will remove the +apparently stale lock and make another attempt to claim the master lock.""")) parser.add_option('-q', '--quiet', default=False, action='store_true', help=_("""\ @@ -125,36 +113,31 @@ Don't print status messages. Error messages are still printed to standard error.""")) parser.add_option('-C', '--config', help=_('Alternative configuration file to use')) - opts, args = parser.parse_args() - if not args: - parser.print_help() - print >> sys.stderr, _('No command given.') - sys.exit(1) - if len(args) > 1: - parser.print_help() - commands = COMMASPACE.join(args) - print >> sys.stderr, _('Bad command: $commands') - sys.exit(1) - return parser, opts, args + options, arguments = parser.parse_args() + if not arguments: + parser.error(_('No command given.')) + if len(arguments) > 1: + commands = COMMASPACE.join(arguments) + parser.error(_('Bad command: $commands')) + parser.options = options + parser.arguments = arguments + return parser def kill_watcher(sig): try: - fp = open(config.PIDFILE) - pidstr = fp.read() - fp.close() - pid = int(pidstr.strip()) + with open(config.PIDFILE) as f: + pid = int(f.read().strip()) except (IOError, ValueError), e: # For i18n convenience - pidfile = config.PIDFILE - print >> sys.stderr, _('PID unreadable in: $pidfile') + print >> sys.stderr, _('PID unreadable in: $config.PIDFILE') print >> sys.stderr, e print >> sys.stderr, _('Is qrunner even running?') return try: os.kill(pid, sig) - except OSError, e: + except OSError, error: if e.errno <> errno.ESRCH: raise print >> sys.stderr, _('No child with pid: $pid') @@ -164,130 +147,17 @@ def kill_watcher(sig): -def get_lock_data(): - # Return the hostname, pid, and tempfile - fp = open(config.LOCK_FILE) - try: - filename = os.path.split(fp.read().strip())[1] - finally: - fp.close() - parts = filename.split('.') - hostname = DOT.join(parts[1:-1]) - pid = int(parts[-1]) - return hostname, int(pid), filename - - -def qrunner_state(): - # 1 if proc exists on host (but is it qrunner? ;) - # 0 if host matches but no proc - # hostname if hostname doesn't match - hostname, pid, tempfile = get_lock_data() - if hostname <> socket.gethostname(): - return hostname - # Find out if the process exists by calling kill with a signal 0. - try: - os.kill(pid, 0) - except OSError, e: - if e.errno <> errno.ESRCH: - raise - return 0 - return 1 - - -def acquire_lock_1(force): - # Be sure we can acquire the master qrunner lock. If not, it means some - # other master qrunner daemon is already going. - lock = lockfile.Lock(config.LOCK_FILE, LOCK_LIFETIME) - try: - lock.lock(timedelta(seconds=0.1)) - return lock - except lockfile.TimeOutError: - if not force: - raise - # Force removal of lock first - lock.disown() - hostname, pid, tempfile = get_lock_data() - os.unlink(config.LOCK_FILE) - os.unlink(os.path.join(config.LOCK_DIR, tempfile)) - return acquire_lock_1(force=False) - - -def acquire_lock(force): - try: - lock = acquire_lock_1(force) - return lock - except lockfile.TimeOutError: - status = qrunner_state() - if status == 1: - # host matches and proc exists - print >> sys.stderr, _("""\ -The master qrunner lock could not be acquired because it appears as if another -master qrunner is already running. -""") - elif status == 0: - # host matches but no proc - print >> sys.stderr, _("""\ -The master qrunner lock could not be acquired. It appears as though there is -a stale master qrunner lock. Try re-running mailmanctl with the -s flag. -""") - else: - # host doesn't even match - print >> sys.stderr, _("""\ -The master qrunner lock could not be acquired, because it appears as if some -process on some other host may have acquired it. We can't test for stale -locks across host boundaries, so you'll have to do this manually. Or, if you -know the lock is stale, re-run mailmanctl with the -s flag. - -Lock file: $config.LOCK_FILE -Lock host: $status - -Exiting.""") - - - -def start_runner(qrname, slice, count): - pid = os.fork() - if pid: - # parent - return pid - # child - # - # Craft the command line arguments for the exec() call. - rswitch = '--runner=%s:%d:%d' % (qrname, slice, count) - # Wherever mailmanctl lives, so too must live the qrunner script. - exe = os.path.join(BIN_DIR, 'qrunner') - # config.PYTHON, which is the absolute path to the Python interpreter, - # must be given as argv[0] due to Python's library search algorithm. - args = [sys.executable, sys.executable, exe, rswitch, '-s'] - if opts.config: - args.extend(['-C', opts.config]) - os.execl(*args) - # Should never get here - raise RuntimeError('os.execl() failed') - - -def start_all_runners(): - kids = {} - for qrname, count in config.qrunners.items(): - for slice in range(count): - # queue runner name, slice, numslices, restart count - info = (qrname, slice, count, 0) - pid = start_runner(qrname, slice, count) - kids[pid] = info - return kids - - - -def check_privs(parser): +def check_privileges(): # If we're running as root (uid == 0), coerce the uid and gid to that # which Mailman was configured for, and refuse to run if we didn't coerce # the uid/gid. - gid = grp.getgrnam(config.MAILMAN_GROUP)[2] - uid = pwd.getpwnam(config.MAILMAN_USER)[2] + gid = grp.getgrnam(config.MAILMAN_GROUP).gr_gid + uid = pwd.getpwnam(config.MAILMAN_USER).pw_uid myuid = os.getuid() if myuid == 0: # Set the process's supplimental groups. - groups = [x[2] for x in grp.getgrall() if config.MAILMAN_USER in x[3]] + groups = [group.gr_gid for group in grp.getgrall() + if config.MAILMAN_USER in group.gr_mem] groups.append(gid) os.setgroups(groups) os.setgid(gid) @@ -300,208 +170,63 @@ def check_privs(parser): def main(): - global elog, qlog, opts + global log, parser - parser, opts, args = parseargs() - initialize(opts.config) + parser = parseargs() + initialize(parser.options.config) - elog = logging.getLogger('mailman.error') - qlog = logging.getLogger('mailman.qrunner') + log = logging.getLogger('mailman.qrunner') - if opts.checkprivs: - check_privs(parser) + if not parser.options.run_as_user: + check_privileges() else: - print _('Warning! You may encounter permission problems.') + if not parser.options.quiet: + print _('Warning! You may encounter permission problems.') # Handle the commands - command = args[0].lower() + command = parser.arguments[0].lower() if command == 'stop': - if not opts.quiet: + if not parser.options.quiet: print _("Shutting down Mailman's master qrunner") kill_watcher(signal.SIGTERM) elif command == 'restart': - if not opts.quiet: + if not parser.options.quiet: print _("Restarting Mailman's master qrunner") kill_watcher(signal.SIGUSR1) elif command == 'reopen': - if not opts.quiet: + if not parser.options.quiet: print _('Re-opening all log files') kill_watcher(signal.SIGHUP) elif command == 'start': - # Here's the scoop on the processes we're about to create. We'll need - # one for each qrunner, and one for a master child process watcher / - # lock refresher process. - # - # The child watcher process simply waits on the pids of the children - # qrunners. Unless explicitly disabled by a mailmanctl switch (or the - # children are killed with SIGTERM instead of SIGINT), the watcher - # will automatically restart any child process that exits. This - # allows us to be more robust, and also to implement restart by simply - # SIGINT'ing the qrunner children, and letting the watcher restart - # them. + # Start the master qrunner watcher process. # - # Under normal operation, we have a child per queue. This lets us get - # the most out of the available resources, since a qrunner with no - # files in its queue directory is pretty cheap, but having a separate - # runner process per queue allows for a very responsive system. Some - # people want a more traditional (i.e. MM2.0.x) cron-invoked qrunner. - # No problem, but using mailmanctl isn't the answer. So while - # mailmanctl hard codes some things, others, such as the number of - # qrunners per queue, are configurable. - # - # First, acquire the master mailmanctl lock - lock = acquire_lock(opts.force) - if not lock: - return # Daemon process startup according to Stevens, Advanced Programming in # the UNIX Environment, Chapter 13. pid = os.fork() if pid: # parent - if not opts.quiet: + if not parser.options.quiet: print _("Starting Mailman's master qrunner.") - # Give up the lock "ownership". This just means the foreground - # process won't close/unlock the lock when it finalizes this lock - # instance. We'll let the mater watcher subproc own the lock. - lock.transfer_to(pid) return # child - lock.take_possession() - # Save our pid in a file for "mailmanctl stop" rendezvous. - fp = open(config.PIDFILE, 'w') - try: - print >> fp, os.getpid() - finally: - fp.close() + # # Create a new session and become the session leader, but since we # won't be opening any terminal devices, don't do the ultra-paranoid # suggestion of doing a second fork after the setsid() call. os.setsid() # Instead of cd'ing to root, cd to the Mailman runtime directory. os.chdir(config.VAR_DIR) - # I don't think we have any unneeded file descriptors. - # - # Now start all the qrunners. This returns a dictionary where the - # keys are qrunner pids and the values are tuples of the following - # form: (qrname, slice, count). This does its own fork and exec, and - # sets up its own signal handlers. - kids = start_all_runners() - # Set up a SIGALRM handler to refresh the lock once per day. The lock - # lifetime is 1day+6hours so this should be plenty. - def sigalrm_handler(signum, frame): - lock.refresh() - signal.alarm(Defaults.days(1)) - signal.signal(signal.SIGALRM, sigalrm_handler) - signal.alarm(int(Defaults.days(1))) - # Set up a SIGHUP handler so that if we get one, we'll pass it along - # to all the qrunner children. This will tell them to close and - # reopen their log files - def sighup_handler(signum, frame): - loginit.reopen() - for pid in kids.keys(): - os.kill(pid, signal.SIGHUP) - # And just to tweak things... - qlog.info('Master watcher caught SIGHUP. Re-opening log files.') - signal.signal(signal.SIGHUP, sighup_handler) - # We also need to install a SIGTERM handler because that's what init - # will kill this process with when changing run levels. It's also the - # signal 'mailmanctl stop' uses. - def sigterm_handler(signum, frame): - # Make sure we never try to restart our children, no matter why - # the child exited. - opts.restart = False - qlog.info('I AM NEVER RESTARTING AGAIN: %d', pid) - for pid in kids.keys(): - try: - os.kill(pid, signal.SIGTERM) - except OSError, e: - if e.errno <> errno.ESRCH: - raise - qlog.info('Master watcher caught SIGTERM. Exiting.') - signal.signal(signal.SIGTERM, sigterm_handler) - # Finally, we need a SIGINT handler which will cause the sub-qrunners - # to exit, but the master will restart SIGINT'd sub-processes unless - # the -n flag was given. - def sigint_handler(signum, frame): - for pid in kids.keys(): - os.kill(pid, signal.SIGINT) - qlog.info('Master watcher caught SIGINT. Restarting.') - signal.signal(signal.SIGINT, sigint_handler) - # Now we're ready to simply do our wait/restart loop. This is the - # master qrunner watcher. - try: - while True: - try: - pid, status = os.wait() - except OSError, e: - # No children? We're done - if e.errno == errno.ECHILD: - break - # If the system call got interrupted, just restart it. - elif e.errno <> errno.EINTR: - raise - continue - killsig = exitstatus = None - if os.WIFSIGNALED(status): - killsig = os.WTERMSIG(status) - if os.WIFEXITED(status): - exitstatus = os.WEXITSTATUS(status) - # We'll restart the process unless we were given the - # "no-restart" switch, or if the process was SIGTERM'd or - # exitted with a SIGTERM exit status. This lets us better - # handle runaway restarts (say, if the subproc had a syntax - # error!) - restarting = '' - if opts.restart: - if ((exitstatus is None and killsig <> signal.SIGTERM) or - (killsig is None and exitstatus <> signal.SIGTERM)): - # Then - restarting = '[restarting]' - qrname, slice, count, restarts = kids[pid] - del kids[pid] - qlog.info("""\ -Master qrunner detected subprocess exit -(pid: %d, sig: %s, sts: %s, class: %s, slice: %d/%d) %s""", - pid, killsig, exitstatus, qrname, - slice+1, count, restarting) - # See if we've reached the maximum number of allowable restarts - if exitstatus <> signal.SIGINT: - restarts += 1 - if restarts > config.MAX_RESTARTS: - qlog.info("""\ -Qrunner %s reached maximum restart limit of %d, not restarting.""", - qrname, config.MAX_RESTARTS) - restarting = '' - # Now perhaps restart the process unless it exited with a - # SIGTERM or we aren't restarting. - if restarting: - newpid = start_runner(qrname, slice, count) - kids[newpid] = (qrname, slice, count, restarts) - finally: - # Should we leave the main loop for any reason, we want to be sure - # all of our children are exited cleanly. Send SIGTERMs to all - # the child processes and wait for them all to exit. - for pid in kids.keys(): - try: - os.kill(pid, signal.SIGTERM) - except OSError, e: - if e.errno == errno.ESRCH: - # The child has already exited - qlog.info('ESRCH on pid: %d', pid) - del kids[pid] - # Wait for all the children to go away - while True: - try: - pid, status = os.wait() - except OSError, e: - if e.errno == errno.ECHILD: - break - elif e.errno <> errno.EINTR: - raise - continue - # Finally, give up the lock - lock.unlock(unconditionally=True) - os._exit(0) + # Exec the master watcher. + args = [sys.executable, sys.executable, + os.path.join(config.BIN_DIR, 'master')] + if parser.options.force: + args.append('--force') + if parser.options.config: + args.extend(['-C', parser.options.config]) + log.debug('starting: %s', args) + os.execl(*args) + # We should never get here. + raise RuntimeError('os.execl() failed') diff --git a/Mailman/bin/master.py b/Mailman/bin/master.py index 6e7c5408d..e2a80934f 100644 --- a/Mailman/bin/master.py +++ b/Mailman/bin/master.py @@ -18,8 +18,6 @@ from __future__ import with_statement import os -import grp -import pwd import sys import errno import signal @@ -39,16 +37,8 @@ from Mailman.i18n import _ from Mailman.initialize import initialize -COMMASPACE = ', ' DOT = '.' -# Calculate this here and now, because we're going to do a chdir later on, and -# if the path is relative, the qrunner script won't be found. -BIN_DIR = os.path.abspath(os.path.dirname(sys.argv[0])) - -# Since we wake up once per day and refresh the lock, the LOCK_LIFETIME -# needn't be (much) longer than SNOOZE. We pad it 6 hours just to be safe. LOCK_LIFETIME = Defaults.days(1) + Defaults.hours(6) -SNOOZE = Defaults.days(1) log = None parser = None @@ -84,6 +74,15 @@ Usage: %prog [options]""")) help=_("""\ Don't restart the qrunners when they exit because of an error or a SIGUSR1. Use this only for debugging.""")) + parser.add_option('-f', '--force', + default=False, action='store_true', + help=_("""\ +If the master watcher finds an existing master lock, it will normally exit +with an error message. With this option,the master will perform an extra +level of checking. If a process matching the host/pid described in the lock +file is running, the master will still exit, requiring you to manually clean +up the lock. But if no matching process is found, the master will remove the +apparently stale lock and make another attempt to claim the master lock.""")) parser.add_option('-C', '--config', help=_('Alternative configuration file to use')) options, arguments = parser.parse_args() @@ -104,8 +103,8 @@ def get_lock_data(): with open(config.LOCK_FILE) as fp: filename = os.path.split(fp.read().strip())[1] parts = filename.split('.') - hostname = DOT.join(parts[1:-1]) - pid = int(parts[-1]) + hostname = DOT.join(parts[1:-2]) + pid = int(parts[-2]) return hostname, int(pid), filename @@ -164,28 +163,27 @@ def acquire_lock_1(force): return acquire_lock_1(force=False) -def acquire_lock(force): +def acquire_lock(): """Acquire the master queue runner lock. - :param force: Flag that controls whether to force acquisition of the lock. :return: The master queue runner lock or None if the lock couldn't be acquired. In that case, an error messages is also printed to standard error. """ try: - lock = acquire_lock_1(force) + lock = acquire_lock_1(parser.options.force) return lock except lockfile.TimeOutError: status = master_state() if status == WatcherState.conflict: # Hostname matches and process exists. - print >> sys.stderr, _("""\ -The master qrunner lock could not be acquired because it appears as if another -master qrunner is already running. + message = _("""\ +The master qrunner lock could not be acquired because it appears +as though another master qrunner is already running. """) elif status == WatcherState.stale_lock: # Hostname matches but the process does not exist. - print >> sys.stderr, _("""\ + message = _("""\ The master qrunner lock could not be acquired. It appears as though there is a stale master qrunner lock. Try re-running mailmanctl with the -s flag. """) @@ -193,17 +191,17 @@ a stale master qrunner lock. Try re-running mailmanctl with the -s flag. assert status == WatcherState.host_mismatch, ( 'Invalid enum value: %s' % status) # Hostname doesn't even match. - print >> sys.stderr, _("""\ + hostname, pid, tempfile = get_lock_data() + message = _("""\ The master qrunner lock could not be acquired, because it appears as if some process on some other host may have acquired it. We can't test for stale -locks across host boundaries, so you'll have to do this manually. Or, if you -know the lock is stale, re-run mailmanctl with the -s flag. +locks across host boundaries, so you'll have to clean this up manually. Lock file: $config.LOCK_FILE -Lock host: $status +Lock host: $hostname Exiting.""") - return None + parser.error(message) @@ -226,7 +224,7 @@ def start_runner(qrname, slice, count): # Craft the command line arguments for the exec() call. rswitch = '--runner=%s:%d:%d' % (qrname, slice, count) # Wherever mailmanctl lives, so too must live the qrunner script. - exe = os.path.join(BIN_DIR, 'qrunner') + exe = os.path.join(config.BIN_DIR, 'qrunner') # config.PYTHON, which is the absolute path to the Python interpreter, # must be given as argv[0] due to Python's library search algorithm. args = [sys.executable, sys.executable, exe, rswitch, '-s'] @@ -375,14 +373,18 @@ def main(): log = logging.getLogger('mailman.qrunner') # Acquire the master lock, exiting if we can't acquire it. We'll let the - # caller handle any clean up or lock breaking. - with lockfile.Lock(config.LOCK_FILE, LOCK_LIFETIME) as lock: + # caller handle any clean up or lock breaking. No with statement here + # because Lock's constructor doesn't support a timeout. + lock = acquire_lock() + try: with open(config.PIDFILE, 'w') as fp: print >> fp, os.getpid() try: control_loop(lock) finally: os.remove(config.PIDFILE) + finally: + lock.unlock() |
