Diffstat (limited to 'Mailman/lockfile.py')
-rw-r--r--  Mailman/lockfile.py  583
1 files changed, 583 insertions, 0 deletions
diff --git a/Mailman/lockfile.py b/Mailman/lockfile.py
new file mode 100644
index 000000000..7db746952
--- /dev/null
+++ b/Mailman/lockfile.py
@@ -0,0 +1,583 @@
+# Copyright (C) 1998-2007 by the Free Software Foundation, Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+# USA.
+
+"""Portable, NFS-safe file locking with timeouts.
+
+This code implements an NFS-safe file-based locking algorithm influenced by
+the GNU/Linux open(2) manpage, under the description of the O_EXCL option.
+From RH6.1:
+
+    [...] O_EXCL is broken on NFS file systems, programs which rely on it
+    for performing locking tasks will contain a race condition.  The
+    solution for performing atomic file locking using a lockfile is to
+    create a unique file on the same fs (e.g., incorporating hostname and
+    pid), use link(2) to make a link to the lockfile.  If link() returns
+    0, the lock is successful.  Otherwise, use stat(2) on the unique file
+    to check if its link count has increased to 2, in which case the lock
+    is also successful.
+
+The assumption made here is that there will be no `outside interference',
+e.g. no agent external to this code will have access to link() to the affected
+lock files.
+
+LockFile objects support lock-breaking so that you can't wedge a process
+forever.  This is especially helpful in a web environment, but may not be
+appropriate for all applications.
+
+Locks have a `lifetime', which is the maximum length of time the process
+expects to retain the lock.  It is important to pick a good number here
+because other processes will not break an existing lock until the expected
+lifetime has expired.  Too long and other processes will hang; too short and
+you'll end up trampling on existing process locks -- and possibly corrupting
+data.  In a distributed (NFS) environment, you also need to make sure that
+your clocks are properly synchronized.
+"""
+
+__metaclass__ = type
+__all__ = [
+    'LockError',
+    'AlreadyLockedError',
+    'NotLockedError',
+    'LockFile',
+    ]
+
+# This code has undergone several revisions, with contributions from Barry
+# Warsaw, Thomas Wouters, Harald Meland, and John Viega.  It should also work
+# well outside of Mailman so it could be used for other Python projects
+# requiring file locking.  See the __main__ section at the bottom of the file
+# for unit testing.
+
+import os
+import time
+import errno
+import random
+import socket
+import logging
+import datetime
+import traceback
+
+# The default lock lifetime.
+DEFAULT_LOCK_LIFETIME = datetime.timedelta(seconds=15)
+# Allow a bit of clock skew, in seconds.
+CLOCK_SLOP = 10
+# This is appropriate for Mailman, but you may want to change this if you're
+# using this code outside Mailman.
+log = logging.getLogger('mailman.locks')
+
+
+
+# Exceptions that can be raised by this module
+class LockError(Exception):
+    """Base class for all exceptions in this module."""
+
+class AlreadyLockedError(LockError):
+    """An attempt is made to lock an already locked object."""
+
+class NotLockedError(LockError):
+    """An attempt is made to unlock an object that isn't locked."""
+
+class TimeOutError(LockError):
+    """The timeout interval elapsed before the lock succeeded."""
+
+
+
+class LockFile:
+    """A portable way to lock resources by way of the file system.
+
+    This class supports the following methods:
+
+    __init__(lockfile[, lifetime]):
+        Create the resource lock using lockfile as the global lock file.  Each
+        process laying claim to this resource lock will create their own
+        temporary lock files based on the path specified by lockfile.
+        Optional lifetime is a timedelta specifying the number of seconds the
+        process expects to hold the lock.
+
+    set_lifetime(lifetime):
+        Set a new lock lifetime.  This takes effect the next time the file is
+        locked, but does not refresh a locked file.
+
+    get_lifetime():
+        Return the lock's lifetime.
+
+    refresh([newlifetime[, unconditionally]]):
+        Refreshes the lifetime of a locked file.  Use this if you realize that
+        you need to keep a resource locked longer than you thought.  With
+        optional newlifetime, set the lock's lifetime.  Raises NotLockedError
+        if the lock is not set, unless optional unconditionally flag is set to
+        true.
+
+    lock([timeout]):
+        Acquire the lock.  This blocks until the lock is acquired unless
+        optional timeout is greater than 0, in which case, a TimeOutError is
+        raised when timeout number of seconds (or possibly more) expires
+        without lock acquisition.  Raises AlreadyLockedError if the lock is
+        already set.
+
+    unlock([unconditionally]):
+        Relinquishes the lock.  Raises a NotLockedError if the lock is not
+        set, unless optional unconditionally is true.
+
+    locked():
+        Return true if the lock is set, otherwise false.  To avoid race
+        conditions, this refreshes the lock (on set locks).
+
+    """
+    # XXX We need to watch out for two lock objects in the same process
+    # pointing to the same lock file.  Without this, if you lock lf1 and do
+    # not lock lf2, lf2.locked() will still return true.  NOTE: this gimmick
+    # probably does /not/ work in a multithreaded world, but we don't have to
+    # worry about that, do we? <1 wink>.
+    COUNTER = 0
+
+    def __init__(self, lockfile, lifetime=DEFAULT_LOCK_LIFETIME):
+        """Create the resource lock using lockfile as the global lock file.
+
+        Each process laying claim to this resource lock will create their own
+        temporary lock files based on the path specified by lockfile.
+        Optional lifetime is a timedelta specifying the amount of time the
+        process expects to hold the lock; it defaults to
+        DEFAULT_LOCK_LIFETIME.
+        """
+        self._lockfile = lockfile
+        self._lifetime = lifetime
+        # This works because we know we're single threaded
+        self._counter = LockFile.COUNTER
+        LockFile.COUNTER += 1
+        self._tmpfname = '%s.%s.%d.%d' % (
+            lockfile, socket.gethostname(), os.getpid(), self._counter)
+        # For transferring ownership across a fork.
+        self._owned = True
+
+    def __repr__(self):
+        return '<LockFile %s: %s [%s: %s] pid=%s>' % (
+            id(self), self._lockfile,
+            self.locked() and 'locked' or 'unlocked',
+            self._lifetime, os.getpid())
+
+    def set_lifetime(self, lifetime):
+        """Set a new lock lifetime.
+
+        This takes effect the next time the file is locked, but does not
+        refresh a locked file.
+        """
+        self._lifetime = lifetime
+
+    def get_lifetime(self):
+        """Return the lock's lifetime."""
+        return self._lifetime
+
+    def refresh(self, newlifetime=None, unconditionally=False):
+        """Refreshes the lifetime of a locked file.
+
+        Use this if you realize that you need to keep a resource locked longer
+        than you thought.  With optional newlifetime, set the lock's lifetime.
+        Raises NotLockedError if the lock is not set, unless optional
+        unconditionally flag is set to true.
+        """
+        if newlifetime is not None:
+            self.set_lifetime(newlifetime)
+        # Do we have the lock?  As a side effect, this refreshes the lock!
+        if not self.locked() and not unconditionally:
+            raise NotLockedError('%s: %s' % (repr(self), self._read()))
+
+    def lock(self, timeout=0):
+        """Acquire the lock.
+
+        This blocks until the lock is acquired unless optional timeout is
+        greater than 0, in which case, a TimeOutError is raised when timeout
+        number of seconds (or possibly more) expires without lock acquisition.
+        Raises AlreadyLockedError if the lock is already set.
+        """
+        if timeout:
+            timeout_time = time.time() + timeout
+        # Make sure my temp lockfile exists, and that its contents are
+        # up-to-date (e.g. the temp file name, and the lock lifetime).
+        self._write()
+        # XXX This next call can fail with an EPERM.  I have no idea why, but
+        # I'm nervous about wrapping this in a try/except.  It seems to be a
+        # very rare occurrence, only happens from cron, and (only?) on Solaris
+        # 2.6.
+        self._touch()
+        log.debug('laying claim: %s', self._lockfile)
+        # for quieting the logging output
+        loopcount = -1
+        while True:
+            loopcount += 1
+            # Create the hard link and test for exactly 2 links to the file
+            try:
+                os.link(self._tmpfname, self._lockfile)
+                # If we got here, we know we got the lock, and never
+                # had it before, so we're done.  Just touch it again for the
+                # fun of it.
+                log.debug('got the lock: %s', self._lockfile)
+                self._touch()
+                break
+            except OSError, e:
+                # The link failed for some reason, possibly because someone
+                # else already has the lock (i.e. we got an EEXIST), or for
+                # some other bizarre reason.
+                if e.errno == errno.ENOENT:
+                    # XXX in some Linux environments, it is possible to get
+                    # an ENOENT, which is truly strange, because this means
+                    # that self._tmpfname doesn't exist at the time of the
+                    # os.link(), but self._write() is supposed to guarantee
+                    # that it exists!  I don't honestly know why this
+                    # happens, but for now we just say we didn't acquire the
+                    # lock, and try again next time.
+                    pass
+                elif e.errno <> errno.EEXIST:
+                    # Something very bizarre happened.  Clean up our state and
+                    # pass the error on up.
+                    log.exception('unexpected link')
+                    os.unlink(self._tmpfname)
+                    raise
+                elif self._linkcount() <> 2:
+                    # Somebody's messin' with us!  Log this, and try again
+                    # later.  XXX should we raise an exception?
+                    log.error('unexpected linkcount: %d', self._linkcount())
+                elif self._read() == self._tmpfname:
+                    # It was us that already had the link.
+                    log.debug('already locked: %s', self._lockfile)
+                    raise AlreadyLockedError
+                # otherwise, someone else has the lock
+                pass
+            # We did not acquire the lock, because someone else already has
+            # it.  Have we timed out in our quest for the lock?
+            if timeout and timeout_time < time.time():
+                os.unlink(self._tmpfname)
+                log.error('timed out')
+                raise TimeOutError
+            # Okay, we haven't timed out, but we didn't get the lock.  Let's
+            # find out if the lock lifetime has expired.
+            if time.time() > self._releasetime() + CLOCK_SLOP:
+                # Yes, so break the lock.
+                self._break()
+                log.error('lifetime has expired, breaking')
+            # Okay, someone else has the lock, our claim hasn't timed out yet,
+            # and the expected lock lifetime hasn't expired yet.  So let's
+            # wait a while for the owner of the lock to give it up.
+            elif not loopcount % 100:
+                log.debug('waiting for claim: %s', self._lockfile)
+            self._sleep()
+
+    def unlock(self, unconditionally=False):
+        """Unlock the lock.
+
+        If we don't already own the lock (either because of unbalanced unlock
+        calls, or because the lock was stolen out from under us), raise a
+        NotLockedError, unless optional `unconditionally' is true.
+        """
+        islocked = self.locked()
+        if not islocked and not unconditionally:
+            raise NotLockedError
+        # If we owned the lock, remove the global file, relinquishing it.
+        if islocked:
+            try:
+                os.unlink(self._lockfile)
+            except OSError, e:
+                if e.errno <> errno.ENOENT:
+                    raise
+        # Remove our tempfile
+        try:
+            os.unlink(self._tmpfname)
+        except OSError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+        log.debug('unlocked: %s', self._lockfile)
+
+    def locked(self):
+        """Return true if we own the lock, false if we do not.
+
+        Checking the status of the lock resets the lock's lifetime, which
+        helps avoid race conditions during the lock status test.
+        """
+        # Discourage breaking the lock for a while.
+        try:
+            self._touch()
+        except OSError, e:
+            if e.errno == errno.EPERM:
+                # We can't touch the file because we're not the owner.  I
+                # don't see how we can own the lock if we're not the owner.
+                return False
+            else:
+                raise
+        # XXX Can the link count ever be > 2?
+        if self._linkcount() <> 2:
+            return False
+        return self._read() == self._tmpfname
+
+    def finalize(self):
+        log.debug('finalize: %s', self._lockfile)
+        self.unlock(unconditionally=True)
+
+    def __del__(self):
+        log.debug('__del__: %s', self._lockfile)
+        if self._owned:
+            self.finalize()
+
+    # Python 2.5 context manager protocol support.
+    def __enter__(self):
+        self.lock()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.unlock()
+        # Don't suppress any exception that might have occurred.
+        return False
+
+    # Use these only if you're transferring ownership to a child process across
+    # a fork.  Use at your own risk, but it should be race-condition safe.
+    # _transfer_to() is called in the parent, passing in the pid of the child.
+    # _take_possession() is called in the child, and blocks until the parent
+    # has transferred possession to the child.  _disown() is used to set the
+    # _owned flag to false, and it is a disgusting wart necessary to make
+    # forced lock acquisition work in mailmanctl. :(
+    def _transfer_to(self, pid):
+        # First touch it so it won't get broken while we're fiddling about.
+        self._touch()
+        # Find out the current claim's temp filename
+        winner = self._read()
+        # Now twiddle ours to the given pid
+        self._tmpfname = '%s.%s.%d' % (
+            self._lockfile, socket.gethostname(), pid)
+        # Create a hard link from the global lock file to the temp file.  This
+        # actually does things in reverse order of normal operation because we
+        # know that lockfile exists, and tmpfname better not!
+        os.link(self._lockfile, self._tmpfname)
+        # Now update the lock file to contain a reference to the new owner
+        self._write()
+        # Toggle off our ownership of the file so we don't try to finalize it
+        # in our __del__()
+        self._owned = False
+        # Unlink the old winner, completing the transfer
+        os.unlink(winner)
+        # And do some sanity checks
+        assert self._linkcount() == 2
+        assert self.locked()
+        log.debug('transferred the lock: %s', self._lockfile)
+
+    def _take_possession(self):
+        self._tmpfname = tmpfname = '%s.%s.%d' % (
+            self._lockfile, socket.gethostname(), os.getpid())
+        # Wait until the linkcount is 2, indicating the parent has completed
+        # the transfer.
+        while self._linkcount() <> 2 or self._read() <> tmpfname:
+            time.sleep(0.25)
+        log.debug('took possession of the lock: %s', self._lockfile)
+
+    def _disown(self):
+        self._owned = False
+
+    #
+    # Private interface
+    #
+
+    def _write(self):
+        # The temp file's contents are just its own name.
+        fp = open(self._tmpfname, 'w')
+        try:
+            fp.write(self._tmpfname)
+        finally:
+            fp.close()
+
+    def _read(self):
+        try:
+            fp = open(self._lockfile)
+            try:
+                filename = fp.read()
+            finally:
+                fp.close()
+            return filename
+        except EnvironmentError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+            return None
+
+    def _touch(self, filename=None):
+        expiration_date = datetime.datetime.now() + self._lifetime
+        t = time.mktime(expiration_date.timetuple())
+        try:
+            # XXX We probably don't need to modify atime, but this is easier.
+            os.utime(filename or self._tmpfname, (t, t))
+        except OSError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+
+    def _releasetime(self):
+        try:
+            return os.stat(self._lockfile).st_mtime
+        except OSError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+            return -1
+
+    def _linkcount(self):
+        try:
+            return os.stat(self._lockfile).st_nlink
+        except OSError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+            return -1
+
+    def _break(self):
+        # First, touch the global lock file.  This reduces but does not
+        # eliminate the chance for a race condition during breaking.  Two
+        # processes could both pass the test for lock expiry in lock() before
+        # one of them gets to touch the global lockfile.  This shouldn't be
+        # too bad because all they'll do in this function is wax the lock
+        # files, not claim the lock, and we can be defensive for ENOENTs
+        # here.
+        #
+        # Touching the lock could fail if the process breaking the lock and
+        # the process that claimed the lock have different owners.  We could
+        # solve this by set-uid'ing the CGI and mail wrappers, but I don't
+        # think it's that big a problem.
+        try:
+            self._touch(self._lockfile)
+        except OSError, e:
+            if e.errno <> errno.EPERM:
+                raise
+        # Get the name of the old winner's temp file.
+        winner = self._read()
+        # Remove the global lockfile, which actually breaks the lock.
+        try:
+            os.unlink(self._lockfile)
+        except OSError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+        # Try to remove the old winner's temp file, since we're assuming the
+        # winner process has hung or died.  Don't worry too much if we can't
+        # unlink their temp file -- this doesn't wreck the locking algorithm,
+        # but will leave temp file turds lying around, a minor inconvenience.
+        try:
+            if winner:
+                os.unlink(winner)
+        except OSError, e:
+            if e.errno <> errno.ENOENT:
+                raise
+
+    def _sleep(self):
+        interval = random.random() * 2.0 + 0.01
+        time.sleep(interval)
+
+
+
+# Unit test framework
+def _dochild():
+    prefix = '[%d]' % os.getpid()
+    # Create a single test lock with a two minute lifetime.
+    lockfile = LockFile('/tmp/LockTest', lifetime=datetime.timedelta(seconds=120))
+    # Use a work interval of up to five seconds.  Under normal
+    # situations, Mailman's usage patterns (untested) shouldn't be much longer
+    # than this.
+    workinterval = 5 * random.random()
+    hitwait = 20 * random.random()
+    print prefix, 'workinterval:', workinterval
+    islocked = False
+    t0 = 0
+    t1 = 0
+    t2 = 0
+    try:
+        try:
+            t0 = time.time()
+            print prefix, 'acquiring...'
+            lockfile.lock()
+            print prefix, 'acquired...'
+            islocked = True
+        except TimeOutError:
+            print prefix, 'timed out'
+        else:
+            t1 = time.time()
+            print prefix, 'acquisition time:', t1-t0, 'seconds'
+            time.sleep(workinterval)
+    finally:
+        if islocked:
+            try:
+                lockfile.unlock()
+                t2 = time.time()
+                print prefix, 'lock hold time:', t2-t1, 'seconds'
+            except NotLockedError:
+                print prefix, 'lock was broken'
+    # wait for next web hit
+    print prefix, 'webhit sleep:', hitwait
+    time.sleep(hitwait)
+
+
+def _seed():
+    try:
+        fp = open('/dev/random')
+        d = fp.read(40)
+        fp.close()
+    except EnvironmentError, e:
+        if e.errno <> errno.ENOENT:
+            raise
+        import sha
+        d = sha.new(`os.getpid()`+`time.time()`).hexdigest()
+    random.seed(d)
+
+
+def _onetest():
+    loopcount = random.randint(1, 100)
+    for i in range(loopcount):
+        print 'Loop %d of %d' % (i+1, loopcount)
+        pid = os.fork()
+        if pid:
+            # parent, wait for child to exit
+            pid, status = os.waitpid(pid, 0)
+        else:
+            # child
+            _seed()
+            try:
+                _dochild()
+            except KeyboardInterrupt:
+                pass
+            os._exit(0)
+
+
+def _reap(kids):
+    if not kids:
+        return
+    pid, status = os.waitpid(-1, os.WNOHANG)
+    if pid <> 0:
+        del kids[pid]
+
+
+def _test(numtests):
+    kids = {}
+    for i in range(numtests):
+        pid = os.fork()
+        if pid:
+            # parent
+            kids[pid] = pid
+        else:
+            # child
+            _seed()
+            try:
+                _onetest()
+            except KeyboardInterrupt:
+                pass
+            os._exit(0)
+    # Wait for all the child processes to exit.
+    while kids:
+        _reap(kids)
+
+
+if __name__ == '__main__':
+    import sys
+    import random
+    _test(int(sys.argv[1]))
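The module docstring above gives the link(2)/stat(2) recipe in prose. As a rough illustration only (not Mailman's API; the function name and paths here are hypothetical), the acquisition step it describes boils down to something like this:

    import os
    import errno
    import socket

    def try_acquire(lockfile):
        # A per-process unique file on the same filesystem as the lock file.
        tmpname = '%s.%s.%d' % (lockfile, socket.gethostname(), os.getpid())
        fp = open(tmpname, 'w')
        fp.write(tmpname)
        fp.close()
        try:
            # link(2) is atomic, even over NFS where O_EXCL is unreliable.
            os.link(tmpname, lockfile)
            return True
        except OSError, e:
            if e.errno <> errno.EEXIST:
                raise
            # The link() call "failed", but if our unique file's link count
            # is now 2 the link actually happened and the lock is ours (the
            # stat(2) fallback from the manpage recipe).
            return os.stat(tmpname).st_nlink == 2

A real caller has to retry on failure and eventually clean up tmpname, which is what LockFile.lock() does above, layering the retry loop, lock lifetimes, lock breaking, and the _read() comparison that detects re-acquiring a lock we already hold on top of this primitive.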
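For callers, the class docstring amounts to the usage pattern below. This is a hedged sketch: the import path follows the file's location in the tree, while the lock file path and the timeout and lifetime values are illustrative assumptions.

    import datetime
    from Mailman.lockfile import LockFile, TimeOutError

    lock = LockFile('/var/lock/mailman/test.lck',
                    lifetime=datetime.timedelta(seconds=30))
    try:
        lock.lock(timeout=10)       # give up after roughly ten seconds
    except TimeOutError:
        print 'somebody else holds the lock'
    else:
        try:
            # ... do the protected work; if it runs long, extend the claim ...
            lock.refresh(datetime.timedelta(seconds=60))
        finally:
            lock.unlock()

    # With `from __future__ import with_statement` (Python 2.5) or 2.6+, the
    # context manager protocol pairs lock() and unlock() automatically:
    with LockFile('/var/lock/mailman/test.lck') as lock:
        pass                        # critical section

The built-in test harness at the bottom of the module can also be run directly, e.g. "python Mailman/lockfile.py 5", which forks five processes that repeatedly compete for the same test lock.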
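The comments before _transfer_to() and _take_possession() describe a parent-to-child ownership hand-off across fork(). Roughly, and only as a sketch (these are private methods, and the surrounding code and path here are hypothetical), the protocol is used like this:

    import os
    from Mailman.lockfile import LockFile

    lock = LockFile('/var/lock/mailman/test.lck')
    lock.lock()
    pid = os.fork()
    if pid:
        # Parent: re-point the lock at the child's pid and drop ownership so
        # that our own __del__() won't finalize (unlock) it.
        lock._transfer_to(pid)
        os.waitpid(pid, 0)
    else:
        # Child: block until the parent has completed the transfer, then
        # carry on as the lock's owner.
        lock._take_possession()
        # ... do work under the lock ...
        lock.unlock()
        os._exit(0)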
