Commit 0fafdc9f authored by David Howells's avatar David Howells

afs: Fix file locking

Fix the AFS file locking whereby the use of the big kernel lock (which
could be slept with) was replaced by a spinlock (which couldn't).  The
problem is that the AFS code was doing stuff inside the critical section
that might call schedule(), so this is a broken transformation.

Fix this by the following means:

 (1) Use a state machine with a proper state that can only be changed under
     the spinlock rather than using a collection of bit flags.

 (2) Cache the key used for the lock and the lock type in the afs_vnode
     struct so that the manager work function doesn't have to refer to a
     file_lock struct that's been dequeued.  This makes signal handling
     safer.

 (4) Move the unlock from afs_do_unlk() to afs_fl_release_private() which
     means that unlock is achieved in other circumstances too.

 (5) Unlock the file on the server before taking the next conflicting lock.

Also change:

 (1) Check the permits on a file before actually trying the lock.

 (2) fsync the file before effecting an explicit unlock operation.  We
     don't fsync if the lock is erased otherwise as we might not be in a
     context where we can actually do that.

Further fixes:

 (1) Fixed-fileserver address rotation is made to work.  It's only used by
     the locking functions, so couldn't be tested before.

Fixes: 72f98e72 ("locks: turn lock_flocks into a spinlock")
Signed-off-by: default avatarDavid Howells <dhowells@redhat.com>
cc: jlayton@redhat.com
parent cf9b0772
......@@ -170,7 +170,7 @@ void afs_lock_work(struct work_struct *work)
{
struct afs_vnode *vnode =
container_of(work, struct afs_vnode, lock_work.work);
struct file_lock *fl;
struct file_lock *fl, *next;
afs_lock_type_t type;
struct key *key;
int ret;
......@@ -179,117 +179,136 @@ void afs_lock_work(struct work_struct *work)
spin_lock(&vnode->lock);
if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) {
again:
_debug("wstate %u for %p", vnode->lock_state, vnode);
switch (vnode->lock_state) {
case AFS_VNODE_LOCK_NEED_UNLOCK:
_debug("unlock");
vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING;
spin_unlock(&vnode->lock);
/* attempt to release the server lock; if it fails, we just
* wait 5 minutes and it'll time out anyway */
ret = afs_release_lock(vnode, vnode->unlock_key);
* wait 5 minutes and it'll expire anyway */
ret = afs_release_lock(vnode, vnode->lock_key);
if (ret < 0)
printk(KERN_WARNING "AFS:"
" Failed to release lock on {%x:%x} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
spin_lock(&vnode->lock);
key_put(vnode->unlock_key);
vnode->unlock_key = NULL;
clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags);
}
key_put(vnode->lock_key);
vnode->lock_key = NULL;
vnode->lock_state = AFS_VNODE_LOCK_NONE;
if (list_empty(&vnode->pending_locks)) {
spin_unlock(&vnode->lock);
return;
}
/* The new front of the queue now owns the state variables. */
next = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
vnode->lock_key = afs_file_key(next->fl_file);
vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
goto again;
/* if we've got a lock, then it must be time to extend that lock as AFS
* locks time out after 5 minutes */
if (!list_empty(&vnode->granted_locks)) {
/* If we've already got a lock, then it must be time to extend that
* lock as AFS locks time out after 5 minutes.
*/
case AFS_VNODE_LOCK_GRANTED:
_debug("extend");
if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
BUG();
fl = list_entry(vnode->granted_locks.next,
struct file_lock, fl_u.afs.link);
key = key_get(afs_file_key(fl->fl_file));
ASSERT(!list_empty(&vnode->granted_locks));
key = key_get(vnode->lock_key);
vnode->lock_state = AFS_VNODE_LOCK_EXTENDING;
spin_unlock(&vnode->lock);
ret = afs_extend_lock(vnode, key);
clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
ret = afs_extend_lock(vnode, key); /* RPC */
key_put(key);
switch (ret) {
case 0:
if (ret < 0)
pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
spin_lock(&vnode->lock);
if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING)
goto again;
vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
if (ret == 0)
afs_schedule_lock_extension(vnode);
break;
default:
/* ummm... we failed to extend the lock - retry
* extension shortly */
printk(KERN_WARNING "AFS:"
" Failed to extend lock on {%x:%x} error %d\n",
vnode->fid.vid, vnode->fid.vnode, ret);
else
queue_delayed_work(afs_lock_manager, &vnode->lock_work,
HZ * 10);
break;
}
_leave(" [extend]");
spin_unlock(&vnode->lock);
_leave(" [ext]");
return;
}
/* if we don't have a granted lock, then we must've been called back by
* the server, and so if might be possible to get a lock we're
* currently waiting for */
if (!list_empty(&vnode->pending_locks)) {
/* If we don't have a granted lock, then we must've been called
* back by the server, and so if might be possible to get a
* lock we're currently waiting for.
*/
case AFS_VNODE_LOCK_WAITING_FOR_CB:
_debug("get");
if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags))
BUG();
fl = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
key = key_get(afs_file_key(fl->fl_file));
type = (fl->fl_type == F_RDLCK) ?
AFS_LOCK_READ : AFS_LOCK_WRITE;
key = key_get(vnode->lock_key);
type = vnode->lock_type;
vnode->lock_state = AFS_VNODE_LOCK_SETTING;
spin_unlock(&vnode->lock);
ret = afs_set_lock(vnode, key, type);
clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
ret = afs_set_lock(vnode, key, type); /* RPC */
key_put(key);
spin_lock(&vnode->lock);
switch (ret) {
case -EWOULDBLOCK:
_debug("blocked");
break;
case 0:
_debug("acquired");
if (type == AFS_LOCK_READ)
set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
else
set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
ret = AFS_LOCK_GRANTED;
vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
/* Fall through */
default:
spin_lock(&vnode->lock);
/* the pending lock may have been withdrawn due to a
* signal */
if (list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link) == fl) {
fl->fl_u.afs.state = ret;
if (ret == AFS_LOCK_GRANTED)
afs_grant_locks(vnode, fl);
else
list_del_init(&fl->fl_u.afs.link);
wake_up(&fl->fl_wait);
spin_unlock(&vnode->lock);
} else {
/* Pass the lock or the error onto the first locker in
* the list - if they're looking for this type of lock.
* If they're not, we assume that whoever asked for it
* took a signal.
*/
if (list_empty(&vnode->pending_locks)) {
_debug("withdrawn");
clear_bit(AFS_VNODE_READLOCKED, &vnode->flags);
clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
spin_unlock(&vnode->lock);
afs_release_lock(vnode, key);
if (!list_empty(&vnode->pending_locks))
afs_lock_may_be_available(vnode);
vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
goto again;
}
break;
fl = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
if (vnode->lock_type != type) {
_debug("changed");
vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
goto again;
}
fl->fl_u.afs.state = ret;
if (ret == 0)
afs_grant_locks(vnode, fl);
else
list_del_init(&fl->fl_u.afs.link);
wake_up(&fl->fl_wait);
spin_unlock(&vnode->lock);
_leave(" [granted]");
return;
}
key_put(key);
_leave(" [pend]");
default:
/* Looks like a lock request was withdrawn. */
spin_unlock(&vnode->lock);
_leave(" [no]");
return;
}
/* looks like the lock request was withdrawn on a signal */
spin_unlock(&vnode->lock);
_leave(" [no locks]");
}
/*
......@@ -298,15 +317,105 @@ void afs_lock_work(struct work_struct *work)
* AF_RXRPC
* - the caller must hold the vnode lock
*/
static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
static void afs_defer_unlock(struct afs_vnode *vnode)
{
cancel_delayed_work(&vnode->lock_work);
if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) &&
!test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags))
BUG();
if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags))
BUG();
vnode->unlock_key = key_get(key);
_enter("");
if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED ||
vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) {
cancel_delayed_work(&vnode->lock_work);
vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK;
afs_lock_may_be_available(vnode);
}
}
/*
* Check that our view of the file metadata is up to date and check to see
* whether we think that we have a locking permit.
*/
static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key,
afs_lock_type_t type, bool can_sleep)
{
afs_access_t access;
int ret;
/* Make sure we've got a callback on this file and that our view of the
* data version is up to date.
*/
ret = afs_validate(vnode, key);
if (ret < 0)
return ret;
/* Check the permission set to see if we're actually going to be
* allowed to get a lock on this file.
*/
ret = afs_check_permit(vnode, key, &access);
if (ret < 0)
return ret;
/* At a rough estimation, you need LOCK, WRITE or INSERT perm to
* read-lock a file and WRITE or INSERT perm to write-lock a file.
*
* We can't rely on the server to do this for us since if we want to
* share a read lock that we already have, we won't go the server.
*/
if (type == AFS_LOCK_READ) {
if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK)))
return -EACCES;
if (vnode->status.lock_count == -1 && !can_sleep)
return -EAGAIN; /* Write locked */
} else {
if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE)))
return -EACCES;
if (vnode->status.lock_count != 0 && !can_sleep)
return -EAGAIN; /* Locked */
}
return 0;
}
/*
* Remove the front runner from the pending queue.
* - The caller must hold vnode->lock.
*/
static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl)
{
struct file_lock *next;
_enter("");
/* ->lock_type, ->lock_key and ->lock_state only belong to this
* file_lock if we're at the front of the pending queue or if we have
* the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING.
*/
if (vnode->granted_locks.next == &fl->fl_u.afs.link &&
vnode->granted_locks.prev == &fl->fl_u.afs.link) {
list_del_init(&fl->fl_u.afs.link);
afs_defer_unlock(vnode);
return;
}
if (!list_empty(&vnode->granted_locks) ||
vnode->pending_locks.next != &fl->fl_u.afs.link) {
list_del_init(&fl->fl_u.afs.link);
return;
}
list_del_init(&fl->fl_u.afs.link);
key_put(vnode->lock_key);
vnode->lock_key = NULL;
vnode->lock_state = AFS_VNODE_LOCK_NONE;
if (list_empty(&vnode->pending_locks))
return;
/* The new front of the queue now owns the state variables. */
next = list_entry(vnode->pending_locks.next,
struct file_lock, fl_u.afs.link);
vnode->lock_key = afs_file_key(next->fl_file);
vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
afs_lock_may_be_available(vnode);
}
......@@ -315,7 +424,7 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key)
*/
static int afs_do_setlk(struct file *file, struct file_lock *fl)
{
struct inode *inode = file_inode(file);
struct inode *inode = locks_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
afs_lock_type_t type;
struct key *key = afs_file_key(file);
......@@ -333,165 +442,136 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
spin_lock(&inode->i_lock);
/* make sure we've got a callback on this file and that our view of the
* data version is up to date */
ret = afs_validate(vnode, key);
ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP);
if (ret < 0)
goto error;
if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) {
ret = -EAGAIN;
goto error;
}
return ret;
spin_lock(&vnode->lock);
/* if we've already got a readlock on the server then we can instantly
/* If we've already got a readlock on the server then we instantly
* grant another readlock, irrespective of whether there are any
* pending writelocks */
* pending writelocks.
*/
if (type == AFS_LOCK_READ &&
vnode->flags & (1 << AFS_VNODE_READLOCKED)) {
vnode->lock_state == AFS_VNODE_LOCK_GRANTED &&
vnode->lock_type == AFS_LOCK_READ) {
_debug("instant readlock");
ASSERTCMP(vnode->flags &
((1 << AFS_VNODE_LOCKING) |
(1 << AFS_VNODE_WRITELOCKED)), ==, 0);
ASSERT(!list_empty(&vnode->granted_locks));
goto sharing_existing_lock;
goto share_existing_lock;
}
/* if there's no-one else with a lock on this vnode, then we need to
* ask the server for a lock */
if (list_empty(&vnode->pending_locks) &&
list_empty(&vnode->granted_locks)) {
_debug("not locked");
ASSERTCMP(vnode->flags &
((1 << AFS_VNODE_LOCKING) |
(1 << AFS_VNODE_READLOCKED) |
(1 << AFS_VNODE_WRITELOCKED)), ==, 0);
list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
set_bit(AFS_VNODE_LOCKING, &vnode->flags);
spin_unlock(&vnode->lock);
list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
ret = afs_set_lock(vnode, key, type);
clear_bit(AFS_VNODE_LOCKING, &vnode->flags);
switch (ret) {
case 0:
_debug("acquired");
goto acquired_server_lock;
case -EWOULDBLOCK:
_debug("would block");
spin_lock(&vnode->lock);
ASSERT(list_empty(&vnode->granted_locks));
ASSERTCMP(vnode->pending_locks.next, ==,
&fl->fl_u.afs.link);
goto wait;
default:
spin_lock(&vnode->lock);
list_del_init(&fl->fl_u.afs.link);
spin_unlock(&vnode->lock);
goto error;
}
}
if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
goto need_to_wait;
/* otherwise, we need to wait for a local lock to become available */
_debug("wait local");
list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks);
wait:
if (!(fl->fl_flags & FL_SLEEP)) {
_debug("noblock");
ret = -EAGAIN;
goto abort_attempt;
}
/* We don't have a lock on this vnode and we aren't currently waiting
* for one either, so ask the server for a lock.
*
* Note that we need to be careful if we get interrupted by a signal
* after dispatching the request as we may still get the lock, even
* though we don't wait for the reply (it's not too bad a problem - the
* lock will expire in 10 mins anyway).
*/
_debug("not locked");
vnode->lock_key = key_get(key);
vnode->lock_type = type;
vnode->lock_state = AFS_VNODE_LOCK_SETTING;
spin_unlock(&vnode->lock);
/* now we need to sleep and wait for the lock manager thread to get the
* lock from the server */
_debug("sleep");
ret = wait_event_interruptible(fl->fl_wait,
fl->fl_u.afs.state <= AFS_LOCK_GRANTED);
if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
ret = fl->fl_u.afs.state;
if (ret < 0)
goto error;
spin_lock(&vnode->lock);
goto given_lock;
}
/* we were interrupted, but someone may still be in the throes of
* giving us the lock */
_debug("intr");
ASSERTCMP(ret, ==, -ERESTARTSYS);
ret = afs_set_lock(vnode, key, type); /* RPC */
spin_lock(&vnode->lock);
if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) {
ret = fl->fl_u.afs.state;
if (ret < 0) {
spin_unlock(&vnode->lock);
goto error;
}
goto given_lock;
}
switch (ret) {
default:
goto abort_attempt;
abort_attempt:
/* we aren't going to get the lock, either because we're unwilling to
* wait, or because some signal happened */
_debug("abort");
if (list_empty(&vnode->granted_locks) &&
vnode->pending_locks.next == &fl->fl_u.afs.link) {
if (vnode->pending_locks.prev != &fl->fl_u.afs.link) {
/* kick the next pending lock into having a go */
list_del_init(&fl->fl_u.afs.link);
afs_lock_may_be_available(vnode);
}
} else {
list_del_init(&fl->fl_u.afs.link);
case -EWOULDBLOCK:
/* The server doesn't have a lock-waiting queue, so the client
* will have to retry. The server will break the outstanding
* callbacks on a file when a lock is released.
*/
_debug("would block");
ASSERT(list_empty(&vnode->granted_locks));
ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link);
vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB;
goto need_to_wait;
case 0:
_debug("acquired");
break;
}
spin_unlock(&vnode->lock);
goto error;
acquired_server_lock:
/* we've acquired a server lock, but it needs to be renewed after 5
* mins */
spin_lock(&vnode->lock);
vnode->lock_state = AFS_VNODE_LOCK_GRANTED;
afs_schedule_lock_extension(vnode);
if (type == AFS_LOCK_READ)
set_bit(AFS_VNODE_READLOCKED, &vnode->flags);
else
set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags);
sharing_existing_lock:
share_existing_lock:
/* the lock has been granted as far as we're concerned... */
fl->fl_u.afs.state = AFS_LOCK_GRANTED;
list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks);
given_lock:
/* ... but we do still need to get the VFS's blessing */
ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING)));
ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) |
(1 << AFS_VNODE_WRITELOCKED))) != 0);
spin_unlock(&vnode->lock);
ret = posix_lock_file(file, fl, NULL);
if (ret < 0)
goto vfs_rejected_lock;
spin_unlock(&vnode->lock);
/* again, make sure we've got a callback on this file and, again, make
/* Again, make sure we've got a callback on this file and, again, make
* sure that our view of the data version is up to date (we ignore
* errors incurred here and deal with the consequences elsewhere) */
* errors incurred here and deal with the consequences elsewhere).
*/
afs_validate(vnode, key);
_leave(" = 0");
return 0;
error:
spin_unlock(&inode->i_lock);
need_to_wait:
/* We're going to have to wait. Either this client doesn't have a lock
* on the server yet and we need to wait for a callback to occur, or
* the client does have a lock on the server, but it belongs to some
* other process(es) and is incompatible with the lock we want.
*/
ret = -EAGAIN;
if (fl->fl_flags & FL_SLEEP) {
spin_unlock(&vnode->lock);
_debug("sleep");
ret = wait_event_interruptible(fl->fl_wait,
fl->fl_u.afs.state != AFS_LOCK_PENDING);
spin_lock(&vnode->lock);
}
if (fl->fl_u.afs.state == AFS_LOCK_GRANTED)
goto given_lock;
if (fl->fl_u.afs.state < 0)
ret = fl->fl_u.afs.state;
abort_attempt:
/* we aren't going to get the lock, either because we're unwilling to
* wait, or because some signal happened */
_debug("abort");
afs_dequeue_lock(vnode, fl);
error_unlock:
spin_unlock(&vnode->lock);
_leave(" = %d", ret);
return ret;
vfs_rejected_lock:
/* the VFS rejected the lock we just obtained, so we have to discard
* what we just got */
/* The VFS rejected the lock we just obtained, so we have to discard
* what we just got. We defer this to the lock manager work item to
* deal with.
*/
_debug("vfs refused %d", ret);
spin_lock(&vnode->lock);
list_del_init(&fl->fl_u.afs.link);
if (list_empty(&vnode->granted_locks))
afs_defer_unlock(vnode, key);
goto abort_attempt;
afs_defer_unlock(vnode);
goto error_unlock;
}
/*
......@@ -499,34 +579,21 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
*/
static int afs_do_unlk(struct file *file, struct file_lock *fl)
{
struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
struct key *key = afs_file_key(file);
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
int ret;
_enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
/* Flush all pending writes before doing anything with locks. */
vfs_fsync(file, 0);
/* only whole-file unlocks are supported */
if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX)
return -EINVAL;
fl->fl_ops = &afs_lock_ops;
INIT_LIST_HEAD(&fl->fl_u.afs.link);
fl->fl_u.afs.state = AFS_LOCK_PENDING;
spin_lock(&vnode->lock);
ret = posix_lock_file(file, fl, NULL);
if (ret < 0) {
spin_unlock(&vnode->lock);
_leave(" = %d [vfs]", ret);
return ret;
}
/* discard the server lock only if all granted locks are gone */
if (list_empty(&vnode->granted_locks))
afs_defer_unlock(vnode, key);
spin_unlock(&vnode->lock);
_leave(" = 0");
return 0;
_leave(" = %d [%u]", ret, vnode->lock_state);
return ret;
}
/*
......@@ -534,7 +601,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
*/
static int afs_do_getlk(struct file *file, struct file_lock *fl)
{
struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host);
struct afs_vnode *vnode = AFS_FS_I(locks_inode(file));
struct key *key = afs_file_key(file);
int ret, lock_count;
......@@ -542,29 +609,25 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)