Commit c435ee34 authored by David Howells

afs: Overhaul the callback handling

Overhaul the AFS callback handling by the following means:

 (1) Don't give up callback promises on vnodes that we are no longer using,
     rather let them just expire on the server or let the server break
     them.  This is actually more efficient for the server as the callback
     lookup is expensive if there are lots of extant callbacks.

 (2) Only give up the callback promises we have from a server when the
     server record is destroyed.  Then we can just give up *all* the
     callback promises on it in one go.

 (3) Servers can end up being shared between cells if cells are aliased, so
     don't add all the vnodes being backed by a particular server into a
     big FID-indexed tree on that server as there may be duplicates.

     Instead have each volume instance (~= superblock) register an interest
     in a server as it starts to make use of it and use this to allow the
     processor for callbacks from the server to find the superblock and
     thence the inode corresponding to the FID being broken by means of
     ilookup5_nowait().

 (4) Rather than iterating over the entire callback list when a mass-break
     comes in from the server, maintain a counter of mass-breaks in
     afs_server (cb_s_break) and make afs_validate() check it against the copy
     in afs_vnode.

     It would be nice not to have to take a read_lock whilst doing this,
     but that's tricky without using RCU.

 (5) Save a ref on the fileserver we're using for a call in the afs_call
     struct so that we can access its cb_s_break during call decoding.

 (6) Write-lock around callback and status storage in a vnode and read-lock
     around getattr so that we don't see the status mid-update.

This has the following consequences:

 (1) Data invalidation isn't seen until someone calls afs_validate() on a
     vnode.  Unfortunately, we need to use a key to query the server, but
     getting one from a background thread is tricky without caching loads
     of keys all over the place.

 (2) Mass invalidation isn't seen until someone calls afs_validate().

 (3) Callback breaking is going to hit the inode_hash_lock quite a bit.
     Could this be replaced with rcu_read_lock(), since inodes are destroyed
     under RCU conditions?
Signed-off-by: David Howells <dhowells@redhat.com>
parent d0676a16
......@@ -37,6 +37,7 @@ enum AFS_FS_Operations {
FSLOOKUP = 161, /* AFS lookup file in directory */
FSFETCHDATA64 = 65537, /* AFS Fetch file data */
FSSTOREDATA64 = 65538, /* AFS Store file data */
FSGIVEUPALLCALLBACKS = 65539, /* AFS Give up all outstanding callbacks on a server */
};
enum AFS_FS_Errors {
......
......@@ -20,116 +20,151 @@
#include <linux/sched.h>
#include "internal.h"
#if 0
unsigned afs_vnode_update_timeout = 10;
#endif /* 0 */
#define afs_breakring_space(server) \
CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail, \
ARRAY_SIZE((server)->cb_break))
struct workqueue_struct *afs_callback_update_worker;
/*
* allow the fileserver to request callback state (re-)initialisation
* Set up an interest-in-callbacks record for a volume on a server and
* register it with the server.
* - Called with volume->server_sem held.
*/
void afs_init_callback_state(struct afs_server *server)
int afs_register_server_cb_interest(struct afs_vnode *vnode,
struct afs_cb_interest **ppcbi,
struct afs_server *server)
{
struct afs_vnode *vnode;
struct afs_cb_interest *cbi = *ppcbi, *vcbi, *new, *x;
_enter("{%p}", server);
again:
vcbi = vnode->cb_interest;
if (vcbi) {
if (vcbi == cbi)
return 0;
spin_lock(&server->cb_lock);
if (cbi && vcbi->server == cbi->server) {
write_seqlock(&vnode->cb_lock);
vnode->cb_interest = afs_get_cb_interest(cbi);
write_sequnlock(&vnode->cb_lock);
afs_put_cb_interest(afs_v2net(vnode), cbi);
return 0;
}
/* kill all the promises on record from this server */
while (!RB_EMPTY_ROOT(&server->cb_promises)) {
vnode = rb_entry(server->cb_promises.rb_node,
struct afs_vnode, cb_promise);
_debug("UNPROMISE { vid=%x:%u uq=%u}",
vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
rb_erase(&vnode->cb_promise, &server->cb_promises);
vnode->cb_promised = false;
if (!cbi && vcbi->server == server) {
afs_get_cb_interest(vcbi);
x = cmpxchg(ppcbi, cbi, vcbi);
if (x != cbi) {
cbi = x;
afs_put_cb_interest(afs_v2net(vnode), vcbi);
goto again;
}
return 0;
}
}
spin_unlock(&server->cb_lock);
_leave("");
}
if (!cbi) {
new = kzalloc(sizeof(struct afs_cb_interest), GFP_KERNEL);
if (!new)
return -ENOMEM;
/*
* handle the data invalidation side of a callback being broken
*/
void afs_broken_callback_work(struct work_struct *work)
{
struct afs_vnode *vnode =
container_of(work, struct afs_vnode, cb_broken_work);
refcount_set(&new->usage, 1);
new->sb = vnode->vfs_inode.i_sb;
new->vid = vnode->volume->vid;
new->server = afs_get_server(server);
INIT_LIST_HEAD(&new->cb_link);
_enter("");
write_lock(&server->cb_break_lock);
list_add_tail(&new->cb_link, &server->cb_interests);
write_unlock(&server->cb_break_lock);
if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
return;
x = cmpxchg(ppcbi, cbi, new);
if (x == cbi) {
cbi = new;
} else {
cbi = x;
afs_put_cb_interest(afs_v2net(vnode), new);
}
}
/* we're only interested in dealing with a broken callback on *this*
* vnode and only if no-one else has dealt with it yet */
if (!mutex_trylock(&vnode->validate_lock))
return; /* someone else is dealing with it */
ASSERT(cbi);
if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
if (S_ISDIR(vnode->vfs_inode.i_mode))
afs_clear_permits(vnode);
/* Change the server the vnode is using. This entails scrubbing any
* interest the vnode had in the previous server it was using.
*/
write_seqlock(&vnode->cb_lock);
if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
goto out;
vnode->cb_interest = afs_get_cb_interest(cbi);
vnode->cb_s_break = cbi->server->cb_s_break;
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto out;
write_sequnlock(&vnode->cb_lock);
return 0;
}
/* if the vnode's data version number changed then its contents
* are different */
if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
afs_zap_data(vnode);
}
/*
* Set a vnode's interest on a server.
*/
void afs_set_cb_interest(struct afs_vnode *vnode, struct afs_cb_interest *cbi)
{
struct afs_cb_interest *old_cbi = NULL;
out:
mutex_unlock(&vnode->validate_lock);
if (vnode->cb_interest == cbi)
return;
/* avoid the potential race whereby the mutex_trylock() in this
* function happens again between the clear_bit() and the
* mutex_unlock() */
if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
_debug("requeue");
queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
write_seqlock(&vnode->cb_lock);
if (vnode->cb_interest != cbi) {
afs_get_cb_interest(cbi);
old_cbi = vnode->cb_interest;
vnode->cb_interest = cbi;
}
_leave("");
write_sequnlock(&vnode->cb_lock);
afs_put_cb_interest(afs_v2net(vnode), cbi);
}
/*
 * Drop a reference on a callback interest record.
 *
 * A NULL record is ignored.  When the last reference goes away, the record is
 * unhooked from its server's list of interests (if it is on one), the server
 * reference is released and the record is freed.
 */
void afs_put_cb_interest(struct afs_net *net, struct afs_cb_interest *cbi)
{
	if (!cbi)
		return;

	/* Someone else still holds a reference; nothing more to do. */
	if (!refcount_dec_and_test(&cbi->usage))
		return;

	if (!list_empty(&cbi->cb_link)) {
		/* The list is modified under the server's break lock. */
		write_lock(&cbi->server->cb_break_lock);
		list_del_init(&cbi->cb_link);
		write_unlock(&cbi->server->cb_break_lock);

		afs_put_server(net, cbi->server);
	}

	kfree(cbi);
}
/*
 * Allow the fileserver to request callback state (re-)initialisation.
 *
 * If the server is still flagged AFS_SERVER_NEW, the request just clears that
 * flag.  Otherwise the server's cb_s_break counter is bumped.
 */
void afs_init_callback_state(struct afs_server *server)
{
	if (test_and_clear_bit(AFS_SERVER_NEW, &server->flags))
		return;

	server->cb_s_break++;
}
/*
* actually break a callback
*/
static void afs_break_callback(struct afs_server *server,
struct afs_vnode *vnode)
void afs_break_callback(struct afs_vnode *vnode)
{
_enter("");
set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
write_seqlock(&vnode->cb_lock);
if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
vnode->cb_break++;
afs_clear_permits(vnode);
if (vnode->cb_promised) {
spin_lock(&vnode->lock);
_debug("break callback");
spin_lock(&server->cb_lock);
if (vnode->cb_promised) {
rb_erase(&vnode->cb_promise, &server->cb_promises);
vnode->cb_promised = false;
}
spin_unlock(&server->cb_lock);
queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
if (list_empty(&vnode->granted_locks) &&
!list_empty(&vnode->pending_locks))
afs_lock_may_be_available(vnode);
spin_unlock(&vnode->lock);
}
write_sequnlock(&vnode->cb_lock);
}
/*
......@@ -141,49 +176,31 @@ static void afs_break_callback(struct afs_server *server,
static void afs_break_one_callback(struct afs_server *server,
struct afs_fid *fid)
{
struct afs_cb_interest *cbi;
struct afs_iget_data data;
struct afs_vnode *vnode;
struct rb_node *p;
_debug("find");
spin_lock(&server->fs_lock);
p = server->fs_vnodes.rb_node;
while (p) {
vnode = rb_entry(p, struct afs_vnode, server_rb);
if (fid->vid < vnode->fid.vid)
p = p->rb_left;
else if (fid->vid > vnode->fid.vid)
p = p->rb_right;
else if (fid->vnode < vnode->fid.vnode)
p = p->rb_left;
else if (fid->vnode > vnode->fid.vnode)
p = p->rb_right;
else if (fid->unique < vnode->fid.unique)
p = p->rb_left;
else if (fid->unique > vnode->fid.unique)
p = p->rb_right;
else
goto found;
}
struct inode *inode;
/* not found so we just ignore it (it may have moved to another
* server) */
not_available:
_debug("not avail");
spin_unlock(&server->fs_lock);
_leave("");
return;
read_lock(&server->cb_break_lock);
found:
_debug("found");
ASSERTCMP(server, ==, vnode->server);
/* Step through all interested superblocks. There may be more than one
* because of cell aliasing.
*/
list_for_each_entry(cbi, &server->cb_interests, cb_link) {
if (cbi->vid != fid->vid)
continue;
if (!igrab(AFS_VNODE_TO_I(vnode)))
goto not_available;
spin_unlock(&server->fs_lock);
data.volume = NULL;
data.fid = *fid;
inode = ilookup5_nowait(cbi->sb, fid->vnode, afs_iget5_test, &data);
if (inode) {
vnode = AFS_FS_I(inode);
afs_break_callback(vnode);
iput(inode);
}
}
afs_break_callback(server, vnode);
iput(&vnode->vfs_inode);
_leave("");
read_unlock(&server->cb_break_lock);
}
/*
......@@ -214,243 +231,14 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
}
/*
* record the callback for breaking
* - the caller must hold server->cb_lock
*
* Copies the vnode's callback details into the slot at the head of the
* server's cb_break ring, advances the head, schedules or flushes the
* give-up work depending on how many entries are now pending, and finally
* removes the vnode from the server's cb_promises tree.
*/
static void afs_do_give_up_callback(struct afs_server *server,
struct afs_vnode *vnode)
{
struct afs_callback *cb;
_enter("%p,%p", server, vnode);
/* fill in the ring slot currently indexed by the head */
cb = &server->cb_break[server->cb_break_head];
cb->fid = vnode->fid;
cb->version = vnode->cb_version;
cb->expiry = vnode->cb_expiry;
cb->type = vnode->cb_type;
/* write barrier between filling the slot and advancing the head index */
smp_wmb();
/* advance the head, wrapping by masking with (ring size - 1)
* NOTE(review): the mask only wraps correctly if the ring size is a
* power of two - confirm against the cb_break declaration */
server->cb_break_head =
(server->cb_break_head + 1) &
(ARRAY_SIZE(server->cb_break) - 1);
/* defer the breaking of callbacks to try and collect as many as
* possible to ship in one operation */
switch (atomic_inc_return(&server->cb_break_n)) {
case 1 ... AFSCBMAX - 1:
/* fewer than AFSCBMAX pending: queue the work with a delay of HZ * 2 */
queue_delayed_work(afs_callback_update_worker,
&server->cb_break_work, HZ * 2);
break;
case AFSCBMAX:
/* exactly AFSCBMAX pending: ask for the work to run now */
afs_flush_callback_breaks(server);
break;
default:
/* any other count: nothing is queued from here */
break;
}
/* the vnode no longer holds a promise: drop it from the server's tree */
ASSERT(server->cb_promises.rb_node != NULL);
rb_erase(&vnode->cb_promise, &server->cb_promises);
vnode->cb_promised = false;
_leave("");
}
/*
* discard the callback on a deleted item
* Clear the callback interests in a server list.
*/
void afs_discard_callback_on_delete(struct afs_vnode *vnode)
void afs_clear_callback_interests(struct afs_net *net, struct afs_volume *volume)
{
struct afs_server *server = vnode->server;
_enter("%d", vnode->cb_promised);
int i;
if (!vnode->cb_promised) {
_leave(" [not promised]");
return;
for (i = 0; i < ARRAY_SIZE(volume->cb_interests); i++) {
afs_put_cb_interest(net, volume->cb_interests[i]);
volume->cb_interests[i] = NULL;
}
ASSERT(server != NULL);
spin_lock(&server->cb_lock);
if (vnode->cb_promised) {
ASSERT(server->cb_promises.rb_node != NULL);
rb_erase(&vnode->cb_promise, &server->cb_promises);
vnode->cb_promised = false;
}
spin_unlock(&server->cb_lock);
_leave("");
}
/*
* give up the callback registered for a vnode on the file server when the
* inode is being cleared
*
* Does nothing if the vnode has no callback promise.  Otherwise, under
* server->cb_lock, it waits (uninterruptibly) until either the promise has
* gone or the server's break ring has space, and then records the callback
* for giving up if the promise is still held.
*/
void afs_give_up_callback(struct afs_vnode *vnode)
{
struct afs_server *server = vnode->server;
DECLARE_WAITQUEUE(myself, current);
_enter("%d", vnode->cb_promised);
_debug("GIVE UP INODE %p", &vnode->vfs_inode);
/* unlocked early-out; cb_promised is rechecked under cb_lock below */
if (!vnode->cb_promised) {
_leave(" [not promised]");
return;
}
ASSERT(server != NULL);
spin_lock(&server->cb_lock);
if (vnode->cb_promised && afs_breakring_space(server) == 0) {
/* the ring is full: sleep on cb_break_waitq until the promise goes
* away or space appears in the ring */
add_wait_queue(&server->cb_break_waitq, &myself);
for (;;) {
/* set the task state before testing the condition */
set_current_state(TASK_UNINTERRUPTIBLE);
if (!vnode->cb_promised ||
afs_breakring_space(server) != 0)
break;
/* cb_lock is dropped across schedule() and retaken afterwards */
spin_unlock(&server->cb_lock);
schedule();
spin_lock(&server->cb_lock);
}
remove_wait_queue(&server->cb_break_waitq, &myself);
__set_current_state(TASK_RUNNING);
}
/* of course, it's always possible for the server to break this vnode's
* callback first... */
if (vnode->cb_promised)
afs_do_give_up_callback(server, vnode);
spin_unlock(&server->cb_lock);
_leave("");
}
/*
 * Work item handler: dispatch a deferred give-up-callbacks operation for the
 * server that embeds this work item as cb_break_work.
 */
void afs_dispatch_give_up_callbacks(struct work_struct *work)
{
	struct afs_server *srv;

	srv = container_of(work, struct afs_server, cb_break_work.work);

	_enter("");

	/* Tell the fileserver to discard the callback promises it has.  In the
	 * event of ENOMEM or some other error, we just forget that we had
	 * callbacks entirely, and the server will call us later to break them.
	 */
	afs_fs_give_up_callbacks(srv->cell->net, srv, true);
}
/*
 * Flush the outstanding callback breaks on a server by rescheduling its
 * cb_break_work item on the callback update workqueue with a zero delay.
 */
void afs_flush_callback_breaks(struct afs_server *server)
{
	mod_delayed_work(afs_callback_update_worker,
			 &server->cb_break_work,
			 0);
}
#if 0
/*
* update a bunch of callbacks
*
* NOTE(review): this function is compiled out by the surrounding #if 0.
* As written it uses `vl` and `vldb`, which it never declares, and it treats
* server->cb_promises as an rbtree when searching but as a list when
* rescheduling, so it would need rework before it could be re-enabled.
*/
static void afs_callback_updater(struct work_struct *work)
{
struct afs_server *server;
struct afs_vnode *vnode, *xvnode;
time64_t now;
long timeout;
int ret;
server = container_of(work, struct afs_server, updater);
_enter("");
now = ktime_get_real_seconds();
/* find the first vnode to update */
spin_lock(&server->cb_lock);
for (;;) {
if (RB_EMPTY_ROOT(&server->cb_promises)) {
spin_unlock(&server->cb_lock);
_leave(" [nothing]");
return;
}
vnode = rb_entry(rb_first(&server->cb_promises),
struct afs_vnode, cb_promise);
if (atomic_read(&vnode->usage) > 0)
break;
/* unused vnode: drop its promise and look at the next one */
rb_erase(&vnode->cb_promise, &server->cb_promises);
vnode->cb_promised = false;
}
/* not yet due: requeue the work for when it will be and stop */
timeout = vnode->update_at - now;
if (timeout > 0) {
queue_delayed_work(afs_vnode_update_worker,
&afs_vnode_update, timeout * HZ);
spin_unlock(&server->cb_lock);
_leave(" [nothing]");
return;
}
/* take the vnode off the update list and pin it while the lock is dropped */
list_del_init(&vnode->update);
atomic_inc(&vnode->usage);
spin_unlock(&server->cb_lock);
/* we can now perform the update */
_debug("update %s", vnode->vldb.name);
vnode->state = AFS_VL_UPDATING;
vnode->upd_rej_cnt = 0;
vnode->upd_busy_cnt = 0;
/* NOTE(review): `vl` and `vldb` are not declared in this function */
ret = afs_vnode_update_record(vl, &vldb);
switch (ret) {
case 0:
afs_vnode_apply_update(vl, &vldb);
vnode->state = AFS_VL_UPDATING;
break;
case -ENOMEDIUM:
vnode->state = AFS_VL_VOLUME_DELETED;
break;
default:
vnode->state = AFS_VL_UNCERTAIN;
break;
}
/* and then reschedule */
_debug("reschedule");
vnode->update_at = ktime_get_real_seconds() +
afs_vnode_update_timeout;
spin_lock(&server->cb_lock);
if (!list_empty(&server->cb_promises)) {
/* next update in 10 minutes, but wait at least 1 second more
* than the newest record already queued so that we don't spam
* the VL server suddenly with lots of requests
*/
xvnode = list_entry(server->cb_promises.prev,
struct afs_vnode, update);
if (vnode->update_at <= xvnode->update_at)
vnode->update_at = xvnode->update_at + 1;
/* base the next wakeup on the entry at the front of the queue */
xvnode = list_entry(server->cb_promises.next,
struct afs_vnode, update);
timeout = xvnode->update_at - now;
if (timeout < 0)
timeout = 0;
} else {
timeout = afs_vnode_update_timeout;
}
list_add_tail(&vnode->update, &server->cb_promises);
_debug("timeout %ld", timeout);
queue_delayed_work(afs_vnode_update_worker,
&afs_vnode_update, timeout * HZ);
spin_unlock(&server->cb_lock);
afs_put_vnode(vl);
}
#endif
......@@ -153,7 +153,7 @@ static void afs_cm_destructor(struct afs_call *call)
}
/*
* allow the fileserver to see if the cache manager is still alive
* The server supplied a list of callbacks that it wanted to break.
*/
static void SRXAFSCB_CallBack(struct work_struct *work)
{
......
......@@ -581,6 +581,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
struct afs_vnode *vnode, *dir;
struct afs_fid uninitialized_var(fid);
struct dentry *parent;
struct inode *inode;
struct key *key;
void *dir_version;
int ret;
......@@ -588,30 +589,39 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (flags & LOOKUP_RCU)
return -ECHILD;
if (d_really_is_positive(dentry)) {
vnode = AFS_FS_I(d_inode(dentry));
if (d_really_is_positive(dentry))
_enter("{v={%x:%u} n=%pd fl=%lx},",
vnode->fid.vid, vnode->fid.vnode, dentry,
vnode->flags);
else
} else {
_enter("{neg n=%pd}", dentry);
}
key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
if (IS_ERR(key))
key = NULL;
if (d_really_is_positive(dentry)) {
inode = d_inode(dentry);
if (inode) {
vnode = AFS_FS_I(inode);
afs_validate(vnode, key);
if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto out_bad;
}
}
/* lock down the parent dentry so we can peer at it */
parent = dget_parent(dentry);
dir = AFS_FS_I(d_inode(parent));
/* validate the parent directory */
if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
afs_validate(dir, key);
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%pd: parent dir deleted", dentry);
goto out_bad;
goto out_bad_parent;
}
dir_version = (void *) (unsigned long) dir->status.data_version;
......@@ -626,13 +636,16 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
case 0:
/* the filename maps to something */
if (d_really_is_negative(dentry))
goto out_bad;
if (is_bad_inode(d_inode(dentry))) {
goto out_bad_parent;
inode = d_inode(dentry);
if (is_bad_inode(inode)) {
printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n",
dentry);
goto out_bad;
goto out_bad_parent;
}
vnode = AFS_FS_I(inode);
/* if the vnode ID has changed, then the dirent points to a
* different file */
if (fid.vnode != vnode->fid.vnode) {
......@@ -649,10 +662,10 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
_debug("%pd: file deleted (uq %u -> %u I:%u)",
dentry, fid.unique,
vnode->fid.unique,
d_inode(dentry)->i_generation);
spin_lock(&vnode->lock);
vnode->vfs_inode.i_generation);
write_seqlock(&vnode->cb_lock);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
spin_unlock(&vnode->lock);
write_sequnlock(&vnode->cb_lock);
goto not_found;
}