Commit 6e80133f authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge git://

* git:// (31 commits)
  FS-Cache: Provide nop fscache_stat_d() if CONFIG_FSCACHE_STATS=n
  SLOW_WORK: Fix GFS2 to #include <linux/module.h> before using THIS_MODULE
  SLOW_WORK: Fix CIFS to pass THIS_MODULE to slow_work_register_user()
  CacheFiles: Don't log lookup/create failing with ENOBUFS
  CacheFiles: Catch an overly long wait for an old active object
  CacheFiles: Better showing of debugging information in active object problems
  CacheFiles: Mark parent directory locks as I_MUTEX_PARENT to keep lockdep happy
  CacheFiles: Handle truncate unlocking the page we're reading
  CacheFiles: Don't write a full page if there's only a partial page to cache
  FS-Cache: Actually requeue an object when requested
  FS-Cache: Start processing an object's operations on that object's death
  FS-Cache: Make sure FSCACHE_COOKIE_LOOKING_UP cleared on lookup failure
  FS-Cache: Add a retirement stat counter
  FS-Cache: Handle pages pending storage that get evicted under OOM conditions
  FS-Cache: Handle read request vs lookup, creation or other cache failure
  FS-Cache: Don't delete pending pages from the page-store tracking tree
  FS-Cache: Fix lock misorder in fscache_write_op()
  FS-Cache: The object-available state can't rely on the cookie to be available
  FS-Cache: Permit cache retrieval ops to be interrupted in the initial wait phase
  FS-Cache: Use radix tree preload correctly in tracking of pages to be stored
parents e3a41d7b 4fa9f4ed
......@@ -235,6 +235,7 @@ proc files.
neg=N Number of negative lookups made
pos=N Number of positive lookups made
crt=N Number of objects created by lookup
tmo=N Number of lookups timed out and requeued
Updates n=N Number of update cookie requests seen
nul=N Number of upd reqs given a NULL parent
run=N Number of upd reqs granted CPU time
......@@ -250,8 +251,10 @@ proc files.
ok=N Number of successful alloc reqs
wt=N Number of alloc reqs that waited on lookup completion
nbf=N Number of alloc reqs rejected -ENOBUFS
int=N Number of alloc reqs aborted -ERESTARTSYS
ops=N Number of alloc reqs submitted
owt=N Number of alloc reqs waited for CPU time
abt=N Number of alloc reqs aborted due to object death
Retrvls n=N Number of retrieval (read) requests seen
ok=N Number of successful retr reqs
wt=N Number of retr reqs that waited on lookup completion
......@@ -261,6 +264,7 @@ proc files.
oom=N Number of retr reqs failed -ENOMEM
ops=N Number of retr reqs submitted
owt=N Number of retr reqs waited for CPU time
abt=N Number of retr reqs aborted due to object death
Stores n=N Number of storage (write) requests seen
ok=N Number of successful store reqs
agn=N Number of store reqs on a page already pending storage
......@@ -268,12 +272,37 @@ proc files.
oom=N Number of store reqs failed -ENOMEM
ops=N Number of store reqs submitted
run=N Number of store reqs granted CPU time
pgs=N Number of pages given store req processing time
rxd=N Number of store reqs deleted from tracking tree
olm=N Number of store reqs over store limit
VmScan nos=N Number of release reqs against pages with no pending store
gon=N Number of release reqs against pages stored by time lock granted
bsy=N Number of release reqs ignored due to in-progress store
can=N Number of page stores cancelled due to release req
Ops pend=N Number of times async ops added to pending queues
run=N Number of times async ops given CPU time
enq=N Number of times async ops queued for processing
can=N Number of async ops cancelled
rej=N Number of async ops rejected due to object lookup/create failure
dfr=N Number of async ops queued for deferred release
rel=N Number of async ops released
gc=N Number of deferred-release async ops garbage collected
CacheOp alo=N Number of in-progress alloc_object() cache ops
luo=N Number of in-progress lookup_object() cache ops
luc=N Number of in-progress lookup_complete() cache ops
gro=N Number of in-progress grab_object() cache ops
upo=N Number of in-progress update_object() cache ops
dro=N Number of in-progress drop_object() cache ops
pto=N Number of in-progress put_object() cache ops
syn=N Number of in-progress sync_cache() cache ops
atc=N Number of in-progress attr_changed() cache ops
rap=N Number of in-progress read_or_alloc_page() cache ops
ras=N Number of in-progress read_or_alloc_pages() cache ops
alp=N Number of in-progress allocate_page() cache ops
als=N Number of in-progress allocate_pages() cache ops
wrp=N Number of in-progress write_page() cache ops
ucp=N Number of in-progress uncache_page() cache ops
dsp=N Number of in-progress dissociate_pages() cache ops
(*) /proc/fs/fscache/histogram
......@@ -299,6 +328,87 @@ proc files.
jiffy range covered, and the SECS field the equivalent number of seconds.
If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
list of all the objects currently allocated and allow them to be viewed
This will look something like:
[root@andromeda ~]# head /proc/fs/fscache/objects
======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
where the first set of columns before the '|' describe the object:
======= ===============================================================
OBJECT Object debugging ID (appears as OBJ%x in some debug messages)
PARENT Debugging ID of parent object
STAT Object state
CHLDN Number of child objects of this object
OPS Number of outstanding operations on this object
OOP Number of outstanding child object management operations
EX Number of outstanding exclusive operations
READS Number of outstanding read operations
EM Object's event mask
EV Events raised on this object
F Object flags
S Object slow-work work item flags
and the second set of columns describe the object's cookie, if present:
=============== =======================================================
NETFS_COOKIE_DEF Name of netfs cookie definition
TY Cookie type (IX - index, DT - data, hex - special)
FL Cookie flags
NETFS_DATA Netfs private data stored in the cookie
OBJECT_KEY Object key } 1 column, with separating comma
AUX_DATA Object aux data } presence may be configured
The data shown may be filtered by attaching the a key to an appropriate keyring
before viewing the file. Something like:
keyctl add user fscache:objlist <restrictions> @s
where <restrictions> are a selection of the following letters:
K Show hexdump of object key (don't show if not given)
A Show hexdump of object aux data (don't show if not given)
and the following paired letters:
C Show objects that have a cookie
c Show objects that don't have a cookie
B Show objects that are busy
b Show objects that aren't busy
W Show objects that have pending writes
w Show objects that don't have pending writes
R Show objects that have outstanding reads
r Show objects that don't have outstanding reads
S Show objects that have slow work queued
s Show objects that don't have slow work queued
If neither side of a letter pair is given, then both are implied. For example:
keyctl add user fscache:objlist KB @s
shows objects that are busy, and lists their object keys, but does not dump
their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is
not implied.
By default all objects and all fields will be shown.
......@@ -641,7 +641,7 @@ data file must be retired (see the relinquish cookie function below).
Furthermore, note that this does not cancel the asynchronous read or write
operation started by the read/alloc and write functions, so the page
invalidation and release functions must use:
invalidation functions must use:
bool fscache_check_page_write(struct fscache_cookie *cookie,
struct page *page);
......@@ -654,6 +654,25 @@ to see if a page is being written to the cache, and:
to wait for it to finish if it is.
When releasepage() is being implemented, a special FS-Cache function exists to
manage the heuristics of coping with vmscan trying to eject pages, which may
conflict with the cache trying to write pages to the cache (which may itself
need to allocate memory):
bool fscache_maybe_release_page(struct fscache_cookie *cookie,
struct page *page,
gfp_t gfp);
This takes the netfs cookie, and the page and gfp arguments as supplied to
releasepage(). It will return false if the page cannot be released yet for
some reason and if it returns true, the page has been uncached and can now be
To make a page available for release, this function may wait for an outstanding
storage request to complete, or it may attempt to cancel the storage request -
in which case the page will not be stored in the cache this time.
......@@ -41,6 +41,13 @@ expand files, provided the time taken to do so isn't too long.
Operations of both types may sleep during execution, thus tying up the thread
loaned to it.
A further class of work item is available, based on the slow work item class:
(*) Delayed slow work items.
These are slow work items that have a timer to defer queueing of the item for
a while.
......@@ -64,9 +71,11 @@ USING SLOW WORK ITEMS
Firstly, a module or subsystem wanting to make use of slow work items must
register its interest:
int ret = slow_work_register_user();
int ret = slow_work_register_user(struct module *module);
This will return 0 if successful, or a -ve error upon failure.
This will return 0 if successful, or a -ve error upon failure. The module
pointer should be the module interested in using this facility (almost
certainly THIS_MODULE).
Slow work items may then be set up by:
......@@ -91,6 +100,10 @@ Slow work items may then be set up by:
slow_work_init(&myitem, &myitem_ops);
delayed_slow_work_init(&myitem, &myitem_ops);
vslow_work_init(&myitem, &myitem_ops);
......@@ -102,15 +115,92 @@ A suitably set up work item can then be enqueued for processing:
int ret = slow_work_enqueue(&myitem);
This will return a -ve error if the thread pool is unable to gain a reference
on the item, 0 otherwise.
on the item, 0 otherwise, or (for delayed work):
int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay);
The items are reference counted, so there ought to be no need for a flush
operation. When all a module's slow work items have been processed, and the
operation. But as the reference counting is optional, means to cancel
existing work items are also included:
can be used to cancel pending work. The above cancel function waits for
existing work to have been executed (or prevent execution of them, depending
on timing).
When all a module's slow work items have been processed, and the
module has no further interest in the facility, it should unregister its
slow_work_unregister_user(struct module *module);
The module pointer is used to wait for all outstanding work items for that
module before completing the unregistration. This prevents the put_ref() code
from being taken away before it completes. module should almost certainly be
The slow-work facility provides a function by which it can be determined
whether or not an item is queued for later execution:
bool queued = slow_work_is_queued(struct slow_work *work);
If it returns false, then the item is not on the queue (it may be executing
with a requeue pending). This can be used to work out whether an item on which
another depends is on the queue, thus allowing a dependent item to be queued
after it.
If the above shows an item on which another depends not to be queued, then the
owner of the dependent item might need to wait. However, to avoid locking up
the threads unnecessarily be sleeping in them, it can make sense under some
circumstances to return the work item to the queue, thus deferring it until
some other items have had a chance to make use of the yielded thread.
To yield a thread and defer an item, the work function should simply enqueue
the work item again and return. However, this doesn't work if there's nothing
actually on the queue, as the thread just vacated will jump straight back into
the item's work function, thus busy waiting on a CPU.
Instead, the item should use the thread to wait for the dependency to go away,
but rather than using schedule() or schedule_timeout() to sleep, it should use
the following function:
bool requeue = slow_work_sleep_till_thread_needed(
struct slow_work *work,
signed long *_timeout);
This will add a second wait and then sleep, such that it will be woken up if
either something appears on the queue that could usefully make use of the
thread - and behind which this item can be queued, or if the event the caller
set up to wait for happens. True will be returned if something else appeared
on the queue and this work function should perhaps return, of false if
something else woke it up. The timeout is as for schedule_timeout().
For example:
wq = bit_waitqueue(&my_flags, MY_BIT);
requeue = false;
do {
prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
if (!test_bit(MY_BIT, &my_flags))
requeue = slow_work_sleep_till_thread_needed(&my_work,
} while (timeout > 0 && !requeue);
finish_wait(wq, &wait);
if (!test_bit(MY_BIT, &my_flags)
goto do_my_thing;
if (requeue)
return; // to slow_work
......@@ -118,7 +208,8 @@ ITEM OPERATIONS
Each work item requires a table of operations of type struct slow_work_ops.
All members are required:
Only ->execute() is required; the getting and putting of a reference and the
describing of an item are all optional.
(*) Get a reference on an item:
......@@ -148,6 +239,16 @@ All members are required:
This should perform the work required of the item. It may sleep, it may
perform disk I/O and it may wait for locks.
(*) View an item through /proc:
void (*desc)(struct slow_work *work, struct seq_file *m);
If supplied, this should print to 'm' a small string describing the work
the item is to do. This should be no more than about 40 characters, and
shouldn't include a newline character.
See the 'Viewing executing and queued items' section below.
......@@ -172,3 +273,50 @@ The slow-work thread pool has a number of configurables:
is bounded to between 1 and one fewer than the number of active threads.
This ensures there is always at least one thread that can process very
slow work items, and always at least one thread that won't.
If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
through which the list of work items being executed and the queues of items to
be executed may be viewed. The owner of a work item is given the chance to
add some information of its own.
The contents look something like the following:
=== ===== ================ == ===== ==========
0 3005 ffff880023f52348 a 952ms FSC: OBJ17d3: LOOK
1 3006 ffff880024e33668 2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
2 3165 ffff8800296dd180 a 424ms FSC: OBJ17e4: LOOK
3 4089 ffff8800262c8d78 a 212ms FSC: OBJ17ea: CRTN
4 4090 ffff88002792bed8 2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
5 4092 ffff88002a0ef308 2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
6 4094 ffff88002abaf4b8 2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
7 4095 ffff88002bb188e0 a 388ms FSC: OBJ17e9: CRTN
vsq - ffff880023d99668 1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
vsq - ffff8800295d1740 1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
vsq - ffff880025ba3308 1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
vsq - ffff880024ec83e0 1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
vsq - ffff880026618e00 1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
vsq - ffff880025a2a4b8 1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
vsq - ffff880023cbe6d8 9 212ms FSC: OBJ17eb: LOOK
vsq - ffff880024d37590 9 212ms FSC: OBJ17ec: LOOK
vsq - ffff880027746cb0 9 212ms FSC: OBJ17ed: LOOK
vsq - ffff880024d37ae8 9 212ms FSC: OBJ17ee: LOOK
vsq - ffff880024d37cb0 9 212ms FSC: OBJ17ef: LOOK
vsq - ffff880025036550 9 212ms FSC: OBJ17f0: LOOK
vsq - ffff8800250368e0 9 212ms FSC: OBJ17f1: LOOK
vsq - ffff880025036aa8 9 212ms FSC: OBJ17f2: LOOK
In the 'THR' column, executing items show the thread they're occupying and
queued threads indicate which queue they're on. 'PID' shows the process ID of
a slow-work thread that's executing something. 'FL' shows the work item flags.
'MARK' indicates how long since an item was queued or began executing. Lastly,
the 'DESC' column permits the owner of an item to give some information.
......@@ -343,18 +343,7 @@ int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
if (PageFsCache(page)) {
if (fscache_check_page_write(vcookie->fscache, page)) {
if (!(gfp & __GFP_WAIT))
return 0;
fscache_wait_on_page_write(vcookie->fscache, page);
fscache_uncache_page(vcookie->fscache, page);
return 1;
return fscache_maybe_release_page(vnode->cache, page, gfp);
void __v9fs_fscache_invalidate_page(struct page *page)
......@@ -368,7 +357,6 @@ void __v9fs_fscache_invalidate_page(struct page *page)
fscache_wait_on_page_write(vcookie->fscache, page);
fscache_uncache_page(vcookie->fscache, page);
......@@ -315,7 +315,6 @@ static void afs_invalidatepage(struct page *page, unsigned long offset)
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
fscache_wait_on_page_write(vnode->cache, page);
fscache_uncache_page(vnode->cache, page);
......@@ -349,17 +348,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
if (PageFsCache(page)) {
if (fscache_check_page_write(vnode->cache, page)) {
if (!(gfp_flags & __GFP_WAIT)) {
_leave(" = F [cache busy]");
return 0;
fscache_wait_on_page_write(vnode->cache, page);
fscache_uncache_page(vnode->cache, page);
if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
_leave(" = F [cache busy]");
return 0;
......@@ -114,8 +114,9 @@ static struct fscache_object *cachefiles_alloc_object(
* attempt to look up the nominated node in this cache
* - return -ETIMEDOUT to be scheduled again
static void cachefiles_lookup_object(struct fscache_object *_object)
static int cachefiles_lookup_object(struct fscache_object *_object)
struct cachefiles_lookup_data *lookup_data;
struct cachefiles_object *parent, *object;
......@@ -145,13 +146,15 @@ static void cachefiles_lookup_object(struct fscache_object *_object)
object->fscache.cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX)
if (ret < 0) {
printk(KERN_WARNING "CacheFiles: Lookup failed error %d\n",
if (ret < 0 && ret != -ETIMEDOUT) {
if (ret != -ENOBUFS)
"CacheFiles: Lookup failed error %d\n", ret);
_leave(" [%d]", ret);
return ret;
......@@ -331,6 +334,7 @@ static void cachefiles_put_object(struct fscache_object *_object)
cache = object->fscache.cache;
kmem_cache_free(cachefiles_object_jar, object);
......@@ -403,12 +407,26 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
if (oi_size == ni_size)
return 0;
newattrs.ia_size = ni_size;
newattrs.ia_valid = ATTR_SIZE;
cachefiles_begin_secure(cache, &saved_cred);
/* if there's an extension to a partial page at the end of the backing
* file, we need to discard the partial page so that we pick up new
* data after it */
if (oi_size & ~PAGE_MASK && ni_size > oi_size) {
_debug("discard tail %llx", oi_size);
newattrs.ia_valid = ATTR_SIZE;
newattrs.ia_size = oi_size & PAGE_MASK;
ret = notify_change(object->backer, &newattrs);
if (ret < 0)
goto truncate_failed;
newattrs.ia_valid = ATTR_SIZE;
newattrs.ia_size = ni_size;
ret = notify_change(object->backer, &newattrs);
cachefiles_end_secure(cache, saved_cred);
......@@ -21,17 +21,81 @@
#include <linux/security.h>
#include "internal.h"
static int cachefiles_wait_bit(void *flags)
* dump debugging info about an object
static noinline
void __cachefiles_printk_object(struct cachefiles_object *object,
const char *prefix,
u8 *keybuf)
return 0;
struct fscache_cookie *cookie;
unsigned keylen, loop;
printk(KERN_ERR "%sobject: OBJ%x\n",
prefix, object->fscache.debug_id);
printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n",
prefix, fscache_object_states[object->fscache.state],
object->fscache.flags, object->,
object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK);
printk(KERN_ERR "%sops=%u inp=%u exc=%u\n",
prefix, object->fscache.n_ops, object->fscache.n_in_progress,
printk(KERN_ERR "%sparent=%p\n",
prefix, object->fscache.parent);
cookie = object->fscache.cookie;
if (cookie) {
printk(KERN_ERR "%scookie=%p [pr=%p nd=%p fl=%lx]\n",
if (keybuf)
keylen = cookie->def->get_key(cookie->netfs_data, keybuf,
keylen = 0;
} else {
printk(KERN_ERR "%scookie=NULL\n", prefix);
keylen = 0;
if (keylen) {
printk(KERN_ERR "%skey=[%u] '", prefix, keylen);
for (loop = 0; loop < keylen; loop++)
printk("%02x", keybuf[loop]);
* dump debugging info about a pair of objects
static noinline void cachefiles_printk_object(struct cachefiles_object *object,
struct cachefiles_object *xobject)
u8 *keybuf;
if (object)
__cachefiles_printk_object(object, "", keybuf);
if (xobject)
__cachefiles_printk_object(xobject, "x", keybuf);
* record the fact that an object is now active
static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
struct cachefiles_object *object)
static int cachefiles_mark_object_active(struct cachefiles_cache *cache,
struct cachefiles_object *object)
struct cachefiles_object *xobject;
struct rb_node **_p, *_parent = NULL;
......@@ -42,8 +106,11 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags))
if (test_and_set_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) {
printk(KERN_ERR "CacheFiles: Error: Object already active\n");
cachefiles_printk_object(object, NULL);
dentry = object->dentry;
_p = &cache->active_nodes.rb_node;
......@@ -66,8 +133,8 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
rb_insert_color(&object->active_node, &cache->active_nodes);
_leave(" = 0");
return 0;
/* an old object from a previous incarnation is hogging the slot - we
* need to wait for it to be destroyed */
......@@ -76,44 +143,70 @@ static void cachefiles_mark_object_active(struct cachefiles_cache *cache,
printk(KERN_ERR "\n");
printk(KERN_ERR "CacheFiles: Error:"