Commit 93e1585e authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'dm-3.13-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm

Pull device mapper fixes from Mike Snitzer:
 "A set of device-mapper fixes for 3.13.

  A fix for possible memory corruption during DM table load, fix a
  possible leak of snapshot space in case of a crash, fix a possible
  deadlock due to a shared workqueue in the delay target, fix to
  initialize read-only module parameters that are used to export metrics
  for dm stats and dm bufio.

  Quite a few stable fixes were identified for both the thin-
  provisioning and caching targets as a result of increased regression
  testing using the device-mapper-test-suite (dmts).  The most notable
  of these are the reference counting fixes for the space map btree that
  is used by the dm-array interface -- without these the dm-cache
  metadata will leak, resulting in dm-cache devices running out of
  metadata blocks.  Also, some important fixes related to the
  thin-provisioning target's transition to read-only mode on error"

* tag 'dm-3.13-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm:
  dm array: fix a reference counting bug in shadow_ablock
  dm space map: disallow decrementing a reference count below zero
  dm stats: initialize read-only module parameter
  dm bufio: initialize read-only module parameters
  dm cache: actually resize cache
  dm cache: update Documentation for invalidate_cblocks's range syntax
  dm cache policy mq: fix promotions to occur as expected
  dm thin: allow pool in read-only mode to transition to read-write mode
  dm thin: re-establish read-only state when switching to fail mode
  dm thin: always fallback the pool mode if commit fails
  dm thin: switch to read-only mode if metadata space is exhausted
  dm thin: switch to read only mode if a mapping insert fails
  dm space map metadata: return on failure in sm_metadata_new_block
  dm table: fail dm_table_create on dm_round_up overflow
  dm snapshot: avoid snapshot space leak on crash
  dm delay: fix a possible deadlock due to shared workqueue
parents 1008ebb6 ed9571f0
......@@ -266,10 +266,12 @@ E.g.
Invalidation is removing an entry from the cache without writing it
back. Cache blocks can be invalidated via the invalidate_cblocks
message, which takes an arbitrary number of cblock ranges. Each cblock
must be expressed as a decimal value, in the future a variant message
that takes cblock ranges expressed in hexidecimal may be needed to
better support efficient invalidation of larger caches. The cache must
be in passthrough mode when invalidate_cblocks is used.
range's end value is "one past the end", meaning 5-10 expresses a range
of values from 5 to 9. Each cblock must be expressed as a decimal
value, in the future a variant message that takes cblock ranges
expressed in hexidecimal may be needed to better support efficient
invalidation of larger caches. The cache must be in passthrough mode
when invalidate_cblocks is used.
invalidate_cblocks [<cblock>|<cblock begin>-<cblock end>]*
......
......@@ -1717,6 +1717,11 @@ static int __init dm_bufio_init(void)
{
__u64 mem;
dm_bufio_allocated_kmem_cache = 0;
dm_bufio_allocated_get_free_pages = 0;
dm_bufio_allocated_vmalloc = 0;
dm_bufio_current_allocated = 0;
memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
......
......@@ -730,15 +730,18 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
int r = 0;
bool updated = updated_this_tick(mq, e);
requeue_and_update_tick(mq, e);
if ((!discarded_oblock && updated) ||
!should_promote(mq, e, discarded_oblock, data_dir))
!should_promote(mq, e, discarded_oblock, data_dir)) {
requeue_and_update_tick(mq, e);
result->op = POLICY_MISS;
else if (!can_migrate)
} else if (!can_migrate)
r = -EWOULDBLOCK;
else
else {
requeue_and_update_tick(mq, e);
r = pre_cache_to_cache(mq, e, result);
}
return r;
}
......
......@@ -2755,7 +2755,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
{
int r;
r = dm_cache_resize(cache->cmd, cache->cache_size);
r = dm_cache_resize(cache->cmd, new_size);
if (r) {
DMERR("could not resize cache metadata");
return r;
......
......@@ -20,6 +20,7 @@
struct delay_c {
struct timer_list delay_timer;
struct mutex timer_lock;
struct workqueue_struct *kdelayd_wq;
struct work_struct flush_expired_bios;
struct list_head delayed_bios;
atomic_t may_delay;
......@@ -45,14 +46,13 @@ struct dm_delay_info {
static DEFINE_MUTEX(delayed_bios_lock);
static struct workqueue_struct *kdelayd_wq;
static struct kmem_cache *delayed_cache;
static void handle_delayed_timer(unsigned long data)
{
struct delay_c *dc = (struct delay_c *)data;
queue_work(kdelayd_wq, &dc->flush_expired_bios);
queue_work(dc->kdelayd_wq, &dc->flush_expired_bios);
}
static void queue_timeout(struct delay_c *dc, unsigned long expires)
......@@ -191,6 +191,12 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_dev_write;
}
dc->kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!dc->kdelayd_wq) {
DMERR("Couldn't start kdelayd");
goto bad_queue;
}
setup_timer(&dc->delay_timer, handle_delayed_timer, (unsigned long)dc);
INIT_WORK(&dc->flush_expired_bios, flush_expired_bios);
......@@ -203,6 +209,8 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = dc;
return 0;
bad_queue:
mempool_destroy(dc->delayed_pool);
bad_dev_write:
if (dc->dev_write)
dm_put_device(ti, dc->dev_write);
......@@ -217,7 +225,7 @@ static void delay_dtr(struct dm_target *ti)
{
struct delay_c *dc = ti->private;
flush_workqueue(kdelayd_wq);
destroy_workqueue(dc->kdelayd_wq);
dm_put_device(ti, dc->dev_read);
......@@ -350,12 +358,6 @@ static int __init dm_delay_init(void)
{
int r = -ENOMEM;
kdelayd_wq = alloc_workqueue("kdelayd", WQ_MEM_RECLAIM, 0);
if (!kdelayd_wq) {
DMERR("Couldn't start kdelayd");
goto bad_queue;
}
delayed_cache = KMEM_CACHE(dm_delay_info, 0);
if (!delayed_cache) {
DMERR("Couldn't create delayed bio cache.");
......@@ -373,8 +375,6 @@ static int __init dm_delay_init(void)
bad_register:
kmem_cache_destroy(delayed_cache);
bad_memcache:
destroy_workqueue(kdelayd_wq);
bad_queue:
return r;
}
......@@ -382,7 +382,6 @@ static void __exit dm_delay_exit(void)
{
dm_unregister_target(&delay_target);
kmem_cache_destroy(delayed_cache);
destroy_workqueue(kdelayd_wq);
}
/* Module hooks */
......
......@@ -66,6 +66,18 @@ struct dm_snapshot {
atomic_t pending_exceptions_count;
/* Protected by "lock" */
sector_t exception_start_sequence;
/* Protected by kcopyd single-threaded callback */
sector_t exception_complete_sequence;
/*
* A list of pending exceptions that completed out of order.
* Protected by kcopyd single-threaded callback.
*/
struct list_head out_of_order_list;
mempool_t *pending_pool;
struct dm_exception_table pending;
......@@ -173,6 +185,14 @@ struct dm_snap_pending_exception {
*/
int started;
/* There was copying error. */
int copy_error;
/* A sequence number, it is used for in-order completion. */
sector_t exception_sequence;
struct list_head out_of_order_entry;
/*
* For writing a complete chunk, bypassing the copy.
*/
......@@ -1094,6 +1114,9 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
s->valid = 1;
s->active = 0;
atomic_set(&s->pending_exceptions_count, 0);
s->exception_start_sequence = 0;
s->exception_complete_sequence = 0;
INIT_LIST_HEAD(&s->out_of_order_list);
init_rwsem(&s->lock);
INIT_LIST_HEAD(&s->list);
spin_lock_init(&s->pe_lock);
......@@ -1443,6 +1466,19 @@ static void commit_callback(void *context, int success)
pending_complete(pe, success);
}
static void complete_exception(struct dm_snap_pending_exception *pe)
{
struct dm_snapshot *s = pe->snap;
if (unlikely(pe->copy_error))
pending_complete(pe, 0);
else
/* Update the metadata if we are persistent */
s->store->type->commit_exception(s->store, &pe->e,
commit_callback, pe);
}
/*
* Called when the copy I/O has finished. kcopyd actually runs
* this code so don't block.
......@@ -1452,13 +1488,32 @@ static void copy_callback(int read_err, unsigned long write_err, void *context)
struct dm_snap_pending_exception *pe = context;
struct dm_snapshot *s = pe->snap;
if (read_err || write_err)
pending_complete(pe, 0);
pe->copy_error = read_err || write_err;
else
/* Update the metadata if we are persistent */
s->store->type->commit_exception(s->store, &pe->e,
commit_callback, pe);
if (pe->exception_sequence == s->exception_complete_sequence) {
s->exception_complete_sequence++;
complete_exception(pe);
while (!list_empty(&s->out_of_order_list)) {
pe = list_entry(s->out_of_order_list.next,
struct dm_snap_pending_exception, out_of_order_entry);
if (pe->exception_sequence != s->exception_complete_sequence)
break;
s->exception_complete_sequence++;
list_del(&pe->out_of_order_entry);
complete_exception(pe);
}
} else {
struct list_head *lh;
struct dm_snap_pending_exception *pe2;
list_for_each_prev(lh, &s->out_of_order_list) {
pe2 = list_entry(lh, struct dm_snap_pending_exception, out_of_order_entry);
if (pe2->exception_sequence < pe->exception_sequence)
break;
}
list_add(&pe->out_of_order_entry, lh);
}
}
/*
......@@ -1553,6 +1608,8 @@ __find_pending_exception(struct dm_snapshot *s,
return NULL;
}
pe->exception_sequence = s->exception_start_sequence++;
dm_insert_exception(&s->pending, &pe->e);
return pe;
......@@ -2192,7 +2249,7 @@ static struct target_type origin_target = {
static struct target_type snapshot_target = {
.name = "snapshot",
.version = {1, 11, 1},
.version = {1, 12, 0},
.module = THIS_MODULE,
.ctr = snapshot_ctr,
.dtr = snapshot_dtr,
......
......@@ -964,6 +964,7 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
int __init dm_statistics_init(void)
{
shared_memory_amount = 0;
dm_stat_need_rcu_barrier = 0;
return 0;
}
......
......@@ -200,6 +200,11 @@ int dm_table_create(struct dm_table **result, fmode_t mode,
num_targets = dm_round_up(num_targets, KEYS_PER_NODE);
if (!num_targets) {
kfree(t);
return -ENOMEM;
}
if (alloc_targets(t, num_targets)) {
kfree(t);
return -ENOMEM;
......
......@@ -1697,6 +1697,14 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
up_write(&pmd->root_lock);
}
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
down_write(&pmd->root_lock);
pmd->read_only = false;
dm_bm_set_read_write(pmd->bm);
up_write(&pmd->root_lock);
}
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
dm_block_t threshold,
dm_sm_threshold_fn fn,
......
......@@ -193,6 +193,7 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_siz
* that nothing is changing.
*/
void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd);
void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd);
int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
dm_block_t threshold,
......
......@@ -640,7 +640,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
*/
r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
if (r) {
DMERR_LIMIT("dm_thin_insert_block() failed");
DMERR_LIMIT("%s: dm_thin_insert_block() failed: error = %d",
dm_device_name(pool->pool_md), r);
set_pool_mode(pool, PM_READ_ONLY);
cell_error(pool, m->cell);
goto out;
}
......@@ -881,32 +883,23 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
}
}
static int commit(struct pool *pool)
{
int r;
r = dm_pool_commit_metadata(pool->pmd);
if (r)
DMERR_LIMIT("%s: commit failed: error = %d",
dm_device_name(pool->pool_md), r);
return r;
}
/*
* A non-zero return indicates read_only or fail_io mode.
* Many callers don't care about the return value.
*/
static int commit_or_fallback(struct pool *pool)
static int commit(struct pool *pool)
{
int r;
if (get_pool_mode(pool) != PM_WRITE)
return -EINVAL;
r = commit(pool);
if (r)
r = dm_pool_commit_metadata(pool->pmd);
if (r) {
DMERR_LIMIT("%s: dm_pool_commit_metadata failed: error = %d",
dm_device_name(pool->pool_md), r);
set_pool_mode(pool, PM_READ_ONLY);
}
return r;
}
......@@ -943,7 +936,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
* Try to commit to see if that will free up some
* more space.
*/
(void) commit_or_fallback(pool);
r = commit(pool);
if (r)
return r;
r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
if (r)
......@@ -957,7 +952,7 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
* table reload).
*/
if (!free_blocks) {
DMWARN("%s: no free space available.",
DMWARN("%s: no free data space available.",
dm_device_name(pool->pool_md));
spin_lock_irqsave(&pool->lock, flags);
pool->no_free_space = 1;
......@@ -967,8 +962,16 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
}
r = dm_pool_alloc_data_block(pool->pmd, result);
if (r)
if (r) {
if (r == -ENOSPC &&
!dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks) &&
!free_blocks) {
DMWARN("%s: no free metadata space available.",
dm_device_name(pool->pool_md));
set_pool_mode(pool, PM_READ_ONLY);
}
return r;
}
return 0;
}
......@@ -1349,7 +1352,7 @@ static void process_deferred_bios(struct pool *pool)
if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
return;
if (commit_or_fallback(pool)) {
if (commit(pool)) {
while ((bio = bio_list_pop(&bios)))
bio_io_error(bio);
return;
......@@ -1397,6 +1400,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
case PM_FAIL:
DMERR("%s: switching pool to failure mode",
dm_device_name(pool->pool_md));
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_fail;
pool->process_discard = process_bio_fail;
pool->process_prepared_mapping = process_prepared_mapping_fail;
......@@ -1421,6 +1425,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode mode)
break;
case PM_WRITE:
dm_pool_metadata_read_write(pool->pmd);
pool->process_bio = process_bio;
pool->process_discard = process_discard;
pool->process_prepared_mapping = process_prepared_mapping;
......@@ -1637,12 +1642,19 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
struct pool_c *pt = ti->private;
/*
* We want to make sure that degraded pools are never upgraded.
* We want to make sure that a pool in PM_FAIL mode is never upgraded.
*/
enum pool_mode old_mode = pool->pf.mode;
enum pool_mode new_mode = pt->adjusted_pf.mode;
if (old_mode > new_mode)
/*
* If we were in PM_FAIL mode, rollback of metadata failed. We're
* not going to recover without a thin_repair. So we never let the
* pool move out of the old mode. On the other hand a PM_READ_ONLY
* may have been due to a lack of metadata or data space, and may
* now work (ie. if the underlying devices have been resized).
*/
if (old_mode == PM_FAIL)
new_mode = old_mode;
pool->ti = ti;
......@@ -2266,7 +2278,7 @@ static int pool_preresume(struct dm_target *ti)
return r;
if (need_commit1 || need_commit2)
(void) commit_or_fallback(pool);
(void) commit(pool);
return 0;
}
......@@ -2293,7 +2305,7 @@ static void pool_postsuspend(struct dm_target *ti)
cancel_delayed_work(&pool->waker);
flush_workqueue(pool->wq);
(void) commit_or_fallback(pool);
(void) commit(pool);
}
static int check_arg_count(unsigned argc, unsigned args_required)
......@@ -2427,7 +2439,7 @@ static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct
if (r)
return r;
(void) commit_or_fallback(pool);
(void) commit(pool);
r = dm_pool_reserve_metadata_snap(pool->pmd);
if (r)
......@@ -2489,7 +2501,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
if (!r)
(void) commit_or_fallback(pool);
(void) commit(pool);
return r;
}
......@@ -2544,7 +2556,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
/* Commit to ensure statistics aren't out-of-date */
if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
(void) commit_or_fallback(pool);
(void) commit(pool);
r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
if (r) {
......
......@@ -317,8 +317,16 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
* The shadow op will often be a noop. Only insert if it really
* copied data.
*/
if (dm_block_location(*block) != b)
if (dm_block_location(*block) != b) {
/*
* dm_tm_shadow_block will have already decremented the old
* block, but it is still referenced by the btree. We
* increment to stop the insert decrementing it below zero
* when overwriting the old value.
*/
dm_tm_inc(info->btree_info.tm, b);
r = insert_ablock(info, index, *block, root);
}
return r;
}
......
......@@ -626,6 +626,12 @@ void dm_bm_set_read_only(struct dm_block_manager *bm)
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_only);
void dm_bm_set_read_write(struct dm_block_manager *bm)
{
bm->read_only = false;
}
EXPORT_SYMBOL_GPL(dm_bm_set_read_write);
u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor)
{
return crc32c(~(u32) 0, data, len) ^ init_xor;
......
......@@ -108,9 +108,9 @@ int dm_bm_unlock(struct dm_block *b);
int dm_bm_flush_and_unlock(struct dm_block_manager *bm,
struct dm_block *superblock);
/*
* Request data be prefetched into the cache.
*/
/*
* Request data is prefetched into the cache.
*/
void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
/*
......@@ -125,6 +125,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
* be returned if you do.
*/
void dm_bm_set_read_only(struct dm_block_manager *bm);
void dm_bm_set_read_write(struct dm_block_manager *bm);
u32 dm_bm_checksum(const void *data, size_t len, u32 init_xor);
......
......@@ -381,7 +381,7 @@ int sm_ll_find_free_block(struct ll_disk *ll, dm_block_t begin,
}
static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
uint32_t (*mutator)(void *context, uint32_t old),
int (*mutator)(void *context, uint32_t old, uint32_t *new),
void *context, enum allocation_event *ev)
{
int r;
......@@ -410,11 +410,17 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
if (old > 2) {
r = sm_ll_lookup_big_ref_count(ll, b, &old);
if (r < 0)
if (r < 0) {
dm_tm_unlock(ll->tm, nb);
return r;
}
}
ref_count = mutator(context, old);
r = mutator(context, old, &ref_count);
if (r) {
dm_tm_unlock(ll->tm, nb);
return r;
}
if (ref_count <= 2) {
sm_set_bitmap(bm_le, bit, ref_count);
......@@ -465,9 +471,10 @@ static int sm_ll_mutate(struct ll_disk *ll, dm_block_t b,
return ll->save_ie(ll, index, &ie_disk);
}
static uint32_t set_ref_count(void *context, uint32_t old)
static int set_ref_count(void *context, uint32_t old, uint32_t *new)
{
return *((uint32_t *) context);
*new = *((uint32_t *) context);
return 0;
}
int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
......@@ -476,9 +483,10 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
return sm_ll_mutate(ll, b, set_ref_count, &ref_count, ev);
}
static uint32_t inc_ref_count(void *context, uint32_t old)
static int inc_ref_count(void *context, uint32_t old, uint32_t *new)
{
return old + 1;
*new = old + 1;
return 0;
}
int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
......@@ -486,9 +494,15 @@ int sm_ll_inc(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
return sm_ll_mutate(ll, b, inc_ref_count, NULL, ev);
}
static uint32_t dec_ref_count(void *context, uint32_t old)
static int dec_ref_count(void *context, uint32_t old, uint32_t *new)
{
return old - 1;
if (!old) {
DMERR_LIMIT("unable to decrement a reference count below 0");
return -EINVAL;
}
*new = old - 1;
return 0;
}
int sm_ll_dec(struct ll_disk *ll, dm_block_t b, enum allocation_event *ev)
......
......@@ -384,12 +384,16 @@ static int sm_metadata_new_block(struct dm_space_map *sm, dm_block_t *b)
struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
int r = sm_metadata_new_block_(sm, b);
if (r)
if (r) {
DMERR("unable to allocate new metadata block");
return r;
}
r = sm_metadata_get_nr_free(sm, &count);
if (r)
if (r) {
DMERR("couldn't get free block count");
return r;
}
check_threshold(&smm->threshold, count);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment