Diffstat (limited to 'drivers/md')
 drivers/md/Kconfig             |   1
 drivers/md/bitmap.c            |  13
 drivers/md/dm-cache-metadata.c | 104
 drivers/md/dm-cache-target.c   |  89
 drivers/md/dm-thin.c           |  35
 drivers/md/dm.c                |  11
 drivers/md/raid5.c             |   5
 7 files changed, 203 insertions(+), 55 deletions(-)
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 5bdedf6df153..c355a226a024 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -5,6 +5,7 @@
 menuconfig MD
         bool "Multiple devices driver support (RAID and LVM)"
         depends on BLOCK
+        select SRCU
         help
           Support multiple physical spindles through a single logical
           device. Required for RAID and logical volume management.
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index da3604e73e8a..1695ee5f3ffc 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -72,6 +72,19 @@ __acquires(bitmap->lock)
         /* this page has not been allocated yet */
 
         spin_unlock_irq(&bitmap->lock);
+        /* It is possible that this is being called inside a
+         * prepare_to_wait/finish_wait loop from raid5c:make_request().
+         * In general it is not permitted to sleep in that context as it
+         * can cause the loop to spin freely.
+         * That doesn't apply here as we can only reach this point
+         * once with any loop.
+         * When this function completes, either bp[page].map or
+         * bp[page].hijacked. In either case, this function will
+         * abort before getting to this point again. So there is
+         * no risk of a free-spin, and so it is safe to assert
+         * that sleeping here is allowed.
+         */
+        sched_annotate_sleep();
         mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
         spin_lock_irq(&bitmap->lock);
 
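The bitmap.c hunk annotates a sleep that can legally happen inside a prepare_to_wait()/finish_wait() loop: bitmap_checkpage() drops bitmap->lock, performs a sleeping GFP_NOIO allocation, retakes the lock and copes with a racing allocator, and sched_annotate_sleep() tells the sleep-inside-wait-loop debug check that this one-shot case is deliberate. Below is a minimal sketch of the same drop-lock/allocate/re-check shape; struct lazy_pages and lazy_page_alloc() are invented for illustration and are not part of the patch.

/*
 * Illustrative sketch only: a hypothetical lazily populated page array,
 * mirroring the locking pattern used by bitmap_checkpage().
 */
#include <linux/kernel.h>       /* assumed to provide sched_annotate_sleep() on kernels of this era */
#include <linux/slab.h>
#include <linux/spinlock.h>

struct lazy_pages {
        spinlock_t lock;
        void **page;            /* entries allocated on first use */
};

/* Called with lp->lock held; returns with it held again. */
static int lazy_page_alloc(struct lazy_pages *lp, unsigned long idx)
{
        void *mem;

        spin_unlock_irq(&lp->lock);

        /*
         * This can run inside a prepare_to_wait()/finish_wait() loop, but
         * only once per loop, so the sleep cannot cause a free-spin.
         */
        sched_annotate_sleep();
        mem = kzalloc(PAGE_SIZE, GFP_NOIO);

        spin_lock_irq(&lp->lock);
        if (!mem)
                return -ENOMEM;

        if (lp->page[idx])      /* lost a race; keep the winner's page */
                kfree(mem);
        else
                lp->page[idx] = mem;

        return 0;
}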
diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index 9fc616c2755e..c1c010498a21 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -94,6 +94,9 @@ struct cache_disk_superblock {
 } __packed;
 
 struct dm_cache_metadata {
+        atomic_t ref_count;
+        struct list_head list;
+
         struct block_device *bdev;
         struct dm_block_manager *bm;
         struct dm_space_map *metadata_sm;
@@ -669,10 +672,10 @@ static void unpack_value(__le64 value_le, dm_oblock_t *block, unsigned *flags)
 
 /*----------------------------------------------------------------*/
 
-struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
-                                                 sector_t data_block_size,
-                                                 bool may_format_device,
-                                                 size_t policy_hint_size)
+static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
+                                               sector_t data_block_size,
+                                               bool may_format_device,
+                                               size_t policy_hint_size)
 {
         int r;
         struct dm_cache_metadata *cmd;
@@ -680,9 +683,10 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
         cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
         if (!cmd) {
                 DMERR("could not allocate metadata struct");
-                return NULL;
+                return ERR_PTR(-ENOMEM);
         }
 
+        atomic_set(&cmd->ref_count, 1);
         init_rwsem(&cmd->root_lock);
         cmd->bdev = bdev;
         cmd->data_block_size = data_block_size;
@@ -705,10 +709,96 @@ struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
         return cmd;
 }
 
+/*
+ * We keep a little list of ref counted metadata objects to prevent two
+ * different target instances creating separate bufio instances. This is
+ * an issue if a table is reloaded before the suspend.
+ */
+static DEFINE_MUTEX(table_lock);
+static LIST_HEAD(table);
+
+static struct dm_cache_metadata *lookup(struct block_device *bdev)
+{
+        struct dm_cache_metadata *cmd;
+
+        list_for_each_entry(cmd, &table, list)
+                if (cmd->bdev == bdev) {
+                        atomic_inc(&cmd->ref_count);
+                        return cmd;
+                }
+
+        return NULL;
+}
+
+static struct dm_cache_metadata *lookup_or_open(struct block_device *bdev,
+                                                sector_t data_block_size,
+                                                bool may_format_device,
+                                                size_t policy_hint_size)
+{
+        struct dm_cache_metadata *cmd, *cmd2;
+
+        mutex_lock(&table_lock);
+        cmd = lookup(bdev);
+        mutex_unlock(&table_lock);
+
+        if (cmd)
+                return cmd;
+
+        cmd = metadata_open(bdev, data_block_size, may_format_device, policy_hint_size);
+        if (!IS_ERR(cmd)) {
+                mutex_lock(&table_lock);
+                cmd2 = lookup(bdev);
+                if (cmd2) {
+                        mutex_unlock(&table_lock);
+                        __destroy_persistent_data_objects(cmd);
+                        kfree(cmd);
+                        return cmd2;
+                }
+                list_add(&cmd->list, &table);
+                mutex_unlock(&table_lock);
+        }
+
+        return cmd;
+}
+
+static bool same_params(struct dm_cache_metadata *cmd, sector_t data_block_size)
+{
+        if (cmd->data_block_size != data_block_size) {
+                DMERR("data_block_size (%llu) different from that in metadata (%llu)\n",
+                      (unsigned long long) data_block_size,
+                      (unsigned long long) cmd->data_block_size);
+                return false;
+        }
+
+        return true;
+}
+
+struct dm_cache_metadata *dm_cache_metadata_open(struct block_device *bdev,
+                                                 sector_t data_block_size,
+                                                 bool may_format_device,
+                                                 size_t policy_hint_size)
+{
+        struct dm_cache_metadata *cmd = lookup_or_open(bdev, data_block_size,
+                                                       may_format_device, policy_hint_size);
+
+        if (!IS_ERR(cmd) && !same_params(cmd, data_block_size)) {
+                dm_cache_metadata_close(cmd);
+                return ERR_PTR(-EINVAL);
+        }
+
+        return cmd;
+}
+
 void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
 {
-        __destroy_persistent_data_objects(cmd);
-        kfree(cmd);
+        if (atomic_dec_and_test(&cmd->ref_count)) {
+                mutex_lock(&table_lock);
+                list_del(&cmd->list);
+                mutex_unlock(&table_lock);
+
+                __destroy_persistent_data_objects(cmd);
+                kfree(cmd);
+        }
 }
 
 /*
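With the dm-cache-metadata.c change, dm_cache_metadata_open() becomes a lookup-or-create against a small global table, so an inactive table load and the active table for the same cache device share one ref-counted metadata object (and therefore one bufio client) instead of creating two. The sketch below shows the same shape with invented names (struct shared_md, shared_md_get()/shared_md_put()): look up under a mutex, construct outside it, then look up again before publishing so that a racing opener wins cleanly.

/* Illustrative sketch of the share-by-bdev pattern; not the dm-cache API. */
#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct block_device;                    /* only used as a lookup key here */

struct shared_md {
        atomic_t ref_count;
        struct list_head list;
        struct block_device *bdev;
};

static DEFINE_MUTEX(shared_md_lock);
static LIST_HEAD(shared_md_table);

/* Caller must hold shared_md_lock. */
static struct shared_md *__shared_md_lookup(struct block_device *bdev)
{
        struct shared_md *m;

        list_for_each_entry(m, &shared_md_table, list)
                if (m->bdev == bdev) {
                        atomic_inc(&m->ref_count);
                        return m;
                }

        return NULL;
}

static struct shared_md *shared_md_get(struct block_device *bdev)
{
        struct shared_md *m, *m2;

        mutex_lock(&shared_md_lock);
        m = __shared_md_lookup(bdev);
        mutex_unlock(&shared_md_lock);
        if (m)
                return m;

        /* Expensive construction happens outside the lock. */
        m = kzalloc(sizeof(*m), GFP_KERNEL);
        if (!m)
                return ERR_PTR(-ENOMEM);
        atomic_set(&m->ref_count, 1);
        m->bdev = bdev;

        mutex_lock(&shared_md_lock);
        m2 = __shared_md_lookup(bdev);  /* did someone beat us to it? */
        if (m2) {
                mutex_unlock(&shared_md_lock);
                kfree(m);               /* drop ours, share theirs */
                return m2;
        }
        list_add(&m->list, &shared_md_table);
        mutex_unlock(&shared_md_lock);

        return m;
}

static void shared_md_put(struct shared_md *m)
{
        if (atomic_dec_and_test(&m->ref_count)) {
                mutex_lock(&shared_md_lock);
                list_del(&m->list);
                mutex_unlock(&shared_md_lock);
                kfree(m);
        }
}

The same_params() check in the patch then guards against a second opener asking for a different data_block_size than the object it is about to share.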
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1e96d7889f51..e1650539cc2f 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -221,7 +221,13 @@ struct cache {
         struct list_head need_commit_migrations;
         sector_t migration_threshold;
         wait_queue_head_t migration_wait;
-        atomic_t nr_migrations;
+        atomic_t nr_allocated_migrations;
+
+        /*
+         * The number of in flight migrations that are performing
+         * background io. eg, promotion, writeback.
+         */
+        atomic_t nr_io_migrations;
 
         wait_queue_head_t quiescing_wait;
         atomic_t quiescing;
@@ -258,7 +264,6 @@ struct cache {
         struct dm_deferred_set *all_io_ds;
 
         mempool_t *migration_pool;
-        struct dm_cache_migration *next_migration;
 
         struct dm_cache_policy *policy;
         unsigned policy_nr_args;
@@ -350,10 +355,31 @@ static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cel
         dm_bio_prison_free_cell(cache->prison, cell);
 }
 
+static struct dm_cache_migration *alloc_migration(struct cache *cache)
+{
+        struct dm_cache_migration *mg;
+
+        mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+        if (mg) {
+                mg->cache = cache;
+                atomic_inc(&mg->cache->nr_allocated_migrations);
+        }
+
+        return mg;
+}
+
+static void free_migration(struct dm_cache_migration *mg)
+{
+        if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
+                wake_up(&mg->cache->migration_wait);
+
+        mempool_free(mg, mg->cache->migration_pool);
+}
+
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
 {
         if (!p->mg) {
-                p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
+                p->mg = alloc_migration(cache);
                 if (!p->mg)
                         return -ENOMEM;
         }
@@ -382,7 +408,7 @@ static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
                 free_prison_cell(cache, p->cell1);
 
         if (p->mg)
-                mempool_free(p->mg, cache->migration_pool);
+                free_migration(p->mg);
 }
 
 static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
@@ -854,24 +880,14 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
  * Migration covers moving data from the origin device to the cache, or
  * vice versa.
  *--------------------------------------------------------------*/
-static void free_migration(struct dm_cache_migration *mg)
-{
-        mempool_free(mg, mg->cache->migration_pool);
-}
-
-static void inc_nr_migrations(struct cache *cache)
+static void inc_io_migrations(struct cache *cache)
 {
-        atomic_inc(&cache->nr_migrations);
+        atomic_inc(&cache->nr_io_migrations);
 }
 
-static void dec_nr_migrations(struct cache *cache)
+static void dec_io_migrations(struct cache *cache)
 {
-        atomic_dec(&cache->nr_migrations);
-
-        /*
-         * Wake the worker in case we're suspending the target.
-         */
-        wake_up(&cache->migration_wait);
+        atomic_dec(&cache->nr_io_migrations);
 }
 
 static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
@@ -894,11 +910,10 @@ static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
         wake_worker(cache);
 }
 
-static void cleanup_migration(struct dm_cache_migration *mg)
+static void free_io_migration(struct dm_cache_migration *mg)
 {
-        struct cache *cache = mg->cache;
+        dec_io_migrations(mg->cache);
         free_migration(mg);
-        dec_nr_migrations(cache);
 }
 
 static void migration_failure(struct dm_cache_migration *mg)
@@ -923,7 +938,7 @@ static void migration_failure(struct dm_cache_migration *mg)
                 cell_defer(cache, mg->new_ocell, true);
         }
 
-        cleanup_migration(mg);
+        free_io_migration(mg);
 }
 
 static void migration_success_pre_commit(struct dm_cache_migration *mg)
@@ -934,7 +949,7 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
         if (mg->writeback) {
                 clear_dirty(cache, mg->old_oblock, mg->cblock);
                 cell_defer(cache, mg->old_ocell, false);
-                cleanup_migration(mg);
+                free_io_migration(mg);
                 return;
 
         } else if (mg->demote) {
@@ -944,14 +959,14 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
                                              mg->old_oblock);
                         if (mg->promote)
                                 cell_defer(cache, mg->new_ocell, true);
-                        cleanup_migration(mg);
+                        free_io_migration(mg);
                         return;
                 }
         } else {
                 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
                         DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
                         policy_remove_mapping(cache->policy, mg->new_oblock);
-                        cleanup_migration(mg);
+                        free_io_migration(mg);
                         return;
                 }
         }
@@ -984,7 +999,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
                 } else {
                         if (mg->invalidate)
                                 policy_remove_mapping(cache->policy, mg->old_oblock);
-                        cleanup_migration(mg);
+                        free_io_migration(mg);
                 }
 
         } else {
@@ -999,7 +1014,7 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
                         bio_endio(mg->new_ocell->holder, 0);
                         cell_defer(cache, mg->new_ocell, false);
                 }
-                cleanup_migration(mg);
+                free_io_migration(mg);
         }
 }
 
@@ -1251,7 +1266,7 @@ static void promote(struct cache *cache, struct prealloc *structs,
         mg->new_ocell = cell;
         mg->start_jiffies = jiffies;
 
-        inc_nr_migrations(cache);
+        inc_io_migrations(cache);
         quiesce_migration(mg);
 }
 
@@ -1275,7 +1290,7 @@ static void writeback(struct cache *cache, struct prealloc *structs,
         mg->new_ocell = NULL;
         mg->start_jiffies = jiffies;
 
-        inc_nr_migrations(cache);
+        inc_io_migrations(cache);
         quiesce_migration(mg);
 }
 
@@ -1302,7 +1317,7 @@ static void demote_then_promote(struct cache *cache, struct prealloc *structs,
         mg->new_ocell = new_ocell;
         mg->start_jiffies = jiffies;
 
-        inc_nr_migrations(cache);
+        inc_io_migrations(cache);
         quiesce_migration(mg);
 }
 
@@ -1330,7 +1345,7 @@ static void invalidate(struct cache *cache, struct prealloc *structs,
         mg->new_ocell = NULL;
         mg->start_jiffies = jiffies;
 
-        inc_nr_migrations(cache);
+        inc_io_migrations(cache);
         quiesce_migration(mg);
 }
 
@@ -1412,7 +1427,7 @@ static void process_discard_bio(struct cache *cache, struct prealloc *structs,
 
 static bool spare_migration_bandwidth(struct cache *cache)
 {
-        sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
+        sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
                 cache->sectors_per_block;
         return current_volume < cache->migration_threshold;
 }
@@ -1764,7 +1779,7 @@ static void stop_quiescing(struct cache *cache)
 
 static void wait_for_migrations(struct cache *cache)
 {
-        wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
+        wait_event(cache->migration_wait, !atomic_read(&cache->nr_allocated_migrations));
 }
 
 static void stop_worker(struct cache *cache)
@@ -1876,9 +1891,6 @@ static void destroy(struct cache *cache)
 {
         unsigned i;
 
-        if (cache->next_migration)
-                mempool_free(cache->next_migration, cache->migration_pool);
-
         if (cache->migration_pool)
                 mempool_destroy(cache->migration_pool);
 
@@ -2424,7 +2436,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
         INIT_LIST_HEAD(&cache->quiesced_migrations);
         INIT_LIST_HEAD(&cache->completed_migrations);
         INIT_LIST_HEAD(&cache->need_commit_migrations);
-        atomic_set(&cache->nr_migrations, 0);
+        atomic_set(&cache->nr_allocated_migrations, 0);
+        atomic_set(&cache->nr_io_migrations, 0);
         init_waitqueue_head(&cache->migration_wait);
 
         init_waitqueue_head(&cache->quiescing_wait);
@@ -2487,8 +2500,6 @@ static int cache_create(struct cache_args *ca, struct cache **result)
                 goto bad;
         }
 
-        cache->next_migration = NULL;
-
         cache->need_tick_bio = true;
         cache->sized = false;
         cache->invalidate = false;
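The dm-cache-target.c change splits the old nr_migrations counter into two counters with different jobs: nr_allocated_migrations counts every migration structure in existence, including ones that are merely preallocated, and is what suspend waits on, while nr_io_migrations counts only migrations performing background I/O and is what the bandwidth throttle reads. A reduced sketch of the two roles, using an illustrative struct rather than the real struct cache:

#include <linux/atomic.h>
#include <linux/types.h>
#include <linux/wait.h>

struct migration_counters {
        atomic_t nr_allocated;          /* every allocated migration struct */
        atomic_t nr_io;                 /* only those doing background io */
        wait_queue_head_t wait;
        sector_t sectors_per_block;
        sector_t migration_threshold;
};

static void counters_init(struct migration_counters *c)
{
        atomic_set(&c->nr_allocated, 0);
        atomic_set(&c->nr_io, 0);
        init_waitqueue_head(&c->wait);
}

/* pair around the lifetime of a migration struct (alloc/free) */
static void migration_allocated(struct migration_counters *c)
{
        atomic_inc(&c->nr_allocated);
}

static void migration_freed(struct migration_counters *c)
{
        /* last one out wakes anyone trying to suspend the target */
        if (atomic_dec_and_test(&c->nr_allocated))
                wake_up(&c->wait);
}

/* pair around the background io a migration performs */
static void migration_io_begin(struct migration_counters *c)
{
        atomic_inc(&c->nr_io);
}

static void migration_io_end(struct migration_counters *c)
{
        atomic_dec(&c->nr_io);
}

/* suspend path: wait until no migration structs exist at all */
static void wait_for_all_migrations(struct migration_counters *c)
{
        wait_event(c->wait, !atomic_read(&c->nr_allocated));
}

/* throttle path: only in-flight io counts against the threshold */
static bool spare_migration_bandwidth(struct migration_counters *c)
{
        sector_t volume = (atomic_read(&c->nr_io) + 1) * c->sectors_per_block;

        return volume < c->migration_threshold;
}

Waking on the allocation counter rather than the io counter is what lets wait_for_migrations() in the patch also see preallocated-but-idle migrations before declaring the target quiesced.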
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 8735543eacdb..07705ee181e3 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1127,6 +1127,24 @@ static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
                 schedule_zero(tc, virt_block, data_dest, cell, bio);
 }
 
+static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
+
+static void check_for_space(struct pool *pool)
+{
+        int r;
+        dm_block_t nr_free;
+
+        if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
+                return;
+
+        r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
+        if (r)
+                return;
+
+        if (nr_free)
+                set_pool_mode(pool, PM_WRITE);
+}
+
 /*
  * A non-zero return indicates read_only or fail_io mode.
  * Many callers don't care about the return value.
@@ -1141,6 +1159,8 @@ static int commit(struct pool *pool)
         r = dm_pool_commit_metadata(pool->pmd);
         if (r)
                 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
+        else
+                check_for_space(pool);
 
         return r;
 }
@@ -1159,8 +1179,6 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
         }
 }
 
-static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
-
 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 {
         int r;
@@ -2155,7 +2173,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
                 pool->process_cell = process_cell_read_only;
                 pool->process_discard_cell = process_discard_cell;
                 pool->process_prepared_mapping = process_prepared_mapping;
-                pool->process_prepared_discard = process_prepared_discard_passdown;
+                pool->process_prepared_discard = process_prepared_discard;
 
                 if (!pool->pf.error_if_no_space && no_space_timeout)
                         queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -3367,6 +3385,12 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
         struct pool_c *pt = ti->private;
         struct pool *pool = pt->pool;
 
+        if (get_pool_mode(pool) >= PM_READ_ONLY) {
+                DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
+                      dm_device_name(pool->pool_md));
+                return -EINVAL;
+        }
+
         if (!strcasecmp(argv[0], "create_thin"))
                 r = process_create_thin_mesg(argc, argv, pool);
 
@@ -3814,6 +3838,8 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
                 r = -EINVAL;
                 goto bad;
         }
+        atomic_set(&tc->refcount, 1);
+        init_completion(&tc->can_destroy);
         list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
         spin_unlock_irqrestore(&tc->pool->lock, flags);
         /*
@@ -3826,9 +3852,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 
         dm_put(pool_md);
 
-        atomic_set(&tc->refcount, 1);
-        init_completion(&tc->can_destroy);
-
         return 0;
 
 bad:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4c06585bf165..2caf5b374649 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -206,6 +206,9 @@ struct mapped_device {
         /* zero-length flush that will be cloned and submitted to targets */
         struct bio flush_bio;
 
+        /* the number of internal suspends */
+        unsigned internal_suspend_count;
+
         struct dm_stats stats;
 };
 
@@ -899,7 +902,7 @@ static void disable_write_same(struct mapped_device *md)
 
 static void clone_endio(struct bio *bio, int error)
 {
-        int r = 0;
+        int r = error;
         struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
         struct dm_io *io = tio->io;
         struct mapped_device *md = tio->io->md;
@@ -2928,7 +2931,7 @@ static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_fla
 {
         struct dm_table *map = NULL;
 
-        if (dm_suspended_internally_md(md))
+        if (md->internal_suspend_count++)
                 return; /* nested internal suspend */
 
         if (dm_suspended_md(md)) {
@@ -2953,7 +2956,9 @@ static void __dm_internal_resume(struct mapped_device *md)
 {
-        if (!dm_suspended_internally_md(md))
+        BUG_ON(!md->internal_suspend_count);
+
+        if (--md->internal_suspend_count)
                 return; /* resume from nested internal suspend */
 
         if (dm_suspended_md(md))
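The dm.c hunks turn internal suspend into a nesting count: only the first __dm_internal_suspend() does the real work and only the matching last resume undoes it, so several holders can internally suspend the same device without the inner resume waking it early. (The clone_endio() one-liner in the same file appears to be a separate fix, seeding r with the passed-in error so it is not lost when a target has no end_io hook.) A stripped-down sketch of the counting discipline follows; struct suspendable is a hypothetical stand-in for struct mapped_device, and callers are assumed to serialize as dm.c's md->suspend_lock does.

#include <linux/bug.h>

/* Hypothetical stand-in for struct mapped_device. */
struct suspendable {
        unsigned internal_suspend_count;
        void (*do_suspend)(struct suspendable *s);      /* heavy lifting, done once */
        void (*do_resume)(struct suspendable *s);
};

static void internal_suspend(struct suspendable *s)
{
        if (s->internal_suspend_count++)
                return;                 /* nested: already suspended */

        s->do_suspend(s);
}

static void internal_resume(struct suspendable *s)
{
        BUG_ON(!s->internal_suspend_count);

        if (--s->internal_suspend_count)
                return;                 /* still nested: stay suspended */

        s->do_resume(s);
}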
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c1b0d52bfcb0..b98765f6f77f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3195,6 +3195,11 @@ static void handle_stripe_dirtying(struct r5conf *conf,
                           (unsigned long long)sh->sector, rcw, qread,
                           test_bit(STRIPE_DELAYED, &sh->state));
         }
+
+        if (rcw > disks && rmw > disks &&
+            !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                set_bit(STRIPE_DELAYED, &sh->state);
+
         /* now if nothing is locked, and if we have enough data,
          * we can start a write request
          */
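Finally, the raid5.c hunk stops handle_stripe_dirtying() from spinning when it can make no progress: rmw and rcw count the reads that a read-modify-write or a reconstruct-write would still need, a value above the number of disks is used to mean that scheme cannot currently be serviced, and when both are blocked while prereads are inactive the stripe is now parked with STRIPE_DELAYED instead of being re-handled immediately. A self-contained sketch of just that decision, using stand-in flag bits rather than the real stripe_head:

#include <linux/bitops.h>

/* Stand-ins for the real stripe_head state bits. */
enum {
        SKETCH_STRIPE_DELAYED,
        SKETCH_STRIPE_PREREAD_ACTIVE,
};

struct sketch_stripe {
        unsigned long state;
};

/*
 * rmw/rcw are the reads each write scheme still needs; a count above the
 * stripe width means that scheme cannot be serviced yet, so defer the
 * stripe until prereads are allowed rather than re-queueing it at once.
 */
static void maybe_delay_stripe(struct sketch_stripe *sh, int disks,
                               int rmw, int rcw)
{
        if (rcw > disks && rmw > disks &&
            !test_bit(SKETCH_STRIPE_PREREAD_ACTIVE, &sh->state))
                set_bit(SKETCH_STRIPE_DELAYED, &sh->state);
}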