From 1974a3b42d8cf7a9c74f1e0310c593023617037a Mon Sep 17 00:00:00 2001 From: Miao Xie <miaox@cn.fujitsu.com> Date: Wed, 5 Jan 2011 10:07:24 +0000 Subject: btrfs: fix wrong calculation of stripe size There are two tiny problems: - One is that when we check whether the chunk size is greater than the max chunk size, we should take mirrors into account, but the original code didn't. - The other is that btrfs shouldn't use the size of the residual free space as the length of a dup chunk when doing chunk allocation. This is because the device space that a dup chunk needs is twice as large as the chunk size; if we use the size of the residual free space as the length of a dup chunk, we cannot get enough free space. Fix it. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Reviewed-by: Josef Bacik <josef@redhat.com> Signed-off-by: Chris Mason <chris.mason@oracle.com> --- fs/btrfs/volumes.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 177b73179590..c50a85e0d08f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2177,6 +2177,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int num_stripes = 1; int min_stripes = 1; int sub_stripes = 0; + int ncopies = 1; int looped = 0; int ret; int index; @@ -2197,12 +2198,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (type & (BTRFS_BLOCK_GROUP_DUP)) { num_stripes = 2; min_stripes = 2; + ncopies = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; num_stripes = 2; min_stripes = 2; + ncopies = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { num_stripes = fs_devices->rw_devices; @@ -2210,6 +2213,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, return -ENOSPC; num_stripes &= ~(u32)1; sub_stripes = 2; + ncopies = 2; min_stripes = 4; } @@ -2239,8 +2243,8 @@ again: map->num_stripes = num_stripes; } - if (calc_size * num_stripes > 
max_chunk_size) { - calc_size = max_chunk_size; + if (calc_size * num_stripes > max_chunk_size * ncopies) { + calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); do_div(calc_size, stripe_len); calc_size *= stripe_len; @@ -2321,6 +2325,8 @@ again: if (!looped && max_avail > 0) { looped = 1; calc_size = max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(calc_size, 2); goto again; } kfree(map); -- cgit v1.2.1 From 7bfc837df935d850fe996dfe92ef48975cd4170a Mon Sep 17 00:00:00 2001 From: Miao Xie <miaox@cn.fujitsu.com> Date: Wed, 5 Jan 2011 10:07:26 +0000 Subject: btrfs: restructure find_free_dev_extent() - make it return the start position and length of the max free space when it can not find a suitable free space. - make it more readability Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com> --- fs/btrfs/volumes.c | 155 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 89 insertions(+), 66 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c50a85e0d08f..4838bd395e49 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -729,58 +729,82 @@ error: } /* + * find_free_dev_extent - find free space in the specified device + * @trans: transaction handler + * @device: the device which we search the free space in + * @num_bytes: the size of the free space that we need + * @start: store the start of the free space. + * @len: the size of the free space. that we find, or the size of the max + * free space if we don't find suitable free space + * * this uses a pretty simple search, the expectation is that it is * called very infrequently and that a given device has a small number * of extents + * + * @start is used to store the start of the free space if we find. But if we + * don't find suitable free space, it will be used to store the start position + * of the max free space. 
+ * + * @len is used to store the size of the free space that we find. + * But if we don't find suitable free space, it is used to store the size of + * the max free space. */ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail) + u64 *start, u64 *len) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; - struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; - u64 hole_size = 0; - u64 last_byte = 0; - u64 search_start = 0; + u64 hole_size; + u64 max_hole_start; + u64 max_hole_size; + u64 extent_end; + u64 search_start; u64 search_end = device->total_bytes; int ret; - int slot = 0; - int start_found; + int slot; struct extent_buffer *l; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - path->reada = 2; - start_found = 0; - /* FIXME use last free of some kind */ /* we don't want to overwrite the superblock on the drive, * so we make sure to start at an offset of at least 1MB */ - search_start = max((u64)1024 * 1024, search_start); + search_start = 1024 * 1024; - if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + if (root->fs_info->alloc_start + num_bytes <= search_end) search_start = max(root->fs_info->alloc_start, search_start); + max_hole_start = search_start; + max_hole_size = 0; + + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto error; + } + path->reada = 2; + key.objectid = device->devid; key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) - goto error; + goto out; if (ret > 0) { ret = btrfs_previous_item(root, path, key.objectid, key.type); if (ret < 0) - goto error; - if (ret > 0) - start_found = 1; + goto out; } - l = path->nodes[0]; - btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { l = 
path->nodes[0]; slot = path->slots[0]; @@ -789,24 +813,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, if (ret == 0) continue; if (ret < 0) - goto error; -no_more_items: - if (!start_found) { - if (search_start >= search_end) { - ret = -ENOSPC; - goto error; - } - *start = search_start; - start_found = 1; - goto check_pending; - } - *start = last_byte > search_start ? - last_byte : search_start; - if (search_end <= *start) { - ret = -ENOSPC; - goto error; - } - goto check_pending; + goto out; + + break; } btrfs_item_key_to_cpu(l, &key, slot); @@ -814,48 +823,62 @@ no_more_items: goto next; if (key.objectid > device->devid) - goto no_more_items; + break; - if (key.offset >= search_start && key.offset > last_byte && - start_found) { - if (last_byte < search_start) - last_byte = search_start; - hole_size = key.offset - last_byte; + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; - if (hole_size > *max_avail) - *max_avail = hole_size; + if (key.offset > search_start) { + hole_size = key.offset - search_start; - if (key.offset > last_byte && - hole_size >= num_bytes) { - *start = last_byte; - goto check_pending; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } + + /* + * If this free space is greater than which we need, + * it must be the max free space that we have found + * until now, so max_hole_start must point to the start + * of this free space and the length of this free space + * is stored in max_hole_size. Thus, we return + * max_hole_start and max_hole_size and go back to the + * caller. 
+ */ + if (hole_size >= num_bytes) { + ret = 0; + goto out; } } - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) - goto next; - start_found = 1; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); - last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (extent_end > search_start) + search_start = extent_end; next: path->slots[0]++; cond_resched(); } -check_pending: - /* we have to make sure we didn't find an extent that has already - * been allocated by the map tree or the original allocation - */ - BUG_ON(*start < search_start); - if (*start + num_bytes > search_end) { - ret = -ENOSPC; - goto error; + hole_size = search_end- search_start; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; } - /* check for pending inserts here */ - ret = 0; -error: + /* See above. */ + if (hole_size < num_bytes) + ret = -ENOSPC; + else + ret = 0; + +out: btrfs_free_path(path); +error: + *start = max_hole_start; + if (len && max_hole_size > *len) + *len = max_hole_size; return ret; } -- cgit v1.2.1 From b2117a39fa96cf4814e7cab8c11494149ba6f29d Mon Sep 17 00:00:00 2001 From: Miao Xie <miaox@cn.fujitsu.com> Date: Wed, 5 Jan 2011 10:07:28 +0000 Subject: btrfs: make the chunk allocator utilize the devices better With this patch, we change the handling method when we can not get enough free extents with default size. Implementation: 1. Look up the suitable free extent on each device and keep the search result. If not find a suitable free extent, keep the max free extent 2. If we get enough suitable free extents with default size, chunk allocation succeeds. 3. If we can not get enough free extents, but the number of the extent with default size is >= min_stripes, we just change the mapping information (reduce the number of stripes in the extent map), and chunk allocation succeeds. 4. 
If the number of the extent with default size is < min_stripes, sort the devices by its max free extent's size descending 5. Use the size of the max free extent on the (num_stripes - 1)th device as the stripe size to allocate the device space By this way, the chunk allocator can allocate chunks as large as possible when the devices' space is not enough and make full use of the devices. Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com> --- fs/btrfs/volumes.c | 379 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 276 insertions(+), 103 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 4838bd395e49..c22784b989b7 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -877,7 +877,7 @@ out: btrfs_free_path(path); error: *start = max_hole_start; - if (len && max_hole_size > *len) + if (len) *len = max_hole_size; return ret; } @@ -2176,70 +2176,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, return calc_size * num_stripes; } -static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, - struct map_lookup **map_ret, - u64 *num_bytes, u64 *stripe_size, - u64 start, u64 type) +/* Used to sort the devices by max_avail(descending sort) */ +int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2) { - struct btrfs_fs_info *info = extent_root->fs_info; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; - struct extent_map *em; - struct list_head private_devs; - int min_stripe_size = 1 * 1024 * 1024; - u64 calc_size = 1024 * 1024 * 1024; - u64 max_chunk_size = calc_size; - u64 min_free; - u64 avail; - u64 max_avail = 0; - u64 dev_offset; - int num_stripes = 1; - int min_stripes = 1; - int sub_stripes = 0; - int ncopies = 1; - int looped 
= 0; - int ret; - int index; - int stripe_len = 64 * 1024; + if (((struct btrfs_device_info *)dev_info1)->max_avail > + ((struct btrfs_device_info *)dev_info2)->max_avail) + return -1; + else if (((struct btrfs_device_info *)dev_info1)->max_avail < + ((struct btrfs_device_info *)dev_info2)->max_avail) + return 1; + else + return 0; +} - if ((type & BTRFS_BLOCK_GROUP_RAID1) && - (type & BTRFS_BLOCK_GROUP_DUP)) { - WARN_ON(1); - type &= ~BTRFS_BLOCK_GROUP_DUP; - } - if (list_empty(&fs_devices->alloc_list)) - return -ENOSPC; +static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type, + int *num_stripes, int *min_stripes, + int *sub_stripes) +{ + *num_stripes = 1; + *min_stripes = 1; + *sub_stripes = 0; if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = fs_devices->rw_devices; - min_stripes = 2; + *num_stripes = fs_devices->rw_devices; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = 2; - min_stripes = 2; - ncopies = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID1)) { if (fs_devices->rw_devices < 2) return -ENOSPC; - num_stripes = 2; - min_stripes = 2; - ncopies = 2; + *num_stripes = 2; + *min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = fs_devices->rw_devices; - if (num_stripes < 4) + *num_stripes = fs_devices->rw_devices; + if (*num_stripes < 4) return -ENOSPC; - num_stripes &= ~(u32)1; - sub_stripes = 2; - ncopies = 2; - min_stripes = 4; + *num_stripes &= ~(u32)1; + *sub_stripes = 2; + *min_stripes = 4; } + return 0; +} + +static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices, + u64 proposed_size, u64 type, + int num_stripes, int small_stripe) +{ + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = proposed_size; + u64 max_chunk_size = calc_size; + int ncopies = 1; + + if (type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP | + BTRFS_BLOCK_GROUP_RAID10)) + ncopies = 2; + if (type & BTRFS_BLOCK_GROUP_DATA) { max_chunk_size = 
10 * calc_size; min_stripe_size = 64 * 1024 * 1024; @@ -2256,51 +2253,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); -again: - max_avail = 0; - if (!map || map->num_stripes != num_stripes) { - kfree(map); - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) - return -ENOMEM; - map->num_stripes = num_stripes; - } - if (calc_size * num_stripes > max_chunk_size * ncopies) { calc_size = max_chunk_size * ncopies; do_div(calc_size, num_stripes); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; } /* we don't want tiny stripes */ - if (!looped) + if (!small_stripe) calc_size = max_t(u64, min_stripe_size, calc_size); /* - * we're about to do_div by the stripe_len so lets make sure + * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure * we end up with something bigger than a stripe */ - calc_size = max_t(u64, calc_size, stripe_len * 4); + calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN); + + do_div(calc_size, BTRFS_STRIPE_LEN); + calc_size *= BTRFS_STRIPE_LEN; + + return calc_size; +} + +static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map, + int num_stripes) +{ + struct map_lookup *new; + size_t len = map_lookup_size(num_stripes); + + BUG_ON(map->num_stripes < num_stripes); + + if (map->num_stripes == num_stripes) + return map; + + new = kmalloc(len, GFP_NOFS); + if (!new) { + /* just change map->num_stripes */ + map->num_stripes = num_stripes; + return map; + } + + memcpy(new, map, len); + new->num_stripes = num_stripes; + kfree(map); + return new; +} + +/* + * helper to allocate device space from btrfs_device_info, in which we stored + * max free space information of every device. It is used when we can not + * allocate chunks by default size. + * + * By this helper, we can allocate a new chunk as larger as possible. 
+ */ +static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_devices *fs_devices, + struct btrfs_device_info *devices, + int nr_device, u64 type, + struct map_lookup **map_lookup, + int min_stripes, u64 *stripe_size) +{ + int i, index, sort_again = 0; + int min_devices = min_stripes; + u64 max_avail, min_free; + struct map_lookup *map = *map_lookup; + int ret; + + if (nr_device < min_stripes) + return -ENOSPC; + + btrfs_descending_sort_devices(devices, nr_device); + + max_avail = devices[0].max_avail; + if (!max_avail) + return -ENOSPC; + + for (i = 0; i < nr_device; i++) { + /* + * if dev_offset = 0, it means the free space of this device + * is less than what we need, and we didn't search max avail + * extent on this device, so do it now. + */ + if (!devices[i].dev_offset) { + ret = find_free_dev_extent(trans, devices[i].dev, + max_avail, + &devices[i].dev_offset, + &devices[i].max_avail); + if (ret != 0 && ret != -ENOSPC) + return ret; + sort_again = 1; + } + } + + /* we update the max avail free extent of each devices, sort again */ + if (sort_again) + btrfs_descending_sort_devices(devices, nr_device); + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_devices = 1; + + if (!devices[min_devices - 1].max_avail) + return -ENOSPC; + + max_avail = devices[min_devices - 1].max_avail; + if (type & BTRFS_BLOCK_GROUP_DUP) + do_div(max_avail, 2); - do_div(calc_size, stripe_len); - calc_size *= stripe_len; + max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type, + min_stripes, 1); + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = max_avail * 2; + else + min_free = max_avail; + + if (min_free > devices[min_devices - 1].max_avail) + return -ENOSPC; + + map = __shrink_map_lookup_stripes(map, min_stripes); + *stripe_size = max_avail; + + index = 0; + for (i = 0; i < min_stripes; i++) { + map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset; + if (type & BTRFS_BLOCK_GROUP_DUP) { + i++; + 
map->stripes[i].dev = devices[index].dev; + map->stripes[i].physical = devices[index].dev_offset + + max_avail; + } + index++; + } + *map_lookup = map; + + return 0; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct btrfs_device_info *devices_info; + struct list_head private_devs; + u64 calc_size = 1024 * 1024 * 1024; + u64 min_free; + u64 avail; + u64 dev_offset; + int num_stripes; + int min_stripes; + int sub_stripes; + int min_devices; /* the min number of devices we need */ + int i; + int ret; + int index; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes, + &min_stripes, &sub_stripes); + if (ret) + return ret; + + devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + GFP_NOFS); + if (!devices_info) + return -ENOMEM; + + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + ret = -ENOMEM; + goto error; + } + map->num_stripes = num_stripes; cur = fs_devices->alloc_list.next; index = 0; + i = 0; - if (type & BTRFS_BLOCK_GROUP_DUP) + calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type, + num_stripes, 0); + + if (type & BTRFS_BLOCK_GROUP_DUP) { min_free = calc_size * 2; - else + min_devices = 1; + } else { min_free = calc_size; - - /* - * we add 1MB because we never use the first 1MB of the device, unless - * we've looped, then we are likely allocating the maximum amount of - * space left already - */ - if 
(!looped) - min_free += 1024 * 1024; + min_devices = min_stripes; + } INIT_LIST_HEAD(&private_devs); while (index < num_stripes) { @@ -2313,27 +2468,39 @@ again: cur = cur->next; if (device->in_fs_metadata && avail >= min_free) { - ret = find_free_dev_extent(trans, device, - min_free, &dev_offset, - &max_avail); + ret = find_free_dev_extent(trans, device, min_free, + &devices_info[i].dev_offset, + &devices_info[i].max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); map->stripes[index].dev = device; - map->stripes[index].physical = dev_offset; + map->stripes[index].physical = + devices_info[i].dev_offset; index++; if (type & BTRFS_BLOCK_GROUP_DUP) { map->stripes[index].dev = device; map->stripes[index].physical = - dev_offset + calc_size; + devices_info[i].dev_offset + + calc_size; index++; } - } - } else if (device->in_fs_metadata && avail > max_avail) - max_avail = avail; + } else if (ret != -ENOSPC) + goto error; + + devices_info[i].dev = device; + i++; + } else if (device->in_fs_metadata && + avail >= BTRFS_STRIPE_LEN) { + devices_info[i].dev = device; + devices_info[i].max_avail = avail; + i++; + } + if (cur == &fs_devices->alloc_list) break; } + list_splice(&private_devs, &fs_devices->alloc_list); if (index < num_stripes) { if (index >= min_stripes) { @@ -2342,36 +2509,36 @@ again: num_stripes /= sub_stripes; num_stripes *= sub_stripes; } - looped = 1; - goto again; - } - if (!looped && max_avail > 0) { - looped = 1; - calc_size = max_avail; - if (type & BTRFS_BLOCK_GROUP_DUP) - do_div(calc_size, 2); - goto again; + + map = __shrink_map_lookup_stripes(map, num_stripes); + } else if (i >= min_devices) { + ret = __btrfs_alloc_tiny_space(trans, fs_devices, + devices_info, i, type, + &map, min_stripes, + &calc_size); + if (ret) + goto error; + } else { + ret = -ENOSPC; + goto error; } - kfree(map); - return -ENOSPC; } map->sector_size = extent_root->sectorsize; - map->stripe_len = stripe_len; - map->io_align = stripe_len; - 
map->io_width = stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; - map->num_stripes = num_stripes; map->sub_stripes = sub_stripes; *map_ret = map; *stripe_size = calc_size; *num_bytes = chunk_bytes_by_type(type, calc_size, - num_stripes, sub_stripes); + map->num_stripes, sub_stripes); em = alloc_extent_map(GFP_NOFS); if (!em) { - kfree(map); - return -ENOMEM; + ret = -ENOMEM; + goto error; } em->bdev = (struct block_device *)map; em->start = start; @@ -2404,7 +2571,13 @@ again: index++; } + kfree(devices_info); return 0; + +error: + kfree(map); + kfree(devices_info); + return ret; } static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, -- cgit v1.2.1 From 6d07bcec969af335d4e35b3921131b7929bd634e Mon Sep 17 00:00:00 2001 From: Miao Xie <miaox@cn.fujitsu.com> Date: Wed, 5 Jan 2011 10:07:31 +0000 Subject: btrfs: fix wrong free space information of btrfs When we store data by raid profile in btrfs with two or more different size disks, df command shows there is some free space in the filesystem, but the user can not write any data in fact, df command shows the wrong free space information of btrfs. 
# mkfs.btrfs -d raid1 /dev/sda9 /dev/sda10 # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 28.00KB devid 1 size 5.01GB used 2.03GB path /dev/sda9 devid 2 size 10.00GB used 2.01GB path /dev/sda10 # btrfs device scan /dev/sda9 /dev/sda10 # mount /dev/sda9 /mnt # dd if=/dev/zero of=tmpfile0 bs=4K count=9999999999 (fill the filesystem) # sync # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 5.4G 62% /mnt # btrfs-show Label: none uuid: a95cd49e-6e33-45b8-8741-a36153ce4b64 Total devices 2 FS bytes used 3.99GB devid 1 size 5.01GB used 5.01GB path /dev/sda9 devid 2 size 10.00GB used 4.99GB path /dev/sda10 This happens because btrfs cannot allocate chunks when one of the paired disks has no space left: the free space on the other disks can never be used and should be subtracted from the total space, but btrfs doesn't subtract this space from the total. This is confusing to the user. This patch fixes it by calculating the free space that can actually be used to allocate chunks. Implementation: 1. get the free space of all the devices, and align it by stripe length. 2. sort the devices by the free space. 3. check the free space of the devices, 3.1. if it is not zero, then check the number of the devices that have more free space than this device, if the number of the devices is beyond the min stripe number, the free space can be used, and is added into the total free space. if the number of the devices is below the min stripe number, we can not use the free space, the check ends. 3.2. if the free space is zero, check the next devices, goto 3.1 This implementation works much like a fake (simulated) chunk allocation. 
After applying this patch, df can show correct space information: # df -TH Filesystem Type Size Used Avail Use% Mounted on /dev/sda9 btrfs 17G 8.6G 0 100% /mnt Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Chris Mason <chris.mason@oracle.com> --- fs/btrfs/volumes.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c22784b989b7..0c7f478cf645 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -728,6 +728,90 @@ error: return ret; } +/* helper to account the used device space in the range */ +int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, + u64 end, u64 *length) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent; + struct btrfs_path *path; + u64 extent_end; + int ret; + int slot; + struct extent_buffer *l; + + *length = 0; + + if (start >= device->total_bytes) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto out; + } + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + + break; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + break; + + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + extent_end = key.offset + btrfs_dev_extent_length(l, + dev_extent); + if (key.offset <= start && extent_end > end) { 
+ *length = end - start + 1; + break; + } else if (key.offset <= start && extent_end > start) + *length += extent_end - start; + else if (key.offset > start && extent_end <= end) + *length += extent_end - key.offset; + else if (key.offset > start && key.offset <= end) { + *length += end - key.offset + 1; + break; + } else if (key.offset > end) + break; + +next: + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + /* * find_free_dev_extent - find free space in the specified device * @trans: transaction handler -- cgit v1.2.1 From 20b450773d17e325190c158e10bfdb25dc21d2d6 Mon Sep 17 00:00:00 2001 From: Dave Young <hidave.darkstar@gmail.com> Date: Sat, 8 Jan 2011 10:09:13 +0000 Subject: btrfs: mount failure return value fix I happened to pass swap partition as root partition in cmdline, then kernel panic and tell me about "Cannot open root device". It is not correct, in fact it is a fs type mismatch instead of 'no device'. Eventually I found btrfs mounting failed with -EIO, it should be -EINVAL. 
The logic in init/do_mounts.c: for (p = fs_names; *p; p += strlen(p)+1) { int err = do_mount_root(name, p, flags, root_mount_data); switch (err) { case 0: goto out; case -EACCES: flags |= MS_RDONLY; goto retry; case -EINVAL: continue; } print "Cannot open root device" panic } So the fs types that come after btrfs never get a chance to mount. Fix this by returning -EINVAL instead. Signed-off-by: Dave Young <hidave.darkstar@gmail.com> Signed-off-by: Chris Mason <chris.mason@oracle.com> --- fs/btrfs/volumes.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0c7f478cf645..e8be478178aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -600,8 +600,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); - if (!bh) + if (!bh) { + ret = -EINVAL; goto error_close; + } disk_super = (struct btrfs_super_block *)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); @@ -702,7 +704,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, goto error_close; bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; @@ -1302,7 +1304,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) set_blocksize(bdev, 4096); bh = btrfs_read_dev_super(bdev); if (!bh) { - ret = -EIO; + ret = -EINVAL; goto error_close; } disk_super = (struct btrfs_super_block *)bh->b_data; -- cgit v1.2.1 From 6f88a4403def422bd8e276ddf6863d6ac71435d2 Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben@decadent.org.uk> Date: Wed, 29 Dec 2010 14:55:03 +0000 Subject: btrfs: Require CAP_SYS_ADMIN for filesystem rebalance Filesystem rebalancing (BTRFS_IOC_BALANCE) affects the entire filesystem and may run uninterruptibly for a long time. 
This does not seem to be something that an unprivileged user should be able to do. Reported-by: Aron Xu <happyaron.xu@gmail.com> Signed-off-by: Ben Hutchings <ben@decadent.org.uk> Signed-off-by: Chris Mason <chris.mason@oracle.com> --- fs/btrfs/volumes.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/btrfs/volumes.c') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e8be478178aa..f2d2f4ccc738 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -22,6 +22,7 @@ #include <linux/blkdev.h> #include <linux/random.h> #include <linux/iocontext.h> +#include <linux/capability.h> #include <asm/div64.h> #include "compat.h" #include "ctree.h" @@ -2024,6 +2025,9 @@ int btrfs_balance(struct btrfs_root *dev_root) if (dev_root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + mutex_lock(&dev_root->fs_info->volume_mutex); dev_root = dev_root->fs_info->dev_root; -- cgit v1.2.1