diff options
Diffstat (limited to 'volumes.c')
-rw-r--r-- | volumes.c | 254 |
1 files changed, 213 insertions, 41 deletions
@@ -35,17 +35,22 @@ struct stripe { u64 physical; }; -struct map_lookup { - struct cache_extent ce; - u64 type; - int io_align; - int io_width; - int stripe_len; - int sector_size; - int num_stripes; - int sub_stripes; - struct btrfs_bio_stripe stripes[]; -}; +static inline int nr_parity_stripes(struct map_lookup *map) +{ + if (map->type & BTRFS_BLOCK_GROUP_RAID5) + return 1; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + return 2; + else + return 0; +} + +static inline int nr_data_stripes(struct map_lookup *map) +{ + return map->num_stripes - nr_parity_stripes(map); +} + +#define is_parity_stripe(x) ( ((x) == BTRFS_RAID5_P_STRIPE) || ((x) == BTRFS_RAID6_Q_STRIPE) ) #define map_lookup_size(n) (sizeof(struct map_lookup) + \ (sizeof(struct btrfs_bio_stripe) * (n))) @@ -128,7 +133,14 @@ static int device_list_add(const char *path, btrfs_stack_device_bytes_used(&disk_super->dev_item); list_add(&device->dev_list, &fs_devices->devices); device->fs_devices = fs_devices; - } + } else if (!device->name || strcmp(device->name, path)) { + char *name = strdup(path); + if (!name) + return -ENOMEM; + kfree(device->name); + device->name = name; + } + if (found_transid > fs_devices->latest_trans) { fs_devices->latest_devid = devid; @@ -181,6 +193,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags) goto fail; } + if (posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED)) + fprintf(stderr, "Warning, could not drop caches\n"); + if (device->devid == fs_devices->latest_devid) fs_devices->latest_bdev = fd; if (device->devid == fs_devices->lowest_devid) @@ -589,7 +604,7 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_chunk *chunk, int item_size) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct btrfs_disk_key disk_key; u32 array_size; u8 *ptr; @@ -623,11 +638,21 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes, return calc_size; else if (type & BTRFS_BLOCK_GROUP_RAID10) return calc_size * (num_stripes / sub_stripes); + else if (type & BTRFS_BLOCK_GROUP_RAID5) + return calc_size * (num_stripes - 1); + else if (type & BTRFS_BLOCK_GROUP_RAID6) + return calc_size * (num_stripes - 2); else return calc_size * num_stripes; } +static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) +{ + /* TODO, add a way to store the preferred stripe size */ + return 64 * 1024; +} + int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 *start, u64 *num_bytes, u64 type) @@ -657,12 +682,14 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, int index; int stripe_len = 64 * 1024; struct btrfs_key key; + u64 offset; if (list_empty(dev_list)) { return -ENOSPC; } if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { if (type & BTRFS_BLOCK_GROUP_SYSTEM) { @@ -681,7 +708,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, } if (type & BTRFS_BLOCK_GROUP_RAID1) { num_stripes = min_t(u64, 2, - btrfs_super_num_devices(&info->super_copy)); + btrfs_super_num_devices(info->super_copy)); if (num_stripes < 2) return -ENOSPC; min_stripes = 2; @@ -691,20 +718,36 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID0)) { - num_stripes = btrfs_super_num_devices(&info->super_copy); + num_stripes = btrfs_super_num_devices(info->super_copy); min_stripes = 2; } if (type & (BTRFS_BLOCK_GROUP_RAID10)) { - num_stripes = btrfs_super_num_devices(&info->super_copy); + num_stripes = btrfs_super_num_devices(info->super_copy); if (num_stripes < 4) return -ENOSPC; num_stripes &= ~(u32)1; sub_stripes = 2; min_stripes = 4; } + if (type & (BTRFS_BLOCK_GROUP_RAID5)) { + num_stripes = btrfs_super_num_devices(info->super_copy); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + stripe_len = find_raid56_stripe_len(num_stripes - 1, + btrfs_super_stripesize(info->super_copy)); + } + if (type & (BTRFS_BLOCK_GROUP_RAID6)) { + num_stripes = btrfs_super_num_devices(info->super_copy); + if (num_stripes < 3) + return -ENOSPC; + min_stripes = 3; + stripe_len = find_raid56_stripe_len(num_stripes - 2, + btrfs_super_stripesize(info->super_copy)); + } /* we don't want a chunk larger than 10% of the FS */ - percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1); + percent_max = div_factor(btrfs_super_total_bytes(info->super_copy), 1); max_chunk_size = min(percent_max, max_chunk_size); again: @@ -762,12 +805,13 @@ again: } return -ENOSPC; } - key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.type = BTRFS_CHUNK_ITEM_KEY; ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, - &key.offset); + &offset); if (ret) return ret; + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = offset; chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS); if (!chunk) @@ -977,6 +1021,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) ret = map->sub_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID5) + ret = 2; + else if (map->type & BTRFS_BLOCK_GROUP_RAID6) + ret = 3; else ret = 1; return ret; @@ -1016,6 +1064,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, u64 bytenr; u64 length; u64 stripe_nr; + u64 rmap_len; int i, j, nr = 0; ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start); @@ -1023,10 +1072,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map = container_of(ce, struct map_lookup, ce); length = ce->size; + rmap_len = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID10) length = ce->size / (map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) length = ce->size / map->num_stripes; + else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + length = ce->size / nr_data_stripes(map); + rmap_len = map->stripe_len * nr_data_stripes(map); + } buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); @@ -1045,8 +1100,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, map->sub_stripes; } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; - } - bytenr = ce->start + stripe_nr * map->stripe_len; + } /* else if RAID[56], multiply by nr_data_stripes(). + * Alternatively, just use rmap_len below instead of + * map->stripe_len */ + + bytenr = ce->start + stripe_nr * rmap_len; for (j = 0; j < nr; j++) { if (buf[j] == bytenr) break; @@ -1057,28 +1115,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, *logical = buf; *naddrs = nr; - *stripe_len = map->stripe_len; + *stripe_len = rmap_len; return 0; } +static inline int parity_smaller(u64 a, u64 b) +{ + return a > b; +} + +/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +static void sort_parity_stripes(struct btrfs_multi_bio *bbio, u64 *raid_map) +{ + struct btrfs_bio_stripe s; + int i; + u64 l; + int again = 1; + + while (again) { + again = 0; + for (i = 0; i < bbio->num_stripes - 1; i++) { + if (parity_smaller(raid_map[i], raid_map[i+1])) { + s = bbio->stripes[i]; + l = raid_map[i]; + bbio->stripes[i] = bbio->stripes[i+1]; + raid_map[i] = raid_map[i+1]; + bbio->stripes[i+1] = s; + raid_map[i+1] = l; + again = 1; + } + } + } +} + int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_multi_bio **multi_ret, int mirror_num, + u64 **raid_map_ret) { return __btrfs_map_block(map_tree, rw, logical, length, NULL, - multi_ret, mirror_num); + multi_ret, mirror_num, raid_map_ret); } int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, u64 logical, u64 *length, u64 *type, - struct btrfs_multi_bio **multi_ret, int mirror_num) + struct btrfs_multi_bio **multi_ret, int mirror_num, + u64 **raid_map_ret) { struct cache_extent *ce; struct map_lookup *map; u64 offset; u64 stripe_offset; u64 stripe_nr; + u64 *raid_map = NULL; int stripes_allocated = 8; int stripes_required = 1; int stripe_index; @@ -1118,11 +1208,26 @@ again: stripes_required = map->sub_stripes; } } + if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) + && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) { + /* RAID[56] write or recovery. Return all stripes */ + stripes_required = map->num_stripes; + + /* Only allocate the map if we've already got a large enough multi_ret */ + if (stripes_allocated >= stripes_required) { + raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + if (!raid_map) { + kfree(multi); + return -ENOMEM; + } + } + } + /* if our multi bio struct is too small, back off and try again */ - if (multi_ret && rw == WRITE && - stripes_allocated < stripes_required) { - stripes_allocated = map->num_stripes; + if (multi_ret && stripes_allocated < stripes_required) { + stripes_allocated = stripes_required; kfree(multi); + multi = NULL; goto again; } stripe_nr = offset; @@ -1139,6 +1244,7 @@ again: stripe_offset = offset - stripe_offset; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_DUP)) { /* we limit the length of each bio to what fits in a stripe */ @@ -1177,6 +1283,59 @@ again: multi->num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_RAID6)) { + + if (raid_map) { + int i, rot; + u64 tmp; + u64 raid56_full_stripe_start; + u64 full_stripe_len = nr_data_stripes(map) * map->stripe_len; + + /* + * align the start of our data stripe in the logical + * address space + */ + raid56_full_stripe_start = offset / full_stripe_len; + raid56_full_stripe_start *= full_stripe_len; + + /* get the data stripe number */ + stripe_nr = raid56_full_stripe_start / map->stripe_len; + stripe_nr = stripe_nr / nr_data_stripes(map); + + /* Work out the disk rotation on this stripe-set */ + rot = stripe_nr % map->num_stripes; + + /* Fill in the logical address of each stripe */ + tmp = stripe_nr * nr_data_stripes(map); + + for (i = 0; i < nr_data_stripes(map); i++) + raid_map[(i+rot) % map->num_stripes] = + ce->start + (tmp + i) * map->stripe_len; + + raid_map[(i+rot) % map->num_stripes] = BTRFS_RAID5_P_STRIPE; + if (map->type & BTRFS_BLOCK_GROUP_RAID6) + raid_map[(i+rot+1) % map->num_stripes] = BTRFS_RAID6_Q_STRIPE; + + *length = map->stripe_len; + stripe_index = 0; + stripe_offset = 0; + multi->num_stripes = map->num_stripes; + } else { + stripe_index = stripe_nr % nr_data_stripes(map); + stripe_nr = stripe_nr / nr_data_stripes(map); + + /* + * Mirror #0 or #1 means the original data block. + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ + if (mirror_num > 1) + stripe_index = nr_data_stripes(map) + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ + stripe_index = (stripe_nr + stripe_index) % map->num_stripes; + } } else { /* * after this do_div call, stripe_nr is the number of stripes @@ -1196,8 +1355,14 @@ again: stripe_index++; } *multi_ret = multi; + if (type) *type = map->type; + + if (raid_map) { + sort_parity_stripes(multi, raid_map); + *raid_map_ret = raid_map; + } out: return 0; } @@ -1222,6 +1387,22 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, return NULL; } +struct btrfs_device *btrfs_find_device_by_devid(struct btrfs_root *root, + u64 devid, int instance) +{ + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct list_head *cur; + int num_found = 0; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid && num_found++ == instance) + return dev; + } + return NULL; +} + int btrfs_bootstrap_super_map(struct btrfs_mapping_tree *map_tree, struct btrfs_fs_devices *fs_devices) { @@ -1460,18 +1641,9 @@ static int read_one_dev(struct btrfs_root *root, return ret; } -int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) -{ - struct btrfs_dev_item *dev_item; - - dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, - dev_item); - return read_one_dev(root, buf, dev_item); -} - int btrfs_read_sys_array(struct btrfs_root *root) { - struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_super_block *super_copy = root->fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; @@ -1489,7 +1661,7 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); - write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + write_extent_buffer(sb, super_copy, 0, sizeof(*super_copy)); array_size = btrfs_super_sys_array_size(super_copy); /* @@ -1588,9 +1760,9 @@ again: goto again; } - btrfs_free_path(path); ret = 0; error: + btrfs_free_path(path); return ret; } |