Diffstat (limited to 'volumes.c')
-rw-r--r--  volumes.c  254
1 file changed, 213 insertions(+), 41 deletions(-)
diff --git a/volumes.c b/volumes.c
index 4c29cef..d6f81f8 100644
--- a/volumes.c
+++ b/volumes.c
@@ -35,17 +35,22 @@ struct stripe {
u64 physical;
};
-struct map_lookup {
- struct cache_extent ce;
- u64 type;
- int io_align;
- int io_width;
- int stripe_len;
- int sector_size;
- int num_stripes;
- int sub_stripes;
- struct btrfs_bio_stripe stripes[];
-};
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 1;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+ else
+ return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+ return map->num_stripes - nr_parity_stripes(map);
+}
+
+#define is_parity_stripe(x) ( ((x) == BTRFS_RAID5_P_STRIPE) || ((x) == BTRFS_RAID6_Q_STRIPE) )
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
@@ -128,7 +133,14 @@ static int device_list_add(const char *path,
btrfs_stack_device_bytes_used(&disk_super->dev_item);
list_add(&device->dev_list, &fs_devices->devices);
device->fs_devices = fs_devices;
- }
+ } else if (!device->name || strcmp(device->name, path)) {
+ char *name = strdup(path);
+ if (!name)
+ return -ENOMEM;
+ kfree(device->name);
+ device->name = name;
+ }
+
if (found_transid > fs_devices->latest_trans) {
fs_devices->latest_devid = devid;
@@ -181,6 +193,9 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags)
goto fail;
}
+ if (posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED))
+ fprintf(stderr, "Warning, could not drop caches\n");
+
if (device->devid == fs_devices->latest_devid)
fs_devices->latest_bdev = fd;
if (device->devid == fs_devices->lowest_devid)
@@ -589,7 +604,7 @@ int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
struct btrfs_key *key,
struct btrfs_chunk *chunk, int item_size)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct btrfs_disk_key disk_key;
u32 array_size;
u8 *ptr;
@@ -623,11 +638,21 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
return calc_size;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
return calc_size * (num_stripes / sub_stripes);
+ else if (type & BTRFS_BLOCK_GROUP_RAID5)
+ return calc_size * (num_stripes - 1);
+ else if (type & BTRFS_BLOCK_GROUP_RAID6)
+ return calc_size * (num_stripes - 2);
else
return calc_size * num_stripes;
}
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+ /* TODO, add a way to store the preferred stripe size */
+ return 64 * 1024;
+}
+
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 *start,
u64 *num_bytes, u64 type)
@@ -657,12 +682,14 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
int index;
int stripe_len = 64 * 1024;
struct btrfs_key key;
+ u64 offset;
if (list_empty(dev_list)) {
return -ENOSPC;
}
if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
@@ -681,7 +708,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
if (type & BTRFS_BLOCK_GROUP_RAID1) {
num_stripes = min_t(u64, 2,
- btrfs_super_num_devices(&info->super_copy));
+ btrfs_super_num_devices(info->super_copy));
if (num_stripes < 2)
return -ENOSPC;
min_stripes = 2;
@@ -691,20 +718,36 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
- num_stripes = btrfs_super_num_devices(&info->super_copy);
+ num_stripes = btrfs_super_num_devices(info->super_copy);
min_stripes = 2;
}
if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
- num_stripes = btrfs_super_num_devices(&info->super_copy);
+ num_stripes = btrfs_super_num_devices(info->super_copy);
if (num_stripes < 4)
return -ENOSPC;
num_stripes &= ~(u32)1;
sub_stripes = 2;
min_stripes = 4;
}
+ if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
+ num_stripes = btrfs_super_num_devices(info->super_copy);
+ if (num_stripes < 2)
+ return -ENOSPC;
+ min_stripes = 2;
+ stripe_len = find_raid56_stripe_len(num_stripes - 1,
+ btrfs_super_stripesize(info->super_copy));
+ }
+ if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
+ num_stripes = btrfs_super_num_devices(info->super_copy);
+ if (num_stripes < 3)
+ return -ENOSPC;
+ min_stripes = 3;
+ stripe_len = find_raid56_stripe_len(num_stripes - 2,
+ btrfs_super_stripesize(info->super_copy));
+ }
/* we don't want a chunk larger than 10% of the FS */
- percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
+ percent_max = div_factor(btrfs_super_total_bytes(info->super_copy), 1);
max_chunk_size = min(percent_max, max_chunk_size);
again:
@@ -762,12 +805,13 @@ again:
}
return -ENOSPC;
}
- key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
- key.type = BTRFS_CHUNK_ITEM_KEY;
ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
- &key.offset);
+ &offset);
if (ret)
return ret;
+ key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
+ key.type = BTRFS_CHUNK_ITEM_KEY;
+ key.offset = offset;
chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
if (!chunk)
@@ -977,6 +1021,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ ret = 3;
else
ret = 1;
return ret;
@@ -1016,6 +1064,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 bytenr;
u64 length;
u64 stripe_nr;
+ u64 rmap_len;
int i, j, nr = 0;
ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start);
@@ -1023,10 +1072,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map = container_of(ce, struct map_lookup, ce);
length = ce->size;
+ rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
length = ce->size / (map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
length = ce->size / map->num_stripes;
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ length = ce->size / nr_data_stripes(map);
+ rmap_len = map->stripe_len * nr_data_stripes(map);
+ }
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
@@ -1045,8 +1100,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map->sub_stripes;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
- }
- bytenr = ce->start + stripe_nr * map->stripe_len;
+ } /* else if RAID[56], multiply by nr_data_stripes().
+ * Alternatively, just use rmap_len below instead of
+ * map->stripe_len */
+
+ bytenr = ce->start + stripe_nr * rmap_len;
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
break;
@@ -1057,28 +1115,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
*logical = buf;
*naddrs = nr;
- *stripe_len = map->stripe_len;
+ *stripe_len = rmap_len;
return 0;
}
+static inline int parity_smaller(u64 a, u64 b)
+{
+ return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_multi_bio *bbio, u64 *raid_map)
+{
+ struct btrfs_bio_stripe s;
+ int i;
+ u64 l;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < bbio->num_stripes - 1; i++) {
+ if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ s = bbio->stripes[i];
+ l = raid_map[i];
+ bbio->stripes[i] = bbio->stripes[i+1];
+ raid_map[i] = raid_map[i+1];
+ bbio->stripes[i+1] = s;
+ raid_map[i+1] = l;
+ again = 1;
+ }
+ }
+ }
+}
+
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num)
+ struct btrfs_multi_bio **multi_ret, int mirror_num,
+ u64 **raid_map_ret)
{
return __btrfs_map_block(map_tree, rw, logical, length, NULL,
- multi_ret, mirror_num);
+ multi_ret, mirror_num, raid_map_ret);
}
int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length, u64 *type,
- struct btrfs_multi_bio **multi_ret, int mirror_num)
+ struct btrfs_multi_bio **multi_ret, int mirror_num,
+ u64 **raid_map_ret)
{
struct cache_extent *ce;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
+ u64 *raid_map = NULL;
int stripes_allocated = 8;
int stripes_required = 1;
int stripe_index;
@@ -1118,11 +1208,26 @@ again:
stripes_required = map->sub_stripes;
}
}
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
+ && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
+ /* RAID[56] write or recovery. Return all stripes */
+ stripes_required = map->num_stripes;
+
+ /* Only allocate the map if we've already got a large enough multi_ret */
+ if (stripes_allocated >= stripes_required) {
+ raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ if (!raid_map) {
+ kfree(multi);
+ return -ENOMEM;
+ }
+ }
+ }
+
/* if our multi bio struct is too small, back off and try again */
- if (multi_ret && rw == WRITE &&
- stripes_allocated < stripes_required) {
- stripes_allocated = map->num_stripes;
+ if (multi_ret && stripes_allocated < stripes_required) {
+ stripes_allocated = stripes_required;
kfree(multi);
+ multi = NULL;
goto again;
}
stripe_nr = offset;
@@ -1139,6 +1244,7 @@ again:
stripe_offset = offset - stripe_offset;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
/* we limit the length of each bio to what fits in a stripe */
@@ -1177,6 +1283,59 @@ again:
multi->num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+
+ if (raid_map) {
+ int i, rot;
+ u64 tmp;
+ u64 raid56_full_stripe_start;
+ u64 full_stripe_len = nr_data_stripes(map) * map->stripe_len;
+
+ /*
+ * align the start of our data stripe in the logical
+ * address space
+ */
+ raid56_full_stripe_start = offset / full_stripe_len;
+ raid56_full_stripe_start *= full_stripe_len;
+
+ /* get the data stripe number */
+ stripe_nr = raid56_full_stripe_start / map->stripe_len;
+ stripe_nr = stripe_nr / nr_data_stripes(map);
+
+ /* Work out the disk rotation on this stripe-set */
+ rot = stripe_nr % map->num_stripes;
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+
+ for (i = 0; i < nr_data_stripes(map); i++)
+ raid_map[(i+rot) % map->num_stripes] =
+ ce->start + (tmp + i) * map->stripe_len;
+
+ raid_map[(i+rot) % map->num_stripes] = BTRFS_RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ raid_map[(i+rot+1) % map->num_stripes] = BTRFS_RAID6_Q_STRIPE;
+
+ *length = map->stripe_len;
+ stripe_index = 0;
+ stripe_offset = 0;
+ multi->num_stripes = map->num_stripes;
+ } else {
+ stripe_index = stripe_nr % nr_data_stripes(map);
+ stripe_nr = stripe_nr / nr_data_stripes(map);
+
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ if (mirror_num > 1)
+ stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
+ }
} else {
/*
* after this do_div call, stripe_nr is the number of stripes
@@ -1196,8 +1355,14 @@ again:
stripe_index++;
}
*multi_ret = multi;
+
if (type)
*type = map->type;
+
+ if (raid_map) {
+ sort_parity_stripes(multi, raid_map);
+ *raid_map_ret = raid_map;
+ }
out:
return 0;
}
@@ -1222,6 +1387,22 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
return NULL;
}
+struct btrfs_device *btrfs_find_device_by_devid(struct btrfs_root *root,
+ u64 devid, int instance)
+{
+ struct list_head *head = &root->fs_info->fs_devices->devices;
+ struct btrfs_device *dev;
+ struct list_head *cur;
+ int num_found = 0;
+
+ list_for_each(cur, head) {
+ dev = list_entry(cur, struct btrfs_device, dev_list);
+ if (dev->devid == devid && num_found++ == instance)
+ return dev;
+ }
+ return NULL;
+}
+
int btrfs_bootstrap_super_map(struct btrfs_mapping_tree *map_tree,
struct btrfs_fs_devices *fs_devices)
{
@@ -1460,18 +1641,9 @@ static int read_one_dev(struct btrfs_root *root,
return ret;
}
-int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
-{
- struct btrfs_dev_item *dev_item;
-
- dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
- dev_item);
- return read_one_dev(root, buf, dev_item);
-}
-
int btrfs_read_sys_array(struct btrfs_root *root)
{
- struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+ struct btrfs_super_block *super_copy = root->fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
@@ -1489,7 +1661,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
- write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
+ write_extent_buffer(sb, super_copy, 0, sizeof(*super_copy));
array_size = btrfs_super_sys_array_size(super_copy);
/*
@@ -1588,9 +1760,9 @@ again:
goto again;
}
- btrfs_free_path(path);
ret = 0;
error:
+ btrfs_free_path(path);
return ret;
}
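
A note on the RAID5/6 mapping introduced in __btrfs_map_block() above: for a write (or a recovery read), the code aligns the offset to a full stripe (nr_data_stripes * stripe_len), derives the stripe number, rotates the parity position by stripe_nr % num_stripes, and records each slot's logical address (or a P/Q marker) in raid_map. The following standalone sketch is not taken from the commit; it reproduces that arithmetic with illustrative names and a fixed 64 KiB stripe length (the value find_raid56_stripe_len() currently hard-codes), and P_STRIPE/Q_STRIPE stand in for the BTRFS_RAID5_P_STRIPE and BTRFS_RAID6_Q_STRIPE sentinels.

/*
 * Standalone illustration of the RAID5/6 full-stripe geometry used by the
 * patch above.  Names and the simplified layout are assumptions for this
 * sketch; the real code operates on struct map_lookup and btrfs_multi_bio.
 */
#include <stdio.h>
#include <stdint.h>

#define STRIPE_LEN      (64 * 1024)     /* matches find_raid56_stripe_len() */
#define P_STRIPE        ((uint64_t)-2)  /* stands in for BTRFS_RAID5_P_STRIPE */
#define Q_STRIPE        ((uint64_t)-1)  /* stands in for BTRFS_RAID6_Q_STRIPE */

static void map_full_stripe(uint64_t offset, int num_stripes, int nr_parity,
                            uint64_t chunk_start, uint64_t *raid_map)
{
        int nr_data = num_stripes - nr_parity;
        uint64_t full_stripe_len = (uint64_t)nr_data * STRIPE_LEN;
        /* align the offset to the start of its full stripe */
        uint64_t full_stripe_start = offset / full_stripe_len * full_stripe_len;
        uint64_t stripe_nr = full_stripe_start / STRIPE_LEN / nr_data;
        int rot = stripe_nr % num_stripes;      /* per-stripe parity rotation */
        uint64_t tmp = stripe_nr * nr_data;
        int i;

        /* data stripes, rotated around the devices */
        for (i = 0; i < nr_data; i++)
                raid_map[(i + rot) % num_stripes] =
                        chunk_start + (tmp + i) * STRIPE_LEN;

        /* parity (and, for RAID6, the syndrome) follow the data stripes */
        raid_map[(i + rot) % num_stripes] = P_STRIPE;
        if (nr_parity == 2)
                raid_map[(i + rot + 1) % num_stripes] = Q_STRIPE;
}

int main(void)
{
        uint64_t raid_map[4];
        int i;

        /* 4-device RAID6 chunk: 2 data stripes + P + Q per full stripe */
        map_full_stripe(3 * 64 * 1024, 4, 2, 0, raid_map);
        for (i = 0; i < 4; i++) {
                if (raid_map[i] == P_STRIPE)
                        printf("stripe %d: P\n", i);
                else if (raid_map[i] == Q_STRIPE)
                        printf("stripe %d: Q\n", i);
                else
                        printf("stripe %d: logical %llu\n", i,
                               (unsigned long long)raid_map[i]);
        }
        return 0;
}

With a 4-device RAID6 chunk and an offset of 192 KiB, the sketch places the two data stripes on slots 1 and 2 and P/Q on slots 3 and 0, matching the rotation the patch computes before sort_parity_stripes() reorders the set.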
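sort_parity_stripes() relies on BTRFS_RAID5_P_STRIPE and BTRFS_RAID6_Q_STRIPE being the two largest u64 values: bubble-sorting the stripes by their raid_map entry therefore leaves the data stripes first, in ascending logical order, with parity and then syndrome at the end, while each stripe's physical mapping moves in lock-step. A minimal demonstration of that invariant, again with illustrative names rather than the btrfs structures:

#include <stdio.h>
#include <stdint.h>

#define P_STRIPE ((uint64_t)-2)   /* stands in for BTRFS_RAID5_P_STRIPE */
#define Q_STRIPE ((uint64_t)-1)   /* stands in for BTRFS_RAID6_Q_STRIPE */

/* Sort "physical" in lock-step with raid_map, parity/syndrome last. */
static void sort_parity(uint64_t *raid_map, uint64_t *physical, int n)
{
        int again = 1, i;

        while (again) {
                again = 0;
                for (i = 0; i < n - 1; i++) {
                        if (raid_map[i] > raid_map[i + 1]) {
                                uint64_t l = raid_map[i], p = physical[i];

                                raid_map[i] = raid_map[i + 1];
                                physical[i] = physical[i + 1];
                                raid_map[i + 1] = l;
                                physical[i + 1] = p;
                                again = 1;
                        }
                }
        }
}

int main(void)
{
        /* rotated RAID6 stripe set: Q, data, data, P (see previous sketch) */
        uint64_t raid_map[4] = { Q_STRIPE, 131072, 196608, P_STRIPE };
        uint64_t physical[4] = { 4096, 8192, 12288, 16384 };
        int i;

        sort_parity(raid_map, physical, 4);
        for (i = 0; i < 4; i++)
                printf("%d: raid_map=%llu physical=%llu\n", i,
                       (unsigned long long)raid_map[i],
                       (unsigned long long)physical[i]);
        return 0;
}

Running it on the rotated set from the previous sketch yields the data stripes first and P/Q last, which is the ordering the RAID5/6 write path expects from btrfs_map_block().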