author | David Teigland <teigland@redhat.com> | 2020-09-17 09:40:18 -0500
---|---|---
committer | David Teigland <teigland@redhat.com> | 2020-09-17 16:57:05 -0500
commit | ece962dc5328d4bc3f6d10c835c6d1f1e59cd8e0 (patch) |
tree | 60e9e63cbb3010c473e04609f44e338b6972a132 /lib/device/bcache.c |
parent | 72b931d66407fedc68735324996fa190ebaf1a3f (diff) |
download | lvm2-dev-dct-reopen-rw.tar.gz |
devices: open rw before closing ro from scan (dev-dct-reopen-rw)
A command opens devices RDONLY to scan them.
If it then wants to update the VG, it needs
to open the devices RDWR. Previously it
would close the device and open it again with
the new mode. This left a small window in which
the device was not open and could potentially change.
We also want to avoid opening devices in the
low level metadata writing where errors are
not handled as cleanly. It's better to reopen
all the devices RDWR before updating anything.
To address these issues, open the devices RDWR
after the vg lock is acquired and before the device
is reread to verify its contents in vg_read. The
RDONLY fd is closed after the new RDWR fd is opened.
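A minimal sketch of that ordering is below. The reopen helper and its error handling are hypothetical, but bcache_change_fd() is the call added by this patch to swap the fd behind an already-registered device index:

```c
#include <fcntl.h>
#include <unistd.h>

#include "lib/device/bcache.h"	/* declares bcache_change_fd() after this patch */

/* Hypothetical helper: upgrade an already-scanned device to RDWR
 * without ever leaving it unopened in between. */
static int reopen_rw(const char *path, int di, int *fd)
{
	int fd_rw;

	/* Open the new RDWR descriptor first ... */
	if ((fd_rw = open(path, O_RDWR)) < 0)
		return 0;

	/* ... repoint the bcache device index at it ... */
	if (bcache_change_fd(di, fd_rw) < 0) {
		close(fd_rw);
		return 0;
	}

	/* ... and only then close the old RDONLY descriptor. */
	close(*fd);
	*fd = fd_rw;
	return 1;
}
```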
Because bcache uses the fd as an index into its cache,
the interface to the bcache layer is changed to
accept a per-device index instead of the fd, and a
new array is added to bcache that maps the index to
the fd.
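The mapping is a growable array indexed by a small integer (di). A condensed sketch of the idea follows, with simplified initialization and no table growth; the real versions are bcache_set_fd() and bcache_change_fd() in the patch below:

```c
#include <stdlib.h>

#define FD_TABLE_INC 1024

static int *_fd_table;		/* di -> fd; -1 marks a free slot */
static int _fd_table_size;

/* Register an fd and return a stable device index (di).
 * Cache blocks are keyed by di, not fd. */
static int fd_table_add(int fd)
{
	int i;

	if (!_fd_table) {
		_fd_table_size = FD_TABLE_INC;
		if (!(_fd_table = malloc(sizeof(int) * _fd_table_size)))
			return -1;
		for (i = 0; i < _fd_table_size; i++)
			_fd_table[i] = -1;
	}

	for (i = 0; i < _fd_table_size; i++) {
		if (_fd_table[i] == -1) {
			_fd_table[i] = fd;
			return i;
		}
	}

	return -1;	/* the real bcache_set_fd() grows the table and retries here */
}

/* A reopen only updates the slot; cached blocks keep their di key. */
static void fd_table_change(int di, int fd)
{
	_fd_table[di] = fd;
}
```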
TODO: fix some broken bcache unit tests
Diffstat (limited to 'lib/device/bcache.c')
-rw-r--r-- | lib/device/bcache.c | 188 |
1 file changed, 135 insertions, 53 deletions
diff --git a/lib/device/bcache.c b/lib/device/bcache.c index 03cd4be87..c78445d99 100644 --- a/lib/device/bcache.c +++ b/lib/device/bcache.c @@ -33,6 +33,11 @@ #define SECTOR_SHIFT 9L +#define FD_TABLE_INC 1024 +static int _fd_table_size; +static int *_fd_table; + + //---------------------------------------------------------------- static void log_sys_warn(const char *call) @@ -155,11 +160,11 @@ static void _async_destroy(struct io_engine *ioe) free(e); } -static int _last_byte_fd; +static int _last_byte_di; static uint64_t _last_byte_offset; static int _last_byte_sector_size; -static bool _async_issue(struct io_engine *ioe, enum dir d, int fd, +static bool _async_issue(struct io_engine *ioe, enum dir d, int di, sector_t sb, sector_t se, void *data, void *context) { int r; @@ -183,7 +188,7 @@ static bool _async_issue(struct io_engine *ioe, enum dir d, int fd, /* * If bcache block goes past where lvm wants to write, then clamp it. */ - if ((d == DIR_WRITE) && _last_byte_offset && (fd == _last_byte_fd)) { + if ((d == DIR_WRITE) && _last_byte_offset && (di == _last_byte_di)) { if (offset > _last_byte_offset) { log_error("Limit write at %llu len %llu beyond last byte %llu", (unsigned long long)offset, @@ -268,7 +273,7 @@ static bool _async_issue(struct io_engine *ioe, enum dir d, int fd, memset(&cb->cb, 0, sizeof(cb->cb)); - cb->cb.aio_fildes = (int) fd; + cb->cb.aio_fildes = (int) _fd_table[di]; cb->cb.u.c.buf = data; cb->cb.u.c.offset = offset; cb->cb.u.c.nbytes = nbytes; @@ -276,13 +281,15 @@ static bool _async_issue(struct io_engine *ioe, enum dir d, int fd, #if 0 if (d == DIR_READ) { - log_debug("io R off %llu bytes %llu", + log_debug("io R off %llu bytes %llu di %d fd %d", (unsigned long long)cb->cb.u.c.offset, - (unsigned long long)cb->cb.u.c.nbytes); + (unsigned long long)cb->cb.u.c.nbytes, + di, _fd_table[di]); } else { - log_debug("io W off %llu bytes %llu", + log_debug("io W off %llu bytes %llu di %d fd %d", (unsigned long long)cb->cb.u.c.offset, - (unsigned long long)cb->cb.u.c.nbytes); + (unsigned long long)cb->cb.u.c.nbytes, + di, _fd_table[di]); } #endif @@ -414,7 +421,7 @@ static void _sync_destroy(struct io_engine *ioe) free(e); } -static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd, +static bool _sync_issue(struct io_engine *ioe, enum dir d, int di, sector_t sb, sector_t se, void *data, void *context) { int rv; @@ -430,7 +437,7 @@ static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd, } where = sb * 512; - off = lseek(fd, where, SEEK_SET); + off = lseek(_fd_table[di], where, SEEK_SET); if (off == (off_t) -1) { log_warn("Device seek error %d for offset %llu", errno, (unsigned long long)where); free(io); @@ -445,7 +452,7 @@ static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd, /* * If bcache block goes past where lvm wants to write, then clamp it. 
*/ - if ((d == DIR_WRITE) && _last_byte_offset && (fd == _last_byte_fd)) { + if ((d == DIR_WRITE) && _last_byte_offset && (di == _last_byte_di)) { uint64_t offset = where; uint64_t nbytes = len; sector_t limit_nbytes = 0; @@ -526,9 +533,9 @@ static bool _sync_issue(struct io_engine *ioe, enum dir d, int fd, while (pos < len) { if (d == DIR_READ) - rv = read(fd, (char *)data + pos, len - pos); + rv = read(_fd_table[di], (char *)data + pos, len - pos); else - rv = write(fd, (char *)data + pos, len - pos); + rv = write(_fd_table[di], (char *)data + pos, len - pos); if (rv == -1 && errno == EINTR) continue; @@ -688,7 +695,7 @@ struct bcache { //---------------------------------------------------------------- struct key_parts { - uint32_t fd; + uint32_t di; uint64_t b; } __attribute__ ((packed)); @@ -697,12 +704,12 @@ union key { uint8_t bytes[12]; }; -static struct block *_block_lookup(struct bcache *cache, int fd, uint64_t i) +static struct block *_block_lookup(struct bcache *cache, int di, uint64_t i) { union key k; union radix_value v; - k.parts.fd = fd; + k.parts.di = di; k.parts.b = i; if (radix_tree_lookup(cache->rtree, k.bytes, k.bytes + sizeof(k.bytes), &v)) @@ -716,7 +723,7 @@ static bool _block_insert(struct block *b) union key k; union radix_value v; - k.parts.fd = b->fd; + k.parts.di = b->di; k.parts.b = b->index; v.ptr = b; @@ -727,7 +734,7 @@ static void _block_remove(struct block *b) { union key k; - k.parts.fd = b->fd; + k.parts.di = b->di; k.parts.b = b->index; radix_tree_remove(b->cache->rtree, k.bytes, k.bytes + sizeof(k.bytes)); @@ -869,7 +876,7 @@ static void _issue_low_level(struct block *b, enum dir d) dm_list_move(&cache->io_pending, &b->list); - if (!cache->engine->issue(cache->engine, d, b->fd, sb, se, b->data, b)) { + if (!cache->engine->issue(cache->engine, d, b->di, sb, se, b->data, b)) { /* FIXME: if io_submit() set an errno, return that instead of EIO? */ _complete_io(b, -EIO); return; @@ -945,7 +952,7 @@ static struct block *_find_unused_clean_block(struct bcache *cache) return NULL; } -static struct block *_new_block(struct bcache *cache, int fd, block_address i, bool can_wait) +static struct block *_new_block(struct bcache *cache, int di, block_address i, bool can_wait) { struct block *b; @@ -958,8 +965,8 @@ static struct block *_new_block(struct bcache *cache, int fd, block_address i, b _writeback(cache, 16); // FIXME: magic number _wait_io(cache); } else { - log_debug("bcache no new blocks for fd %d index %u", - fd, (uint32_t) i); + log_debug("bcache no new blocks for di %d index %u", + di, (uint32_t) i); return NULL; } } @@ -968,7 +975,7 @@ static struct block *_new_block(struct bcache *cache, int fd, block_address i, b if (b) { dm_list_init(&b->list); b->flags = 0; - b->fd = fd; + b->di = di; b->index = i; b->ref_count = 0; b->error = 0; @@ -1014,10 +1021,10 @@ static void _miss(struct bcache *cache, unsigned flags) } static struct block *_lookup_or_read_block(struct bcache *cache, - int fd, block_address i, + int di, block_address i, unsigned flags) { - struct block *b = _block_lookup(cache, fd, i); + struct block *b = _block_lookup(cache, di, i); if (b) { // FIXME: this is insufficient. 
We need to also catch a read @@ -1042,7 +1049,7 @@ static struct block *_lookup_or_read_block(struct bcache *cache, } else { _miss(cache, flags); - b = _new_block(cache, fd, i, true); + b = _new_block(cache, di, i, true); if (b) { if (flags & GF_ZERO) _zero_block(b); @@ -1087,6 +1094,7 @@ struct bcache *bcache_create(sector_t block_sectors, unsigned nr_cache_blocks, struct bcache *cache; unsigned max_io = engine->max_io(engine); long pgsize = sysconf(_SC_PAGESIZE); + int i; if (pgsize < 0) { log_warn("WARNING: _SC_PAGESIZE returns negative value."); @@ -1147,6 +1155,18 @@ struct bcache *bcache_create(sector_t block_sectors, unsigned nr_cache_blocks, return NULL; } + _fd_table_size = FD_TABLE_INC; + + if (!(_fd_table = malloc(sizeof(int) * _fd_table_size))) { + cache->engine->destroy(cache->engine); + radix_tree_destroy(cache->rtree); + free(cache); + return NULL; + } + + for (i = 0; i < _fd_table_size; i++) + _fd_table[i] = -1; + return cache; } @@ -1162,6 +1182,9 @@ void bcache_destroy(struct bcache *cache) radix_tree_destroy(cache->rtree); cache->engine->destroy(cache->engine); free(cache); + free(_fd_table); + _fd_table = NULL; + _fd_table_size = 0; } sector_t bcache_block_sectors(struct bcache *cache) @@ -1179,13 +1202,13 @@ unsigned bcache_max_prefetches(struct bcache *cache) return cache->max_io; } -void bcache_prefetch(struct bcache *cache, int fd, block_address i) +void bcache_prefetch(struct bcache *cache, int di, block_address i) { - struct block *b = _block_lookup(cache, fd, i); + struct block *b = _block_lookup(cache, di, i); if (!b) { if (cache->nr_io_pending < cache->max_io) { - b = _new_block(cache, fd, i, false); + b = _new_block(cache, di, i, false); if (b) { cache->prefetches++; _issue_read(b); @@ -1203,12 +1226,15 @@ static void _recycle_block(struct bcache *cache, struct block *b) _free_block(b); } -bool bcache_get(struct bcache *cache, int fd, block_address i, +bool bcache_get(struct bcache *cache, int di, block_address i, unsigned flags, struct block **result) { struct block *b; - b = _lookup_or_read_block(cache, fd, i, flags); + if (di >= _fd_table_size) + goto bad; + + b = _lookup_or_read_block(cache, di, i, flags); if (b) { if (b->error) { if (b->io_dir == DIR_READ) { @@ -1227,10 +1253,10 @@ bool bcache_get(struct bcache *cache, int fd, block_address i, *result = b; return true; } - +bad: *result = NULL; - log_error("bcache failed to get block %u fd %d", (uint32_t) i, fd); + log_error("bcache failed to get block %u di %d", (uint32_t) i, di); return false; } @@ -1294,7 +1320,7 @@ static bool _invalidate_block(struct bcache *cache, struct block *b) if (b->ref_count) { log_warn("bcache_invalidate: block (%d, %llu) still held", - b->fd, (unsigned long long) b->index); + b->di, (unsigned long long) b->index); return false; } @@ -1311,9 +1337,9 @@ static bool _invalidate_block(struct bcache *cache, struct block *b) return true; } -bool bcache_invalidate(struct bcache *cache, int fd, block_address i) +bool bcache_invalidate(struct bcache *cache, int di, block_address i) { - return _invalidate_block(cache, _block_lookup(cache, fd, i)); + return _invalidate_block(cache, _block_lookup(cache, di, i)); } //---------------------------------------------------------------- @@ -1342,14 +1368,14 @@ static bool _invalidate_v(struct radix_tree_iterator *it, if (b->error || _test_flags(b, BF_DIRTY)) { log_warn("bcache_invalidate: block (%d, %llu) still dirty", - b->fd, (unsigned long long) b->index); + b->di, (unsigned long long) b->index); iit->success = false; return true; } if 
(b->ref_count) { log_warn("bcache_invalidate: block (%d, %llu) still held", - b->fd, (unsigned long long) b->index); + b->di, (unsigned long long) b->index); iit->success = false; return true; } @@ -1362,24 +1388,24 @@ static bool _invalidate_v(struct radix_tree_iterator *it, return true; } -bool bcache_invalidate_fd(struct bcache *cache, int fd) +bool bcache_invalidate_di(struct bcache *cache, int di) { union key k; struct invalidate_iterator it; - k.parts.fd = fd; + k.parts.di = di; it.it.visit = _writeback_v; - radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd), &it.it); + radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.di), &it.it); _wait_all(cache); it.success = true; it.it.visit = _invalidate_v; - radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd), &it.it); + radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.di), &it.it); if (it.success) - radix_tree_remove_prefix(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd)); + radix_tree_remove_prefix(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.di)); return it.success; } @@ -1393,7 +1419,7 @@ static bool _abort_v(struct radix_tree_iterator *it, if (b->ref_count) { log_fatal("bcache_abort: block (%d, %llu) still held", - b->fd, (unsigned long long) b->index); + b->di, (unsigned long long) b->index); return true; } @@ -1405,35 +1431,91 @@ static bool _abort_v(struct radix_tree_iterator *it, return true; } -void bcache_abort_fd(struct bcache *cache, int fd) +void bcache_abort_di(struct bcache *cache, int di) { union key k; struct radix_tree_iterator it; - k.parts.fd = fd; + k.parts.di = di; it.visit = _abort_v; - radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd), &it); - radix_tree_remove_prefix(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.fd)); + radix_tree_iterate(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.di), &it); + radix_tree_remove_prefix(cache->rtree, k.bytes, k.bytes + sizeof(k.parts.di)); } //---------------------------------------------------------------- -void bcache_set_last_byte(struct bcache *cache, int fd, uint64_t offset, int sector_size) +void bcache_set_last_byte(struct bcache *cache, int di, uint64_t offset, int sector_size) { - _last_byte_fd = fd; + _last_byte_di = di; _last_byte_offset = offset; _last_byte_sector_size = sector_size; if (!sector_size) _last_byte_sector_size = 512; } -void bcache_unset_last_byte(struct bcache *cache, int fd) +void bcache_unset_last_byte(struct bcache *cache, int di) { - if (_last_byte_fd == fd) { - _last_byte_fd = 0; + if (_last_byte_di == di) { + _last_byte_di = 0; _last_byte_offset = 0; _last_byte_sector_size = 0; } } +int bcache_set_fd(int fd) +{ + int *new_table = NULL; + int new_size = 0; + int i; + + retry: + for (i = 0; i < _fd_table_size; i++) { + if (_fd_table[i] == -1) { + _fd_table[i] = fd; + return i; + } + } + + /* already tried once, shouldn't happen */ + if (new_size) + return -1; + + new_size = _fd_table_size + FD_TABLE_INC; + + new_table = realloc(_fd_table, sizeof(int) * new_size); + if (!new_table) { + log_error("Cannot extend bcache fd table"); + return -1; + } + + for (i = _fd_table_size; i < new_size; i++) + new_table[i] = -1; + + _fd_table = new_table; + _fd_table_size = new_size; + + goto retry; +} + +/* + * Should we check for unflushed or inprogress io on an fd + * prior to doing clear_fd or change_fd? (To catch mistakes; + * the caller should be smart enough to not do that.) 
+ */ + +void bcache_clear_fd(int di) +{ + if (di >= _fd_table_size) + return; + _fd_table[di] = -1; +} + +int bcache_change_fd(int di, int fd) +{ + if (di >= _fd_table_size) + return -1; + _fd_table[di] = fd; + return 1; +} + |
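For context, here is an illustrative scan-side flow using the index-based calls from this patch; this is not lvm's actual label-scan code, and the path handling and block choice are made up:

```c
#include <fcntl.h>
#include <unistd.h>

#include "lib/device/bcache.h"

/* Illustrative only: scan one device RDONLY through bcache, then tear down. */
static void scan_one_device(struct bcache *cache, const char *path)
{
	struct block *b;
	int fd, di;

	if ((fd = open(path, O_RDONLY)) < 0)
		return;

	/* Every bcache call now takes this index instead of the fd. */
	if ((di = bcache_set_fd(fd)) < 0) {
		close(fd);
		return;
	}

	bcache_prefetch(cache, di, 0);

	if (bcache_get(cache, di, 0, 0, &b)) {
		/* ... inspect b->data ... */
		bcache_put(b);
	}

	/* Tear down: drop cached blocks for di, free the slot, close the fd. */
	bcache_invalidate_di(cache, di);
	bcache_clear_fd(di);
	close(fd);
}
```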