author     Yuri Weinstein <yweinste@redhat.com>  2021-07-28 11:39:44 -0700
committer  GitHub <noreply@github.com>           2021-07-28 11:39:44 -0700
commit     d85c152d182151361c7384b2863c1793d41d7aa7 (patch)
tree       c39cc02e84fcb3f6f95b9c9da6a7f1fc94a816ba
parent     57697b978ea3a0f6253bc0f0b226dc254596e3f1 (diff)
parent     b2035cc1154283f917d18c8f2ceb1781c09a852a (diff)
download   ceph-d85c152d182151361c7384b2863c1793d41d7aa7.tar.gz
Merge pull request #42374 from ifed01/wip-ifed-bluefs-safer-flush-oct
octopus: os/bluestore: Remove possibility of replay log and file inconsistency
Reviewed-by: Deepika Upadhyay <dupadhya@redhat.com>
-rw-r--r--  src/os/bluestore/BlueFS.cc           | 75
-rw-r--r--  src/os/bluestore/BlueFS.h            |  6
-rw-r--r--  src/os/bluestore/BlueRocksEnv.cc     |  2
-rw-r--r--  src/test/objectstore/test_bluefs.cc  | 69
4 files changed, 123 insertions, 29 deletions
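
The patch splits the dirty-file bookkeeping out of _flush_range(): a plain flush now only sets File::is_dirty, and the new _signal_dirty_to_log() registers the file under log_seq + 1 only from _fsync(), after the data itself has been flushed (BlueRocksWritableFile::Close() is switched from flush() to fsync() accordingly). Below is a minimal standalone sketch of that ordering, using toy File/ToyFS types rather than the real BlueFS classes, to illustrate why a log flush triggered by another file's fsync can no longer publish allocations whose data has not reached the device.

// Toy model of the ordering enforced by this patch (not Ceph's actual API).
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

struct File {
  uint64_t dirty_seq = 0;   // 0 == not queued for the next log flush
  bool is_dirty = false;    // set by flush_range, consumed by fsync
};

struct ToyFS {
  uint64_t log_seq = 0;
  std::map<uint64_t, std::vector<File*>> dirty_files;

  void flush_range(File& f) {
    // data gets written here; only remember that metadata changed
    f.is_dirty = true;
  }

  void signal_dirty_to_log(File& f) {
    // counterpart of BlueFS::_signal_dirty_to_log: queue under log_seq + 1
    if (f.dirty_seq != log_seq + 1) {
      if (f.dirty_seq != 0) {
        auto& bucket = dirty_files[f.dirty_seq];
        bucket.erase(std::find(bucket.begin(), bucket.end(), &f));
      }
      f.dirty_seq = log_seq + 1;
      dirty_files[f.dirty_seq].push_back(&f);
    }
  }

  void fsync(File& f) {
    flush_range(f);            // data must hit the device first
    if (f.is_dirty) {
      signal_dirty_to_log(f);  // only now expose the change to the log
      f.is_dirty = false;
    }
    // ... _flush_bdev_safely() and _flush_and_sync_log() would follow here
  }
};

int main() {
  ToyFS fs;
  File slow, db;
  fs.flush_range(slow);         // plain flush: no replay-log entry yet
  assert(slow.dirty_seq == 0);  // pre-patch, an allocating flush set this
  fs.fsync(db);                 // another file's fsync may flush the log...
  assert(slow.dirty_seq == 0);  // ...without pulling in slow's allocations
  fs.fsync(slow);
  assert(slow.dirty_seq == fs.log_seq + 1);
  std::cout << "ordering ok\n";
  return 0;
}

This mirrors the new test_tracker_50965 case below, which flushes one writer, fsyncs another, and asserts the first writer is never simultaneously stable in the log yet unflushed on the slow device.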
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 2aa8628e834..bc6d34012bc 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -2841,6 +2841,33 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
return 0;
}
+int BlueFS::_signal_dirty_to_log(FileWriter *h)
+{
+ h->file->fnode.mtime = ceph_clock_now();
+ ceph_assert(h->file->fnode.ino >= 1);
+ if (h->file->dirty_seq == 0) {
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was clean)" << dendl;
+ } else {
+ if (h->file->dirty_seq != log_seq + 1) {
+ // need re-dirty, erase from list first
+ ceph_assert(dirty_files.count(h->file->dirty_seq));
+ auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
+ dirty_files[h->file->dirty_seq].erase(it);
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was " << h->file->dirty_seq << ")" << dendl;
+ } else {
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (unchanged, do nothing) " << dendl;
+ }
+ }
+ return 0;
+}
+
int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
{
dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
@@ -2876,7 +2903,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
// do not bother to dirty the file if we are overwriting
// previously allocated extents.
- bool must_dirty = false;
+
if (allocated < offset + length) {
// we should never run out of log space here; see the min runway check
// in _flush_and_sync_log.
@@ -2892,7 +2919,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
ceph_abort_msg("bluefs enospc");
return r;
}
- must_dirty = true;
+ h->file->is_dirty = true;
}
if (h->file->fnode.size < offset + length) {
h->file->fnode.size = offset + length;
@@ -2900,34 +2927,10 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
// we do not need to dirty the log file (or it's compacting
// replacement) when the file size changes because replay is
// smart enough to discover it on its own.
- must_dirty = true;
+ h->file->is_dirty = true;
}
}
- if (must_dirty) {
- h->file->fnode.mtime = ceph_clock_now();
- ceph_assert(h->file->fnode.ino >= 1);
- if (h->file->dirty_seq == 0) {
- h->file->dirty_seq = log_seq + 1;
- dirty_files[h->file->dirty_seq].push_back(*h->file);
- dout(20) << __func__ << " dirty_seq = " << log_seq + 1
- << " (was clean)" << dendl;
- } else {
- if (h->file->dirty_seq != log_seq + 1) {
- // need re-dirty, erase from list first
- ceph_assert(dirty_files.count(h->file->dirty_seq));
- auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
- dirty_files[h->file->dirty_seq].erase(it);
- h->file->dirty_seq = log_seq + 1;
- dirty_files[h->file->dirty_seq].push_back(*h->file);
- dout(20) << __func__ << " dirty_seq = " << log_seq + 1
- << " (was " << h->file->dirty_seq << ")" << dendl;
- } else {
- dout(20) << __func__ << " dirty_seq = " << log_seq + 1
- << " (unchanged, do nothing) " << dendl;
- }
- }
- }
- dout(20) << __func__ << " file now " << h->file->fnode << dendl;
+ dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
uint64_t x_off = 0;
auto p = h->file->fnode.seek(offset, &x_off);
@@ -3148,6 +3151,10 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
int r = _flush(h, true);
if (r < 0)
return r;
+ if (h->file->is_dirty) {
+ _signal_dirty_to_log(h);
+ h->file->is_dirty = false;
+ }
uint64_t old_dirty_seq = h->file->dirty_seq;
_flush_bdev_safely(h);
@@ -3518,6 +3525,18 @@ void BlueFS::_close_writer(FileWriter *h)
delete h;
}
+uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h)
+{
+ std::lock_guard l(lock);
+ return h->file->dirty_seq;
+}
+
+bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev)
+{
+ std::lock_guard l(lock);
+ return h->dirty_devs[dev];
+}
+
int BlueFS::open_for_read(
std::string_view dirname,
std::string_view filename,
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 524a2544348..659c4c9e615 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -120,6 +120,7 @@ public:
uint64_t dirty_seq;
bool locked;
bool deleted;
+ bool is_dirty;
boost::intrusive::list_member_hook<> dirty_item;
std::atomic_int num_readers, num_writers;
@@ -135,6 +136,7 @@ public:
dirty_seq(0),
locked(false),
deleted(false),
+ is_dirty(false),
num_readers(0),
num_writers(0),
num_reading(0),
@@ -360,6 +362,8 @@ private:
int _allocate_without_fallback(uint8_t id, uint64_t len,
PExtentVector* extents);
+ /* signal replay log to include h->file in nearest log flush */
+ int _signal_dirty_to_log(FileWriter *h);
int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
int _flush(FileWriter *h, bool focce, std::unique_lock<ceph::mutex>& l);
int _flush(FileWriter *h, bool force, bool *flushed = nullptr);
@@ -637,6 +641,8 @@ public:
const PerfCounters* get_perf_counters() const {
return logger;
}
+ uint64_t debug_get_dirty_seq(FileWriter *h);
+ bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev);
private:
// Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...)
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
index 9733fc9a428..0794b87ad4a 100644
--- a/src/os/bluestore/BlueRocksEnv.cc
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -213,7 +213,7 @@ class BlueRocksWritableFile : public rocksdb::WritableFile {
}
rocksdb::Status Close() override {
- fs->flush(h, true);
+ fs->fsync(h);
// mimic posix env, here. shrug.
size_t block_size;
diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc
index 68db26f77f5..6f274db6165 100644
--- a/src/test/objectstore/test_bluefs.cc
+++ b/src/test/objectstore/test_bluefs.cc
@@ -840,6 +840,75 @@ TEST(BlueFS, test_replay_growth) {
fs.umount();
}
+TEST(BlueFS, test_tracker_50965) {
+ uint64_t size_wal = 1048576 * 64;
+ TempBdev bdev_wal{size_wal};
+ uint64_t size_db = 1048576 * 128;
+ TempBdev bdev_db{size_db};
+ uint64_t size_slow = 1048576 * 256;
+ TempBdev bdev_slow{size_slow};
+
+ ConfSaver conf(g_ceph_context->_conf);
+ conf.SetVal("bluefs_min_flush_size", "65536");
+ conf.ApplyChanges();
+
+ BlueFS fs(g_ceph_context);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_WAL, bdev_wal.path, false, 0));
+ fs.add_block_extent(BlueFS::BDEV_WAL, 1048576, size_wal - 1048576);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0));
+ fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size_db - 1048576);
+ ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0));
+ fs.add_block_extent(BlueFS::BDEV_SLOW, 1048576, size_slow - 1048576);
+ uuid_d fsid;
+ ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, true, true }));
+ ASSERT_EQ(0, fs.mount());
+ ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true }));
+
+ string dir_slow = "dir.slow";
+ ASSERT_EQ(0, fs.mkdir(dir_slow));
+ string dir_db = "dir_db";
+ ASSERT_EQ(0, fs.mkdir(dir_db));
+
+ string file_slow = "file";
+ BlueFS::FileWriter *h_slow;
+ ASSERT_EQ(0, fs.open_for_write(dir_slow, file_slow, &h_slow, false));
+ ASSERT_NE(nullptr, h_slow);
+
+ string file_db = "file";
+ BlueFS::FileWriter *h_db;
+ ASSERT_EQ(0, fs.open_for_write(dir_db, file_db, &h_db, false));
+ ASSERT_NE(nullptr, h_db);
+
+ bufferlist bl1;
+ std::unique_ptr<char[]> buf1 = gen_buffer(70000);
+ bufferptr bp1 = buffer::claim_char(70000, buf1.get());
+ bl1.push_back(bp1);
+ h_slow->append(bl1.c_str(), bl1.length());
+ fs.flush(h_slow);
+
+ uint64_t h_slow_dirty_seq_1 = fs.debug_get_dirty_seq(h_slow);
+
+ bufferlist bl2;
+ std::unique_ptr<char[]> buf2 = gen_buffer(1000);
+ bufferptr bp2 = buffer::claim_char(1000, buf2.get());
+ bl2.push_back(bp2);
+ h_db->append(bl2.c_str(), bl2.length());
+ fs.fsync(h_db);
+
+ uint64_t h_slow_dirty_seq_2 = fs.debug_get_dirty_seq(h_slow);
+ bool h_slow_dev_dirty = fs.debug_get_is_dev_dirty(h_slow, BlueFS::BDEV_SLOW);
+
+ //problem if allocations are stable in log but slow device is not flushed yet
+ ASSERT_FALSE(h_slow_dirty_seq_1 != 0 &&
+ h_slow_dirty_seq_2 == 0 &&
+ h_slow_dev_dirty == true);
+
+ fs.close_writer(h_slow);
+ fs.close_writer(h_db);
+
+ fs.umount();
+}
+
int main(int argc, char **argv) {
vector<const char*> args;
argv_to_vec(argc, (const char **)argv, args);