diff options
author | Yuri Weinstein <yweinste@redhat.com> | 2021-07-28 11:39:44 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-07-28 11:39:44 -0700 |
commit | d85c152d182151361c7384b2863c1793d41d7aa7 (patch) | |
tree | c39cc02e84fcb3f6f95b9c9da6a7f1fc94a816ba | |
parent | 57697b978ea3a0f6253bc0f0b226dc254596e3f1 (diff) | |
parent | b2035cc1154283f917d18c8f2ceb1781c09a852a (diff) | |
download | ceph-d85c152d182151361c7384b2863c1793d41d7aa7.tar.gz |
Merge pull request #42374 from ifed01/wip-ifed-bluefs-safer-flush-oct
octopus: os/bluestore: Remove possibility of replay log and file inconsistency
Reviewed-by: Deepika Upadhyay <dupadhya@redhat.com>
-rw-r--r-- | src/os/bluestore/BlueFS.cc | 75 | ||||
-rw-r--r-- | src/os/bluestore/BlueFS.h | 6 | ||||
-rw-r--r-- | src/os/bluestore/BlueRocksEnv.cc | 2 | ||||
-rw-r--r-- | src/test/objectstore/test_bluefs.cc | 69 |
4 files changed, 123 insertions, 29 deletions
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc index 2aa8628e834..bc6d34012bc 100644 --- a/src/os/bluestore/BlueFS.cc +++ b/src/os/bluestore/BlueFS.cc @@ -2841,6 +2841,33 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l, return 0; } +int BlueFS::_signal_dirty_to_log(FileWriter *h) +{ + h->file->fnode.mtime = ceph_clock_now(); + ceph_assert(h->file->fnode.ino >= 1); + if (h->file->dirty_seq == 0) { + h->file->dirty_seq = log_seq + 1; + dirty_files[h->file->dirty_seq].push_back(*h->file); + dout(20) << __func__ << " dirty_seq = " << log_seq + 1 + << " (was clean)" << dendl; + } else { + if (h->file->dirty_seq != log_seq + 1) { + // need re-dirty, erase from list first + ceph_assert(dirty_files.count(h->file->dirty_seq)); + auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file); + dirty_files[h->file->dirty_seq].erase(it); + h->file->dirty_seq = log_seq + 1; + dirty_files[h->file->dirty_seq].push_back(*h->file); + dout(20) << __func__ << " dirty_seq = " << log_seq + 1 + << " (was " << h->file->dirty_seq << ")" << dendl; + } else { + dout(20) << __func__ << " dirty_seq = " << log_seq + 1 + << " (unchanged, do nothing) " << dendl; + } + } + return 0; +} + int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) { dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos @@ -2876,7 +2903,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) vselector->sub_usage(h->file->vselector_hint, h->file->fnode); // do not bother to dirty the file if we are overwriting // previously allocated extents. - bool must_dirty = false; + if (allocated < offset + length) { // we should never run out of log space here; see the min runway check // in _flush_and_sync_log. @@ -2892,7 +2919,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) ceph_abort_msg("bluefs enospc"); return r; } - must_dirty = true; + h->file->is_dirty = true; } if (h->file->fnode.size < offset + length) { h->file->fnode.size = offset + length; @@ -2900,34 +2927,10 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length) // we do not need to dirty the log file (or it's compacting // replacement) when the file size changes because replay is // smart enough to discover it on its own. - must_dirty = true; + h->file->is_dirty = true; } } - if (must_dirty) { - h->file->fnode.mtime = ceph_clock_now(); - ceph_assert(h->file->fnode.ino >= 1); - if (h->file->dirty_seq == 0) { - h->file->dirty_seq = log_seq + 1; - dirty_files[h->file->dirty_seq].push_back(*h->file); - dout(20) << __func__ << " dirty_seq = " << log_seq + 1 - << " (was clean)" << dendl; - } else { - if (h->file->dirty_seq != log_seq + 1) { - // need re-dirty, erase from list first - ceph_assert(dirty_files.count(h->file->dirty_seq)); - auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file); - dirty_files[h->file->dirty_seq].erase(it); - h->file->dirty_seq = log_seq + 1; - dirty_files[h->file->dirty_seq].push_back(*h->file); - dout(20) << __func__ << " dirty_seq = " << log_seq + 1 - << " (was " << h->file->dirty_seq << ")" << dendl; - } else { - dout(20) << __func__ << " dirty_seq = " << log_seq + 1 - << " (unchanged, do nothing) " << dendl; - } - } - } - dout(20) << __func__ << " file now " << h->file->fnode << dendl; + dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl; uint64_t x_off = 0; auto p = h->file->fnode.seek(offset, &x_off); @@ -3148,6 +3151,10 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l) int r = _flush(h, true); if (r < 0) return r; + if (h->file->is_dirty) { + _signal_dirty_to_log(h); + h->file->is_dirty = false; + } uint64_t old_dirty_seq = h->file->dirty_seq; _flush_bdev_safely(h); @@ -3518,6 +3525,18 @@ void BlueFS::_close_writer(FileWriter *h) delete h; } +uint64_t BlueFS::debug_get_dirty_seq(FileWriter *h) +{ + std::lock_guard l(lock); + return h->file->dirty_seq; +} + +bool BlueFS::debug_get_is_dev_dirty(FileWriter *h, uint8_t dev) +{ + std::lock_guard l(lock); + return h->dirty_devs[dev]; +} + int BlueFS::open_for_read( std::string_view dirname, std::string_view filename, diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h index 524a2544348..659c4c9e615 100644 --- a/src/os/bluestore/BlueFS.h +++ b/src/os/bluestore/BlueFS.h @@ -120,6 +120,7 @@ public: uint64_t dirty_seq; bool locked; bool deleted; + bool is_dirty; boost::intrusive::list_member_hook<> dirty_item; std::atomic_int num_readers, num_writers; @@ -135,6 +136,7 @@ public: dirty_seq(0), locked(false), deleted(false), + is_dirty(false), num_readers(0), num_writers(0), num_reading(0), @@ -360,6 +362,8 @@ private: int _allocate_without_fallback(uint8_t id, uint64_t len, PExtentVector* extents); + /* signal replay log to include h->file in nearest log flush */ + int _signal_dirty_to_log(FileWriter *h); int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); int _flush(FileWriter *h, bool focce, std::unique_lock<ceph::mutex>& l); int _flush(FileWriter *h, bool force, bool *flushed = nullptr); @@ -637,6 +641,8 @@ public: const PerfCounters* get_perf_counters() const { return logger; } + uint64_t debug_get_dirty_seq(FileWriter *h); + bool debug_get_is_dev_dirty(FileWriter *h, uint8_t dev); private: // Wrappers for BlockDevice::read(...) and BlockDevice::read_random(...) diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc index 9733fc9a428..0794b87ad4a 100644 --- a/src/os/bluestore/BlueRocksEnv.cc +++ b/src/os/bluestore/BlueRocksEnv.cc @@ -213,7 +213,7 @@ class BlueRocksWritableFile : public rocksdb::WritableFile { } rocksdb::Status Close() override { - fs->flush(h, true); + fs->fsync(h); // mimic posix env, here. shrug. size_t block_size; diff --git a/src/test/objectstore/test_bluefs.cc b/src/test/objectstore/test_bluefs.cc index 68db26f77f5..6f274db6165 100644 --- a/src/test/objectstore/test_bluefs.cc +++ b/src/test/objectstore/test_bluefs.cc @@ -840,6 +840,75 @@ TEST(BlueFS, test_replay_growth) { fs.umount(); } +TEST(BlueFS, test_tracker_50965) { + uint64_t size_wal = 1048576 * 64; + TempBdev bdev_wal{size_wal}; + uint64_t size_db = 1048576 * 128; + TempBdev bdev_db{size_db}; + uint64_t size_slow = 1048576 * 256; + TempBdev bdev_slow{size_slow}; + + ConfSaver conf(g_ceph_context->_conf); + conf.SetVal("bluefs_min_flush_size", "65536"); + conf.ApplyChanges(); + + BlueFS fs(g_ceph_context); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_WAL, bdev_wal.path, false, 0)); + fs.add_block_extent(BlueFS::BDEV_WAL, 1048576, size_wal - 1048576); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_DB, bdev_db.path, false, 0)); + fs.add_block_extent(BlueFS::BDEV_DB, 1048576, size_db - 1048576); + ASSERT_EQ(0, fs.add_block_device(BlueFS::BDEV_SLOW, bdev_slow.path, false, 0)); + fs.add_block_extent(BlueFS::BDEV_SLOW, 1048576, size_slow - 1048576); + uuid_d fsid; + ASSERT_EQ(0, fs.mkfs(fsid, { BlueFS::BDEV_DB, true, true })); + ASSERT_EQ(0, fs.mount()); + ASSERT_EQ(0, fs.maybe_verify_layout({ BlueFS::BDEV_DB, true, true })); + + string dir_slow = "dir.slow"; + ASSERT_EQ(0, fs.mkdir(dir_slow)); + string dir_db = "dir_db"; + ASSERT_EQ(0, fs.mkdir(dir_db)); + + string file_slow = "file"; + BlueFS::FileWriter *h_slow; + ASSERT_EQ(0, fs.open_for_write(dir_slow, file_slow, &h_slow, false)); + ASSERT_NE(nullptr, h_slow); + + string file_db = "file"; + BlueFS::FileWriter *h_db; + ASSERT_EQ(0, fs.open_for_write(dir_db, file_db, &h_db, false)); + ASSERT_NE(nullptr, h_db); + + bufferlist bl1; + std::unique_ptr<char[]> buf1 = gen_buffer(70000); + bufferptr bp1 = buffer::claim_char(70000, buf1.get()); + bl1.push_back(bp1); + h_slow->append(bl1.c_str(), bl1.length()); + fs.flush(h_slow); + + uint64_t h_slow_dirty_seq_1 = fs.debug_get_dirty_seq(h_slow); + + bufferlist bl2; + std::unique_ptr<char[]> buf2 = gen_buffer(1000); + bufferptr bp2 = buffer::claim_char(1000, buf2.get()); + bl2.push_back(bp2); + h_db->append(bl2.c_str(), bl2.length()); + fs.fsync(h_db); + + uint64_t h_slow_dirty_seq_2 = fs.debug_get_dirty_seq(h_slow); + bool h_slow_dev_dirty = fs.debug_get_is_dev_dirty(h_slow, BlueFS::BDEV_SLOW); + + //problem if allocations are stable in log but slow device is not flushed yet + ASSERT_FALSE(h_slow_dirty_seq_1 != 0 && + h_slow_dirty_seq_2 == 0 && + h_slow_dev_dirty == true); + + fs.close_writer(h_slow); + fs.close_writer(h_db); + + fs.umount(); +} + int main(int argc, char **argv) { vector<const char*> args; argv_to_vec(argc, (const char **)argv, args); |