summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Kupczyk <akupczyk@redhat.com>2021-05-24 14:49:51 +0200
committerIgor Fedotov <ifed@suse.com>2021-07-20 20:02:06 +0300
commit4383c65c777889ae4344140654fded11615be201 (patch)
treed87a6f4262dbefab224ae1bb862b64d7e8ffcda2
parent7d518f6b629e6292006e88108f6ca78edde2db67 (diff)
downloadceph-4383c65c777889ae4344140654fded11615be201.tar.gz
os/bluestore/bluefs: Remove possibility of bluefs replay log containing files without data
It had been possible to have a bluefs replay log to serialize file metadata (size, allocations), but actual data stored in these allocations is not yet synced to disk. This could happen if _flush_range(h1) allocated space for file h1 on device (like SLOW) that will not be used when flushing future replay log. Such thing can happen when we have h2 that wrote to WAL and out replay log is on DB. After fsync(h2) we write to replay log, wait for fdatasync on WAL and DB. There is no waiting on SLOW, but h1 was dirty and has been serialized to replay log. Solution is to delay notifying replay log that it has to include h1 after finishing fdatasync. Fixes: https://tracker.ceph.com/issues/50965 Signed-off-by: Adam Kupczyk <akupczyk@redhat.com> (cherry picked from commit 03ac53f7d4c83e56f664ad371ffe3bc2d40e1837) Conflicts: (trivial - additional future stuff src/os/bluestore/BlueFS.cc
-rw-r--r--src/os/bluestore/BlueFS.cc63
-rw-r--r--src/os/bluestore/BlueFS.h4
-rw-r--r--src/os/bluestore/BlueRocksEnv.cc2
3 files changed, 40 insertions, 29 deletions
diff --git a/src/os/bluestore/BlueFS.cc b/src/os/bluestore/BlueFS.cc
index 2aa8628e834..4e1aea0a7d6 100644
--- a/src/os/bluestore/BlueFS.cc
+++ b/src/os/bluestore/BlueFS.cc
@@ -2841,6 +2841,33 @@ int BlueFS::_flush_and_sync_log(std::unique_lock<ceph::mutex>& l,
return 0;
}
+int BlueFS::_signal_dirty_to_log(FileWriter *h)
+{
+ h->file->fnode.mtime = ceph_clock_now();
+ ceph_assert(h->file->fnode.ino >= 1);
+ if (h->file->dirty_seq == 0) {
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was clean)" << dendl;
+ } else {
+ if (h->file->dirty_seq != log_seq + 1) {
+ // need re-dirty, erase from list first
+ ceph_assert(dirty_files.count(h->file->dirty_seq));
+ auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
+ dirty_files[h->file->dirty_seq].erase(it);
+ h->file->dirty_seq = log_seq + 1;
+ dirty_files[h->file->dirty_seq].push_back(*h->file);
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (was " << h->file->dirty_seq << ")" << dendl;
+ } else {
+ dout(20) << __func__ << " dirty_seq = " << log_seq + 1
+ << " (unchanged, do nothing) " << dendl;
+ }
+ }
+ return 0;
+}
+
int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
{
dout(10) << __func__ << " " << h << " pos 0x" << std::hex << h->pos
@@ -2876,7 +2903,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
vselector->sub_usage(h->file->vselector_hint, h->file->fnode);
// do not bother to dirty the file if we are overwriting
// previously allocated extents.
- bool must_dirty = false;
+
if (allocated < offset + length) {
// we should never run out of log space here; see the min runway check
// in _flush_and_sync_log.
@@ -2892,7 +2919,7 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
ceph_abort_msg("bluefs enospc");
return r;
}
- must_dirty = true;
+ h->file->is_dirty = true;
}
if (h->file->fnode.size < offset + length) {
h->file->fnode.size = offset + length;
@@ -2900,34 +2927,10 @@ int BlueFS::_flush_range(FileWriter *h, uint64_t offset, uint64_t length)
// we do not need to dirty the log file (or it's compacting
// replacement) when the file size changes because replay is
// smart enough to discover it on its own.
- must_dirty = true;
- }
- }
- if (must_dirty) {
- h->file->fnode.mtime = ceph_clock_now();
- ceph_assert(h->file->fnode.ino >= 1);
- if (h->file->dirty_seq == 0) {
- h->file->dirty_seq = log_seq + 1;
- dirty_files[h->file->dirty_seq].push_back(*h->file);
- dout(20) << __func__ << " dirty_seq = " << log_seq + 1
- << " (was clean)" << dendl;
- } else {
- if (h->file->dirty_seq != log_seq + 1) {
- // need re-dirty, erase from list first
- ceph_assert(dirty_files.count(h->file->dirty_seq));
- auto it = dirty_files[h->file->dirty_seq].iterator_to(*h->file);
- dirty_files[h->file->dirty_seq].erase(it);
- h->file->dirty_seq = log_seq + 1;
- dirty_files[h->file->dirty_seq].push_back(*h->file);
- dout(20) << __func__ << " dirty_seq = " << log_seq + 1
- << " (was " << h->file->dirty_seq << ")" << dendl;
- } else {
- dout(20) << __func__ << " dirty_seq = " << log_seq + 1
- << " (unchanged, do nothing) " << dendl;
- }
+ h->file->is_dirty = true;
}
}
- dout(20) << __func__ << " file now " << h->file->fnode << dendl;
+ dout(20) << __func__ << " file now, unflushed " << h->file->fnode << dendl;
uint64_t x_off = 0;
auto p = h->file->fnode.seek(offset, &x_off);
@@ -3148,6 +3151,10 @@ int BlueFS::_fsync(FileWriter *h, std::unique_lock<ceph::mutex>& l)
int r = _flush(h, true);
if (r < 0)
return r;
+ if (h->file->is_dirty) {
+ _signal_dirty_to_log(h);
+ h->file->is_dirty = false;
+ }
uint64_t old_dirty_seq = h->file->dirty_seq;
_flush_bdev_safely(h);
diff --git a/src/os/bluestore/BlueFS.h b/src/os/bluestore/BlueFS.h
index 524a2544348..990b9d4f9a4 100644
--- a/src/os/bluestore/BlueFS.h
+++ b/src/os/bluestore/BlueFS.h
@@ -120,6 +120,7 @@ public:
uint64_t dirty_seq;
bool locked;
bool deleted;
+ bool is_dirty;
boost::intrusive::list_member_hook<> dirty_item;
std::atomic_int num_readers, num_writers;
@@ -135,6 +136,7 @@ public:
dirty_seq(0),
locked(false),
deleted(false),
+ is_dirty(false),
num_readers(0),
num_writers(0),
num_reading(0),
@@ -360,6 +362,8 @@ private:
int _allocate_without_fallback(uint8_t id, uint64_t len,
PExtentVector* extents);
+ /* signal replay log to include h->file in nearest log flush */
+ int _signal_dirty_to_log(FileWriter *h);
int _flush_range(FileWriter *h, uint64_t offset, uint64_t length);
int _flush(FileWriter *h, bool focce, std::unique_lock<ceph::mutex>& l);
int _flush(FileWriter *h, bool force, bool *flushed = nullptr);
diff --git a/src/os/bluestore/BlueRocksEnv.cc b/src/os/bluestore/BlueRocksEnv.cc
index 9733fc9a428..0794b87ad4a 100644
--- a/src/os/bluestore/BlueRocksEnv.cc
+++ b/src/os/bluestore/BlueRocksEnv.cc
@@ -213,7 +213,7 @@ class BlueRocksWritableFile : public rocksdb::WritableFile {
}
rocksdb::Status Close() override {
- fs->flush(h, true);
+ fs->fsync(h);
// mimic posix env, here. shrug.
size_t block_size;