summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Greg Farnum <greg@inktank.com> 2013-10-02 18:14:23 -0700
committer Greg Farnum <greg@inktank.com> 2013-10-02 18:14:23 -0700
commit a0ba5c66162af720627fcf7ba63fdc76ac97f568 (patch)
tree f50b27ac3bab25274a5e62e1e27cd54e91e3ca75
parent a9df335b12a093c31f947d5ca98883de9c2a5cf9 (diff)
download ceph-wip-journaler-safety.tar.gz
Journaler: do not advance safe_pos inappropriately (branch: wip-journaler-safety)
We were previously advancing safe_pos to the end of the first flush we had started in the queue, without checking whether the flush that just finished was actually the first one we had triggered. When crossing object boundaries, flushes can complete out of order (especially on a degraded cluster), which could lead to problems if the MDS was restarted before the intermediate events had been fully committed.

Now we only advance safe_pos when the flush that just completed is the first one in the queue. This still works as originally intended: any follow-on flushes that completed earlier have already been removed from the pending_safe queue, so we advance either to flush_pos (which may already be far ahead) or to the first position in pending_safe that has not yet finished. And because all of our safety callbacks are triggered by advancing safe_pos (and not merely by receiving a flush completion), our data is safe without any further changes to the code base.

Signed-off-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- src/osdc/Journaler.cc 17
1 files changed, 11 insertions, 6 deletions
diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc
index ba4ca8dc4b9..badfb3c54e8 100644
--- a/src/osdc/Journaler.cc
+++ b/src/osdc/Journaler.cc
@@ -405,12 +405,17 @@ void Journaler::_finish_flush(int r, uint64_t start, utime_t stamp)
}
// adjust safe_pos
- assert(pending_safe.count(start));
- pending_safe.erase(start);
- if (pending_safe.empty())
- safe_pos = flush_pos;
- else
- safe_pos = *pending_safe.begin();
+ std::set<uint64_t>::iterator start_iter = pending_safe.find(start);
+ assert(start_iter != pending_safe.end());
+ if (start_iter == pending_safe.begin()) {
+ pending_safe.erase(start_iter);
+ if (pending_safe.empty())
+ safe_pos = flush_pos;
+ else
+ safe_pos = *pending_safe.begin();
+ } else {
+ pending_safe.erase(start_iter);
+ }
ldout(cct, 10) << "_finish_flush safe from " << start
<< ", pending_safe " << pending_safe