summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSamuel Just <sam.just@inktank.com>2013-09-17 08:26:51 -0700
committerSamuel Just <sam.just@inktank.com>2013-10-04 13:49:55 -0700
commit664b589b05243b30a92ac3642958d56fb9144e3d (patch)
treea89e97a78da51ec467f4f415f1eff954dba10a80
parente73ec48371fffbb16b03e57b157e35c087e0b342 (diff)
downloadceph-664b589b05243b30a92ac3642958d56fb9144e3d.tar.gz
ReplicatedPG: don't rescan the local collection if we can avoid it
Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r--src/osd/PG.h2
-rw-r--r--src/osd/ReplicatedPG.cc85
-rw-r--r--src/osd/ReplicatedPG.h8
3 files changed, 78 insertions, 17 deletions
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 74809eea268..78377d03ad6 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -439,6 +439,7 @@ protected:
*/
struct BackfillInterval {
// info about a backfill interval on a peer
+ eversion_t version; /// version at which the scan occurred
map<hobject_t,eversion_t> objects;
hobject_t begin;
hobject_t end;
@@ -447,6 +448,7 @@ protected:
void clear() {
objects.clear();
begin = end = hobject_t();
+ version = eversion_t();
}
void reset(hobject_t start) {
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index a661aa7f786..cc156a16e97 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -1531,8 +1531,9 @@ void ReplicatedPG::do_scan(
BackfillInterval bi;
osr->flush();
+ bi.begin = m->begin;
scan_range(
- m->begin, cct->_conf->osd_backfill_scan_min,
+ cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max, &bi, handle);
MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap()->get_epoch(), m->query_epoch,
@@ -7953,17 +7954,12 @@ int ReplicatedPG::recover_backfill(
<< " interval " << pbi.begin << "-" << pbi.end
<< " " << pbi.objects.size() << " objects" << dendl;
- int local_min = osd->store->get_ideal_list_min();
- int local_max = osd->store->get_ideal_list_max();
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
- // re-scan our local interval to cope with recent changes
- // FIXME: we could track the eversion_t when we last scanned, and invalidate
- // that way. or explicitly modify/invalidate when we actually change specific
- // objects.
- dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl;
- backfill_info.clear();
- osr->flush();
- scan_range(backfill_pos, local_min, local_max, &backfill_info, handle);
+ // update our local interval to cope with recent changes
+ backfill_info.begin = backfill_pos;
+ update_range(&backfill_info, handle);
int ops = 0;
map<hobject_t, pair<eversion_t, eversion_t> > to_push;
@@ -7977,7 +7973,8 @@ int ReplicatedPG::recover_backfill(
if (backfill_info.begin <= pbi.begin &&
!backfill_info.extends_to_end() && backfill_info.empty()) {
osr->flush();
- scan_range(backfill_info.end, local_min, local_max, &backfill_info,
+ backfill_info.begin = backfill_info.end;
+ scan_range(local_min, local_max, &backfill_info,
handle);
backfill_info.trim();
}
@@ -8138,25 +8135,81 @@ void ReplicatedPG::prep_backfill_object_push(
start_recovery_op(oid);
recovering.insert(oid);
ObjectContextRef obc = get_object_context(oid, false);
+
+ // We need to take the read_lock here in order to flush in-progress writes
+ obc->ondisk_read_lock();
pgbackend->recover_object(
oid,
ObjectContextRef(),
obc,
h);
+ obc->ondisk_read_unlock();
+}
+
+void ReplicatedPG::update_range(
+ BackfillInterval *bi,
+ ThreadPool::TPHandle &handle)
+{
+ int local_min = cct->_conf->osd_backfill_scan_min;
+ int local_max = cct->_conf->osd_backfill_scan_max;
+ if (bi->version >= info.last_update) {
+ dout(10) << __func__<< ": bi is current " << dendl;
+ assert(bi->version == info.last_update);
+ } else if (bi->version >= info.log_tail) {
+ assert(!pg_log.get_log().empty());
+ dout(10) << __func__<< ": bi is old, (" << bi->version
+ << ") can be updated with log" << dendl;
+ list<pg_log_entry_t>::const_iterator i =
+ pg_log.get_log().log.end();
+ --i;
+ while (i != pg_log.get_log().log.begin() &&
+ i->version > bi->version) {
+ --i;
+ }
+ if (i->version == bi->version)
+ ++i;
+
+ assert(i != pg_log.get_log().log.end());
+ dout(10) << __func__ << ": updating from version " << i->version
+ << dendl;
+ for (; i != pg_log.get_log().log.end(); ++i) {
+ const hobject_t &soid = i->soid;
+ if (soid >= bi->begin && soid < bi->end) {
+ if (i->is_update()) {
+ dout(10) << __func__ << ": " << i->soid << " updated to version "
+ << i->version << dendl;
+ bi->objects.erase(i->soid);
+ bi->objects.insert(
+ make_pair(
+ i->soid,
+ i->version));
+ } else if (i->is_delete()) {
+ dout(10) << __func__ << ": " << i->soid << " removed" << dendl;
+ bi->objects.erase(i->soid);
+ }
+ }
+ }
+ bi->version = info.last_update;
+ } else {
+ dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
+ << dendl;
+ osr->flush();
+ scan_range(local_min, local_max, &backfill_info, handle);
+ }
}
void ReplicatedPG::scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle)
{
assert(is_locked());
- dout(10) << "scan_range from " << begin << dendl;
- bi->begin = begin;
+ dout(10) << "scan_range from " << bi->begin << dendl;
+ bi->version = info.last_update;
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
ls.reserve(max);
- int r = osd->store->collection_list_partial(coll, begin, min, max,
+ int r = osd->store->collection_list_partial(coll, bi->begin, min, max,
0, &ls, &bi->end);
assert(r >= 0);
dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index abee57ffe7d..bea793878d6 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -618,10 +618,16 @@ protected:
* @bi [out] resulting map of objects to eversion_t's
*/
void scan_range(
- hobject_t begin, int min, int max, BackfillInterval *bi,
+ int min, int max, BackfillInterval *bi,
ThreadPool::TPHandle &handle
);
+ /// Update a hash range to reflect changes since the last scan
+ void update_range(
+ BackfillInterval *bi, ///< [in,out] interval to update
+ ThreadPool::TPHandle &handle ///< [in] tp handle
+ );
+
void prep_backfill_object_push(
hobject_t oid, eversion_t v, eversion_t have, int peer,
PGBackend::RecoveryHandle *h);