diff options
author | Samuel Just <sam.just@inktank.com> | 2013-09-17 08:26:51 -0700 |
---|---|---|
committer | Samuel Just <sam.just@inktank.com> | 2013-10-04 13:49:55 -0700 |
commit | 664b589b05243b30a92ac3642958d56fb9144e3d (patch) | |
tree | a89e97a78da51ec467f4f415f1eff954dba10a80 | |
parent | e73ec48371fffbb16b03e57b157e35c087e0b342 (diff) | |
download | ceph-664b589b05243b30a92ac3642958d56fb9144e3d.tar.gz |
ReplicatedPG: don't rescan the local collection if we can avoid it
Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- | src/osd/PG.h | 2 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.cc | 85 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.h | 8 |
3 files changed, 78 insertions, 17 deletions
diff --git a/src/osd/PG.h b/src/osd/PG.h index 74809eea268..78377d03ad6 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -439,6 +439,7 @@ protected: */ struct BackfillInterval { // info about a backfill interval on a peer + eversion_t version; /// version at which the scan occurred map<hobject_t,eversion_t> objects; hobject_t begin; hobject_t end; @@ -447,6 +448,7 @@ protected: void clear() { objects.clear(); begin = end = hobject_t(); + version = eversion_t(); } void reset(hobject_t start) { diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index a661aa7f786..cc156a16e97 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -1531,8 +1531,9 @@ void ReplicatedPG::do_scan( BackfillInterval bi; osr->flush(); + bi.begin = m->begin; scan_range( - m->begin, cct->_conf->osd_backfill_scan_min, + cct->_conf->osd_backfill_scan_min, cct->_conf->osd_backfill_scan_max, &bi, handle); MOSDPGScan *reply = new MOSDPGScan(MOSDPGScan::OP_SCAN_DIGEST, get_osdmap()->get_epoch(), m->query_epoch, @@ -7953,17 +7954,12 @@ int ReplicatedPG::recover_backfill( << " interval " << pbi.begin << "-" << pbi.end << " " << pbi.objects.size() << " objects" << dendl; - int local_min = osd->store->get_ideal_list_min(); - int local_max = osd->store->get_ideal_list_max(); + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; - // re-scan our local interval to cope with recent changes - // FIXME: we could track the eversion_t when we last scanned, and invalidate - // that way. or explicitly modify/invalidate when we actually change specific - // objects. - dout(10) << " rescanning local backfill_info from " << backfill_pos << dendl; - backfill_info.clear(); - osr->flush(); - scan_range(backfill_pos, local_min, local_max, &backfill_info, handle); + // update our local interval to cope with recent changes + backfill_info.begin = backfill_pos; + update_range(&backfill_info, handle); int ops = 0; map<hobject_t, pair<eversion_t, eversion_t> > to_push; @@ -7977,7 +7973,8 @@ int ReplicatedPG::recover_backfill( if (backfill_info.begin <= pbi.begin && !backfill_info.extends_to_end() && backfill_info.empty()) { osr->flush(); - scan_range(backfill_info.end, local_min, local_max, &backfill_info, + backfill_info.begin = backfill_info.end; + scan_range(local_min, local_max, &backfill_info, handle); backfill_info.trim(); } @@ -8138,25 +8135,81 @@ void ReplicatedPG::prep_backfill_object_push( start_recovery_op(oid); recovering.insert(oid); ObjectContextRef obc = get_object_context(oid, false); + + // We need to take the read_lock here in order to flush in-progress writes + obc->ondisk_read_lock(); pgbackend->recover_object( oid, ObjectContextRef(), obc, h); + obc->ondisk_read_unlock(); +} + +void ReplicatedPG::update_range( + BackfillInterval *bi, + ThreadPool::TPHandle &handle) +{ + int local_min = cct->_conf->osd_backfill_scan_min; + int local_max = cct->_conf->osd_backfill_scan_max; + if (bi->version >= info.last_update) { + dout(10) << __func__<< ": bi is current " << dendl; + assert(bi->version == info.last_update); + } else if (bi->version >= info.log_tail) { + assert(!pg_log.get_log().empty()); + dout(10) << __func__<< ": bi is old, (" << bi->version + << ") can be updated with log" << dendl; + list<pg_log_entry_t>::const_iterator i = + pg_log.get_log().log.end(); + --i; + while (i != pg_log.get_log().log.begin() && + i->version > bi->version) { + --i; + } + if (i->version == bi->version) + ++i; + + assert(i != pg_log.get_log().log.end()); + dout(10) << __func__ << ": updating from version " << i->version + << dendl; + for (; i != pg_log.get_log().log.end(); ++i) { + const hobject_t &soid = i->soid; + if (soid >= bi->begin && soid < bi->end) { + if (i->is_update()) { + dout(10) << __func__ << ": " << i->soid << " updated to version " + << i->version << dendl; + bi->objects.erase(i->soid); + bi->objects.insert( + make_pair( + i->soid, + i->version)); + } else if (i->is_delete()) { + dout(10) << __func__ << ": " << i->soid << " removed" << dendl; + bi->objects.erase(i->soid); + } + } + } + bi->version = info.last_update; + } else { + dout(10) << __func__<< ": bi is old, rescanning local backfill_info" + << dendl; + osr->flush(); + scan_range(local_min, local_max, &backfill_info, handle); + } } void ReplicatedPG::scan_range( - hobject_t begin, int min, int max, BackfillInterval *bi, + int min, int max, BackfillInterval *bi, ThreadPool::TPHandle &handle) { assert(is_locked()); - dout(10) << "scan_range from " << begin << dendl; - bi->begin = begin; + dout(10) << "scan_range from " << bi->begin << dendl; + bi->version = info.last_update; bi->objects.clear(); // for good measure vector<hobject_t> ls; ls.reserve(max); - int r = osd->store->collection_list_partial(coll, begin, min, max, + int r = osd->store->collection_list_partial(coll, bi->begin, min, max, 0, &ls, &bi->end); assert(r >= 0); dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index abee57ffe7d..bea793878d6 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -618,10 +618,16 @@ protected: * @bi [out] resulting map of objects to eversion_t's */ void scan_range( - hobject_t begin, int min, int max, BackfillInterval *bi, + int min, int max, BackfillInterval *bi, ThreadPool::TPHandle &handle ); + /// Update a hash range to reflect changes since the last scan + void update_range( + BackfillInterval *bi, ///< [in,out] interval to update + ThreadPool::TPHandle &handle ///< [in] tp handle + ); + void prep_backfill_object_push( hobject_t oid, eversion_t v, eversion_t have, int peer, PGBackend::RecoveryHandle *h); |