summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSage Weil <sage@inktank.com>2013-01-18 13:02:54 -0800
committerSage Weil <sage@inktank.com>2013-01-18 13:02:54 -0800
commit609442dad4acff77a3ce03064cc4c90c817b5213 (patch)
tree409a9551b97ce076ae12bf894e45f4ef62309cfd
parent7e8e6491a788caf82abc6d6702ce8646655a8730 (diff)
parent63e33c8ad9df5fc43b5695ebc839a26a3c88487c (diff)
downloadceph-609442dad4acff77a3ce03064cc4c90c817b5213.tar.gz
Merge remote-tracking branch 'gh/wip-scrub-argonaut' into argonaut
-rw-r--r--src/include/byteorder.h6
-rw-r--r--src/osd/OSD.cc36
-rw-r--r--src/osd/PG.cc83
-rw-r--r--src/osd/PG.h17
-rw-r--r--src/osd/ReplicatedPG.cc85
-rw-r--r--src/osd/ReplicatedPG.h11
-rw-r--r--src/osd/osd_types.cc24
-rw-r--r--src/osd/osd_types.h10
8 files changed, 227 insertions, 45 deletions
diff --git a/src/include/byteorder.h b/src/include/byteorder.h
index 797e561a885..f8c74991e7a 100644
--- a/src/include/byteorder.h
+++ b/src/include/byteorder.h
@@ -81,9 +81,9 @@ MAKE_LE_CLASS(32)
MAKE_LE_CLASS(16)
#undef MAKE_LE_CLASS
-#define init_le64(x) { mswab64(x) }
-#define init_le32(x) { mswab32(x) }
-#define init_le16(x) { mswab16(x) }
+#define init_le64(x) { (__u64)mswab64(x) }
+#define init_le32(x) { (__u32)mswab32(x) }
+#define init_le16(x) { (__u16)mswab16(x) }
/*
#define cpu_to_le64(x) (x)
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index f57f2264f74..4caaf46638b 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1323,7 +1323,7 @@ void OSD::load_pgs()
// read pg state, log
pg->read_state(store);
- reg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp);
+ pg->reg_scrub();
// generate state for current mapping
osdmap->pg_to_up_acting_osds(pgid, pg->up, pg->acting);
@@ -1949,9 +1949,7 @@ void OSD::tick()
// periodically kick recovery work queue
recovery_tp.wake();
- if (scrub_should_schedule()) {
- sched_scrub();
- }
+ sched_scrub();
map_lock.get_read();
@@ -3090,11 +3088,11 @@ void OSD::handle_scrub(MOSDScrub *m)
PG *pg = p->second;
pg->lock();
if (pg->is_primary()) {
- if (m->repair)
- pg->state_set(PG_STATE_REPAIR);
- if (pg->queue_scrub()) {
- dout(10) << "queueing " << *pg << " for scrub" << dendl;
- }
+ pg->unreg_scrub();
+ pg->must_scrub = true;
+ pg->must_repair = m->repair;
+ pg->reg_scrub();
+ dout(10) << "marking " << *pg << " for scrub" << dendl;
}
pg->unlock();
}
@@ -3106,11 +3104,11 @@ void OSD::handle_scrub(MOSDScrub *m)
PG *pg = pg_map[*p];
pg->lock();
if (pg->is_primary()) {
- if (m->repair)
- pg->state_set(PG_STATE_REPAIR);
- if (pg->queue_scrub()) {
- dout(10) << "queueing " << *pg << " for scrub" << dendl;
- }
+ pg->unreg_scrub();
+ pg->must_scrub = true;
+ pg->must_repair = m->repair;
+ pg->reg_scrub();
+ dout(10) << "marking " << *pg << " for scrub" << dendl;
}
pg->unlock();
}
@@ -3157,7 +3155,9 @@ void OSD::sched_scrub()
{
assert(osd_lock.is_locked());
- dout(20) << "sched_scrub" << dendl;
+ bool should = scrub_should_schedule();
+
+ dout(20) << "sched_scrub should=" << (int)should << dendl;
pair<utime_t,pg_t> pos;
utime_t max = ceph_clock_now(g_ceph_context);
@@ -3184,7 +3184,9 @@ void OSD::sched_scrub()
sched_scrub_lock.Unlock();
PG *pg = _lookup_lock_pg(pgid);
if (pg) {
- if (pg->is_active() && !pg->sched_scrub()) {
+ if (pg->is_active() &&
+ (should || pg->must_scrub) &&
+ !pg->sched_scrub()) {
pg->unlock();
sched_scrub_lock.Lock();
break;
@@ -5126,7 +5128,7 @@ void OSD::_remove_pg(PG *pg)
// remove from map
pg_map.erase(pgid);
pg->put(); // since we've taken it out of map
- unreg_last_pg_scrub(pg->info.pgid, pg->info.history.last_scrub_stamp);
+ pg->unreg_scrub();
_put_pool(pg->pool);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 2f38dac426e..c9d2a65fa45 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -263,10 +263,10 @@ bool PG::proc_replica_info(int from, pg_info_t &oinfo)
peer_info[from] = oinfo;
might_have_unfound.insert(from);
- osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ unreg_scrub();
if (info.history.merge(oinfo.history))
dirty_info = true;
- osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ reg_scrub();
// stray?
if (!is_acting(from)) {
@@ -1584,6 +1584,7 @@ bool PG::queue_scrub()
if (is_scrubbing()) {
return false;
}
+ must_scrub = false;
state_set(PG_STATE_SCRUBBING);
osd->scrub_wq.queue(this);
return true;
@@ -1918,7 +1919,7 @@ void PG::init(int role, vector<int>& newup, vector<int>& newacting, pg_history_t
info.stats.acting = acting;
info.stats.mapping_epoch = info.history.same_interval_since;
- osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ reg_scrub();
write_info(*t);
write_log(*t);
@@ -2618,6 +2619,20 @@ bool PG::sched_scrub()
return ret;
}
+void PG::reg_scrub()
+{
+ if (must_scrub) {
+ scrub_reg_stamp = utime_t();
+ } else {
+ scrub_reg_stamp = info.history.last_scrub_stamp;
+ }
+ osd->reg_last_pg_scrub(info.pgid, scrub_reg_stamp);
+}
+
+void PG::unreg_scrub()
+{
+ osd->unreg_last_pg_scrub(info.pgid, scrub_reg_stamp);
+}
void PG::sub_op_scrub_map(OpRequestRef op)
{
@@ -2682,8 +2697,13 @@ void PG::_scan_list(ScrubMap &map, vector<hobject_t> &ls)
if (r == 0) {
ScrubMap::object &o = map.objects[poid];
o.size = st.st_size;
+ o.nlinks = st.st_nlink;
assert(!o.negative);
osd->store->getattrs(coll, poid, o.attrs);
+ if (poid.snap != CEPH_SNAPDIR && poid.snap != CEPH_NOSNAP) {
+ // Check snap collections
+ check_snap_collections(st.st_ino, poid, o.attrs, &o.snapcolls);
+ }
dout(25) << "_scan_list " << poid << dendl;
} else {
dout(25) << "_scan_list " << poid << " got " << r << ", skipping" << dendl;
@@ -3022,7 +3042,6 @@ void PG::scrub()
if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
dout(10) << "scrub -- not primary or active or not clean" << dendl;
- state_clear(PG_STATE_REPAIR);
state_clear(PG_STATE_SCRUBBING);
clear_scrub_reserved();
unlock();
@@ -3130,7 +3149,6 @@ void PG::scrub_clear_state()
{
assert(_lock.is_locked());
state_clear(PG_STATE_SCRUBBING);
- state_clear(PG_STATE_REPAIR);
update_stats();
// active -> nothing.
@@ -3138,6 +3156,9 @@ void PG::scrub_clear_state()
osd->requeue_ops(this, waiting_for_active);
+ must_scrub = false;
+ must_repair = false;
+
finalizing_scrub = false;
scrub_block_writes = false;
scrub_active = false;
@@ -3215,6 +3236,7 @@ void PG::_compare_scrubmaps(const map<int,ScrubMap*> &maps,
map<hobject_t, set<int> > &missing,
map<hobject_t, set<int> > &inconsistent,
map<hobject_t, int> &authoritative,
+ map<hobject_t, set<int> > &invalid_snapcolls,
ostream &errorstream)
{
map<hobject_t,ScrubMap::object>::const_iterator i;
@@ -3241,6 +3263,18 @@ void PG::_compare_scrubmaps(const map<int,ScrubMap*> &maps,
// Take first osd to have it as authoritative
auth = j;
} else {
+ // Check snapcolls
+ if (k->snap < CEPH_MAXSNAP) {
+ if (_report_snap_collection_errors(
+ *k,
+ j->first,
+ j->second->objects[*k].attrs,
+ j->second->objects[*k].snapcolls,
+ j->second->objects[*k].nlinks,
+ errorstream)) {
+ invalid_snapcolls[*k].insert(j->first);
+ }
+ }
// Compare
stringstream ss;
if (!_compare_scrub_objects(auth->second->objects[*k],
@@ -3290,7 +3324,7 @@ void PG::scrub_finalize() {
dout(10) << "scrub_finalize has maps, analyzing" << dendl;
int errors = 0, fixed = 0;
- bool repair = state_test(PG_STATE_REPAIR);
+ bool repair = must_repair;
const char *mode = repair ? "repair":"scrub";
if (acting.size() > 1) {
dout(10) << "scrub comparing replica scrub maps" << dendl;
@@ -3300,6 +3334,7 @@ void PG::scrub_finalize() {
// Maps from objects with erros to missing/inconsistent peers
map<hobject_t, set<int> > missing;
map<hobject_t, set<int> > inconsistent;
+ map<hobject_t, set<int> > inconsistent_snapcolls;
// Map from object with errors to good peer
map<hobject_t, int> authoritative;
@@ -3314,9 +3349,22 @@ void PG::scrub_finalize() {
maps[i] = &scrub_received_maps[acting[i]];
}
- _compare_scrubmaps(maps, missing, inconsistent, authoritative, ss);
+ _compare_scrubmaps(
+ maps, missing, inconsistent, authoritative,
+ inconsistent_snapcolls,
+ ss);
+
+ for (map<hobject_t, set<int> >::iterator obj = inconsistent_snapcolls.begin();
+ obj != inconsistent_snapcolls.end();
+ ++obj) {
+ for (set<int>::iterator j = obj->second.begin(); j != obj->second.end(); ++j) {
+ ++errors;
+ ss << info.pgid << " " << mode << " " << " object " << obj->first
+ << " has inconsistent snapcolls on " << *j << std::endl;
+ }
+ }
- if (authoritative.size()) {
+ if (authoritative.size() || inconsistent_snapcolls.size()) {
ss << info.pgid << " " << mode << " " << missing.size() << " missing, "
<< inconsistent.size() << " inconsistent objects\n";
dout(2) << ss.str() << dendl;
@@ -3378,10 +3426,10 @@ void PG::scrub_finalize() {
state_clear(PG_STATE_INCONSISTENT);
// finish up
- osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ unreg_scrub();
info.history.last_scrub = info.last_update;
info.history.last_scrub_stamp = ceph_clock_now(g_ceph_context);
- osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ reg_scrub();
{
ObjectStore::Transaction *t = new ObjectStore::Transaction;
@@ -3674,6 +3722,9 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
state_clear(PG_STATE_DOWN);
state_clear(PG_STATE_RECOVERING);
+ must_scrub = false;
+ must_repair = false;
+
peer_missing.clear();
peer_purged.clear();
@@ -3689,7 +3740,7 @@ void PG::start_peering_interval(const OSDMapRef lastmap,
dout(10) << *this << " canceling deletion!" << dendl;
deleting = false;
osd->remove_wq.dequeue(this);
- osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ reg_scrub();
}
if (role != oldrole) {
@@ -3764,10 +3815,10 @@ void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
dirty_info = true;
}
- osd->unreg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ unreg_scrub();
if (info.history.merge(oinfo.history))
dirty_info = true;
- osd->reg_last_pg_scrub(info.pgid, info.history.last_scrub_stamp);
+ reg_scrub();
// Handle changes to purged_snaps ONLY IF we have caught up
if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
@@ -4471,11 +4522,9 @@ boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
if (msg->info.last_backfill == hobject_t()) {
// restart backfill
- pg->osd->unreg_last_pg_scrub(pg->info.pgid,
- pg->info.history.last_scrub_stamp);
+ pg->unreg_scrub();
pg->info = msg->info;
- pg->osd->reg_last_pg_scrub(pg->info.pgid,
- pg->info.history.last_scrub_stamp);
+ pg->reg_scrub();
pg->log.claim_log(msg->log);
pg->missing.clear();
} else {
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 4a292f6b3d0..1c680bfea2d 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -747,6 +747,8 @@ public:
epoch_t scrub_epoch_start;
ScrubMap primary_scrubmap;
MOSDRepScrub *active_rep_scrub;
+ bool must_scrub, must_repair;
+ utime_t scrub_reg_stamp;
void repair_object(const hobject_t& soid, ScrubMap::object *po, int bad_peer, int ok_peer);
bool _compare_scrub_objects(ScrubMap::object &auth,
@@ -756,6 +758,7 @@ public:
map<hobject_t, set<int> > &missing,
map<hobject_t, set<int> > &inconsistent,
map<hobject_t, int> &authoritative,
+ map<hobject_t, set<int> > &inconsistent_snapcolls,
ostream &errorstream);
void scrub();
void scrub_finalize();
@@ -766,11 +769,24 @@ public:
void build_scrub_map(ScrubMap &map);
void build_inc_scrub_map(ScrubMap &map, eversion_t v);
virtual int _scrub(ScrubMap &map, int& errors, int& fixed) { return 0; }
+ virtual bool _report_snap_collection_errors(
+ const hobject_t &hoid,
+ int osd,
+ const map<string, bufferptr> &attrs,
+ const set<snapid_t> &snapcolls,
+ uint32_t nlinks,
+ ostream &out) { return false; };
+ virtual void check_snap_collections(
+ ino_t hino, const hobject_t &hoid,
+ const map<string, bufferptr> &attrs,
+ set<snapid_t> *snapcolls) {};
void clear_scrub_reserved();
void scrub_reserve_replicas();
void scrub_unreserve_replicas();
bool scrub_all_replicas_reserved() const;
bool sched_scrub();
+ void reg_scrub();
+ void unreg_scrub();
void replica_scrub(class MOSDRepScrub *op);
void sub_op_scrub_map(OpRequestRef op);
@@ -1254,6 +1270,7 @@ public:
scrub_reserved(false), scrub_reserve_failed(false),
scrub_waiting_on(0),
active_rep_scrub(0),
+ must_scrub(false), must_repair(false),
recovery_state(this)
{
pool->get();
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 6446717e2a5..92df99778c6 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -5738,7 +5738,8 @@ void ReplicatedPG::on_change()
scrub_clear_state();
} else if (is_scrubbing()) {
state_clear(PG_STATE_SCRUBBING);
- state_clear(PG_STATE_REPAIR);
+ must_scrub = false;
+ must_repair = false;
}
context_registry_on_change();
@@ -6461,7 +6462,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed)
dout(10) << "_scrub" << dendl;
coll_t c(info.pgid);
- bool repair = state_test(PG_STATE_REPAIR);
+ bool repair = must_repair;
const char *mode = repair ? "repair":"scrub";
// traverse in reverse order.
@@ -6563,10 +6564,15 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed)
}
} else if (soid.snap) {
// it's a clone
- assert(head != hobject_t());
-
stat.num_object_clones++;
+ if (head == hobject_t()) {
+ osd->clog.error() << mode << " " << info.pgid << " " << soid
+ << " found clone without head";
+ ++errors;
+ continue;
+ }
+
if (soid.snap != *curclone) {
osd->clog.error() << mode << " " << info.pgid << " " << soid
<< " expected clone " << *curclone;
@@ -6628,6 +6634,77 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap, int& errors, int& fixed)
return errors;
}
+static set<snapid_t> get_expected_snap_colls(
+ const map<string, bufferptr> &attrs,
+ object_info_t *oi = 0)
+{
+ object_info_t _oi;
+ if (!oi)
+ oi = &_oi;
+
+ set<snapid_t> to_check;
+ map<string, bufferptr>::const_iterator oiiter = attrs.find(OI_ATTR);
+ if (oiiter == attrs.end())
+ return to_check;
+
+ bufferlist oiattr;
+ oiattr.push_back(oiiter->second);
+ *oi = object_info_t(oiattr);
+ if (oi->snaps.size() > 0)
+ to_check.insert(*(oi->snaps.begin()));
+ if (oi->snaps.size() > 1)
+ to_check.insert(*(oi->snaps.rbegin()));
+ return to_check;
+}
+
+bool ReplicatedPG::_report_snap_collection_errors(
+ const hobject_t &hoid,
+ int osd,
+ const map<string, bufferptr> &attrs,
+ const set<snapid_t> &snapcolls,
+ uint32_t nlinks,
+ ostream &out)
+{
+ bool errors = false;
+ set<snapid_t> to_check = get_expected_snap_colls(attrs);
+ if (to_check != snapcolls) {
+ out << info.pgid << " osd." << osd << " inconsistent snapcolls on "
+ << hoid << " found " << snapcolls << " expected " << to_check
+ << std::endl;
+ errors = true;
+ }
+ if (nlinks != snapcolls.size() + 1) {
+ out << info.pgid << " osd." << osd << " unaccounted for links on object "
+ << hoid << " snapcolls " << snapcolls << " nlinks " << nlinks
+ << std::endl;
+ errors = true;
+ }
+ return errors;
+}
+
+void ReplicatedPG::check_snap_collections(
+ ino_t hino,
+ const hobject_t &hoid,
+ const map<string, bufferptr> &attrs,
+ set<snapid_t> *snapcolls)
+{
+ object_info_t oi;
+ set<snapid_t> to_check = get_expected_snap_colls(attrs, &oi);
+
+ for (set<snapid_t>::iterator i = to_check.begin(); i != to_check.end(); ++i) {
+ struct stat st;
+ int r = osd->store->stat(coll_t(info.pgid, *i), hoid, &st);
+ if (r == -ENOENT) {
+ } else if (r == 0) {
+ if (hino == st.st_ino) {
+ snapcolls->insert(*i);
+ }
+ } else {
+ assert(0);
+ }
+ }
+}
+
/*---SnapTrimmer Logging---*/
#undef dout_prefix
#define dout_prefix *_dout << pg->gen_prefix()
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 6387cf889cb..51eaebe0e5f 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -788,6 +788,17 @@ protected:
// -- scrub --
virtual int _scrub(ScrubMap& map, int& errors, int& fixed);
+ virtual bool _report_snap_collection_errors(
+ const hobject_t &hoid,
+ int osd,
+ const map<string, bufferptr> &attrs,
+ const set<snapid_t> &snapcolls,
+ uint32_t nlinks,
+ ostream &out);
+ virtual void check_snap_collections(
+ ino_t hino, const hobject_t &hoid,
+ const map<string, bufferptr> &attrs,
+ set<snapid_t> *snapcolls);
void apply_and_flush_repops(bool requeue);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index c939270f718..5b15afed065 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2508,19 +2508,39 @@ void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
void ScrubMap::object::encode(bufferlist& bl) const
{
- ENCODE_START(2, 2, bl);
+ ENCODE_START(4, 2, bl);
::encode(size, bl);
::encode(negative, bl);
::encode(attrs, bl);
+ ::encode(digest, bl);
+ ::encode(digest_present, bl);
+ ::encode(nlinks, bl);
+ ::encode(snapcolls, bl);
ENCODE_FINISH(bl);
}
void ScrubMap::object::decode(bufferlist::iterator& bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
::decode(size, bl);
::decode(negative, bl);
::decode(attrs, bl);
+ if (struct_v >= 3) {
+ ::decode(digest, bl);
+ ::decode(digest_present, bl);
+ }
+ else {
+ digest = 0;
+ digest_present = false;
+ }
+ if (struct_v >= 4) {
+ ::decode(nlinks, bl);
+ ::decode(snapcolls, bl);
+ } else {
+ /* Indicates that encoder was not aware of this field since stat must
+ * return nlink >= 1 */
+ nlinks = 0;
+ }
DECODE_FINISH(bl);
}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index e2ca0e04515..dde2d7697d2 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -1749,8 +1749,14 @@ struct ScrubMap {
uint64_t size;
bool negative;
map<string,bufferptr> attrs;
-
- object(): size(0), negative(false) {}
+ __u32 digest;
+ bool digest_present;
+ uint32_t nlinks;
+ set<snapid_t> snapcolls;
+
+ object() :
+ size(0), negative(false), digest(0), digest_present(false),
+ nlinks(0) {}
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);