diff options
author | Sage Weil <sage@inktank.com> | 2013-10-01 14:18:52 -0700 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-10-01 14:18:52 -0700 |
commit | 0459dc4f463cd6cf9ff6432e41e17b646c683738 (patch) | |
tree | 0683452734a4ea1a0b8fca28ceda8fc5e22d19ea | |
parent | 1bdc3f70344acc46bf9a69d44f7936d346311423 (diff) | |
parent | d421b66293fb3d815ad2cd4c787dd2c39f48d6e8 (diff) | |
download | ceph-0459dc4f463cd6cf9ff6432e41e17b646c683738.tar.gz |
Merge pull request #670 from ceph/wip-osd-whiteout
osd: add basic whiteout infrastructure
Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- | src/osd/ReplicatedPG.cc | 35 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 38 | ||||
-rw-r--r-- | src/osd/osd_types.h | 39 |
3 files changed, 76 insertions, 36 deletions
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index a661aa7f786..960305bd21e 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -981,7 +981,7 @@ void ReplicatedPG::do_op(OpRequestRef op) return; } - if ((op->may_read()) && (obc->obs.oi.lost)) { + if ((op->may_read()) && (obc->obs.oi.is_lost())) { // This object is lost. Reading from it returns an error. dout(20) << __func__ << ": object " << obc->obs.oi.soid << " is lost" << dendl; @@ -991,7 +991,8 @@ void ReplicatedPG::do_op(OpRequestRef op) dout(25) << __func__ << ": object " << obc->obs.oi.soid << " has oi of " << obc->obs.oi << dendl; - if (!op->may_write() && !obc->obs.exists) { + if (!op->may_write() && (!obc->obs.exists || + obc->obs.oi.is_whiteout())) { osd->reply_op_error(op, -ENOENT); return; } @@ -1048,6 +1049,8 @@ void ReplicatedPG::do_op(OpRequestRef op) wait_for_missing_object(wait_oid, op); } else if (r) { osd->reply_op_error(op, r); + } else if (sobc->obs.oi.is_whiteout()) { + osd->reply_op_error(op, -ENOENT); } else { if (sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.get_key() && sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.oid.name && @@ -1102,6 +1105,8 @@ void ReplicatedPG::do_op(OpRequestRef op) wait_for_missing_object(wait_oid, op); } else if (r) { osd->reply_op_error(op, r); + } else if (sobc->obs.oi.is_whiteout()) { + osd->reply_op_error(op, -ENOENT); } else { dout(10) << " clone_oid " << clone_oid << " obc " << sobc << dendl; src_obc[clone_oid] = sobc; @@ -3285,7 +3290,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) if (cct->_conf->osd_tmapput_sets_uses_tmap) { assert(cct->_conf->osd_auto_upgrade_tmap); - oi.uses_tmap = true; + oi.set_flag(object_info_t::FLAG_USES_TMAP); } // write it @@ -3333,7 +3338,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) } set<string> out_set; - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { dout(20) << "CEPH_OSD_OP_OMAPGETKEYS: " << " Reading " << oi.soid << " omap from tmap" << dendl; map<string, bufferlist> vals; @@ -3391,7 +3396,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) } map<string, bufferlist> out_set; - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { dout(20) << "CEPH_OSD_OP_OMAPGETVALS: " << " Reading " << oi.soid << " omap from tmap" << dendl; map<string, bufferlist> vals; @@ -3442,7 +3447,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) case CEPH_OSD_OP_OMAPGETHEADER: ++ctx->num_read; { - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { dout(20) << "CEPH_OSD_OP_OMAPGETHEADER: " << " Reading " << oi.soid << " omap from tmap" << dendl; map<string, bufferlist> vals; @@ -3473,7 +3478,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) goto fail; } map<string, bufferlist> out; - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { dout(20) << "CEPH_OSD_OP_OMAPGET: " << " Reading " << oi.soid << " omap from tmap" << dendl; map<string, bufferlist> vals; @@ -3572,7 +3577,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) case CEPH_OSD_OP_OMAPSETVALS: ++ctx->num_write; { - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { _copy_up_tmap(ctx); } if (!obs.exists) { @@ -3602,7 +3607,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) case CEPH_OSD_OP_OMAPSETHEADER: ++ctx->num_write; { - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { _copy_up_tmap(ctx); } if (!obs.exists) { @@ -3622,7 +3627,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) result = -ENOENT; break; } - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { _copy_up_tmap(ctx); } t.touch(coll, soid); @@ -3638,7 +3643,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) result = -ENOENT; break; } - if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) { + if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) { _copy_up_tmap(ctx); } t.touch(coll, soid); @@ -3817,7 +3822,7 @@ int ReplicatedPG::_get_tmap(OpContext *ctx, int ReplicatedPG::_copy_up_tmap(OpContext *ctx) { dout(20) << "copying up tmap for " << ctx->new_obs.oi.soid << dendl; - ctx->new_obs.oi.uses_tmap = false; + ctx->new_obs.oi.clear_flag(object_info_t::FLAG_USES_TMAP); map<string, bufferlist> vals; bufferlist header; int r = _get_tmap(ctx, &vals, &header); @@ -3877,11 +3882,11 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op) hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()), &rollback_to, false, &cloneid); if (ret) { - if (-ENOENT == ret) { + if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) { // there's no snapshot here, or there's no object. // if there's no snapshot, we delete the object; otherwise, do nothing. dout(20) << "_rollback_to deleting head on " << soid.oid - << " because got ENOENT on find_object_context" << dendl; + << " because got ENOENT|whiteout on find_object_context" << dendl; if (ctx->obc->obs.oi.watchers.size()) { // Cannot delete an object with watchers ret = -EBUSY; @@ -7106,7 +7111,7 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t, obc->ondisk_write_lock(); - obc->obs.oi.lost = true; + obc->obs.oi.set_flag(object_info_t::FLAG_LOST); obc->obs.oi.version = info.last_update; obc->obs.oi.prior_version = version; diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index aa20dc592fa..952f0cf75d7 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -2796,9 +2796,8 @@ void object_info_t::copy_user_bits(const object_info_t& other) last_reqid = other.last_reqid; truncate_seq = other.truncate_seq; truncate_size = other.truncate_size; - lost = other.lost; + flags = other.flags; category = other.category; - uses_tmap = other.uses_tmap; } ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid, @@ -2839,13 +2838,15 @@ void object_info_t::encode(bufferlist& bl) const ::encode(snaps, bl); ::encode(truncate_seq, bl); ::encode(truncate_size, bl); - ::encode(lost, bl); + __u8 flags_lo = flags & 0xff; + __u8 flags_hi = (flags & 0xff00) >> 8; + ::encode(flags_lo, bl); ::encode(old_watchers, bl); /* shenanigans to avoid breaking backwards compatibility in the disk format. * When we can, switch this out for simply putting the version_t on disk. */ eversion_t user_eversion(0, user_version); ::encode(user_eversion, bl); - ::encode(uses_tmap, bl); + ::encode(flags_hi, bl); ::encode(watchers, bl); ENCODE_FINISH(bl); } @@ -2883,20 +2884,26 @@ void object_info_t::decode(bufferlist::iterator& bl) ::decode(snaps, bl); ::decode(truncate_seq, bl); ::decode(truncate_size, bl); - if (struct_v >= 3) - ::decode(lost, bl); - else - lost = false; + if (struct_v >= 3) { + __u8 lo; + ::decode(lo, bl); + flags = (flag_t)lo; + } else { + flags = (flag_t)0; + } if (struct_v >= 4) { ::decode(old_watchers, bl); eversion_t user_eversion; ::decode(user_eversion, bl); user_version = user_eversion.version; } - if (struct_v >= 9) - ::decode(uses_tmap, bl); - else - uses_tmap = true; + if (struct_v >= 9) { + __u8 hi; + ::decode(hi, bl); + flags = (flag_t)(flags | ((unsigned)hi << 8)); + } else { + set_flag(FLAG_USES_TMAP); + } if (struct_v < 10) soid.pool = myoloc.pool; if (struct_v >= 11) { @@ -2924,7 +2931,8 @@ void object_info_t::dump(Formatter *f) const f->dump_stream("last_reqid") << last_reqid; f->dump_unsigned("size", size); f->dump_stream("mtime") << mtime; - f->dump_unsigned("lost", lost); + f->dump_unsigned("lost", (int)is_lost()); + f->dump_unsigned("flags", (int)flags); f->dump_stream("wrlock_by") << wrlock_by; f->open_array_section("snaps"); for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) @@ -2960,8 +2968,10 @@ ostream& operator<<(ostream& out, const object_info_t& oi) out << " wrlock_by=" << oi.wrlock_by; else out << " " << oi.snaps; - if (oi.lost) + if (oi.is_lost()) out << " LOST"; + if (oi.is_whiteout()) + out << " WHITEOUT"; out << ")"; return out; } diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 884b8ada8cc..9d440e25bde 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -2093,7 +2093,16 @@ struct object_info_t { uint64_t size; utime_t mtime; - bool lost; + + // note: these are currently encoded into a total 16 bits; see + // encode()/decode() for the weirdness. + typedef enum { + FLAG_LOST = 1<<0, + FLAG_WHITEOUT = 1<<1, // object logically does not exist + // ... + FLAG_USES_TMAP = 1<<8, + } flag_t; + flag_t flags; osd_reqid_t wrlock_by; // [head] vector<snapid_t> snaps; // [clone] @@ -2102,13 +2111,28 @@ struct object_info_t { map<pair<uint64_t, entity_name_t>, watch_info_t> watchers; - bool uses_tmap; void copy_user_bits(const object_info_t& other); static ps_t legacy_object_locator_to_ps(const object_t &oid, const object_locator_t &loc); + bool test_flag(flag_t f) const { + return (flags & f) == f; + } + void set_flag(flag_t f) { + flags = (flag_t)(flags | f); + } + void clear_flag(flag_t f) { + flags = (flag_t)(flags & ~f); + } + bool is_lost() const { + return test_flag(FLAG_LOST); + } + bool is_whiteout() const { + return test_flag(FLAG_WHITEOUT); + } + void encode(bufferlist& bl) const; void decode(bufferlist::iterator& bl); void decode(bufferlist& bl) { @@ -2119,13 +2143,14 @@ struct object_info_t { static void generate_test_instances(list<object_info_t*>& o); explicit object_info_t() - : user_version(0), size(0), lost(false), - truncate_seq(0), truncate_size(0), uses_tmap(false) + : user_version(0), size(0), flags((flag_t)0), + truncate_seq(0), truncate_size(0) {} object_info_t(const hobject_t& s) - : soid(s), user_version(0), size(0), - lost(false), truncate_seq(0), truncate_size(0), uses_tmap(false) {} + : soid(s), + user_version(0), size(0), flags((flag_t)0), + truncate_seq(0), truncate_size(0) {} object_info_t(bufferlist& bl) { decode(bl); @@ -2135,7 +2160,7 @@ WRITE_CLASS_ENCODER(object_info_t) struct ObjectState { object_info_t oi; - bool exists; + bool exists; ///< the stored object exists (i.e., we will remember the object_info_t) ObjectState() : exists(false) {} |