summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Sage Weil <sage@inktank.com> 2013-10-01 14:18:52 -0700
committer Sage Weil <sage@inktank.com> 2013-10-01 14:18:52 -0700
commit 0459dc4f463cd6cf9ff6432e41e17b646c683738 (patch)
tree 0683452734a4ea1a0b8fca28ceda8fc5e22d19ea
parent 1bdc3f70344acc46bf9a69d44f7936d346311423 (diff)
parent d421b66293fb3d815ad2cd4c787dd2c39f48d6e8 (diff)
download ceph-0459dc4f463cd6cf9ff6432e41e17b646c683738.tar.gz
Merge pull request #670 from ceph/wip-osd-whiteout
osd: add basic whiteout infrastructure

Reviewed-by: Sage Weil <sage@inktank.com>
Reviewed-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- src/osd/ReplicatedPG.cc 35
-rw-r--r-- src/osd/osd_types.cc 38
-rw-r--r-- src/osd/osd_types.h 39
3 files changed, 76 insertions, 36 deletions
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index a661aa7f786..960305bd21e 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -981,7 +981,7 @@ void ReplicatedPG::do_op(OpRequestRef op)
return;
}
- if ((op->may_read()) && (obc->obs.oi.lost)) {
+ if ((op->may_read()) && (obc->obs.oi.is_lost())) {
// This object is lost. Reading from it returns an error.
dout(20) << __func__ << ": object " << obc->obs.oi.soid
<< " is lost" << dendl;
@@ -991,7 +991,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
dout(25) << __func__ << ": object " << obc->obs.oi.soid
<< " has oi of " << obc->obs.oi << dendl;
- if (!op->may_write() && !obc->obs.exists) {
+ if (!op->may_write() && (!obc->obs.exists ||
+ obc->obs.oi.is_whiteout())) {
osd->reply_op_error(op, -ENOENT);
return;
}
@@ -1048,6 +1049,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
if (sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.get_key() &&
sobc->obs.oi.soid.get_key() != obc->obs.oi.soid.oid.name &&
@@ -1102,6 +1105,8 @@ void ReplicatedPG::do_op(OpRequestRef op)
wait_for_missing_object(wait_oid, op);
} else if (r) {
osd->reply_op_error(op, r);
+ } else if (sobc->obs.oi.is_whiteout()) {
+ osd->reply_op_error(op, -ENOENT);
} else {
dout(10) << " clone_oid " << clone_oid << " obc " << sobc << dendl;
src_obc[clone_oid] = sobc;
@@ -3285,7 +3290,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
if (cct->_conf->osd_tmapput_sets_uses_tmap) {
assert(cct->_conf->osd_auto_upgrade_tmap);
- oi.uses_tmap = true;
+ oi.set_flag(object_info_t::FLAG_USES_TMAP);
}
// write it
@@ -3333,7 +3338,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
set<string> out_set;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
dout(20) << "CEPH_OSD_OP_OMAPGETKEYS: "
<< " Reading " << oi.soid << " omap from tmap" << dendl;
map<string, bufferlist> vals;
@@ -3391,7 +3396,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
}
map<string, bufferlist> out_set;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
dout(20) << "CEPH_OSD_OP_OMAPGETVALS: "
<< " Reading " << oi.soid << " omap from tmap" << dendl;
map<string, bufferlist> vals;
@@ -3442,7 +3447,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPGETHEADER:
++ctx->num_read;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
dout(20) << "CEPH_OSD_OP_OMAPGETHEADER: "
<< " Reading " << oi.soid << " omap from tmap" << dendl;
map<string, bufferlist> vals;
@@ -3473,7 +3478,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
goto fail;
}
map<string, bufferlist> out;
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
dout(20) << "CEPH_OSD_OP_OMAPGET: "
<< " Reading " << oi.soid << " omap from tmap" << dendl;
map<string, bufferlist> vals;
@@ -3572,7 +3577,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPSETVALS:
++ctx->num_write;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
_copy_up_tmap(ctx);
}
if (!obs.exists) {
@@ -3602,7 +3607,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
case CEPH_OSD_OP_OMAPSETHEADER:
++ctx->num_write;
{
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
_copy_up_tmap(ctx);
}
if (!obs.exists) {
@@ -3622,7 +3627,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
_copy_up_tmap(ctx);
}
t.touch(coll, soid);
@@ -3638,7 +3643,7 @@ int ReplicatedPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
result = -ENOENT;
break;
}
- if (oi.uses_tmap && cct->_conf->osd_auto_upgrade_tmap) {
+ if (oi.test_flag(object_info_t::FLAG_USES_TMAP) && cct->_conf->osd_auto_upgrade_tmap) {
_copy_up_tmap(ctx);
}
t.touch(coll, soid);
@@ -3817,7 +3822,7 @@ int ReplicatedPG::_get_tmap(OpContext *ctx,
int ReplicatedPG::_copy_up_tmap(OpContext *ctx)
{
dout(20) << "copying up tmap for " << ctx->new_obs.oi.soid << dendl;
- ctx->new_obs.oi.uses_tmap = false;
+ ctx->new_obs.oi.clear_flag(object_info_t::FLAG_USES_TMAP);
map<string, bufferlist> vals;
bufferlist header;
int r = _get_tmap(ctx, &vals, &header);
@@ -3877,11 +3882,11 @@ int ReplicatedPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
hobject_t(soid.oid, soid.get_key(), snapid, soid.hash, info.pgid.pool(), soid.get_namespace()),
&rollback_to, false, &cloneid);
if (ret) {
- if (-ENOENT == ret) {
+ if (-ENOENT == ret || rollback_to->obs.oi.is_whiteout()) {
// there's no snapshot here, or there's no object.
// if there's no snapshot, we delete the object; otherwise, do nothing.
dout(20) << "_rollback_to deleting head on " << soid.oid
- << " because got ENOENT on find_object_context" << dendl;
+ << " because got ENOENT|whiteout on find_object_context" << dendl;
if (ctx->obc->obs.oi.watchers.size()) {
// Cannot delete an object with watchers
ret = -EBUSY;
@@ -7106,7 +7111,7 @@ ObjectContextRef ReplicatedPG::mark_object_lost(ObjectStore::Transaction *t,
obc->ondisk_write_lock();
- obc->obs.oi.lost = true;
+ obc->obs.oi.set_flag(object_info_t::FLAG_LOST);
obc->obs.oi.version = info.last_update;
obc->obs.oi.prior_version = version;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index aa20dc592fa..952f0cf75d7 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -2796,9 +2796,8 @@ void object_info_t::copy_user_bits(const object_info_t& other)
last_reqid = other.last_reqid;
truncate_seq = other.truncate_seq;
truncate_size = other.truncate_size;
- lost = other.lost;
+ flags = other.flags;
category = other.category;
- uses_tmap = other.uses_tmap;
}
ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
@@ -2839,13 +2838,15 @@ void object_info_t::encode(bufferlist& bl) const
::encode(snaps, bl);
::encode(truncate_seq, bl);
::encode(truncate_size, bl);
- ::encode(lost, bl);
+ __u8 flags_lo = flags & 0xff;
+ __u8 flags_hi = (flags & 0xff00) >> 8;
+ ::encode(flags_lo, bl);
::encode(old_watchers, bl);
/* shenanigans to avoid breaking backwards compatibility in the disk format.
* When we can, switch this out for simply putting the version_t on disk. */
eversion_t user_eversion(0, user_version);
::encode(user_eversion, bl);
- ::encode(uses_tmap, bl);
+ ::encode(flags_hi, bl);
::encode(watchers, bl);
ENCODE_FINISH(bl);
}
@@ -2883,20 +2884,26 @@ void object_info_t::decode(bufferlist::iterator& bl)
::decode(snaps, bl);
::decode(truncate_seq, bl);
::decode(truncate_size, bl);
- if (struct_v >= 3)
- ::decode(lost, bl);
- else
- lost = false;
+ if (struct_v >= 3) {
+ __u8 lo;
+ ::decode(lo, bl);
+ flags = (flag_t)lo;
+ } else {
+ flags = (flag_t)0;
+ }
if (struct_v >= 4) {
::decode(old_watchers, bl);
eversion_t user_eversion;
::decode(user_eversion, bl);
user_version = user_eversion.version;
}
- if (struct_v >= 9)
- ::decode(uses_tmap, bl);
- else
- uses_tmap = true;
+ if (struct_v >= 9) {
+ __u8 hi;
+ ::decode(hi, bl);
+ flags = (flag_t)(flags | ((unsigned)hi << 8));
+ } else {
+ set_flag(FLAG_USES_TMAP);
+ }
if (struct_v < 10)
soid.pool = myoloc.pool;
if (struct_v >= 11) {
@@ -2924,7 +2931,8 @@ void object_info_t::dump(Formatter *f) const
f->dump_stream("last_reqid") << last_reqid;
f->dump_unsigned("size", size);
f->dump_stream("mtime") << mtime;
- f->dump_unsigned("lost", lost);
+ f->dump_unsigned("lost", (int)is_lost());
+ f->dump_unsigned("flags", (int)flags);
f->dump_stream("wrlock_by") << wrlock_by;
f->open_array_section("snaps");
for (vector<snapid_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p)
@@ -2960,8 +2968,10 @@ ostream& operator<<(ostream& out, const object_info_t& oi)
out << " wrlock_by=" << oi.wrlock_by;
else
out << " " << oi.snaps;
- if (oi.lost)
+ if (oi.is_lost())
out << " LOST";
+ if (oi.is_whiteout())
+ out << " WHITEOUT";
out << ")";
return out;
}
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 884b8ada8cc..9d440e25bde 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -2093,7 +2093,16 @@ struct object_info_t {
uint64_t size;
utime_t mtime;
- bool lost;
+
+ // note: these are currently encoded into a total 16 bits; see
+ // encode()/decode() for the weirdness.
+ typedef enum {
+ FLAG_LOST = 1<<0,
+ FLAG_WHITEOUT = 1<<1, // object logically does not exist
+ // ...
+ FLAG_USES_TMAP = 1<<8,
+ } flag_t;
+ flag_t flags;
osd_reqid_t wrlock_by; // [head]
vector<snapid_t> snaps; // [clone]
@@ -2102,13 +2111,28 @@ struct object_info_t {
map<pair<uint64_t, entity_name_t>, watch_info_t> watchers;
- bool uses_tmap;
void copy_user_bits(const object_info_t& other);
static ps_t legacy_object_locator_to_ps(const object_t &oid,
const object_locator_t &loc);
+ bool test_flag(flag_t f) const {
+ return (flags & f) == f;
+ }
+ void set_flag(flag_t f) {
+ flags = (flag_t)(flags | f);
+ }
+ void clear_flag(flag_t f) {
+ flags = (flag_t)(flags & ~f);
+ }
+ bool is_lost() const {
+ return test_flag(FLAG_LOST);
+ }
+ bool is_whiteout() const {
+ return test_flag(FLAG_WHITEOUT);
+ }
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& bl);
void decode(bufferlist& bl) {
@@ -2119,13 +2143,14 @@ struct object_info_t {
static void generate_test_instances(list<object_info_t*>& o);
explicit object_info_t()
- : user_version(0), size(0), lost(false),
- truncate_seq(0), truncate_size(0), uses_tmap(false)
+ : user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0)
{}
object_info_t(const hobject_t& s)
- : soid(s), user_version(0), size(0),
- lost(false), truncate_seq(0), truncate_size(0), uses_tmap(false) {}
+ : soid(s),
+ user_version(0), size(0), flags((flag_t)0),
+ truncate_seq(0), truncate_size(0) {}
object_info_t(bufferlist& bl) {
decode(bl);
@@ -2135,7 +2160,7 @@ WRITE_CLASS_ENCODER(object_info_t)
struct ObjectState {
object_info_t oi;
- bool exists;
+ bool exists; ///< the stored object exists (i.e., we will remember the object_info_t)
ObjectState() : exists(false) {}