summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Samuel Just <sam.just@inktank.com> 2013-10-01 17:18:17 -0700
committer Samuel Just <sam.just@inktank.com> 2013-10-03 22:40:14 -0700
commit 91640b2407d9d57b4badd50658fbcb028a13d4b0 (patch)
tree d3181d123729928f1b3fa187668a991446da48f8
parent eba52dd51d52b6d8ab32726fe31fb13db5770a6a (diff)
download ceph-wip-5992-3.tar.gz
ReplicatedPG: block reads on an object until the write is committed (wip-5992-3)
Signed-off-by: Samuel Just <sam.just@inktank.com>
-rw-r--r-- src/osd/ReplicatedPG.cc 57
-rw-r--r-- src/osd/ReplicatedPG.h 163
2 files changed, 197 insertions, 23 deletions
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index 1684fa0dc53..f097b1c34c7 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -988,21 +988,9 @@ void ReplicatedPG::do_op(OpRequestRef op)
return;
}
- if ((op->may_read()) && (obc->obs.oi.lost)) {
- // This object is lost. Reading from it returns an error.
- dout(20) << __func__ << ": object " << obc->obs.oi.soid
- << " is lost" << dendl;
- osd->reply_op_error(op, -ENFILE);
- return;
- }
dout(25) << __func__ << ": object " << obc->obs.oi.soid
<< " has oi of " << obc->obs.oi << dendl;
- if (!op->may_write() && !obc->obs.exists) {
- osd->reply_op_error(op, -ENOENT);
- return;
- }
-
// are writes blocked by another object?
if (obc->blocked_by) {
dout(10) << "do_op writes for " << obc->obs.oi.soid << " blocked by "
@@ -1121,11 +1109,30 @@ void ReplicatedPG::do_op(OpRequestRef op)
}
}
- op->mark_started();
-
OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops,
&obc->obs, obc->ssc,
this);
+ if (!get_rw_locks(ctx)) {
+ op->mark_delayed("waiting for rw locks");
+ close_op_ctx(ctx);
+ return;
+ }
+
+ if ((op->may_read()) && (obc->obs.oi.lost)) {
+ // This object is lost. Reading from it returns an error.
+ dout(20) << __func__ << ": object " << obc->obs.oi.soid
+ << " is lost" << dendl;
+ close_op_ctx(ctx);
+ osd->reply_op_error(op, -ENFILE);
+ return;
+ }
+ if (!op->may_write() && !obc->obs.exists) {
+ close_op_ctx(ctx);
+ osd->reply_op_error(op, -ENOENT);
+ return;
+ }
+
+ op->mark_started();
ctx->obc = obc;
ctx->src_obc = src_obc;
@@ -1202,7 +1209,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
if (already_complete(oldv)) {
reply_ctx(ctx, 0, oldv, entry->user_version);
} else {
- delete ctx;
+ close_op_ctx(ctx);
if (m->wants_ack()) {
if (already_ack(oldv)) {
@@ -1295,7 +1302,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
if (result == -EAGAIN) {
// clean up after the ctx
- delete ctx;
+ close_op_ctx(ctx);
return;
}
@@ -1347,7 +1354,7 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
osd->send_message_osd_client(reply, m->get_connection());
- delete ctx;
+ close_op_ctx(ctx);
return;
}
@@ -1395,13 +1402,13 @@ void ReplicatedPG::execute_ctx(OpContext *ctx)
void ReplicatedPG::reply_ctx(OpContext *ctx, int r)
{
osd->reply_op_error(ctx->op, r);
- delete ctx;
+ close_op_ctx(ctx);
}
void ReplicatedPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
{
osd->reply_op_error(ctx->op, r, v, uv);
- delete ctx;
+ close_op_ctx(ctx);
}
void ReplicatedPG::log_op_stats(OpContext *ctx)
@@ -4589,7 +4596,7 @@ void ReplicatedPG::cancel_copy(CopyOpRef cop)
kick_object_context_blocked(ctx->obc);
- delete ctx;
+ close_op_ctx(ctx);
}
void ReplicatedPG::cancel_copy_ops()
@@ -4754,6 +4761,8 @@ void ReplicatedPG::op_commit(RepGather *repop)
eval_repop(repop);
}
+ release_op_ctx_locks(repop->ctx);
+
repop->put();
unlock();
}
@@ -4996,6 +5005,7 @@ ReplicatedPG::RepGather *ReplicatedPG::new_repop(OpContext *ctx, ObjectContextRe
void ReplicatedPG::remove_repop(RepGather *repop)
{
+ release_op_ctx_locks(repop->ctx);
repop_map.erase(repop->rep_tid);
repop->put();
@@ -8238,7 +8248,12 @@ void ReplicatedPG::scan_range(
{
assert(is_locked());
dout(10) << "scan_range from " << bi->begin << dendl;
- bi->version = last_update_applied;
+ if (last_update_applied >= info.log_tail) {
+ bi->version = last_update_applied;
+ } else {
+ osr.flush();
+ bi->version = info.last_update;
+ }
bi->objects.clear(); // for good measure
vector<hobject_t> ls;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index b398dd2fea4..c229ca68f8c 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -303,6 +303,8 @@ public:
hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
+ enum { W_LOCK, R_LOCK, NONE } lock_to_release;
+
OpContext(const OpContext& other);
const OpContext& operator=(const OpContext& other);
@@ -316,7 +318,8 @@ public:
current_osd_subop_num(0),
data_off(0), reply(NULL), pg(_pg),
num_read(0),
- num_write(0) {
+ num_write(0),
+ lock_to_release(NONE) {
if (_ssc) {
new_snapset = _ssc->snapset;
snapset = &_ssc->snapset;
@@ -324,6 +327,7 @@ public:
}
~OpContext() {
assert(!clone_obc);
+ assert(lock_to_release == NONE);
if (reply)
reply->put();
}
@@ -382,7 +386,7 @@ public:
if (--nref == 0) {
assert(!obc);
assert(src_obc.empty());
- delete ctx;
+ delete ctx; // must already be unlocked
delete this;
//generic_dout(0) << "deleting " << this << dendl;
}
@@ -393,6 +397,161 @@ public:
protected:
+ /// Tracks pending readers or writers on an object: an object is held either by readers or by writers, never both; ops that cannot get in are parked in a waiter list
+ class RWTracker {
+ struct ObjState {
+ enum State {
+ NONE, // nobody holds the object
+ READ, // held by one or more readers
+ WRITE // held by one or more writers
+ };
+ State state; /// rw state: which kind of holder currently owns the object
+ uint64_t count; /// number of readers or writers currently holding (0 only in NONE)
+ list<OpRequestRef> waiters; /// ops waiting on state change, handed back when count drops to 0
+
+ ObjState() : state(NONE), count(0) {}
+ bool get_read(OpRequestRef op) { // returns true if a read hold was taken, false if op was parked in waiters
+ switch (state) {
+ case NONE:
+ assert(count == 0);
+ state = READ;
+ // fall through: the first reader is counted like any other
+ case READ:
+ count++; // readers share the object
+ return true;
+ case WRITE:
+ waiters.push_back(op); // parked until every writer has released
+ return false;
+ default:
+ assert(0 == "unhandled case");
+ return false;
+ }
+ }
+ bool get_write(OpRequestRef op) { // returns true if a write hold was taken, false if op was parked in waiters
+ switch (state) {
+ case NONE:
+ assert(count == 0);
+ state = WRITE;
+ // fall through: the first writer is counted like any other
+ case WRITE:
+ count++; // writers do not exclude each other here; count is the number of writes in flight
+ return true;
+ case READ:
+ waiters.push_back(op); // parked until every reader has released
+ return false;
+ default:
+ assert(0 == "unhandled case");
+ return false;
+ }
+ }
+ void dec(list<OpRequestRef> *requeue) { // drops one holder; *requeue must be non-null and empty on entry
+ assert(count > 0);
+ assert(requeue);
+ assert(requeue->empty());
+ count--;
+ if (count == 0) { // last holder gone: reset and hand every waiter to the caller
+ state = NONE;
+ requeue->swap(waiters);
+ }
+ }
+ void put_read(list<OpRequestRef> *requeue) { // releases one read hold; only valid in READ state
+ assert(state == READ);
+ dec(requeue);
+ }
+ void put_write(list<OpRequestRef> *requeue) { // releases one write hold; only valid in WRITE state
+ assert(state == WRITE);
+ dec(requeue);
+ }
+ void clear(list<OpRequestRef> *requeue) { // unconditional reset: forgets all holders and hands waiters to *requeue
+ state = NONE;
+ count = 0;
+ assert(requeue);
+ assert(requeue->empty());
+ requeue->swap(waiters);
+ }
+ bool empty() const { return state == NONE; } // true when nobody holds the object; waiters are not consulted
+ };
+ map<hobject_t, ObjState > obj_state; // per-object state, created on demand via operator[] and erased by put_* once unheld
+ public:
+ bool get_read(const hobject_t &hoid, OpRequestRef op) { // see ObjState::get_read
+ return obj_state[hoid].get_read(op);
+ }
+ bool get_write(const hobject_t &hoid, OpRequestRef op) { // see ObjState::get_write
+ return obj_state[hoid].get_write(op);
+ }
+ void put_read(const hobject_t &hoid, list<OpRequestRef> *to_wake) { // releases one read hold; *to_wake receives the waiters if this was the last holder
+ obj_state[hoid].put_read(to_wake);
+ if (obj_state[hoid].empty()) { // drop the entry once unheld so the map only carries objects in use
+ obj_state.erase(hoid);
+ }
+ }
+ void put_write(const hobject_t &hoid, list<OpRequestRef> *to_wake) { // releases one write hold; *to_wake receives the waiters if this was the last holder
+ obj_state[hoid].put_write(to_wake);
+ if (obj_state[hoid].empty()) { // drop the entry once unheld so the map only carries objects in use
+ obj_state.erase(hoid);
+ }
+ }
+ } rw_manager;
+
+ /**
+ * Grabs locks for OpContext, should be cleaned up in close_op_ctx
+ * Takes the write lock when ctx->op may write, otherwise the read lock; the lock taken is recorded in ctx->lock_to_release
+ * @param ctx [in,out] ctx to get locks for
+ * @return true on success, false if we are queued
+ */
+ bool get_rw_locks(OpContext *ctx) {
+ if (ctx->op->may_write()) {
+ if (rw_manager.get_write(ctx->obs->oi.soid, ctx->op)) {
+ ctx->lock_to_release = OpContext::W_LOCK;
+ return true;
+ } else {
+ assert(0 == "Currently there cannot be a read in flight here"); // get_write() only fails while readers hold the object
+ return false;
+ }
+ } else {
+ assert(ctx->op->may_read()); // an op that does not write is expected to be a read
+ if (rw_manager.get_read(ctx->obs->oi.soid, ctx->op)) {
+ ctx->lock_to_release = OpContext::R_LOCK;
+ return true;
+ } else {
+ return false; // writers hold the object; get_read() already parked ctx->op on the waiter list
+ }
+ }
+ }
+
+ /**
+ * Cleans up OpContext: releases any rw lock it still holds, then deletes it
+ *
+ * @param ctx [in] ctx to clean up; must not be used after this call
+ */
+ void close_op_ctx(OpContext *ctx) {
+ release_op_ctx_locks(ctx); // must come first: ~OpContext asserts lock_to_release == NONE
+ delete ctx;
+ }
+
+ /**
+ * Releases ctx locks and passes any ops that were waiting on them to requeue_ops
+ * Repeat calls are harmless: lock_to_release is reset to NONE, so a second call releases nothing
+ * @param ctx [in] ctx to clean up
+ */
+ void release_op_ctx_locks(OpContext *ctx) {
+ list<OpRequestRef> to_req; // filled with waiters only if this release drops the last holder
+ switch (ctx->lock_to_release) {
+ case OpContext::W_LOCK:
+ rw_manager.put_write(ctx->obs->oi.soid, &to_req);
+ break;
+ case OpContext::R_LOCK:
+ rw_manager.put_read(ctx->obs->oi.soid, &to_req);
+ break;
+ case OpContext::NONE: // nothing held: never locked, or already released
+ break;
+ default:
+ assert(0);
+ };
+ ctx->lock_to_release = OpContext::NONE; // mark released so later calls and ~OpContext see no lock
+ requeue_ops(to_req); // called even when to_req is empty
+ }
+
// replica ops
// [primary|tail]
xlist<RepGather*> repop_queue;