diff options
author | Gregory Farnum <greg@inktank.com> | 2013-09-10 15:49:50 -0700 |
---|---|---|
committer | Gregory Farnum <greg@inktank.com> | 2013-09-10 15:49:50 -0700 |
commit | 383d8a199ea70578bb418cf36812963b04c42873 (patch) | |
tree | cd08ce05bd45098587bdea94a36ad64505d35370 | |
parent | e3ca59c3971d63388664581a65e6a41ec2b6274d (diff) | |
parent | ab2506dbfa95948474b27b8f00d393b13c35571a (diff) | |
download | ceph-383d8a199ea70578bb418cf36812963b04c42873.tar.gz |
Merge pull request #580 from ceph/wip-6033-redirects
Reviewed-by: Sage Weil <sage@inktank.com>
-rwxr-xr-x | qa/workunits/rados/caching_redirects.sh | 24 | ||||
-rw-r--r-- | src/messages/MOSDOpReply.h | 16 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.cc | 86 | ||||
-rw-r--r-- | src/osd/ReplicatedPG.h | 5 | ||||
-rw-r--r-- | src/osd/osd_types.cc | 35 | ||||
-rw-r--r-- | src/osd/osd_types.h | 45 | ||||
-rw-r--r-- | src/osdc/Objecter.cc | 61 | ||||
-rw-r--r-- | src/osdc/Objecter.h | 6 | ||||
-rw-r--r-- | src/test/encoding/types.h | 1 |
9 files changed, 228 insertions, 51 deletions
diff --git a/qa/workunits/rados/caching_redirects.sh b/qa/workunits/rados/caching_redirects.sh index a8eda487246..19b940b5b4c 100755 --- a/qa/workunits/rados/caching_redirects.sh +++ b/qa/workunits/rados/caching_redirects.sh @@ -11,9 +11,9 @@ expect_false() #create pools, set up tier relationship ceph osd pool create base_pool 2 -ceph osd pool create empty_cache 2 +ceph osd pool create partial_cache 2 ceph osd pool create data_cache 2 -ceph osd tier add base_pool empty_cache +ceph osd tier add base_pool partial_cache ceph osd tier add base_pool data_cache # populate base_pool and data_cache with some data @@ -25,6 +25,8 @@ rados -p base_pool put barobj bar.txt # data_cache is backwards so we can tell we read from it rados -p data_cache put fooobj bar.txt rados -p data_cache put barobj foo.txt +# partial_cache gets barobj backwards +rados -p partial_cache put barobj foo.txt # get the objects back before setting a caching pool rados -p base_pool get fooobj tmp.txt @@ -32,20 +34,18 @@ diff -q tmp.txt foo.txt rados -p base_pool get barobj tmp.txt diff -q tmp.txt bar.txt -# set up redirect and make sure we get nothing -ceph osd tier set-overlay base_pool empty_cache -expect_false rados -p base_pool get fooobj tmp.txt -expect_false rados -p base_pool get barobj tmp.txt -#let's write as well -rados -p base_pool put fooobj baz.txt -rados -p base_pool put barobj baz.txt -#and make sure we can look at the cache pool directly -rados -p empty_cache get fooobj tmp.txt -diff -q tmp.txt baz.txt +# set up redirect and make sure we get redirect-based results +ceph osd tier set-overlay base_pool partial_cache +ceph osd tier cache-mode partial_cache writeback +rados -p base_pool get fooobj tmp.txt +diff -q tmp.txt foo.txt +rados -p base_pool get barobj tmp.txt +diff -q tmp.txt foo.txt # switch cache pools and make sure contents differ ceph osd tier remove-overlay base_pool ceph osd tier set-overlay base_pool data_cache +ceph osd tier cache-mode data_cache writeback rados -p base_pool get fooobj tmp.txt diff -q tmp.txt bar.txt rados -p base_pool get barobj tmp.txt diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h index 5bf5ab4ef62..c0e989f7c3a 100644 --- a/src/messages/MOSDOpReply.h +++ b/src/messages/MOSDOpReply.h @@ -31,7 +31,7 @@ class MOSDOpReply : public Message { - static const int HEAD_VERSION = 5; + static const int HEAD_VERSION = 6; static const int COMPAT_VERSION = 2; object_t oid; @@ -44,6 +44,7 @@ class MOSDOpReply : public Message { version_t user_version; epoch_t osdmap_epoch; int32_t retry_attempt; + request_redirect_t redirect; public: object_t get_oid() const { return oid; } @@ -87,6 +88,10 @@ public: bad_replay_version = v; } + void set_redirect(const request_redirect_t& redir) { redirect = redir; } + const request_redirect_t& get_redirect() const { return redirect; } + bool is_redirect_reply() const { return !redirect.empty(); } + void add_flags(int f) { flags |= f; } void claim_op_out_data(vector<OSDOp>& o) { @@ -109,7 +114,7 @@ public: } // osdmap - epoch_t get_map_epoch() { return osdmap_epoch; } + epoch_t get_map_epoch() const { return osdmap_epoch; } /*osd_reqid_t get_reqid() { return osd_reqid_t(get_dest(), head.client_inc, @@ -180,6 +185,7 @@ public: ::encode(replay_version, payload); ::encode(user_version, payload); + ::encode(redirect, payload); } } virtual void decode_payload() { @@ -232,6 +238,9 @@ public: replay_version = bad_replay_version; user_version = replay_version.version; } + + if (header.version >= 6) + ::decode(redirect, p); } } @@ -253,6 +262,9 @@ public: char buf[80]; out << " (" << strerror_r(-get_result(), buf, sizeof(buf)) << ")"; } + if (is_redirect_reply()) { + out << " redirect: { " << redirect << " }"; + } out << ")"; } diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index b391e173d14..2456d176834 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -712,23 +712,28 @@ void ReplicatedPG::do_op(OpRequestRef op) m->get_object_locator().get_pool(), m->get_object_locator().nspace), &obc, can_create, &snapid); - if (r) { - if (r == -EAGAIN) { - // If we're not the primary of this OSD, and we have - // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise, - // we have to wait for the object. - if (is_primary() || - (!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) && - !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) { - // missing the specific snap we need; requeue and wait. - assert(!can_create); // only happens on a read - hobject_t soid(m->get_oid(), m->get_object_locator().key, - snapid, m->get_pg().ps(), - info.pgid.pool(), m->get_object_locator().nspace); - wait_for_missing_object(soid, op); - return; - } + + if (r == -EAGAIN) { + // If we're not the primary of this OSD, and we have + // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise, + // we have to wait for the object. + if (is_primary() || + (!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) && + !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) { + // missing the specific snap we need; requeue and wait. + assert(!can_create); // only happens on a read + hobject_t soid(m->get_oid(), m->get_object_locator().key, + snapid, m->get_pg().ps(), + info.pgid.pool(), m->get_object_locator().nspace); + wait_for_missing_object(soid, op); + return; } + } + + if (maybe_handle_cache(op, obc, r)) + return; + + if (r) { osd->reply_op_error(op, r); return; } @@ -887,6 +892,53 @@ void ReplicatedPG::do_op(OpRequestRef op) execute_ctx(ctx); } +bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, + int r) +{ + switch(pool.info.cache_mode) { + case pg_pool_t::CACHEMODE_NONE: + return false; + break; + case pg_pool_t::CACHEMODE_WRITEBACK: + if (obc.get()) { + return false; + } else { + do_cache_redirect(op, obc); + return true; + } + break; + case pg_pool_t::CACHEMODE_INVALIDATE_FORWARD: + do_cache_redirect(op, obc); + return true; + break; + case pg_pool_t::CACHEMODE_READONLY: + if (obc.get() && !r) { + return false; + } else { + do_cache_redirect(op, obc); + return true; + } + break; + default: + assert(0); + } + return false; +} + +void ReplicatedPG::do_cache_redirect(OpRequestRef op, ObjectContextRef obc) +{ + MOSDOp *m = static_cast<MOSDOp*>(op->request); + int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK); + MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, + get_osdmap()->get_epoch(), flags); + request_redirect_t redir(m->get_object_locator(), pool.info.tier_of); + reply->set_redirect(redir); + dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op " + << op << dendl; + m->get_connection()->get_messenger()->send_message(reply, m->get_connection()); + return; +} + void ReplicatedPG::execute_ctx(OpContext *ctx) { dout(10) << __func__ << " " << ctx << dendl; @@ -3774,7 +3826,7 @@ void ReplicatedPG::make_writeable(OpContext *ctx) ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version, ctx->obs->oi.version, ctx->obs->oi.user_version, - ctx->reqid, ctx->new_obs.oi.mtime)); + osd_reqid_t(), ctx->new_obs.oi.mtime)); ::encode(snaps, ctx->log.back().snaps); ctx->at_version.version++; diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index 254b5842ffc..fef3814d93a 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -619,7 +619,10 @@ protected: void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi, SnapSet& ss, interval_set<uint64_t>& modified, uint64_t offset, uint64_t length, bool count_bytes); - void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st); + void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st); + + inline bool maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, int r); + void do_cache_redirect(OpRequestRef op, ObjectContextRef obc); int prepare_transaction(OpContext *ctx); diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc index 34ae75b12b9..e94fd02a5ad 100644 --- a/src/osd/osd_types.cc +++ b/src/osd/osd_types.cc @@ -101,6 +101,41 @@ void object_locator_t::generate_test_instances(list<object_locator_t*>& o) o.push_back(new object_locator_t(12, "n1", "key2")); } +// -- request_redirect_t -- +void request_redirect_t::encode(bufferlist& bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(redirect_locator, bl); + ::encode(redirect_object, bl); + ::encode(osd_instructions, bl); + ENCODE_FINISH(bl); +} + +void request_redirect_t::decode(bufferlist::iterator& bl) +{ + DECODE_START(1, bl); + ::decode(redirect_locator, bl); + ::decode(redirect_object, bl); + ::decode(osd_instructions, bl); + DECODE_FINISH(bl); +} + +void request_redirect_t::dump(Formatter *f) const +{ + f->dump_string("object", redirect_object); + f->open_object_section("locator"); + redirect_locator.dump(f); + f->close_section(); // locator +} + +void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o) +{ + object_locator_t loc(1, "redir_obj"); + o.push_back(new request_redirect_t()); + o.push_back(new request_redirect_t(loc, 0)); + o.push_back(new request_redirect_t(loc, "redir_obj")); + o.push_back(new request_redirect_t(loc)); +} // -- pow2_hist_t -- void pow2_hist_t::dump(Formatter *f) const diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 91ab89a63af..901d9dbb488 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -129,6 +129,10 @@ struct object_locator_t { nspace = ""; } + bool empty() const { + return pool == -1; + } + void encode(bufferlist& bl) const; void decode(bufferlist::iterator& p); void dump(Formatter *f) const; @@ -153,6 +157,47 @@ inline ostream& operator<<(ostream& out, const object_locator_t& loc) return out; } +struct request_redirect_t { +private: + object_locator_t redirect_locator; ///< this is authoritative + string redirect_object; ///< If non-empty, the request goes to this object name + bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients + + friend ostream& operator<<(ostream& out, const request_redirect_t& redir); +public: + + request_redirect_t() {} + explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) : + redirect_locator(orig) { redirect_locator.pool = rpool; } + explicit request_redirect_t(const object_locator_t& rloc) : + redirect_locator(rloc) {} + explicit request_redirect_t(const object_locator_t& orig, + const string& robj) : + redirect_locator(orig), redirect_object(robj) {} + + void set_instructions(const bufferlist& bl) { osd_instructions = bl; } + const bufferlist& get_instructions() { return osd_instructions; } + + bool empty() const { return redirect_locator.empty() && + redirect_object.empty(); } + + void combine_with_locator(object_locator_t& orig, string& obj) const { + orig = redirect_locator; + if (!redirect_object.empty()) + obj = redirect_object; + } + + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + void dump(Formatter *f) const; + static void generate_test_instances(list<request_redirect_t*>& o); +}; +WRITE_CLASS_ENCODER(request_redirect_t) + +inline ostream& operator<<(ostream& out, const request_redirect_t& redir) { + out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}"; + return out; +} // Internal OSD op flags - set by the OSD based on the op types enum { diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc index 188c28d5e56..bf380e60438 100644 --- a/src/osdc/Objecter.cc +++ b/src/osdc/Objecter.cc @@ -1243,7 +1243,7 @@ tid_t Objecter::_op_submit(Op *op) } // send? - ldout(cct, 10) << "op_submit oid " << op->oid + ldout(cct, 10) << "op_submit oid " << op->base_oid << " " << op->base_oloc << " " << op->target_oloc << " " << op->ops << " tid " << op->tid << " osd." << (op->session ? op->session->osd : -1) @@ -1328,22 +1328,33 @@ int Objecter::recalc_op_target(Op *op) bool is_read = op->flags & CEPH_OSD_FLAG_READ; bool is_write = op->flags & CEPH_OSD_FLAG_WRITE; - op->target_oloc = op->base_oloc; - const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool); - if (pi) { - if (is_read && pi->has_read_tier()) - op->target_oloc.pool = pi->read_tier; - if (is_write && pi->has_write_tier()) - op->target_oloc.pool = pi->write_tier; + bool need_check_tiering = false; + if (op->target_oid.name.empty()) { + op->target_oid = op->base_oid; + need_check_tiering = true; + } + if (op->target_oloc.empty()) { + op->target_oloc = op->base_oloc; + need_check_tiering = true; + } + + if (need_check_tiering) { + const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool); + if (pi) { + if (is_read && pi->has_read_tier()) + op->target_oloc.pool = pi->read_tier; + if (is_write && pi->has_write_tier()) + op->target_oloc.pool = pi->write_tier; + } } if (op->precalc_pgid) { - assert(op->oid.name.empty()); // make sure this is a listing op + assert(op->base_oid.name.empty()); // make sure this is a listing op ldout(cct, 10) << "recalc_op_target have " << pgid << " pool " << osdmap->have_pg_pool(pgid.pool()) << dendl; if (!osdmap->have_pg_pool(pgid.pool())) return RECALC_OP_TARGET_POOL_DNE; } else { - int ret = osdmap->object_locator_to_pg(op->oid, op->target_oloc, pgid); + int ret = osdmap->object_locator_to_pg(op->target_oid, op->target_oloc, pgid); if (ret == -ENOENT) return RECALC_OP_TARGET_POOL_DNE; } @@ -1485,7 +1496,8 @@ void Objecter::send_op(Op *op) op->stamp = ceph_clock_now(cct); MOSDOp *m = new MOSDOp(client_inc, op->tid, - op->oid, op->target_oloc, op->pgid, osdmap->get_epoch(), + op->target_oid, op->target_oloc, op->pgid, + osdmap->get_epoch(), flags); m->set_snapid(op->snapid); @@ -1546,6 +1558,15 @@ void Objecter::throttle_op(Op *op, int op_budget) } } +void Objecter::unregister_op(Op *op) +{ + if (op->onack) + num_unacked--; + if (op->oncommit) + num_uncommitted--; + ops.erase(op->tid); +} + /* This function DOES put the passed message before returning */ void Objecter::handle_osd_op_reply(MOSDOpReply *m) { @@ -1592,12 +1613,18 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m) int rc = m->get_result(); + if (m->is_redirect_reply()) { + ldout(cct, 5) << " got redirect reply; redirecting" << dendl; + unregister_op(op); + m->get_redirect().combine_with_locator(op->target_oloc, op->target_oid.name); + op_submit(op); + m->put(); + return; + } + if (rc == -EAGAIN) { ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl; - if (op->onack) - num_unacked--; - if (op->oncommit) - num_uncommitted--; + unregister_op(op); op_submit(op); m->put(); return; @@ -2219,7 +2246,7 @@ void Objecter::dump_active() for (map<tid_t,Op*>::iterator p = ops.begin(); p != ops.end(); ++p) { Op *op = p->second; ldout(cct, 20) << op->tid << "\t" << op->pgid << "\tosd." << (op->session ? op->session->osd : -1) - << "\t" << op->oid << "\t" << op->ops << dendl; + << "\t" << op->base_oid << "\t" << op->ops << dendl; } } @@ -2250,7 +2277,7 @@ void Objecter::dump_ops(Formatter *fmt) const fmt->dump_int("osd", op->session ? op->session->osd : -1); fmt->dump_stream("last_sent") << op->stamp; fmt->dump_int("attempts", op->attempts); - fmt->dump_stream("object_id") << op->oid; + fmt->dump_stream("object_id") << op->base_oid; fmt->dump_stream("object_locator") << op->base_oloc; fmt->dump_stream("target_object_locator") << op->target_oloc; fmt->dump_stream("snapid") << op->snapid; diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 154ee410fde..58d52405a90 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -828,8 +828,9 @@ public: xlist<Op*>::item session_item; int incarnation; - object_t oid; + object_t base_oid; object_locator_t base_oloc; + object_t target_oid; object_locator_t target_oloc; pg_t pgid; @@ -874,7 +875,7 @@ public: Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op, int f, Context *ac, Context *co, version_t *ov) : session(NULL), session_item(this), incarnation(0), - oid(o), base_oloc(ol), + base_oid(o), base_oloc(ol), used_replica(false), con(NULL), snapid(CEPH_NOSNAP), outbl(NULL), @@ -1308,6 +1309,7 @@ private: // low-level tid_t op_submit(Op *op); tid_t _op_submit(Op *op); + inline void unregister_op(Op *op); // public interface public: diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h index 514c76fc1a8..fe17f077d8e 100644 --- a/src/test/encoding/types.h +++ b/src/test/encoding/types.h @@ -32,6 +32,7 @@ TYPE(CrushWrapper) #include "osd/osd_types.h" TYPE(osd_reqid_t) TYPE(object_locator_t) +TYPE(request_redirect_t) TYPE(pg_t) TYPE(coll_t) TYPE(pow2_hist_t) |