summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorGregory Farnum <greg@inktank.com>2013-09-10 15:49:50 -0700
committerGregory Farnum <greg@inktank.com>2013-09-10 15:49:50 -0700
commit383d8a199ea70578bb418cf36812963b04c42873 (patch)
treecd08ce05bd45098587bdea94a36ad64505d35370
parente3ca59c3971d63388664581a65e6a41ec2b6274d (diff)
parentab2506dbfa95948474b27b8f00d393b13c35571a (diff)
downloadceph-383d8a199ea70578bb418cf36812963b04c42873.tar.gz
Merge pull request #580 from ceph/wip-6033-redirects
Reviewed-by: Sage Weil <sage@inktank.com>
-rwxr-xr-xqa/workunits/rados/caching_redirects.sh24
-rw-r--r--src/messages/MOSDOpReply.h16
-rw-r--r--src/osd/ReplicatedPG.cc86
-rw-r--r--src/osd/ReplicatedPG.h5
-rw-r--r--src/osd/osd_types.cc35
-rw-r--r--src/osd/osd_types.h45
-rw-r--r--src/osdc/Objecter.cc61
-rw-r--r--src/osdc/Objecter.h6
-rw-r--r--src/test/encoding/types.h1
9 files changed, 228 insertions, 51 deletions
diff --git a/qa/workunits/rados/caching_redirects.sh b/qa/workunits/rados/caching_redirects.sh
index a8eda487246..19b940b5b4c 100755
--- a/qa/workunits/rados/caching_redirects.sh
+++ b/qa/workunits/rados/caching_redirects.sh
@@ -11,9 +11,9 @@ expect_false()
#create pools, set up tier relationship
ceph osd pool create base_pool 2
-ceph osd pool create empty_cache 2
+ceph osd pool create partial_cache 2
ceph osd pool create data_cache 2
-ceph osd tier add base_pool empty_cache
+ceph osd tier add base_pool partial_cache
ceph osd tier add base_pool data_cache
# populate base_pool and data_cache with some data
@@ -25,6 +25,8 @@ rados -p base_pool put barobj bar.txt
# data_cache is backwards so we can tell we read from it
rados -p data_cache put fooobj bar.txt
rados -p data_cache put barobj foo.txt
+# partial_cache gets barobj backwards
+rados -p partial_cache put barobj foo.txt
# get the objects back before setting a caching pool
rados -p base_pool get fooobj tmp.txt
@@ -32,20 +34,18 @@ diff -q tmp.txt foo.txt
rados -p base_pool get barobj tmp.txt
diff -q tmp.txt bar.txt
-# set up redirect and make sure we get nothing
-ceph osd tier set-overlay base_pool empty_cache
-expect_false rados -p base_pool get fooobj tmp.txt
-expect_false rados -p base_pool get barobj tmp.txt
-#let's write as well
-rados -p base_pool put fooobj baz.txt
-rados -p base_pool put barobj baz.txt
-#and make sure we can look at the cache pool directly
-rados -p empty_cache get fooobj tmp.txt
-diff -q tmp.txt baz.txt
+# set up redirect and make sure we get redirect-based results
+ceph osd tier set-overlay base_pool partial_cache
+ceph osd tier cache-mode partial_cache writeback
+rados -p base_pool get fooobj tmp.txt
+diff -q tmp.txt foo.txt
+rados -p base_pool get barobj tmp.txt
+diff -q tmp.txt foo.txt
# switch cache pools and make sure contents differ
ceph osd tier remove-overlay base_pool
ceph osd tier set-overlay base_pool data_cache
+ceph osd tier cache-mode data_cache writeback
rados -p base_pool get fooobj tmp.txt
diff -q tmp.txt bar.txt
rados -p base_pool get barobj tmp.txt
diff --git a/src/messages/MOSDOpReply.h b/src/messages/MOSDOpReply.h
index 5bf5ab4ef62..c0e989f7c3a 100644
--- a/src/messages/MOSDOpReply.h
+++ b/src/messages/MOSDOpReply.h
@@ -31,7 +31,7 @@
class MOSDOpReply : public Message {
- static const int HEAD_VERSION = 5;
+ static const int HEAD_VERSION = 6;
static const int COMPAT_VERSION = 2;
object_t oid;
@@ -44,6 +44,7 @@ class MOSDOpReply : public Message {
version_t user_version;
epoch_t osdmap_epoch;
int32_t retry_attempt;
+ request_redirect_t redirect;
public:
object_t get_oid() const { return oid; }
@@ -87,6 +88,10 @@ public:
bad_replay_version = v;
}
+ void set_redirect(const request_redirect_t& redir) { redirect = redir; }
+ const request_redirect_t& get_redirect() const { return redirect; }
+ bool is_redirect_reply() const { return !redirect.empty(); }
+
void add_flags(int f) { flags |= f; }
void claim_op_out_data(vector<OSDOp>& o) {
@@ -109,7 +114,7 @@ public:
}
// osdmap
- epoch_t get_map_epoch() { return osdmap_epoch; }
+ epoch_t get_map_epoch() const { return osdmap_epoch; }
/*osd_reqid_t get_reqid() { return osd_reqid_t(get_dest(),
head.client_inc,
@@ -180,6 +185,7 @@ public:
::encode(replay_version, payload);
::encode(user_version, payload);
+ ::encode(redirect, payload);
}
}
virtual void decode_payload() {
@@ -232,6 +238,9 @@ public:
replay_version = bad_replay_version;
user_version = replay_version.version;
}
+
+ if (header.version >= 6)
+ ::decode(redirect, p);
}
}
@@ -253,6 +262,9 @@ public:
char buf[80];
out << " (" << strerror_r(-get_result(), buf, sizeof(buf)) << ")";
}
+ if (is_redirect_reply()) {
+ out << " redirect: { " << redirect << " }";
+ }
out << ")";
}
diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc
index b391e173d14..2456d176834 100644
--- a/src/osd/ReplicatedPG.cc
+++ b/src/osd/ReplicatedPG.cc
@@ -712,23 +712,28 @@ void ReplicatedPG::do_op(OpRequestRef op)
m->get_object_locator().get_pool(),
m->get_object_locator().nspace),
&obc, can_create, &snapid);
- if (r) {
- if (r == -EAGAIN) {
- // If we're not the primary of this OSD, and we have
- // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise,
- // we have to wait for the object.
- if (is_primary() ||
- (!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) &&
- !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) {
- // missing the specific snap we need; requeue and wait.
- assert(!can_create); // only happens on a read
- hobject_t soid(m->get_oid(), m->get_object_locator().key,
- snapid, m->get_pg().ps(),
- info.pgid.pool(), m->get_object_locator().nspace);
- wait_for_missing_object(soid, op);
- return;
- }
+
+ if (r == -EAGAIN) {
+ // If we're not the primary of this OSD, and we have
+ // CEPH_OSD_FLAG_LOCALIZE_READS set, we just return -EAGAIN. Otherwise,
+ // we have to wait for the object.
+ if (is_primary() ||
+ (!(m->get_flags() & CEPH_OSD_FLAG_BALANCE_READS) &&
+ !(m->get_flags() & CEPH_OSD_FLAG_LOCALIZE_READS))) {
+ // missing the specific snap we need; requeue and wait.
+ assert(!can_create); // only happens on a read
+ hobject_t soid(m->get_oid(), m->get_object_locator().key,
+ snapid, m->get_pg().ps(),
+ info.pgid.pool(), m->get_object_locator().nspace);
+ wait_for_missing_object(soid, op);
+ return;
}
+ }
+
+ if (maybe_handle_cache(op, obc, r))
+ return;
+
+ if (r) {
osd->reply_op_error(op, r);
return;
}
@@ -887,6 +892,53 @@ void ReplicatedPG::do_op(OpRequestRef op)
execute_ctx(ctx);
}
+bool ReplicatedPG::maybe_handle_cache(OpRequestRef op, ObjectContextRef obc,
+ int r)
+{
+ switch(pool.info.cache_mode) {
+ case pg_pool_t::CACHEMODE_NONE:
+ return false;
+ break;
+ case pg_pool_t::CACHEMODE_WRITEBACK:
+ if (obc.get()) {
+ return false;
+ } else {
+ do_cache_redirect(op, obc);
+ return true;
+ }
+ break;
+ case pg_pool_t::CACHEMODE_INVALIDATE_FORWARD:
+ do_cache_redirect(op, obc);
+ return true;
+ break;
+ case pg_pool_t::CACHEMODE_READONLY:
+ if (obc.get() && !r) {
+ return false;
+ } else {
+ do_cache_redirect(op, obc);
+ return true;
+ }
+ break;
+ default:
+ assert(0);
+ }
+ return false;
+}
+
+void ReplicatedPG::do_cache_redirect(OpRequestRef op, ObjectContextRef obc)
+{
+ MOSDOp *m = static_cast<MOSDOp*>(op->request);
+ int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
+ MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
+ get_osdmap()->get_epoch(), flags);
+ request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
+ reply->set_redirect(redir);
+ dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
+ << op << dendl;
+ m->get_connection()->get_messenger()->send_message(reply, m->get_connection());
+ return;
+}
+
void ReplicatedPG::execute_ctx(OpContext *ctx)
{
dout(10) << __func__ << " " << ctx << dendl;
@@ -3774,7 +3826,7 @@ void ReplicatedPG::make_writeable(OpContext *ctx)
ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
ctx->obs->oi.version,
ctx->obs->oi.user_version,
- ctx->reqid, ctx->new_obs.oi.mtime));
+ osd_reqid_t(), ctx->new_obs.oi.mtime));
::encode(snaps, ctx->log.back().snaps);
ctx->at_version.version++;
diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h
index 254b5842ffc..fef3814d93a 100644
--- a/src/osd/ReplicatedPG.h
+++ b/src/osd/ReplicatedPG.h
@@ -619,7 +619,10 @@ protected:
void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
SnapSet& ss, interval_set<uint64_t>& modified,
uint64_t offset, uint64_t length, bool count_bytes);
- void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
+ void add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& st);
+
+ inline bool maybe_handle_cache(OpRequestRef op, ObjectContextRef obc, int r);
+ void do_cache_redirect(OpRequestRef op, ObjectContextRef obc);
int prepare_transaction(OpContext *ctx);
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 34ae75b12b9..e94fd02a5ad 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -101,6 +101,41 @@ void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
o.push_back(new object_locator_t(12, "n1", "key2"));
}
+// -- request_redirect_t --
+void request_redirect_t::encode(bufferlist& bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(redirect_locator, bl);
+ ::encode(redirect_object, bl);
+ ::encode(osd_instructions, bl);
+ ENCODE_FINISH(bl);
+}
+
+void request_redirect_t::decode(bufferlist::iterator& bl)
+{
+ DECODE_START(1, bl);
+ ::decode(redirect_locator, bl);
+ ::decode(redirect_object, bl);
+ ::decode(osd_instructions, bl);
+ DECODE_FINISH(bl);
+}
+
+void request_redirect_t::dump(Formatter *f) const
+{
+ f->dump_string("object", redirect_object);
+ f->open_object_section("locator");
+ redirect_locator.dump(f);
+ f->close_section(); // locator
+}
+
+void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
+{
+ object_locator_t loc(1, "redir_obj");
+ o.push_back(new request_redirect_t());
+ o.push_back(new request_redirect_t(loc, 0));
+ o.push_back(new request_redirect_t(loc, "redir_obj"));
+ o.push_back(new request_redirect_t(loc));
+}
// -- pow2_hist_t --
void pow2_hist_t::dump(Formatter *f) const
diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h
index 91ab89a63af..901d9dbb488 100644
--- a/src/osd/osd_types.h
+++ b/src/osd/osd_types.h
@@ -129,6 +129,10 @@ struct object_locator_t {
nspace = "";
}
+ bool empty() const {
+ return pool == -1;
+ }
+
void encode(bufferlist& bl) const;
void decode(bufferlist::iterator& p);
void dump(Formatter *f) const;
@@ -153,6 +157,47 @@ inline ostream& operator<<(ostream& out, const object_locator_t& loc)
return out;
}
+struct request_redirect_t {
+private:
+ object_locator_t redirect_locator; ///< this is authoritative
+ string redirect_object; ///< If non-empty, the request goes to this object name
+ bufferlist osd_instructions; ///< a bufferlist for the OSDs, passed but not interpreted by clients
+
+ friend ostream& operator<<(ostream& out, const request_redirect_t& redir);
+public:
+
+ request_redirect_t() {}
+ explicit request_redirect_t(const object_locator_t& orig, int64_t rpool) :
+ redirect_locator(orig) { redirect_locator.pool = rpool; }
+ explicit request_redirect_t(const object_locator_t& rloc) :
+ redirect_locator(rloc) {}
+ explicit request_redirect_t(const object_locator_t& orig,
+ const string& robj) :
+ redirect_locator(orig), redirect_object(robj) {}
+
+ void set_instructions(const bufferlist& bl) { osd_instructions = bl; }
+ const bufferlist& get_instructions() { return osd_instructions; }
+
+ bool empty() const { return redirect_locator.empty() &&
+ redirect_object.empty(); }
+
+ void combine_with_locator(object_locator_t& orig, string& obj) const {
+ orig = redirect_locator;
+ if (!redirect_object.empty())
+ obj = redirect_object;
+ }
+
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+ void dump(Formatter *f) const;
+ static void generate_test_instances(list<request_redirect_t*>& o);
+};
+WRITE_CLASS_ENCODER(request_redirect_t)
+
+inline ostream& operator<<(ostream& out, const request_redirect_t& redir) {
+ out << "object " << redir.redirect_object << ", locator{" << redir.redirect_locator << "}";
+ return out;
+}
// Internal OSD op flags - set by the OSD based on the op types
enum {
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index 188c28d5e56..bf380e60438 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -1243,7 +1243,7 @@ tid_t Objecter::_op_submit(Op *op)
}
// send?
- ldout(cct, 10) << "op_submit oid " << op->oid
+ ldout(cct, 10) << "op_submit oid " << op->base_oid
<< " " << op->base_oloc << " " << op->target_oloc
<< " " << op->ops << " tid " << op->tid
<< " osd." << (op->session ? op->session->osd : -1)
@@ -1328,22 +1328,33 @@ int Objecter::recalc_op_target(Op *op)
bool is_read = op->flags & CEPH_OSD_FLAG_READ;
bool is_write = op->flags & CEPH_OSD_FLAG_WRITE;
- op->target_oloc = op->base_oloc;
- const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool);
- if (pi) {
- if (is_read && pi->has_read_tier())
- op->target_oloc.pool = pi->read_tier;
- if (is_write && pi->has_write_tier())
- op->target_oloc.pool = pi->write_tier;
+ bool need_check_tiering = false;
+ if (op->target_oid.name.empty()) {
+ op->target_oid = op->base_oid;
+ need_check_tiering = true;
+ }
+ if (op->target_oloc.empty()) {
+ op->target_oloc = op->base_oloc;
+ need_check_tiering = true;
+ }
+
+ if (need_check_tiering) {
+ const pg_pool_t *pi = osdmap->get_pg_pool(op->base_oloc.pool);
+ if (pi) {
+ if (is_read && pi->has_read_tier())
+ op->target_oloc.pool = pi->read_tier;
+ if (is_write && pi->has_write_tier())
+ op->target_oloc.pool = pi->write_tier;
+ }
}
if (op->precalc_pgid) {
- assert(op->oid.name.empty()); // make sure this is a listing op
+ assert(op->base_oid.name.empty()); // make sure this is a listing op
ldout(cct, 10) << "recalc_op_target have " << pgid << " pool " << osdmap->have_pg_pool(pgid.pool()) << dendl;
if (!osdmap->have_pg_pool(pgid.pool()))
return RECALC_OP_TARGET_POOL_DNE;
} else {
- int ret = osdmap->object_locator_to_pg(op->oid, op->target_oloc, pgid);
+ int ret = osdmap->object_locator_to_pg(op->target_oid, op->target_oloc, pgid);
if (ret == -ENOENT)
return RECALC_OP_TARGET_POOL_DNE;
}
@@ -1485,7 +1496,8 @@ void Objecter::send_op(Op *op)
op->stamp = ceph_clock_now(cct);
MOSDOp *m = new MOSDOp(client_inc, op->tid,
- op->oid, op->target_oloc, op->pgid, osdmap->get_epoch(),
+ op->target_oid, op->target_oloc, op->pgid,
+ osdmap->get_epoch(),
flags);
m->set_snapid(op->snapid);
@@ -1546,6 +1558,15 @@ void Objecter::throttle_op(Op *op, int op_budget)
}
}
+void Objecter::unregister_op(Op *op)
+{
+ if (op->onack)
+ num_unacked--;
+ if (op->oncommit)
+ num_uncommitted--;
+ ops.erase(op->tid);
+}
+
/* This function DOES put the passed message before returning */
void Objecter::handle_osd_op_reply(MOSDOpReply *m)
{
@@ -1592,12 +1613,18 @@ void Objecter::handle_osd_op_reply(MOSDOpReply *m)
int rc = m->get_result();
+ if (m->is_redirect_reply()) {
+ ldout(cct, 5) << " got redirect reply; redirecting" << dendl;
+ unregister_op(op);
+ m->get_redirect().combine_with_locator(op->target_oloc, op->target_oid.name);
+ op_submit(op);
+ m->put();
+ return;
+ }
+
if (rc == -EAGAIN) {
ldout(cct, 7) << " got -EAGAIN, resubmitting" << dendl;
- if (op->onack)
- num_unacked--;
- if (op->oncommit)
- num_uncommitted--;
+ unregister_op(op);
op_submit(op);
m->put();
return;
@@ -2219,7 +2246,7 @@ void Objecter::dump_active()
for (map<tid_t,Op*>::iterator p = ops.begin(); p != ops.end(); ++p) {
Op *op = p->second;
ldout(cct, 20) << op->tid << "\t" << op->pgid << "\tosd." << (op->session ? op->session->osd : -1)
- << "\t" << op->oid << "\t" << op->ops << dendl;
+ << "\t" << op->base_oid << "\t" << op->ops << dendl;
}
}
@@ -2250,7 +2277,7 @@ void Objecter::dump_ops(Formatter *fmt) const
fmt->dump_int("osd", op->session ? op->session->osd : -1);
fmt->dump_stream("last_sent") << op->stamp;
fmt->dump_int("attempts", op->attempts);
- fmt->dump_stream("object_id") << op->oid;
+ fmt->dump_stream("object_id") << op->base_oid;
fmt->dump_stream("object_locator") << op->base_oloc;
fmt->dump_stream("target_object_locator") << op->target_oloc;
fmt->dump_stream("snapid") << op->snapid;
diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h
index 154ee410fde..58d52405a90 100644
--- a/src/osdc/Objecter.h
+++ b/src/osdc/Objecter.h
@@ -828,8 +828,9 @@ public:
xlist<Op*>::item session_item;
int incarnation;
- object_t oid;
+ object_t base_oid;
object_locator_t base_oloc;
+ object_t target_oid;
object_locator_t target_oloc;
pg_t pgid;
@@ -874,7 +875,7 @@ public:
Op(const object_t& o, const object_locator_t& ol, vector<OSDOp>& op,
int f, Context *ac, Context *co, version_t *ov) :
session(NULL), session_item(this), incarnation(0),
- oid(o), base_oloc(ol),
+ base_oid(o), base_oloc(ol),
used_replica(false), con(NULL),
snapid(CEPH_NOSNAP),
outbl(NULL),
@@ -1308,6 +1309,7 @@ private:
// low-level
tid_t op_submit(Op *op);
tid_t _op_submit(Op *op);
+ inline void unregister_op(Op *op);
// public interface
public:
diff --git a/src/test/encoding/types.h b/src/test/encoding/types.h
index 514c76fc1a8..fe17f077d8e 100644
--- a/src/test/encoding/types.h
+++ b/src/test/encoding/types.h
@@ -32,6 +32,7 @@ TYPE(CrushWrapper)
#include "osd/osd_types.h"
TYPE(osd_reqid_t)
TYPE(object_locator_t)
+TYPE(request_redirect_t)
TYPE(pg_t)
TYPE(coll_t)
TYPE(pow2_hist_t)