author     Sage Weil <sage@inktank.com>  2013-07-30 16:55:21 -0700
committer  Sage Weil <sage@inktank.com>  2013-07-30 16:55:21 -0700
commit     3fa1cd20b5d091ec997cf15251586d77f3cf079b (patch)
tree       3237af3bc61edf232c2639c47ad55dc4f2e3111c
parent     e747fa8e32021b6fbc85384128133cf5289bcfa8 (diff)
parent     736d6a1bde71919f7469218684c12793a9532aeb (diff)
download   ceph-3fa1cd20b5d091ec997cf15251586d77f3cf079b.tar.gz
Merge remote-tracking branch 'gh/next'
-rw-r--r--  src/common/WorkQueue.cc    |   8
-rw-r--r--  src/common/WorkQueue.h     |   1
-rw-r--r--  src/common/config_opts.h   |   2
-rw-r--r--  src/mon/Monitor.cc         |  23
-rw-r--r--  src/mon/Monitor.h          |  11
-rw-r--r--  src/mon/PGMonitor.cc       |   8
-rw-r--r--  src/mon/Paxos.cc           |   8
-rw-r--r--  src/osd/OSD.cc             |  17
-rw-r--r--  src/osd/OSD.h              |   2
-rw-r--r--  src/osd/PG.cc              |   7
-rw-r--r--  src/osd/PG.h               |   2
-rw-r--r--  src/osd/osd_types.cc       |   5
-rw-r--r--  src/osdc/Objecter.cc       |   1
-rw-r--r--  src/rgw/rgw_auth_s3.cc     |   1
-rw-r--r--  src/rgw/rgw_bucket.cc      |   6
-rw-r--r--  src/rgw/rgw_common.h       |   4
-rw-r--r--  src/rgw/rgw_op.cc          |  18
-rw-r--r--  src/rgw/rgw_rados.cc       |  46
-rw-r--r--  src/rgw/rgw_rados.h        |   2
-rw-r--r--  src/test/test_osd_types.cc | 416
20 files changed, 545 insertions(+), 43 deletions(-)
diff --git a/src/common/WorkQueue.cc b/src/common/WorkQueue.cc
index a40200a68bd..6b648a78021 100644
--- a/src/common/WorkQueue.cc
+++ b/src/common/WorkQueue.cc
@@ -49,7 +49,13 @@ ThreadPool::ThreadPool(CephContext *cct_, string nm, int n, const char *option)
}
}
-void ThreadPool::TPHandle::reset_tp_timeout() {
+void ThreadPool::TPHandle::suspend_tp_timeout()
+{
+ cct->get_heartbeat_map()->clear_timeout(hb);
+}
+
+void ThreadPool::TPHandle::reset_tp_timeout()
+{
cct->get_heartbeat_map()->reset_timeout(
hb, grace, suicide_grace);
}
diff --git a/src/common/WorkQueue.h b/src/common/WorkQueue.h
index d936d77abef..b2742accdce 100644
--- a/src/common/WorkQueue.h
+++ b/src/common/WorkQueue.h
@@ -49,6 +49,7 @@ public:
: cct(cct), hb(hb), grace(grace), suicide_grace(suicide_grace) {}
public:
void reset_tp_timeout();
+ void suspend_tp_timeout();
};
private:
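
The new TPHandle::suspend_tp_timeout() pairs with reset_tp_timeout(): a worker thread about to block on a contended lock clears its heartbeat timeout first and re-arms it once the lock is held, so the wait does not count toward the thread's grace/suicide_grace budget. A minimal sketch of the intended usage pattern, mirroring PG::lock_suspend_timeout() and OSD::OpWQ::_process() later in this merge (the queue, item, and mutex names here are hypothetical):

    // Sketch only: bracket a potentially long lock wait with the new TPHandle calls.
    void MyQueue::_process(Item *item, ThreadPool::TPHandle &handle)
    {
      handle.suspend_tp_timeout();  // stop the heartbeat timeout before blocking
      item->lock.Lock();            // may wait behind a long-running holder
      handle.reset_tp_timeout();    // re-arm grace/suicide_grace once the lock is held
      // ... do the queued work under the lock ...
      item->lock.Unlock();
    }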
diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 3b9d025393f..1c7a917602a 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -80,7 +80,7 @@ SUBSYS(journal, 1, 3)
SUBSYS(ms, 0, 5)
SUBSYS(mon, 1, 5)
SUBSYS(monc, 0, 10)
-SUBSYS(paxos, 0, 5)
+SUBSYS(paxos, 1, 5)
SUBSYS(tp, 0, 5)
SUBSYS(auth, 1, 5)
SUBSYS(crypto, 1, 5)
diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc
index 119ef740aa8..2c21e6eac69 100644
--- a/src/mon/Monitor.cc
+++ b/src/mon/Monitor.cc
@@ -619,7 +619,7 @@ void Monitor::bootstrap()
{
dout(10) << "bootstrap" << dendl;
- sync_reset();
+ sync_reset_requester();
unregister_cluster_logger();
cancel_probe_timeout();
@@ -806,23 +806,27 @@ void Monitor::sync_obtain_latest_monmap(bufferlist &bl)
latest_monmap.encode(bl, CEPH_FEATURES_ALL);
}
-void Monitor::sync_reset()
+void Monitor::sync_reset_requester()
{
+ dout(10) << __func__ << dendl;
+
if (sync_timeout_event) {
timer.cancel_event(sync_timeout_event);
sync_timeout_event = NULL;
}
- // leader state
- sync_providers.clear();
-
- // requester state
sync_provider = entity_inst_t();
sync_cookie = 0;
sync_full = false;
sync_start_version = 0;
}
+void Monitor::sync_reset_provider()
+{
+ dout(10) << __func__ << dendl;
+ sync_providers.clear();
+}
+
void Monitor::sync_start(entity_inst_t &other, bool full)
{
dout(10) << __func__ << " " << other << (full ? " full" : " recent") << dendl;
@@ -832,7 +836,7 @@ void Monitor::sync_start(entity_inst_t &other, bool full)
state = STATE_SYNCHRONIZING;
// make sure we are not a provider for anyone!
- sync_reset();
+ sync_reset_provider();
sync_full = full;
@@ -923,8 +927,6 @@ void Monitor::sync_finish(version_t last_committed)
t.erase("mon_sync", "last_committed_floor");
store->apply_transaction(t);
- sync_reset();
-
assert(g_conf->mon_sync_requester_kill_at != 9);
init_paxos();
@@ -1173,7 +1175,6 @@ void Monitor::handle_sync_chunk(MMonSync *m)
void Monitor::handle_sync_no_cookie(MMonSync *m)
{
dout(10) << __func__ << dendl;
- sync_reset();
bootstrap();
}
@@ -1763,7 +1764,7 @@ void Monitor::get_health(string& status, bufferlist *detailbl, Formatter *f)
}
if (f) {
- f->open_object_section(name.c_str());
+ f->open_object_section("mon");
f->dump_string("name", name.c_str());
f->dump_float("skew", skew);
f->dump_float("latency", latency);
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h
index bed48ecee34..69dfefe144a 100644
--- a/src/mon/Monitor.h
+++ b/src/mon/Monitor.h
@@ -168,6 +168,7 @@ public:
case STATE_ELECTING: return "electing";
case STATE_LEADER: return "leader";
case STATE_PEON: return "peon";
+ case STATE_SHUTDOWN: return "shutdown";
default: return "???";
}
}
@@ -301,10 +302,14 @@ private:
set<string> get_sync_targets_names();
/**
- * Reset the monitor's sync-related data structures and state, both
- * for the requester- and provider-side.
+ * Reset the monitor's sync-related data structures for syncing *from* a peer
*/
- void sync_reset();
+ void sync_reset_requester();
+
+ /**
+ * Reset sync state related to allowing others to sync from us
+ */
+ void sync_reset_provider();
/**
* Called when a sync attempt times out (requester-side)
diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc
index d86cbe70c19..93b0b0b3828 100644
--- a/src/mon/PGMonitor.cc
+++ b/src/mon/PGMonitor.cc
@@ -1332,12 +1332,16 @@ bool PGMonitor::preprocess_command(MMonCommand *m)
// perhaps these would be better in the parsing, but it's weird
if (prefix == "pg dump_json") {
+ vector<string> v;
+ v.push_back(string("all"));
cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
- cmd_putval(g_ceph_context, cmdmap, "dumpcontents", string("all"));
+ cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
prefix = "pg dump";
} else if (prefix == "pg dump_pools_json") {
+ vector<string> v;
+ v.push_back(string("pools"));
cmd_putval(g_ceph_context, cmdmap, "format", string("json"));
- cmd_putval(g_ceph_context, cmdmap, "dumpcontents", string("pool"));
+ cmd_putval(g_ceph_context, cmdmap, "dumpcontents", v);
prefix = "pg dump";
}
diff --git a/src/mon/Paxos.cc b/src/mon/Paxos.cc
index a543abed7ed..445413da13b 100644
--- a/src/mon/Paxos.cc
+++ b/src/mon/Paxos.cc
@@ -492,7 +492,7 @@ void Paxos::handle_last(MMonPaxos *last)
void Paxos::collect_timeout()
{
- dout(5) << "collect timeout, calling fresh election" << dendl;
+ dout(1) << "collect timeout, calling fresh election" << dendl;
collect_timeout_event = 0;
assert(mon->is_leader());
mon->bootstrap();
@@ -711,7 +711,7 @@ void Paxos::handle_accept(MMonPaxos *accept)
void Paxos::accept_timeout()
{
- dout(5) << "accept timeout, calling fresh election" << dendl;
+ dout(1) << "accept timeout, calling fresh election" << dendl;
accept_timeout_event = 0;
assert(mon->is_leader());
assert(is_updating() || is_updating_previous());
@@ -1004,7 +1004,7 @@ void Paxos::handle_lease_ack(MMonPaxos *ack)
void Paxos::lease_ack_timeout()
{
- dout(5) << "lease_ack_timeout -- calling new election" << dendl;
+ dout(1) << "lease_ack_timeout -- calling new election" << dendl;
assert(mon->is_leader());
assert(is_active());
@@ -1023,7 +1023,7 @@ void Paxos::reset_lease_timeout()
void Paxos::lease_timeout()
{
- dout(5) << "lease_timeout -- calling new election" << dendl;
+ dout(1) << "lease_timeout -- calling new election" << dendl;
assert(mon->is_peon());
lease_timeout_event = 0;
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index e3a7c227e15..89aa1db34eb 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -4957,13 +4957,22 @@ void OSD::handle_osd_map(MOSDMap *m)
if (first > osdmap->get_epoch() + 1) {
dout(10) << "handle_osd_map message skips epochs " << osdmap->get_epoch() + 1
<< ".." << (first-1) << dendl;
- if ((m->oldest_map < first && osdmap->get_epoch() == 0) ||
- m->oldest_map <= osdmap->get_epoch()) {
+ if (m->oldest_map <= osdmap->get_epoch() + 1) {
monc->sub_want("osdmap", osdmap->get_epoch()+1, CEPH_SUBSCRIBE_ONETIME);
monc->renew_subs();
m->put();
return;
}
+ // always try to get the full range of maps--as many as we can. this
+ // 1- is good to have
+ // 2- is at present the only way to ensure that we get a *full* map as
+ // the first map!
+ if (m->oldest_map < first) {
+ monc->sub_want("osdmap", m->oldest_map - 1, CEPH_SUBSCRIBE_ONETIME);
+ monc->renew_subs();
+ m->put();
+ return;
+ }
skip_maps = true;
}
@@ -7041,9 +7050,9 @@ PGRef OSD::OpWQ::_dequeue()
return pg;
}
-void OSD::OpWQ::_process(PGRef pg)
+void OSD::OpWQ::_process(PGRef pg, ThreadPool::TPHandle &handle)
{
- pg->lock();
+ pg->lock_suspend_timeout(handle);
OpRequestRef op;
{
Mutex::Locker l(qlock);
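
The handle_osd_map() hunk above simplifies when the OSD re-subscribes for missing epochs: if the monitor still has the epochs right after our current map, ask for those; otherwise ask from just before the monitor's oldest map so that the first map received is a full map. A small standalone restatement of that decision (hypothetical helper, epochs as plain integers; returns the epoch to subscribe from, or 0 when the incoming maps can be applied as-is):

    // Hypothetical helper restating the re-subscription choice in handle_osd_map().
    static unsigned resubscribe_from(unsigned cur_epoch, unsigned first, unsigned oldest_map)
    {
      if (first <= cur_epoch + 1)
        return 0;                  // no gap: apply the maps we were sent
      if (oldest_map <= cur_epoch + 1)
        return cur_epoch + 1;      // mon still has the missing epochs: just ask for them
      if (oldest_map < first)
        return oldest_map - 1;     // ask from before the mon's oldest map so that the
                                   // first map we receive is a full map
      return 0;                    // message already starts at the mon's oldest map:
                                   // apply it (handle_osd_map sets skip_maps here)
    }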
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index 5bcff7442d7..478f766d145 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -911,7 +911,7 @@ private:
bool _empty() {
return pqueue.empty();
}
- void _process(PGRef pg);
+ void _process(PGRef pg, ThreadPool::TPHandle &handle);
} op_wq;
void enqueue_op(PG *pg, OpRequestRef op);
diff --git a/src/osd/PG.cc b/src/osd/PG.cc
index 9f957b8e054..f731441e8a4 100644
--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -193,6 +193,13 @@ PG::~PG()
#endif
}
+void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
+{
+ handle.suspend_tp_timeout();
+ lock();
+ handle.reset_tp_timeout();
+}
+
void PG::lock(bool no_lockdep)
{
_lock.Lock(no_lockdep);
diff --git a/src/osd/PG.h b/src/osd/PG.h
index 10e9a2544a9..8f572c75e19 100644
--- a/src/osd/PG.h
+++ b/src/osd/PG.h
@@ -245,6 +245,8 @@ protected:
public:
bool deleting; // true while in removing or OSD is shutting down
+
+ void lock_suspend_timeout(ThreadPool::TPHandle &handle);
void lock(bool no_lockdep = false);
void unlock() {
//generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
diff --git a/src/osd/osd_types.cc b/src/osd/osd_types.cc
index 02c1ef7b69d..fbd5cbbe9a0 100644
--- a/src/osd/osd_types.cc
+++ b/src/osd/osd_types.cc
@@ -1715,7 +1715,7 @@ bool pg_interval_t::check_new_interval(
if (!i.acting.empty() &&
i.acting.size() >=
- osdmap->get_pools().find(pool_id)->second.min_size) {
+ lastmap->get_pools().find(pool_id)->second.min_size) {
if (out)
*out << "generate_past_intervals " << i
<< ": not rw,"
@@ -1730,6 +1730,7 @@ bool pg_interval_t::check_new_interval(
*out << "generate_past_intervals " << i
<< " : primary up " << lastmap->get_up_from(i.acting[0])
<< "-" << lastmap->get_up_thru(i.acting[0])
+ << " includes interval"
<< std::endl;
} else if (last_epoch_clean >= i.first &&
last_epoch_clean <= i.last) {
@@ -1758,7 +1759,7 @@ bool pg_interval_t::check_new_interval(
} else {
i.maybe_went_rw = false;
if (out)
- *out << "generate_past_intervals " << i << " : empty" << std::endl;
+ *out << "generate_past_intervals " << i << " : acting set is too small" << std::endl;
}
return true;
} else {
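
The key change above is that the old acting set is now judged against lastmap's min_size rather than osdmap's: the interval being recorded ran under lastmap, and the newer map's min_size may already have changed (see http://tracker.ceph.com/issues/5780 and the new check_new_interval test below). A small standalone illustration, with a hypothetical helper and hypothetical min_size values:

    #include <vector>

    // The recorded interval ran under lastmap, so its acting set must be judged
    // against lastmap's min_size, not the min_size in the newer osdmap.
    static bool interval_could_have_been_rw(const std::vector<int> &old_acting,
                                            unsigned lastmap_min_size)
    {
      return !old_acting.empty() && old_acting.size() >= lastmap_min_size;
    }

    // Example: min_size was 1 under lastmap and is raised to 2 in osdmap.
    // Judging acting = {osd.1} against the new min_size (2) would wrongly report
    // "not rw"; against lastmap's min_size (1) the interval is correctly rw.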
diff --git a/src/osdc/Objecter.cc b/src/osdc/Objecter.cc
index aefe74088e9..70d296a3ab3 100644
--- a/src/osdc/Objecter.cc
+++ b/src/osdc/Objecter.cc
@@ -2436,6 +2436,7 @@ int Objecter::recalc_command_target(CommandOp *c)
c->session = s;
s->command_ops.push_back(&c->session_item);
} else {
+ c->session = NULL;
num_homeless_ops++;
}
return RECALC_OP_TARGET_NEED_RESEND;
diff --git a/src/rgw/rgw_auth_s3.cc b/src/rgw/rgw_auth_s3.cc
index c93de7cd58a..f3f0c8322f0 100644
--- a/src/rgw/rgw_auth_s3.cc
+++ b/src/rgw/rgw_auth_s3.cc
@@ -6,6 +6,7 @@
static const char *signed_subresources[] = {
"acl",
+ "cors",
"delete",
"lifecycle",
"location",
diff --git a/src/rgw/rgw_bucket.cc b/src/rgw/rgw_bucket.cc
index bf8da99d616..d32af5df601 100644
--- a/src/rgw/rgw_bucket.cc
+++ b/src/rgw/rgw_bucket.cc
@@ -202,7 +202,11 @@ int rgw_bucket_set_attrs(RGWRados *store, rgw_bucket& bucket,
string oid;
store->get_bucket_meta_oid(bucket, oid);
rgw_obj obj(store->zone.domain_root, oid);
- return store->meta_mgr->set_attrs(bucket_meta_handler, oid,
+
+ string key;
+  store->get_bucket_instance_entry(bucket, key); /* we want the bucket instance name without
+                                                    the oid prefix cruft */
+ return store->meta_mgr->set_attrs(bucket_instance_meta_handler, key,
obj, attrs, rmattrs, objv_tracker);
}
diff --git a/src/rgw/rgw_common.h b/src/rgw/rgw_common.h
index 543bdf21377..8e4126de271 100644
--- a/src/rgw/rgw_common.h
+++ b/src/rgw/rgw_common.h
@@ -542,6 +542,10 @@ struct rgw_bucket {
std::string marker;
std::string bucket_id;
+ std::string oid; /*
+ * runtime in-memory only info. If not empty, points to the bucket instance object
+ */
+
rgw_bucket() { }
rgw_bucket(const char *n) : name(n) {
assert(*n == '.'); // only rgw private buckets should be initialized without pool
diff --git a/src/rgw/rgw_op.cc b/src/rgw/rgw_op.cc
index 7760a2f5c52..e672de154ab 100644
--- a/src/rgw/rgw_op.cc
+++ b/src/rgw/rgw_op.cc
@@ -241,8 +241,10 @@ static int get_policy_from_attr(CephContext *cct, RGWRados *store, void *ctx,
}
if (obj.object.empty()) {
+ rgw_obj instance_obj;
+ store->get_bucket_instance_obj(bucket_info.bucket, instance_obj);
return get_bucket_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs,
- policy, obj);
+ policy, instance_obj);
}
return get_obj_policy_from_attr(cct, store, ctx, bucket_info, bucket_attrs,
policy, obj);
@@ -1898,9 +1900,8 @@ void RGWPutCORS::execute()
RGWObjVersionTracker *ptracker = (s->object ? NULL : &s->bucket_info.objv_tracker);
- string no_obj;
cors_config->encode(bl);
- obj.init(s->bucket, no_obj);
+ store->get_bucket_instance_obj(s->bucket, obj);
store->set_atomic(s->obj_ctx, obj);
ret = store->set_attr(s->obj_ctx, obj, RGW_ATTR_CORS, bl, ptracker);
}
@@ -1917,13 +1918,12 @@ void RGWDeleteCORS::execute()
{
bufferlist bl;
rgw_obj obj;
- string no_obj;
if (!s->bucket_cors) {
dout(2) << "No CORS configuration set yet for this bucket" << dendl;
ret = -ENOENT;
return;
}
- obj.init(s->bucket, no_obj);
+ store->get_bucket_instance_obj(s->bucket, obj);
store->set_atomic(s->obj_ctx, obj);
map<string, bufferlist> orig_attrs, attrs, rmattrs;
map<string, bufferlist>::iterator iter;
@@ -2516,10 +2516,10 @@ int RGWHandler::read_cors_config(void)
bufferlist bl;
dout(10) << "Going to read cors from attrs" << dendl;
- string no_object;
- rgw_obj no_obj(s->bucket, no_object);
- if (no_obj.bucket.name.size()) {
- ret = store->get_attr(s->obj_ctx, no_obj, RGW_ATTR_CORS, bl);
+ rgw_obj obj;
+ store->get_bucket_instance_obj(s->bucket, obj);
+ if (obj.bucket.name.size()) {
+ ret = store->get_attr(s->obj_ctx, obj, RGW_ATTR_CORS, bl);
if (ret >= 0) {
bufferlist::iterator iter = bl.begin();
s->bucket_cors = new RGWCORSConfiguration();
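
All three CORS paths above now address bucket attributes through the bucket instance object instead of an rgw_obj built with an empty object name. The shared pattern, condensed from the hunks above (error handling trimmed; get_bucket_instance_obj() is added below in rgw_rados.cc):

    rgw_obj obj;
    store->get_bucket_instance_obj(s->bucket, obj);  // bucket.oid if set, else the meta oid
    store->set_atomic(s->obj_ctx, obj);
    ret = store->get_attr(s->obj_ctx, obj, RGW_ATTR_CORS, bl);  // or set_attr(...) when writing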
diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc
index 8af03b03a8f..aba5cdf0ee2 100644
--- a/src/rgw/rgw_rados.cc
+++ b/src/rgw/rgw_rados.cc
@@ -831,6 +831,11 @@ void RGWRados::finalize()
RGWRESTConn *conn = iter->second;
delete conn;
}
+
+ for (iter = region_conn_map.begin(); iter != region_conn_map.end(); ++iter) {
+ RGWRESTConn *conn = iter->second;
+ delete conn;
+ }
}
/**
@@ -896,6 +901,12 @@ int RGWRados::init_complete()
}
RGWRegion& region = iter->second;
rest_master_conn = new RGWRESTConn(cct, this, region.endpoints);
+
+ for (iter = region_map.regions.begin(); iter != region_map.regions.end(); ++iter) {
+ RGWRegion& region = iter->second;
+
+ region_conn_map[region.name] = new RGWRESTConn(cct, this, region.endpoints);
+ }
}
map<string, RGWZone>::iterator ziter;
@@ -2535,7 +2546,17 @@ int RGWRados::copy_obj(void *ctx,
RGWRESTConn *conn;
if (source_zone.empty()) {
- conn = rest_master_conn;
+ if (dest_bucket_info.region.empty()) {
+ /* source is in the master region */
+ conn = rest_master_conn;
+ } else {
+ map<string, RGWRESTConn *>::iterator iter = region_conn_map.find(src_bucket_info.region);
+ if (iter == region_conn_map.end()) {
+ ldout(cct, 0) << "could not find region connection to region: " << source_zone << dendl;
+ return -ENOENT;
+ }
+ conn = iter->second;
+ }
} else {
map<string, RGWRESTConn *>::iterator iter = zone_conn_map.find(source_zone);
if (iter == zone_conn_map.end()) {
@@ -2886,7 +2907,7 @@ int RGWRados::set_bucket_owner(rgw_bucket& bucket, ACLOwner& owner)
{
RGWBucketInfo info;
map<string, bufferlist> attrs;
- int r = get_bucket_instance_info(NULL, bucket, info, NULL, &attrs);
+ int r = get_bucket_info(NULL, bucket.name, info, NULL, &attrs);
if (r < 0) {
ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << dendl;
return r;
@@ -2919,7 +2940,7 @@ int RGWRados::set_buckets_enabled(vector<rgw_bucket>& buckets, bool enabled)
RGWBucketInfo info;
map<string, bufferlist> attrs;
- int r = get_bucket_instance_info(NULL, bucket, info, NULL, &attrs);
+ int r = get_bucket_info(NULL, bucket.name, info, NULL, &attrs);
if (r < 0) {
ldout(cct, 0) << "NOTICE: get_bucket_info on bucket=" << bucket.name << " returned err=" << r << ", skipping bucket" << dendl;
ret = r;
@@ -4546,6 +4567,17 @@ void RGWRados::get_bucket_meta_oid(rgw_bucket& bucket, string& oid)
oid = RGW_BUCKET_INSTANCE_MD_PREFIX + entry;
}
+void RGWRados::get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj)
+{
+ if (!bucket.oid.empty()) {
+ obj.init(zone.domain_root, bucket.oid);
+ } else {
+ string oid;
+ get_bucket_meta_oid(bucket, oid);
+ obj.init(zone.domain_root, oid);
+ }
+}
+
int RGWRados::get_bucket_instance_info(void *ctx, const string& meta_key, RGWBucketInfo& info,
time_t *pmtime, map<string, bufferlist> *pattrs)
{
@@ -4562,7 +4594,11 @@ int RGWRados::get_bucket_instance_info(void *ctx, rgw_bucket& bucket, RGWBucketI
time_t *pmtime, map<string, bufferlist> *pattrs)
{
string oid;
- get_bucket_meta_oid(bucket, oid);
+ if (bucket.oid.empty()) {
+ get_bucket_meta_oid(bucket, oid);
+ } else {
+ oid = bucket.oid;
+ }
return get_bucket_instance_from_oid(ctx, oid, info, pmtime, pattrs);
}
@@ -4586,6 +4622,7 @@ int RGWRados::get_bucket_instance_from_oid(void *ctx, string& oid, RGWBucketInfo
ldout(cct, 0) << "ERROR: could not decode buffer info, caught buffer::error" << dendl;
return -EIO;
}
+ info.bucket.oid = oid;
return 0;
}
@@ -4628,6 +4665,7 @@ int RGWRados::get_bucket_info(void *ctx, string& bucket_name, RGWBucketInfo& inf
if (entry_point.has_bucket_info) {
info = entry_point.old_bucket_info;
+ info.bucket.oid = bucket_name;
info.ep_objv = ot.read_version;
ldout(cct, 20) << "rgw_get_bucket_info: old bucket info, bucket=" << info.bucket << " owner " << info.owner << dendl;
return 0;
diff --git a/src/rgw/rgw_rados.h b/src/rgw/rgw_rados.h
index bcc40900299..d01f76ec224 100644
--- a/src/rgw/rgw_rados.h
+++ b/src/rgw/rgw_rados.h
@@ -888,6 +888,7 @@ public:
RGWRegionMap region_map;
RGWRESTConn *rest_master_conn;
map<string, RGWRESTConn *> zone_conn_map;
+ map<string, RGWRESTConn *> region_conn_map;
RGWMetadataManager *meta_mgr;
@@ -1285,6 +1286,7 @@ public:
int decode_policy(bufferlist& bl, ACLOwner *owner);
int get_bucket_stats(rgw_bucket& bucket, uint64_t *bucket_ver, uint64_t *master_ver, map<RGWObjCategory, RGWBucketStats>& stats,
string *max_marker);
+ void get_bucket_instance_obj(rgw_bucket& bucket, rgw_obj& obj);
void get_bucket_instance_entry(rgw_bucket& bucket, string& entry);
void get_bucket_meta_oid(rgw_bucket& bucket, string& oid);
diff --git a/src/test/test_osd_types.cc b/src/test/test_osd_types.cc
index 7a43b7b892a..fa4ae6163ac 100644
--- a/src/test/test_osd_types.cc
+++ b/src/test/test_osd_types.cc
@@ -17,6 +17,7 @@
#include "include/types.h"
#include "osd/osd_types.h"
+#include "osd/OSDMap.h"
#include "gtest/gtest.h"
#include "common/Thread.h"
@@ -117,6 +118,421 @@ TEST(hobject, prefixes5)
ASSERT_EQ(prefixes_out, prefixes_correct);
}
+TEST(pg_interval_t, check_new_interval)
+{
+ //
+ // Create a situation where osdmaps are the same so that
+ // each test case can diverge from it using minimal code.
+ //
+ int osd_id = 1;
+ epoch_t epoch = 40;
+ std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap());
+ osdmap->set_max_osd(10);
+ osdmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ osdmap->set_epoch(epoch);
+ std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap());
+ lastmap->set_max_osd(10);
+ lastmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ lastmap->set_epoch(epoch);
+ epoch_t same_interval_since = epoch;
+ epoch_t last_epoch_clean = same_interval_since;
+ int64_t pool_id = 200;
+ int pg_num = 4;
+ __u8 min_size = 2;
+ {
+ OSDMap::Incremental inc(epoch + 1);
+ inc.new_pools[pool_id].min_size = min_size;
+ inc.new_pools[pool_id].set_pg_num(pg_num);
+ inc.new_up_thru[osd_id] = epoch + 1;
+ osdmap->apply_incremental(inc);
+ lastmap->apply_incremental(inc);
+ }
+ vector<int> new_acting;
+ new_acting.push_back(osd_id);
+ new_acting.push_back(osd_id + 1);
+ vector<int> old_acting = new_acting;
+ vector<int> new_up;
+ new_up.push_back(osd_id);
+ vector<int> old_up = new_up;
+ pg_t pgid;
+ pgid.set_pool(pool_id);
+
+ //
+ // Do nothing if there are no modifications in
+ // acting, up or pool size and that the pool is not
+ // being split
+ //
+ {
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_FALSE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals));
+ ASSERT_TRUE(past_intervals.empty());
+ }
+
+ //
+ // pool did not exist in the old osdmap
+ //
+ {
+ std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap());
+ lastmap->set_max_osd(10);
+ lastmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ lastmap->set_epoch(epoch);
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
+ ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
+ }
+
+ //
+ // The acting set has changed
+ //
+ {
+ vector<int> new_acting;
+ int new_primary = osd_id + 1;
+ new_acting.push_back(new_primary);
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
+ ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
+ }
+
+ //
+ // The up set has changed
+ //
+ {
+ vector<int> new_up;
+ int new_primary = osd_id + 1;
+ new_up.push_back(new_primary);
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
+ ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
+ }
+
+ //
+ // PG is splitting
+ //
+ {
+ std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap());
+ osdmap->set_max_osd(10);
+ osdmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ osdmap->set_epoch(epoch);
+ int new_pg_num = pg_num ^ 2;
+ OSDMap::Incremental inc(epoch + 1);
+ inc.new_pools[pool_id].min_size = min_size;
+ inc.new_pools[pool_id].set_pg_num(new_pg_num);
+ osdmap->apply_incremental(inc);
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
+ ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
+ }
+
+ //
+ // PG size has changed
+ //
+ {
+ std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap());
+ osdmap->set_max_osd(10);
+ osdmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ osdmap->set_epoch(epoch);
+ OSDMap::Incremental inc(epoch + 1);
+ __u8 new_min_size = min_size + 1;
+ inc.new_pools[pool_id].min_size = new_min_size;
+ inc.new_pools[pool_id].set_pg_num(pg_num);
+ osdmap->apply_incremental(inc);
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_EQ(same_interval_since, past_intervals[same_interval_since].first);
+ ASSERT_EQ(osdmap->get_epoch() - 1, past_intervals[same_interval_since].last);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].acting[0]);
+ ASSERT_EQ(osd_id, past_intervals[same_interval_since].up[0]);
+ }
+
+ //
+ // The old acting set was empty : the previous interval could not
+ // have been rw
+ //
+ {
+ vector<int> old_acting;
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ostringstream out;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals,
+ &out));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw);
+ ASSERT_NE(string::npos, out.str().find("acting set is too small"));
+ }
+
+ //
+ // The old acting set did not have enough osd : it could
+ // not have been rw
+ //
+ {
+ vector<int> old_acting;
+ old_acting.push_back(osd_id);
+
+ //
+ // see http://tracker.ceph.com/issues/5780
+ // the size of the old acting set should be compared
+ // with the min_size of the old osdmap
+ //
+ // The new osdmap is created so that it triggers the
+ // bug.
+ //
+ std::tr1::shared_ptr<OSDMap> osdmap(new OSDMap());
+ osdmap->set_max_osd(10);
+ osdmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ osdmap->set_epoch(epoch);
+ OSDMap::Incremental inc(epoch + 1);
+ __u8 new_min_size = old_acting.size();
+ inc.new_pools[pool_id].min_size = new_min_size;
+ inc.new_pools[pool_id].set_pg_num(pg_num);
+ osdmap->apply_incremental(inc);
+
+ ostringstream out;
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals,
+ &out));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw);
+ ASSERT_NE(string::npos, out.str().find("acting set is too small"));
+ }
+
+ //
+ // The acting set changes. The old acting set primary was up during the
+ // previous interval and may have been rw.
+ //
+ {
+ vector<int> new_acting;
+ new_acting.push_back(osd_id + 4);
+ new_acting.push_back(osd_id + 5);
+
+ ostringstream out;
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals,
+ &out));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_TRUE(past_intervals[same_interval_since].maybe_went_rw);
+ ASSERT_NE(string::npos, out.str().find("includes interval"));
+ }
+ //
+ // The acting set changes. The old acting set primary was not up
+ // during the old interval but last_epoch_clean is in the
+ // old interval and it may have been rw.
+ //
+ {
+ vector<int> new_acting;
+ new_acting.push_back(osd_id + 4);
+ new_acting.push_back(osd_id + 5);
+
+ std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap());
+ lastmap->set_max_osd(10);
+ lastmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ lastmap->set_epoch(epoch);
+ OSDMap::Incremental inc(epoch + 1);
+ inc.new_pools[pool_id].min_size = min_size;
+ inc.new_pools[pool_id].set_pg_num(pg_num);
+ inc.new_up_thru[osd_id] = epoch - 10;
+ lastmap->apply_incremental(inc);
+
+ ostringstream out;
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals,
+ &out));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_TRUE(past_intervals[same_interval_since].maybe_went_rw);
+ ASSERT_NE(string::npos, out.str().find("presumed to have been rw"));
+ }
+
+ //
+ // The acting set changes. The old acting set primary was not up
+ // during the old interval and last_epoch_clean is before the
+ // old interval : the previous interval could not possibly have
+ // been rw.
+ //
+ {
+ vector<int> new_acting;
+ new_acting.push_back(osd_id + 4);
+ new_acting.push_back(osd_id + 5);
+
+ epoch_t last_epoch_clean = epoch - 10;
+
+ std::tr1::shared_ptr<OSDMap> lastmap(new OSDMap());
+ lastmap->set_max_osd(10);
+ lastmap->set_state(osd_id, CEPH_OSD_EXISTS);
+ lastmap->set_epoch(epoch);
+ OSDMap::Incremental inc(epoch + 1);
+ inc.new_pools[pool_id].min_size = min_size;
+ inc.new_pools[pool_id].set_pg_num(pg_num);
+ inc.new_up_thru[osd_id] = last_epoch_clean;
+ lastmap->apply_incremental(inc);
+
+ ostringstream out;
+
+ map<epoch_t, pg_interval_t> past_intervals;
+
+ ASSERT_TRUE(past_intervals.empty());
+ ASSERT_TRUE(pg_interval_t::check_new_interval(old_acting,
+ new_acting,
+ old_up,
+ new_up,
+ same_interval_since,
+ last_epoch_clean,
+ osdmap,
+ lastmap,
+ pool_id,
+ pgid,
+ &past_intervals,
+ &out));
+ ASSERT_EQ((unsigned int)1, past_intervals.size());
+ ASSERT_FALSE(past_intervals[same_interval_since].maybe_went_rw);
+ ASSERT_NE(string::npos, out.str().find("does not include interval"));
+ }
+}
+
TEST(pg_t, split)
{
pg_t pgid(0, 0, -1);