// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef CEPH_MDSMAP_H #define CEPH_MDSMAP_H #include #include "include/types.h" #include "common/Clock.h" #include "msg/Message.h" #include #include #include using namespace std; #include "common/config.h" #include "include/CompatSet.h" #include "common/Formatter.h" /* boot --> standby, creating, or starting. dne ----> creating -----> active* ^ ^___________/ / ^ ^ | / / | destroying / / | ^ / / | | / / | stopped <---- stopping* <-/ / | \ / | ----- starting* ----/ | | failed | \ | \--> replay* --> reconnect* --> rejoin* * = can fail */ class md_config_t; class CephContext; extern CompatSet get_mdsmap_compat_set(); extern CompatSet get_mdsmap_compat_set_base(); // pre v0.20 #define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20") #define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges") #define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs") #define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object") class MDSMap { public: // mds states /* static const int STATE_DNE = CEPH_MDS_STATE_DNE; // down, never existed. static const int STATE_DESTROYING = CEPH_MDS_STATE_DESTROYING; // down, existing, semi-destroyed. static const int STATE_FAILED = CEPH_MDS_STATE_FAILED; // down, active subtrees; needs to be recovered. */ static const int STATE_STOPPED = CEPH_MDS_STATE_STOPPED; // down, once existed, but no subtrees. empty log. static const int STATE_BOOT = CEPH_MDS_STATE_BOOT; // up, boot announcement. destiny unknown. static const int STATE_STANDBY = CEPH_MDS_STATE_STANDBY; // up, idle. waiting for assignment by monitor. static const int STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY; // up, replaying active node; ready to take over. static const int STATE_ONESHOT_REPLAY = CEPH_MDS_STATE_REPLAYONCE; //up, replaying active node journal to verify it, then shutting down static const int STATE_CREATING = CEPH_MDS_STATE_CREATING; // up, creating MDS instance (new journal, idalloc..). static const int STATE_STARTING = CEPH_MDS_STATE_STARTING; // up, starting prior stopped MDS instance. static const int STATE_REPLAY = CEPH_MDS_STATE_REPLAY; // up, starting prior failed instance. scanning journal. static const int STATE_RESOLVE = CEPH_MDS_STATE_RESOLVE; // up, disambiguating distributed operations (import, rename, etc.) static const int STATE_RECONNECT = CEPH_MDS_STATE_RECONNECT; // up, reconnect to clients static const int STATE_REJOIN = CEPH_MDS_STATE_REJOIN; // up, replayed journal, rejoining distributed cache static const int STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY; // up, active static const int STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE; // up, active static const int STATE_STOPPING = CEPH_MDS_STATE_STOPPING; // up, exporting metadata (-> standby or out) // indicate startup standby preferences for MDS // of course, if they have a specific rank to follow, they just set that! static const int MDS_NO_STANDBY_PREF = -1; // doesn't have instructions to do anything static const int MDS_STANDBY_ANY = -2; // is instructed to be standby-replay, may // or may not have specific name to follow static const int MDS_STANDBY_NAME = -3; // standby for a named MDS static const int MDS_MATCHED_ACTIVE = -4; // has a matched standby, which if up // it should follow, but otherwise should // be assigned a rank struct mds_info_t { uint64_t global_id; string name; int32_t rank; int32_t inc; int32_t state; version_t state_seq; entity_addr_t addr; utime_t laggy_since; int32_t standby_for_rank; string standby_for_name; set export_targets; mds_info_t() : global_id(0), rank(-1), inc(0), state(STATE_STANDBY), state_seq(0), standby_for_rank(MDS_NO_STANDBY_PREF) { } bool laggy() const { return !(laggy_since == utime_t()); } void clear_laggy() { laggy_since = utime_t(); } entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); } void encode(bufferlist& bl) const { __u8 v = 3; ::encode(v, bl); ::encode(global_id, bl); ::encode(name, bl); ::encode(rank, bl); ::encode(inc, bl); ::encode(state, bl); ::encode(state_seq, bl); ::encode(addr, bl); ::encode(laggy_since, bl); ::encode(standby_for_rank, bl); ::encode(standby_for_name, bl); ::encode(export_targets, bl); } void decode(bufferlist::iterator& bl) { __u8 v; ::decode(v, bl); ::decode(global_id, bl); ::decode(name, bl); ::decode(rank, bl); ::decode(inc, bl); ::decode(state, bl); ::decode(state_seq, bl); ::decode(addr, bl); ::decode(laggy_since, bl); ::decode(standby_for_rank, bl); ::decode(standby_for_name, bl); if (v >= 2) ::decode(export_targets, bl); } void dump(Formatter *f) const; }; protected: // base map epoch_t epoch; uint32_t flags; // flags epoch_t last_failure; // mds epoch of last failure epoch_t last_failure_osd_epoch; // osd epoch of last failure; any mds entering replay needs // at least this osdmap to ensure the blacklist propagates. utime_t created, modified; int32_t tableserver; // which MDS has anchortable, snaptable int32_t root; // which MDS has root directory __u32 session_timeout; __u32 session_autoclose; uint64_t max_file_size; vector data_pg_pools; // file data pg_pools available to clients (via an ioctl). first is the default. int64_t cas_pg_pool; // where CAS objects go int64_t metadata_pg_pool; // where fs metadata objects go /* * in: the set of logical mds #'s that define the cluster. this is the set * of mds's the metadata may be distributed over. * up: map from logical mds #'s to the addrs filling those roles. * failed: subset of @in that are failed. * stopped: set of nodes that have been initialized, but are not active. * * @up + @failed = @in. @in * @stopped = {}. */ uint32_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */ set in; // currently defined cluster map inc; // most recent incarnation. set failed, stopped; // which roles are failed or stopped map up; // who is in those roles map mds_info; public: CompatSet compat; friend class MDSMonitor; public: MDSMap() : epoch(0), flags(0), last_failure(0), last_failure_osd_epoch(0), tableserver(0), root(0), cas_pg_pool(-1), metadata_pg_pool(0) { } utime_t get_session_timeout() { return utime_t(session_timeout,0); } uint64_t get_max_filesize() { return max_file_size; } int get_flags() const { return flags; } int test_flag(int f) const { return flags & f; } void set_flag(int f) { flags |= f; } void clear_flag(int f) { flags &= ~f; } epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } const utime_t& get_created() const { return created; } void set_created(utime_t ct) { modified = created = ct; } const utime_t& get_modified() const { return modified; } void set_modified(utime_t mt) { modified = mt; } epoch_t get_last_failure() const { return last_failure; } epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; } unsigned get_max_mds() const { return max_mds; } void set_max_mds(int m) { max_mds = m; } int get_tableserver() const { return tableserver; } int get_root() const { return root; } const vector &get_data_pg_pools() const { return data_pg_pools; } int64_t get_data_pg_pool() const { return data_pg_pools[0]; } int64_t get_cas_pg_pool() const { return cas_pg_pool; } int64_t get_metadata_pg_pool() const { return metadata_pg_pool; } const map& get_mds_info() { return mds_info; } const mds_info_t& get_mds_info_gid(uint64_t gid) { assert(mds_info.count(gid)); return mds_info[gid]; } const mds_info_t& get_mds_info(int m) { assert(up.count(m) && mds_info.count(up[m])); return mds_info[up[m]]; } // counts unsigned get_num_in_mds() { return in.size(); } unsigned get_num_up_mds() { return up.size(); } int get_num_failed_mds() { return failed.size(); } unsigned get_num_mds(int state) { unsigned n = 0; for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state == state) ++n; return n; } // data pools void add_data_pg_pool(int64_t poolid) { data_pg_pools.push_back(poolid); } int remove_data_pg_pool(int64_t poolid) { for (vector::iterator p = data_pg_pools.begin(); p != data_pg_pools.end(); ++p) { if (*p == poolid) { data_pg_pools.erase(p); return 0; } } return -ENOENT; } // sets void get_mds_set(set& s) { s = in; } void get_up_mds_set(set& s) { for (map::const_iterator p = up.begin(); p != up.end(); ++p) s.insert(p->first); } void get_active_mds_set(set& s) { get_mds_set(s, MDSMap::STATE_ACTIVE); } void get_failed_mds_set(set& s) { s = failed; } int get_failed() { if (!failed.empty()) return *failed.begin(); return -1; } void get_stopped_mds_set(set& s) { s = stopped; } void get_recovery_mds_set(set& s) { s = failed; for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) s.insert(p->second.rank); } void get_mds_set(set& s, int state) { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state == state) s.insert(p->second.rank); } int get_random_up_mds() { if (up.empty()) return -1; map::iterator p = up.begin(); for (int n = rand() % up.size(); n; n--) p++; return p->first; } const mds_info_t* find_by_name(const string& name) const { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { if (p->second.name == name) return &p->second; } return NULL; } uint64_t find_standby_for(int mds, string& name) { map::const_iterator generic_standby = mds_info.end(); for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { if ((p->second.state != MDSMap::STATE_STANDBY && p->second.state != MDSMap::STATE_STANDBY_REPLAY) || p->second.laggy() || p->second.rank >= 0) continue; if (p->second.standby_for_rank == mds || (name.length() && p->second.standby_for_name == name)) return p->first; if (p->second.standby_for_rank < 0 && p->second.standby_for_name.length() == 0) generic_standby = p; } if (generic_standby != mds_info.end()) return generic_standby->first; return 0; } uint64_t find_unused_for(int mds, string& name) { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { if (p->second.state != MDSMap::STATE_STANDBY || p->second.laggy() || p->second.rank >= 0) continue; if ((p->second.standby_for_rank == MDS_NO_STANDBY_PREF || p->second.standby_for_rank == MDS_MATCHED_ACTIVE || (p->second.standby_for_rank == MDS_STANDBY_ANY && g_conf->mon_force_standby_active))) { return p->first; } } return 0; } uint64_t find_replacement_for(int mds, string& name) { uint64_t standby = find_standby_for(mds, name); if (standby) return standby; else return find_unused_for(mds, name); } void get_health(list >& summary, list > *detail) const; // mds states bool is_down(int m) { return up.count(m) == 0; } bool is_up(int m) { return up.count(m); } bool is_in(int m) { return up.count(m) || failed.count(m); } bool is_out(int m) { return !is_in(m); } bool is_failed(int m) { return failed.count(m); } bool is_stopped(int m) { return stopped.count(m); } bool is_dne(int m) { return in.count(m) == 0; } bool is_dne_gid(uint64_t gid) { return mds_info.count(gid) == 0; } int get_state(int m) { return up.count(m) ? mds_info[up[m]].state : 0; } int get_state_gid(uint64_t gid) { return mds_info.count(gid) ? mds_info[gid].state : 0; } mds_info_t& get_info(int m) { assert(up.count(m)); return mds_info[up[m]]; } mds_info_t& get_info_gid(uint64_t gid) { assert(mds_info.count(gid)); return mds_info[gid]; } bool is_boot(int m) { return get_state(m) == STATE_BOOT; } bool is_creating(int m) { return get_state(m) == STATE_CREATING; } bool is_starting(int m) { return get_state(m) == STATE_STARTING; } bool is_replay(int m) { return get_state(m) == STATE_REPLAY; } bool is_resolve(int m) { return get_state(m) == STATE_RESOLVE; } bool is_reconnect(int m) { return get_state(m) == STATE_RECONNECT; } bool is_rejoin(int m) { return get_state(m) == STATE_REJOIN; } bool is_clientreplay(int m) { return get_state(m) == STATE_CLIENTREPLAY; } bool is_active(int m) { return get_state(m) == STATE_ACTIVE; } bool is_stopping(int m) { return get_state(m) == STATE_STOPPING; } bool is_clientreplay_or_active_or_stopping(int m) { return is_clientreplay(m) || is_active(m) || is_stopping(m); } bool is_followable(int m) { return (is_resolve(m) || is_replay(m) || is_rejoin(m) || is_clientreplay(m) || is_active(m) || is_stopping(m)); } bool is_laggy_gid(uint64_t gid) { return mds_info.count(gid) && mds_info[gid].laggy(); } // cluster states bool is_full() { return in.size() >= max_mds; } bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. return get_num_mds(STATE_REPLAY) + get_num_mds(STATE_RESOLVE) + get_num_mds(STATE_RECONNECT) + get_num_mds(STATE_REJOIN) + failed.size(); } bool is_any_failed() { return failed.size(); } bool is_rejoining() { // nodes are rejoining cache state return get_num_mds(STATE_REJOIN) > 0 && get_num_mds(STATE_REPLAY) == 0 && get_num_mds(STATE_RECONNECT) == 0 && get_num_mds(STATE_RESOLVE) == 0 && failed.empty(); } bool is_stopped() { return up.size() == 0; } // inst bool have_inst(int m) { return up.count(m); } const entity_inst_t get_inst(int m) { assert(up.count(m)); return mds_info[up[m]].get_inst(); } const entity_addr_t get_addr(int m) { assert(up.count(m)); return mds_info[up[m]].addr; } bool get_inst(int m, entity_inst_t& inst) { if (up.count(m)) { inst = get_inst(m); return true; } return false; } int get_rank_gid(uint64_t gid) { if (mds_info.count(gid)) return mds_info[gid].rank; return -1; } int get_inc(int m) { if (up.count(m)) return mds_info[up[m]].inc; return 0; } int get_inc_gid(uint64_t gid) { if (mds_info.count(gid)) return mds_info[gid].inc; return -1; } void encode_client_old(bufferlist& bl) const { __u16 v = 2; ::encode(v, bl); ::encode(epoch, bl); ::encode(flags, bl); ::encode(last_failure, bl); ::encode(root, bl); ::encode(session_timeout, bl); ::encode(session_autoclose, bl); ::encode(max_file_size, bl); ::encode(max_mds, bl); ::encode(mds_info, bl); __u32 n = data_pg_pools.size(); ::encode(n, bl); for (vector::const_iterator p = data_pg_pools.begin(); p != data_pg_pools.end(); ++p) { n = *p; ::encode(n, bl); } int32_t m = cas_pg_pool; ::encode(m, bl); } void encode(bufferlist& bl) const { __u16 v = 3; ::encode(v, bl); ::encode(epoch, bl); ::encode(flags, bl); ::encode(last_failure, bl); ::encode(root, bl); ::encode(session_timeout, bl); ::encode(session_autoclose, bl); ::encode(max_file_size, bl); ::encode(max_mds, bl); ::encode(mds_info, bl); ::encode(data_pg_pools, bl); ::encode(cas_pg_pool, bl); // kclient ignores everything from here __u16 ev = 5; ::encode(ev, bl); ::encode(compat, bl); ::encode(metadata_pg_pool, bl); ::encode(created, bl); ::encode(modified, bl); ::encode(tableserver, bl); ::encode(in, bl); ::encode(inc, bl); ::encode(up, bl); ::encode(failed, bl); ::encode(stopped, bl); ::encode(last_failure_osd_epoch, bl); } void decode(bufferlist::iterator& p) { __u16 v; ::decode(v, p); ::decode(epoch, p); ::decode(flags, p); ::decode(last_failure, p); ::decode(root, p); ::decode(session_timeout, p); ::decode(session_autoclose, p); ::decode(max_file_size, p); ::decode(max_mds, p); ::decode(mds_info, p); if (v < 3) { __u32 n; ::decode(n, p); while (n--) { __u32 m; ::decode(m, p); data_pg_pools.push_back(m); } __s32 s; ::decode(s, p); cas_pg_pool = s; } else { ::decode(data_pg_pools, p); ::decode(cas_pg_pool, p); } // kclient ignores everything from here __u16 ev = 1; if (v >= 2) ::decode(ev, p); if (ev >= 3) ::decode(compat, p); else compat = get_mdsmap_compat_set_base(); if (ev < 5) { __u32 n; ::decode(n, p); metadata_pg_pool = n; } else { ::decode(metadata_pg_pool, p); } ::decode(created, p); ::decode(modified, p); ::decode(tableserver, p); ::decode(in, p); ::decode(inc, p); ::decode(up, p); ::decode(failed, p); ::decode(stopped, p); if (ev >= 4) ::decode(last_failure_osd_epoch, p); } void decode(bufferlist& bl) { bufferlist::iterator p = bl.begin(); decode(p); } void print(ostream& out); void print_summary(ostream& out); void dump(Formatter *f) const; }; WRITE_CLASS_ENCODER(MDSMap::mds_info_t) WRITE_CLASS_ENCODER(MDSMap) inline ostream& operator<<(ostream& out, MDSMap& m) { m.print_summary(out); return out; } #endif