// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef CEPH_MDSMAP_H #define CEPH_MDSMAP_H #include "include/types.h" #include "common/Clock.h" #include "msg/Message.h" #include #include #include using namespace std; #include "config.h" #include "include/CompatSet.h" /* boot --> standby, creating, or starting. dne ----> creating -----> active* ^ ^___________/ / ^ ^ | / / | destroying / / | ^ / / | | / / | stopped <---- stopping* <-/ / | \ / | ----- starting* ----/ | | failed | \ | \--> replay* --> reconnect* --> rejoin* * = can fail */ extern CompatSet mdsmap_compat; extern CompatSet mdsmap_compat_base; // pre v0.20 #define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20") class MDSMap { public: // mds states /* static const int STATE_DNE = CEPH_MDS_STATE_DNE; // down, never existed. static const int STATE_DESTROYING = CEPH_MDS_STATE_DESTROYING; // down, existing, semi-destroyed. static const int STATE_FAILED = CEPH_MDS_STATE_FAILED; // down, active subtrees; needs to be recovered. */ static const int STATE_STOPPED = CEPH_MDS_STATE_STOPPED; // down, once existed, but no subtrees. empty log. static const int STATE_BOOT = CEPH_MDS_STATE_BOOT; // up, boot announcement. destiny unknown. static const int STATE_STANDBY = CEPH_MDS_STATE_STANDBY; // up, idle. waiting for assignment by monitor. static const int STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY; // up, replaying active node; ready to take over. static const int STATE_CREATING = CEPH_MDS_STATE_CREATING; // up, creating MDS instance (new journal, idalloc..). static const int STATE_STARTING = CEPH_MDS_STATE_STARTING; // up, starting prior stopped MDS instance. static const int STATE_REPLAY = CEPH_MDS_STATE_REPLAY; // up, starting prior failed instance. scanning journal. static const int STATE_RESOLVE = CEPH_MDS_STATE_RESOLVE; // up, disambiguating distributed operations (import, rename, etc.) static const int STATE_RECONNECT = CEPH_MDS_STATE_RECONNECT; // up, reconnect to clients static const int STATE_REJOIN = CEPH_MDS_STATE_REJOIN; // up, replayed journal, rejoining distributed cache static const int STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY; // up, active static const int STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE; // up, active static const int STATE_STOPPING = CEPH_MDS_STATE_STOPPING; // up, exporting metadata (-> standby or out) struct mds_info_t { uint64_t global_id; string name; int32_t rank; int32_t inc; int32_t state; version_t state_seq; entity_addr_t addr; utime_t laggy_since; int32_t standby_for_rank; string standby_for_name; set export_targets; mds_info_t() : global_id(0), rank(-1), inc(0), state(STATE_STANDBY), state_seq(0) { } bool laggy() const { return !(laggy_since == utime_t()); } void clear_laggy() { laggy_since = utime_t(); } entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); } void encode(bufferlist& bl) const { __u8 v = 3; ::encode(v, bl); ::encode(global_id, bl); ::encode(name, bl); ::encode(rank, bl); ::encode(inc, bl); ::encode(state, bl); ::encode(state_seq, bl); ::encode(addr, bl); ::encode(laggy_since, bl); ::encode(standby_for_rank, bl); ::encode(standby_for_name, bl); ::encode(export_targets, bl); } void decode(bufferlist::iterator& bl) { __u8 v; ::decode(v, bl); ::decode(global_id, bl); ::decode(name, bl); ::decode(rank, bl); ::decode(inc, bl); ::decode(state, bl); ::decode(state_seq, bl); ::decode(addr, bl); ::decode(laggy_since, bl); ::decode(standby_for_rank, bl); ::decode(standby_for_name, bl); if (v >= 2) ::decode(export_targets, bl); } }; protected: // base map epoch_t epoch; epoch_t client_epoch; // incremented only when change is significant to client. epoch_t last_failure; // epoch of last failure utime_t created, modified; int32_t tableserver; // which MDS has anchortable, snaptable int32_t root; // which MDS has root directory __u32 session_timeout; __u32 session_autoclose; uint64_t max_file_size; vector<__u32> data_pg_pools; // file data pg_pools available to clients (via an ioctl). first is the default. __u32 cas_pg_pool; // where CAS objects go __u32 metadata_pg_pool; // where fs metadata objects go /* * in: the set of logical mds #'s that define the cluster. this is the set * of mds's the metadata may be distributed over. * up: map from logical mds #'s to the addrs filling those roles. * failed: subset of @in that are failed. * stopped: set of nodes that have been initialized, but are not active. * * @up + @failed = @in. @in * @stopped = {}. */ uint32_t max_mds; set in; // currently defined cluster map inc; // most recent incarnation. set failed, stopped; // which roles are failed or stopped map up; // who is in those roles map mds_info; public: CompatSet compat; friend class MDSMonitor; public: MDSMap() : epoch(0), client_epoch(0), last_failure(0), tableserver(0), root(0), cas_pg_pool(0), metadata_pg_pool(0) { // hack.. this doesn't really belong here session_timeout = (int)g_conf.mds_session_timeout; session_autoclose = (int)g_conf.mds_session_autoclose; max_file_size = g_conf.mds_max_file_size; } utime_t get_session_timeout() { return utime_t(session_timeout,0); } uint64_t get_max_filesize() { return max_file_size; } epoch_t get_epoch() const { return epoch; } void inc_epoch() { epoch++; } const utime_t& get_created() const { return created; } void set_created(utime_t ct) { modified = created = ct; } const utime_t& get_modified() const { return modified; } void set_modified(utime_t mt) { modified = mt; } epoch_t get_last_failure() const { return last_failure; } unsigned get_max_mds() const { return max_mds; } void set_max_mds(int m) { max_mds = m; } int get_tableserver() const { return tableserver; } int get_root() const { return root; } const vector<__u32> &get_data_pg_pools() const { return data_pg_pools; } __u32 get_data_pg_pool() const { return data_pg_pools[0]; } __u32 get_cas_pg_pool() const { return cas_pg_pool; } __u32 get_metadata_pg_pool() const { return metadata_pg_pool; } const map& get_mds_info() { return mds_info; } const mds_info_t& get_mds_info_gid(uint64_t gid) { assert(mds_info.count(gid)); return mds_info[gid]; } const mds_info_t& get_mds_info(int m) { assert(up.count(m) && mds_info.count(up[m])); return mds_info[up[m]]; } // counts unsigned get_num_mds() { return in.size(); } unsigned get_num_mds(int state) { unsigned n = 0; for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state == state) ++n; return n; } int get_num_failed() { return failed.size(); } // sets void get_mds_set(set& s) { s = in; } void get_up_mds_set(set& s) { for (map::const_iterator p = up.begin(); p != up.end(); ++p) s.insert(p->first); } void get_active_mds_set(set& s) { get_mds_set(s, MDSMap::STATE_ACTIVE); } void get_failed_mds_set(set& s) { s = failed; } int get_failed() { if (!failed.empty()) return *failed.begin(); return -1; } void get_stopped_mds_set(set& s) { s = stopped; } void get_recovery_mds_set(set& s) { s = failed; for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_STOPPING) s.insert(p->second.rank); } void get_mds_set(set& s, int state) { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) if (p->second.state == state) s.insert(p->second.rank); } int get_random_up_mds() { if (up.empty()) return -1; map::iterator p = up.begin(); for (int n = rand() % up.size(); n; n--) p++; return p->first; } uint64_t find_standby_for(int mds, string& name) { for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { if (p->second.rank == -1 && (p->second.standby_for_rank == mds || p->second.standby_for_name == name) && p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) { return p->first; } } for (map::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) { if (p->second.rank == -1 && p->second.standby_for_rank < 0 && p->second.standby_for_name.length() == 0 && p->second.state == MDSMap::STATE_STANDBY && !p->second.laggy()) { return p->first; } } return 0; } // mds states bool is_down(int m) { return up.count(m) == 0; } bool is_up(int m) { return up.count(m); } bool is_in(int m) { return up.count(m) || failed.count(m); } bool is_out(int m) { return !is_in(m); } bool is_failed(int m) { return failed.count(m); } bool is_stopped(int m) { return stopped.count(m); } bool is_dne(int m) { return in.count(m) == 0; } bool is_dne_gid(uint64_t gid) { return mds_info.count(gid) == 0; } int get_state(int m) { return up.count(m) ? mds_info[up[m]].state : 0; } int get_state_gid(uint64_t gid) { return mds_info.count(gid) ? mds_info[gid].state : 0; } mds_info_t& get_info(int m) { assert(up.count(m)); return mds_info[up[m]]; } mds_info_t& get_info_gid(uint64_t gid) { assert(mds_info.count(gid)); return mds_info[gid]; } bool is_boot(int m) { return get_state(m) == STATE_BOOT; } bool is_creating(int m) { return get_state(m) == STATE_CREATING; } bool is_starting(int m) { return get_state(m) == STATE_STARTING; } bool is_replay(int m) { return get_state(m) == STATE_REPLAY; } bool is_resolve(int m) { return get_state(m) == STATE_RESOLVE; } bool is_reconnect(int m) { return get_state(m) == STATE_RECONNECT; } bool is_rejoin(int m) { return get_state(m) == STATE_REJOIN; } bool is_clientreplay(int m) { return get_state(m) == STATE_CLIENTREPLAY; } bool is_active(int m) { return get_state(m) == STATE_ACTIVE; } bool is_stopping(int m) { return get_state(m) == STATE_STOPPING; } bool is_clientreplay_or_active_or_stopping(int m) { return is_clientreplay(m) || is_active(m) || is_stopping(m); } bool is_laggy_gid(uint64_t gid) { return mds_info.count(gid) && mds_info[gid].laggy(); } // cluster states bool is_full() { return in.size() >= max_mds; } bool is_degraded() { // degraded = some recovery in process. fixes active membership and recovery_set. return get_num_mds(STATE_REPLAY) + get_num_mds(STATE_RESOLVE) + get_num_mds(STATE_RECONNECT) + get_num_mds(STATE_REJOIN) + failed.size(); } bool is_rejoining() { // nodes are rejoining cache state return get_num_mds(STATE_REJOIN) > 0 && get_num_mds(STATE_REPLAY) == 0 && get_num_mds(STATE_RECONNECT) == 0 && get_num_mds(STATE_RESOLVE) == 0 && failed.empty(); } bool is_stopped() { return up.size() == 0; } // inst bool have_inst(int m) { return up.count(m); } const entity_inst_t get_inst(int m) { assert(up.count(m)); return mds_info[up[m]].get_inst(); } const entity_addr_t get_addr(int m) { assert(up.count(m)); return mds_info[up[m]].addr; } bool get_inst(int m, entity_inst_t& inst) { if (up.count(m)) { inst = get_inst(m); return true; } return false; } int get_rank_gid(uint64_t gid) { if (mds_info.count(gid)) return mds_info[gid].rank; return -1; } int get_inc(int m) { if (up.count(m)) return mds_info[up[m]].inc; return 0; } void encode(bufferlist& bl) const { __u16 v = 2; ::encode(v, bl); ::encode(epoch, bl); ::encode(client_epoch, bl); ::encode(last_failure, bl); ::encode(root, bl); ::encode(session_timeout, bl); ::encode(session_autoclose, bl); ::encode(max_file_size, bl); ::encode(max_mds, bl); ::encode(mds_info, bl); ::encode(data_pg_pools, bl); ::encode(cas_pg_pool, bl); // kclient ignores everything from here __u16 ev = 3; ::encode(ev, bl); ::encode(compat, bl); ::encode(metadata_pg_pool, bl); ::encode(created, bl); ::encode(modified, bl); ::encode(tableserver, bl); ::encode(in, bl); ::encode(inc, bl); ::encode(up, bl); ::encode(failed, bl); ::encode(stopped, bl); } void decode(bufferlist::iterator& p) { __u16 v; ::decode(v, p); ::decode(epoch, p); ::decode(client_epoch, p); ::decode(last_failure, p); ::decode(root, p); ::decode(session_timeout, p); ::decode(session_autoclose, p); ::decode(max_file_size, p); ::decode(max_mds, p); ::decode(mds_info, p); ::decode(data_pg_pools, p); ::decode(cas_pg_pool, p); // kclient ignores everything from here __u16 ev = 1; if (v >= 2) ::decode(ev, p); if (ev >= 3) ::decode(compat, p); else compat = mdsmap_compat_base; ::decode(metadata_pg_pool, p); ::decode(created, p); ::decode(modified, p); ::decode(tableserver, p); ::decode(in, p); ::decode(inc, p); ::decode(up, p); ::decode(failed, p); ::decode(stopped, p); } void decode(bufferlist& bl) { bufferlist::iterator p = bl.begin(); decode(p); } void print(ostream& out); void print_summary(ostream& out); }; WRITE_CLASS_ENCODER(MDSMap::mds_info_t) WRITE_CLASS_ENCODER(MDSMap) inline ostream& operator<<(ostream& out, MDSMap& m) { m.print_summary(out); return out; } #endif