diff options
author | Yan, Zheng <zheng.z.yan@intel.com> | 2012-11-19 10:43:46 +0800 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-12-01 12:52:23 -0800 |
commit | 0fa487585ec99e3d9c37eeff3a0ce0543bc93d5d (patch) | |
tree | 00fd3282e6fbf3bb8a7cbd25cc7d88b7d96624aa | |
parent | 2a5068944ef73c55a60e3333b5641177c5381db9 (diff) | |
download | ceph-0fa487585ec99e3d9c37eeff3a0ce0543bc93d5d.tar.gz |
mds: fix freeze inode deadlock
CInode::freeze_inode() is used in the case of cross authority rename.
Server::handle_slave_rename_prep() calls it to wait for all other
operations on source inode to complete. This happens after all locks
for the rename operation are acquired. But to acquire locks, we need
auth pin locks' parent objects first. So there is an ABBA deadlock
if someone auth pins the source inode after locks for rename are
acquired and before Server::handle_slave_rename_prep() is called.
The fix is freeze and auth pin the source inode at the same time.
This patch introduces CInode::freeze_auth_pin(), it waits for all
other MDRequests to release auth pins, then change the inode to
FROZENAUTHPIN state, this state prevents other MDRequests from
getting new auth pins.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r-- | src/mds/CInode.cc | 36 | ||||
-rw-r--r-- | src/mds/CInode.h | 5 | ||||
-rw-r--r-- | src/mds/Locker.cc | 7 | ||||
-rw-r--r-- | src/mds/Locker.h | 3 | ||||
-rw-r--r-- | src/mds/Mutation.cc | 31 | ||||
-rw-r--r-- | src/mds/Mutation.h | 6 | ||||
-rw-r--r-- | src/mds/Server.cc | 47 | ||||
-rw-r--r-- | src/mds/mdstypes.h | 7 | ||||
-rw-r--r-- | src/messages/MMDSSlaveRequest.h | 1 |
9 files changed, 124 insertions, 19 deletions
diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index c12930837df..af70b681ffc 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -130,6 +130,7 @@ ostream& operator<<(ostream& out, CInode& in) if (in.state_test(CInode::STATE_DIRTYPARENT)) out << " dirtyparent"; if (in.is_freezing_inode()) out << " FREEZING=" << in.auth_pin_freeze_allowance; if (in.is_frozen_inode()) out << " FROZEN"; + if (in.is_frozen_auth_pin()) out << " FROZEN_AUTHPIN"; inode_t *pi = in.get_projected_inode(); if (pi->is_truncating()) @@ -1862,7 +1863,8 @@ void CInode::add_waiter(uint64_t tag, Context *c) // wait on the directory? // make sure its not the inode that is explicitly ambiguous|freezing|frozen if (((tag & WAIT_SINGLEAUTH) && !state_test(STATE_AMBIGUOUSAUTH)) || - ((tag & WAIT_UNFREEZE) && !is_frozen_inode() && !is_freezing_inode())) { + ((tag & WAIT_UNFREEZE) && + !is_frozen_inode() && !is_freezing_inode() && !is_frozen_auth_pin())) { dout(15) << "passing waiter up tree" << dendl; parent->dir->add_waiter(tag, c); return; @@ -1885,8 +1887,10 @@ bool CInode::freeze_inode(int auth_pin_allowance) dout(10) << "freeze_inode - frozen" << dendl; assert(auth_pins == auth_pin_allowance); - get(PIN_FROZEN); - state_set(STATE_FROZEN); + if (!state_test(STATE_FROZEN)) { + get(PIN_FROZEN); + state_set(STATE_FROZEN); + } return true; } @@ -1904,10 +1908,34 @@ void CInode::unfreeze_inode(list<Context*>& finished) take_waiting(WAIT_UNFREEZE, finished); } +void CInode::unfreeze_inode() +{ + list<Context*> finished; + unfreeze_inode(finished); + mdcache->mds->queue_waiters(finished); +} + +void CInode::freeze_auth_pin() +{ + assert(state_test(CInode::STATE_FROZEN)); + state_set(CInode::STATE_FROZENAUTHPIN); +} + +void CInode::unfreeze_auth_pin() +{ + assert(state_test(CInode::STATE_FROZENAUTHPIN)); + state_clear(CInode::STATE_FROZENAUTHPIN); + if (!state_test(STATE_FREEZING|STATE_FROZEN)) { + list<Context*> finished; + take_waiting(WAIT_UNFREEZE, finished); + mdcache->mds->queue_waiters(finished); + } +} // auth_pins bool CInode::can_auth_pin() { - if (is_freezing_inode() || is_frozen_inode()) return false; + if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) + return false; if (parent) return parent->can_auth_pin(); return true; diff --git a/src/mds/CInode.h b/src/mds/CInode.h index b76b52414c9..e43ecf50fa3 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -181,6 +181,7 @@ public: static const int STATE_DIRTYPARENT = (1<<14); static const int STATE_DIRTYRSTAT = (1<<15); static const int STATE_STRAYPINNED = (1<<16); + static const int STATE_FROZENAUTHPIN = (1<<17); // -- waiters -- static const uint64_t WAIT_DIR = (1<<0); @@ -856,6 +857,7 @@ public: // -- freeze -- bool is_freezing_inode() { return state_test(STATE_FREEZING); } bool is_frozen_inode() { return state_test(STATE_FROZEN); } + bool is_frozen_auth_pin() { return state_test(STATE_FROZENAUTHPIN); } bool is_frozen(); bool is_frozen_dir(); bool is_freezing(); @@ -864,7 +866,10 @@ public: * auth_pins it is itself holding/responsible for. */ bool freeze_inode(int auth_pin_allowance=0); void unfreeze_inode(list<Context*>& finished); + void unfreeze_inode(); + void freeze_auth_pin(); + void unfreeze_auth_pin(); // -- reference counting -- void bad_put(int by) { diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 63f83116fe1..ee4799e18f8 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -174,7 +174,8 @@ bool Locker::acquire_locks(MDRequest *mdr, set<SimpleLock*> &rdlocks, set<SimpleLock*> &wrlocks, set<SimpleLock*> &xlocks, - map<SimpleLock*,int> *remote_wrlocks) + map<SimpleLock*,int> *remote_wrlocks, + CInode *auth_pin_freeze) { if (mdr->done_locking && !mdr->is_slave()) { // not on slaves! master requests locks piecemeal. @@ -336,7 +337,9 @@ bool Locker::acquire_locks(MDRequest *mdr, dout(10) << " req remote auth_pin of " << **q << dendl; MDSCacheObjectInfo info; (*q)->set_object_info(info); - req->get_authpins().push_back(info); + req->get_authpins().push_back(info); + if (*q == auth_pin_freeze) + (*q)->set_object_info(req->get_authpin_freeze()); mdr->pin(*q); } mds->send_message_mds(req, p->first); diff --git a/src/mds/Locker.h b/src/mds/Locker.h index a1cf59e3185..b3b9919e7fd 100644 --- a/src/mds/Locker.h +++ b/src/mds/Locker.h @@ -88,7 +88,8 @@ public: set<SimpleLock*> &rdlocks, set<SimpleLock*> &wrlocks, set<SimpleLock*> &xlocks, - map<SimpleLock*,int> *remote_wrlocks=NULL); + map<SimpleLock*,int> *remote_wrlocks=NULL, + CInode *auth_pin_freeze=NULL); void cancel_locking(Mutation *mut, set<CInode*> *pneed_issue); void drop_locks(Mutation *mut, set<CInode*> *pneed_issue=0); diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index 6321ffc160a..a9c35134bc8 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -82,8 +82,39 @@ void Mutation::auth_unpin(MDSCacheObject *object) auth_pins.erase(object); } +bool Mutation::freeze_auth_pin(CInode *inode) +{ + assert(!auth_pin_freeze || auth_pin_freeze == inode); + auth_pin_freeze = inode; + auth_pin(inode); + if (!inode->freeze_inode(1)) + return false; + + inode->freeze_auth_pin(); + inode->unfreeze_inode(); + return true; +} + +void Mutation::unfreeze_auth_pin(CInode *inode) +{ + assert(auth_pin_freeze == inode); + assert(is_auth_pinned(inode)); + if (inode->is_frozen_auth_pin()) + inode->unfreeze_auth_pin(); + else + inode->unfreeze_inode(); + auth_pin_freeze = NULL; +} + +bool Mutation::can_auth_pin(MDSCacheObject *object) +{ + return object->can_auth_pin() || (is_auth_pinned(object) && object == auth_pin_freeze); +} + void Mutation::drop_local_auth_pins() { + if (auth_pin_freeze) + unfreeze_auth_pin(auth_pin_freeze); for (set<MDSCacheObject*>::iterator it = auth_pins.begin(); it != auth_pins.end(); it++) { diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index cba6223864e..37cc764254d 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -50,6 +50,7 @@ struct Mutation { // auth pins set< MDSCacheObject* > remote_auth_pins; set< MDSCacheObject* > auth_pins; + CInode *auth_pin_freeze; // held locks set< SimpleLock* > rdlocks; // always local. @@ -81,12 +82,14 @@ struct Mutation { : attempt(0), ls(0), slave_to_mds(-1), + auth_pin_freeze(NULL), locking(NULL), done_locking(false), committing(false), aborted(false), killed(false) { } Mutation(metareqid_t ri, __u32 att=0, int slave_to=-1) : reqid(ri), attempt(att), ls(0), slave_to_mds(slave_to), + auth_pin_freeze(NULL), locking(NULL), done_locking(false), committing(false), aborted(false), killed(false) { } virtual ~Mutation() { @@ -120,6 +123,9 @@ struct Mutation { bool is_auth_pinned(MDSCacheObject *object); void auth_pin(MDSCacheObject *object); void auth_unpin(MDSCacheObject *object); + bool freeze_auth_pin(CInode *inode); + void unfreeze_auth_pin(CInode *inode); + bool can_auth_pin(MDSCacheObject *object); void drop_local_auth_pins(); void add_projected_inode(CInode *in); void pop_and_dirty_projected_inodes(); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 1545fef73c5..72fd7da2305 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1487,6 +1487,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) // build list of objects list<MDSCacheObject*> objects; + CInode *auth_pin_freeze = NULL; bool fail = false; for (vector<MDSCacheObjectInfo>::iterator p = mdr->slave_request->get_authpins().begin(); @@ -1500,6 +1501,8 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) } objects.push_back(object); + if (*p == mdr->slave_request->get_authpin_freeze()) + auth_pin_freeze = dynamic_cast<CInode*>(object); } // can we auth pin them? @@ -1512,8 +1515,7 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) fail = true; break; } - if (!mdr->is_auth_pinned(*p) && - !(*p)->can_auth_pin()) { + if (!mdr->can_auth_pin(*p)) { // wait dout(10) << " waiting for authpinnable on " << **p << dendl; (*p)->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); @@ -1527,6 +1529,22 @@ void Server::handle_slave_auth_pin(MDRequest *mdr) if (fail) { mdr->drop_local_auth_pins(); // just in case } else { + /* handle_slave_rename_prep() call freeze_inode() to wait for all other operations + * on the source inode to complete. This happens after all locks for the rename + * operation are acquired. But to acquire locks, we need auth pin locks' parent + * objects first. So there is an ABBA deadlock if someone auth pins the source inode + * after locks are acquired and before Server::handle_slave_rename_prep() is called. + * The solution is freeze the inode and prevent other MDRequests from getting new + * auth pins. + */ + if (auth_pin_freeze) { + dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl; + if (!mdr->freeze_auth_pin(auth_pin_freeze)) { + auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); + mds->mdlog->flush(); + return; + } + } for (list<MDSCacheObject*>::iterator p = objects.begin(); p != objects.end(); ++p) { @@ -1923,7 +1941,8 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, int n, // do NOT proceed if freezing, as cap release may defer in that case, and // we could deadlock when we try to lock @ref. // if we're already auth_pinned, continue; the release has already been processed. - if (ref->is_frozen() || (ref->is_freezing() && !mdr->is_auth_pinned(ref))) { + if (ref->is_frozen() || ref->is_frozen_auth_pin() || + (ref->is_freezing() && !mdr->is_auth_pinned(ref))) { dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl; ref->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr)); /* If we have any auth pins, this will deadlock. @@ -5246,7 +5265,9 @@ void Server::handle_client_rename(MDRequest *mdr) // take any locks needed for anchor creation/verification mds->mdcache->anchor_create_prep_locks(mdr, srci, rdlocks, xlocks); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, &remote_wrlocks)) + CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : NULL; + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, + &remote_wrlocks, auth_pin_freeze)) return; if (oldin && @@ -5996,9 +6017,7 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) // am i srcdn auth? if (srcdn->is_auth()) { - if (srcdnl->is_primary() && - !srcdnl->get_inode()->is_freezing_inode() && - !srcdnl->get_inode()->is_frozen_inode()) { + if (srcdnl->is_primary()) { // set ambiguous auth for srci /* * NOTE: we don't worry about ambiguous cache expire as we do @@ -6015,7 +6034,13 @@ void Server::handle_slave_rename_prep(MDRequest *mdr) int allowance = 2; // 1 for the mdr auth_pin, 1 for the link lock allowance += srcdnl->get_inode()->is_dir(); // for the snap lock dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl; - if (!srcdnl->get_inode()->freeze_inode(allowance)) { + bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance); + + // unfreeze auth pin after freezing the inode to avoid queueing waiters + if (srcdnl->get_inode()->is_frozen_auth_pin()) + mdr->unfreeze_auth_pin(srcdnl->get_inode()); + + if (!frozen_inode) { srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr)); return; } @@ -6183,8 +6208,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, destdnl->get_inode()->take_waiting(CInode::WAIT_SINGLEAUTH, finished); // unfreeze - assert(destdnl->get_inode()->is_frozen_inode() || - destdnl->get_inode()->is_freezing_inode()); + assert(destdnl->get_inode()->is_frozen_inode()); destdnl->get_inode()->unfreeze_inode(finished); mds->queue_waiters(finished); @@ -6207,8 +6231,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, destdnl->get_inode()->take_waiting(CInode::WAIT_SINGLEAUTH, finished); // unfreeze - assert(destdnl->get_inode()->is_frozen_inode() || - destdnl->get_inode()->is_freezing_inode()); + assert(destdnl->get_inode()->is_frozen_inode()); destdnl->get_inode()->unfreeze_inode(finished); mds->queue_waiters(finished); diff --git a/src/mds/mdstypes.h b/src/mds/mdstypes.h index db4dbf1ac61..22e754eb2a1 100644 --- a/src/mds/mdstypes.h +++ b/src/mds/mdstypes.h @@ -1250,6 +1250,13 @@ public: } }; +inline bool operator==(const MDSCacheObjectInfo& l, const MDSCacheObjectInfo& r) { + if (l.ino || r.ino) + return l.ino == r.ino && l.snapid == r.snapid; + else + return l.dirfrag == r.dirfrag && l.dname == r.dname; +} + WRITE_CLASS_ENCODER(MDSCacheObjectInfo) diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h index 4f2bb5948bd..03ec582c49e 100644 --- a/src/messages/MMDSSlaveRequest.h +++ b/src/messages/MMDSSlaveRequest.h @@ -112,6 +112,7 @@ public: int get_lock_type() { return lock_type; } MDSCacheObjectInfo &get_object_info() { return object_info; } + MDSCacheObjectInfo &get_authpin_freeze() { return object_info; } vector<MDSCacheObjectInfo>& get_authpins() { return authpins; } |