diff options
author | Yan, Zheng <zheng.z.yan@intel.com> | 2013-09-19 13:05:59 +0800 |
---|---|---|
committer | Yan, Zheng <zheng.z.yan@intel.com> | 2013-09-24 08:45:55 +0800 |
commit | ba9cec2f96153324f02600c354a3cea943a784f2 (patch) | |
tree | 8e6f5b7d2cbf8d9c2a5db9ab7d1c82df35c3b2b2 | |
parent | d39e16ad3c360551ae591d91ec5aeb03cc2cfd25 (diff) | |
download | ceph-ba9cec2f96153324f02600c354a3cea943a784f2.tar.gz |
mds: delete orphan dirfrags during MDS recovery
This patch makes the MDS use the following steps to fragment a directory:
---
1. freeze the old dirfrags
2. journal EFragment::OP_PREPARE
3. store the new dirfrags
4. journal EFragment::OP_COMMIT
5. delete the old dirfrags
6. journal EFragment::OP_FINISH
The newly introduced event EFragment::OP_FINISH indicates that all orphan
frags have been deleted. The new process guarantees that orphan frags can
still be properly deleted if the MDS crashes while fragmenting a directory.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r-- | src/mds/MDCache.cc | 151 | ||||
-rw-r--r-- | src/mds/MDCache.h | 11 | ||||
-rw-r--r-- | src/mds/events/EFragment.h | 7 | ||||
-rw-r--r-- | src/mds/journal.cc | 22 |
4 files changed, 150 insertions, 41 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index bb59c6a21fa..6a45cf14b03 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -11041,13 +11041,23 @@ void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs) } } -class C_MDC_FragmentLoggedAndStored : public Context { +class C_MDC_FragmentPrep : public Context { MDCache *mdcache; MDRequest *mdr; public: - C_MDC_FragmentLoggedAndStored(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + C_MDC_FragmentPrep(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} virtual void finish(int r) { - mdcache->fragment_logged_and_stored(mdr); + mdcache->_fragment_logged(mdr); + } +}; + +class C_MDC_FragmentStore : public Context { + MDCache *mdcache; + MDRequest *mdr; +public: + C_MDC_FragmentStore(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {} + virtual void finish(int r) { + mdcache->_fragment_stored(mdr); } }; @@ -11065,6 +11075,20 @@ public: } }; +class C_MDC_FragmentFinish : public Context { + MDCache *mdcache; + dirfrag_t basedirfrag; + list<CDir*> resultfrags; +public: + C_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) : + mdcache(m), basedirfrag(f) { + resultfrags.swap(l); + } + virtual void finish(int r) { + mdcache->_fragment_finish(basedirfrag, resultfrags); + } +}; + void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits) { dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits @@ -11132,39 +11156,49 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) diri->verify_dirfrags(); mds->queue_waiters(waiters); + for (list<frag_t>::iterator p = old_frags.begin(); p != old_frags.end(); ++p) + assert(!diri->dirfragtree.is_leaf(*p)); + mdr->ls = mds->mdlog->get_current_segment(); EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(), info.basefrag, info.bits); mds->mdlog->start_entry(le); le->metablob.add_dir_context(*info.resultfrags.begin()); + for (list<CDir*>::iterator p = info.resultfrags.begin(); + p != 
info.resultfrags.end(); + ++p) { + le->metablob.add_dir(*p, false); + } + le->orig_frags = old_frags; // dft lock mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); mdr->add_updated_lock(&diri->dirfragtreelock); - /* - // filelock - mds->locker->mark_updated_scatterlock(&diri->filelock); - mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); - mut->add_updated_lock(&diri->filelock); - - // dirlock - mds->locker->mark_updated_scatterlock(&diri->nestlock); - mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); - mut->add_updated_lock(&diri->nestlock); - */ + add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, old_frags); + mds->mdlog->submit_entry(le, new C_MDC_FragmentPrep(this, mdr)); + mds->mdlog->flush(); +} - // freeze, journal, and store resulting frags - C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentLoggedAndStored(this, mdr)); +void MDCache::_fragment_logged(MDRequest *mdr) +{ + assert(fragment_requests.count(mdr->reqid)); + fragment_info_t &info = fragment_requests[mdr->reqid]; + CInode *diri = info.resultfrags.front()->get_inode(); + + dout(10) << "fragment_logged " << info.resultfrags << " " << info.basefrag + << " bits " << info.bits << " on " << *diri << dendl; + + // store resulting frags + C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr)); for (list<CDir*>::iterator p = info.resultfrags.begin(); p != info.resultfrags.end(); ++p) { CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - le->metablob.add_dir(dir, false); + dout(10) << " storing result frag " << *dir << dendl; // freeze and store them too dir->auth_pin(this); @@ -11172,19 +11206,16 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) dir->commit(0, gather.new_sub(), true); // ignore authpinnability } - add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, old_frags); - 
mds->mdlog->submit_entry(le, gather.new_sub()); - mds->mdlog->flush(); gather.activate(); } -void MDCache::fragment_logged_and_stored(MDRequest *mdr) +void MDCache::_fragment_stored(MDRequest *mdr) { assert(fragment_requests.count(mdr->reqid)); fragment_info_t &info = fragment_requests[mdr->reqid]; CInode *diri = info.resultfrags.front()->get_inode(); - dout(10) << "fragment_logged_and_stored " << info.resultfrags << " " << info.basefrag + dout(10) << "fragment_stored " << info.resultfrags << " " << info.basefrag << " bits " << info.bits << " on " << *diri << dendl; // tell peers @@ -11274,6 +11305,7 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags) { dout(10) << "fragment_finish " << basedirfrag << dendl; assert(uncommitted_fragments.count(basedirfrag)); + ufragment &uf = uncommitted_fragments[basedirfrag]; // unmark & auth_unpin for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) { @@ -11281,7 +11313,11 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags) (*p)->auth_unpin(this); } - finish_uncommitted_fragment(basedirfrag); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, + basedirfrag.ino, basedirfrag.frag, uf.bits); + mds->mdlog->start_submit_entry(le); + + finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); } /* This function DOES put the passed message before returning */ @@ -11337,11 +11373,30 @@ void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<fra uf.bits = bits; } -void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag) +void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) +{ + dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag + << " op " << EFragment::op_name(op) << dendl; + if (uncommitted_fragments.count(basedirfrag)) { + ufragment& uf = uncommitted_fragments[basedirfrag]; + if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) + uf.committed = true; + 
else + uncommitted_fragments.erase(basedirfrag); + } +} + +void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags) { - dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag << dendl; + dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag + << " old_frags (" << old_frags << ")" << dendl; if (uncommitted_fragments.count(basedirfrag)) { - uncommitted_fragments.erase(basedirfrag); + ufragment& uf = uncommitted_fragments[basedirfrag]; + if (!uf.old_frags.empty()) { + uf.old_frags.swap(old_frags); + uf.committed = true; + } else + uncommitted_fragments.erase(basedirfrag); } } @@ -11352,19 +11407,57 @@ void MDCache::rollback_uncommitted_fragments() p != uncommitted_fragments.end(); ++p) { ufragment &uf = p->second; + CInode *diri = get_inode(p->first.ino); assert(diri); + + if (uf.committed) { + list<CDir*> frags; + diri->get_dirfrags_under(p->first.frag, frags); + for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + } + _fragment_committed(p->first, frags); + continue; + } + dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; + + list<frag_t> old_frags; + diri->dirfragtree.get_leaves_under(p->first.frag, old_frags); + list<CDir*> resultfrags; list<Context*> waiters; adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); if (g_conf->mds_debug_frag) diri->verify_dirfrags(); + for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q) + assert(!diri->dirfragtree.is_leaf(*q)); + + LogSegment *ls = mds->mdlog->get_current_segment(); EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits); - mds->mdlog->start_submit_entry(le); + mds->mdlog->start_entry(le); + + le->orig_frags.swap(uf.old_frags); + le->metablob.add_dir_context(*resultfrags.begin()); + + for 
(list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + // don't know if the dirfrag was dirty + dir->_mark_dirty(ls); + le->metablob.add_dir(dir, true); + } + + mds->mdlog->submit_entry(le); + + uf.old_frags.swap(old_frags); + _fragment_committed(p->first, resultfrags); } - uncommitted_fragments.clear(); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 8560ce481fb..50ae258cc2d 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -945,8 +945,9 @@ protected: private: struct ufragment { int bits; + bool committed; list<frag_t> old_frags; - ufragment() : bits(0) {} + ufragment() : bits(0), committed(false) {} }; map<dirfrag_t, ufragment> uncommitted_fragments; @@ -984,7 +985,8 @@ private: void fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits); void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs); void dispatch_fragment_dir(MDRequest *mdr); - void fragment_logged_and_stored(MDRequest *mdr); + void _fragment_logged(MDRequest *mdr); + void _fragment_stored(MDRequest *mdr); void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags); void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags); @@ -994,13 +996,16 @@ private: friend class C_MDC_FragmentFrozen; friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentLoggedAndStored; + friend class C_MDC_FragmentPrep; + friend class C_MDC_FragmentStore; friend class C_MDC_FragmentCommit; + friend class C_MDC_FragmentFinish; void handle_fragment_notify(MMDSFragmentNotify *m); void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag); void finish_uncommitted_fragment(dirfrag_t basedirfrag); + void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags); // -- updates -- //int send_inode_updates(CInode *in); diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h index bdbbd335e29..01c35078e1a 100644 --- 
a/src/mds/events/EFragment.h +++ b/src/mds/events/EFragment.h @@ -25,6 +25,7 @@ public: inodeno_t ino; frag_t basefrag; __s32 bits; // positive for split (from basefrag), negative for merge (to basefrag) + list<frag_t> orig_frags; EFragment() : LogEvent(EVENT_FRAGMENT) { } EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) : @@ -39,13 +40,15 @@ public: OP_PREPARE = 1, OP_COMMIT = 2, OP_ROLLBACK = 3, - OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT + OP_FINISH = 4, // finish deleting orphan dirfrags + OP_ONESHOT = 5, // (legacy) PREPARE+COMMIT }; - const char *op_name(int o) const { + static const char *op_name(int o) { switch (o) { case OP_PREPARE: return "prepare"; case OP_COMMIT: return "commit"; case OP_ROLLBACK: return "rollback"; + case OP_FINISH: return "finish"; default: return "???"; } } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 49fdb9ce5da..f36087f49d8 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -2382,7 +2382,6 @@ void EFragment::replay(MDS *mds) list<CDir*> resultfrags; list<Context*> waiters; list<frag_t> old_frags; - pair<dirfrag_t,int> desc(dirfrag_t(ino,basefrag), bits); // in may be NULL if it wasn't in our cache yet. 
if it's a prepare // it will be once we replay the metablob , but first we need to @@ -2391,7 +2390,7 @@ void EFragment::replay(MDS *mds) switch (op) { case OP_PREPARE: - mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, old_frags); + mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags); // fall-thru case OP_ONESHOT: if (in) @@ -2399,34 +2398,41 @@ void EFragment::replay(MDS *mds) break; case OP_ROLLBACK: - if (in) + if (in) { + in->dirfragtree.get_leaves_under(basefrag, old_frags); mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); - // fall-thru + } + mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags); + break; + case OP_COMMIT: - mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag)); + case OP_FINISH: + mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op); break; default: assert(0); } + metablob.replay(mds, _segment); if (in && g_conf->mds_debug_frag) in->verify_dirfrags(); } void EFragment::encode(bufferlist &bl) const { - ENCODE_START(4, 4, bl); + ENCODE_START(5, 4, bl); ::encode(stamp, bl); ::encode(op, bl); ::encode(ino, bl); ::encode(basefrag, bl); ::encode(bits, bl); ::encode(metablob, bl); + ::encode(orig_frags, bl); ENCODE_FINISH(bl); } void EFragment::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); if (struct_v >= 2) ::decode(stamp, bl); if (struct_v >= 3) @@ -2437,6 +2443,8 @@ void EFragment::decode(bufferlist::iterator &bl) { ::decode(basefrag, bl); ::decode(bits, bl); ::decode(metablob, bl); + if (struct_v >= 5) + ::decode(orig_frags, bl); DECODE_FINISH(bl); } |