summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYan, Zheng <zheng.z.yan@intel.com>2013-10-04 15:10:37 +0800
committerYan, Zheng <zheng.z.yan@intel.com>2013-10-05 11:31:11 +0800
commit4014ba25b31a34b46e3284831d7875afc8c6a76c (patch)
tree3bfd88f63fe24bca13a99f951e86a9fb54ee5006
parenta881c1d36ceb705a89e7d157e56bb94d4624af97 (diff)
downloadceph-4014ba25b31a34b46e3284831d7875afc8c6a76c.tar.gz
mds: delete orphan dirfrags during MDS recovery
This patch makes the MDS use the following steps to fragment a directory: 1. freeze the old dirfrags; 2. journal EFragment::OP_PREPARE; 3. store the new dirfrags; 4. journal EFragment::OP_COMMIT; 5. delete the old dirfrags; 6. journal EFragment::OP_FINISH. The newly introduced event EFragment::OP_FINISH indicates that all orphan frags have been deleted. The new process guarantees that orphan frags can be properly deleted if the MDS crashes while fragmenting a directory. Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r--src/mds/MDCache.cc120
-rw-r--r--src/mds/MDCache.h31
-rw-r--r--src/mds/journal.cc8
3 files changed, 123 insertions, 36 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index c15aeaa7fcd..9f97f31fcbf 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -11041,13 +11041,23 @@ void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
}
}
-class C_MDC_FragmentLoggedAndStored : public Context {
+class C_MDC_FragmentPrep : public Context {
MDCache *mdcache;
MDRequest *mdr;
public:
- C_MDC_FragmentLoggedAndStored(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
+ C_MDC_FragmentPrep(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
virtual void finish(int r) {
- mdcache->fragment_logged_and_stored(mdr);
+ mdcache->_fragment_logged(mdr);
+ }
+};
+
+class C_MDC_FragmentStore : public Context {
+ MDCache *mdcache;
+ MDRequest *mdr;
+public:
+ C_MDC_FragmentStore(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
+ virtual void finish(int r) {
+ mdcache->_fragment_stored(mdr);
}
};
@@ -11065,6 +11075,20 @@ public:
}
};
+class C_MDC_FragmentFinish : public Context {
+ MDCache *mdcache;
+ dirfrag_t basedirfrag;
+ list<CDir*> resultfrags;
+public:
+ C_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
+ mdcache(m), basedirfrag(f) {
+ resultfrags.swap(l);
+ }
+ virtual void finish(int r) {
+ mdcache->_fragment_finish(basedirfrag, resultfrags);
+ }
+};
+
void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits)
{
dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits
@@ -11144,6 +11168,11 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
assert(!diri->dirfragtree.is_leaf(*p));
le->metablob.add_dir_context(*info.resultfrags.begin());
+ for (list<CDir*>::iterator p = info.resultfrags.begin();
+ p != info.resultfrags.end();
+ ++p) {
+ le->metablob.add_dir(*p, false);
+ }
// dft lock
mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
@@ -11162,15 +11191,28 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
mut->add_updated_lock(&diri->nestlock);
*/
- // freeze, journal, and store resulting frags
- C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentLoggedAndStored(this, mdr));
+ add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags);
+ mds->mdlog->submit_entry(le, new C_MDC_FragmentPrep(this, mdr));
+ mds->mdlog->flush();
+}
+
+void MDCache::_fragment_logged(MDRequest *mdr)
+{
+ assert(fragment_requests.count(mdr->reqid));
+ fragment_info_t &info = fragment_requests[mdr->reqid];
+ CInode *diri = info.resultfrags.front()->get_inode();
+
+ dout(10) << "fragment_logged " << info.resultfrags << " " << info.basefrag
+ << " bits " << info.bits << " on " << *diri << dendl;
+
+ // store resulting frags
+ C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
for (list<CDir*>::iterator p = info.resultfrags.begin();
p != info.resultfrags.end();
++p) {
CDir *dir = *p;
- dout(10) << " result frag " << *dir << dendl;
- le->metablob.add_dir(dir, false);
+ dout(10) << " storing result frag " << *dir << dendl;
// freeze and store them too
dir->auth_pin(this);
@@ -11178,19 +11220,16 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
dir->commit(0, gather.new_sub(), true); // ignore authpinnability
}
- add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags);
- mds->mdlog->submit_entry(le, gather.new_sub());
- mds->mdlog->flush();
gather.activate();
}
-void MDCache::fragment_logged_and_stored(MDRequest *mdr)
+void MDCache::_fragment_stored(MDRequest *mdr)
{
assert(fragment_requests.count(mdr->reqid));
fragment_info_t &info = fragment_requests[mdr->reqid];
CInode *diri = info.resultfrags.front()->get_inode();
- dout(10) << "fragment_logged_and_stored " << info.resultfrags << " " << info.basefrag
+ dout(10) << "fragment_stored " << info.resultfrags << " " << info.basefrag
<< " bits " << info.bits << " on " << *diri << dendl;
// tell peers
@@ -11280,6 +11319,7 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
{
dout(10) << "fragment_finish " << basedirfrag << dendl;
assert(uncommitted_fragments.count(basedirfrag));
+ ufragment &uf = uncommitted_fragments[basedirfrag];
// unmark & auth_unpin
for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) {
@@ -11287,7 +11327,11 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
(*p)->auth_unpin(this);
}
- finish_uncommitted_fragment(basedirfrag);
+ EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH,
+ basedirfrag.ino, basedirfrag.frag, uf.bits);
+ mds->mdlog->start_submit_entry(le);
+
+ finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
}
/* This function DOES put the passed message before returning */
@@ -11346,11 +11390,32 @@ void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<fra
uf.rollback.swap(*rollback);
}
-void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag)
+void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
{
- dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag << dendl;
+ dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
+ << " op " << EFragment::op_name(op) << dendl;
if (uncommitted_fragments.count(basedirfrag)) {
- uncommitted_fragments.erase(basedirfrag);
+ ufragment& uf = uncommitted_fragments[basedirfrag];
+ if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
+ uf.committed = true;
+ } else {
+ uncommitted_fragments.erase(basedirfrag);
+ }
+ }
+}
+
+void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
+{
+ dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
+ << " old_frags (" << old_frags << ")" << dendl;
+ if (uncommitted_fragments.count(basedirfrag)) {
+ ufragment& uf = uncommitted_fragments[basedirfrag];
+ if (!uf.old_frags.empty()) {
+ uf.old_frags.swap(old_frags);
+ uf.committed = true;
+ } else {
+ uncommitted_fragments.erase(basedirfrag);
+ }
}
}
@@ -11363,12 +11428,28 @@ void MDCache::rollback_uncommitted_fragments()
ufragment &uf = p->second;
CInode *diri = get_inode(p->first.ino);
assert(diri);
+
+ if (uf.committed) {
+ list<CDir*> frags;
+ diri->get_dirfrags_under(p->first.frag, frags);
+ for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
+ CDir *dir = *q;
+ dir->auth_pin(this);
+ dir->state_set(CDir::STATE_FRAGMENTING);
+ }
+ _fragment_committed(p->first, frags);
+ continue;
+ }
+
dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
LogSegment *ls = mds->mdlog->get_current_segment();
EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits);
mds->mdlog->start_entry(le);
+ list<frag_t> old_frags;
+ diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
+
list<CDir*> resultfrags;
if (uf.old_frags.empty()) {
// created by old format EFragment
@@ -11410,6 +11491,9 @@ void MDCache::rollback_uncommitted_fragments()
if (g_conf->mds_debug_frag)
diri->verify_dirfrags();
+ for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
+ assert(!diri->dirfragtree.is_leaf(*q));
+
for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
CDir *dir = *q;
dir->auth_pin(this);
@@ -11417,8 +11501,10 @@ void MDCache::rollback_uncommitted_fragments()
}
mds->mdlog->submit_entry(le);
+
+ uf.old_frags.swap(old_frags);
+ _fragment_committed(p->first, resultfrags);
}
- uncommitted_fragments.clear();
}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 170adf78a3e..b9e7cfa823f 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -945,9 +945,10 @@ protected:
private:
struct ufragment {
int bits;
+ bool committed;
list<frag_t> old_frags;
bufferlist rollback;
- ufragment() : bits(0) {}
+ ufragment() : bits(0), committed(false) {}
};
map<dirfrag_t, ufragment> uncommitted_fragments;
@@ -970,39 +971,35 @@ private:
CDir *force_dir_fragment(CInode *diri, frag_t fg);
void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds);
-
- friend class EFragment;
-
bool can_fragment(CInode *diri, list<CDir*>& dirs);
-
-public:
- void split_dir(CDir *dir, int byn);
- void merge_dir(CInode *diri, frag_t fg);
-
-private:
void fragment_freeze_dirs(list<CDir*>& dirs, C_GatherBuilder &gather);
void fragment_mark_and_complete(list<CDir*>& dirs);
void fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits);
void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
void dispatch_fragment_dir(MDRequest *mdr);
- void fragment_logged_and_stored(MDRequest *mdr);
+ void _fragment_logged(MDRequest *mdr);
+ void _fragment_stored(MDRequest *mdr);
void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);
-public:
- void rollback_uncommitted_fragments();
-private:
-
+ friend class EFragment;
friend class C_MDC_FragmentFrozen;
friend class C_MDC_FragmentMarking;
- friend class C_MDC_FragmentLoggedAndStored;
+ friend class C_MDC_FragmentPrep;
+ friend class C_MDC_FragmentStore;
friend class C_MDC_FragmentCommit;
+ friend class C_MDC_FragmentFinish;
void handle_fragment_notify(MMDSFragmentNotify *m);
void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag,
bufferlist *rollback=NULL);
- void finish_uncommitted_fragment(dirfrag_t basedirfrag);
+ void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
+ void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags);
+public:
+ void split_dir(CDir *dir, int byn);
+ void merge_dir(CInode *diri, frag_t fg);
+ void rollback_uncommitted_fragments();
// -- updates --
//int send_inode_updates(CInode *in);
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 237e1349396..ece1156fc58 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -2399,6 +2399,7 @@ void EFragment::replay(MDS *mds)
case OP_ROLLBACK:
if (in) {
+ in->dirfragtree.get_leaves_under(basefrag, old_frags);
if (orig_frags.empty()) {
// old format EFragment
mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
@@ -2407,9 +2408,12 @@ void EFragment::replay(MDS *mds)
mds->mdcache->force_dir_fragment(in, *p);
}
}
- // fall-thru
+ mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags);
+ break;
+
case OP_COMMIT:
- mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag));
+ case OP_FINISH:
+ mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
break;
default: