summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYan, Zheng <zheng.z.yan@intel.com>2013-09-19 13:05:59 +0800
committerYan, Zheng <zheng.z.yan@intel.com>2013-09-24 08:45:55 +0800
commitba9cec2f96153324f02600c354a3cea943a784f2 (patch)
tree8e6f5b7d2cbf8d9c2a5db9ab7d1c82df35c3b2b2
parentd39e16ad3c360551ae591d91ec5aeb03cc2cfd25 (diff)
downloadceph-ba9cec2f96153324f02600c354a3cea943a784f2.tar.gz
mds: delete orphan dirfrags during MDS recovery
This patch makes the MDS use the following steps to fragment a directory: --- 1. freeze the old dirfrags; 2. journal EFragment::OP_PREPARE; 3. store the new dirfrags; 4. journal EFragment::OP_COMMIT; 5. delete the old dirfrags; 6. journal EFragment::OP_FINISH. The newly introduced event EFragment::OP_FINISH indicates that all orphan frags have been deleted. The new process guarantees that orphan frags can be properly deleted if the MDS crashes while fragmenting a directory. Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r--src/mds/MDCache.cc151
-rw-r--r--src/mds/MDCache.h11
-rw-r--r--src/mds/events/EFragment.h7
-rw-r--r--src/mds/journal.cc22
4 files changed, 150 insertions, 41 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index bb59c6a21fa..6a45cf14b03 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -11041,13 +11041,23 @@ void MDCache::fragment_unmark_unfreeze_dirs(list<CDir*>& dirs)
}
}
-class C_MDC_FragmentLoggedAndStored : public Context {
+class C_MDC_FragmentPrep : public Context {
MDCache *mdcache;
MDRequest *mdr;
public:
- C_MDC_FragmentLoggedAndStored(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
+ C_MDC_FragmentPrep(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
virtual void finish(int r) {
- mdcache->fragment_logged_and_stored(mdr);
+ mdcache->_fragment_logged(mdr);
+ }
+};
+
+class C_MDC_FragmentStore : public Context {
+ MDCache *mdcache;
+ MDRequest *mdr;
+public:
+ C_MDC_FragmentStore(MDCache *m, MDRequest *r) : mdcache(m), mdr(r) {}
+ virtual void finish(int r) {
+ mdcache->_fragment_stored(mdr);
}
};
@@ -11065,6 +11075,20 @@ public:
}
};
+class C_MDC_FragmentFinish : public Context {
+ MDCache *mdcache;
+ dirfrag_t basedirfrag;
+ list<CDir*> resultfrags;
+public:
+ C_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list<CDir*>& l) :
+ mdcache(m), basedirfrag(f) {
+ resultfrags.swap(l);
+ }
+ virtual void finish(int r) {
+ mdcache->_fragment_finish(basedirfrag, resultfrags);
+ }
+};
+
void MDCache::fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits)
{
dout(10) << "fragment_frozen " << dirs << " " << basefrag << " by " << bits
@@ -11132,39 +11156,49 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
diri->verify_dirfrags();
mds->queue_waiters(waiters);
+ for (list<frag_t>::iterator p = old_frags.begin(); p != old_frags.end(); ++p)
+ assert(!diri->dirfragtree.is_leaf(*p));
+
mdr->ls = mds->mdlog->get_current_segment();
EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, diri->ino(),
info.basefrag, info.bits);
mds->mdlog->start_entry(le);
le->metablob.add_dir_context(*info.resultfrags.begin());
+ for (list<CDir*>::iterator p = info.resultfrags.begin();
+ p != info.resultfrags.end();
+ ++p) {
+ le->metablob.add_dir(*p, false);
+ }
+ le->orig_frags = old_frags;
// dft lock
mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
mdr->add_updated_lock(&diri->dirfragtreelock);
- /*
- // filelock
- mds->locker->mark_updated_scatterlock(&diri->filelock);
- mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
- mut->add_updated_lock(&diri->filelock);
-
- // dirlock
- mds->locker->mark_updated_scatterlock(&diri->nestlock);
- mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
- mut->add_updated_lock(&diri->nestlock);
- */
+ add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, old_frags);
+ mds->mdlog->submit_entry(le, new C_MDC_FragmentPrep(this, mdr));
+ mds->mdlog->flush();
+}
- // freeze, journal, and store resulting frags
- C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentLoggedAndStored(this, mdr));
+void MDCache::_fragment_logged(MDRequest *mdr)
+{
+ assert(fragment_requests.count(mdr->reqid));
+ fragment_info_t &info = fragment_requests[mdr->reqid];
+ CInode *diri = info.resultfrags.front()->get_inode();
+
+ dout(10) << "fragment_logged " << info.resultfrags << " " << info.basefrag
+ << " bits " << info.bits << " on " << *diri << dendl;
+
+ // store resulting frags
+ C_GatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));
for (list<CDir*>::iterator p = info.resultfrags.begin();
p != info.resultfrags.end();
++p) {
CDir *dir = *p;
- dout(10) << " result frag " << *dir << dendl;
- le->metablob.add_dir(dir, false);
+ dout(10) << " storing result frag " << *dir << dendl;
// freeze and store them too
dir->auth_pin(this);
@@ -11172,19 +11206,16 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
dir->commit(0, gather.new_sub(), true); // ignore authpinnability
}
- add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, old_frags);
- mds->mdlog->submit_entry(le, gather.new_sub());
- mds->mdlog->flush();
gather.activate();
}
-void MDCache::fragment_logged_and_stored(MDRequest *mdr)
+void MDCache::_fragment_stored(MDRequest *mdr)
{
assert(fragment_requests.count(mdr->reqid));
fragment_info_t &info = fragment_requests[mdr->reqid];
CInode *diri = info.resultfrags.front()->get_inode();
- dout(10) << "fragment_logged_and_stored " << info.resultfrags << " " << info.basefrag
+ dout(10) << "fragment_stored " << info.resultfrags << " " << info.basefrag
<< " bits " << info.bits << " on " << *diri << dendl;
// tell peers
@@ -11274,6 +11305,7 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
{
dout(10) << "fragment_finish " << basedirfrag << dendl;
assert(uncommitted_fragments.count(basedirfrag));
+ ufragment &uf = uncommitted_fragments[basedirfrag];
// unmark & auth_unpin
for (list<CDir*>::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) {
@@ -11281,7 +11313,11 @@ void MDCache::_fragment_finish(dirfrag_t basedirfrag, list<CDir*>& resultfrags)
(*p)->auth_unpin(this);
}
- finish_uncommitted_fragment(basedirfrag);
+ EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH,
+ basedirfrag.ino, basedirfrag.frag, uf.bits);
+ mds->mdlog->start_submit_entry(le);
+
+ finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
}
/* This function DOES put the passed message before returning */
@@ -11337,11 +11373,30 @@ void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<fra
uf.bits = bits;
}
-void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag)
+void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
+{
+ dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
+ << " op " << EFragment::op_name(op) << dendl;
+ if (uncommitted_fragments.count(basedirfrag)) {
+ ufragment& uf = uncommitted_fragments[basedirfrag];
+ if (op != EFragment::OP_FINISH && !uf.old_frags.empty())
+ uf.committed = true;
+ else
+ uncommitted_fragments.erase(basedirfrag);
+ }
+}
+
+void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags)
{
- dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag << dendl;
+ dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
+ << " old_frags (" << old_frags << ")" << dendl;
if (uncommitted_fragments.count(basedirfrag)) {
- uncommitted_fragments.erase(basedirfrag);
+ ufragment& uf = uncommitted_fragments[basedirfrag];
+ if (!uf.old_frags.empty()) {
+ uf.old_frags.swap(old_frags);
+ uf.committed = true;
+ } else
+ uncommitted_fragments.erase(basedirfrag);
}
}
@@ -11352,19 +11407,57 @@ void MDCache::rollback_uncommitted_fragments()
p != uncommitted_fragments.end();
++p) {
ufragment &uf = p->second;
+
CInode *diri = get_inode(p->first.ino);
assert(diri);
+
+ if (uf.committed) {
+ list<CDir*> frags;
+ diri->get_dirfrags_under(p->first.frag, frags);
+ for (list<CDir*>::iterator q = frags.begin(); q != frags.end(); ++q) {
+ CDir *dir = *q;
+ dir->auth_pin(this);
+ dir->state_set(CDir::STATE_FRAGMENTING);
+ }
+ _fragment_committed(p->first, frags);
+ continue;
+ }
+
dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
+
+ list<frag_t> old_frags;
+ diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);
+
list<CDir*> resultfrags;
list<Context*> waiters;
adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
if (g_conf->mds_debug_frag)
diri->verify_dirfrags();
+ for (list<frag_t>::iterator q = old_frags.begin(); q != old_frags.end(); ++q)
+ assert(!diri->dirfragtree.is_leaf(*q));
+
+ LogSegment *ls = mds->mdlog->get_current_segment();
EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits);
- mds->mdlog->start_submit_entry(le);
+ mds->mdlog->start_entry(le);
+
+ le->orig_frags.swap(uf.old_frags);
+ le->metablob.add_dir_context(*resultfrags.begin());
+
+ for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
+ CDir *dir = *q;
+ dir->auth_pin(this);
+ dir->state_set(CDir::STATE_FRAGMENTING);
+ // don't know if the dirfrag was dirty
+ dir->_mark_dirty(ls);
+ le->metablob.add_dir(dir, true);
+ }
+
+ mds->mdlog->submit_entry(le);
+
+ uf.old_frags.swap(old_frags);
+ _fragment_committed(p->first, resultfrags);
}
- uncommitted_fragments.clear();
}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 8560ce481fb..50ae258cc2d 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -945,8 +945,9 @@ protected:
private:
struct ufragment {
int bits;
+ bool committed;
list<frag_t> old_frags;
- ufragment() : bits(0) {}
+ ufragment() : bits(0), committed(false) {}
};
map<dirfrag_t, ufragment> uncommitted_fragments;
@@ -984,7 +985,8 @@ private:
void fragment_frozen(list<CDir*>& dirs, frag_t basefrag, int bits);
void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
void dispatch_fragment_dir(MDRequest *mdr);
- void fragment_logged_and_stored(MDRequest *mdr);
+ void _fragment_logged(MDRequest *mdr);
+ void _fragment_stored(MDRequest *mdr);
void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);
@@ -994,13 +996,16 @@ private:
friend class C_MDC_FragmentFrozen;
friend class C_MDC_FragmentMarking;
- friend class C_MDC_FragmentLoggedAndStored;
+ friend class C_MDC_FragmentPrep;
+ friend class C_MDC_FragmentStore;
friend class C_MDC_FragmentCommit;
+ friend class C_MDC_FragmentFinish;
void handle_fragment_notify(MMDSFragmentNotify *m);
void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag);
void finish_uncommitted_fragment(dirfrag_t basedirfrag);
+ void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags);
// -- updates --
//int send_inode_updates(CInode *in);
diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h
index bdbbd335e29..01c35078e1a 100644
--- a/src/mds/events/EFragment.h
+++ b/src/mds/events/EFragment.h
@@ -25,6 +25,7 @@ public:
inodeno_t ino;
frag_t basefrag;
__s32 bits; // positive for split (from basefrag), negative for merge (to basefrag)
+ list<frag_t> orig_frags;
EFragment() : LogEvent(EVENT_FRAGMENT) { }
EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) :
@@ -39,13 +40,15 @@ public:
OP_PREPARE = 1,
OP_COMMIT = 2,
OP_ROLLBACK = 3,
- OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT
+ OP_FINISH = 4, // finish deleting orphan dirfrags
+ OP_ONESHOT = 5, // (legacy) PREPARE+COMMIT
};
- const char *op_name(int o) const {
+ static const char *op_name(int o) {
switch (o) {
case OP_PREPARE: return "prepare";
case OP_COMMIT: return "commit";
case OP_ROLLBACK: return "rollback";
+ case OP_FINISH: return "finish";
default: return "???";
}
}
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 49fdb9ce5da..f36087f49d8 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -2382,7 +2382,6 @@ void EFragment::replay(MDS *mds)
list<CDir*> resultfrags;
list<Context*> waiters;
list<frag_t> old_frags;
- pair<dirfrag_t,int> desc(dirfrag_t(ino,basefrag), bits);
// in may be NULL if it wasn't in our cache yet. if it's a prepare
// it will be once we replay the metablob , but first we need to
@@ -2391,7 +2390,7 @@ void EFragment::replay(MDS *mds)
switch (op) {
case OP_PREPARE:
- mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, old_frags);
+ mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags);
// fall-thru
case OP_ONESHOT:
if (in)
@@ -2399,34 +2398,41 @@ void EFragment::replay(MDS *mds)
break;
case OP_ROLLBACK:
- if (in)
+ if (in) {
+ in->dirfragtree.get_leaves_under(basefrag, old_frags);
mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
- // fall-thru
+ }
+ mds->mdcache->rollback_uncommitted_fragment(dirfrag_t(ino, basefrag), old_frags);
+ break;
+
case OP_COMMIT:
- mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag));
+ case OP_FINISH:
+ mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag), op);
break;
default:
assert(0);
}
+
metablob.replay(mds, _segment);
if (in && g_conf->mds_debug_frag)
in->verify_dirfrags();
}
void EFragment::encode(bufferlist &bl) const {
- ENCODE_START(4, 4, bl);
+ ENCODE_START(5, 4, bl);
::encode(stamp, bl);
::encode(op, bl);
::encode(ino, bl);
::encode(basefrag, bl);
::encode(bits, bl);
::encode(metablob, bl);
+ ::encode(orig_frags, bl);
ENCODE_FINISH(bl);
}
void EFragment::decode(bufferlist::iterator &bl) {
- DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
if (struct_v >= 2)
::decode(stamp, bl);
if (struct_v >= 3)
@@ -2437,6 +2443,8 @@ void EFragment::decode(bufferlist::iterator &bl) {
::decode(basefrag, bl);
::decode(bits, bl);
::decode(metablob, bl);
+ if (struct_v >= 5)
+ ::decode(orig_frags, bl);
DECODE_FINISH(bl);
}