diff options
author | Yan, Zheng <zheng.z.yan@intel.com> | 2013-09-19 13:05:59 +0800 |
---|---|---|
committer | Yan, Zheng <zheng.z.yan@intel.com> | 2013-10-05 11:31:09 +0800 |
commit | a881c1d36ceb705a89e7d157e56bb94d4624af97 (patch) | |
tree | 966c54031dd131487a5cac8d8fe39a2eeab359be | |
parent | 4634d3ce2d14a31d7e0a27985e5b0eac51c84d22 (diff) | |
download | ceph-a881c1d36ceb705a89e7d157e56bb94d4624af97.tar.gz |
mds: journal original dirfrags for rollback
Fragments with different 'bits' can be merged into one fragment. So we can't
use 'basefrag' and 'bits' to calculate the original fragments when rolling
back a merge operation.
We also can't rely on MDCache::adjust_dir_fragments() to restore the original
fragments' statistics during rollback. This is because fragments are not
always complete in the rollback case.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r-- | src/mds/MDCache.cc | 72 | ||||
-rw-r--r-- | src/mds/MDCache.h | 4 | ||||
-rw-r--r-- | src/mds/events/EFragment.h | 22 | ||||
-rw-r--r-- | src/mds/journal.cc | 37 |
4 files changed, 117 insertions, 18 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 8e9dd89db71..c15aeaa7fcd 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -11125,9 +11125,12 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) info.basefrag, info.bits); mds->mdlog->start_entry(le); - list<frag_t> old_frags; - for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) - old_frags.push_back((*p)->get_frag()); + for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) { + CDir *dir = *p; + dirfrag_rollback rollback; + rollback.fnode = dir->fnode; + le->add_orig_frag(dir->get_frag(), &rollback); + } // refragment list<Context*> waiters; @@ -11137,6 +11140,9 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) diri->verify_dirfrags(); mds->queue_waiters(waiters); + for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p) + assert(!diri->dirfragtree.is_leaf(*p)); + le->metablob.add_dir_context(*info.resultfrags.begin()); // dft lock @@ -11172,7 +11178,7 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr) dir->commit(0, gather.new_sub(), true); // ignore authpinnability } - add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, old_frags); + add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags); mds->mdlog->submit_entry(le, gather.new_sub()); mds->mdlog->flush(); gather.activate(); @@ -11328,13 +11334,16 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) notify->put(); } -void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags) +void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags, + bufferlist *rollback) { dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl; assert(!uncommitted_fragments.count(basedirfrag)); ufragment& uf = uncommitted_fragments[basedirfrag]; uf.old_frags = old_frags; uf.bits = bits; + if 
(rollback) + uf.rollback.swap(*rollback); } void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag) @@ -11355,14 +11364,59 @@ void MDCache::rollback_uncommitted_fragments() CInode *diri = get_inode(p->first.ino); assert(diri); dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; + + LogSegment *ls = mds->mdlog->get_current_segment(); + EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits); + mds->mdlog->start_entry(le); + list<CDir*> resultfrags; - list<Context*> waiters; - adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); + if (uf.old_frags.empty()) { + // created by old format EFragment + list<Context*> waiters; + adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); + } else { + bufferlist::iterator bp = uf.rollback.begin(); + for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) { + CDir *dir = force_dir_fragment(diri, *q); + resultfrags.push_back(dir); + + dirfrag_rollback rollback; + ::decode(rollback, bp); + + dir->set_version(rollback.fnode.version); + dir->fnode = rollback.fnode; + + dir->_mark_dirty(ls); + + if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) { + dout(10) << " dirty nestinfo on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->nestlock); + ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest); + dir->get_inode()->nestlock.mark_dirty(); + } + if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) { + dout(10) << " dirty fragstat on " << *dir << dendl; + mds->locker->mark_updated_scatterlock(&dir->inode->filelock); + ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir); + dir->get_inode()->filelock.mark_dirty(); + } + + le->add_orig_frag(dir->get_frag()); + le->metablob.add_dir_context(dir); + le->metablob.add_dir(dir, true); + } + } + if (g_conf->mds_debug_frag) diri->verify_dirfrags(); - 
EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits); - mds->mdlog->start_submit_entry(le); + for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) { + CDir *dir = *q; + dir->auth_pin(this); + dir->state_set(CDir::STATE_FRAGMENTING); + } + + mds->mdlog->submit_entry(le); } uncommitted_fragments.clear(); } diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h index 8560ce481fb..170adf78a3e 100644 --- a/src/mds/MDCache.h +++ b/src/mds/MDCache.h @@ -946,6 +946,7 @@ private: struct ufragment { int bits; list<frag_t> old_frags; + bufferlist rollback; ufragment() : bits(0) {} }; map<dirfrag_t, ufragment> uncommitted_fragments; @@ -999,7 +1000,8 @@ private: void handle_fragment_notify(MMDSFragmentNotify *m); - void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag); + void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag, + bufferlist *rollback=NULL); void finish_uncommitted_fragment(dirfrag_t basedirfrag); // -- updates -- diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h index bdbbd335e29..a9ddd548502 100644 --- a/src/mds/events/EFragment.h +++ b/src/mds/events/EFragment.h @@ -18,6 +18,14 @@ #include "../LogEvent.h" #include "EMetaBlob.h" +struct dirfrag_rollback { + fnode_t fnode; + dirfrag_rollback() { } + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); +}; +WRITE_CLASS_ENCODER(dirfrag_rollback) + class EFragment : public LogEvent { public: EMetaBlob metablob; @@ -25,6 +33,8 @@ public: inodeno_t ino; frag_t basefrag; __s32 bits; // positive for split (from basefrag), negative for merge (to basefrag) + list<frag_t> orig_frags; + bufferlist rollback; EFragment() : LogEvent(EVENT_FRAGMENT) { } EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) : @@ -39,17 +49,25 @@ public: OP_PREPARE = 1, OP_COMMIT = 2, OP_ROLLBACK = 3, - OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT + OP_FINISH = 
4, // finish deleting orphan dirfrags + OP_ONESHOT = 5, // (legacy) PREPARE+COMMIT }; - const char *op_name(int o) const { + static const char *op_name(int o) { switch (o) { case OP_PREPARE: return "prepare"; case OP_COMMIT: return "commit"; case OP_ROLLBACK: return "rollback"; + case OP_FINISH: return "finish"; default: return "???"; } } + void add_orig_frag(frag_t df, dirfrag_rollback *drb=NULL) { + orig_frags.push_back(df); + if (drb) + ::encode(*drb, rollback); + } + void encode(bufferlist &bl) const; void decode(bufferlist::iterator &bl); void dump(Formatter *f) const; diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 49fdb9ce5da..237e1349396 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -2382,7 +2382,6 @@ void EFragment::replay(MDS *mds) list<CDir*> resultfrags; list<Context*> waiters; list<frag_t> old_frags; - pair<dirfrag_t,int> desc(dirfrag_t(ino,basefrag), bits); // in may be NULL if it wasn't in our cache yet. if it's a prepare // it will be once we replay the metablob , but first we need to @@ -2391,7 +2390,7 @@ void EFragment::replay(MDS *mds) switch (op) { case OP_PREPARE: - mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, old_frags); + mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, &rollback); // fall-thru case OP_ONESHOT: if (in) @@ -2399,8 +2398,15 @@ void EFragment::replay(MDS *mds) break; case OP_ROLLBACK: - if (in) - mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); + if (in) { + if (orig_frags.empty()) { + // old format EFragment + mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true); + } else { + for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p) + mds->mdcache->force_dir_fragment(in, *p); + } + } // fall-thru case OP_COMMIT: mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag)); @@ -2409,24 +2415,27 @@ void EFragment::replay(MDS *mds) default: assert(0); } 
+ metablob.replay(mds, _segment); if (in && g_conf->mds_debug_frag) in->verify_dirfrags(); } void EFragment::encode(bufferlist &bl) const { - ENCODE_START(4, 4, bl); + ENCODE_START(5, 4, bl); ::encode(stamp, bl); ::encode(op, bl); ::encode(ino, bl); ::encode(basefrag, bl); ::encode(bits, bl); ::encode(metablob, bl); + ::encode(orig_frags, bl); + ::encode(rollback, bl); ENCODE_FINISH(bl); } void EFragment::decode(bufferlist::iterator &bl) { - DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl); + DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl); if (struct_v >= 2) ::decode(stamp, bl); if (struct_v >= 3) @@ -2437,6 +2446,10 @@ void EFragment::decode(bufferlist::iterator &bl) { ::decode(basefrag, bl); ::decode(bits, bl); ::decode(metablob, bl); + if (struct_v >= 5) { + ::decode(orig_frags, bl); + ::decode(rollback, bl); + } DECODE_FINISH(bl); } @@ -2460,7 +2473,19 @@ void EFragment::generate_test_instances(list<EFragment*>& ls) ls.back()->bits = 5; } +void dirfrag_rollback::encode(bufferlist &bl) const +{ + ENCODE_START(1, 1, bl); + ::encode(fnode, bl); + ENCODE_FINISH(bl); +} +void dirfrag_rollback::decode(bufferlist::iterator &bl) +{ + DECODE_START(1, bl); + ::decode(fnode, bl); + DECODE_FINISH(bl); +} |