summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Yan, Zheng <zheng.z.yan@intel.com> 2013-09-19 13:05:59 +0800
committer Yan, Zheng <zheng.z.yan@intel.com> 2013-10-05 11:31:09 +0800
commit a881c1d36ceb705a89e7d157e56bb94d4624af97 (patch)
tree 966c54031dd131487a5cac8d8fe39a2eeab359be
parent 4634d3ce2d14a31d7e0a27985e5b0eac51c84d22 (diff)
download ceph-a881c1d36ceb705a89e7d157e56bb94d4624af97.tar.gz
mds: journal original dirfrags for rollback
Fragments with different 'bits' can be merged into one fragment, so we can't use 'basefrag' and 'bits' to calculate the original fragments when rolling back a merge operation. We also can't rely on MDCache::adjust_dir_fragments() to restore the original fragments' statistics during rollback, because fragments are not always complete in the rollback case. Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r-- src/mds/MDCache.cc 72
-rw-r--r-- src/mds/MDCache.h 4
-rw-r--r-- src/mds/events/EFragment.h 22
-rw-r--r-- src/mds/journal.cc 37
4 files changed, 117 insertions, 18 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 8e9dd89db71..c15aeaa7fcd 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -11125,9 +11125,12 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
info.basefrag, info.bits);
mds->mdlog->start_entry(le);
- list<frag_t> old_frags;
- for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p)
- old_frags.push_back((*p)->get_frag());
+ for (list<CDir*>::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) {
+ CDir *dir = *p;
+ dirfrag_rollback rollback;
+ rollback.fnode = dir->fnode;
+ le->add_orig_frag(dir->get_frag(), &rollback);
+ }
// refragment
list<Context*> waiters;
@@ -11137,6 +11140,9 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
diri->verify_dirfrags();
mds->queue_waiters(waiters);
+ for (list<frag_t>::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p)
+ assert(!diri->dirfragtree.is_leaf(*p));
+
le->metablob.add_dir_context(*info.resultfrags.begin());
// dft lock
@@ -11172,7 +11178,7 @@ void MDCache::dispatch_fragment_dir(MDRequest *mdr)
dir->commit(0, gather.new_sub(), true); // ignore authpinnability
}
- add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, old_frags);
+ add_uncommitted_fragment(dirfrag_t(diri->ino(), info.basefrag), info.bits, le->orig_frags);
mds->mdlog->submit_entry(le, gather.new_sub());
mds->mdlog->flush();
gather.activate();
@@ -11328,13 +11334,16 @@ void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify)
notify->put();
}
-void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags)
+void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frags,
+ bufferlist *rollback)
{
dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
assert(!uncommitted_fragments.count(basedirfrag));
ufragment& uf = uncommitted_fragments[basedirfrag];
uf.old_frags = old_frags;
uf.bits = bits;
+ if (rollback)
+ uf.rollback.swap(*rollback);
}
void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag)
@@ -11355,14 +11364,59 @@ void MDCache::rollback_uncommitted_fragments()
CInode *diri = get_inode(p->first.ino);
assert(diri);
dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;
+
+ LogSegment *ls = mds->mdlog->get_current_segment();
+ EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits);
+ mds->mdlog->start_entry(le);
+
list<CDir*> resultfrags;
- list<Context*> waiters;
- adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
+ if (uf.old_frags.empty()) {
+ // created by old format EFragment
+ list<Context*> waiters;
+ adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true);
+ } else {
+ bufferlist::iterator bp = uf.rollback.begin();
+ for (list<frag_t>::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) {
+ CDir *dir = force_dir_fragment(diri, *q);
+ resultfrags.push_back(dir);
+
+ dirfrag_rollback rollback;
+ ::decode(rollback, bp);
+
+ dir->set_version(rollback.fnode.version);
+ dir->fnode = rollback.fnode;
+
+ dir->_mark_dirty(ls);
+
+ if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
+ dout(10) << " dirty nestinfo on " << *dir << dendl;
+ mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
+ ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
+ dir->get_inode()->nestlock.mark_dirty();
+ }
+ if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
+ dout(10) << " dirty fragstat on " << *dir << dendl;
+ mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
+ ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
+ dir->get_inode()->filelock.mark_dirty();
+ }
+
+ le->add_orig_frag(dir->get_frag());
+ le->metablob.add_dir_context(dir);
+ le->metablob.add_dir(dir, true);
+ }
+ }
+
if (g_conf->mds_debug_frag)
diri->verify_dirfrags();
- EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, diri->ino(), p->first.frag, uf.bits);
- mds->mdlog->start_submit_entry(le);
+ for (list<CDir*>::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) {
+ CDir *dir = *q;
+ dir->auth_pin(this);
+ dir->state_set(CDir::STATE_FRAGMENTING);
+ }
+
+ mds->mdlog->submit_entry(le);
}
uncommitted_fragments.clear();
}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 8560ce481fb..170adf78a3e 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -946,6 +946,7 @@ private:
struct ufragment {
int bits;
list<frag_t> old_frags;
+ bufferlist rollback;
ufragment() : bits(0) {}
};
map<dirfrag_t, ufragment> uncommitted_fragments;
@@ -999,7 +1000,8 @@ private:
void handle_fragment_notify(MMDSFragmentNotify *m);
- void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag);
+ void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag,
+ bufferlist *rollback=NULL);
void finish_uncommitted_fragment(dirfrag_t basedirfrag);
// -- updates --
diff --git a/src/mds/events/EFragment.h b/src/mds/events/EFragment.h
index bdbbd335e29..a9ddd548502 100644
--- a/src/mds/events/EFragment.h
+++ b/src/mds/events/EFragment.h
@@ -18,6 +18,14 @@
#include "../LogEvent.h"
#include "EMetaBlob.h"
+struct dirfrag_rollback {
+ fnode_t fnode;
+ dirfrag_rollback() { }
+ void encode(bufferlist& bl) const;
+ void decode(bufferlist::iterator& bl);
+};
+WRITE_CLASS_ENCODER(dirfrag_rollback)
+
class EFragment : public LogEvent {
public:
EMetaBlob metablob;
@@ -25,6 +33,8 @@ public:
inodeno_t ino;
frag_t basefrag;
__s32 bits; // positive for split (from basefrag), negative for merge (to basefrag)
+ list<frag_t> orig_frags;
+ bufferlist rollback;
EFragment() : LogEvent(EVENT_FRAGMENT) { }
EFragment(MDLog *mdlog, int o, inodeno_t i, frag_t bf, int b) :
@@ -39,17 +49,25 @@ public:
OP_PREPARE = 1,
OP_COMMIT = 2,
OP_ROLLBACK = 3,
- OP_ONESHOT = 4, // (legacy) PREPARE+COMMIT
+ OP_FINISH = 4, // finish deleting orphan dirfrags
+ OP_ONESHOT = 5, // (legacy) PREPARE+COMMIT
};
- const char *op_name(int o) const {
+ static const char *op_name(int o) {
switch (o) {
case OP_PREPARE: return "prepare";
case OP_COMMIT: return "commit";
case OP_ROLLBACK: return "rollback";
+ case OP_FINISH: return "finish";
default: return "???";
}
}
+ void add_orig_frag(frag_t df, dirfrag_rollback *drb=NULL) {
+ orig_frags.push_back(df);
+ if (drb)
+ ::encode(*drb, rollback);
+ }
+
void encode(bufferlist &bl) const;
void decode(bufferlist::iterator &bl);
void dump(Formatter *f) const;
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 49fdb9ce5da..237e1349396 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -2382,7 +2382,6 @@ void EFragment::replay(MDS *mds)
list<CDir*> resultfrags;
list<Context*> waiters;
list<frag_t> old_frags;
- pair<dirfrag_t,int> desc(dirfrag_t(ino,basefrag), bits);
// in may be NULL if it wasn't in our cache yet. if it's a prepare
// it will be once we replay the metablob , but first we need to
@@ -2391,7 +2390,7 @@ void EFragment::replay(MDS *mds)
switch (op) {
case OP_PREPARE:
- mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, old_frags);
+ mds->mdcache->add_uncommitted_fragment(dirfrag_t(ino, basefrag), bits, orig_frags, &rollback);
// fall-thru
case OP_ONESHOT:
if (in)
@@ -2399,8 +2398,15 @@ void EFragment::replay(MDS *mds)
break;
case OP_ROLLBACK:
- if (in)
- mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
+ if (in) {
+ if (orig_frags.empty()) {
+ // old format EFragment
+ mds->mdcache->adjust_dir_fragments(in, basefrag, -bits, resultfrags, waiters, true);
+ } else {
+ for (list<frag_t>::iterator p = orig_frags.begin(); p != orig_frags.end(); ++p)
+ mds->mdcache->force_dir_fragment(in, *p);
+ }
+ }
// fall-thru
case OP_COMMIT:
mds->mdcache->finish_uncommitted_fragment(dirfrag_t(ino, basefrag));
@@ -2409,24 +2415,27 @@ void EFragment::replay(MDS *mds)
default:
assert(0);
}
+
metablob.replay(mds, _segment);
if (in && g_conf->mds_debug_frag)
in->verify_dirfrags();
}
void EFragment::encode(bufferlist &bl) const {
- ENCODE_START(4, 4, bl);
+ ENCODE_START(5, 4, bl);
::encode(stamp, bl);
::encode(op, bl);
::encode(ino, bl);
::encode(basefrag, bl);
::encode(bits, bl);
::encode(metablob, bl);
+ ::encode(orig_frags, bl);
+ ::encode(rollback, bl);
ENCODE_FINISH(bl);
}
void EFragment::decode(bufferlist::iterator &bl) {
- DECODE_START_LEGACY_COMPAT_LEN(4, 4, 4, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(5, 4, 4, bl);
if (struct_v >= 2)
::decode(stamp, bl);
if (struct_v >= 3)
@@ -2437,6 +2446,10 @@ void EFragment::decode(bufferlist::iterator &bl) {
::decode(basefrag, bl);
::decode(bits, bl);
::decode(metablob, bl);
+ if (struct_v >= 5) {
+ ::decode(orig_frags, bl);
+ ::decode(rollback, bl);
+ }
DECODE_FINISH(bl);
}
@@ -2460,7 +2473,19 @@ void EFragment::generate_test_instances(list<EFragment*>& ls)
ls.back()->bits = 5;
}
+void dirfrag_rollback::encode(bufferlist &bl) const
+{
+ ENCODE_START(1, 1, bl);
+ ::encode(fnode, bl);
+ ENCODE_FINISH(bl);
+}
+void dirfrag_rollback::decode(bufferlist::iterator &bl)
+{
+ DECODE_START(1, bl);
+ ::decode(fnode, bl);
+ DECODE_FINISH(bl);
+}