summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYan, Zheng <zheng.z.yan@intel.com>2013-01-19 12:57:31 +0800
committerYan, Zheng <zheng.z.yan@intel.com>2013-01-29 10:17:35 +0800
commit1a6626f03220b8a3f4052f95226d23094c1f7b36 (patch)
treed19f340fe21faf56bd0f8a0903fed3e75624b95d
parent9944d9fbc969134a2b4645c1e402dfb5aab0eaaa (diff)
downloadceph-1a6626f03220b8a3f4052f95226d23094c1f7b36.tar.gz
mds: preserve non-auth/unlinked objects until slave commit
The MDS should not trim objects in a non-auth subtree immediately after replaying a slave rename, because the slave rename may require a rollback later and these objects are needed for that rollback. Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
-rw-r--r--src/mds/MDCache.cc99
-rw-r--r--src/mds/MDCache.h17
-rw-r--r--src/mds/Mutation.h5
-rw-r--r--src/mds/Server.cc8
-rw-r--r--src/mds/events/EMetaBlob.h3
-rw-r--r--src/mds/journal.cc53
6 files changed, 124 insertions, 61 deletions
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index adcf8c1ef28..97adb273750 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2867,19 +2867,16 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
if (mds->is_resolve()) {
// replay
- assert(uncommitted_slave_updates[from].count(*p));
+ MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
+ assert(su);
+
// log commit
mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", *p, from,
- ESlaveUpdate::OP_COMMIT,
- uncommitted_slave_updates[from][*p]->origop));
-
- delete uncommitted_slave_updates[from][*p];
- uncommitted_slave_updates[from].erase(*p);
- if (uncommitted_slave_updates[from].empty())
- uncommitted_slave_updates.erase(from);
-
+ ESlaveUpdate::OP_COMMIT, su->origop));
mds->mdlog->wait_for_safe(new C_MDC_SlaveCommit(this, from, *p));
mds->mdlog->flush();
+
+ finish_uncommitted_slave_update(*p, from);
} else {
MDRequest *mdr = request_get(*p);
assert(mdr->slave_request == 0); // shouldn't be doing anything!
@@ -2893,28 +2890,24 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
dout(10) << " abort on slave " << *p << dendl;
if (mds->is_resolve()) {
- assert(uncommitted_slave_updates[from].count(*p));
+ MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
+ assert(su);
// perform rollback (and journal a rollback entry)
// note: this will hold up the resolve a bit, until the rollback entries journal.
- switch (uncommitted_slave_updates[from][*p]->origop) {
+ switch (su->origop) {
case ESlaveUpdate::LINK:
- mds->server->do_link_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0);
+ mds->server->do_link_rollback(su->rollback, from, 0);
break;
case ESlaveUpdate::RENAME:
- mds->server->do_rename_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0);
+ mds->server->do_rename_rollback(su->rollback, from, 0);
break;
case ESlaveUpdate::RMDIR:
- mds->server->do_rmdir_rollback(uncommitted_slave_updates[from][*p]->rollback, from, 0);
+ mds->server->do_rmdir_rollback(su->rollback, from, 0);
break;
default:
assert(0);
}
-
- delete uncommitted_slave_updates[from][*p];
- uncommitted_slave_updates[from].erase(*p);
- if (uncommitted_slave_updates[from].empty())
- uncommitted_slave_updates.erase(from);
} else {
MDRequest *mdr = request_get(*p);
if (mdr->more()->slave_commit) {
@@ -2939,7 +2932,67 @@ void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
ack->put();
}
+void MDCache::add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlaveUpdate *su)
+{
+ assert(uncommitted_slave_updates[master].count(reqid) == 0);
+ uncommitted_slave_updates[master][reqid] = su;
+ if (su->rename_olddir)
+ uncommitted_slave_rename_olddir[su->rename_olddir]++;
+ for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++)
+ uncommitted_slave_unlink[*p]++;
+}
+void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, int master)
+{
+ assert(uncommitted_slave_updates[master].count(reqid));
+ MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];
+
+ uncommitted_slave_updates[master].erase(reqid);
+ if (uncommitted_slave_updates[master].empty())
+ uncommitted_slave_updates.erase(master);
+ // discard the non-auth subtree we renamed out of
+ if (su->rename_olddir) {
+ uncommitted_slave_rename_olddir[su->rename_olddir]--;
+ if (uncommitted_slave_rename_olddir[su->rename_olddir] == 0) {
+ uncommitted_slave_rename_olddir.erase(su->rename_olddir);
+ CDir *root = get_subtree_root(su->rename_olddir);
+ if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
+ try_trim_non_auth_subtree(root);
+ }
+ }
+ // removed the inodes that were unlinked by slave update
+ for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); p++) {
+ CInode *in = *p;
+ uncommitted_slave_unlink[in]--;
+ if (uncommitted_slave_unlink[in] == 0) {
+ uncommitted_slave_unlink.erase(in);
+ if (!in->get_projected_parent_dn())
+ mds->mdcache->remove_inode_recursive(in);
+ }
+ }
+ delete su;
+}
+
+MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, int master)
+{
+
+ MDSlaveUpdate* su = NULL;
+ if (uncommitted_slave_updates.count(master) &&
+ uncommitted_slave_updates[master].count(reqid)) {
+ su = uncommitted_slave_updates[master][reqid];
+ assert(su);
+ }
+ return su;
+}
+
+void MDCache::finish_rollback(metareqid_t reqid) {
+ assert(need_resolve_rollback.count(reqid));
+ if (mds->is_resolve())
+ finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]);
+ need_resolve_rollback.erase(reqid);
+ if (need_resolve_rollback.empty())
+ maybe_resolve_finish();
+}
void MDCache::disambiguate_imports()
{
@@ -5788,6 +5841,10 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
{
dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;
+ // preserve the dir for rollback
+ if (uncommitted_slave_rename_olddir.count(dir))
+ return true;
+
bool keep_dir = false;
CDir::map_t::iterator j = dir->begin();
CDir::map_t::iterator i = j;
@@ -5805,7 +5862,9 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
for (list<CDir*>::iterator subdir = subdirs.begin();
subdir != subdirs.end();
++subdir) {
- if ((*subdir)->is_subtree_root() || my_ambiguous_imports.count((*subdir)->dirfrag())) {
+ if (uncommitted_slave_rename_olddir.count(*subdir) || // preserve the dir for rollback
+ my_ambiguous_imports.count((*subdir)->dirfrag()) ||
+ (*subdir)->is_subtree_root()) {
keep_inode = true;
dout(10) << "trim_non_auth_subtree(" << dir << ") subdir " << *subdir << "is kept!" << dendl;
}
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 31c7467bf41..dffc6ba1831 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -312,6 +312,8 @@ protected:
map<int, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
map<int, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates; // slave: for replay.
+ map<CDir*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit.
+ map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit.
// track master requests whose slaves haven't acknowledged commit
struct umaster {
@@ -329,7 +331,7 @@ protected:
set<int> wants_resolve; // nodes i need to send my resolve to
set<int> got_resolve; // nodes i got resolves from
set<int> need_resolve_ack; // nodes i need a resolve_ack from
- set<metareqid_t> need_resolve_rollback; // rollbacks i'm writing to the journal
+ map<metareqid_t, int> need_resolve_rollback; // rollbacks i'm writing to the journal
void handle_resolve(MMDSResolve *m);
void handle_resolve_ack(MMDSResolveAck *m);
@@ -337,17 +339,16 @@ protected:
void disambiguate_imports();
void recalc_auth_bits();
void trim_unlinked_inodes();
+ void add_uncommitted_slave_update(metareqid_t reqid, int master, MDSlaveUpdate*);
+ void finish_uncommitted_slave_update(metareqid_t reqid, int master);
+ MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, int master);
public:
void remove_inode_recursive(CInode *in);
- void add_rollback(metareqid_t reqid) {
- need_resolve_rollback.insert(reqid);
- }
- void finish_rollback(metareqid_t reqid) {
- need_resolve_rollback.erase(reqid);
- if (need_resolve_rollback.empty())
- maybe_resolve_finish();
+ void add_rollback(metareqid_t reqid, int master) {
+ need_resolve_rollback[reqid] = master;
}
+ void finish_rollback(metareqid_t reqid);
// ambiguous imports
void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index d0d3ecabf8c..36d62a74bf0 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -298,10 +298,13 @@ struct MDSlaveUpdate {
bufferlist rollback;
elist<MDSlaveUpdate*>::item item;
Context *waiter;
+ CDir* rename_olddir;
+ set<CInode*> unlinked;
MDSlaveUpdate(int oo, bufferlist &rbl, elist<MDSlaveUpdate*> &list) :
origop(oo),
item(this),
- waiter(0) {
+ waiter(0),
+ rename_olddir(0) {
rollback.claim(rbl);
list.push_back(&item);
}
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index ee1547ff067..cce4f8c7293 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4378,12 +4378,11 @@ void Server::do_link_rollback(bufferlist &rbl, int master, MDRequest *mdr)
Mutation *mut = mdr;
if (!mut) {
assert(mds->is_resolve());
- mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes
+ mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
mut = new Mutation(rollback.reqid);
mut->ls = mds->mdlog->get_current_segment();
}
-
CInode *in = mds->mdcache->get_inode(rollback.ino);
assert(in);
dout(10) << " target is " << *in << dendl;
@@ -4976,7 +4975,7 @@ void Server::do_rmdir_rollback(bufferlist &rbl, int master, MDRequest *mdr)
dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
if (!mdr) {
assert(mds->is_resolve());
- mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes
+ mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
}
CDir *dir = mds->mdcache->get_dirfrag(rollback.src_dir);
@@ -6480,8 +6479,9 @@ void Server::do_rename_rollback(bufferlist &rbl, int master, MDRequest *mdr)
dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
if (!mdr) {
assert(mds->is_resolve());
- mds->mdcache->add_rollback(rollback.reqid); // need to finish this update before resolve finishes
+ mds->mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
}
+
Mutation *mut = new Mutation(rollback.reqid);
mut->ls = mds->mdlog->get_current_segment();
diff --git a/src/mds/events/EMetaBlob.h b/src/mds/events/EMetaBlob.h
index 9bbd615e31d..77ceb9458ba 100644
--- a/src/mds/events/EMetaBlob.h
+++ b/src/mds/events/EMetaBlob.h
@@ -27,6 +27,7 @@
class MDS;
class MDLog;
class LogSegment;
+class MDSlaveUpdate;
/*
* a bunch of metadata in the journal
@@ -674,7 +675,7 @@ private:
}
void update_segment(LogSegment *ls);
- void replay(MDS *mds, LogSegment *ls=0);
+ void replay(MDS *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
};
WRITE_CLASS_ENCODER(EMetaBlob)
WRITE_CLASS_ENCODER(EMetaBlob::fullbit)
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 72a5e5e4ad9..6e22c5f1914 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -415,7 +415,7 @@ void EMetaBlob::fullbit::update_inode(MDS *mds, CInode *in)
in->old_inodes = old_inodes;
}
-void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
+void EMetaBlob::replay(MDS *mds, LogSegment *logseg, MDSlaveUpdate *slaveup)
{
dout(10) << "EMetaBlob.replay " << lump_map.size() << " dirlumps by " << client_name << dendl;
@@ -676,8 +676,12 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
// see if we can discard the subtree we renamed out of
CDir *root = mds->mdcache->get_subtree_root(olddir);
- if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
- mds->mdcache->try_trim_non_auth_subtree(root);
+ if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
+ if (slaveup) // preserve the old dir until slave commit
+ slaveup->rename_olddir = olddir;
+ else
+ mds->mdcache->try_trim_non_auth_subtree(root);
+ }
}
// if we are the srci importer, we'll also have some dirfrags we have to open up...
@@ -710,8 +714,12 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
for (set<CInode*>::iterator p = linked.begin(); p != linked.end(); p++)
unlinked.erase(*p);
dout(10) << " unlinked set contains " << unlinked << dendl;
- for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p)
- mds->mdcache->remove_inode_recursive(p->first);
+ for (map<CInode*, CDir*>::iterator p = unlinked.begin(); p != unlinked.end(); ++p) {
+ if (slaveup) // preserve unlinked inodes until slave commit
+ slaveup->unlinked.insert(p->first);
+ else
+ mds->mdcache->remove_inode_recursive(p->first);
+ }
}
// table client transactions
@@ -1107,23 +1115,21 @@ void ECommitted::replay(MDS *mds)
void ESlaveUpdate::replay(MDS *mds)
{
+ MDSlaveUpdate *su;
switch (op) {
case ESlaveUpdate::OP_PREPARE:
dout(10) << "ESlaveUpdate.replay prepare " << reqid << " for mds." << master
<< ": applying commit, saving rollback info" << dendl;
- assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid) == 0);
- commit.replay(mds, _segment);
- mds->mdcache->uncommitted_slave_updates[master][reqid] =
- new MDSlaveUpdate(origop, rollback, _segment->slave_updates);
+ su = new MDSlaveUpdate(origop, rollback, _segment->slave_updates);
+ commit.replay(mds, _segment, su);
+ mds->mdcache->add_uncommitted_slave_update(reqid, master, su);
break;
case ESlaveUpdate::OP_COMMIT:
- if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) {
+ su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
+ if (su) {
dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master << dendl;
- delete mds->mdcache->uncommitted_slave_updates[master][reqid];
- mds->mdcache->uncommitted_slave_updates[master].erase(reqid);
- if (mds->mdcache->uncommitted_slave_updates[master].empty())
- mds->mdcache->uncommitted_slave_updates.erase(master);
+ mds->mdcache->finish_uncommitted_slave_update(reqid, master);
} else {
dout(10) << "ESlaveUpdate.replay commit " << reqid << " for mds." << master
<< ": ignoring, no previously saved prepare" << dendl;
@@ -1131,19 +1137,12 @@ void ESlaveUpdate::replay(MDS *mds)
break;
case ESlaveUpdate::OP_ROLLBACK:
- if (mds->mdcache->uncommitted_slave_updates[master].count(reqid)) {
- dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
- << ": applying rollback commit blob" << dendl;
- assert(mds->mdcache->uncommitted_slave_updates[master].count(reqid));
- commit.replay(mds, _segment);
- delete mds->mdcache->uncommitted_slave_updates[master][reqid];
- mds->mdcache->uncommitted_slave_updates[master].erase(reqid);
- if (mds->mdcache->uncommitted_slave_updates[master].empty())
- mds->mdcache->uncommitted_slave_updates.erase(master);
- } else {
- dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
- << ": ignoring, no previously saved prepare" << dendl;
- }
+ dout(10) << "ESlaveUpdate.replay abort " << reqid << " for mds." << master
+ << ": applying rollback commit blob" << dendl;
+ su = mds->mdcache->get_uncommitted_slave_update(reqid, master);
+ if (su)
+ mds->mdcache->finish_uncommitted_slave_update(reqid, master);
+ commit.replay(mds, _segment);
break;
default: