summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Sam Lang <sam.lang@inktank.com> 2013-04-09 10:35:19 -0500
committer: Sam Lang <sam.lang@inktank.com> 2013-04-11 10:25:46 -0500
commit: 4977f3eab0cb265efeceeb02862c09e460549828 (patch)
tree: 1f050a26f8bf1252daebdb49a594880c82d2e339
parent: 3a1cf53c30b67efbec46f893b3248939ca6d610c (diff)
download: ceph-4977f3eab0cb265efeceeb02862c09e460549828.tar.gz
mds: Delay export on missing inodes for reconnect
The reconnect caps sent by the client on reconnect may not have inodes found in the inode cache until after clientreplay (when the client creates a new file, for example). Currently, we send an export for that cap to the client if we don't see an inode in the cache and path_is_mine() returns false (for example, if the client didn't send a path because the file was already unlinked). Instead, we want to delay handling of the reconnect cap until clientreplay completes.

This patch modifies handle_client_reconnect() so that we don't assume the cap isn't ours if we don't have an inode for it, but instead delay recovery for later. An export cap message is only sent if the inode exists and the cap isn't ours (non-auth) during reconnect. If any remaining recovered caps exist in the recovered list once the mds goes active, we send export messages at that point.

Also, after removing the path_is_mine check, MDCache::parallel_fetch_traverse_dir() needs to skip non-auth dirfrags.

Fixes #4451.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
-rw-r--r-- src/mds/Capability.h 1
-rw-r--r-- src/mds/MDCache.cc 45
-rw-r--r-- src/mds/MDCache.h 1
-rw-r--r-- src/mds/MDS.cc 1
-rw-r--r-- src/mds/Server.cc 15
5 files changed, 46 insertions, 17 deletions
diff --git a/src/mds/Capability.h b/src/mds/Capability.h
index 946afdc02b9..54d2312daeb 100644
--- a/src/mds/Capability.h
+++ b/src/mds/Capability.h
@@ -272,6 +272,7 @@ public:
Export make_export() {
return Export(_wanted, issued(), pending(), client_follows, mseq+1, last_issue_stamp);
}
+ void rejoin_import() { mseq++; }
void merge(Export& other) {
// issued + pending
int newpending = other.pending | pending();
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 3f090bb3238..3129ed7c267 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -4097,20 +4097,25 @@ bool MDCache::parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
frag_t fg = cur->pick_dirfrag(path[i]);
CDir *dir = cur->get_or_open_dirfrag(this, fg);
CDentry *dn = dir->lookup(path[i]);
- CDentry::linkage_t *dnl = dn->get_linkage();
- if (!dn || dnl->is_null()) {
- if (!dir->is_complete()) {
- // fetch dir
- fetch_queue.insert(dir);
- return false;
- } else {
+ CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
+
+ if (!dnl || dnl->is_null()) {
+ if (!dir->is_auth()) {
+ dout(10) << " not dirfrag auth " << *dir << dendl;
+ return true;
+ }
+ if (dnl || dir->is_complete()) {
// probably because the client created it and held a cap but it never committed
// to the journal, and the op hasn't replayed yet.
dout(5) << " dne (not created yet?) " << ino << " at " << path << dendl;
missing.insert(ino);
return true;
}
+ // fetch dir
+ fetch_queue.insert(dir);
+ return false;
}
+
cur = dnl->get_inode();
if (!cur) {
assert(dnl->is_remote());
@@ -5041,8 +5046,32 @@ void MDCache::rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconn
Capability *cap = in->reconnect_cap(client, icr, session);
- if (frommds >= 0)
+ if (frommds >= 0) {
+ cap->rejoin_import();
do_cap_import(session, in, cap);
+ }
+}
+
+void MDCache::export_remaining_imported_caps()
+{
+ dout(10) << "export_remaining_imported_caps" << dendl;
+
+ for (map<inodeno_t,map<client_t,map<int,ceph_mds_cap_reconnect> > >::iterator p = cap_imports.begin();
+ p != cap_imports.end();
+ ++p) {
+ for (map<client_t,map<int,ceph_mds_cap_reconnect> >::iterator q = p->second.begin();
+ q != p->second.end();
+ ++q) {
+ Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
+ if (session) {
+ // mark client caps stale.
+ MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
+ mds->send_message_client_counted(stale, q->first);
+ }
+ }
+ }
+
+ cap_imports.clear();
}
void MDCache::try_reconnect_cap(CInode *in, Session *session)
diff --git a/src/mds/MDCache.h b/src/mds/MDCache.h
index 73780e26892..d837586a3ac 100644
--- a/src/mds/MDCache.h
+++ b/src/mds/MDCache.h
@@ -486,6 +486,7 @@ public:
void rejoin_import_cap(CInode *in, client_t client, ceph_mds_cap_reconnect& icr, int frommds);
void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq);
void try_reconnect_cap(CInode *in, Session *session);
+ void export_remaining_imported_caps();
// cap imports. delayed snap parent opens.
// realm inode -> client -> cap inodes needing to split to this realm
diff --git a/src/mds/MDS.cc b/src/mds/MDS.cc
index 3b3b2d6dc2e..935fb0c417e 100644
--- a/src/mds/MDS.cc
+++ b/src/mds/MDS.cc
@@ -1504,6 +1504,7 @@ void MDS::active_start()
mdcache->clean_open_file_lists();
mdcache->scan_stray_dir();
+ mdcache->export_remaining_imported_caps();
finish_contexts(g_ceph_context, waiting_for_replay); // kick waiters
finish_contexts(g_ceph_context, waiting_for_active); // kick waiters
}
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index dc7ea23f763..11ab834d856 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -574,7 +574,7 @@ void Server::handle_client_reconnect(MClientReconnect *m)
// notify client of success with an OPEN
mds->messenger->send_message(new MClientSession(CEPH_SESSION_OPEN), m->get_connection());
-
+
if (session->is_closed()) {
dout(10) << " session is closed, will make best effort to reconnect "
<< m->get_source_inst() << dendl;
@@ -636,15 +636,12 @@ void Server::handle_client_reconnect(MClientReconnect *m)
}
filepath path(p->second.path, (uint64_t)p->second.capinfo.pathbase);
- if ((in && !in->is_auth()) ||
- !mds->mdcache->path_is_mine(path)) {
+ if (in && !in->is_auth()) {
// not mine.
dout(0) << "non-auth " << p->first << " " << path
<< ", will pass off to authority" << dendl;
// mark client caps stale.
- inode_t fake_inode;
- fake_inode.ino = p->first;
MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0);
//stale->head.migrate_seq = 0; // FIXME ******
mds->send_message_client_counted(stale, session);
@@ -652,11 +649,11 @@ void Server::handle_client_reconnect(MClientReconnect *m)
// add to cap export list.
mdcache->rejoin_export_caps(p->first, from, p->second);
} else {
- // mine. fetch later.
+ // don't know if the inode is mine
dout(0) << "missing " << p->first << " " << path
- << " (mine), will load later" << dendl;
- mdcache->rejoin_recovered_caps(p->first, from, p->second,
- -1); // "from" me.
+ << " will load or export later" << dendl;
+ mdcache->rejoin_recovered_caps(p->first, from, p->second, -1);
+ mdcache->rejoin_export_caps(p->first, from, p->second);
}
}