author     Sage Weil <sage@inktank.com>    2012-11-28 16:02:59 -0800
committer  Sage Weil <sage@inktank.com>    2012-11-29 12:38:51 -0800
commit     ea65dffff2d78f10894deac7f92d330d3cc37ccd (patch)
tree       da955e5cdd4870ca2bc29792355021c4b59afdd2
parent     dd3a24a647d0b0f1153cf1b102ed1f51d51be2f2 (diff)
download   ceph-ea65dffff2d78f10894deac7f92d330d3cc37ccd.tar.gz
osd: helpers to blacklist messages to down osds
There is a race between handle_osd_map -> note_down_osd() and PG threads:

 - handle_osd_map -> note_down_osd marks down an osd for epoch N
 - a pg thread with epoch <N sends a message to the (old) peer, reopening
   the msgr connection
 - nobody cleans up

Introduce a pre_publish_map() OSDService method and helpers for sending
messages to peers.  Pass in the epoch we are working from, and drop the
message on the floor if the target OSD has been since marked down.

See #3548.

Signed-off-by: Sage Weil <sage@inktank.com>
-rw-r--r--  src/osd/OSD.cc  52
-rw-r--r--  src/osd/OSD.h   12
2 files changed, 61 insertions, 3 deletions
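
For illustration, here is a minimal standalone sketch of the check the new helpers perform. FakeOsdMap, FakeOsdInfo and may_send() are made-up stand-ins, not types from this patch: a message prepared while working from epoch from_epoch is dropped if the target OSD is down in the not-yet-published map, or if it came back up after that epoch.

// Minimal sketch, not the real Ceph classes: models the drop-or-send check
// performed by send_message_osd_cluster()/get_con_osd_cluster() in the diff below.
#include <cassert>
#include <map>

typedef unsigned int epoch_t;

struct FakeOsdInfo {
  epoch_t up_from;   // epoch at which this osd was last marked up
  bool up;
};

struct FakeOsdMap {
  epoch_t epoch;
  std::map<int, FakeOsdInfo> osds;
  bool is_down(int o) const {
    std::map<int, FakeOsdInfo>::const_iterator i = osds.find(o);
    return i == osds.end() || !i->second.up;
  }
  epoch_t up_from(int o) const {
    std::map<int, FakeOsdInfo>::const_iterator i = osds.find(o);
    return i == osds.end() ? 0 : i->second.up_from;
  }
};

// true if a message built while working from `from_epoch` may still be sent
bool may_send(const FakeOsdMap& next_map, int peer, epoch_t from_epoch)
{
  assert(from_epoch <= next_map.epoch);       // service map is always newer/newest
  if (next_map.is_down(peer))                 // peer has since gone down: drop
    return false;
  if (next_map.up_from(peer) > from_epoch)    // peer restarted since then: drop
    return false;
  return true;
}

int main()
{
  FakeOsdMap next_map;
  next_map.epoch = 20;
  next_map.osds[3].up = true;
  next_map.osds[3].up_from = 15;

  assert(may_send(next_map, 3, 18));    // peer up since 15, sender at 18: deliver
  assert(!may_send(next_map, 3, 10));   // sender's epoch predates peer's restart: drop
  assert(!may_send(next_map, 7, 18));   // peer not up in the new map: drop
  return 0;
}

The separate next_osdmap matters because pre_publish_map() runs before publish_map(): PG threads still acting on an older map are checked against the newer, about-to-be-published map, so their stale messages are dropped instead of reopening a connection to a peer that has gone down.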
diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc
index 96052820a15..914d1dcc88a 100644
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -2475,6 +2475,49 @@ void OSD::send_alive()
}
}
+void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
+{
+ Mutex::Locker l(publish_lock);
+
+ // service map is always newer/newest
+ assert(from_epoch <= next_osdmap->get_epoch());
+
+ if (next_osdmap->is_down(peer) ||
+ next_osdmap->get_info(peer).up_from > from_epoch) {
+ m->put();
+ return;
+ }
+ osd->cluster_messenger->send_message(m, next_osdmap->get_cluster_inst(peer));
+}
+
+Connection *OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
+{
+ Mutex::Locker l(publish_lock);
+
+ // service map is always newer/newest
+ assert(from_epoch <= next_osdmap->get_epoch());
+
+ if (next_osdmap->is_down(peer) ||
+ next_osdmap->get_info(peer).up_from > from_epoch) {
+ return NULL;
+ }
+ return osd->cluster_messenger->get_connection(next_osdmap->get_cluster_inst(peer));
+}
+
+Connection *OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
+{
+ Mutex::Locker l(publish_lock);
+
+ // service map is always newer/newest
+ assert(from_epoch <= next_osdmap->get_epoch());
+
+ if (next_osdmap->is_down(peer) ||
+ next_osdmap->get_info(peer).up_from > from_epoch) {
+ return NULL;
+ }
+ return osd->hbclient_messenger->get_connection(next_osdmap->get_hb_inst(peer));
+}
+
void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
{
Mutex::Locker l(pg_temp_lock);
@@ -3632,14 +3675,19 @@ void OSD::handle_osd_map(MOSDMap *m)
OSDMapRef newmap = get_map(cur);
assert(newmap); // we just cached it above!
+ // start blacklisting messages sent to peers that go down.
+ service.pre_publish_map(newmap);
+
// kill connections to newly down osds
set<int> old;
osdmap->get_all_osds(old);
- for (set<int>::iterator p = old.begin(); p != old.end(); p++)
+ for (set<int>::iterator p = old.begin(); p != old.end(); p++) {
if (*p != whoami &&
osdmap->have_inst(*p) && // in old map
- (!newmap->exists(*p) || !newmap->is_up(*p))) // but not the new one
+ (!newmap->exists(*p) || !newmap->is_up(*p))) { // but not the new one
note_down_osd(*p);
+ }
+ }
osdmap = newmap;
diff --git a/src/osd/OSD.h b/src/osd/OSD.h
index ff277e1e5b8..4608891a20d 100644
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -193,18 +193,28 @@ public:
Mutex::Locker l(publish_lock);
superblock = block;
}
- OSDMapRef osdmap;
+ OSDMapRef osdmap, next_osdmap;
OSDMapRef get_osdmap() {
Mutex::Locker l(publish_lock);
return osdmap;
}
+ void pre_publish_map(OSDMapRef map) {
+ Mutex::Locker l(publish_lock);
+ next_osdmap = map;
+ }
void publish_map(OSDMapRef map) {
Mutex::Locker l(publish_lock);
osdmap = map;
+ next_osdmap = map;
}
int get_nodeid() const { return whoami; }
+ // -- message helpers --
+ Connection *get_con_osd_cluster(int peer, epoch_t from_epoch);
+ Connection *get_con_osd_hb(int peer, epoch_t from_epoch);
+ void send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch);
+
// -- scrub scheduling --
Mutex sched_scrub_lock;
int scrubs_pending;