author     Sridhar Samudrala <sridhar.samudrala@intel.com>   2020-08-18 23:53:07 -0700
committer  dormando <dormando@rydia.net>                     2020-11-02 15:00:36 -0800
commit     4de258ed891c0e5048192be1626fff6fabb10438 (patch)
tree       72d641c7d3cf7fe74817f229fed164342cfa41ff
parent     0b374c63ab7e63c0098983d0a68cefedfd94557a (diff)
download   memcached-4de258ed891c0e5048192be1626fff6fabb10438.tar.gz
Introduce NAPI ID based worker thread selection
By default memcached assigns connections to worker threads in a round-robin
manner. This patch introduces an option to select a worker thread based on
the incoming connection's NAPI ID if the SO_INCOMING_NAPI_ID socket option
is supported by the OS. This allows a memcached worker thread to be
associated with a NIC HW receive queue and service all the connection
requests received on a specific RX queue. This mapping between a memcached
thread and a HW NIC queue streamlines the flow of data from the NIC to the
application. In addition, an optimal path with reduced context switches is
possible if epoll-based busy polling
(sysctl -w net.core.busy_poll=<non-zero value>) is also enabled.

This feature is enabled via a new command line parameter -N <num> or
"--napi_ids=<num>", where <num> is the number of available/assigned NIC
hardware RX queues through which the connections can be received. The
number of napi_ids specified cannot be greater than the number of worker
threads specified using the -t/--threads option. If the option is not
specified, or the conditions are not met, the code defaults to round-robin
thread selection.

Signed-off-by: Kiran Patil <kiran.patil@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
-rw-r--r--  doc/napi_ids.txt   41
-rw-r--r--  doc/protocol.txt    5
-rw-r--r--  memcached.c        34
-rw-r--r--  memcached.h         4
-rwxr-xr-x  t/stats.t           4
-rw-r--r--  thread.c           80
6 files changed, 162 insertions, 6 deletions
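For orientation, a minimal usage sketch of the feature described in the commit
message above. The queue/thread count of 8 and the busy-poll budget of 50
microseconds are illustrative assumptions, not values taken from this patch:

  # assumes a NIC configured with 8 RX queues and flows spread across them
  sysctl -w net.core.busy_poll=50   # optional: enable epoll-based busy polling
  memcached -t 8 -N 8               # one worker thread per NIC RX queue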
diff --git a/doc/napi_ids.txt b/doc/napi_ids.txt
new file mode 100644
index 0000000..5f497dc
--- /dev/null
+++ b/doc/napi_ids.txt
@@ -0,0 +1,41 @@
+NAPI ID based worker thread selection
+ -N <num_napi_ids> | --napi_ids=<num_napi_ids>
+
+By default memcached assigns connections to worker threads in a round-robin
+manner. NAPI ID based worker thread selection enables worker threads to be
+selected based on the NIC HW RX queue on which the incoming request is
+received.
+
+This is enabled using the SO_INCOMING_NAPI_ID socket option, which is
+supported in Linux kernels 4.12 and higher. This socket option returns a
+system-level unique ID called the NAPI ID that is associated with the RX
+queue on which the last packet for that socket was received.
+
+This allows memcached to split the incoming flows among threads based on
+the RX queue on which they are received. Each worker thread is associated
+with a NIC HW receive queue and services all the connection requests
+received on a specific RX queue. This mapping between a memcached thread
+and a HW NIC queue streamlines the flow of data from the NIC to the
+application. In addition, an optimal path with reduced context switches is
+possible if epoll-based busy polling
+(sysctl -w net.core.busy_poll=<non-zero value>) is also enabled.
+
+This feature is enabled via a new command line parameter -N <num> or
+"--napi_ids=<num>", where <num> is the number of available/assigned NIC
+hardware RX queues through which requests associated with a connection are
+received. The number of napi_ids specified cannot be greater than the
+number of worker threads specified using the -t/--threads option. If the
+option is not specified, or the conditions are not met, the code defaults
+to round-robin thread selection.
+
+During a normal run, each worker thread gets associated with a napi_id,
+establishing a 1:1 mapping between threads and queues. If a new napi_id is
+seen after every thread has been associated with its own napi_id (this can
+happen if the num_napi_ids argument doesn't exactly match the number of RX
+queues, or if the NIC driver goes through a reload), a stats error counter
+called 'unexpected_napi_ids' is incremented and the napi_ids associated
+with the threads are reset.
+
+If the -N option is used but connection requests are received on a virtual
+interface such as loopback, the napi_id returned can be 0. This condition
+is tracked via a stats counter called 'round_robin_fallback'.
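As a companion to the description above, here is a small, self-contained C
sketch (not part of the patch) of the socket-level mechanism it relies on:
reading SO_INCOMING_NAPI_ID on an accepted socket and, optionally, enabling
per-socket busy polling via SO_BUSY_POLL. The helper names and the
50-microsecond budget used in the comments are illustrative assumptions.

#include <sys/socket.h>

#ifndef SO_INCOMING_NAPI_ID
#define SO_INCOMING_NAPI_ID 56   /* same fallback definition the patch uses */
#endif
#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL 46
#endif

/* Return the NAPI ID of the RX queue that delivered the last packet for fd,
 * or 0 if the socket has no HW queue association (e.g. loopback) or the
 * option is unsupported by the running kernel. */
int get_incoming_napi_id(int fd)
{
    int napi_id = 0;
    socklen_t len = sizeof(napi_id);

    if (getsockopt(fd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len) != 0)
        return 0;
    return napi_id;
}

/* Ask the kernel to busy-poll this socket's RX queue for up to usec
 * microseconds (e.g. 50) before sleeping; this complements the global
 * net.core.busy_poll sysctl mentioned above. */
int set_busy_poll(int fd, int usec)
{
    return setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec));
}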
diff --git a/doc/protocol.txt b/doc/protocol.txt
index 737937c..c4cca71 100644
--- a/doc/protocol.txt
+++ b/doc/protocol.txt
@@ -1310,6 +1310,11 @@ integers separated by a colon (treat this as a floating point number).
| log_worker_written | 64u | Logs written by a worker, to be picked up |
| log_watcher_skipped | 64u | Logs not sent to slow watchers. |
| log_watcher_sent | 64u | Logs written to watchers. |
+| unexpected_napi_ids | 64u | Number of times an unexpected napi id is |
+| | | received. See doc/napi_ids.txt |
+| round_robin_fallback | 64u | Number of times a napi id of 0 is received |
+| | | resulting in fallback to round robin |
+| | | thread selection. See doc/napi_ids.txt |
|-----------------------+---------+-------------------------------------------|
Settings statistics
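To make the table entries above concrete, a hypothetical excerpt of the
response to the text-protocol "stats" command with this patch applied (the
counter values of 0 are illustrative):

  stats
  ...
  STAT unexpected_napi_ids 0
  STAT round_robin_fallback 0
  END

The configured limit itself is reported as num_napi_ids in the
"stats settings" output.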
diff --git a/memcached.c b/memcached.c
index 3ce7d73..b5dc8c7 100644
--- a/memcached.c
+++ b/memcached.c
@@ -61,6 +61,10 @@
#include <sys/sysctl.h>
#endif
+#ifndef SO_INCOMING_NAPI_ID
+#define SO_INCOMING_NAPI_ID 56
+#endif
+
/*
* forward declarations
*/
@@ -288,6 +292,7 @@ static void settings_init(void) {
#ifdef MEMCACHED_DEBUG
settings.relaxed_privileges = false;
#endif
+ settings.num_napi_ids = 0;
}
extern pthread_mutex_t conn_lock;
@@ -1775,6 +1780,8 @@ void server_stats(ADD_STAT add_stats, conn *c) {
APPEND_STAT("time_since_server_cert_refresh", "%u", now - settings.ssl_last_cert_refresh_time);
}
#endif
+ APPEND_STAT("unexpected_napi_ids", "%llu", (unsigned long long)stats.unexpected_napi_ids);
+ APPEND_STAT("round_robin_fallback", "%llu", (unsigned long long)stats.round_robin_fallback);
}
void process_stat_settings(ADD_STAT add_stats, void *c) {
@@ -1859,6 +1866,7 @@ void process_stat_settings(ADD_STAT add_stats, void *c) {
APPEND_STAT("ssl_wbuf_size", "%u", settings.ssl_wbuf_size);
APPEND_STAT("ssl_session_cache", "%s", settings.ssl_session_cache ? "yes" : "no");
#endif
+ APPEND_STAT("num_napi_ids", "%s", settings.num_napi_ids);
}
static int nz_strcmp(int nzlength, const char *nz, const char *z) {
@@ -3383,6 +3391,16 @@ static int server_socket(const char *interface,
continue;
}
+ if (settings.num_napi_ids) {
+ int napi_id;
+ socklen_t len = sizeof(napi_id);
+ error = getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
+ if (error != 0) {
+ fprintf(stderr, "-N <num_napi_ids> option not supported\n");
+ exit(EXIT_FAILURE);
+ }
+ }
+
#ifdef IPV6_V6ONLY
if (next->ai_family == AF_INET6) {
error = setsockopt(sfd, IPPROTO_IPV6, IPV6_V6ONLY, (char *) &flags, sizeof(flags));
@@ -3906,6 +3924,7 @@ static void usage(void) {
verify_default("ssl_keyformat", settings.ssl_keyformat == SSL_FILETYPE_PEM);
verify_default("ssl_verify_mode", settings.ssl_verify_mode == SSL_VERIFY_NONE);
#endif
+ printf("-N, --napi_ids number of napi ids. see doc/napi_ids.txt for more details\n");
return;
}
@@ -4663,6 +4682,7 @@ int main (int argc, char **argv) {
"Y:" /* Enable token auth */
"e:" /* mmap path for external item memory */
"o:" /* Extended generic options */
+ "N:" /* NAPI ID based thread selection */
;
/* process arguments */
@@ -4703,6 +4723,7 @@ int main (int argc, char **argv) {
{"auth-file", required_argument, 0, 'Y'},
{"memory-file", required_argument, 0, 'e'},
{"extended", required_argument, 0, 'o'},
+ {"napi-ids", required_argument, 0, 'N'},
{0, 0, 0, 0}
};
int optindex;
@@ -4924,6 +4945,13 @@ int main (int argc, char **argv) {
// dupe the file path now just in case the options get mangled.
settings.auth_file = strdup(optarg);
break;
+ case 'N':
+ settings.num_napi_ids = atoi(optarg);
+ if (settings.num_napi_ids <= 0) {
+ fprintf(stderr, "Maximum number of NAPI IDs must be greater than 0\n");
+ return 1;
+ }
+ break;
case 'o': /* It's sub-opts time! */
subopts_orig = subopts = strdup(optarg); /* getsubopt() changes the original args */
@@ -5311,6 +5339,12 @@ int main (int argc, char **argv) {
}
}
+ if (settings.num_napi_ids > settings.num_threads) {
+ fprintf(stderr, "Number of napi_ids(%d) cannot be greater than number of threads(%d)\n",
+ settings.num_napi_ids, settings.num_threads);
+ exit(EX_USAGE);
+ }
+
if (settings.item_size_max < ITEM_SIZE_MAX_LOWER_LIMIT) {
fprintf(stderr, "Item max size cannot be less than 1024 bytes.\n");
exit(EX_USAGE);
diff --git a/memcached.h b/memcached.h
index 2eb9ebb..17d2a8f 100644
--- a/memcached.h
+++ b/memcached.h
@@ -367,6 +367,8 @@ struct stats {
uint64_t ssl_new_sessions; /* successfully negotiated new (non-reused) TLS sessions */
#endif
struct timeval maxconns_entered; /* last time maxconns entered */
+ uint64_t unexpected_napi_ids; /* see doc/napi_ids.txt */
+ uint64_t round_robin_fallback; /* see doc/napi_ids.txt */
};
/**
@@ -481,6 +483,7 @@ struct settings {
unsigned int ssl_wbuf_size; /* size of the write buffer used by ssl_sendmsg method */
bool ssl_session_cache; /* enable SSL server session caching */
#endif
+ int num_napi_ids; /* maximum number of NAPI IDs */
};
extern struct stats stats;
@@ -621,6 +624,7 @@ typedef struct {
#ifdef TLS
char *ssl_wbuf;
#endif
+ int napi_id; /* napi id associated with this thread */
} LIBEVENT_THREAD;
diff --git a/t/stats.t b/t/stats.t
index a84dc05..7c82bca 100755
--- a/t/stats.t
+++ b/t/stats.t
@@ -28,9 +28,9 @@ if (MemcachedTest::enabled_tls_testing()) {
# when TLS is enabled, stats contains additional keys:
# - ssl_handshake_errors
# - time_since_server_cert_refresh
- is(scalar(keys(%$stats)), 80, "expected count of stats values");
+ is(scalar(keys(%$stats)), 82, "expected count of stats values");
} else {
- is(scalar(keys(%$stats)), 78, "expected count of stats values");
+ is(scalar(keys(%$stats)), 80, "expected count of stats values");
}
# Test initial state
diff --git a/thread.c b/thread.c
index 7de7d4e..e4b1da7 100644
--- a/thread.c
+++ b/thread.c
@@ -587,6 +587,77 @@ static void thread_libevent_process(evutil_socket_t fd, short which, void *arg)
/* Which thread we assigned a connection to most recently. */
static int last_thread = -1;
+/* Last thread we assigned to a connection based on napi_id */
+static int last_thread_by_napi_id = -1;
+
+static LIBEVENT_THREAD *select_thread_round_robin(void)
+{
+ int tid = (last_thread + 1) % settings.num_threads;
+
+ last_thread = tid;
+
+ return threads + tid;
+}
+
+static void reset_threads_napi_id(void)
+{
+ LIBEVENT_THREAD *thread;
+ int i;
+
+ for (i = 0; i < settings.num_threads; i++) {
+ thread = threads + i;
+ thread->napi_id = 0;
+ }
+
+ last_thread_by_napi_id = -1;
+}
+
+/* Select a worker thread based on the NAPI ID of an incoming connection
+ * request. NAPI ID is a globally unique ID that identifies a NIC RX queue
+ * on which a flow is received.
+ */
+static LIBEVENT_THREAD *select_thread_by_napi_id(int sfd)
+{
+ LIBEVENT_THREAD *thread;
+ int napi_id, err, i;
+ socklen_t len;
+ int tid = -1;
+
+ len = sizeof(napi_id);
+ err = getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len);
+ if ((err == -1) || (napi_id == 0)) {
+ STATS_LOCK();
+ stats.round_robin_fallback++;
+ STATS_UNLOCK();
+ return select_thread_round_robin();
+ }
+
+select:
+ for (i = 0; i < settings.num_threads; i++) {
+ thread = threads + i;
+ if (last_thread_by_napi_id < i) {
+ thread->napi_id = napi_id;
+ last_thread_by_napi_id = i;
+ tid = i;
+ break;
+ }
+ if (thread->napi_id == napi_id) {
+ tid = i;
+ break;
+ }
+ }
+
+ if (tid == -1) {
+ STATS_LOCK();
+ stats.unexpected_napi_ids++;
+ STATS_UNLOCK();
+ reset_threads_napi_id();
+ goto select;
+ }
+
+ return threads + tid;
+}
+
/*
* Dispatches a new connection to another thread. This is only ever called
* from the main thread, either during initialization (for UDP) or because
@@ -603,11 +674,12 @@ void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags,
return;
}
- int tid = (last_thread + 1) % settings.num_threads;
-
- LIBEVENT_THREAD *thread = threads + tid;
+ LIBEVENT_THREAD *thread;
- last_thread = tid;
+ if (!settings.num_napi_ids)
+ thread = select_thread_round_robin();
+ else
+ thread = select_thread_by_napi_id(sfd);
item->sfd = sfd;
item->init_state = init_state;