diff options
author | Sridhar Samudrala <sridhar.samudrala@intel.com> | 2020-08-18 23:53:07 -0700 |
---|---|---|
committer | dormando <dormando@rydia.net> | 2020-11-02 15:00:36 -0800 |
commit | 4de258ed891c0e5048192be1626fff6fabb10438 (patch) | |
tree | 72d641c7d3cf7fe74817f229fed164342cfa41ff | |
parent | 0b374c63ab7e63c0098983d0a68cefedfd94557a (diff) | |
download | memcached-4de258ed891c0e5048192be1626fff6fabb10438.tar.gz |
Introduce NAPI ID based worker thread selection
By default memcached assigns connections to worker threads in
a round-robin manner. This patch introduces an option to select
a worker thread based on the incoming connection's NAPI ID if
SO_INCOMING_NAPI_ID socket option is supported by the OS.
This allows a memcached worker thread to be associated with a
NIC HW receive queue and service all the connection requests
received on a specific RX queue. This mapping between a memcached
thread and a HW NIC queue streamlines the flow of data from the
NIC to the application. In addition, an optimal path with reduced
context switches is possible, if epoll based busy polling
(sysctl -w net.core.busy_poll=<non-zero value>) is also enabled.
This feature is enabled via a new command line parameter -N <num>
or "--napi_ids=<num>", where <num> is the number of available/assigned
NIC hardware RX queues through which the connections can be received.
The number of napi_ids specified cannot be greater than the number
of worker threads specified using -t/--threads option.
If the option is not specified, or the conditions are not met, the code
defaults to round robin thread selection.
Signed-off-by: Kiran Patil <kiran.patil@intel.com>
Signed-off-by: Sridhar Samudrala <sridhar.samudrala@intel.com>
-rw-r--r-- | doc/napi_ids.txt | 41 | ||||
-rw-r--r-- | doc/protocol.txt | 5 | ||||
-rw-r--r-- | memcached.c | 34 | ||||
-rw-r--r-- | memcached.h | 4 | ||||
-rwxr-xr-x | t/stats.t | 4 | ||||
-rw-r--r-- | thread.c | 80 |
6 files changed, 162 insertions, 6 deletions
diff --git a/doc/napi_ids.txt b/doc/napi_ids.txt new file mode 100644 index 0000000..5f497dc --- /dev/null +++ b/doc/napi_ids.txt @@ -0,0 +1,41 @@ +NAPI ID based worker thread selection + -N <num_napi_ids> | --napi_ids=<num_napi_ids> + +By default memcached assigns connections to worker threads in a round-robin +manner. NAPI ID based worker thread selection enables worker threads to be +selected based on the NIC HW RX queue on which the incoming request is +received. + +This is enabled using SO_INCOMING_NAPI_ID socket option that is supported +in Linux kernels 4.12 or higher. This socket option returns a system level +unique ID called NAPI ID that is associated with an RX queue on which the +last packet associated with that socket is received. + +This allows memcached to split the incoming flows among threads based on +the RX queue on which they are received. Each worker thread is associated +with a NIC HW receive queue and services all the connection requests +received on a specific RX queue. This mapping between a memcached thread +and a HW NIC queue streamlines the flow of data from the NIC to the +application. In addition, an optimal path with reduced context switches is +possible, if epoll based busy polling +(sysctl -w net.core.busy_poll=<non-zero value>) is also enabled. + +This feature is enabled via a new command line parameter -N <num> or +"--napi_ids=<num>", where <num> is the number of available/assigned NIC +hardware RX queues through which requests associated with a connection are +received. The number of napi_ids specified cannot be greater than the number +of worker threads specified using -t/--threads option. If the option is +not specified, or the conditions are not met, the code defaults to round robin +thread selection. + +During a normal run, each worker thread gets associated with a napi_id and +this will establish a 1:1 mapping between the thread and queues. 
If a new +napi_id is received after each thread is associated with its own napi_id +(this can happen if num_napi_ids argument doesn't exactly match with the +number of RX queues OR if the NIC driver goes through a reload), a stats +error counter called 'unexpected_napi_ids' is incremented and all the +napi_id's associated with the threads are reset. + +If -N option is used, but the connection requests are received from a +virtual interface like loopback, napi_id returned can be 0. This condition +is tracked via a stats counter called 'round_robin_fallback'. diff --git a/doc/protocol.txt b/doc/protocol.txt index 737937c..c4cca71 100644 --- a/doc/protocol.txt +++ b/doc/protocol.txt @@ -1310,6 +1310,11 @@ integers separated by a colon (treat this as a floating point number). | log_worker_written | 64u | Logs written by a worker, to be picked up | | log_watcher_skipped | 64u | Logs not sent to slow watchers. | | log_watcher_sent | 64u | Logs written to watchers. | +| unexpected_napi_ids | 64u | Number of times an unexpected napi id is | +| | | received. See doc/napi_ids.txt | +| round_robin_fallback | 64u | Number of times napi id of 0 is received | +| | | resulting in fallback to round robin | +| | | thread selection. 
See doc/napi_ids.txt | |-----------------------+---------+-------------------------------------------| Settings statistics diff --git a/memcached.c b/memcached.c index 3ce7d73..b5dc8c7 100644 --- a/memcached.c +++ b/memcached.c @@ -61,6 +61,10 @@ #include <sys/sysctl.h> #endif +#ifndef SO_INCOMING_NAPI_ID +#define SO_INCOMING_NAPI_ID 56 +#endif + /* * forward declarations */ @@ -288,6 +292,7 @@ static void settings_init(void) { #ifdef MEMCACHED_DEBUG settings.relaxed_privileges = false; #endif + settings.num_napi_ids = 0; } extern pthread_mutex_t conn_lock; @@ -1775,6 +1780,8 @@ void server_stats(ADD_STAT add_stats, conn *c) { APPEND_STAT("time_since_server_cert_refresh", "%u", now - settings.ssl_last_cert_refresh_time); } #endif + APPEND_STAT("unexpected_napi_ids", "%llu", (unsigned long long)stats.unexpected_napi_ids); + APPEND_STAT("round_robin_fallback", "%llu", (unsigned long long)stats.round_robin_fallback); } void process_stat_settings(ADD_STAT add_stats, void *c) { @@ -1859,6 +1866,7 @@ void process_stat_settings(ADD_STAT add_stats, void *c) { APPEND_STAT("ssl_wbuf_size", "%u", settings.ssl_wbuf_size); APPEND_STAT("ssl_session_cache", "%s", settings.ssl_session_cache ? 
"yes" : "no"); #endif + APPEND_STAT("num_napi_ids", "%s", settings.num_napi_ids); } static int nz_strcmp(int nzlength, const char *nz, const char *z) { @@ -3383,6 +3391,16 @@ static int server_socket(const char *interface, continue; } + if (settings.num_napi_ids) { + socklen_t len = sizeof(socklen_t); + int napi_id; + error = getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len); + if (error != 0) { + fprintf(stderr, "-N <num_napi_ids> option not supported\n"); + exit(EXIT_FAILURE); + } + } + #ifdef IPV6_V6ONLY if (next->ai_family == AF_INET6) { error = setsockopt(sfd, IPPROTO_IPV6, IPV6_V6ONLY, (char *) &flags, sizeof(flags)); @@ -3906,6 +3924,7 @@ static void usage(void) { verify_default("ssl_keyformat", settings.ssl_keyformat == SSL_FILETYPE_PEM); verify_default("ssl_verify_mode", settings.ssl_verify_mode == SSL_VERIFY_NONE); #endif + printf("-N, --napi_ids number of napi ids. see doc/napi_ids.txt for more details\n"); return; } @@ -4663,6 +4682,7 @@ int main (int argc, char **argv) { "Y:" /* Enable token auth */ "e:" /* mmap path for external item memory */ "o:" /* Extended generic options */ + "N:" /* NAPI ID based thread selection */ ; /* process arguments */ @@ -4703,6 +4723,7 @@ int main (int argc, char **argv) { {"auth-file", required_argument, 0, 'Y'}, {"memory-file", required_argument, 0, 'e'}, {"extended", required_argument, 0, 'o'}, + {"napi-ids", required_argument, 0, 'N'}, {0, 0, 0, 0} }; int optindex; @@ -4924,6 +4945,13 @@ int main (int argc, char **argv) { // dupe the file path now just in case the options get mangled. settings.auth_file = strdup(optarg); break; + case 'N': + settings.num_napi_ids = atoi(optarg); + if (settings.num_napi_ids <= 0) { + fprintf(stderr, "Maximum number of NAPI IDs must be greater than 0\n"); + return 1; + } + break; case 'o': /* It's sub-opts time! 
*/ subopts_orig = subopts = strdup(optarg); /* getsubopt() changes the original args */ @@ -5311,6 +5339,12 @@ int main (int argc, char **argv) { } } + if (settings.num_napi_ids > settings.num_threads) { + fprintf(stderr, "Number of napi_ids(%d) cannot be greater than number of threads(%d)\n", + settings.num_napi_ids, settings.num_threads); + exit(EX_USAGE); + } + if (settings.item_size_max < ITEM_SIZE_MAX_LOWER_LIMIT) { fprintf(stderr, "Item max size cannot be less than 1024 bytes.\n"); exit(EX_USAGE); diff --git a/memcached.h b/memcached.h index 2eb9ebb..17d2a8f 100644 --- a/memcached.h +++ b/memcached.h @@ -367,6 +367,8 @@ struct stats { uint64_t ssl_new_sessions; /* successfully negotiated new (non-reused) TLS sessions */ #endif struct timeval maxconns_entered; /* last time maxconns entered */ + uint64_t unexpected_napi_ids; /* see doc/napi_ids.txt */ + uint64_t round_robin_fallback; /* see doc/napi_ids.txt */ }; /** @@ -481,6 +483,7 @@ struct settings { unsigned int ssl_wbuf_size; /* size of the write buffer used by ssl_sendmsg method */ bool ssl_session_cache; /* enable SSL server session caching */ #endif + int num_napi_ids; /* maximum number of NAPI IDs */ }; extern struct stats stats; @@ -621,6 +624,7 @@ typedef struct { #ifdef TLS char *ssl_wbuf; #endif + int napi_id; /* napi id associated with this thread */ } LIBEVENT_THREAD; @@ -28,9 +28,9 @@ if (MemcachedTest::enabled_tls_testing()) { # when TLS is enabled, stats contains additional keys: # - ssl_handshake_errors # - time_since_server_cert_refresh - is(scalar(keys(%$stats)), 80, "expected count of stats values"); + is(scalar(keys(%$stats)), 82, "expected count of stats values"); } else { - is(scalar(keys(%$stats)), 78, "expected count of stats values"); + is(scalar(keys(%$stats)), 80, "expected count of stats values"); } # Test initial state @@ -587,6 +587,77 @@ static void thread_libevent_process(evutil_socket_t fd, short which, void *arg) /* Which thread we assigned a connection to most recently. 
*/ static int last_thread = -1; +/* Last thread we assigned to a connection based on napi_id */ +static int last_thread_by_napi_id = -1; + +static LIBEVENT_THREAD *select_thread_round_robin(void) +{ + int tid = (last_thread + 1) % settings.num_threads; + + last_thread = tid; + + return threads + tid; +} + +static void reset_threads_napi_id(void) +{ + LIBEVENT_THREAD *thread; + int i; + + for (i = 0; i < settings.num_threads; i++) { + thread = threads + i; + thread->napi_id = 0; + } + + last_thread_by_napi_id = -1; +} + +/* Select a worker thread based on the NAPI ID of an incoming connection + * request. NAPI ID is a globally unique ID that identifies a NIC RX queue + * on which a flow is received. + */ +static LIBEVENT_THREAD *select_thread_by_napi_id(int sfd) +{ + LIBEVENT_THREAD *thread; + int napi_id, err, i; + socklen_t len; + int tid = -1; + + len = sizeof(socklen_t); + err = getsockopt(sfd, SOL_SOCKET, SO_INCOMING_NAPI_ID, &napi_id, &len); + if ((err == -1) || (napi_id == 0)) { + STATS_LOCK(); + stats.round_robin_fallback++; + STATS_UNLOCK(); + return select_thread_round_robin(); + } + +select: + for (i = 0; i < settings.num_threads; i++) { + thread = threads + i; + if (last_thread_by_napi_id < i) { + thread->napi_id = napi_id; + last_thread_by_napi_id = i; + tid = i; + break; + } + if (thread->napi_id == napi_id) { + tid = i; + break; + } + } + + if (tid == -1) { + STATS_LOCK(); + stats.unexpected_napi_ids++; + STATS_UNLOCK(); + reset_threads_napi_id(); + goto select; + } + + return threads + tid; +} + /* * Dispatches a new connection to another thread. 
This is only ever called * from the main thread, either during initialization (for UDP) or because @@ -603,11 +674,12 @@ void dispatch_conn_new(int sfd, enum conn_states init_state, int event_flags, return; } - int tid = (last_thread + 1) % settings.num_threads; - - LIBEVENT_THREAD *thread = threads + tid; + LIBEVENT_THREAD *thread; - last_thread = tid; + if (!settings.num_napi_ids) + thread = select_thread_round_robin(); + else + thread = select_thread_by_napi_id(sfd); item->sfd = sfd; item->init_state = init_state; |