/* * Copyright (c) 2014 Nicira, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at: * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include "ovs-numa.h" #include #include #ifdef __linux__ #include #include #include #include #include #endif /* __linux__ */ #include "hash.h" #include "openvswitch/hmap.h" #include "openvswitch/list.h" #include "ovs-thread.h" #include "openvswitch/vlog.h" #include "util.h" VLOG_DEFINE_THIS_MODULE(ovs_numa); /* ovs-numa module * =============== * * This module stores the affinity information of numa nodes and cpu cores. * It also provides functions to bookkeep the pin of threads on cpu cores. * * It is assumed that the numa node ids and cpu core ids all start from 0. * There is no guarantee that node and cpu ids are numbered consecutively * So, for example, if two nodes exist with ids 0 and 8, * 'ovs_numa_get_n_nodes()' will return 2, no assumption of node numbering * should be made. * * NOTE, this module should only be used by the main thread. * * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be * invalidated when ever the system topology changes. Support for detecting * topology changes has not been included. For now, add a TODO entry for * addressing it in the future. * * TODO: Fix ovs-numa when cpu hotplug is used. */ /* numa node. */ struct numa_node { struct hmap_node hmap_node; /* In the 'all_numa_nodes'. */ struct ovs_list cores; /* List of cpu cores on the numa node. */ int numa_id; /* numa node id. */ }; /* Cpu core on a numa node. */ struct cpu_core { struct hmap_node hmap_node;/* In the 'all_cpu_cores'. */ struct ovs_list list_node; /* In 'numa_node->cores' list. */ struct numa_node *numa; /* numa node containing the core. */ unsigned core_id; /* Core id. */ }; /* Contains all 'struct numa_node's. */ static struct hmap all_numa_nodes = HMAP_INITIALIZER(&all_numa_nodes); /* Contains all 'struct cpu_core's. */ static struct hmap all_cpu_cores = HMAP_INITIALIZER(&all_cpu_cores); /* True if numa node and core info are correctly extracted. */ static bool found_numa_and_core; /* True if the module was initialized with dummy options. In this case, the * module must not interact with the actual cpus/nodes in the system. */ static bool dummy_numa = false; /* If 'dummy_numa' is true, contains a copy of the dummy numa configuration * parameter */ static char *dummy_config; static struct numa_node *get_numa_by_numa_id(int numa_id); #ifdef __linux__ /* Returns true if 'str' contains all digits. Returns false otherwise. */ static bool contain_all_digits(const char *str) { return str[strspn(str, "0123456789")] == '\0'; } #endif /* __linux__ */ static struct numa_node * insert_new_numa_node(int numa_id) { struct numa_node *n = xzalloc(sizeof *n); hmap_insert(&all_numa_nodes, &n->hmap_node, hash_int(numa_id, 0)); ovs_list_init(&n->cores); n->numa_id = numa_id; return n; } static struct cpu_core * insert_new_cpu_core(struct numa_node *n, unsigned core_id) { struct cpu_core *c = xzalloc(sizeof *c); hmap_insert(&all_cpu_cores, &c->hmap_node, hash_int(core_id, 0)); ovs_list_insert(&n->cores, &c->list_node); c->core_id = core_id; c->numa = n; return c; } /* Has the same effect as discover_numa_and_core(), but instead of * reading sysfs entries, extracts the info from the global variable * 'dummy_config', which is set with ovs_numa_set_dummy(). * * 'dummy_config' lists the numa_ids of each CPU separated by a comma, e.g. * - "0,0,0,0": four cores on numa socket 0. * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets. * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets. * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous. */ static void discover_numa_and_core_dummy(void) { char *conf = xstrdup(dummy_config); char *id, *saveptr = NULL; unsigned i = 0; for (id = strtok_r(conf, ",", &saveptr); id; id = strtok_r(NULL, ",", &saveptr)) { struct hmap_node *hnode; struct numa_node *n; long numa_id; numa_id = strtol(id, NULL, 10); if (numa_id < 0 || numa_id >= MAX_NUMA_NODES) { VLOG_WARN("Invalid numa node %ld", numa_id); continue; } hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0)); if (hnode) { n = CONTAINER_OF(hnode, struct numa_node, hmap_node); } else { n = insert_new_numa_node(numa_id); } insert_new_cpu_core(n, i); i++; } free(conf); } #ifdef __linux__ /* Check if a CPU is detected and online. */ static int cpu_detected(unsigned int core_id) { char path[PATH_MAX]; int len = snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/topology/core_id", core_id); if (len <= 0 || (unsigned) len >= sizeof(path)) { return 0; } if (access(path, F_OK) != 0) { return 0; } return 1; } #endif /* __linux__ */ /* Discovers all numa nodes and the corresponding cpu cores. * Constructs the 'struct numa_node' and 'struct cpu_core'. */ static void discover_numa_and_core(void) { #ifdef __linux__ int i; DIR *dir; bool numa_supported = true; /* Check if NUMA supported on this system. */ dir = opendir("/sys/devices/system/node"); if (!dir && errno == ENOENT) { numa_supported = false; } if (dir) { closedir(dir); } for (i = 0; i < MAX_NUMA_NODES; i++) { char* path; if (numa_supported) { /* Constructs the path to node /sys/devices/system/nodeX. */ path = xasprintf("/sys/devices/system/node/node%d", i); } else { path = xasprintf("/sys/devices/system/cpu/"); } dir = opendir(path); /* Creates 'struct numa_node' if the 'dir' is non-null. */ if (dir) { struct numa_node *n; struct dirent *subdir; n = insert_new_numa_node(i); while ((subdir = readdir(dir)) != NULL) { if (!strncmp(subdir->d_name, "cpu", 3) && contain_all_digits(subdir->d_name + 3)) { unsigned core_id; core_id = strtoul(subdir->d_name + 3, NULL, 10); if (cpu_detected(core_id)) { insert_new_cpu_core(n, core_id); } } } closedir(dir); } else if (errno != ENOENT) { VLOG_WARN("opendir(%s) failed (%s)", path, ovs_strerror(errno)); } free(path); if (!numa_supported) { break; } } #endif /* __linux__ */ } /* Gets 'struct cpu_core' by 'core_id'. */ static struct cpu_core* get_core_by_core_id(unsigned core_id) { struct cpu_core *core; HMAP_FOR_EACH_WITH_HASH (core, hmap_node, hash_int(core_id, 0), &all_cpu_cores) { if (core->core_id == core_id) { return core; } } return NULL; } /* Gets 'struct numa_node' by 'numa_id'. */ static struct numa_node* get_numa_by_numa_id(int numa_id) { struct numa_node *numa; HMAP_FOR_EACH_WITH_HASH (numa, hmap_node, hash_int(numa_id, 0), &all_numa_nodes) { if (numa->numa_id == numa_id) { return numa; } } return NULL; } /* Initializes the numa module. */ void ovs_numa_init(void) { static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER; if (ovsthread_once_start(&once)) { const struct numa_node *n; if (dummy_numa) { discover_numa_and_core_dummy(); } else { discover_numa_and_core(); } HMAP_FOR_EACH(n, hmap_node, &all_numa_nodes) { VLOG_INFO("Discovered %"PRIuSIZE" CPU cores on NUMA node %d", ovs_list_size(&n->cores), n->numa_id); } VLOG_INFO("Discovered %"PRIuSIZE" NUMA nodes and %"PRIuSIZE" CPU cores", hmap_count(&all_numa_nodes), hmap_count(&all_cpu_cores)); if (hmap_count(&all_numa_nodes) && hmap_count(&all_cpu_cores)) { found_numa_and_core = true; } ovsthread_once_done(&once); } } /* Extracts the numa node and core info from the 'config'. This is useful for * testing purposes. The function must be called once, before ovs_numa_init(). * * The format of 'config' is explained in the comment above * discover_numa_and_core_dummy().*/ void ovs_numa_set_dummy(const char *config) { dummy_numa = true; ovs_assert(config); free(dummy_config); dummy_config = xstrdup(config); } bool ovs_numa_numa_id_is_valid(int numa_id) { return found_numa_and_core && numa_id < ovs_numa_get_n_numas(); } bool ovs_numa_core_id_is_valid(unsigned core_id) { return found_numa_and_core && core_id < ovs_numa_get_n_cores(); } /* Returns the number of numa nodes. */ int ovs_numa_get_n_numas(void) { return found_numa_and_core ? hmap_count(&all_numa_nodes) : OVS_NUMA_UNSPEC; } /* Returns the number of cpu cores. */ int ovs_numa_get_n_cores(void) { return found_numa_and_core ? hmap_count(&all_cpu_cores) : OVS_CORE_UNSPEC; } /* Given 'core_id', returns the corresponding numa node id. Returns * OVS_NUMA_UNSPEC if 'core_id' is invalid. */ int ovs_numa_get_numa_id(unsigned core_id) { struct cpu_core *core = get_core_by_core_id(core_id); if (core) { return core->numa->numa_id; } return OVS_NUMA_UNSPEC; } /* Returns the number of cpu cores on numa node. Returns OVS_CORE_UNSPEC * if 'numa_id' is invalid. */ int ovs_numa_get_n_cores_on_numa(int numa_id) { struct numa_node *numa = get_numa_by_numa_id(numa_id); if (numa) { return ovs_list_size(&numa->cores); } return OVS_CORE_UNSPEC; } /* Returns the largest core_id. * * Return OVS_CORE_UNSPEC, if core_id information is not found. * * Returning OVS_CORE_UNSPEC comes at a caveat. The caller function * must remember to check the return value of this callee function * against OVS_CORE_UNSPEC. OVS_CORE_UNSPEC is a positive integer * INT_MAX, which the caller may interpret it as the largest * core_id if it's not checking for it. */ unsigned ovs_numa_get_largest_core_id(void) { struct cpu_core *core; unsigned max_id = 0; if (!found_numa_and_core) { return OVS_CORE_UNSPEC; } HMAP_FOR_EACH (core, hmap_node, &all_cpu_cores) { if (core->core_id > max_id) { max_id = core->core_id; } } return max_id; } static struct ovs_numa_dump * ovs_numa_dump_create(void) { struct ovs_numa_dump *dump = xmalloc(sizeof *dump); hmap_init(&dump->cores); hmap_init(&dump->numas); return dump; } static void ovs_numa_dump_add(struct ovs_numa_dump *dump, int numa_id, int core_id) { struct ovs_numa_info_core *c = xzalloc(sizeof *c); struct ovs_numa_info_numa *n; c->numa_id = numa_id; c->core_id = core_id; hmap_insert(&dump->cores, &c->hmap_node, hash_2words(numa_id, core_id)); HMAP_FOR_EACH_WITH_HASH (n, hmap_node, hash_int(numa_id, 0), &dump->numas) { if (n->numa_id == numa_id) { n->n_cores++; return; } } n = xzalloc(sizeof *n); n->numa_id = numa_id; n->n_cores = 1; hmap_insert(&dump->numas, &n->hmap_node, hash_int(numa_id, 0)); } /* Given the 'numa_id', returns dump of all cores on the numa node. */ struct ovs_numa_dump * ovs_numa_dump_cores_on_numa(int numa_id) { struct ovs_numa_dump *dump = ovs_numa_dump_create(); struct numa_node *numa = get_numa_by_numa_id(numa_id); if (numa) { struct cpu_core *core; LIST_FOR_EACH (core, list_node, &numa->cores) { ovs_numa_dump_add(dump, numa->numa_id, core->core_id); } } return dump; } struct ovs_numa_dump * ovs_numa_dump_cores_with_cmask(const char *cmask) { struct ovs_numa_dump *dump = ovs_numa_dump_create(); int core_id = 0; int end_idx; /* Ignore leading 0x. */ end_idx = 0; if (!strncmp(cmask, "0x", 2) || !strncmp(cmask, "0X", 2)) { end_idx = 2; } for (int i = strlen(cmask) - 1; i >= end_idx; i--) { char hex = cmask[i]; int bin; bin = hexit_value(hex); if (bin == -1) { VLOG_WARN("Invalid cpu mask: %c", cmask[i]); bin = 0; } for (int j = 0; j < 4; j++) { if ((bin >> j) & 0x1) { struct cpu_core *core = get_core_by_core_id(core_id); if (core) { ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id); } } core_id++; } } return dump; } struct ovs_numa_dump * ovs_numa_dump_n_cores_per_numa(int cores_per_numa) { struct ovs_numa_dump *dump = ovs_numa_dump_create(); const struct numa_node *n; HMAP_FOR_EACH (n, hmap_node, &all_numa_nodes) { const struct cpu_core *core; int i = 0; LIST_FOR_EACH (core, list_node, &n->cores) { if (i++ >= cores_per_numa) { break; } ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id); } } return dump; } bool ovs_numa_dump_contains_core(const struct ovs_numa_dump *dump, int numa_id, unsigned core_id) { struct ovs_numa_info_core *core; HMAP_FOR_EACH_WITH_HASH (core, hmap_node, hash_2words(numa_id, core_id), &dump->cores) { if (core->core_id == core_id && core->numa_id == numa_id) { return true; } } return false; } size_t ovs_numa_dump_count(const struct ovs_numa_dump *dump) { return hmap_count(&dump->cores); } void ovs_numa_dump_destroy(struct ovs_numa_dump *dump) { struct ovs_numa_info_core *c; struct ovs_numa_info_numa *n; if (!dump) { return; } HMAP_FOR_EACH_POP (c, hmap_node, &dump->cores) { free(c); } HMAP_FOR_EACH_POP (n, hmap_node, &dump->numas) { free(n); } hmap_destroy(&dump->cores); hmap_destroy(&dump->numas); free(dump); } struct ovs_numa_dump * ovs_numa_thread_getaffinity_dump(void) { if (dummy_numa) { /* Nothing to do. */ return NULL; } #ifndef __linux__ return NULL; #else struct ovs_numa_dump *dump; const struct numa_node *n; cpu_set_t cpuset; int err; CPU_ZERO(&cpuset); err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset); if (err) { VLOG_ERR("Thread getaffinity error: %s", ovs_strerror(err)); return NULL; } dump = ovs_numa_dump_create(); HMAP_FOR_EACH (n, hmap_node, &all_numa_nodes) { const struct cpu_core *core; LIST_FOR_EACH (core, list_node, &n->cores) { if (CPU_ISSET(core->core_id, &cpuset)) { ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id); } } } if (!ovs_numa_dump_count(dump)) { ovs_numa_dump_destroy(dump); return NULL; } return dump; #endif /* __linux__ */ } int ovs_numa_thread_setaffinity_dump(const struct ovs_numa_dump *dump) { if (!dump || dummy_numa) { /* Nothing to do. */ return 0; } #ifdef __linux__ const struct ovs_numa_info_core *core; cpu_set_t cpuset; int err; CPU_ZERO(&cpuset); FOR_EACH_CORE_ON_DUMP (core, dump) { CPU_SET(core->core_id, &cpuset); } err = pthread_setaffinity_np(pthread_self(), sizeof cpuset, &cpuset); if (err) { VLOG_ERR("Thread setaffinity error: %s", ovs_strerror(err)); return err; } return 0; #else /* !__linux__ */ return EOPNOTSUPP; #endif /* __linux__ */ } int ovs_numa_thread_setaffinity_core(unsigned core_id) { const struct cpu_core *core = get_core_by_core_id(core_id); struct ovs_numa_dump *affinity = ovs_numa_dump_create(); int ret = EINVAL; if (core) { ovs_numa_dump_add(affinity, core->numa->numa_id, core->core_id); ret = ovs_numa_thread_setaffinity_dump(affinity); } ovs_numa_dump_destroy(affinity); return ret; }