dpif-netdev: Calculate per numa variance.

Currently, pmd_rebalance_dry_run() calculate overall variance of all pmds regardless of their numa location. The overall result may hide un-balance in an individual numa. Considering the following case. Numa0 is free because VMs on numa0 are not sending pkts, while numa1 is busy. Within numa1, pmds workloads are not balanced. Obviously, moving 500 kpps workloads from pmd 126 to pmd 62 will make numa1 much more balance. For numa1 the variance improvement will be almost 100%, because after rebalance each pmd in numa1 holds same workload(variance ~= 0). But the overall variance improvement is only about 20%, which may not trigger auto_lb. ``` numa_id core_id kpps 0 30 0 0 31 0 0 94 0 0 95 0 1 126 1500 1 127 1000 1 63 1000 1 62 500 ``` As auto_lb doesn't balance workload across numa nodes. So it makes more sense to calculate variance improvement per numa node. Signed-off-by: Cheng Li <lic121@chinatelecom.cn> Signed-off-by: Kevin Traynor <ktraynor@redhat.com> Co-authored-by: Kevin Traynor <ktraynor@redhat.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
author: Cheng Li <lic121@chinatelecom.cn> 2022-12-17 13:15:36 +0000
committer: Ilya Maximets <i.maximets@ovn.org> 2022-12-21 22:15:47 +0100
commit: 46e04ec31bb2b889bd5715d436be2bdc0268f08b (patch)
tree: 39e1666a32dc45bfef90a65ec22864159f5004f6
parent: ad6e506fcb63e34f3398c5284cb2bd1858ac3a49 (diff)
download: openvswitch-46e04ec31bb2b889bd5715d436be2bdc0268f08b.tar.gz
2 files changed, 47 insertions, 48 deletions
diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
index 88457f366..9006fd40f 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -291,10 +291,10 @@ If a PMD core is detected to be above the load threshold and the minimum
 pre-requisites are met, a dry-run using the current PMD assignment algorithm is
 performed.
 
-The current variance of load between the PMD cores and estimated variance from
-the dry-run are both calculated. If the estimated dry-run variance is improved
-from the current one by the variance threshold, a new Rx queue to PMD
-assignment will be performed.
+For each numa node, the current variance of load between the PMD cores and
+estimated variance from the dry-run are both calculated. If any numa's
+estimated dry-run variance is improved from the current one by the variance
+threshold, a new Rx queue to PMD assignment will be performed.
 
 For example, to set the variance improvement threshold to 40%::
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c015fb6dd..7127068fe 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -6131,39 +6131,33 @@ rxq_scheduling(struct dp_netdev *dp)
 static uint64_t variance(uint64_t a[], int n);
 
 static uint64_t
-sched_numa_list_variance(struct sched_numa_list *numa_list)
+sched_numa_variance(struct sched_numa *numa)
 {
-    struct sched_numa *numa;
     uint64_t *percent_busy = NULL;
-    unsigned total_pmds = 0;
     int n_proc = 0;
     uint64_t var;
 
-    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
-        total_pmds += numa->n_pmds;
-        percent_busy = xrealloc(percent_busy,
-                                total_pmds * sizeof *percent_busy);
+    percent_busy = xmalloc(numa->n_pmds * sizeof *percent_busy);
 
-        for (unsigned i = 0; i < numa->n_pmds; i++) {
-            struct sched_pmd *sched_pmd;
-            uint64_t total_cycles = 0;
+    for (unsigned i = 0; i < numa->n_pmds; i++) {
+        struct sched_pmd *sched_pmd;
+        uint64_t total_cycles = 0;
 
-            sched_pmd = &numa->pmds[i];
-            /* Exclude isolated PMDs from variance calculations. */
-            if (sched_pmd->isolated == true) {
-                continue;
-            }
-            /* Get the total pmd cycles for an interval. */
-            atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
-
-            if (total_cycles) {
-                /* Estimate the cycles to cover all intervals. */
-                total_cycles *= PMD_INTERVAL_MAX;
-                percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
-                                             / total_cycles;
-            } else {
-                percent_busy[n_proc++] = 0;
-            }
+        sched_pmd = &numa->pmds[i];
+        /* Exclude isolated PMDs from variance calculations. */
+        if (sched_pmd->isolated == true) {
+            continue;
+        }
+        /* Get the total pmd cycles for an interval. */
+        atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
+
+        if (total_cycles) {
+            /* Estimate the cycles to cover all intervals. */
+            total_cycles *= PMD_INTERVAL_MAX;
+            percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
+                                            / total_cycles;
+        } else {
+            percent_busy[n_proc++] = 0;
         }
     }
     var = variance(percent_busy, n_proc);
@@ -6237,6 +6231,7 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
     struct sched_numa_list numa_list_est;
     bool thresh_met = false;
     uint64_t current_var, estimate_var;
+    struct sched_numa *numa_cur, *numa_est;
     uint64_t improvement = 0;
 
     VLOG_DBG("PMD auto load balance performing dry run.");
@@ -6255,25 +6250,29 @@ pmd_rebalance_dry_run(struct dp_netdev *dp)
             sched_numa_list_count(&numa_list_est) == 1) {
 
         /* Calculate variances. */
-        current_var = sched_numa_list_variance(&numa_list_cur);
-        estimate_var = sched_numa_list_variance(&numa_list_est);
-
-        if (estimate_var < current_var) {
-             improvement = ((current_var - estimate_var) * 100) / current_var;
-        }
-        VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64".",
-                 current_var, estimate_var);
-        VLOG_DBG("Variance improvement %"PRIu64"%%.", improvement);
-
-        if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
-            thresh_met = true;
-            VLOG_DBG("PMD load variance improvement threshold %u%% "
-                     "is met.", dp->pmd_alb.rebalance_improve_thresh);
-        } else {
-            VLOG_DBG("PMD load variance improvement threshold "
-                     "%u%% is not met.",
-                      dp->pmd_alb.rebalance_improve_thresh);
+        HMAP_FOR_EACH (numa_cur, node, &numa_list_cur.numas) {
+            numa_est = sched_numa_list_lookup(&numa_list_est,
+                                              numa_cur->numa_id);
+            if (!numa_est) {
+                continue;
+            }
+            current_var = sched_numa_variance(numa_cur);
+            estimate_var = sched_numa_variance(numa_est);
+            if (estimate_var < current_var) {
+                improvement = ((current_var - estimate_var) * 100)
+                              / current_var;
+            }
+            VLOG_DBG("Numa node %d. Current variance %"PRIu64" Estimated "
+                     "variance %"PRIu64". Variance improvement %"PRIu64"%%.",
+                     numa_cur->numa_id, current_var,
+                     estimate_var, improvement);
+            if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
+                thresh_met = true;
+            }
         }
+        VLOG_DBG("PMD load variance improvement threshold %u%% is %s.",
+                 dp->pmd_alb.rebalance_improve_thresh,
+                 thresh_met ? "met" : "not met");
     } else {
         VLOG_DBG("PMD auto load balance detected cross-numa polling with "
                  "multiple numa nodes. Unable to accurately estimate.");
author	Cheng Li <lic121@chinatelecom.cn>	2022-12-17 13:15:36 +0000
committer	Ilya Maximets <i.maximets@ovn.org>	2022-12-21 22:15:47 +0100
commit	46e04ec31bb2b889bd5715d436be2bdc0268f08b (patch)
tree	39e1666a32dc45bfef90a65ec22864159f5004f6
parent	ad6e506fcb63e34f3398c5284cb2bd1858ac3a49 (diff)
download	openvswitch-46e04ec31bb2b889bd5715d436be2bdc0268f08b.tar.gz