-rw-r--r-- | Documentation/topics/dpdk/pmd.rst |  63
-rw-r--r-- | NEWS                              |   1
-rw-r--r-- | lib/dpif-netdev.c                 | 383
-rw-r--r-- | vswitchd/vswitch.xml              |  41
4 files changed, 488 insertions, 0 deletions
diff --git a/Documentation/topics/dpdk/pmd.rst b/Documentation/topics/dpdk/pmd.rst
index dd9172d65..b0e19d794 100644
--- a/Documentation/topics/dpdk/pmd.rst
+++ b/Documentation/topics/dpdk/pmd.rst
@@ -183,3 +183,66 @@ or can be triggered by using::
 In addition, the output of ``pmd-rxq-show`` was modified to include
 Rx queue utilization of the PMD as a percentage. Prior to this, tracking of
 stats was not available.
+
+Automatic assignment of Port/Rx Queue to PMD Threads (experimental)
+-------------------------------------------------------------------
+
+Cycle- or utilization-based allocation of Rx queues to PMDs gives efficient
+load distribution, but it is not adaptive to changes in the traffic pattern
+occurring over time. This causes uneven load among the PMDs, which results
+in overall lower throughput.
+
+To address this, automatic load balancing of PMDs can be enabled by::
+
+    $ ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
+
+If ``pmd-auto-lb`` is set to true AND cycle based assignment is enabled,
+auto load balancing of PMDs is enabled provided there are two or more
+non-isolated PMDs and at least one of these PMDs is polling more than one
+Rx queue. That is, all of the following conditions need to be met for auto
+load balancing to be enabled:
+
+1. Cycle based assignment of Rx queues to PMDs is enabled.
+2. ``pmd-auto-lb`` is set to true.
+3. There are two or more non-isolated PMDs present.
+4. At least one of the non-isolated PMDs polls more than one queue.
+
+If any of the above is not met, PMD auto load balancing is disabled.
+
+Once auto load balancing is enabled, each non-isolated PMD measures the
+processing load for each of its associated queues every 10 seconds. If the
+aggregated PMD load reaches 95% for 6 consecutive intervals, the PMD
+considers itself to be overloaded.
+
+If any PMD is overloaded, a dry run of the PMD assignment algorithm is
+performed by the OVS main thread. The dry run does NOT change the existing
+queue to PMD assignments.
+
+If the resultant mapping of the dry run indicates an improved distribution
+of the load, the actual reassignment will be performed.
+
+.. note::
+
+    PMD auto load balancing doesn't currently work if queues are assigned
+    across NUMA nodes, as the actual processing load could get worse after
+    reassignment compared to what the dry run predicts.
+
+The minimum time between two consecutive PMD auto load balancing iterations
+can also be configured by::
+
+    $ ovs-vsctl set open_vswitch .\
+        other_config:pmd-auto-lb-rebal-interval="<interval>"
+
+where ``<interval>`` is a value in minutes. The default interval is 1 minute,
+and setting it to 0 also results in the default value, i.e. 1 minute.
+
+A user can use this option to avoid frequent triggering of auto load
+balancing of PMDs, e.g. by setting it (in minutes) such that rebalancing
+occurs at most once every few hours, once a day, or once a week.
+
+.. note::
+    In some scenarios it may not be desirable to trigger auto load balancing
+    at all. For example, if the traffic profile of a specific Rx queue
+    changes dramatically and very frequently, rebalancing would thrash the
+    CPU cache due to the changes required in the dpctl flows and in the EMC
+    for newly added flows. In such scenarios the user should configure the
+    rebalance interval accordingly to avoid frequent rebalancing.
diff --git a/NEWS b/NEWS
--- a/NEWS
+++ b/NEWS
@@ -27,6 +27,7 @@ Post-v2.10.0
      * Add option for simple round-robin based Rxq to PMD assignment.
        It can be set with pmd-rxq-assign.
      * Add support for DPDK 18.11
+     * Add support for Auto load balancing of PMDs (experimental)
    - Add 'symmetric_l3' hash function.
    - OVS now honors 'updelay' and 'downdelay' for bonds with LACP configured.
    - ovs-vswitchd:
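The "95% for 6 consecutive intervals" rule described above is computed from
the PMD's busy and idle cycle counters (see the ``dp_netdev_pmd_try_optimize()``
hunk further below). As a rough illustration, here is a minimal standalone
sketch of that calculation with hypothetical names and fabricated sample
counters — not OVS APIs, just the arithmetic the documentation describes::

    #include <stdint.h>
    #include <stdio.h>

    #define ALB_PMD_LOAD_THRESHOLD 95   /* Same value as in the patch. */
    #define OVERLOAD_INTERVALS      6   /* "6 consecutive intervals". */

    /* Load for one measurement interval from busy/idle cycle deltas. */
    static unsigned int
    interval_load(uint64_t busy_cycles, uint64_t idle_cycles)
    {
        if (!busy_cycles) {
            return 0;
        }
        return (busy_cycles * 100) / (busy_cycles + idle_cycles);
    }

    int
    main(void)
    {
        /* Fabricated per-interval (busy, idle) cycle deltas. */
        uint64_t samples[][2] = {
            { 960, 40 }, { 970, 30 }, { 990, 10 }, { 950, 50 },
            { 980, 20 }, { 990, 10 }, { 400, 600 },
        };
        unsigned int overloaded = 0;

        for (size_t i = 0; i < sizeof samples / sizeof samples[0]; i++) {
            unsigned int load = interval_load(samples[i][0], samples[i][1]);

            /* Mirrors the threshold logic in dp_netdev_pmd_try_optimize():
             * the counter resets as soon as one interval drops below the
             * threshold, so only consecutive overloaded intervals count. */
            overloaded = load >= ALB_PMD_LOAD_THRESHOLD ? overloaded + 1 : 0;
            printf("interval %zu: load %u%%, overloaded intervals %u%s\n",
                   i, load, overloaded,
                   overloaded >= OVERLOAD_INTERVALS
                   ? " -> trigger dry run" : "");
        }
        return 0;
    }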
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1564db9c6..be529b6b0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -80,6 +80,12 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
+/* Auto Load Balancing Defaults */
+#define ALB_ACCEPTABLE_IMPROVEMENT       25
+#define ALB_PMD_LOAD_THRESHOLD           95
+#define ALB_PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
+#define MIN_TO_MSEC                  60000
+
 #define FLOW_DUMP_MAX_BATCH 50
 /* Use per thread recirc_depth to prevent recirculation loop. */
 #define MAX_RECIRC_DEPTH 6
@@ -288,6 +294,13 @@ struct dp_meter {
     struct dp_meter_band bands[];
 };
 
+struct pmd_auto_lb {
+    bool auto_lb_requested;        /* Auto load balancing requested by user. */
+    bool is_enabled;               /* Current status of auto load balancing. */
+    uint64_t rebalance_intvl;      /* Minimum time between rebalances, msec. */
+    uint64_t rebalance_poll_timer; /* Timestamp of the last poll, msec. */
+};
+
 /* Datapath based on the network device interface from netdev.h.
  *
  *
@@ -368,6 +381,7 @@ struct dp_netdev {
     uint64_t last_tnl_conf_seq;
 
     struct conntrack conntrack;
+    struct pmd_auto_lb pmd_alb;
 };
 
 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
@@ -702,6 +716,11 @@ struct dp_netdev_pmd_thread {
     /* Keep track of detailed PMD performance statistics. */
     struct pmd_perf_stats perf_stats;
 
+    /* Stats from previous iteration used by automatic pmd
+     * load balance logic. */
+    uint64_t prev_stats[PMD_N_STATS];
+    atomic_count pmd_overloaded;
+
     /* Set to true if the pmd thread needs to be reloaded. */
     bool need_reload;
 };
@@ -3734,6 +3753,53 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
     }
 }
 
+/* Enable or disable PMD auto load balancing. */
+static void
+set_pmd_auto_lb(struct dp_netdev *dp)
+{
+    unsigned int cnt = 0;
+    struct dp_netdev_pmd_thread *pmd;
+    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
+
+    bool enable_alb = false;
+    bool multi_rxq = false;
+    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
+
+    /* Ensure that there are at least 2 non-isolated PMDs and
+     * one of them is polling more than one rxq. */
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
+            continue;
+        }
+
+        if (hmap_count(&pmd->poll_list) > 1) {
+            multi_rxq = true;
+        }
+        if (cnt && multi_rxq) {
+            enable_alb = true;
+            break;
+        }
+        cnt++;
+    }
+
+    /* Enable auto LB if it is requested and cycle based assignment is on. */
+    enable_alb = enable_alb && pmd_rxq_assign_cyc &&
+                 pmd_alb->auto_lb_requested;
+
+    if (pmd_alb->is_enabled != enable_alb) {
+        pmd_alb->is_enabled = enable_alb;
+        if (pmd_alb->is_enabled) {
+            VLOG_INFO("PMD auto load balance is enabled "
+                      "(with rebalance interval:%"PRIu64" msec)",
+                      pmd_alb->rebalance_intvl);
+        } else {
+            pmd_alb->rebalance_poll_timer = 0;
+            VLOG_INFO("PMD auto load balance is disabled");
+        }
+    }
+}
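Condensed, the four documented enable conditions reduce to a single
predicate. The helper below is a hypothetical restatement for clarity, not
part of the patch; ``n_non_isolated`` and ``max_rxqs_on_one_pmd`` would be
derived from the ``poll_threads`` cmap in the real code::

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical restatement of the conditions checked by
     * set_pmd_auto_lb() above. */
    static bool
    auto_lb_should_enable(bool requested, bool assign_cyc,
                          unsigned int n_non_isolated,
                          unsigned int max_rxqs_on_one_pmd)
    {
        return requested                    /* pmd-auto-lb=true */
               && assign_cyc                /* pmd-rxq-assign=cycles */
               && n_non_isolated >= 2       /* two or more usable PMDs */
               && max_rxqs_on_one_pmd > 1;  /* something left to move */
    }

    int
    main(void)
    {
        /* Two non-isolated PMDs, one of them polling three rxqs. */
        printf("enabled: %d\n", auto_lb_should_enable(true, true, 2, 3));
        /* Only one non-isolated PMD: nothing to rebalance across. */
        printf("enabled: %d\n", auto_lb_should_enable(true, true, 1, 3));
        return 0;
    }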
 
 /* Applies datapath configuration from the database. Some of the changes are
  * actually applied in dpif_netdev_run(). */
 static int
@@ -3748,6 +3814,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
                                           DEFAULT_EM_FLOW_INSERT_INV_PROB);
     uint32_t insert_min, cur_min;
     uint32_t tx_flush_interval, cur_tx_flush_interval;
+    uint64_t rebalance_intvl;
 
     tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
                                      DEFAULT_TX_FLUSH_INTERVAL);
@@ -3819,6 +3886,23 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
                   pmd_rxq_assign);
         dp_netdev_request_reconfigure(dp);
     }
+
+    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
+    pmd_alb->auto_lb_requested = smap_get_bool(other_config, "pmd-auto-lb",
+                                               false);
+
+    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebal-interval",
+                                   ALB_PMD_REBALANCE_POLL_INTERVAL);
+
+    /* Input is in min, convert it to msec. */
+    rebalance_intvl =
+        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
+
+    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
+        pmd_alb->rebalance_intvl = rebalance_intvl;
+    }
+
+    set_pmd_auto_lb(dp);
     return 0;
 }
 
@@ -4762,6 +4846,9 @@ reconfigure_datapath(struct dp_netdev *dp)
 
     /* Reload affected pmd threads. */
     reload_affected_pmds(dp);
+
+    /* Check if PMD auto load balancing is to be enabled. */
+    set_pmd_auto_lb(dp);
 }
 
 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
@@ -4780,6 +4867,241 @@ ports_require_restart(const struct dp_netdev *dp)
     return false;
 }
 
+/* Calculates variance in the values stored in array 'a'. 'n' is the number
+ * of elements in the array to be considered for calculating the variance.
+ * Usage example: data array 'a' contains the processing load of each pmd
+ * and 'n' is the number of PMDs. It returns the variance in the processing
+ * load of the PMDs. */
+static uint64_t
+variance(uint64_t a[], int n)
+{
+    /* Compute mean (average of elements). */
+    uint64_t sum = 0;
+    uint64_t mean = 0;
+    uint64_t sqDiff = 0;
+
+    if (!n) {
+        return 0;
+    }
+
+    for (int i = 0; i < n; i++) {
+        sum += a[i];
+    }
+
+    if (sum) {
+        mean = sum / n;
+
+        /* Compute sum of squared differences from the mean. */
+        for (int i = 0; i < n; i++) {
+            sqDiff += (a[i] - mean) * (a[i] - mean);
+        }
+    }
+    return sqDiff ? (sqDiff / n) : 0;
+}
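To make the improvement criterion concrete, here is a small standalone worked
example with fabricated loads and a local copy of the ``variance()`` helper
above — an illustration, not OVS code. Two PMDs at 96% and 40% load versus a
predicted 67%/68% split clear the 25% improvement threshold easily::

    #include <stdint.h>
    #include <stdio.h>

    #define ALB_ACCEPTABLE_IMPROVEMENT 25  /* Same threshold as the patch. */

    /* Local copy of the variance() helper above. */
    static uint64_t
    variance(uint64_t a[], int n)
    {
        uint64_t sum = 0, mean, sq_diff = 0;

        for (int i = 0; i < n; i++) {
            sum += a[i];
        }
        if (!n || !sum) {
            return 0;
        }
        mean = sum / n;
        for (int i = 0; i < n; i++) {
            sq_diff += (a[i] - mean) * (a[i] - mean);
        }
        return sq_diff / n;
    }

    int
    main(void)
    {
        /* Fabricated loads: one overloaded PMD, one mostly idle. */
        uint64_t curr[] = { 96, 40 };   /* variance = 784 */
        uint64_t pred[] = { 67, 68 };   /* variance = 0 (integer math) */
        uint64_t curr_var = variance(curr, 2);
        uint64_t pred_var = variance(pred, 2);
        uint64_t improvement = 0;

        if (pred_var < curr_var) {
            improvement = (curr_var - pred_var) * 100 / curr_var;
        }
        printf("current %llu, predicted %llu, improvement %llu%% -> %s\n",
               (unsigned long long) curr_var, (unsigned long long) pred_var,
               (unsigned long long) improvement,
               improvement >= ALB_ACCEPTABLE_IMPROVEMENT
               ? "rebalance" : "keep current assignment");
        return 0;
    }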
+
+/* Returns the variance in the PMDs' usage as computed by a dry run of the
+ * rxq assignment to PMDs. */
+static bool
+get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
+                     uint32_t num_pmds, uint64_t *predicted_variance)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dp_netdev_port *port;
+    struct dp_netdev_pmd_thread *pmd;
+    struct dp_netdev_rxq **rxqs = NULL;
+    struct rr_numa *numa = NULL;
+    struct rr_numa_list rr;
+    int n_rxqs = 0;
+    bool ret = false;
+    uint64_t *pmd_usage;
+
+    if (!predicted_variance) {
+        return ret;
+    }
+
+    pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
+
+    HMAP_FOR_EACH (port, node, &dp->ports) {
+        if (!netdev_is_pmd(port->netdev)) {
+            continue;
+        }
+
+        for (int qid = 0; qid < port->n_rxq; qid++) {
+            struct dp_netdev_rxq *q = &port->rxqs[qid];
+            uint64_t cycle_hist = 0;
+
+            if (q->pmd->isolated) {
+                continue;
+            }
+
+            if (n_rxqs == 0) {
+                rxqs = xmalloc(sizeof *rxqs);
+            } else {
+                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
+            }
+
+            /* Sum the queue intervals and store the cycle history. */
+            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
+            }
+            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
+                                     cycle_hist);
+            /* Store the queue. */
+            rxqs[n_rxqs++] = q;
+        }
+    }
+    if (n_rxqs > 1) {
+        /* Sort the queues in order of the processing cycles
+         * they consumed during their last pmd interval. */
+        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
+    }
+    rr_numa_list_populate(dp, &rr);
+
+    for (int i = 0; i < n_rxqs; i++) {
+        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
+        numa = rr_numa_list_lookup(&rr, numa_id);
+        if (!numa) {
+            /* Abort if cross NUMA polling. */
+            VLOG_DBG("PMD auto lb dry run."
+                     " Aborting due to cross-numa polling.");
+            goto cleanup;
+        }
+
+        pmd = rr_numa_get_pmd(numa, true);
+        VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
+                 "to be assigned port \'%s\' rx queue %d "
+                 "(measured processing cycles %"PRIu64").",
+                 pmd->core_id, numa_id,
+                 netdev_rxq_get_name(rxqs[i]->rx),
+                 netdev_rxq_get_queue_id(rxqs[i]->rx),
+                 dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
+
+        for (int id = 0; id < num_pmds; id++) {
+            if (pmd->core_id == core_list[id]) {
+                /* Add the processing cycles of the rxq to the pmd
+                 * polling it. */
+                pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
+                                                          RXQ_CYCLES_PROC_HIST);
+            }
+        }
+    }
+
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        uint64_t total_cycles = 0;
+
+        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
+            continue;
+        }
+
+        /* Get the total pmd cycles for an interval. */
+        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
+        /* Estimate the cycles to cover all intervals. */
+        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+        for (int id = 0; id < num_pmds; id++) {
+            if (pmd->core_id == core_list[id]) {
+                if (pmd_usage[id]) {
+                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
+                }
+                VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
+                         "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
+            }
+        }
+    }
+    *predicted_variance = variance(pmd_usage, num_pmds);
+    ret = true;
+
+cleanup:
+    rr_numa_list_destroy(&rr);
+    free(rxqs);
+    free(pmd_usage);
+    return ret;
+}
+
+/* Does a dry run of the rxq assignment to PMDs and returns true if it gives
+ * a better distribution of the load on the PMDs. */
+static bool
+pmd_rebalance_dry_run(struct dp_netdev *dp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct dp_netdev_pmd_thread *pmd;
+    uint64_t *curr_pmd_usage;
+
+    uint64_t curr_variance;
+    uint64_t new_variance;
+    uint64_t improvement = 0;
+    uint32_t num_pmds;
+    uint32_t *pmd_corelist;
+    struct rxq_poll *poll, *poll_next;
+    bool ret;
+
+    num_pmds = cmap_count(&dp->poll_threads);
+
+    if (num_pmds > 1) {
+        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
+        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
+    } else {
+        return false;
+    }
+
+    num_pmds = 0;
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        uint64_t total_cycles = 0;
+        uint64_t total_proc = 0;
+
+        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
+            continue;
+        }
+
+        /* Get the total pmd cycles for an interval. */
+        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
+        /* Estimate the cycles to cover all intervals. */
+        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+
+        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
+            uint64_t proc_cycles = 0;
+            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
+            }
+            total_proc += proc_cycles;
+        }
+        if (total_proc) {
+            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
+        }
+
+        VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
+                 pmd->core_id, curr_pmd_usage[num_pmds]);
+
+        if (atomic_count_get(&pmd->pmd_overloaded)) {
+            atomic_count_set(&pmd->pmd_overloaded, 0);
+        }
+
+        pmd_corelist[num_pmds] = pmd->core_id;
+        num_pmds++;
+    }
+
+    curr_variance = variance(curr_pmd_usage, num_pmds);
+    ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
+
+    if (ret) {
+        VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
+                 " Predicted PMD variance: %"PRIu64"",
+                 curr_variance, new_variance);
+
+        if (new_variance < curr_variance) {
+            improvement =
+                ((curr_variance - new_variance) * 100) / curr_variance;
+        }
+        if (improvement < ALB_ACCEPTABLE_IMPROVEMENT) {
+            ret = false;
+        }
+    }
+
+    free(curr_pmd_usage);
+    free(pmd_corelist);
+    return ret;
+}
+
 /* Return true if needs to revalidate datapath flows. */
 static bool
 dpif_netdev_run(struct dpif *dpif)
@@ -4789,6 +5111,9 @@ dpif_netdev_run(struct dpif *dpif)
     struct dp_netdev_pmd_thread *non_pmd;
     uint64_t new_tnl_seq;
     bool need_to_flush = true;
+    bool pmd_rebalance = false;
+    long long int now = time_msec();
+    struct dp_netdev_pmd_thread *pmd;
 
     ovs_mutex_lock(&dp->port_mutex);
     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
@@ -4821,6 +5146,32 @@ dpif_netdev_run(struct dpif *dpif)
         dp_netdev_pmd_unref(non_pmd);
     }
 
+    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
+    if (pmd_alb->is_enabled) {
+        if (!pmd_alb->rebalance_poll_timer) {
+            pmd_alb->rebalance_poll_timer = now;
+        } else if ((pmd_alb->rebalance_poll_timer +
+                    pmd_alb->rebalance_intvl) < now) {
+            pmd_alb->rebalance_poll_timer = now;
+            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+                if (atomic_count_get(&pmd->pmd_overloaded) >=
+                    PMD_RXQ_INTERVAL_MAX) {
+                    pmd_rebalance = true;
+                    break;
+                }
+            }
+
+            if (pmd_rebalance &&
+                !dp_netdev_is_reconf_required(dp) &&
+                !ports_require_restart(dp) &&
+                pmd_rebalance_dry_run(dp)) {
+                VLOG_INFO("PMD auto lb dry run."
+                          " requesting datapath reconfigure.");
+                dp_netdev_request_reconfigure(dp);
+            }
+        }
+    }
+
     if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
         reconfigure_datapath(dp);
     }
+ " requesting datapath reconfigure."); + dp_netdev_request_reconfigure(dp); + } + } + } + if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) { reconfigure_datapath(dp); } @@ -4979,6 +5330,8 @@ pmd_thread_main(void *f_) reload: pmd_alloc_static_tx_qid(pmd); + atomic_count_init(&pmd->pmd_overloaded, 0); + /* List port/core affinity */ for (i = 0; i < poll_cnt; i++) { VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n", @@ -7188,9 +7541,39 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd, struct polled_queue *poll_list, int poll_cnt) { struct dpcls *cls; + uint64_t tot_idle = 0, tot_proc = 0; + unsigned int pmd_load = 0; if (pmd->ctx.now > pmd->rxq_next_cycle_store) { uint64_t curr_tsc; + struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb; + if (pmd_alb->is_enabled && !pmd->isolated + && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] >= + pmd->prev_stats[PMD_CYCLES_ITER_IDLE]) + && (pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] >= + pmd->prev_stats[PMD_CYCLES_ITER_BUSY])) + { + tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] - + pmd->prev_stats[PMD_CYCLES_ITER_IDLE]; + tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] - + pmd->prev_stats[PMD_CYCLES_ITER_BUSY]; + + if (tot_proc) { + pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc)); + } + + if (pmd_load >= ALB_PMD_LOAD_THRESHOLD) { + atomic_count_inc(&pmd->pmd_overloaded); + } else { + atomic_count_set(&pmd->pmd_overloaded, 0); + } + } + + pmd->prev_stats[PMD_CYCLES_ITER_IDLE] = + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]; + pmd->prev_stats[PMD_CYCLES_ITER_BUSY] = + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY]; + /* Get the cycles that were used to process each queue and store. */ for (unsigned i = 0; i < poll_cnt; i++) { uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq, diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml index 6f8095882..d58f63228 100644 --- a/vswitchd/vswitch.xml +++ b/vswitchd/vswitch.xml @@ -574,6 +574,47 @@ be set to 'skip_sw'. </p> </column> + <column name="other_config" key="pmd-auto-lb" + type='{"type": "boolean"}'> + <p> + Configures PMD Auto Load Balancing that allows automatic assignment of + RX queues to PMDs if any of PMDs is overloaded (i.e. processing cycles + > 95%). + </p> + <p> + It uses current scheme of cycle based assignment of RX queues that + are not statically pinned to PMDs. + </p> + <p> + The default value is <code>false</code>. + </p> + <p> + Set this value to <code>true</code> to enable this option. It is + currently disabled by default and an experimental feature. + </p> + <p> + This only comes in effect if cycle based assignment is enabled and + there are more than one non-isolated PMDs present and at least one of + it polls more than one queue. + </p> + </column> + <column name="other_config" key="pmd-auto-lb-rebal-interval" + type='{"type": "integer", + "minInteger": 0, "maxInteger": 20000}'> + <p> + The minimum time (in minutes) 2 consecutive PMD Auto Load Balancing + iterations. + </p> + <p> + The defaul value is 1 min. If configured to 0 then it would be + converted to default value i.e. 1 min + </p> + <p> + This option can be configured to avoid frequent trigger of auto load + balancing of PMDs. For e.g. set the value (in min) such that it occurs + once in few hours or a day or a week. + </p> + </column> </group> <group title="Status"> <column name="next_cfg"> |