/*
 * Copyright (c) 2021 Intel Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifdef __x86_64__
/* Sparse cannot handle the AVX512 instructions. */
#if !defined(__CHECKER__)

#include <config.h>

#include "dpif-netdev.h"
#include "dpif-netdev-perf.h"
#include "dpif-netdev-private.h"

#include <immintrin.h>

#include "dp-packet.h"
#include "netdev.h"
#include "netdev-offload.h"

/* Each AVX512 register (zmm register in assembly notation) can contain up to
 * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
 * number of miniflow blocks that can be processed in a single pass of the
 * AVX512 code at a time.
 */
#define NUM_U64_IN_ZMM_REG (8)

/* Structure to contain per-packet metadata that must be attributed to the
 * dp netdev flow. It is unfortunate to have to track this per packet, but
 * maintaining these fields in a performant way is otherwise awkward. This
 * structure helps to keep two variables on a single cache line per packet.
 */
struct pkt_flow_meta {
    uint16_t bytes;
    uint16_t tcp_flags;
};

/* Structure of heap allocated memory for DPIF internals. */
struct dpif_userdata {
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
        struct netdev_flow_key keys[NETDEV_MAX_BURST];
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
};

int32_t
dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
                             struct dp_packet_batch *packets,
                             odp_port_t in_port)
{
    /* Allocate DPIF userdata. */
    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
        pmd->netdev_input_func_userdata =
            xmalloc_pagealign(sizeof(struct dpif_userdata));
    }

    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
    struct netdev_flow_key *keys = ud->keys;
    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;

    /* The AVX512 DPIF implementation handles rules in a way that is
     * optimized for reducing data-movement between HWOL/EMC/SMC and DPCLS.
     * This is achieved by separating the rule arrays. Bitmasks are kept for
     * each packet, indicating if it matched in the HWOL/EMC/SMC array or
     * DPCLS array. Later the two arrays are merged by AVX-512 expand
     * instructions.
     */

    /* Stores the computed output: a rule pointer for each packet. */
    /* Used initially for HWOL/EMC/SMC and Simple Match. */
    struct dpcls_rule *rules[NETDEV_MAX_BURST];
    /* Used for DPCLS. */
    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];

    uint32_t dpcls_key_idx = 0;

    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
    }

    const size_t batch_size = dp_packet_batch_size(packets);
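
    /* Sizing note: with NETDEV_MAX_BURST at its usual value of 32, the
     * zeroing loop above issues four 64-byte zmm stores per array (32
     * pointers, 8 per register, assuming 8-byte pointers). A NULL entry in
     * 'rules' means no flow has been found for that packet yet; each lookup
     * stage below only fills in the entries for the packets it matched. */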

    /* Prefetch 2 packets ahead when processing. This was found to perform
     * best through testing. */
    const uint32_t prefetch_ahead = 2;
    const uint32_t initial_prefetch = MIN(prefetch_ahead, batch_size);
    for (int i = 0; i < initial_prefetch; i++) {
        struct dp_packet *packet = packets->packets[i];
        OVS_PREFETCH(dp_packet_data(packet));
        pkt_metadata_prefetch_init(&packet->md);
    }

    const bool simple_match_enabled =
        dp_netdev_simple_match_enabled(pmd, in_port);
    /* Check if EMC or SMC are enabled. */
    struct dfc_cache *cache = &pmd->flow_cache;
    const uint32_t hwol_enabled = netdev_is_flow_api_enabled();
    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;

    uint32_t n_simple_hit = 0;
    uint32_t emc_hits = 0;
    uint32_t smc_hits = 0;
    uint32_t phwol_hits = 0;

    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
    uint32_t hwol_emc_smc_hitmask = 0;
    uint32_t smc_hitmask = 0;

    /* The below while loop is based on the 'iter' variable which has a
     * number of bits set representing packets that we want to process
     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to
     * 0) the bit representing that packet using '_blsr_u64()'. The
     * 'raw_ctz()' will give us the correct index into the 'packets',
     * 'pkt_meta', 'keys' and 'rules' arrays.
     *
     * For one iteration of the while loop, here's some pseudocode as an
     * example where 'iter' is represented in binary:
     *
     * while (iter) { // iter = 1100
     *     uint32_t i = raw_ctz(iter); // i = 2
     *     iter = _blsr_u64(iter);     // iter = 1000
     *     // do all processing (HWOL->MFEX->EMC->SMC)
     * }
     */
    uint32_t lookup_pkts_bitmask = (UINT64_C(1) << batch_size) - 1;

    if (simple_match_enabled) {
        struct dp_packet *packet;

        DP_PACKET_BATCH_FOR_EACH (i, packet, packets) {
            struct dp_netdev_flow *f = NULL;
            ovs_be16 vlan_tci = 0;
            ovs_be16 dl_type = 0;
            uint8_t nw_frag = 0;

            if (i + prefetch_ahead < batch_size) {
                struct dp_packet **dp_packets = packets->packets;
                /* Prefetch next packet data and metadata. */
                OVS_PREFETCH(dp_packet_data(dp_packets[i + prefetch_ahead]));
                pkt_metadata_prefetch_init(
                    &dp_packets[i + prefetch_ahead]->md);
            }

            pkt_metadata_init(&packet->md, in_port);

            pkt_meta[i].tcp_flags = parse_tcp_flags(packet, &dl_type,
                                                    &nw_frag, &vlan_tci);

            f = dp_netdev_simple_match_lookup(pmd, in_port, dl_type,
                                              nw_frag, vlan_tci);
            if (!f) {
                /* Any miss in Simple Match means an upcall is needed. Fall
                 * back to the scalar DPIF to do this. */
                return -1;
            }

            pkt_meta[i].bytes = dp_packet_size(packet);
            rules[i] = &f->cr;
            n_simple_hit++;
            hwol_emc_smc_hitmask |= (UINT32_C(1) << i);
        }

        goto action_stage;
    }

    /* Do a batch miniflow extract into keys. */
    uint32_t mf_mask = 0;
    miniflow_extract_func mfex_func;
    atomic_read_relaxed(&pmd->miniflow_extract_opt, &mfex_func);
    if (mfex_func) {
        mf_mask = mfex_func(packets, keys, batch_size, in_port, pmd);
    }
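
    /* Example of the mask semantics: if the optimized MFEX classified
     * packets 0, 1 and 3 of a four packet batch, 'mf_mask' is 0b1011 and
     * packet 2 falls back to the scalar miniflow_extract() in the loop
     * below. */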

    uint32_t iter = lookup_pkts_bitmask;
    while (iter) {
        uint32_t i = raw_ctz(iter);
        iter = _blsr_u64(iter);

        if (i + prefetch_ahead < batch_size) {
            struct dp_packet **dp_packets = packets->packets;
            /* Prefetch next packet data and metadata. */
            OVS_PREFETCH(dp_packet_data(dp_packets[i + prefetch_ahead]));
            pkt_metadata_prefetch_init(&dp_packets[i + prefetch_ahead]->md);
        }

        /* Get packet pointer from bitmask and packet md. */
        struct dp_packet *packet = packets->packets[i];
        pkt_metadata_init(&packet->md, in_port);

        struct dp_netdev_flow *f = NULL;
        struct netdev_flow_key *key = &keys[i];

        /* Check the miniflow mask to see if the packet was correctly
         * classified by the vector MFEX; otherwise do a scalar miniflow
         * extract for that packet. */
        bool mfex_hit = !!(mf_mask & (UINT32_C(1) << i));

        /* Check for a partial hardware offload match. */
        if (hwol_enabled) {
            if (OVS_UNLIKELY(dp_netdev_hw_flow(pmd, packet, &f))) {
                /* Packet restoration failed and it was dropped, do not
                 * continue processing. */
                continue;
            }
            if (f) {
                rules[i] = &f->cr;
                /* If AVX512 MFEX already classified the packet, use it. */
                if (mfex_hit) {
                    pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
                } else {
                    pkt_meta[i].tcp_flags = parse_tcp_flags(packet, NULL,
                                                            NULL, NULL);
                }

                pkt_meta[i].bytes = dp_packet_size(packet);
                phwol_hits++;
                hwol_emc_smc_hitmask |= (UINT32_C(1) << i);
                continue;
            }
        }

        if (!mfex_hit) {
            /* Do a scalar miniflow extract into keys. */
            miniflow_extract(packet, &key->mf);
        }

        /* Cache TCP and byte values for all packets. */
        pkt_meta[i].bytes = dp_packet_size(packet);
        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);

        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
        key->hash =
            dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);

        if (emc_enabled) {
            f = emc_lookup(&cache->emc_cache, key);
            if (f) {
                rules[i] = &f->cr;
                emc_hits++;
                hwol_emc_smc_hitmask |= (UINT32_C(1) << i);
                continue;
            }
        }

        if (smc_enabled) {
            f = smc_lookup_single(pmd, packet, key);
            if (f) {
                rules[i] = &f->cr;
                smc_hits++;
                smc_hitmask |= (UINT32_C(1) << i);
                continue;
            }
        }

        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
         * dpcls input keys array for batch lookup later. */
        key_ptrs[dpcls_key_idx] = &keys[i];
        dpcls_key_idx++;
    }

    hwol_emc_smc_hitmask |= smc_hitmask;
    uint32_t hwol_emc_smc_missmask = ~hwol_emc_smc_hitmask;

    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
     * key_ptrs[] for input miniflows to match, storing results in the
     * dpcls_rules[] array. */
    if (dpcls_key_idx > 0) {
        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
        if (OVS_UNLIKELY(!cls)) {
            return -1;
        }
        bool any_miss =
            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
                          dpcls_rules, dpcls_key_idx, NULL);
        if (OVS_UNLIKELY(any_miss)) {
            return -1;
        }

        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
        uint32_t dpcls_idx = 0;
        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
            /* Indexing here is somewhat complicated due to DPCLS output rule
             * load index depending on the hitmask of HWOL/EMC/SMC. More
             * packets hit in the HWOL/EMC/SMC bitmask means fewer DPCLS
             * rules are used. For example, if lanes 0 and 2 hit earlier
             * (hitmask 0b00000101), the expand-load below fills lanes 1 and
             * 3..7 with the next six consecutive DPCLS results. */
            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
            __m512i v_merged_rules =
                _mm512_mask_expandloadu_epi64(v_cache_rules,
                                              ~hwol_emc_smc_hitmask,
                                              &dpcls_rules[dpcls_idx]);
            _mm512_storeu_si512(&rules[i], v_merged_rules);

            /* Update DPCLS load index and bitmask for HWOL/EMC/SMC hits.
             * There are NUM_U64_IN_ZMM_REG output pointers per register;
             * subtracting the HWOL/EMC/SMC hit lanes gives the number of
             * DPCLS rules consumed. */
            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
            dpcls_idx += NUM_U64_IN_ZMM_REG -
                         __builtin_popcountll(hitmask_FF);
            hwol_emc_smc_hitmask =
                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
        }
    }

    /* At this point we have a 1:1 pkt to rules mapping, so update EMC/SMC
     * if required. */

    /* Insert SMC and DPCLS hits into EMC. */
    if (emc_enabled) {
        uint32_t emc_insert_mask = smc_hitmask | hwol_emc_smc_missmask;
        emc_insert_mask &= lookup_pkts_bitmask;
        emc_probabilistic_insert_batch(pmd, keys, &rules[0],
                                       emc_insert_mask);
    }
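
    /* Mask walk-through: a packet that hit in SMC ('smc_hitmask') or that
     * was just resolved by DPCLS ('hwol_emc_smc_missmask') is a candidate
     * for probabilistic EMC insertion; packets that already hit in EMC or
     * in partial hardware offload are excluded by construction. */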

    /* Insert DPCLS hits into SMC. */
    if (smc_enabled) {
        uint32_t smc_insert_mask = hwol_emc_smc_missmask;
        smc_insert_mask &= lookup_pkts_bitmask;
        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
    }

    /* At this point we don't return error anymore, so commit stats here. */
    uint32_t mfex_hit_cnt = __builtin_popcountll(mf_mask);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_PHWOL_HIT,
                            phwol_hits);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MFEX_OPT_HIT,
                            mfex_hit_cnt);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
                            dpcls_key_idx);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
                            dpcls_key_idx);

action_stage:
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SIMPLE_HIT,
                            n_simple_hit);

    /* Initialize the "Action Batch" for each flow handled below. */
    struct dp_packet_batch action_batch;
    action_batch.trunc = 0;

    while (lookup_pkts_bitmask) {
        uint32_t rule_pkt_idx = raw_ctz(lookup_pkts_bitmask);
        uint64_t needle = (uintptr_t) rules[rule_pkt_idx];

        /* Compare NUM_U64_IN_ZMM_REG rule pointers against the needle in
         * parallel, building a bitmask of all packets that share this
         * flow. */
        uint32_t batch_bitmask = 0;
        for (uint32_t j = 0; j < NETDEV_MAX_BURST; j += NUM_U64_IN_ZMM_REG) {
            /* Pre-calculate store addr. */
            uint32_t num_pkts_in_batch = __builtin_popcountll(batch_bitmask);
            void *store_addr = &action_batch.packets[num_pkts_in_batch];

            /* Search for identical flow* in burst, update bitmask. */
            __m512i v_needle = _mm512_set1_epi64(needle);
            __m512i v_hay = _mm512_loadu_si512(&rules[j]);
            __mmask8 k_cmp_bits = _mm512_cmpeq_epi64_mask(v_needle, v_hay);
            uint32_t cmp_bits = k_cmp_bits;
            batch_bitmask |= cmp_bits << j;

            /* Compress and store the batched packets. */
            struct dp_packet **packets_ptrs = &packets->packets[j];
            __m512i v_pkt_ptrs = _mm512_loadu_si512(packets_ptrs);
            _mm512_mask_compressstoreu_epi64(store_addr, cmp_bits,
                                             v_pkt_ptrs);
        }

        /* Strip all packets in this batch from the lookup_pkts_bitmask. */
        lookup_pkts_bitmask &= (~batch_bitmask);
        action_batch.count = __builtin_popcountll(batch_bitmask);

        /* Loop over all packets in this batch, to gather the byte and
         * tcp_flag values, and pass them to the execute function. It would
         * be nice to optimize this away, however it is not easy to refactor
         * in dpif. */
        uint32_t bytes = 0;
        uint16_t tcp_flags = 0;
        uint32_t bitmask_iter = batch_bitmask;
        for (int i = 0; i < action_batch.count; i++) {
            uint32_t idx = raw_ctz(bitmask_iter);
            bitmask_iter = _blsr_u64(bitmask_iter);

            bytes += pkt_meta[idx].bytes;
            tcp_flags |= pkt_meta[idx].tcp_flags;
        }

        dp_netdev_batch_execute(pmd, &action_batch, rules[rule_pkt_idx],
                                bytes, tcp_flags);
    }

    return 0;
}

#endif /* !__CHECKER__ */
#endif /* __x86_64__ */
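
/* Usage sketch (an assumption about the surrounding framework, not defined
 * in this file): dpif-netdev calls the registered input function once per
 * rx batch and treats a -1 return as a request to fall back to the scalar
 * dp_netdev_input() path, which performs the upcall handling this
 * implementation deliberately avoids. In OVS releases that ship this file,
 * the AVX512 DPIF can typically be selected at runtime with:
 *
 *     ovs-appctl dpif-netdev/dpif-impl-set dpif_avx512
 */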