Diffstat (limited to 'lib/dpif-netdev-lookup-avx512-gather.c')
-rw-r--r--  lib/dpif-netdev-lookup-avx512-gather.c  264
1 file changed, 264 insertions, 0 deletions
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
new file mode 100644
index 000000000..12a01a34a
--- /dev/null
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2020, Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+#if !defined(__CHECKER__)
+
+#include <config.h>
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-lookup.h"
+#include "dpif-netdev-private.h"
+#include "cmap.h"
+#include "flow.h"
+#include "pvector.h"
+#include "openvswitch/vlog.h"
+
+#include "immintrin.h"
+
+/* Each AVX512 register (zmm register in assembly notation) can hold up to
+ * 512 bits, which is equivalent to 8 uint64_t variables. This is the
+ * maximum number of miniflow blocks that the AVX512 code can process in a
+ * single pass.
+ */
+#define NUM_U64_IN_ZMM_REG (8)
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
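+
+/* The block cache reserves one zmm-sized slot per packet in the burst:
+ * packet i's gathered miniflow blocks occupy block_cache[i * 8] through
+ * block_cache[i * 8 + 7].
+ */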
+
+VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
+
+static inline __m512i
+_mm512_popcnt_epi64_manual(__m512i v_in)
+{
+ static const uint8_t pop_lut[64] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ };
+ __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
+
+ __m512i v_in_srl4 = _mm512_srli_epi64(v_in, 4);
+ __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
+ __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
+ __m512i v_in_hi = _mm512_and_si512(v_in_srl4, v_nibble_mask);
+
+ __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
+ __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
+ __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
+
+ return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
+}
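+
+/* For illustration, each u64 lane above computes the equivalent of this
+ * scalar loop over sixteen 4-bit nibbles (a sketch, not compiled code):
+ *
+ *   uint64_t cnt = 0;
+ *   for (int i = 0; i < 16; i++) {
+ *       cnt += pop_lut[(x >> (i * 4)) & 0xF];
+ *   }
+ *
+ * _mm512_shuffle_epi8() implements the 16-entry LUT per byte, and
+ * _mm512_sad_epu8() against zero sums the per-byte counts into each u64
+ * lane.
+ */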
+
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule *rule,
+ const uint32_t mf_bits_total,
+ const uint64_t * block_cache)
+{
+ const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+ const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+ const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+
+ /* Always load a full cache line from block_cache. The mask and key loads
+ * must be trimmed to the number of blocks (mf_bits_total) present in the
+ * rule.
+ */
+ __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
+ __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
+ __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
+
+ __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+ uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+
+ /* Returns 1 if the SIMD compare matched for every block in the rule. */
+ return res_mask == lane_mask;
+}
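+
+/* The function above is the vector form of this scalar check (an
+ * illustrative sketch only):
+ *
+ *   for (uint32_t b = 0; b < mf_bits_total; b++) {
+ *       if ((block_cache[b] & maskp[b]) != keyp[b]) {
+ *           return 0;
+ *       }
+ *   }
+ *   return 1;
+ */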
+
+static inline uint32_t ALWAYS_INLINE
+avx512_lookup_impl(struct dpcls_subtable *subtable,
+ uint32_t keys_map,
+ const struct netdev_flow_key *keys[],
+ struct dpcls_rule **rules,
+ const uint32_t bit_count_u0,
+ const uint32_t bit_count_u1)
+{
+ OVS_ALIGNED_VAR(CACHE_LINE_SIZE) uint64_t block_cache[BLOCKS_CACHE_SIZE];
+
+ const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
+ int i;
+ uint32_t hashes[NETDEV_MAX_BURST];
+ const uint32_t n_pkts = __builtin_popcountll(keys_map);
+ ovs_assert(NETDEV_MAX_BURST >= n_pkts);
+
+ const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
+ const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
+
+ /* Load subtable blocks for masking later. */
+ const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
+ const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+
+ /* Load pre-created subtable masks for each block in subtable. */
+ const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
+ const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+ subtable->mf_masks);
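+
+ /* Each lane j of v_mf_masks is assumed to hold a mask of all miniflow
+ * map bits below the j-th set bit of the subtable mask: ANDing it with a
+ * packet's map bits and taking a popcount yields the index of that block
+ * in the packet's miniflow data. This is used per packet below.
+ */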
+
+ ULLONG_FOR_EACH_1 (i, keys_map) {
+ const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
+ const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+
+ /* Pre-create register with *PER PACKET* u0 offset. */
+ const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+ const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
+ pkt_mf_u0_pop);
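+ /* Example, assuming bit_count_u0 == 5: u1_bcast_mask is 0b11100000, so
+ * lanes 5-7 hold pkt_mf_u0_pop and lanes 0-4 are zeroed. Adding this to
+ * the popcounts below offsets the u1 block indexes past the packet's u0
+ * blocks. */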
+
+ /* Broadcast u0, u1 bitmasks to 8x u64 lanes. */
+ __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
+ __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
+ keys[i]->mf.map.bits[1]);
+
+ /* Bitmask by pre-created masks. */
+ __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+
+ /* Manual AVX512 popcount for u64 lanes. */
+ __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+
+ /* Offset popcounts for u1 with pre-created offset register. */
+ __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+
+ /* Gather u64 blocks from packet miniflow. */
+ const __m512i v_zeros = _mm512_setzero_si512();
+ const void *pkt_data = miniflow_get_values(&keys[i]->mf);
+ __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
+ bit_count_total_mask, v_indexes,
+ pkt_data, 8);
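+ /* The gather scale of 8 is sizeof(uint64_t): each lane of v_indexes is
+ * a u64 word offset into the packet's miniflow data, not a byte offset.
+ */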
+
+ /* Zero out bits that pkt doesn't have:
+ * - 2x pext() to extract bits from packet miniflow as needed by TBL
+ * - Shift u1 over by bit_count of u0, OR to create zero bitmask
+ */
+ uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
+ uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+ uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
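+ /* Worked example: if tbl_u0 is 0b1010 (the subtable uses blocks 1 and
+ * 3) and the packet's u0 map is 0b0010 (only block 1 present), then
+ * _pext_u64() packs the bits at positions 1 and 3 down to 0b01, so lane
+ * 0 keeps its block and lane 1 is zeroed below. */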
+
+ /* Mask blocks by ANDing with the subtable blocks, using the k-mask to
+ * zero out lanes for blocks this packet does not have.
+ */
+ __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
+ v_all_blocks, v_tbl_blocks);
+
+ /* Store this packet's blocks to its cache-line-aligned slot in
+ * block_cache. */
+ _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
+ }
+
+ /* Hash the now linearized blocks of packet metadata. */
+ ULLONG_FOR_EACH_1 (i, keys_map) {
+ uint64_t *block_ptr = &block_cache[i * 8];
+ uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
+ hashes[i] = hash_finish(hash, bit_count_total * 8);
+ }
+
+ /* Lookup: this returns a bitmask of packets where the hash table had
+ * an entry for the given hash key. Presence of a hash key does not
+ * guarantee matching the key, as there can be hash collisions.
+ */
+ uint32_t found_map;
+ const struct cmap_node *nodes[NETDEV_MAX_BURST];
+ found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
+
+ /* Verify that packet actually matched rule. If not found, a hash
+ * collision has taken place, so continue searching with the next node.
+ */
+ ULLONG_FOR_EACH_1 (i, found_map) {
+ struct dpcls_rule *rule;
+
+ CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+ const uint32_t cidx = i * 8;
+ uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
+ &block_cache[cidx]);
+ if (OVS_LIKELY(match)) {
+ rules[i] = rule;
+ subtable->hit_cnt++;
+ goto next;
+ }
+ }
+
+ /* None of the found rules was a match. Clear the i-th bit to
+ * search for this key in the next subtable. */
+ ULLONG_SET0(found_map, i);
+ next:
+ ; /* Keep Sparse happy. */
+ }
+
+ return found_map;
+}
+
+/* Expand out specialized functions with U0 and U1 bit attributes. */
+#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1) \
+ static uint32_t \
+ dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable, \
+ uint32_t keys_map, \
+ const struct netdev_flow_key *keys[], \
+ struct dpcls_rule **rules) \
+ { \
+ return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1); \
+ } \
+
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
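+
+/* For example, DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) expands to
+ * dpcls_avx512_gather_mf_5_1(), which calls avx512_lookup_impl() with the
+ * compile-time constants 5 and 1, letting the compiler specialize the
+ * masks and bit-count arithmetic for that subtable shape.
+ */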
+
+/* Check if a specialized function is valid for the required subtable. */
+#define CHECK_LOOKUP_FUNCTION(U0, U1) \
+ ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG); \
+ if (!f && u0_bits == U0 && u1_bits == U1) { \
+ f = dpcls_avx512_gather_mf_##U0##_##U1; \
+ }
+
+static uint32_t
+dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
+ const struct netdev_flow_key *keys[],
+ struct dpcls_rule **rules)
+{
+ return avx512_lookup_impl(subtable, keys_map, keys, rules,
+ subtable->mf_bits_set_unit0,
+ subtable->mf_bits_set_unit1);
+}
+
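+/* Runtime ISA probe: a lookup function is returned only when both AVX512-F
+ * and BMI2 are available and the (u0, u1) block counts fit in a single zmm
+ * register; returning NULL lets the caller fall back to another lookup
+ * implementation.
+ */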
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
+{
+ dpcls_subtable_lookup_func f = NULL;
+
+ int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+ int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+ if (!avx512f_available || !bmi2_available) {
+ return NULL;
+ }
+
+ CHECK_LOOKUP_FUNCTION(5, 1);
+ CHECK_LOOKUP_FUNCTION(4, 1);
+ CHECK_LOOKUP_FUNCTION(4, 0);
+
+ if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
+ f = dpcls_avx512_gather_mf_any;
+ VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)",
+ u0_bits, u1_bits);
+ }
+
+ return f;
+}
+
+#endif /* !__CHECKER__ */
+#endif /* __x86_64__ */