-rw-r--r--  configure.ac                              3
-rw-r--r--  lib/automake.mk                          21
-rw-r--r--  lib/dpif-netdev-lookup-avx512-gather.c  264
-rw-r--r--  lib/dpif-netdev-lookup.c                 20
-rw-r--r--  lib/dpif-netdev-lookup.h                  4
-rw-r--r--  m4/openvswitch.m4                        30
6 files changed, 342 insertions(+), 0 deletions(-)
diff --git a/configure.ac b/configure.ac
index 4a6995ea8..da76cd8a5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,10 +178,13 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], [HAVE_WNO_UNUSED_PARAMETER])
+OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
+OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])
OVS_ENABLE_WERROR
OVS_ENABLE_SPARSE
OVS_CTAGS_IDENTIFIERS
OVS_CHECK_DPCLS_AUTOVALIDATOR
+OVS_CHECK_BINUTILS_AVX512
AC_ARG_VAR(KARCH, [Kernel Architecture String])
AC_SUBST(KARCH)
diff --git a/lib/automake.mk b/lib/automake.mk
index 1fc1a209e..eca448a5a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -11,6 +11,7 @@ lib_libopenvswitch_la_LIBADD = $(SSL_LIBS)
lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD)
lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD)
+
if WIN32
lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS}
endif
@@ -20,6 +21,26 @@ lib_libopenvswitch_la_LDFLAGS = \
-Wl,--version-script=$(top_builddir)/lib/libopenvswitch.sym \
$(AM_LDFLAGS)
+if HAVE_AVX512F
+# Build the library of AVX512 code with CPU ISA CFLAGS enabled. This allows
+# the compiler to use the ISA features required for the ISA-optimized
+# code-paths. Use LDFLAGS to build only a static library of this code, as it
+# must be statically linked into vswitchd even if vswitchd is a shared build.
+lib_LTLIBRARIES += lib/libopenvswitchavx512.la
+lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
+lib_libopenvswitchavx512_la_CFLAGS = \
+ -mavx512f \
+ -mavx512bw \
+ -mavx512dq \
+ -mbmi2 \
+ $(AM_CFLAGS)
+lib_libopenvswitchavx512_la_SOURCES = \
+ lib/dpif-netdev-lookup-avx512-gather.c
+lib_libopenvswitchavx512_la_LDFLAGS = \
+ -static
+endif
+
+# Build core vswitch libraries as before
lib_libopenvswitch_la_SOURCES = \
lib/aes128.c \
lib/aes128.h \
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
new file mode 100644
index 000000000..12a01a34a
--- /dev/null
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2020, Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+#if !defined(__CHECKER__)
+
+#include <config.h>
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-lookup.h"
+#include "dpif-netdev-private.h"
+#include "cmap.h"
+#include "flow.h"
+#include "pvector.h"
+#include "openvswitch/vlog.h"
+
+#include "immintrin.h"
+
+/* Each AVX512 register (zmm register in assembly notation) is 512 bits wide,
+ * the equivalent of 8 uint64_t variables. This is therefore the maximum
+ * number of miniflow blocks that the AVX512 code can process in a single
+ * pass.
+ */
+#define NUM_U64_IN_ZMM_REG (8)
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+
+VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
+
+static inline __m512i
+_mm512_popcnt_epi64_manual(__m512i v_in)
+{
+ static const uint8_t pop_lut[64] = {
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+ };
+ __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
+
+ __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
+ __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
+ __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
+ __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
+
+ __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
+ __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
+ __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
+
+ return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
+}
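For reference, a scalar sketch of the nibble-LUT popcount that the routine
above vectorizes (illustrative only, not part of the patch). The native
vpopcntq instruction requires AVX512VPOPCNTDQ, which is not assumed here, so
the code emulates it: each byte is split into low and high nibbles, both are
looked up in a 16-entry popcount table, and the per-byte counts are summed
across each 64-bit lane (the _mm512_sad_epu8 step above):

    #include <stdint.h>

    static uint64_t
    popcnt_u64_nibble_lut(uint64_t v)
    {
        static const uint8_t pop_lut[16] = {
            0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        };
        uint64_t count = 0;
        int i;

        /* One byte per iteration; the SIMD version handles all 64 bytes of
         * the zmm register in parallel. */
        for (i = 0; i < 8; i++) {
            uint8_t byte = (uint8_t) (v >> (i * 8));
            count += pop_lut[byte & 0xF] + pop_lut[byte >> 4];
        }
        return count;
    }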
+
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule *rule,
+ const uint32_t mf_bits_total,
+ const uint64_t * block_cache)
+{
+ const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+ const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+ const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+
+    /* Always load a full cache line from block_cache. Other loads must be
+     * trimmed to the amount of data required for mf_bits_total blocks.
+     */
+ __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
+ __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
+ __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
+
+ __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+ uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+
+    /* Return 1 only if the SIMD compare matched on all blocks in the mask. */
+ return res_mask == lane_mask;
+}
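The masked SIMD compare above is equivalent to the following scalar loop
(illustrative sketch, not part of the patch): a rule matches only when every
miniflow block of the packet, ANDed with the rule's mask, equals the rule's
key value.

    #include <stdint.h>

    static uint64_t
    scalar_rule_matches_key(const uint64_t *keyp, const uint64_t *maskp,
                            const uint64_t *block_cache,
                            uint32_t mf_bits_total)
    {
        uint32_t i;

        for (i = 0; i < mf_bits_total; i++) {
            if ((block_cache[i] & maskp[i]) != keyp[i]) {
                return 0;
            }
        }
        return 1;
    }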
+
+static inline uint32_t ALWAYS_INLINE
+avx512_lookup_impl(struct dpcls_subtable *subtable,
+ uint32_t keys_map,
+ const struct netdev_flow_key *keys[],
+ struct dpcls_rule **rules,
+ const uint32_t bit_count_u0,
+ const uint32_t bit_count_u1)
+{
+ OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
+
+ const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
+ int i;
+ uint32_t hashes[NETDEV_MAX_BURST];
+ const uint32_t n_pkts = __builtin_popcountll(keys_map);
+ ovs_assert(NETDEV_MAX_BURST >= n_pkts);
+
+ const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
+ const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
+
+ /* Load subtable blocks for masking later. */
+ const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
+ const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+
+ /* Load pre-created subtable masks for each block in subtable. */
+ const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
+ const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+ subtable->mf_masks);
+
+ ULLONG_FOR_EACH_1 (i, keys_map) {
+ const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
+ const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+
+ /* Pre-create register with *PER PACKET* u0 offset. */
+ const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+ const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
+ pkt_mf_u0_pop);
+
+ /* Broadcast u0, u1 bitmasks to 8x u64 lanes. */
+ __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
+ __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
+ keys[i]->mf.map.bits[1]);
+
+ /* Bitmask by pre-created masks. */
+ __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+
+ /* Manual AVX512 popcount for u64 lanes. */
+ __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+
+ /* Offset popcounts for u1 with pre-created offset register. */
+ __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+
+ /* Gather u64 blocks from packet miniflow. */
+ const __m512i v_zeros = _mm512_setzero_si512();
+ const void *pkt_data = miniflow_get_values(&keys[i]->mf);
+ __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
+ bit_count_total_mask, v_indexes,
+ pkt_data, 8);
+
+ /* Zero out bits that pkt doesn't have:
+ * - 2x pext() to extract bits from packet miniflow as needed by TBL
+ * - Shift u1 over by bit_count of u0, OR to create zero bitmask
+ */
+ uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
+ uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+ uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+
+        /* Mask blocks using AND with subtable blocks; use the k-mask to
+         * zero out lanes as required for this packet.
+         */
+ __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
+ v_all_blocks, v_tbl_blocks);
+
+ /* Store to blocks cache, full cache line aligned. */
+ _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
+ }
+
+ /* Hash the now linearized blocks of packet metadata. */
+ ULLONG_FOR_EACH_1 (i, keys_map) {
+ uint64_t *block_ptr = &block_cache[i * 8];
+ uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
+ hashes[i] = hash_finish(hash, bit_count_total * 8);
+ }
+
+ /* Lookup: this returns a bitmask of packets where the hash table had
+ * an entry for the given hash key. Presence of a hash key does not
+ * guarantee matching the key, as there can be hash collisions.
+ */
+ uint32_t found_map;
+ const struct cmap_node *nodes[NETDEV_MAX_BURST];
+ found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
+
+    /* Verify that the packet actually matched the rule. If not, a hash
+     * collision has taken place, so continue searching with the next node.
+     */
+ ULLONG_FOR_EACH_1 (i, found_map) {
+ struct dpcls_rule *rule;
+
+ CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+ const uint32_t cidx = i * 8;
+ uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
+ &block_cache[cidx]);
+ if (OVS_LIKELY(match)) {
+ rules[i] = rule;
+ subtable->hit_cnt++;
+ goto next;
+ }
+ }
+
+ /* None of the found rules was a match. Clear the i-th bit to
+ * search for this key in the next subtable. */
+ ULLONG_SET0(found_map, i);
+ next:
+ ; /* Keep Sparse happy. */
+ }
+
+ return found_map;
+}
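The core trick in the per-packet loop above is miniflow indexing: a miniflow
stores only the u64 blocks whose bit is set in its bitmap, so a block's index
into the packed value array is the popcount of the bitmap bits below it. A
scalar sketch of the gather-and-mask step (illustrative only; simplified to a
single 64-bit bitmap unit, where the real code handles the u0/u1 pair and
fetches 8 blocks at a time with the gather):

    #include <stdint.h>

    static void
    scalar_gather_and_mask(uint64_t tbl_bits, const uint64_t *tbl_blocks,
                           uint64_t pkt_bits, const uint64_t *pkt_values,
                           uint64_t *out_blocks)
    {
        uint32_t out_idx = 0;

        while (tbl_bits) {
            uint64_t bit = tbl_bits & -tbl_bits; /* Lowest required block. */
            uint64_t value = 0;

            if (pkt_bits & bit) {
                /* Index = count of packet blocks stored below this one. */
                uint32_t idx = __builtin_popcountll(pkt_bits & (bit - 1));
                value = pkt_values[idx];
            }
            /* AND with the subtable mask block, or zero if the packet lacks
             * the block -- matching the maskz gather + AND above. */
            out_blocks[out_idx] = value & tbl_blocks[out_idx];
            out_idx++;
            tbl_bits &= tbl_bits - 1; /* Clear lowest set bit. */
        }
    }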
+
+/* Expand out specialized functions with U0 and U1 bit attributes. */
+#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1) \
+ static uint32_t \
+ dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable, \
+ uint32_t keys_map, \
+ const struct netdev_flow_key *keys[], \
+ struct dpcls_rule **rules) \
+ { \
+ return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1); \
+ } \
+
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
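For clarity, DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1) expands to roughly the
following. Passing U0 and U1 as compile-time constants lets the compiler
specialize and unroll avx512_lookup_impl for that miniflow bit layout:

    static uint32_t
    dpcls_avx512_gather_mf_5_1(struct dpcls_subtable *subtable,
                               uint32_t keys_map,
                               const struct netdev_flow_key *keys[],
                               struct dpcls_rule **rules)
    {
        return avx512_lookup_impl(subtable, keys_map, keys, rules, 5, 1);
    }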
+
+/* Check if a specialized function is valid for the required subtable. */
+#define CHECK_LOOKUP_FUNCTION(U0, U1) \
+ ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG); \
+ if (!f && u0_bits == U0 && u1_bits == U1) { \
+ f = dpcls_avx512_gather_mf_##U0##_##U1; \
+ }
+
+static uint32_t
+dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
+ const struct netdev_flow_key *keys[],
+ struct dpcls_rule **rules)
+{
+ return avx512_lookup_impl(subtable, keys_map, keys, rules,
+ subtable->mf_bits_set_unit0,
+ subtable->mf_bits_set_unit1);
+}
+
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
+{
+ dpcls_subtable_lookup_func f = NULL;
+
+ int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+ int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+ if (!avx512f_available || !bmi2_available) {
+ return NULL;
+ }
+
+ CHECK_LOOKUP_FUNCTION(5, 1);
+ CHECK_LOOKUP_FUNCTION(4, 1);
+ CHECK_LOOKUP_FUNCTION(4, 0);
+
+ if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
+ f = dpcls_avx512_gather_mf_any;
+        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)",
+ u0_bits, u1_bits);
+ }
+
+ return f;
+}
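A minimal sketch of how such a probe degrades gracefully at a call site
(hypothetical helper for illustration; the actual wiring is the
subtable_lookups[] table in lib/dpif-netdev-lookup.c below):

    static dpcls_subtable_lookup_func
    choose_lookup_func(uint32_t u0_bits, uint32_t u1_bits)
    {
        dpcls_subtable_lookup_func f;

        /* Prefer the AVX512 implementation; fall back to generic when the
         * CPU lacks avx512f/bmi2 or no suitable variant exists. */
        f = dpcls_subtable_avx512_gather_probe(u0_bits, u1_bits);
        return f ? f : dpcls_subtable_generic_probe(u0_bits, u1_bits);
    }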
+
+#endif /* !__CHECKER__ */
+#endif /* __x86_64__ */
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index 530187e9c..bd0a99abe 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -42,6 +42,26 @@ static struct dpcls_subtable_lookup_info_t subtable_lookups[] = {
{ .prio = 1,
.probe = dpcls_subtable_generic_probe,
.name = "generic", },
+
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+    /* Only available on x86_64 builds with SSE4.2 enabled in core OVS. */
+ { .prio = 0,
+ .probe = dpcls_subtable_avx512_gather_probe,
+ .name = "avx512_gather", },
+#else
+    /* AVX512 is disabled at compile time, as the compile-time requirements
+     * are not met. This could be due to a number of reasons:
+     * 1) Core OVS is not compiled with the SSE4.2 instruction set.
+     *    The SSE4.2 instructions are required to use the CRC32 ISA for high-
+     *    performance hashing. Consider configuring OVS with -msse4.2 (or
+     *    newer) to enable CRC32 hashing and higher performance.
+     * 2) The assembler in binutils versions 2.30 and 2.31 has bugs in AVX512
+     *    assembly. Compile-time probes check for this assembler issue and
+     *    leave HAVE_LD_AVX512_GOOD undefined if the issue is detected.
+     *    Please upgrade binutils, or backport this binutils fix commit:
+     *    2069ccaf8dc28ea699bd901fdd35d90613e4402a
+     */
+#endif
};
int32_t
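Selection over the subtable_lookups[] table behaves roughly as sketched below
(hypothetical helper name, assuming OVS's ARRAY_SIZE macro): the
highest-priority entry whose probe returns a function wins. With the
priorities shown above (avx512_gather at 0, generic at 1), generic still wins
by default; the AVX512 entry must be given a higher priority to be selected.

    static dpcls_subtable_lookup_func
    probe_best_lookup(uint32_t u0_bits, uint32_t u1_bits)
    {
        dpcls_subtable_lookup_func best_func = NULL;
        uint32_t best_prio = 0;
        size_t i;

        for (i = 0; i < ARRAY_SIZE(subtable_lookups); i++) {
            dpcls_subtable_lookup_func f;

            f = subtable_lookups[i].probe(u0_bits, u1_bits);
            if (f && (!best_func || subtable_lookups[i].prio > best_prio)) {
                best_func = f;
                best_prio = subtable_lookups[i].prio;
            }
        }
        return best_func;
    }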
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index 61f44b9e8..bd72aa29b 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -42,6 +42,10 @@ dpcls_subtable_autovalidator_probe(uint32_t u0_bit_count,
dpcls_subtable_lookup_func
dpcls_subtable_generic_probe(uint32_t u0_bit_count, uint32_t u1_bit_count);
+/* Probe function for AVX-512 gather implementation */
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bit_cnt, uint32_t u1_bit_cnt);
+
/* Subtable registration and iteration helpers */
struct dpcls_subtable_lookup_info_t {
diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4
index add3aabcc..7c9a507e5 100644
--- a/m4/openvswitch.m4
+++ b/m4/openvswitch.m4
@@ -404,6 +404,36 @@ AC_DEFUN([OVS_CHECK_SPHINX],
AC_ARG_VAR([SPHINXBUILD])
AM_CONDITIONAL([HAVE_SPHINX], [test "$SPHINXBUILD" != none])])
+dnl Checks for a known binutils/assembler issue with AVX512.
+dnl Due to backports, we probe by assembling a reproducer instead of checking
+dnl the binutils version string. More details, including ASM dumps and debug:
+dnl GCC: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90028
+dnl Checking binutils functionality instead of the LD version is similar to
+dnl how DPDK proposes to solve this issue:
+dnl http://patches.dpdk.org/patch/71723/
+AC_DEFUN([OVS_CHECK_BINUTILS_AVX512],
+ [AC_CACHE_CHECK(
+ [binutils avx512 assembler checks passing],
+ [ovs_cv_binutils_avx512_good],
+ [dnl Assemble a short snippet to test for issue in "build-aux" dir:
+ mkdir -p build-aux
+ OBJFILE=build-aux/binutils_avx512_check.o
+ GATHER_PARAMS='0x8(,%ymm1,1),%ymm0{%k2}'
+ echo "vpgatherqq $GATHER_PARAMS" | as --64 -o $OBJFILE -
+ if ($CC -dumpmachine | grep x86_64) >/dev/null 2>&1; then
+ if (objdump -d --no-show-raw-insn $OBJFILE | grep -q $GATHER_PARAMS) >/dev/null 2>&1; then
+ ovs_cv_binutils_avx512_good=yes
+ CFLAGS="$CFLAGS -DHAVE_LD_AVX512_GOOD"
+ else
+ ovs_cv_binutils_avx512_good=no
+ fi
+ else
+ ovs_cv_binutils_avx512_good=no
+ fi])
+  rm -f $OBJFILE
+ AM_CONDITIONAL([HAVE_LD_AVX512_GOOD],
+ [test "$ovs_cv_binutils_avx512_good" = yes])])
+
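The assembled snippet corresponds to the kind of masked gather the intrinsics
code emits. A C-level reproducer that compiles down to a comparable
vpgatherqq instruction (illustrative sketch; assumes AVX512F and AVX512VL,
built with e.g. gcc -mavx512f -mavx512vl -c):

    #include <immintrin.h>
    #include <stdint.h>

    /* Masked 64-bit gather with a byte displacement, akin to:
     *   vpgatherqq 0x8(,%ymm1,1),%ymm0{%k2}
     * With binutils 2.30/2.31 such gathers could be mis-assembled. */
    __m256i
    gather_repro(const int64_t *base, __m256i idx, __mmask8 k)
    {
        return _mm256_mmask_i64gather_epi64(_mm256_setzero_si256(), k, idx,
                                            base + 1, 1);
    }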
dnl Checks for dot.
AC_DEFUN([OVS_CHECK_DOT],
[AC_CACHE_CHECK(