-rw-r--r-- | configure.ac                           |   3
-rw-r--r-- | lib/automake.mk                        |  21
-rw-r--r-- | lib/dpif-netdev-lookup-avx512-gather.c | 264
-rw-r--r-- | lib/dpif-netdev-lookup.c               |  20
-rw-r--r-- | lib/dpif-netdev-lookup.h               |   4
-rw-r--r-- | m4/openvswitch.m4                      |  30
6 files changed, 342 insertions, 0 deletions
diff --git a/configure.ac b/configure.ac
index 4a6995ea8..da76cd8a5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,10 +178,13 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
 OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
 OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
 OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], [HAVE_WNO_UNUSED_PARAMETER])
+OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
+OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])
 OVS_ENABLE_WERROR
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
 OVS_CHECK_DPCLS_AUTOVALIDATOR
+OVS_CHECK_BINUTILS_AVX512
 
 AC_ARG_VAR(KARCH, [Kernel Architecture String])
 AC_SUBST(KARCH)
diff --git a/lib/automake.mk b/lib/automake.mk
index 1fc1a209e..eca448a5a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -11,6 +11,7 @@ lib_libopenvswitch_la_LIBADD = $(SSL_LIBS)
 lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD)
 lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD)
+
 if WIN32
 lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS}
 endif
@@ -20,6 +21,26 @@ lib_libopenvswitch_la_LDFLAGS = \
 	-Wl,--version-script=$(top_builddir)/lib/libopenvswitch.sym \
 	$(AM_LDFLAGS)
 
+if HAVE_AVX512F
+# Build a library of AVX512 code with the CPU ISA CFLAGS enabled, allowing
+# the compiler to use the ISA features required by the optimized code paths.
+# Use LDFLAGS to build only a static library, as this code must be statically
+# linked into vswitchd even when vswitchd itself is a shared build.
+lib_LTLIBRARIES += lib/libopenvswitchavx512.la
+lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
+lib_libopenvswitchavx512_la_CFLAGS = \
+	-mavx512f \
+	-mavx512bw \
+	-mavx512dq \
+	-mbmi2 \
+	$(AM_CFLAGS)
+lib_libopenvswitchavx512_la_SOURCES = \
+	lib/dpif-netdev-lookup-avx512-gather.c
+lib_libopenvswitchavx512_la_LDFLAGS = \
+	-static
+endif
+
+# Build core vswitch libraries as before.
 lib_libopenvswitch_la_SOURCES = \
 	lib/aes128.c \
 	lib/aes128.h \
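The separate static library exists because a translation unit built with -mavx512f may emit AVX512 instructions anywhere in its code, so it has to stay out of the core library and only be entered after a runtime CPU probe succeeds. A minimal sketch of that probe-then-dispatch pattern follows; the function names are hypothetical, not the OVS API:

#include <stdint.h>

typedef uint32_t (*lookup_func)(const uint64_t *blocks, uint32_t n);

/* Defined in the ISA-specific static library (built with -mavx512f). */
uint32_t lookup_avx512(const uint64_t *blocks, uint32_t n);

/* Portable fallback, built with the default CFLAGS. */
static uint32_t
lookup_scalar(const uint64_t *blocks, uint32_t n)
{
    uint32_t hits = 0;

    for (uint32_t i = 0; i < n; i++) {
        hits += blocks[i] != 0;
    }
    return hits;
}

static lookup_func
select_lookup(void)
{
    /* Dispatch at runtime: the AVX512 body is only ever executed when the
     * CPU reports the ISA, even though it was compiled unconditionally in
     * its own library. */
    return __builtin_cpu_supports("avx512f") ? lookup_avx512 : lookup_scalar;
}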
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
new file mode 100644
index 000000000..12a01a34a
--- /dev/null
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (c) 2020, Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+#if !defined(__CHECKER__)
+
+#include <config.h>
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-lookup.h"
+#include "dpif-netdev-private.h"
+#include "dpdk.h"
+#include "cmap.h"
+#include "flow.h"
+#include "pvector.h"
+#include "openvswitch/vlog.h"
+
+#include "immintrin.h"
+
+/* Each AVX512 register (zmm register in assembly notation) can contain up to
+ * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
+ * number of miniflow blocks that can be processed in a single pass of the
+ * AVX512 code.
+ */
+#define NUM_U64_IN_ZMM_REG (8)
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
+
+static inline __m512i
+_mm512_popcnt_epi64_manual(__m512i v_in)
+{
+    static const uint8_t pop_lut[64] = {
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    };
+    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
+
+    /* Split each byte into its low and high nibbles, look both up in the
+     * 16-entry popcount LUT. */
+    __m512i v_in_srl4 = _mm512_srli_epi64(v_in, 4);
+    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
+    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
+    __m512i v_in_hi = _mm512_and_si512(v_in_srl4, v_nibble_mask);
+
+    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
+    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
+    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
+
+    /* Sum the per-byte popcounts into one count per u64 lane. */
+    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
+}
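AVX512F has no per-lane 64-bit popcount instruction (the VPOPCNTDQ extension is not assumed here), hence the manual nibble-LUT version above. A scalar sketch of the same computation for a single u64 lane, illustrative only:

#include <stdint.h>

/* Scalar equivalent of one u64 lane of _mm512_popcnt_epi64_manual(): count
 * bits a nibble at a time with the same 16-entry LUT, then sum the partial
 * counts (the vector code does the final sum with _mm512_sad_epu8). */
static uint64_t
popcnt_u64_nibble_lut(uint64_t v)
{
    static const uint8_t pop_lut[16] = {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    };
    uint64_t count = 0;

    for (int i = 0; i < 16; i++) {
        count += pop_lut[v & 0xF];
        v >>= 4;
    }
    return count;
}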
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule *rule,
+                        const uint32_t mf_bits_total,
+                        const uint64_t *block_cache)
+{
+    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+
+    /* Always load a full cache line from blocks_cache. Other loads must be
+     * trimmed to the amount of data required for mf_bits_total blocks.
+     */
+    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
+    __m512i v_mask = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
+    __m512i v_key = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
+
+    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data,
+                                                     v_key);
+
+    /* Returns 1 only if the masked blocks compare equal to the key in all
+     * required lanes. */
+    return res_mask == lane_mask;
+}
+
+static inline uint32_t ALWAYS_INLINE
+avx512_lookup_impl(struct dpcls_subtable *subtable,
+                   uint32_t keys_map,
+                   const struct netdev_flow_key *keys[],
+                   struct dpcls_rule **rules,
+                   const uint32_t bit_count_u0,
+                   const uint32_t bit_count_u1)
+{
+    OVS_ALIGNED_VAR(CACHE_LINE_SIZE) uint64_t block_cache[BLOCKS_CACHE_SIZE];
+
+    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
+    int i;
+    uint32_t hashes[NETDEV_MAX_BURST];
+    const uint32_t n_pkts = __builtin_popcountll(keys_map);
+    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
+
+    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
+    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
+
+    /* Load subtable blocks for masking later. */
+    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
+    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+
+    /* Load the pre-created subtable masks for each block in the subtable. */
+    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
+    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+                                                        subtable->mf_masks);
+
+    ULLONG_FOR_EACH_1 (i, keys_map) {
+        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
+        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+
+        /* Pre-create a register with the *per packet* u0 offset. */
+        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
+                                                                pkt_mf_u0_pop);
+
+        /* Broadcast the u0 and u1 bitmasks to the 8x u64 lanes. */
+        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
+        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
+                                                    keys[i]->mf.map.bits[1]);
+
+        /* Bitmask by the pre-created masks. */
+        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+
+        /* Manual AVX512 popcount for the u64 lanes. */
+        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+
+        /* Offset the u1 popcounts with the pre-created offset register. */
+        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+
+        /* Gather u64 blocks from the packet miniflow. */
+        const __m512i v_zeros = _mm512_setzero_si512();
+        const void *pkt_data = miniflow_get_values(&keys[i]->mf);
+        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
+                                   bit_count_total_mask, v_indexes,
+                                   pkt_data, 8);
+
+        /* Zero out the bits that the packet doesn't have:
+         * - 2x pext() to extract the bits the subtable needs from the
+         *   packet miniflow maps.
+         * - Shift u1 over by the bit count of u0, OR to build the zero
+         *   bitmask.
+         */
+        uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
+        uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+        uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+
+        /* Mask the blocks by ANDing with the subtable blocks, using the
+         * k-mask to zero lanes as required for this packet.
+         */
+        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
+                                      v_all_blocks, v_tbl_blocks);
+
+        /* Store to the blocks cache, full cache line aligned. */
+        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
+    }
+
+    /* Hash the now-linearized blocks of packet metadata. */
+    ULLONG_FOR_EACH_1 (i, keys_map) {
+        uint64_t *block_ptr = &block_cache[i * 8];
+        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
+        hashes[i] = hash_finish(hash, bit_count_total * 8);
+    }
+
+    /* Lookup: this returns a bitmask of packets where the hash table had
+     * an entry for the given hash key. Presence of a hash key does not
+     * guarantee matching the key, as there can be hash collisions.
+     */
+    uint32_t found_map;
+    const struct cmap_node *nodes[NETDEV_MAX_BURST];
+    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
+
+    /* Verify that the packet actually matched the rule. If not, a hash
+     * collision has taken place, so continue searching with the next node.
+     */
+    ULLONG_FOR_EACH_1 (i, found_map) {
+        struct dpcls_rule *rule;
+
+        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+            const uint32_t cidx = i * 8;
+            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
+                                                     &block_cache[cidx]);
+            if (OVS_LIKELY(match)) {
+                rules[i] = rule;
+                subtable->hit_cnt++;
+                goto next;
+            }
+        }
+
+        /* None of the found rules was a match. Clear the i-th bit to
+         * search for this key in the next subtable. */
+        ULLONG_SET0(found_map, i);
+    next:
+        ;  /* Keep Sparse happy. */
+    }
+
+    return found_map;
+}
+
+/* Expand out the specialized functions with the U0 and U1 bit attributes. */
+#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
+    static uint32_t                                                           \
+    dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable,       \
+                                       uint32_t keys_map,                     \
+                                       const struct netdev_flow_key *keys[],  \
+                                       struct dpcls_rule **rules)             \
+    {                                                                         \
+        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
+    }                                                                         \
+
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
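The per-packet loop above computes, for every block the subtable cares about, an index into the packet's flat miniflow value array (the popcount of the packet's map bits below that block), gathers those u64 blocks in one instruction, and then zeroes the lanes for blocks the packet lacks via the pext-built mask. A scalar sketch of the equivalent logic for a single 64-bit map unit, illustrative rather than the OVS code path:

#include <stdint.h>

/* For each block the subtable's map requires, fetch the packet's value for
 * that block if the packet has it, else produce zero. The index of a block
 * in the packet's flat array is the number of packet map bits set below
 * its position, which is what the vector code derives with the manual
 * popcount over pre-created per-block masks. */
static void
gather_blocks_scalar(uint64_t tbl_map, uint64_t pkt_map,
                     const uint64_t *pkt_blocks, uint64_t *out)
{
    int out_idx = 0;

    for (uint64_t bits = tbl_map; bits; bits &= bits - 1) {
        uint64_t bit = bits & -bits;

        if (pkt_map & bit) {
            /* Index = number of packet blocks stored before this one. */
            int idx = __builtin_popcountll(pkt_map & (bit - 1));
            out[out_idx++] = pkt_blocks[idx];
        } else {
            out[out_idx++] = 0;  /* Packet lacks this block: zero the lane. */
        }
    }
}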
+
+/* Check if a specialized function is valid for the required subtable. */
+#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
+    ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG);                              \
+    if (!f && u0_bits == U0 && u1_bits == U1) {                               \
+        f = dpcls_avx512_gather_mf_##U0##_##U1;                               \
+    }
+
+static uint32_t
+dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
+                           const struct netdev_flow_key *keys[],
+                           struct dpcls_rule **rules)
+{
+    return avx512_lookup_impl(subtable, keys_map, keys, rules,
+                              subtable->mf_bits_set_unit0,
+                              subtable->mf_bits_set_unit1);
+}
+
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
+{
+    dpcls_subtable_lookup_func f = NULL;
+
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+
+    if (!avx512f_available || !bmi2_available) {
+        return NULL;
+    }
+
+    CHECK_LOOKUP_FUNCTION(5, 1);
+    CHECK_LOOKUP_FUNCTION(4, 1);
+    CHECK_LOOKUP_FUNCTION(4, 0);
+
+    if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
+        f = dpcls_avx512_gather_mf_any;
+        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)",
+                  u0_bits, u1_bits);
+    }
+
+    return f;
+}
+
+#endif /* !__CHECKER__ */
+#endif /* __x86_64__ */
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index 530187e9c..bd0a99abe 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -42,6 +42,26 @@ static struct dpcls_subtable_lookup_info_t subtable_lookups[] = {
     { .prio = 1,
       .probe = dpcls_subtable_generic_probe,
       .name = "generic", },
+
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+    /* Only available on x86_64 builds where OVS core uses SSE4.2. */
+    { .prio = 0,
+      .probe = dpcls_subtable_avx512_gather_probe,
+      .name = "avx512_gather", },
+#else
+    /* The AVX512 lookup is disabled at compile time because the build
+     * requirements are not met. This could be for a number of reasons:
+     * 1) Core OVS is not compiled with the SSE4.2 instruction set.
+     *    SSE4.2 instructions are required to use the CRC32 ISA for high-
+     *    performance hashing. Consider running ./configure with -msse4.2
+     *    (or newer) to enable CRC32 hashing and higher performance.
+     * 2) The assembler in binutils versions 2.30 and 2.31 has bugs in its
+     *    AVX512 assembly. Compile-time probes check for this assembler
+     *    issue and do not define HAVE_LD_AVX512_GOOD if it is detected.
+     *    Please upgrade binutils, or backport this binutils fix commit:
+     *    2069ccaf8dc28ea699bd901fdd35d90613e4402a
+     */
+#endif
 };
 
 int32_t
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index 61f44b9e8..bd72aa29b 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -42,6 +42,10 @@ dpcls_subtable_autovalidator_probe(uint32_t u0_bit_count,
 dpcls_subtable_lookup_func
 dpcls_subtable_generic_probe(uint32_t u0_bit_count, uint32_t u1_bit_count);
 
+/* Probe function for the AVX-512 gather implementation. */
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bit_cnt, uint32_t u1_bit_cnt);
+
 /* Subtable registration and iteration helpers */
 struct dpcls_subtable_lookup_info_t {
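Each implementation exports a probe; when a subtable is instantiated, the highest-priority probe that accepts the subtable's (u0, u1) block counts supplies its lookup function, and probes return NULL when the ISA or shape is unsupported. A simplified sketch of such a selection loop, with the types trimmed down and the caller hypothetical (the real flow lives in dpif-netdev-lookup.c and its callers):

#include <stddef.h>
#include <stdint.h>

/* Simplified stand-ins for the real types in dpif-netdev-lookup.h. */
typedef uint32_t (*lookup_fn)(void);
typedef lookup_fn (*probe_fn)(uint32_t u0_bits, uint32_t u1_bits);

struct lookup_impl {
    uint8_t prio;
    probe_fn probe;
    const char *name;
};

/* Pick the highest-priority implementation whose probe accepts this
 * subtable shape. Unsupported entries (wrong ISA, too many blocks)
 * return NULL from their probe and drop out naturally. */
static lookup_fn
choose_lookup(const struct lookup_impl *impls, size_t n_impls,
              uint32_t u0_bits, uint32_t u1_bits)
{
    lookup_fn best = NULL;
    int best_prio = -1;

    for (size_t i = 0; i < n_impls; i++) {
        lookup_fn fn = impls[i].probe(u0_bits, u1_bits);

        if (fn && (int) impls[i].prio > best_prio) {
            best = fn;
            best_prio = impls[i].prio;
        }
    }
    return best;  /* The generic probe always accepts, so never NULL. */
}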
diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4
index add3aabcc..7c9a507e5 100644
--- a/m4/openvswitch.m4
+++ b/m4/openvswitch.m4
@@ -404,6 +404,36 @@ AC_DEFUN([OVS_CHECK_SPHINX],
    AC_ARG_VAR([SPHINXBUILD])
    AM_CONDITIONAL([HAVE_SPHINX], [test "$SPHINXBUILD" != none])])
 
+dnl Checks for a known binutils/assembler issue with AVX512.
+dnl Due to backports, we probe by assembling a reproducer instead of checking
+dnl the binutils version string. More details, including ASM dumps, here:
+dnl GCC: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90028
+dnl Checking binutils functionality instead of the LD version is similar to
+dnl how DPDK proposes to solve this issue:
+dnl http://patches.dpdk.org/patch/71723/
+AC_DEFUN([OVS_CHECK_BINUTILS_AVX512],
+  [AC_CACHE_CHECK(
+    [binutils avx512 assembler checks passing],
+    [ovs_cv_binutils_avx512_good],
+    [dnl Assemble a short snippet to test for the issue in "build-aux" dir:
+     mkdir -p build-aux
+     OBJFILE=build-aux/binutils_avx512_check.o
+     GATHER_PARAMS='0x8(,%ymm1,1),%ymm0{%k2}'
+     echo "vpgatherqq $GATHER_PARAMS" | as --64 -o $OBJFILE -
+     if ($CC -dumpmachine | grep x86_64) >/dev/null 2>&1; then
+       if (objdump -d --no-show-raw-insn $OBJFILE | grep -q $GATHER_PARAMS) >/dev/null 2>&1; then
+         ovs_cv_binutils_avx512_good=yes
+         CFLAGS="$CFLAGS -DHAVE_LD_AVX512_GOOD"
+       else
+         ovs_cv_binutils_avx512_good=no
+       fi
+     else
+       ovs_cv_binutils_avx512_good=no
+     fi])
+   rm $OBJFILE
+   AM_CONDITIONAL([HAVE_LD_AVX512_GOOD],
+     [test "$ovs_cv_binutils_avx512_good" = yes])])
+
 dnl Checks for dot.
 AC_DEFUN([OVS_CHECK_DOT],
   [AC_CACHE_CHECK(
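For reference, a small C snippet, illustrative and not part of the patch, that makes the compiler emit a masked vpgatherqq of the same form the probe above assembles; the affected binutils releases can mis-assemble this encoding, which the objdump/grep round-trip in the macro detects:

/* Compile with: gcc -O2 -mavx512f -mavx512vl -c gather_check.c */
#include <immintrin.h>
#include <stdint.h>

uint64_t table[8];

__m256i
gather_check(__m256i idx)
{
    /* Masked 64-bit gather (vpgatherqq with a ymm index and a k-mask),
     * matching the instruction form in the m4 probe's test snippet. */
    return _mm256_mmask_i64gather_epi64(_mm256_setzero_si256(),
                                        (__mmask8) 0x5, idx, table, 8);
}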