/*
 * Copyright (c) 2009, 2010, 2011, 2012, 2013, 2014, 2016, 2017 Nicira, Inc.
 * Copyright (c) 2019, 2020 Intel Corporation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>
#include "dpif-netdev.h"
#include "dpif-netdev-lookup.h"

#include "bitmap.h"
#include "cmap.h"
#include "dp-packet.h"
#include "dpif.h"
#include "dpif-netdev-perf.h"
#include "dpif-provider.h"
#include "flow.h"
#include "ovs-thread.h"
#include "packets.h"
#include "pvector.h"

VLOG_DEFINE_THIS_MODULE(dpif_lookup_generic);

/* The lookup functions below depend on the internal structure of flowmap. */
BUILD_ASSERT_DECL(FLOWMAP_UNITS == 2);

struct block_array {
    uint32_t count; /* Number of items allocated in 'blocks'. */
    uint64_t blocks[];
};

DEFINE_PER_THREAD_MALLOCED_DATA(struct block_array *, block_array);

static inline uint64_t *
get_blocks_scratch(uint32_t required_count)
{
    struct block_array *array = block_array_get();

    /* Check if this thread already has a large enough array allocated.
     * This is a predictable and unlikely branch, as it occurs only once at
     * startup, or if a subtable with a higher block count is added. */
    if (OVS_UNLIKELY(!array || array->count < required_count)) {
        array = xrealloc(array, sizeof *array +
                         (required_count * sizeof array->blocks[0]));
        array->count = required_count;
        block_array_set_unsafe(array);
        VLOG_DBG("Block array resized to %"PRIu32, required_count);
    }

    return &array->blocks[0];
}

static inline void
netdev_flow_key_flatten_unit(const uint64_t *pkt_blocks,
                             const uint64_t *tbl_blocks,
                             const uint64_t *mf_masks,
                             uint64_t *blocks_scratch,
                             const uint64_t pkt_mf_bits,
                             const uint32_t count)
{
    uint32_t i;

    for (i = 0; i < count; i++) {
        uint64_t mf_mask = mf_masks[i];
        /* Calculate the block index for the packet metadata. */
        uint64_t idx_bits = mf_mask & pkt_mf_bits;
        const uint32_t pkt_idx = count_1bits(idx_bits);

        /* Check if the packet has the subtable miniflow bit set. If yes, the
         * block at the above pkt_idx will be stored, otherwise it is masked
         * out to be zero. */
        uint64_t pkt_has_mf_bit = (mf_mask + 1) & pkt_mf_bits;
        uint64_t no_bit = ((!pkt_has_mf_bit) > 0) - 1;

        /* Mask the packet block by the table block, and mask to zero if the
         * packet doesn't actually contain this block of metadata. */
        blocks_scratch[i] = pkt_blocks[pkt_idx] & tbl_blocks[i] & no_bit;
    }
}

/* This function takes a packet and a subtable, and writes an array of
 * uint64_t blocks. The blocks contain the metadata that the subtable matches
 * on, in the same order as the subtable, allowing linear iteration over the
 * blocks.
 *
 * To calculate the blocks' contents, the netdev_flow_key_flatten_unit
 * function is called twice, once for each "unit" of the miniflow. This call
 * can be inlined by the compiler for performance.
 *
 * Note that the u0_count and u1_count variables can be compile-time
 * constants, allowing the loop in the inlined flatten_unit() function to be
 * unrolled at compile time, or possibly removed entirely given the known
 * number of loop iterations. The compile-time optimizations enabled by this
 * design improve performance.
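 *
 * As a sketch of the per-bit indexing done by flatten_unit (assuming
 * mf_masks[i] holds all the bits strictly below the i-th set bit of the
 * subtable's miniflow unit, as precomputed by the subtable setup code):
 * if the subtable's i-th set bit is bit 5, then mf_masks[i] == 0x1f.  For a
 * packet whose unit bitmap is 0b101100, pkt_idx is
 * count_1bits(0x1f & 0b101100) == 2, selecting the packet's third block,
 * and (mf_masks[i] + 1) & 0b101100 is non-zero, so no_bit is all-ones and
 * the block is kept rather than zeroed.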
 */
static inline void
netdev_flow_key_flatten(const struct netdev_flow_key *key,
                        const struct netdev_flow_key *mask,
                        const uint64_t *mf_masks,
                        uint64_t *blocks_scratch,
                        const uint32_t u0_count,
                        const uint32_t u1_count)
{
    /* Load mask from subtable, mask with packet mf, popcount to get idx. */
    const uint64_t *pkt_blocks = miniflow_get_values(&key->mf);
    const uint64_t *tbl_blocks = miniflow_get_values(&mask->mf);

    /* Packet miniflow bits to be masked by pre-calculated mf_masks. */
    const uint64_t pkt_bits_u0 = key->mf.map.bits[0];
    const uint32_t pkt_bits_u0_pop = count_1bits(pkt_bits_u0);
    const uint64_t pkt_bits_u1 = key->mf.map.bits[1];

    /* Unit 0 flattening. */
    netdev_flow_key_flatten_unit(&pkt_blocks[0],
                                 &tbl_blocks[0],
                                 &mf_masks[0],
                                 &blocks_scratch[0],
                                 pkt_bits_u0,
                                 u0_count);

    /* Unit 1 flattening:
     * Move the pointers forward in the arrays based on u0 offsets.  NOTE:
     * 1) pkt blocks are indexed by the actual popcount of u0, which is NOT
     *    always the same as the number of bits set in the subtable.
     * 2) mf_masks, tbl_blocks and blocks_scratch are all "flat" arrays, so
     *    the index is always u0_count. */
    netdev_flow_key_flatten_unit(&pkt_blocks[pkt_bits_u0_pop],
                                 &tbl_blocks[u0_count],
                                 &mf_masks[u0_count],
                                 &blocks_scratch[u0_count],
                                 pkt_bits_u1,
                                 u1_count);
}

/* Compares a rule and the blocks representing a key, and returns 1 on a
 * match. */
static inline uint64_t
netdev_rule_matches_key(const struct dpcls_rule *rule,
                        const uint32_t mf_bits_total,
                        const uint64_t *blocks_scratch)
{
    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
    uint64_t not_match = 0;

    for (int i = 0; i < mf_bits_total; i++) {
        not_match |= (blocks_scratch[i] & maskp[i]) != keyp[i];
    }

    /* Invert the result to report a match as 1. */
    return !not_match;
}

/* Const-prop version of the function: note that the mf bits total and u0
 * counts are explicitly passed in here, while they are also available at
 * runtime from the subtable pointer. By making them compile-time constants,
 * we enable the compiler to unroll loops and flatten out code sequences
 * based on the knowledge of the mf_bits_* compile-time values. This results
 * in improved performance.
 *
 * Note: this function is marked with ALWAYS_INLINE to ensure the compiler
 * inlines the code below, and then uses the compile-time constants to make
 * specialized versions of the runtime code. Without ALWAYS_INLINE, the
 * compiler might decide not to inline, and performance will suffer. */
static inline uint32_t ALWAYS_INLINE
lookup_generic_impl(struct dpcls_subtable *subtable,
                    uint32_t keys_map,
                    const struct netdev_flow_key *keys[],
                    struct dpcls_rule **rules,
                    const uint32_t bit_count_u0,
                    const uint32_t bit_count_u1)
{
    const uint32_t n_pkts = count_1bits(keys_map);
    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
    uint32_t hashes[NETDEV_MAX_BURST];

    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
    const uint32_t block_count_required = bit_count_total * NETDEV_MAX_BURST;
    uint64_t *mf_masks = subtable->mf_masks;
    int i;

    /* The blocks scratch is an optimization to re-use the same packet
     * miniflow block data when doing the rule verify. This reduces the work
     * done during lookup and hence improves performance. The blocks_scratch
     * array is stored as a thread-local variable, as each thread requires
     * its own blocks memory. */
    uint64_t *blocks_scratch = get_blocks_scratch(block_count_required);

    /* Flatten the packet metadata into blocks_scratch[] using the subtable.
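     * Each packet i in the burst gets a contiguous slice of bit_count_total
     * uint64_t blocks at offset i * bit_count_total, so the rule-verify step
     * below can walk the same slice linearly instead of re-deriving it from
     * the packet miniflow.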
     */
    ULLONG_FOR_EACH_1 (i, keys_map) {
        netdev_flow_key_flatten(keys[i],
                                &subtable->mask,
                                mf_masks,
                                &blocks_scratch[i * bit_count_total],
                                bit_count_u0,
                                bit_count_u1);
    }

    /* Hash the now linearized blocks of packet metadata. */
    ULLONG_FOR_EACH_1 (i, keys_map) {
        uint64_t *block_ptr = &blocks_scratch[i * bit_count_total];
        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
        hashes[i] = hash_finish(hash, bit_count_total * 8);
    }

    /* Lookup: this returns a bitmask of packets where the hash table had
     * an entry for the given hash key. Presence of a hash key does not
     * guarantee matching the key, as there can be hash collisions. */
    uint32_t found_map;
    const struct cmap_node *nodes[NETDEV_MAX_BURST];

    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);

    /* Verify that the packet actually matched the rule. If not, a hash
     * collision has taken place, so continue searching with the next node. */
    ULLONG_FOR_EACH_1 (i, found_map) {
        struct dpcls_rule *rule;

        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
            const uint32_t cidx = i * bit_count_total;
            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
                                                     &blocks_scratch[cidx]);
            if (OVS_LIKELY(match)) {
                rules[i] = rule;
                subtable->hit_cnt++;
                goto next;
            }
        }

        /* None of the found rules was a match.  Reset the i-th bit to keep
         * searching for this key in the next subtable. */
        ULLONG_SET0(found_map, i); /* Did not match. */
    next:
        ;                     /* Keep Sparse happy. */
    }

    return found_map;
}

/* Generic lookup function that uses runtime-provided mf bits for iterating. */
static uint32_t
dpcls_subtable_lookup_generic(struct dpcls_subtable *subtable,
                              uint32_t keys_map,
                              const struct netdev_flow_key *keys[],
                              struct dpcls_rule **rules)
{
    /* Here the runtime subtable->mf_bits counts are used, which forces the
     * compiler to iterate normal for () loops. Due to this limitation in the
     * compiler's available optimizations, this function has lower performance
     * than the specialized functions below. */
    return lookup_generic_impl(subtable, keys_map, keys, rules,
                               subtable->mf_bits_set_unit0,
                               subtable->mf_bits_set_unit1);
}

/* Expand out specialized functions with U0 and U1 bit attributes. */
#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
    static uint32_t                                                           \
    dpcls_subtable_lookup_mf_u0w##U0##_u1w##U1(                               \
                                         struct dpcls_subtable *subtable,     \
                                         uint32_t keys_map,                   \
                                         const struct netdev_flow_key *keys[],\
                                         struct dpcls_rule **rules)           \
    {                                                                         \
        return lookup_generic_impl(subtable, keys_map, keys, rules, U0, U1);  \
    }                                                                         \

DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(8, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 3)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 2)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)

/* Check if a specialized function is valid for the required subtable. */
#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
    if (!f && u0_bits == U0 && u1_bits == U1) {                               \
        f = dpcls_subtable_lookup_mf_u0w##U0##_u1w##U1;                       \
    }

/* Probe function to look up an available specialized function.
 * If capable of running the requested miniflow fingerprint, this function
 * returns the most optimal implementation for that miniflow fingerprint.
 * @retval Non-NULL A valid function to handle the miniflow bit pattern.
 * @retval NULL The requested miniflow is not supported by this
 *              implementation.
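 *
 * For example, a subtable whose mask has 5 miniflow bits set in unit 0 and
 * 1 bit set in unit 1 resolves to dpcls_subtable_lookup_mf_u0w5_u1w1(); any
 * (u0_bits, u1_bits) combination without a matching specialization above
 * falls back to dpcls_subtable_lookup_generic().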
 */
dpcls_subtable_lookup_func
dpcls_subtable_generic_probe(uint32_t u0_bits, uint32_t u1_bits)
{
    dpcls_subtable_lookup_func f = NULL;

    CHECK_LOOKUP_FUNCTION(9, 4);
    CHECK_LOOKUP_FUNCTION(9, 1);
    CHECK_LOOKUP_FUNCTION(8, 1);
    CHECK_LOOKUP_FUNCTION(5, 3);
    CHECK_LOOKUP_FUNCTION(5, 2);
    CHECK_LOOKUP_FUNCTION(5, 1);
    CHECK_LOOKUP_FUNCTION(4, 1);
    CHECK_LOOKUP_FUNCTION(4, 0);

    if (f) {
        VLOG_DBG("Subtable using Generic Optimized for u0 %"PRIu32
                 ", u1 %"PRIu32, u0_bits, u1_bits);
    } else {
        /* Always return the generic function. */
        f = dpcls_subtable_lookup_generic;
    }

    return f;
}
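
/* A new miniflow fingerprint can be specialized by adding a
 * DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1) instance above and a matching
 * CHECK_LOOKUP_FUNCTION(U0, U1) line in dpcls_subtable_generic_probe().
 * For example, a hypothetical (6, 1) fingerprint would use
 * DECLARE_OPTIMIZED_LOOKUP_FUNCTION(6, 1) and CHECK_LOOKUP_FUNCTION(6, 1). */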