diff options
167 files changed, 23 insertions, 44622 deletions
diff --git a/.gitignore b/.gitignore index b0098f46a..26ed8d3d0 100644 --- a/.gitignore +++ b/.gitignore @@ -34,7 +34,6 @@ /Makefile /Makefile.in /aclocal.m4 -/all-distfiles /all-gitfiles /autom4te.cache /build-arch-stamp diff --git a/Makefile.am b/Makefile.am index cb8076433..fff98564a 100644 --- a/Makefile.am +++ b/Makefile.am @@ -7,7 +7,6 @@ AUTOMAKE_OPTIONS = foreign subdir-objects ACLOCAL_AMFLAGS = -I m4 -SUBDIRS = datapath AM_CPPFLAGS = $(SSL_CFLAGS) AM_LDFLAGS = $(SSL_LDFLAGS) @@ -198,25 +197,22 @@ CLEAN_LOCAL += clean-pycov ALL_LOCAL += dist-hook-git dist-hook-git: distfiles @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1; then \ - (cd datapath && $(MAKE) distfiles); \ - (cat distfiles; sed 's|^|datapath/|' datapath/distfiles) | \ - LC_ALL=C sort -u > all-distfiles; \ (cd $(srcdir) && git ls-files) | grep -v '\.gitignore$$' | \ grep -v '\.gitattributes$$' | \ LC_ALL=C sort -u > all-gitfiles; \ - LC_ALL=C comm -1 -3 all-distfiles all-gitfiles > missing-distfiles; \ + LC_ALL=C comm -1 -3 distfiles all-gitfiles > missing-distfiles; \ if test -s missing-distfiles; then \ echo "The following files are in git but not the distribution:"; \ cat missing-distfiles; \ exit 1; \ fi; \ - if LC_ALL=C grep '\.gitignore$$' all-distfiles; then \ + if LC_ALL=C grep '\.gitignore$$' distfiles; then \ echo "See above for list of files that are distributed but"; \ echo "should not be."; \ exit 1; \ fi \ fi -CLEANFILES += all-distfiles all-gitfiles missing-distfiles +CLEANFILES += all-gitfiles missing-distfiles # The following is based on commands for the Automake "distdir" target. distfiles: Makefile @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ @@ -235,7 +231,7 @@ config-h-check: @cd $(srcdir); \ if test -e .git && (git --version) >/dev/null 2>&1 && \ git --no-pager grep -L '#include <config\.h>' `git ls-files | grep '\.c$$' | \ - grep -vE '^datapath|^lib/sflow|^third-party|^datapath-windows|^python'`; \ + grep -vE '^datapath-windows|^lib/sflow|^python|^third-party'`; \ then \ echo "See above for list of violations of the rule that"; \ echo "every C source file must #include <config.h>."; \ @@ -256,7 +252,7 @@ printf-check: @cd $(srcdir); \ if test -e .git && (git --version) >/dev/null 2>&1 && \ git --no-pager grep -n -E -e '%[-+ #0-9.*]*([ztj]|hh)' --and --not -e 'ovs_scan' `git ls-files | grep '\.[ch]$$' | \ - grep -vE '^datapath|^lib/sflow|^third-party'`; \ + grep -vE '^datapath-windows|^lib/sflow|^third-party'`; \ then \ echo "See above for list of violations of the rule that"; \ echo "'z', 't', 'j', 'hh' printf() type modifiers are"; \ @@ -299,7 +295,7 @@ check-endian: @if test -e $(srcdir)/.git && (git --version) >/dev/null 2>&1 && \ (cd $(srcdir) && git --no-pager grep -l -E \ -e 'BIG_ENDIAN|LITTLE_ENDIAN' --and --not -e 'BYTE_ORDER' | \ - $(EGREP) -v '^datapath/|^include/sparse/rte_'); \ + $(EGREP) -v '^include/sparse/rte_'); \ then \ echo "See above for list of files that misuse LITTLE""_ENDIAN"; \ echo "or BIG""_ENDIAN. Please use WORDS_BIGENDIAN instead."; \ @@ -339,7 +335,7 @@ thread-safety-check: if test -e .git && (git --version) >/dev/null 2>&1 && \ grep -n -f build-aux/thread-safety-forbidden \ `git ls-files | grep '\.[ch]$$' \ - | $(EGREP) -v '^datapath|^lib/sflow|^third-party'` /dev/null \ + | $(EGREP) -v '^datapath-windows|^lib/sflow|^third-party'` /dev/null \ | $(EGREP) -v ':[ ]*/?\*'; \ then \ echo "See above for list of calls to functions that are"; \ @@ -468,11 +464,6 @@ install-data-local: $(INSTALL_DATA_LOCAL) uninstall-local: $(UNINSTALL_LOCAL) .PHONY: $(DIST_HOOKS) $(CLEAN_LOCAL) $(INSTALL_DATA_LOCAL) $(UNINSTALL_LOCAL) -modules_install: -if LINUX_ENABLED - cd datapath/linux && $(MAKE) modules_install -endif - dist-docs: VERSION=$(VERSION) MAKE='$(MAKE)' $(srcdir)/build-aux/dist-docs $(srcdir) $(docs) .PHONY: dist-docs @@ -74,6 +74,9 @@ Post-v2.17.0 - Linux datapath: * Add offloading meter tc police. * Add support for offloading the check_pkt_len action. + - Previously deprecated Linux kernel module is now fully removed from + the OVS source tree. The version provided with the Linux kernel + should be used instead. v2.17.0 - 17 Feb 2022 diff --git a/build-aux/initial-tab-allowed-files b/build-aux/initial-tab-allowed-files index 6a9968e32..ff597d23c 100644 --- a/build-aux/initial-tab-allowed-files +++ b/build-aux/initial-tab-allowed-files @@ -3,7 +3,6 @@ \.mk$ \.png$ \.sln$ -^datapath/ ^include/linux/ ^include/sparse/rte_ ^include/windows/ diff --git a/configure.ac b/configure.ac index 6c51e48ce..63359fe29 100644 --- a/configure.ac +++ b/configure.ac @@ -14,7 +14,7 @@ AC_PREREQ(2.63) AC_INIT(openvswitch, 2.17.90, bugs@openvswitch.org) -AC_CONFIG_SRCDIR([datapath/datapath.c]) +AC_CONFIG_SRCDIR([vswitchd/ovs-vswitchd.c]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_HEADERS([config.h]) @@ -204,10 +204,6 @@ AC_SUBST([OVS_CFLAGS]) AC_SUBST([OVS_LDFLAGS]) AC_CONFIG_FILES(Makefile) -AC_CONFIG_FILES(datapath/Makefile) -AC_CONFIG_FILES(datapath/linux/Kbuild) -AC_CONFIG_FILES(datapath/linux/Makefile) -AC_CONFIG_FILES(datapath/linux/Makefile.main) AC_CONFIG_FILES(tests/atlocal) AC_CONFIG_FILES(lib/libopenvswitch.pc) AC_CONFIG_FILES(lib/libsflow.pc) diff --git a/datapath-windows/include/automake.mk b/datapath-windows/include/automake.mk index b8dcf83b9..a354f007f 100644 --- a/datapath-windows/include/automake.mk +++ b/datapath-windows/include/automake.mk @@ -3,7 +3,7 @@ BUILT_SOURCES += $(srcdir)/datapath-windows/include/OvsDpInterface.h endif $(srcdir)/datapath-windows/include/OvsDpInterface.h: \ - datapath/linux/compat/include/linux/openvswitch.h \ + include/linux/openvswitch.h \ build-aux/extract-odp-netlink-windows-dp-h $(AM_V_GEN)sed -f $(srcdir)/build-aux/extract-odp-netlink-windows-dp-h < $< > $@ diff --git a/datapath/.gitignore b/datapath/.gitignore deleted file mode 100644 index fb8cf7d3b..000000000 --- a/datapath/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -/Makefile -/Makefile.in -*.cmd -*.ko -*.mod.c -Module.symvers -/distfiles diff --git a/datapath/Makefile.am b/datapath/Makefile.am deleted file mode 100644 index e4dd0c704..000000000 --- a/datapath/Makefile.am +++ /dev/null @@ -1,60 +0,0 @@ -SUBDIRS = -if LINUX_ENABLED -SUBDIRS += linux -endif - -EXTRA_DIST = $(dist_headers) $(dist_sources) $(dist_extras) - -# Suppress warnings about GNU extensions in Modules.mk files. -AUTOMAKE_OPTIONS = -Wno-portability - -include Modules.mk -include linux/Modules.mk - -# The following is based on commands for the Automake "distdir" target. -distfiles: Makefile - @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ - list='$(DISTFILES)'; \ - for file in $$list; do echo $$file; done | \ - sed -e "s|^$$srcdirstrip/||;t" \ - -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t" | sort -u > $@ -CLEANFILES = distfiles - -# Print name of all modules. -print-build-modules: - @if test -z "$(build_modules)"; \ - then \ - echo "Could not find any kernel module."; \ - exit 1; \ - fi - @echo "$(build_modules)" | tr '_' '-'; - -if !WIN32 -COMPAT_GET_FUNCTIONS := find $(top_srcdir)/datapath/linux/compat -name "*.h" \ - -exec sed -n '/^[a-z][a-z]* \*\?[A-Za-z0-9_][A-Za-z0-9_]*([a-z]/p; /^struct [a-z0-9_][a-z0-9_]* \*\?[A-Za-z0-9_][A-Za-z0-9_]*([a-z]/p' {} \; | tr -d '*' | cut -d '(' -f1 | rev | cut -d ' ' -f1 | rev -COMPAT_GET_EXPORTS := find $(top_srcdir)/datapath/linux/compat -name "*.c" \ - -exec sed -n 's/^EXPORT_SYMBOL[A-Z_]*(\([a-z_][a-z_]*\));$$/\1/p' {} \; -COMPAT_FUNCTIONS := $(shell $(COMPAT_GET_FUNCTIONS)) -COMPAT_EXPORTS := $(shell $(COMPAT_GET_EXPORTS)) - -# Checks that all public functions are 'rpl_' or 'ovs_' prefixed. -# Checks that all EXPORT_SYMBOL_GPL() export 'rpl_' or 'ovs_' prefixed functions. -check-export-symbol: - @for fun_ in $(COMPAT_FUNCTIONS); do \ - if ! grep -- $${fun_} $(top_srcdir)/datapath/linux/compat/build-aux/export-check-allow-list > /dev/null; then \ - if ! echo $${fun_} | grep -q -E '^(rpl|ovs)_'; then \ - echo "error: $${fun_}() needs to be prefixed with 'rpl_' or 'ovs_'."; \ - exit 1; \ - fi; \ - fi; \ - done - @for fun_ in $(COMPAT_EXPORTS); do \ - if ! echo $${fun_} | grep -q -E '^(rpl|ovs)_'; then \ - echo "error: $${fun_}() needs to be prefixed with 'rpl_' or 'ovs_'."; \ - exit 1; \ - fi; \ - done - -all-local: check-export-symbol -endif diff --git a/datapath/Modules.mk b/datapath/Modules.mk deleted file mode 100644 index 3c4ae366c..000000000 --- a/datapath/Modules.mk +++ /dev/null @@ -1,58 +0,0 @@ -# Some modules should be built and distributed, e.g. openvswitch. -# -# Some modules should be built but not distributed, e.g. third-party -# hwtable modules. -build_multi_modules = \ - openvswitch -both_modules = \ - $(build_multi_modules) \ - vport_geneve \ - vport_gre \ - vport_lisp \ - vport_stt \ - vport_vxlan -# When changing the name of 'build_modules', please also update the -# print-build-modules in Makefile.am. -build_modules = $(both_modules) # Modules to build -dist_modules = $(both_modules) # Modules to distribute - -openvswitch_sources = \ - actions.c \ - conntrack.c \ - datapath.c \ - dp_notify.c \ - flow.c \ - flow_netlink.c \ - flow_table.c \ - vport.c \ - vport-internal_dev.c \ - vport-netdev.c \ - nsh.c \ - meter.c - -vport_geneve_sources = vport-geneve.c -vport_vxlan_sources = vport-vxlan.c -vport_gre_sources = vport-gre.c -vport_lisp_sources = vport-lisp.c -vport_stt_sources = vport-stt.c -nsh_sources = nsh.c - -openvswitch_headers = \ - compat.h \ - conntrack.h \ - datapath.h \ - flow.h \ - flow_netlink.h \ - flow_table.h \ - vport.h \ - vport-internal_dev.h \ - vport-netdev.h \ - meter.h - -dist_sources = $(foreach module,$(dist_modules),$($(module)_sources)) -dist_headers = $(foreach module,$(dist_modules),$($(module)_headers)) -dist_extras = $(foreach module,$(dist_modules),$($(module)_extras)) -build_sources = $(foreach module,$(build_modules),$($(module)_sources)) -build_headers = $(foreach module,$(build_modules),$($(module)_headers)) -build_links = $(notdir $(build_sources)) -build_objects = $(notdir $(patsubst %.c,%.o,$(build_sources))) diff --git a/datapath/actions.c b/datapath/actions.c deleted file mode 100644 index fbf445703..000000000 --- a/datapath/actions.c +++ /dev/null @@ -1,1587 +0,0 @@ -/* - * Copyright (c) 2007-2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/openvswitch.h> -#include <linux/netfilter_ipv6.h> -#include <linux/sctp.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/in6.h> -#include <linux/if_arp.h> -#include <linux/if_vlan.h> - -#include <net/dst.h> -#include <net/ip.h> -#include <net/ipv6.h> -#include <net/checksum.h> -#include <net/dsfield.h> -#include <net/mpls.h> -#include <net/sctp/checksum.h> - -#include "datapath.h" -#include "conntrack.h" -#include "gso.h" -#include "vport.h" -#include "flow_netlink.h" - -static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, - const struct nlattr *attr, int len); - -struct deferred_action { - struct sk_buff *skb; - const struct nlattr *actions; - int actions_len; - - /* Store pkt_key clone when creating deferred action. */ - struct sw_flow_key pkt_key; -}; - -#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) -struct ovs_frag_data { - unsigned long dst; - struct vport *vport; - struct ovs_gso_cb cb; - __be16 inner_protocol; - u16 network_offset; /* valid only for MPLS */ - u16 vlan_tci; - __be16 vlan_proto; - unsigned int l2_len; - u8 mac_proto; - u8 l2_data[MAX_L2_LEN]; -}; - -static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); - -#define DEFERRED_ACTION_FIFO_SIZE 10 -#define OVS_RECURSION_LIMIT 4 -#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) -struct action_fifo { - int head; - int tail; - /* Deferred action fifo queue storage. */ - struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; -}; - -struct action_flow_keys { - struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; -}; - -static struct action_fifo __percpu *action_fifos; -static struct action_flow_keys __percpu *flow_keys; -static DEFINE_PER_CPU(int, exec_actions_level); - -/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' - * space. Return NULL if out of key spaces. - */ -static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) -{ - struct action_flow_keys *keys = this_cpu_ptr(flow_keys); - int level = this_cpu_read(exec_actions_level); - struct sw_flow_key *key = NULL; - - if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { - key = &keys->key[level - 1]; - *key = *key_; - } - - return key; -} - -static void action_fifo_init(struct action_fifo *fifo) -{ - fifo->head = 0; - fifo->tail = 0; -} - -static bool action_fifo_is_empty(const struct action_fifo *fifo) -{ - return (fifo->head == fifo->tail); -} - -static struct deferred_action *action_fifo_get(struct action_fifo *fifo) -{ - if (action_fifo_is_empty(fifo)) - return NULL; - - return &fifo->fifo[fifo->tail++]; -} - -static struct deferred_action *action_fifo_put(struct action_fifo *fifo) -{ - if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) - return NULL; - - return &fifo->fifo[fifo->head++]; -} - -/* Return queue entry if fifo is not full */ -static struct deferred_action *add_deferred_actions(struct sk_buff *skb, - const struct sw_flow_key *key, - const struct nlattr *actions, - const int actions_len) -{ - struct action_fifo *fifo; - struct deferred_action *da; - - fifo = this_cpu_ptr(action_fifos); - da = action_fifo_put(fifo); - if (da) { - da->skb = skb; - da->actions = actions; - da->actions_len = actions_len; - da->pkt_key = *key; - } - - return da; -} - -static void invalidate_flow_key(struct sw_flow_key *key) -{ - key->mac_proto |= SW_FLOW_KEY_INVALID; -} - -static bool is_flow_key_valid(const struct sw_flow_key *key) -{ - return !(key->mac_proto & SW_FLOW_KEY_INVALID); -} - -static int clone_execute(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, - u32 recirc_id, - const struct nlattr *actions, int len, - bool last, bool clone_flow_key); - -static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, - __be16 ethertype) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be16 diff[] = { ~(hdr->h_proto), ethertype }; - - skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); - } - - hdr->h_proto = ethertype; -} - -static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_action_push_mpls *mpls) -{ - struct mpls_shim_hdr *new_mpls_lse; - - /* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */ - if (skb->encapsulation) - return -ENOTSUPP; - - if (skb_cow_head(skb, MPLS_HLEN) < 0) - return -ENOMEM; - - if (!ovs_skb_get_inner_protocol(skb)) { - skb_set_inner_network_header(skb, skb->mac_len); - ovs_skb_set_inner_protocol(skb, skb->protocol); - } - - skb_push(skb, MPLS_HLEN); - memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); - skb_reset_mac_header(skb); -#ifdef MPLS_HEADER_IS_L3 - skb_set_network_header(skb, skb->mac_len); -#endif - - new_mpls_lse = mpls_hdr(skb); - new_mpls_lse->label_stack_entry = mpls->mpls_lse; - - skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); - - if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) - update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); - skb->protocol = mpls->mpls_ethertype; - - invalidate_flow_key(key); - return 0; -} - -static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, - const __be16 ethertype) -{ - int err; - - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); - if (unlikely(err)) - return err; - - skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN); - - memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), - skb->mac_len); - - __skb_pull(skb, MPLS_HLEN); - skb_reset_mac_header(skb); - skb_set_network_header(skb, skb->mac_len); - - if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) { - struct ethhdr *hdr; - - /* mpls_hdr() is used to locate the ethertype - * field correctly in the presence of VLAN tags. - */ - hdr = (struct ethhdr *)((void*)mpls_hdr(skb) - ETH_HLEN); - update_ethertype(skb, hdr, ethertype); - } - if (eth_p_mpls(skb->protocol)) - skb->protocol = ethertype; - - invalidate_flow_key(key); - return 0; -} - -static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, - const __be32 *mpls_lse, const __be32 *mask) -{ - struct mpls_shim_hdr *stack; - __be32 lse; - int err; - - err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN); - if (unlikely(err)) - return err; - - stack = mpls_hdr(skb); - lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); - if (skb->ip_summed == CHECKSUM_COMPLETE) { - __be32 diff[] = { ~(stack->label_stack_entry), lse }; - - skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum); - } - - stack->label_stack_entry = lse; - flow_key->mpls.lse[0] = lse; - return 0; -} - -static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) -{ - int err; - - err = skb_vlan_pop(skb); - if (skb_vlan_tag_present(skb)) { - invalidate_flow_key(key); - } else { - key->eth.vlan.tci = 0; - key->eth.vlan.tpid = 0; - } - return err; -} - -static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_action_push_vlan *vlan) -{ - if (skb_vlan_tag_present(skb)) { - invalidate_flow_key(key); - } else { - key->eth.vlan.tci = vlan->vlan_tci; - key->eth.vlan.tpid = vlan->vlan_tpid; - } - return skb_vlan_push(skb, vlan->vlan_tpid, - ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK); -} - -/* 'src' is already properly masked. */ -static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) -{ - u16 *dst = (u16 *)dst_; - const u16 *src = (const u16 *)src_; - const u16 *mask = (const u16 *)mask_; - - OVS_SET_MASKED(dst[0], src[0], mask[0]); - OVS_SET_MASKED(dst[1], src[1], mask[1]); - OVS_SET_MASKED(dst[2], src[2], mask[2]); -} - -static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct ovs_key_ethernet *key, - const struct ovs_key_ethernet *mask) -{ - int err; - - err = skb_ensure_writable(skb, ETH_HLEN); - if (unlikely(err)) - return err; - - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); - - ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src, - mask->eth_src); - ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, - mask->eth_dst); - - skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); - - ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); - ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); - return 0; -} - -/* pop_eth does not support VLAN packets as this action is never called - * for them. - */ -static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) -{ - skb_pull_rcsum(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - /* safe right before invalidate_flow_key */ - key->mac_proto = MAC_PROTO_NONE; - invalidate_flow_key(key); - return 0; -} - -static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, - const struct ovs_action_push_eth *ethh) -{ - struct ethhdr *hdr; - - /* Add the new Ethernet header */ - if (skb_cow_head(skb, ETH_HLEN) < 0) - return -ENOMEM; - - skb_push(skb, ETH_HLEN); - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - - hdr = eth_hdr(skb); - ether_addr_copy(hdr->h_source, ethh->addresses.eth_src); - ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst); - hdr->h_proto = skb->protocol; - - skb_postpush_rcsum(skb, hdr, ETH_HLEN); - - /* safe right before invalidate_flow_key */ - key->mac_proto = MAC_PROTO_ETHERNET; - invalidate_flow_key(key); - return 0; -} - -static int push_nsh(struct sk_buff *skb, struct sw_flow_key *key, - const struct nshhdr *nh) -{ - int err; - - err = ovs_nsh_push(skb, nh); - if (err) - return err; - - /* safe right before invalidate_flow_key */ - key->mac_proto = MAC_PROTO_NONE; - invalidate_flow_key(key); - return 0; -} - -static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key) -{ - int err; - - err = ovs_nsh_pop(skb); - if (err) - return err; - - /* safe right before invalidate_flow_key */ - if (skb->protocol == htons(ETH_P_TEB)) - key->mac_proto = MAC_PROTO_ETHERNET; - else - key->mac_proto = MAC_PROTO_NONE; - invalidate_flow_key(key); - return 0; -} - -static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, - __be32 addr, __be32 new_addr) -{ - int transport_len = skb->len - skb_transport_offset(skb); - - if (nh->frag_off & htons(IP_OFFSET)) - return; - - if (nh->protocol == IPPROTO_TCP) { - if (likely(transport_len >= sizeof(struct tcphdr))) - inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, - addr, new_addr, true); - } else if (nh->protocol == IPPROTO_UDP) { - if (likely(transport_len >= sizeof(struct udphdr))) { - struct udphdr *uh = udp_hdr(skb); - - if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace4(&uh->check, skb, - addr, new_addr, true); - if (!uh->check) - uh->check = CSUM_MANGLED_0; - } - } - } - -} - -static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, - __be32 *addr, __be32 new_addr) -{ - update_ip_l4_checksum(skb, nh, *addr, new_addr); - csum_replace4(&nh->check, *addr, new_addr); - skb_clear_hash(skb); - *addr = new_addr; -} - -static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, - __be32 addr[4], const __be32 new_addr[4]) -{ - int transport_len = skb->len - skb_transport_offset(skb); - - if (l4_proto == NEXTHDR_TCP) { - if (likely(transport_len >= sizeof(struct tcphdr))) - inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, - addr, new_addr, true); - } else if (l4_proto == NEXTHDR_UDP) { - if (likely(transport_len >= sizeof(struct udphdr))) { - struct udphdr *uh = udp_hdr(skb); - - if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { - inet_proto_csum_replace16(&uh->check, skb, - addr, new_addr, true); - if (!uh->check) - uh->check = CSUM_MANGLED_0; - } - } - } else if (l4_proto == NEXTHDR_ICMP) { - if (likely(transport_len >= sizeof(struct icmp6hdr))) - inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, - skb, addr, new_addr, true); - } -} - -static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], - const __be32 mask[4], __be32 masked[4]) -{ - masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); - masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); - masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); - masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); -} - -static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, - __be32 addr[4], const __be32 new_addr[4], - bool recalculate_csum) -{ - if (likely(recalculate_csum)) - update_ipv6_checksum(skb, l4_proto, addr, new_addr); - - skb_clear_hash(skb); - memcpy(addr, new_addr, sizeof(__be32[4])); -} - -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) -{ - /* Bits 21-24 are always unmasked, so this retains their values. */ - OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); -} - -static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, - u8 mask) -{ - new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); - - csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); - nh->ttl = new_ttl; -} - -static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct ovs_key_ipv4 *key, - const struct ovs_key_ipv4 *mask) -{ - struct iphdr *nh; - __be32 new_addr; - int err; - - err = skb_ensure_writable(skb, skb_network_offset(skb) + - sizeof(struct iphdr)); - if (unlikely(err)) - return err; - - nh = ip_hdr(skb); - - /* Setting an IP addresses is typically only a side effect of - * matching on them in the current userspace implementation, so it - * makes sense to check if the value actually changed. - */ - if (mask->ipv4_src) { - new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); - - if (unlikely(new_addr != nh->saddr)) { - set_ip_addr(skb, nh, &nh->saddr, new_addr); - flow_key->ipv4.addr.src = new_addr; - } - } - if (mask->ipv4_dst) { - new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); - - if (unlikely(new_addr != nh->daddr)) { - set_ip_addr(skb, nh, &nh->daddr, new_addr); - flow_key->ipv4.addr.dst = new_addr; - } - } - if (mask->ipv4_tos) { - ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos); - flow_key->ip.tos = nh->tos; - } - if (mask->ipv4_ttl) { - set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl); - flow_key->ip.ttl = nh->ttl; - } - - return 0; -} - -static bool is_ipv6_mask_nonzero(const __be32 addr[4]) -{ - return !!(addr[0] | addr[1] | addr[2] | addr[3]); -} - -static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct ovs_key_ipv6 *key, - const struct ovs_key_ipv6 *mask) -{ - struct ipv6hdr *nh; - int err; - - err = skb_ensure_writable(skb, skb_network_offset(skb) + - sizeof(struct ipv6hdr)); - if (unlikely(err)) - return err; - - nh = ipv6_hdr(skb); - - /* Setting an IP addresses is typically only a side effect of - * matching on them in the current userspace implementation, so it - * makes sense to check if the value actually changed. - */ - if (is_ipv6_mask_nonzero(mask->ipv6_src)) { - __be32 *saddr = (__be32 *)&nh->saddr; - __be32 masked[4]; - - mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); - - if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { - set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked, - true); - memcpy(&flow_key->ipv6.addr.src, masked, - sizeof(flow_key->ipv6.addr.src)); - } - } - if (is_ipv6_mask_nonzero(mask->ipv6_dst)) { - unsigned int offset = 0; - int flags = IP6_FH_F_SKIP_RH; - bool recalc_csum = true; - __be32 *daddr = (__be32 *)&nh->daddr; - __be32 masked[4]; - - mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked); - - if (unlikely(memcmp(daddr, masked, sizeof(masked)))) { - if (ipv6_ext_hdr(nh->nexthdr)) - recalc_csum = (ipv6_find_hdr(skb, &offset, - NEXTHDR_ROUTING, - NULL, &flags) - != NEXTHDR_ROUTING); - - set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked, - recalc_csum); - memcpy(&flow_key->ipv6.addr.dst, masked, - sizeof(flow_key->ipv6.addr.dst)); - } - } - if (mask->ipv6_tclass) { - ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass); - flow_key->ip.tos = ipv6_get_dsfield(nh); - } - if (mask->ipv6_label) { - set_ipv6_fl(nh, ntohl(key->ipv6_label), - ntohl(mask->ipv6_label)); - flow_key->ipv6.label = - *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); - } - if (mask->ipv6_hlimit) { - OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, - mask->ipv6_hlimit); - flow_key->ip.ttl = nh->hop_limit; - } - return 0; -} - -static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct nlattr *a) -{ - struct nshhdr *nh; - size_t length; - int err; - u8 flags; - u8 ttl; - int i; - - struct ovs_key_nsh key; - struct ovs_key_nsh mask; - - err = nsh_key_from_nlattr(a, &key, &mask); - if (err) - return err; - - /* Make sure the NSH base header is there */ - if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) - return -ENOMEM; - - nh = nsh_hdr(skb); - length = nsh_hdr_len(nh); - - /* Make sure the whole NSH header is there */ - err = skb_ensure_writable(skb, skb_network_offset(skb) + - length); - if (unlikely(err)) - return err; - - nh = nsh_hdr(skb); - skb_postpull_rcsum(skb, nh, length); - flags = nsh_get_flags(nh); - flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); - flow_key->nsh.base.flags = flags; - ttl = nsh_get_ttl(nh); - ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); - flow_key->nsh.base.ttl = ttl; - nsh_set_flags_and_ttl(nh, flags, ttl); - nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, - mask.base.path_hdr); - flow_key->nsh.base.path_hdr = nh->path_hdr; - switch (nh->mdtype) { - case NSH_M_TYPE1: - for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { - nh->md1.context[i] = - OVS_MASKED(nh->md1.context[i], key.context[i], - mask.context[i]); - } - memcpy(flow_key->nsh.context, nh->md1.context, - sizeof(nh->md1.context)); - break; - case NSH_M_TYPE2: - memset(flow_key->nsh.context, 0, - sizeof(flow_key->nsh.context)); - break; - default: - return -EINVAL; - } - skb_postpush_rcsum(skb, nh, length); - return 0; -} - -/* Must follow skb_ensure_writable() since that can move the skb data. */ -static void set_tp_port(struct sk_buff *skb, __be16 *port, - __be16 new_port, __sum16 *check) -{ - inet_proto_csum_replace2(check, skb, *port, new_port, false); - *port = new_port; -} - -static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct ovs_key_udp *key, - const struct ovs_key_udp *mask) -{ - struct udphdr *uh; - __be16 src, dst; - int err; - - err = skb_ensure_writable(skb, skb_transport_offset(skb) + - sizeof(struct udphdr)); - if (unlikely(err)) - return err; - - uh = udp_hdr(skb); - /* Either of the masks is non-zero, so do not bother checking them. */ - src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); - dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); - - if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { - if (likely(src != uh->source)) { - set_tp_port(skb, &uh->source, src, &uh->check); - flow_key->tp.src = src; - } - if (likely(dst != uh->dest)) { - set_tp_port(skb, &uh->dest, dst, &uh->check); - flow_key->tp.dst = dst; - } - - if (unlikely(!uh->check)) - uh->check = CSUM_MANGLED_0; - } else { - uh->source = src; - uh->dest = dst; - flow_key->tp.src = src; - flow_key->tp.dst = dst; - } - - skb_clear_hash(skb); - - return 0; -} - -static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct ovs_key_tcp *key, - const struct ovs_key_tcp *mask) -{ - struct tcphdr *th; - __be16 src, dst; - int err; - - err = skb_ensure_writable(skb, skb_transport_offset(skb) + - sizeof(struct tcphdr)); - if (unlikely(err)) - return err; - - th = tcp_hdr(skb); - src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); - if (likely(src != th->source)) { - set_tp_port(skb, &th->source, src, &th->check); - flow_key->tp.src = src; - } - dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); - if (likely(dst != th->dest)) { - set_tp_port(skb, &th->dest, dst, &th->check); - flow_key->tp.dst = dst; - } - skb_clear_hash(skb); - - return 0; -} - -static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct ovs_key_sctp *key, - const struct ovs_key_sctp *mask) -{ - unsigned int sctphoff = skb_transport_offset(skb); - struct sctphdr *sh; - __le32 old_correct_csum, new_csum, old_csum; - int err; - - err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr)); - if (unlikely(err)) - return err; - - sh = sctp_hdr(skb); - old_csum = sh->checksum; - old_correct_csum = sctp_compute_cksum(skb, sctphoff); - - sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); - sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); - - new_csum = sctp_compute_cksum(skb, sctphoff); - - /* Carry any checksum errors through. */ - sh->checksum = old_csum ^ old_correct_csum ^ new_csum; - - skb_clear_hash(skb); - flow_key->tp.src = sh->source; - flow_key->tp.dst = sh->dest; - - return 0; -} - -static int ovs_vport_output(OVS_VPORT_OUTPUT_PARAMS) -{ - struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); - struct vport *vport = data->vport; - - if (skb_cow_head(skb, data->l2_len) < 0) { - kfree_skb(skb); - return -ENOMEM; - } - - __skb_dst_copy(skb, data->dst); - *OVS_GSO_CB(skb) = data->cb; - ovs_skb_set_inner_protocol(skb, data->inner_protocol); - if (data->vlan_tci & VLAN_CFI_MASK) - __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci & ~VLAN_CFI_MASK); - else - __vlan_hwaccel_clear_tag(skb); - - /* Reconstruct the MAC header. */ - skb_push(skb, data->l2_len); - memcpy(skb->data, &data->l2_data, data->l2_len); - skb_postpush_rcsum(skb, skb->data, data->l2_len); - skb_reset_mac_header(skb); - - if (eth_p_mpls(skb->protocol)) { - skb->inner_network_header = skb->network_header; - skb_set_network_header(skb, data->network_offset); - skb_reset_mac_len(skb); - } - - ovs_vport_send(vport, skb, data->mac_proto); - return 0; -} - -static unsigned int -ovs_dst_get_mtu(const struct dst_entry *dst) -{ - return dst->dev->mtu; -} - -static struct dst_ops ovs_dst_ops = { - .family = AF_UNSPEC, - .mtu = ovs_dst_get_mtu, -}; - -/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is - * ovs_vport_output(), which is called once per fragmented packet. - */ -static void prepare_frag(struct vport *vport, struct sk_buff *skb, - u16 orig_network_offset, u8 mac_proto) -{ - unsigned int hlen = skb_network_offset(skb); - struct ovs_frag_data *data; - - data = this_cpu_ptr(&ovs_frag_data_storage); - data->dst = (unsigned long) skb_dst(skb); - data->vport = vport; - data->cb = *OVS_GSO_CB(skb); - data->inner_protocol = ovs_skb_get_inner_protocol(skb); - data->network_offset = orig_network_offset; - if (skb_vlan_tag_present(skb)) - data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK; - else - data->vlan_tci = 0; - data->vlan_proto = skb->vlan_proto; - data->mac_proto = mac_proto; - data->l2_len = hlen; - memcpy(&data->l2_data, skb->data, hlen); - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - skb_pull(skb, hlen); -} - -static void ovs_fragment(struct net *net, struct vport *vport, - struct sk_buff *skb, u16 mru, - struct sw_flow_key *key) -{ - u16 orig_network_offset = 0; - - if (eth_p_mpls(skb->protocol)) { - orig_network_offset = skb_network_offset(skb); - skb->network_header = skb->inner_network_header; - } - - if (skb_network_offset(skb) > MAX_L2_LEN) { - OVS_NLERR(1, "L2 header too long to fragment"); - goto err; - } - - if (key->eth.type == htons(ETH_P_IP)) { - struct dst_entry ovs_dst; - unsigned long orig_dst; - - prepare_frag(vport, skb, orig_network_offset, - ovs_key_mac_proto(key)); - dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, - DST_OBSOLETE_NONE, DST_NOCOUNT); - ovs_dst.dev = vport->dev; - - orig_dst = (unsigned long) skb_dst(skb); - skb_dst_set_noref(skb, &ovs_dst); - IPCB(skb)->frag_max_size = mru; - - ip_do_fragment(net, skb->sk, skb, ovs_vport_output); - refdst_drop(orig_dst); - } else if (key->eth.type == htons(ETH_P_IPV6)) { - const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); - unsigned long orig_dst; - struct rt6_info ovs_rt; - - if (!v6ops) - goto err; - - prepare_frag(vport, skb, orig_network_offset, - ovs_key_mac_proto(key)); - memset(&ovs_rt, 0, sizeof(ovs_rt)); - dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, - DST_OBSOLETE_NONE, DST_NOCOUNT); - ovs_rt.dst.dev = vport->dev; - - orig_dst = (unsigned long) skb_dst(skb); - skb_dst_set_noref(skb, &ovs_rt.dst); - IP6CB(skb)->frag_max_size = mru; -#ifdef HAVE_IP_LOCAL_OUT_TAKES_NET - v6ops->fragment(net, skb->sk, skb, ovs_vport_output); -#else - v6ops->fragment(skb->sk, skb, ovs_vport_output); -#endif - refdst_drop(orig_dst); - } else { - WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", - ovs_vport_name(vport), ntohs(key->eth.type), mru, - vport->dev->mtu); - goto err; - } - - return; -err: - kfree_skb(skb); -} - -static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, - struct sw_flow_key *key) -{ - struct vport *vport = ovs_vport_rcu(dp, out_port); - - if (likely(vport)) { - u16 mru = OVS_CB(skb)->mru; - u32 cutlen = OVS_CB(skb)->cutlen; - - if (unlikely(cutlen > 0)) { - if (skb->len - cutlen > ovs_mac_header_len(key)) - pskb_trim(skb, skb->len - cutlen); - else - pskb_trim(skb, ovs_mac_header_len(key)); - } - - if (likely(!mru || - (skb->len <= mru + vport->dev->hard_header_len))) { - ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); - } else if (mru <= vport->dev->mtu) { - struct net *net = ovs_dp_get_net(dp); - - ovs_fragment(net, vport, skb, mru, key); - } else { - OVS_NLERR(true, "Cannot fragment IP frames"); - kfree_skb(skb); - } - } else { - kfree_skb(skb); - } -} - -static int output_userspace(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr, - const struct nlattr *actions, int actions_len, - uint32_t cutlen) -{ - struct dp_upcall_info upcall; - const struct nlattr *a; - int rem, err; - - memset(&upcall, 0, sizeof(upcall)); - upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.mru = OVS_CB(skb)->mru; - - SKB_INIT_FILL_METADATA_DST(skb); - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { - switch (nla_type(a)) { - case OVS_USERSPACE_ATTR_USERDATA: - upcall.userdata = a; - break; - - case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); - break; - - case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { - /* Get out tunnel info. */ - struct vport *vport; - - vport = ovs_vport_rcu(dp, nla_get_u32(a)); - if (vport) { - err = dev_fill_metadata_dst(vport->dev, skb); - if (!err) - upcall.egress_tun_info = skb_tunnel_info(skb); - } - - break; - } - - case OVS_USERSPACE_ATTR_ACTIONS: { - /* Include actions. */ - upcall.actions = actions; - upcall.actions_len = actions_len; - break; - } - - } /* End of switch. */ - } - - err = ovs_dp_upcall(dp, skb, key, &upcall, cutlen); - SKB_RESTORE_FILL_METADATA_DST(skb); - return err; -} - -/* When 'last' is true, sample() should always consume the 'skb'. - * Otherwise, sample() should keep 'skb' intact regardless what - * actions are executed within sample(). - */ -static int sample(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr, - bool last) -{ - struct nlattr *actions; - struct nlattr *sample_arg; - int rem = nla_len(attr); - const struct sample_arg *arg; - bool clone_flow_key; - - /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */ - sample_arg = nla_data(attr); - arg = nla_data(sample_arg); - actions = nla_next(sample_arg, &rem); - - if ((arg->probability != U32_MAX) && - (!arg->probability || prandom_u32() > arg->probability)) { - if (last) - consume_skb(skb); - return 0; - } - - clone_flow_key = !arg->exec; - return clone_execute(dp, skb, key, 0, actions, rem, last, - clone_flow_key); -} - -/* When 'last' is true, clone() should always consume the 'skb'. - * Otherwise, clone() should keep 'skb' intact regardless what - * actions are executed within clone(). - */ -static int clone(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, const struct nlattr *attr, - bool last) -{ - struct nlattr *actions; - struct nlattr *clone_arg; - int rem = nla_len(attr); - bool dont_clone_flow_key; - - /* The first action is always 'OVS_CLONE_ATTR_ARG'. */ - clone_arg = nla_data(attr); - dont_clone_flow_key = nla_get_u32(clone_arg); - actions = nla_next(clone_arg, &rem); - - return clone_execute(dp, skb, key, 0, actions, rem, last, - !dont_clone_flow_key); -} - -static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, - const struct nlattr *attr) -{ - struct ovs_action_hash *hash_act = nla_data(attr); - u32 hash = 0; - - /* OVS_HASH_ALG_L4 is the only possible hash algorithm. */ - hash = skb_get_hash(skb); - hash = jhash_1word(hash, hash_act->hash_basis); - if (!hash) - hash = 0x1; - - key->ovs_flow_hash = hash; -} - -static int execute_set_action(struct sk_buff *skb, - struct sw_flow_key *flow_key, - const struct nlattr *a) -{ - /* Only tunnel set execution is supported without a mask. */ - if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - struct ovs_tunnel_info *tun = nla_data(a); - - ovs_skb_dst_drop(skb); - ovs_dst_hold((struct dst_entry *)tun->tun_dst); - ovs_skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); - return 0; - } - - return -EINVAL; -} - -/* Mask is at the midpoint of the data. */ -#define get_mask(a, type) ((const type)nla_data(a) + 1) - -static int execute_masked_set_action(struct sk_buff *skb, - struct sw_flow_key *flow_key, - const struct nlattr *a) -{ - int err = 0; - - switch (nla_type(a)) { - case OVS_KEY_ATTR_PRIORITY: - OVS_SET_MASKED(skb->priority, nla_get_u32(a), - *get_mask(a, u32 *)); - flow_key->phy.priority = skb->priority; - break; - - case OVS_KEY_ATTR_SKB_MARK: - OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); - flow_key->phy.skb_mark = skb->mark; - break; - - case OVS_KEY_ATTR_TUNNEL_INFO: - /* Masked data not supported for tunnel. */ - err = -EINVAL; - break; - - case OVS_KEY_ATTR_ETHERNET: - err = set_eth_addr(skb, flow_key, nla_data(a), - get_mask(a, struct ovs_key_ethernet *)); - break; - - case OVS_KEY_ATTR_NSH: - err = set_nsh(skb, flow_key, a); - break; - - case OVS_KEY_ATTR_IPV4: - err = set_ipv4(skb, flow_key, nla_data(a), - get_mask(a, struct ovs_key_ipv4 *)); - break; - - case OVS_KEY_ATTR_IPV6: - err = set_ipv6(skb, flow_key, nla_data(a), - get_mask(a, struct ovs_key_ipv6 *)); - break; - - case OVS_KEY_ATTR_TCP: - err = set_tcp(skb, flow_key, nla_data(a), - get_mask(a, struct ovs_key_tcp *)); - break; - - case OVS_KEY_ATTR_UDP: - err = set_udp(skb, flow_key, nla_data(a), - get_mask(a, struct ovs_key_udp *)); - break; - - case OVS_KEY_ATTR_SCTP: - err = set_sctp(skb, flow_key, nla_data(a), - get_mask(a, struct ovs_key_sctp *)); - break; - - case OVS_KEY_ATTR_MPLS: - err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, - __be32 *)); - break; - - case OVS_KEY_ATTR_CT_STATE: - case OVS_KEY_ATTR_CT_ZONE: - case OVS_KEY_ATTR_CT_MARK: - case OVS_KEY_ATTR_CT_LABELS: - case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: - case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: - err = -EINVAL; - break; - } - - return err; -} - -static int execute_recirc(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, - const struct nlattr *a, bool last) -{ - u32 recirc_id; - - if (!is_flow_key_valid(key)) { - int err; - - err = ovs_flow_key_update(skb, key); - if (err) - return err; - } - BUG_ON(!is_flow_key_valid(key)); - - recirc_id = nla_get_u32(a); - return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true); -} - -static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, - const struct nlattr *attr, bool last) -{ - const struct nlattr *actions, *cpl_arg; - const struct check_pkt_len_arg *arg; - int rem = nla_len(attr); - bool clone_flow_key; - - /* The first netlink attribute in 'attr' is always - * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. - */ - cpl_arg = nla_data(attr); - arg = nla_data(cpl_arg); - - if (skb->len <= arg->pkt_len) { - /* Second netlink attribute in 'attr' is always - * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. - */ - actions = nla_next(cpl_arg, &rem); - clone_flow_key = !arg->exec_for_lesser_equal; - } else { - /* Third netlink attribute in 'attr' is always - * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER'. - */ - actions = nla_next(cpl_arg, &rem); - actions = nla_next(actions, &rem); - clone_flow_key = !arg->exec_for_greater; - } - - return clone_execute(dp, skb, key, 0, nla_data(actions), - nla_len(actions), last, clone_flow_key); -} - -/* Execute a list of actions against 'skb'. */ -static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, - const struct nlattr *attr, int len) -{ - const struct nlattr *a; - int rem; - - for (a = attr, rem = len; rem > 0; - a = nla_next(a, &rem)) { - int err = 0; - - switch (nla_type(a)) { - case OVS_ACTION_ATTR_OUTPUT: { - int port = nla_get_u32(a); - struct sk_buff *clone; - - /* Every output action needs a separate clone - * of 'skb', In case the output action is the - * last action, cloning can be avoided. - */ - if (nla_is_last(a, rem)) { - do_output(dp, skb, port, key); - /* 'skb' has been used for output. - */ - return 0; - } - - clone = skb_clone(skb, GFP_ATOMIC); - if (clone) - do_output(dp, clone, port, key); - OVS_CB(skb)->cutlen = 0; - break; - } - - case OVS_ACTION_ATTR_TRUNC: { - struct ovs_action_trunc *trunc = nla_data(a); - - if (skb->len > trunc->max_len) - OVS_CB(skb)->cutlen = skb->len - trunc->max_len; - break; - } - - case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, key, a, attr, - len, OVS_CB(skb)->cutlen); - OVS_CB(skb)->cutlen = 0; - break; - - case OVS_ACTION_ATTR_HASH: - execute_hash(skb, key, a); - break; - - case OVS_ACTION_ATTR_PUSH_MPLS: - err = push_mpls(skb, key, nla_data(a)); - break; - - case OVS_ACTION_ATTR_POP_MPLS: - err = pop_mpls(skb, key, nla_get_be16(a)); - break; - - case OVS_ACTION_ATTR_PUSH_VLAN: - err = push_vlan(skb, key, nla_data(a)); - break; - - case OVS_ACTION_ATTR_POP_VLAN: - err = pop_vlan(skb, key); - break; - - case OVS_ACTION_ATTR_RECIRC: { - bool last = nla_is_last(a, rem); - - err = execute_recirc(dp, skb, key, a, last); - if (last) { - /* If this is the last action, the skb has - * been consumed or freed. - * Return immediately. - */ - return err; - } - break; - } - - case OVS_ACTION_ATTR_SET: - err = execute_set_action(skb, key, nla_data(a)); - break; - - case OVS_ACTION_ATTR_SET_MASKED: - case OVS_ACTION_ATTR_SET_TO_MASKED: - err = execute_masked_set_action(skb, key, nla_data(a)); - break; - - case OVS_ACTION_ATTR_SAMPLE: { - bool last = nla_is_last(a, rem); - - err = sample(dp, skb, key, a, last); - if (last) - return err; - - break; - } - - case OVS_ACTION_ATTR_CT: - if (!is_flow_key_valid(key)) { - err = ovs_flow_key_update(skb, key); - if (err) - return err; - } - - err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, - nla_data(a)); - - /* Hide stolen IP fragments from user space. */ - if (err) - return err == -EINPROGRESS ? 0 : err; - break; - - case OVS_ACTION_ATTR_CT_CLEAR: - err = ovs_ct_clear(skb, key); - break; - - case OVS_ACTION_ATTR_PUSH_ETH: - err = push_eth(skb, key, nla_data(a)); - break; - - case OVS_ACTION_ATTR_POP_ETH: - err = pop_eth(skb, key); - break; - - case OVS_ACTION_ATTR_PUSH_NSH: { - u8 buffer[NSH_HDR_MAX_LEN]; - struct nshhdr *nh = (struct nshhdr *)buffer; - - err = nsh_hdr_from_nlattr(nla_data(a), nh, - NSH_HDR_MAX_LEN); - if (unlikely(err)) - break; - err = push_nsh(skb, key, nh); - break; - } - - case OVS_ACTION_ATTR_POP_NSH: - err = pop_nsh(skb, key); - break; - - case OVS_ACTION_ATTR_METER: - if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { - consume_skb(skb); - return 0; - } - break; - - case OVS_ACTION_ATTR_CLONE: { - bool last = nla_is_last(a, rem); - - err = clone(dp, skb, key, a, last); - if (last) - return err; - break; - } - - case OVS_ACTION_ATTR_CHECK_PKT_LEN: { - bool last = nla_is_last(a, rem); - - err = execute_check_pkt_len(dp, skb, key, a, last); - if (last) - return err; - - break; - } - } - - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - } - - consume_skb(skb); - return 0; -} - -/* Execute the actions on the clone of the packet. The effect of the - * execution does not affect the original 'skb' nor the original 'key'. - * - * The execution may be deferred in case the actions can not be executed - * immediately. - */ -static int clone_execute(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, u32 recirc_id, - const struct nlattr *actions, int len, - bool last, bool clone_flow_key) -{ - struct deferred_action *da; - struct sw_flow_key *clone; - - skb = last ? skb : skb_clone(skb, GFP_ATOMIC); - if (!skb) { - /* Out of memory, skip this action. - */ - return 0; - } - - /* When clone_flow_key is false, the 'key' will not be change - * by the actions, then the 'key' can be used directly. - * Otherwise, try to clone key from the next recursion level of - * 'flow_keys'. If clone is successful, execute the actions - * without deferring. - */ - clone = clone_flow_key ? clone_key(key) : key; - if (clone) { - int err = 0; - - if (actions) { /* Sample action */ - if (clone_flow_key) - __this_cpu_inc(exec_actions_level); - - err = do_execute_actions(dp, skb, clone, - actions, len); - - if (clone_flow_key) - __this_cpu_dec(exec_actions_level); - } else { /* Recirc action */ - clone->recirc_id = recirc_id; - ovs_dp_process_packet(skb, clone); - } - return err; - } - - /* Out of 'flow_keys' space. Defer actions */ - da = add_deferred_actions(skb, key, actions, len); - if (da) { - if (!actions) { /* Recirc action */ - key = &da->pkt_key; - key->recirc_id = recirc_id; - } - } else { - /* Out of per CPU action FIFO space. Drop the 'skb' and - * log an error. - */ - kfree_skb(skb); - - if (net_ratelimit()) { - if (actions) { /* Sample action */ - pr_warn("%s: deferred action limit reached, drop sample action\n", - ovs_dp_name(dp)); - } else { /* Recirc action */ - pr_warn("%s: deferred action limit reached, drop recirc action\n", - ovs_dp_name(dp)); - } - } - } - return 0; -} - -static void process_deferred_actions(struct datapath *dp) -{ - struct action_fifo *fifo = this_cpu_ptr(action_fifos); - - /* Do not touch the FIFO in case there is no deferred actions. */ - if (action_fifo_is_empty(fifo)) - return; - - /* Finishing executing all deferred actions. */ - do { - struct deferred_action *da = action_fifo_get(fifo); - struct sk_buff *skb = da->skb; - struct sw_flow_key *key = &da->pkt_key; - const struct nlattr *actions = da->actions; - int actions_len = da->actions_len; - - if (actions) - do_execute_actions(dp, skb, key, actions, actions_len); - else - ovs_dp_process_packet(skb, key); - } while (!action_fifo_is_empty(fifo)); - - /* Reset FIFO for the next packet. */ - action_fifo_init(fifo); -} - -/* Execute a list of actions against 'skb'. */ -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct sw_flow_actions *acts, - struct sw_flow_key *key) -{ - int err, level; - - level = __this_cpu_inc_return(exec_actions_level); - if (unlikely(level > OVS_RECURSION_LIMIT)) { - net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", - ovs_dp_name(dp)); - kfree_skb(skb); - err = -ENETDOWN; - goto out; - } - - OVS_CB(skb)->acts_origlen = acts->orig_len; - err = do_execute_actions(dp, skb, key, - acts->actions, acts->actions_len); - - if (level == 1) - process_deferred_actions(dp); - -out: - __this_cpu_dec(exec_actions_level); - return err; -} - -int action_fifos_init(void) -{ - action_fifos = alloc_percpu(struct action_fifo); - if (!action_fifos) - return -ENOMEM; - - flow_keys = alloc_percpu(struct action_flow_keys); - if (!flow_keys) { - free_percpu(action_fifos); - return -ENOMEM; - } - - return 0; -} - -void action_fifos_exit(void) -{ - free_percpu(action_fifos); - free_percpu(flow_keys); -} diff --git a/datapath/compat.h b/datapath/compat.h deleted file mode 100644 index b820251a4..000000000 --- a/datapath/compat.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef COMPAT_H -#define COMPAT_H 1 - -#include <linux/in.h> -#include <linux/in_route.h> -#include <linux/netlink.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/xfrm.h> -#include <net/netfilter/ipv6/nf_defrag_ipv6.h> -#include <net/netfilter/nf_conntrack_count.h> - -/* Fix grsecurity patch compilation issue. */ -#ifdef CONSTIFY_PLUGIN -#include <linux/cache.h> -#undef __read_mostly -#define __read_mostly -#endif - -/* Even though vanilla 3.10 kernel has grp->id, RHEL 7 kernel is missing - * this field. */ -#ifdef HAVE_GENL_MULTICAST_GROUP_WITH_ID -#define GROUP_ID(grp) ((grp)->id) -#else -#define GROUP_ID(grp) 0 -#endif - -#ifdef HAVE_NF_IPV6_OPS_FRAGMENT -static inline int __init ip6_output_init(void) { return 0; } -static inline void ip6_output_exit(void) { } -#else -int __init ip6_output_init(void); -void ip6_output_exit(void); -#endif - -static inline int __init compat_init(void) -{ - int err; - - err = ipfrag_init(); - if (err) - return err; - - err = nf_ct_frag6_init(); - if (err) - goto error_ipfrag_exit; - - err = ip6_output_init(); - if (err) - goto error_frag6_exit; - - err = rpl_nf_conncount_modinit(); - if (err) - goto error_nf_conncount_exit; - - return 0; - -error_nf_conncount_exit: - rpl_nf_conncount_modexit(); -error_frag6_exit: - nf_ct_frag6_cleanup(); -error_ipfrag_exit: - rpl_ipfrag_fini(); - return err; -} -static inline void compat_exit(void) -{ - rpl_nf_conncount_modexit(); - ip6_output_exit(); - nf_ct_frag6_cleanup(); - rpl_ipfrag_fini(); -} - -#endif /* compat.h */ diff --git a/datapath/conntrack.c b/datapath/conntrack.c deleted file mode 100644 index fc268aeae..000000000 --- a/datapath/conntrack.c +++ /dev/null @@ -1,2413 +0,0 @@ -/* - * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#include <linux/kconfig.h> -#include <linux/version.h> - -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - -#include <linux/module.h> -#include <linux/openvswitch.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/sctp.h> -#include <linux/static_key.h> -#include <net/ip.h> -#include <net/genetlink.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_count.h> -#include <net/netfilter/nf_conntrack_helper.h> -#include <net/netfilter/nf_conntrack_labels.h> -#include <net/netfilter/nf_conntrack_seqadj.h> -#include <net/netfilter/nf_conntrack_timeout.h> -#include <net/netfilter/nf_conntrack_zones.h> -#include <net/netfilter/ipv6/nf_defrag_ipv6.h> -#include <net/ipv6_frag.h> - -/* Upstream commit 4806e975729f ("netfilter: replace NF_NAT_NEEDED with - * IS_ENABLED(CONFIG_NF_NAT)") replaces the config checking on NF_NAT_NEEDED - * with CONFIG_NF_NAT. We will replace the checking on NF_NAT_NEEDED for the - * newer kernel with the marco in order to keep backward compatiblity. - */ -#ifndef HAVE_CONFIG_NF_NAT_NEEDED -#define CONFIG_NF_NAT_NEEDED CONFIG_NF_NAT -#endif - -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) -/* Starting from upstream commit 3bf195ae6037 ("netfilter: nat: merge - * nf_nat_ipv4,6 into nat core") in kernel 5.1. nf_nat_ipv4,6 are merged - * into nf_nat. In order to keep backward compatibility, we keep the config - * checking as is for the old kernel, and replace them with marco for the - * new kernel. */ -#ifdef HAVE_UPSTREAM_NF_NAT -#include <net/netfilter/nf_nat.h> -#define CONFIG_NF_NAT_IPV4 CONFIG_NF_NAT -#define CONFIG_NF_NAT_IPV6 CONFIG_IPV6 -#else -#include <linux/netfilter/nf_nat.h> -#include <net/netfilter/nf_nat_core.h> -#include <net/netfilter/nf_nat_l3proto.h> -#endif /* HAVE_UPSTREAM_NF_NAT */ -#endif /* CONFIG_NF_NAT_NEEDED */ - -#include "datapath.h" -#include "conntrack.h" -#include "flow.h" -#include "flow_netlink.h" -#include "gso.h" - -#ifndef HAVE_NF_NAT_RANGE2 -#define nf_nat_range2 nf_nat_range -#endif - -struct ovs_ct_len_tbl { - int maxlen; - int minlen; -}; - -/* Metadata mark for masked write to conntrack mark */ -struct md_mark { - u32 value; - u32 mask; -}; - -/* Metadata label for masked write to conntrack label. */ -struct md_labels { - struct ovs_key_ct_labels value; - struct ovs_key_ct_labels mask; -}; - -enum ovs_ct_nat { - OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ - OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ - OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ -}; - -/* Conntrack action context for execution. */ -struct ovs_conntrack_info { - struct nf_conntrack_helper *helper; - struct nf_conntrack_zone zone; - struct nf_conn *ct; - u8 commit : 1; - u8 nat : 3; /* enum ovs_ct_nat */ - u8 random_fully_compat : 1; /* bool */ - u8 force : 1; - u8 have_eventmask : 1; - u16 family; - u32 eventmask; /* Mask of 1 << IPCT_*. */ - struct md_mark mark; - struct md_labels labels; - char timeout[CTNL_TIMEOUT_NAME_MAX]; - struct nf_ct_timeout *nf_ct_timeout; -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) - struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ -#endif -}; - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) -#define OVS_CT_LIMIT_UNLIMITED 0 -#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED -#define CT_LIMIT_HASH_BUCKETS 512 -static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled); - -struct ovs_ct_limit { - /* Elements in ovs_ct_limit_info->limits hash table */ - struct hlist_node hlist_node; - struct rcu_head rcu; - u16 zone; - u32 limit; -}; - -struct ovs_ct_limit_info { - u32 default_limit; - struct hlist_head *limits; - struct nf_conncount_data *data; -}; - -static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = { - [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, }, -}; -#endif - -static bool labels_nonzero(const struct ovs_key_ct_labels *labels); - -static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); - -static u16 key_to_nfproto(const struct sw_flow_key *key) -{ - switch (ntohs(key->eth.type)) { - case ETH_P_IP: - return NFPROTO_IPV4; - case ETH_P_IPV6: - return NFPROTO_IPV6; - default: - return NFPROTO_UNSPEC; - } -} - -/* Map SKB connection state into the values used by flow definition. */ -static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) -{ - u8 ct_state = OVS_CS_F_TRACKED; - - switch (ctinfo) { - case IP_CT_ESTABLISHED_REPLY: - case IP_CT_RELATED_REPLY: - ct_state |= OVS_CS_F_REPLY_DIR; - break; - default: - break; - } - - switch (ctinfo) { - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - ct_state |= OVS_CS_F_ESTABLISHED; - break; - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - ct_state |= OVS_CS_F_RELATED; - break; - case IP_CT_NEW: - ct_state |= OVS_CS_F_NEW; - break; - default: - break; - } - - return ct_state; -} - -static u32 ovs_ct_get_mark(const struct nf_conn *ct) -{ -#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - return ct ? ct->mark : 0; -#else - return 0; -#endif -} - -/* Guard against conntrack labels max size shrinking below 128 bits. */ -#if NF_CT_LABELS_MAX_SIZE < 16 -#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes -#endif - -static void ovs_ct_get_labels(const struct nf_conn *ct, - struct ovs_key_ct_labels *labels) -{ - struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; - - if (cl) - memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); - else - memset(labels, 0, OVS_CT_LABELS_LEN); -} - -static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, - const struct nf_conntrack_tuple *orig, - u8 icmp_proto) -{ - key->ct_orig_proto = orig->dst.protonum; - if (orig->dst.protonum == icmp_proto) { - key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); - key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); - } else { - key->ct.orig_tp.src = orig->src.u.all; - key->ct.orig_tp.dst = orig->dst.u.all; - } -} - -static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, - const struct nf_conntrack_zone *zone, - const struct nf_conn *ct) -{ - key->ct_state = state; - key->ct_zone = zone->id; - key->ct.mark = ovs_ct_get_mark(ct); - ovs_ct_get_labels(ct, &key->ct.labels); - - if (ct) { - const struct nf_conntrack_tuple *orig; - - /* Use the master if we have one. */ - if (ct->master) - ct = ct->master; - orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - - /* IP version must match with the master connection. */ - if (key->eth.type == htons(ETH_P_IP) && - nf_ct_l3num(ct) == NFPROTO_IPV4) { - key->ipv4.ct_orig.src = orig->src.u3.ip; - key->ipv4.ct_orig.dst = orig->dst.u3.ip; - __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); - return; - } else if (key->eth.type == htons(ETH_P_IPV6) && - !sw_flow_key_is_nd(key) && - nf_ct_l3num(ct) == NFPROTO_IPV6) { - key->ipv6.ct_orig.src = orig->src.u3.in6; - key->ipv6.ct_orig.dst = orig->dst.u3.in6; - __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); - return; - } - } - /* Clear 'ct_orig_proto' to mark the non-existence of conntrack - * original direction key fields. - */ - key->ct_orig_proto = 0; -} - -/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has - * previously sent the packet to conntrack via the ct action. If - * 'keep_nat_flags' is true, the existing NAT flags retained, else they are - * initialized from the connection status. - */ -static void ovs_ct_update_key(const struct sk_buff *skb, - const struct ovs_conntrack_info *info, - struct sw_flow_key *key, bool post_ct, - bool keep_nat_flags) -{ - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - u8 state = 0; - - ct = nf_ct_get(skb, &ctinfo); - if (ct) { - state = ovs_ct_get_state(ctinfo); - /* All unconfirmed entries are NEW connections. */ - if (!nf_ct_is_confirmed(ct)) - state |= OVS_CS_F_NEW; - /* OVS persists the related flag for the duration of the - * connection. - */ - if (ct->master) - state |= OVS_CS_F_RELATED; - if (keep_nat_flags) { - state |= key->ct_state & OVS_CS_F_NAT_MASK; - } else { - if (ct->status & IPS_SRC_NAT) - state |= OVS_CS_F_SRC_NAT; - if (ct->status & IPS_DST_NAT) - state |= OVS_CS_F_DST_NAT; - } - zone = nf_ct_zone(ct); - } else if (post_ct) { - state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; - if (info) - zone = &info->zone; - } - __ovs_ct_update_key(key, state, zone, ct); -} - -/* This is called to initialize CT key fields possibly coming in from the local - * stack. - */ -void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) -{ - ovs_ct_update_key(skb, NULL, key, false, false); -} - -#define IN6_ADDR_INITIALIZER(ADDR) \ - { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \ - (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] } - -int ovs_ct_put_key(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, struct sk_buff *skb) -{ - if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state)) - return -EMSGSIZE; - - if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && - nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone)) - return -EMSGSIZE; - - if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && - nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark)) - return -EMSGSIZE; - - if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels), - &output->ct.labels)) - return -EMSGSIZE; - - if (swkey->ct_orig_proto) { - if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ct_tuple_ipv4 orig = { - output->ipv4.ct_orig.src, - output->ipv4.ct_orig.dst, - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; - if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, - sizeof(orig), &orig)) - return -EMSGSIZE; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ct_tuple_ipv6 orig = { - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src), - IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst), - output->ct.orig_tp.src, - output->ct.orig_tp.dst, - output->ct_orig_proto, - }; - if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, - sizeof(orig), &orig)) - return -EMSGSIZE; - } - } - - return 0; -} - -static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, - u32 ct_mark, u32 mask) -{ -#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) - u32 new_mark; - - new_mark = ct_mark | (ct->mark & ~(mask)); - if (ct->mark != new_mark) { - ct->mark = new_mark; - if (nf_ct_is_confirmed(ct)) - nf_conntrack_event_cache(IPCT_MARK, ct); - key->ct.mark = new_mark; - } - - return 0; -#else - return -ENOTSUPP; -#endif -} - -static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct) -{ - struct nf_conn_labels *cl; - - cl = nf_ct_labels_find(ct); - if (!cl) { - nf_ct_labels_ext_add(ct); - cl = nf_ct_labels_find(ct); - } - - return cl; -} - -/* Initialize labels for a new, yet to be committed conntrack entry. Note that - * since the new connection is not yet confirmed, and thus no-one else has - * access to it's labels, we simply write them over. - */ -static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key, - const struct ovs_key_ct_labels *labels, - const struct ovs_key_ct_labels *mask) -{ - struct nf_conn_labels *cl, *master_cl; - bool have_mask = labels_nonzero(mask); - - /* Inherit master's labels to the related connection? */ - master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL; - - if (!master_cl && !have_mask) - return 0; /* Nothing to do. */ - - cl = ovs_ct_get_conn_labels(ct); - if (!cl) - return -ENOSPC; - - /* Inherit the master's labels, if any. Must use memcpy for backport - * as struct assignment only copies the length field in older - * kernels. - */ - if (master_cl) - memcpy(cl->bits, master_cl->bits, OVS_CT_LABELS_LEN); - - if (have_mask) { - u32 *dst = (u32 *)cl->bits; - int i; - - for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) - dst[i] = (dst[i] & ~mask->ct_labels_32[i]) | - (labels->ct_labels_32[i] - & mask->ct_labels_32[i]); - } - - /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the - * IPCT_LABEL bit is set in the event cache. - */ - nf_conntrack_event_cache(IPCT_LABEL, ct); - - memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); - - return 0; -} - -static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, - const struct ovs_key_ct_labels *labels, - const struct ovs_key_ct_labels *mask) -{ - struct nf_conn_labels *cl; - int err; - - cl = ovs_ct_get_conn_labels(ct); - if (!cl) - return -ENOSPC; - - err = nf_connlabels_replace(ct, labels->ct_labels_32, - mask->ct_labels_32, - OVS_CT_LABELS_LEN_32); - if (err) - return err; - - memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); - - return 0; -} - -/* 'skb' should already be pulled to nh_ofs. */ -static int ovs_ct_helper(struct sk_buff *skb, u16 proto) -{ - const struct nf_conntrack_helper *helper; - const struct nf_conn_help *help; - enum ip_conntrack_info ctinfo; - unsigned int protoff; - struct nf_conn *ct; - u8 nexthdr; - int err; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) - bool dst_set = false; - struct rtable rt = { .rt_flags = 0 }; -#endif - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - switch (proto) { - case NFPROTO_IPV4: - protoff = ip_hdrlen(skb); - break; - case NFPROTO_IPV6: { - __be16 frag_off; - int ofs; - - nexthdr = ipv6_hdr(skb)->nexthdr; - ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - protoff = ofs; - break; - } - default: - WARN_ONCE(1, "helper invoked on non-IP family!"); - return NF_DROP; - } - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) - /* Linux 4.5 and older depend on skb_dst being set when recalculating - * checksums after NAT helper has mangled TCP or UDP packet payload. - * skb_dst is cast to a rtable struct and the flags examined. - * Forcing these flags to have RTCF_LOCAL not set ensures checksum mod - * is carried out in the same way as kernel versions > 4.5 - */ - if (ct->status & IPS_NAT_MASK && skb->ip_summed != CHECKSUM_PARTIAL - && !skb_dst(skb)) { - dst_set = true; - skb_dst_set(skb, &rt.dst); - } -#endif - err = helper->help(skb, protoff, ct, ctinfo); - if (err != NF_ACCEPT) - return err; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) - if (dst_set) - skb_dst_set(skb, NULL); -#endif - - /* Adjust seqs after helper. This is needed due to some helpers (e.g., - * FTP with NAT) adusting the TCP payload size when mangling IP - * addresses and/or port numbers in the text-based control connection. - */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) - return NF_DROP; - return NF_ACCEPT; -} - -/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero - * value if 'skb' is freed. - */ -static int handle_fragments(struct net *net, struct sw_flow_key *key, - u16 zone, struct sk_buff *skb) -{ - struct ovs_gso_cb ovs_cb = *OVS_GSO_CB(skb); - int err; - - if (key->eth.type == htons(ETH_P_IP)) { - enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - err = ip_defrag(net, skb, user); - if (err) - return err; - - ovs_cb.dp_cb.mru = IPCB(skb)->frag_max_size; -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - } else if (key->eth.type == htons(ETH_P_IPV6)) { - enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - - memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - err = nf_ct_frag6_gather(net, skb, user); - if (err) { - if (err != -EINPROGRESS) - kfree_skb(skb); - return err; - } - - key->ip.proto = ipv6_hdr(skb)->nexthdr; - ovs_cb.dp_cb.mru = IP6CB(skb)->frag_max_size; -#endif /* IP frag support */ - } else { - kfree_skb(skb); - return -EPFNOSUPPORT; - } - - /* The key extracted from the fragment that completed this datagram - * likely didn't have an L4 header, so regenerate it. - */ - ovs_flow_key_update_l3l4(skb, key); - - key->ip.frag = OVS_FRAG_TYPE_NONE; - skb_clear_hash(skb); - skb->ignore_df = 1; - *OVS_GSO_CB(skb) = ovs_cb; - - return 0; -} - -static struct nf_conntrack_expect * -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, - u16 proto, const struct sk_buff *skb) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_expect *exp; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) - return NULL; - - exp = __nf_ct_expect_find(net, zone, &tuple); - if (exp) { - struct nf_conntrack_tuple_hash *h; - - /* Delete existing conntrack entry, if it clashes with the - * expectation. This can happen since conntrack ALGs do not - * check for clashes between (new) expectations and existing - * conntrack entries. nf_conntrack_in() will check the - * expectations only if a conntrack entry can not be found, - * which can lead to OVS finding the expectation (here) in the - * init direction, but which will not be removed by the - * nf_conntrack_in() call, if a matching conntrack entry is - * found instead. In this case all init direction packets - * would be reported as new related packets, while reply - * direction packets would be reported as un-related - * established packets. - */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (h) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - nf_ct_delete(ct, 0, 0); - nf_conntrack_put(&ct->ct_general); - } - } - - return exp; -} - -/* This replicates logic from nf_conntrack_core.c that is not exported. */ -static enum ip_conntrack_info -ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) -{ - const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) - return IP_CT_ESTABLISHED_REPLY; - /* Once we've had two way comms, always ESTABLISHED. */ - if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - return IP_CT_ESTABLISHED; - if (test_bit(IPS_EXPECTED_BIT, &ct->status)) - return IP_CT_RELATED; - return IP_CT_NEW; -} - -/* Find an existing connection which this packet belongs to without - * re-attributing statistics or modifying the connection state. This allows an - * skb->_nfct lost due to an upcall to be recovered during actions execution. - * - * Must be called with rcu_read_lock. - * - * On success, populates skb->_nfct and returns the connection. Returns NULL - * if there is no existing entry. - */ -static struct nf_conn * -ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, - u8 l3num, struct sk_buff *skb, bool natted) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, - net, &tuple)) { - pr_debug("ovs_ct_find_existing: Can't get tuple\n"); - return NULL; - } - - /* Must invert the tuple if skb has been transformed by NAT. */ - if (natted) { - struct nf_conntrack_tuple inverse; - - if (!rpl_nf_ct_invert_tuple(&inverse, &tuple)) { - pr_debug("ovs_ct_find_existing: Inversion failed!\n"); - return NULL; - } - tuple = inverse; - } - - /* look for tuple match */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (!h) - return NULL; /* Not found. */ - - ct = nf_ct_tuplehash_to_ctrack(h); - - /* Inverted packet tuple matches the reverse direction conntrack tuple, - * select the other tuplehash to get the right 'ctinfo' bits for this - * packet. - */ - if (natted) - h = &ct->tuplehash[!h->tuple.dst.dir]; - - nf_ct_set(skb, ct, ovs_ct_get_info(h)); - return ct; -} - -static -struct nf_conn *ovs_ct_executed(struct net *net, - const struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb, - bool *ct_executed) -{ - struct nf_conn *ct = NULL; - - /* If no ct, check if we have evidence that an existing conntrack entry - * might be found for this skb. This happens when we lose a skb->_nfct - * due to an upcall, or if the direction is being forced. If the - * connection was not confirmed, it is not cached and needs to be run - * through conntrack again. - */ - *ct_executed = (key->ct_state & OVS_CS_F_TRACKED) && - !(key->ct_state & OVS_CS_F_INVALID) && - (key->ct_zone == info->zone.id); - - if (*ct_executed || (!key->ct_state && info->force)) { - ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, - !!(key->ct_state & - OVS_CS_F_NAT_MASK)); - } - - return ct; -} - -/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ -static bool skb_nfct_cached(struct net *net, - const struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - bool ct_executed = true; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - ct = ovs_ct_executed(net, key, info, skb, &ct_executed); - - if (ct) - nf_ct_get(skb, &ctinfo); - else - return false; - - if (!net_eq(net, read_pnet(&ct->ct_net))) - return false; - if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) - return false; - if (info->helper) { - struct nf_conn_help *help; - - help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); - if (help && rcu_access_pointer(help->helper) != info->helper) - return false; - } - if (info->nf_ct_timeout) { - struct nf_conn_timeout *timeout_ext; - - timeout_ext = nf_ct_timeout_find(ct); - if (!timeout_ext || info->nf_ct_timeout != - rcu_dereference(timeout_ext->timeout)) - return false; - } - /* Force conntrack entry direction to the current packet? */ - if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { - /* Delete the conntrack entry if confirmed, else just release - * the reference. - */ - if (nf_ct_is_confirmed(ct)) - nf_ct_delete(ct, 0, 0); - - nf_conntrack_put(&ct->ct_general); - nf_ct_set(skb, NULL, 0); - return false; - } - - return ct_executed; -} - -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. - */ -static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype) -{ - int hooknum, nh_off, err = NF_ACCEPT; - - nh_off = skb_network_offset(skb); - skb_pull_rcsum(skb, nh_off); - - /* See HOOK2MANIP(). */ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && - skb->protocol == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto push; - } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto push; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - /* fall through */ - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto push; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto push; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); -push: - skb_push(skb, nh_off); - skb_postpush_rcsum(skb, skb->data, nh_off); - - return err; -} - -static void ovs_nat_update_key(struct sw_flow_key *key, - const struct sk_buff *skb, - enum nf_nat_manip_type maniptype) -{ - if (maniptype == NF_NAT_MANIP_SRC) { - __be16 src; - - key->ct_state |= OVS_CS_F_SRC_NAT; - if (key->eth.type == htons(ETH_P_IP)) - key->ipv4.addr.src = ip_hdr(skb)->saddr; - else if (key->eth.type == htons(ETH_P_IPV6)) - memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, - sizeof(key->ipv6.addr.src)); - else - return; - - if (key->ip.proto == IPPROTO_UDP) - src = udp_hdr(skb)->source; - else if (key->ip.proto == IPPROTO_TCP) - src = tcp_hdr(skb)->source; - else if (key->ip.proto == IPPROTO_SCTP) - src = sctp_hdr(skb)->source; - else - return; - - key->tp.src = src; - } else { - __be16 dst; - - key->ct_state |= OVS_CS_F_DST_NAT; - if (key->eth.type == htons(ETH_P_IP)) - key->ipv4.addr.dst = ip_hdr(skb)->daddr; - else if (key->eth.type == htons(ETH_P_IPV6)) - memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, - sizeof(key->ipv6.addr.dst)); - else - return; - - if (key->ip.proto == IPPROTO_UDP) - dst = udp_hdr(skb)->dest; - else if (key->ip.proto == IPPROTO_TCP) - dst = tcp_hdr(skb)->dest; - else if (key->ip.proto == IPPROTO_SCTP) - dst = sctp_hdr(skb)->dest; - else - return; - - key->tp.dst = dst; - } -} - -/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ -static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - enum nf_nat_manip_type maniptype; - int err; - -#ifdef HAVE_NF_CT_IS_UNTRACKED - if (nf_ct_is_untracked(ct)) { - /* A NAT action may only be performed on tracked packets. */ - return NF_ACCEPT; - } -#endif /* HAVE_NF_CT_IS_UNTRACKED */ - - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_ACCEPT; /* Can't NAT. */ - - /* Determine NAT type. - * Check if the NAT type can be deduced from the tracked connection. - * Make sure new expected connections (IP_CT_RELATED) are NATted only - * when committing. - */ - if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW && - ct->status & IPS_NAT_MASK && - (ctinfo != IP_CT_RELATED || info->commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (info->nat & OVS_CT_SRC_NAT) { - maniptype = NF_NAT_MANIP_SRC; - } else if (info->nat & OVS_CT_DST_NAT) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; /* Connection is not NATed. */ - } - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype); - - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, - maniptype); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC); - } - } - - /* Mark NAT done if successful and update the flow key. */ - if (err == NF_ACCEPT) - ovs_nat_update_key(key, skb, maniptype); - - return err; -} -#else /* !CONFIG_NF_NAT_NEEDED */ -static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - return NF_ACCEPT; -} -#endif - -/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if - * not done already. Update key with new CT state after passing the packet - * through conntrack. - * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be - * set to NULL and 0 will be returned. - */ -static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - /* If we are recirculating packets to match on conntrack fields and - * committing with a separate conntrack action, then we don't need to - * actually run the packet through conntrack twice unless it's for a - * different zone. - */ - bool cached = skb_nfct_cached(net, key, info, skb); - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - if (!cached) { - struct nf_hook_state state = { - .hook = NF_INET_PRE_ROUTING, - .pf = info->family, - .net = net, - }; - struct nf_conn *tmpl = info->ct; - int err; - - /* Associate skb with specified zone. */ - if (tmpl) { - if (skb_nfct(skb)) - nf_conntrack_put(skb_nfct(skb)); - nf_conntrack_get(&tmpl->ct_general); - nf_ct_set(skb, tmpl, IP_CT_NEW); - } - - err = nf_conntrack_in(skb, &state); - if (err != NF_ACCEPT) - return -ENOENT; - - /* Clear CT state NAT flags to mark that we have not yet done - * NAT after the nf_conntrack_in() call. We can actually clear - * the whole state, as it will be re-initialized below. - */ - key->ct_state = 0; - - /* Update the key, but keep the NAT flags. */ - ovs_ct_update_key(skb, info, key, true, true); - } - - ct = nf_ct_get(skb, &ctinfo); - if (ct) { - bool add_helper = false; - - /* Packets starting a new connection must be NATted before the - * helper, so that the helper knows about the NAT. We enforce - * this by delaying both NAT and helper calls for unconfirmed - * connections until the committing CT action. For later - * packets NAT and Helper may be called in either order. - * - * NAT will be done only if the CT action has NAT, and only - * once per packet (per zone), as guarded by the NAT bits in - * the key->ct_state. - */ - if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) && - (nf_ct_is_confirmed(ct) || info->commit) && - ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) { - return -EINVAL; - } - - /* Userspace may decide to perform a ct lookup without a helper - * specified followed by a (recirculate and) commit with one, - * or attach a helper in a later commit. Therefore, for - * connections which we will commit, we may need to attach - * the helper here. - */ - if (info->commit && info->helper && !nfct_help(ct)) { - int err = __nf_ct_try_assign_helper(ct, info->ct, - GFP_ATOMIC); - if (err) - return err; - add_helper = true; - - /* helper installed, add seqadj if NAT is required */ - if (info->nat && !nfct_seqadj(ct)) { - if (!nfct_seqadj_ext_add(ct)) - return -EINVAL; - } - } - - /* Call the helper only if: - * - nf_conntrack_in() was executed above ("!cached") or a - * helper was just attached ("add_helper") for a confirmed - * connection, or - * - When committing an unconfirmed connection. - */ - if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : - info->commit) && - ovs_ct_helper(skb, info->family) != NF_ACCEPT) { - return -EINVAL; - } - } - - return 0; -} - -/* Lookup connection and read fields into key. */ -static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - struct nf_conntrack_expect *exp; - - /* If we pass an expected packet through nf_conntrack_in() the - * expectation is typically removed, but the packet could still be - * lost in upcall processing. To prevent this from happening we - * perform an explicit expectation lookup. Expected connections are - * always new, and will be passed through conntrack only when they are - * committed, as it is OK to remove the expectation at that time. - */ - exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); - if (exp) { - u8 state; - - /* NOTE: New connections are NATted and Helped only when - * committed, so we are not calling into NAT here. - */ - state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - struct nf_conn *ct; - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - - ct = (struct nf_conn *)skb_nfct(skb); - if (ct) - nf_ct_deliver_cached_events(ct); - } - - return 0; -} - -static bool labels_nonzero(const struct ovs_key_ct_labels *labels) -{ - size_t i; - - for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) - if (labels->ct_labels_32[i]) - return true; - - return false; -} - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) -static struct hlist_head *ct_limit_hash_bucket( - const struct ovs_ct_limit_info *info, u16 zone) -{ - return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)]; -} - -/* Call with ovs_mutex */ -static void ct_limit_set(const struct ovs_ct_limit_info *info, - struct ovs_ct_limit *new_ct_limit) -{ - struct ovs_ct_limit *ct_limit; - struct hlist_head *head; - - head = ct_limit_hash_bucket(info, new_ct_limit->zone); - hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { - if (ct_limit->zone == new_ct_limit->zone) { - hlist_replace_rcu(&ct_limit->hlist_node, - &new_ct_limit->hlist_node); - kfree_rcu(ct_limit, rcu); - return; - } - } - - hlist_add_head_rcu(&new_ct_limit->hlist_node, head); -} - -/* Call with ovs_mutex */ -static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone) -{ - struct ovs_ct_limit *ct_limit; - struct hlist_head *head; - struct hlist_node *n; - - head = ct_limit_hash_bucket(info, zone); - hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) { - if (ct_limit->zone == zone) { - hlist_del_rcu(&ct_limit->hlist_node); - kfree_rcu(ct_limit, rcu); - return; - } - } -} - -/* Call with RCU read lock */ -static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) -{ - struct ovs_ct_limit *ct_limit; - struct hlist_head *head; - - head = ct_limit_hash_bucket(info, zone); - hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { - if (ct_limit->zone == zone) - return ct_limit->limit; - } - - return info->default_limit; -} - -static int ovs_ct_check_limit(struct net *net, - const struct ovs_conntrack_info *info, - const struct nf_conntrack_tuple *tuple) -{ - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; - u32 per_zone_limit, connections; - u32 conncount_key; - - conncount_key = info->zone.id; - - per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id); - if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) - return 0; - - connections = nf_conncount_count(net, ct_limit_info->data, - &conncount_key, tuple, &info->zone); - if (connections > per_zone_limit) - return -ENOMEM; - - return 0; -} -#endif - -/* Lookup connection and confirm if unconfirmed. */ -static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, - const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - int err; - - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; - - /* The connection could be invalid, in which case this is a no-op.*/ - ct = nf_ct_get(skb, &ctinfo); - if (!ct) - return 0; - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) - if (static_branch_unlikely(&ovs_ct_limit_enabled)) { - if (!nf_ct_is_confirmed(ct)) { - err = ovs_ct_check_limit(net, info, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - if (err) { - net_warn_ratelimited("openvswitch: zone: %u " - "exceeds conntrack limit\n", - info->zone.id); - return err; - } - } - } -#endif - - /* Set the conntrack event mask if given. NEW and DELETE events have - * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener - * typically would receive many kinds of updates. Setting the event - * mask allows those events to be filtered. The set event mask will - * remain in effect for the lifetime of the connection unless changed - * by a further CT action with both the commit flag and the eventmask - * option. */ - if (info->have_eventmask) { - struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct); - - if (cache) - cache->ctmask = info->eventmask; - } - - /* Apply changes before confirming the connection so that the initial - * conntrack NEW netlink event carries the values given in the CT - * action. - */ - if (info->mark.mask) { - err = ovs_ct_set_mark(ct, key, info->mark.value, - info->mark.mask); - if (err) - return err; - } - if (!nf_ct_is_confirmed(ct)) { - err = ovs_ct_init_labels(ct, key, &info->labels.value, - &info->labels.mask); - if (err) - return err; - } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - labels_nonzero(&info->labels.mask)) { - err = ovs_ct_set_labels(ct, key, &info->labels.value, - &info->labels.mask); - if (err) - return err; - } - /* This will take care of sending queued events even if the connection - * is already confirmed. - */ - if (nf_conntrack_confirm(skb) != NF_ACCEPT) - return -EINVAL; - - return 0; -} - -/* Trim the skb to the length specified by the IP/IPv6 header, - * removing any trailing lower-layer padding. This prepares the skb - * for higher-layer processing that assumes skb->len excludes padding - * (such as nf_ip_checksum). The caller needs to pull the skb to the - * network header, and ensure ip_hdr/ipv6_hdr points to valid data. - */ -static int ovs_skb_network_trim(struct sk_buff *skb) -{ - unsigned int len; - int err; - - switch (skb->protocol) { - case htons(ETH_P_IP): - len = ntohs(ip_hdr(skb)->tot_len); - break; - case htons(ETH_P_IPV6): - len = sizeof(struct ipv6hdr) - + ntohs(ipv6_hdr(skb)->payload_len); - break; - default: - len = skb->len; - } - - err = pskb_trim_rcsum(skb, len); - if (err) - kfree_skb(skb); - - return err; -} - -/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero - * value if 'skb' is freed. - */ -int ovs_ct_execute(struct net *net, struct sk_buff *skb, - struct sw_flow_key *key, - const struct ovs_conntrack_info *info) -{ - int nh_ofs; - int err; - - /* The conntrack module expects to be working at L3. */ - nh_ofs = skb_network_offset(skb); - skb_pull_rcsum(skb, nh_ofs); - - err = ovs_skb_network_trim(skb); - if (err) - return err; - - if (key->ip.frag != OVS_FRAG_TYPE_NONE) { - err = handle_fragments(net, key, info->zone.id, skb); - if (err) - return err; - } - - if (info->commit) - err = ovs_ct_commit(net, key, info, skb); - else - err = ovs_ct_lookup(net, key, info, skb); - - skb_push(skb, nh_ofs); - skb_postpush_rcsum(skb, skb->data, nh_ofs); - if (err) - kfree_skb(skb); - return err; -} - -int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) -{ - if (skb_nfct(skb)) { - nf_conntrack_put(skb_nfct(skb)); -#ifdef HAVE_IP_CT_UNTRACKED - nf_ct_set(skb, NULL, IP_CT_UNTRACKED); -#else - nf_ct_set(skb, NULL, 0); -#endif - ovs_ct_fill_key(skb, key); - } - - return 0; -} - -static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, - const struct sw_flow_key *key, bool log) -{ - struct nf_conntrack_helper *helper; - struct nf_conn_help *help; - int ret = 0; - - helper = nf_conntrack_helper_try_module_get(name, info->family, - key->ip.proto); - if (!helper) { - OVS_NLERR(log, "Unknown helper \"%s\"", name); - return -EINVAL; - } - - help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); - if (!help) { - nf_conntrack_helper_put(helper); - return -ENOMEM; - } - -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) - if (info->nat) { - ret = nf_nat_helper_try_module_get(name, info->family, - key->ip.proto); - if (ret) { - nf_conntrack_helper_put(helper); - OVS_NLERR(log, "Failed to load \"%s\" NAT helper, error: %d", - name, ret); - return ret; - } - } -#endif - - rcu_assign_pointer(help->helper, helper); - info->helper = helper; - return ret; -} - -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) -static int parse_nat(const struct nlattr *attr, - struct ovs_conntrack_info *info, bool log) -{ - struct nlattr *a; - int rem; - bool have_ip_max = false; - bool have_proto_max = false; - bool ip_vers = (info->family == NFPROTO_IPV6); - - nla_for_each_nested(a, attr, rem) { - static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { - [OVS_NAT_ATTR_SRC] = {0, 0}, - [OVS_NAT_ATTR_DST] = {0, 0}, - [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), - sizeof(struct in6_addr)}, - [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), - sizeof(struct in6_addr)}, - [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)}, - [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)}, - [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, - [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, - [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, - }; - int type = nla_type(a); - - if (type > OVS_NAT_ATTR_MAX) { - OVS_NLERR(log, "Unknown NAT attribute (type=%d, max=%d)", - type, OVS_NAT_ATTR_MAX); - return -EINVAL; - } - - if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { - OVS_NLERR(log, "NAT attribute type %d has unexpected length (%d != %d)", - type, nla_len(a), - ovs_nat_attr_lens[type][ip_vers]); - return -EINVAL; - } - - switch (type) { - case OVS_NAT_ATTR_SRC: - case OVS_NAT_ATTR_DST: - if (info->nat) { - OVS_NLERR(log, "Only one type of NAT may be specified"); - return -ERANGE; - } - info->nat |= OVS_CT_NAT; - info->nat |= ((type == OVS_NAT_ATTR_SRC) - ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); - break; - - case OVS_NAT_ATTR_IP_MIN: - nla_memcpy(&info->range.min_addr, a, - sizeof(info->range.min_addr)); - info->range.flags |= NF_NAT_RANGE_MAP_IPS; - break; - - case OVS_NAT_ATTR_IP_MAX: - have_ip_max = true; - nla_memcpy(&info->range.max_addr, a, - sizeof(info->range.max_addr)); - info->range.flags |= NF_NAT_RANGE_MAP_IPS; - break; - - case OVS_NAT_ATTR_PROTO_MIN: - info->range.min_proto.all = htons(nla_get_u16(a)); - info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - break; - - case OVS_NAT_ATTR_PROTO_MAX: - have_proto_max = true; - info->range.max_proto.all = htons(nla_get_u16(a)); - info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - break; - - case OVS_NAT_ATTR_PERSISTENT: - info->range.flags |= NF_NAT_RANGE_PERSISTENT; - break; - - case OVS_NAT_ATTR_PROTO_HASH: - info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; - break; - - case OVS_NAT_ATTR_PROTO_RANDOM: -#ifdef NF_NAT_RANGE_PROTO_RANDOM_FULLY - info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; -#else - info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; - info->random_fully_compat = true; -#endif - break; - - default: - OVS_NLERR(log, "Unknown nat attribute (%d)", type); - return -EINVAL; - } - } - - if (rem > 0) { - OVS_NLERR(log, "NAT attribute has %d unknown bytes", rem); - return -EINVAL; - } - if (!info->nat) { - /* Do not allow flags if no type is given. */ - if (info->range.flags) { - OVS_NLERR(log, - "NAT flags may be given only when NAT range (SRC or DST) is also specified." - ); - return -EINVAL; - } - info->nat = OVS_CT_NAT; /* NAT existing connections. */ - } else if (!info->commit) { - OVS_NLERR(log, - "NAT attributes may be specified only when CT COMMIT flag is also specified." - ); - return -EINVAL; - } - /* Allow missing IP_MAX. */ - if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { - memcpy(&info->range.max_addr, &info->range.min_addr, - sizeof(info->range.max_addr)); - } - /* Allow missing PROTO_MAX. */ - if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && - !have_proto_max) { - info->range.max_proto.all = info->range.min_proto.all; - } - return 0; -} -#endif - -static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { - [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, - [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, - [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), - .maxlen = sizeof(u16) }, - [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), - .maxlen = sizeof(struct md_mark) }, - [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), - .maxlen = sizeof(struct md_labels) }, - [OVS_CT_ATTR_HELPER] = { .minlen = 1, - .maxlen = NF_CT_HELPER_NAME_LEN }, -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) - /* NAT length is checked when parsing the nested attributes. */ - [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, -#endif - [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), - .maxlen = sizeof(u32) }, - [OVS_CT_ATTR_TIMEOUT] = { .minlen = 1, - .maxlen = CTNL_TIMEOUT_NAME_MAX }, -}; - -static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, - const char **helper, bool log) -{ - struct nlattr *a; - int rem; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - int maxlen; - int minlen; - - if (type > OVS_CT_ATTR_MAX) { - OVS_NLERR(log, - "Unknown conntrack attr (type=%d, max=%d)", - type, OVS_CT_ATTR_MAX); - return -EINVAL; - } - - maxlen = ovs_ct_attr_lens[type].maxlen; - minlen = ovs_ct_attr_lens[type].minlen; - if (nla_len(a) < minlen || nla_len(a) > maxlen) { - OVS_NLERR(log, - "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", - type, nla_len(a), maxlen); - return -EINVAL; - } - - switch (type) { - case OVS_CT_ATTR_FORCE_COMMIT: - info->force = true; - /* fall through. */ - case OVS_CT_ATTR_COMMIT: - info->commit = true; - break; -#ifdef CONFIG_NF_CONNTRACK_ZONES - case OVS_CT_ATTR_ZONE: - info->zone.id = nla_get_u16(a); - break; -#endif -#ifdef CONFIG_NF_CONNTRACK_MARK - case OVS_CT_ATTR_MARK: { - struct md_mark *mark = nla_data(a); - - if (!mark->mask) { - OVS_NLERR(log, "ct_mark mask cannot be 0"); - return -EINVAL; - } - info->mark = *mark; - break; - } -#endif -#ifdef CONFIG_NF_CONNTRACK_LABELS - case OVS_CT_ATTR_LABELS: { - struct md_labels *labels = nla_data(a); - - if (!labels_nonzero(&labels->mask)) { - OVS_NLERR(log, "ct_labels mask cannot be 0"); - return -EINVAL; - } - info->labels = *labels; - break; - } -#endif - case OVS_CT_ATTR_HELPER: - *helper = nla_data(a); - if (!memchr(*helper, '\0', nla_len(a))) { - OVS_NLERR(log, "Invalid conntrack helper"); - return -EINVAL; - } - break; -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) - case OVS_CT_ATTR_NAT: { - int err = parse_nat(a, info, log); - - if (err) - return err; - break; - } -#endif - case OVS_CT_ATTR_EVENTMASK: - info->have_eventmask = true; - info->eventmask = nla_get_u32(a); - break; -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - case OVS_CT_ATTR_TIMEOUT: - memcpy(info->timeout, nla_data(a), nla_len(a)); - if (!memchr(info->timeout, '\0', nla_len(a))) { - OVS_NLERR(log, "Invalid conntrack timeout"); - return -EINVAL; - } - break; -#endif - - default: - OVS_NLERR(log, "Unknown conntrack attr (%d)", - type); - return -EINVAL; - } - } - -#ifdef CONFIG_NF_CONNTRACK_MARK - if (!info->commit && info->mark.mask) { - OVS_NLERR(log, - "Setting conntrack mark requires 'commit' flag."); - return -EINVAL; - } -#endif -#ifdef CONFIG_NF_CONNTRACK_LABELS - if (!info->commit && labels_nonzero(&info->labels.mask)) { - OVS_NLERR(log, - "Setting conntrack labels requires 'commit' flag."); - return -EINVAL; - } -#endif - if (rem > 0) { - OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); - return -EINVAL; - } - - return 0; -} - -bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) -{ - if (attr == OVS_KEY_ATTR_CT_STATE) - return true; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && - attr == OVS_KEY_ATTR_CT_ZONE) - return true; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && - attr == OVS_KEY_ATTR_CT_MARK) - return true; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - attr == OVS_KEY_ATTR_CT_LABELS) { - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - return ovs_net->xt_label; - } - - return false; -} - -int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, bool log) -{ - struct ovs_conntrack_info ct_info; - const char *helper = NULL; - u16 family; - int err; - - family = key_to_nfproto(key); - if (family == NFPROTO_UNSPEC) { - OVS_NLERR(log, "ct family unspecified"); - return -EINVAL; - } - - memset(&ct_info, 0, sizeof(ct_info)); - ct_info.family = family; - - nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, - NF_CT_DEFAULT_ZONE_DIR, 0); - - err = parse_ct(attr, &ct_info, &helper, log); - if (err) - return err; - - /* Set up template for tracking connections in specific zones. */ - ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); - if (!ct_info.ct) { - OVS_NLERR(log, "Failed to allocate conntrack template"); - return -ENOMEM; - } - - if (ct_info.timeout[0]) { - if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, - ct_info.timeout)) - pr_info_ratelimited("Failed to associated timeout " - "policy `%s'\n", ct_info.timeout); - else - ct_info.nf_ct_timeout = rcu_dereference( - nf_ct_timeout_find(ct_info.ct)->timeout); - - } - - if (helper) { - err = ovs_ct_add_helper(&ct_info, helper, key, log); - if (err) - goto err_free_ct; - } - - err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, - sizeof(ct_info), log); - if (err) - goto err_free_ct; - - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); - nf_conntrack_get(&ct_info.ct->ct_general); - return 0; -err_free_ct: - __ovs_ct_free_action(&ct_info); - return err; -} - -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) -static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - struct nlattr *start; - - start = nla_nest_start_noflag(skb, OVS_CT_ATTR_NAT); - if (!start) - return false; - - if (info->nat & OVS_CT_SRC_NAT) { - if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) - return false; - } else if (info->nat & OVS_CT_DST_NAT) { - if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) - return false; - } else { - goto out; - } - - if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { - if (IS_ENABLED(CONFIG_NF_NAT_IPV4) && - info->family == NFPROTO_IPV4) { - if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, - info->range.min_addr.ip) || - (info->range.max_addr.ip - != info->range.min_addr.ip && - (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, - info->range.max_addr.ip)))) - return false; - } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) && - info->family == NFPROTO_IPV6) { - if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, - &info->range.min_addr.in6) || - (memcmp(&info->range.max_addr.in6, - &info->range.min_addr.in6, - sizeof(info->range.max_addr.in6)) && - (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, - &info->range.max_addr.in6)))) - return false; - } else { - return false; - } - } - if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && - (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, - ntohs(info->range.min_proto.all)) || - (info->range.max_proto.all != info->range.min_proto.all && - nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, - ntohs(info->range.max_proto.all))))) - return false; - - if (info->range.flags & NF_NAT_RANGE_PERSISTENT && - nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) - return false; - if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && - nla_put_flag(skb, info->random_fully_compat - ? OVS_NAT_ATTR_PROTO_RANDOM - : OVS_NAT_ATTR_PROTO_HASH)) - return false; -#ifdef NF_NAT_RANGE_PROTO_RANDOM_FULLY - if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && - nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) - return false; -#endif -out: - nla_nest_end(skb, start); - - return true; -} -#endif - -int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, - struct sk_buff *skb) -{ - struct nlattr *start; - - start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CT); - if (!start) - return -EMSGSIZE; - - if (ct_info->commit && nla_put_flag(skb, ct_info->force - ? OVS_CT_ATTR_FORCE_COMMIT - : OVS_CT_ATTR_COMMIT)) - return -EMSGSIZE; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && - nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) - return -EMSGSIZE; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && - nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), - &ct_info->mark)) - return -EMSGSIZE; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && - labels_nonzero(&ct_info->labels.mask) && - nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), - &ct_info->labels)) - return -EMSGSIZE; - if (ct_info->helper) { - if (nla_put_string(skb, OVS_CT_ATTR_HELPER, - ct_info->helper->name)) - return -EMSGSIZE; - } - if (ct_info->have_eventmask && - nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) - return -EMSGSIZE; - if (ct_info->timeout[0]) { - if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout)) - return -EMSGSIZE; - } - -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) - if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) - return -EMSGSIZE; -#endif - nla_nest_end(skb, start); - - return 0; -} - -void ovs_ct_free_action(const struct nlattr *a) -{ - struct ovs_conntrack_info *ct_info = nla_data(a); - - __ovs_ct_free_action(ct_info); -} - -static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) -{ - if (ct_info->helper) { -#if IS_ENABLED(CONFIG_NF_NAT_NEEDED) - if (ct_info->nat) - nf_nat_helper_put(ct_info->helper); -#endif - nf_conntrack_helper_put(ct_info->helper); - } - if (ct_info->ct) { - if (ct_info->timeout[0]) - nf_ct_destroy_timeout(ct_info->ct); - nf_ct_tmpl_free(ct_info->ct); - } -} - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) -static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net) -{ - int i, err; - - ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info), - GFP_KERNEL); - if (!ovs_net->ct_limit_info) - return -ENOMEM; - - ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT; - ovs_net->ct_limit_info->limits = - kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head), - GFP_KERNEL); - if (!ovs_net->ct_limit_info->limits) { - kfree(ovs_net->ct_limit_info); - return -ENOMEM; - } - - for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]); - - ovs_net->ct_limit_info->data = - nf_conncount_init(net, NFPROTO_INET, sizeof(u32)); - - if (IS_ERR(ovs_net->ct_limit_info->data)) { - err = PTR_ERR(ovs_net->ct_limit_info->data); - kfree(ovs_net->ct_limit_info->limits); - kfree(ovs_net->ct_limit_info); - pr_err("openvswitch: failed to init nf_conncount %d\n", err); - return err; - } - return 0; -} - -static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) -{ - const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info; - int i; - - nf_conncount_destroy(net, NFPROTO_INET, info->data); - for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { - struct hlist_head *head = &info->limits[i]; - struct ovs_ct_limit *ct_limit; - - hlist_for_each_entry_rcu(ct_limit, head, hlist_node, - lockdep_ovsl_is_held()) - kfree_rcu(ct_limit, rcu); - } - kfree(ovs_net->ct_limit_info->limits); - kfree(ovs_net->ct_limit_info); -} - -static struct sk_buff * -ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd, - struct ovs_header **ovs_reply_header) -{ - struct ovs_header *ovs_header = info->userhdr; - struct sk_buff *skb; - - skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - *ovs_reply_header = genlmsg_put(skb, info->snd_portid, - info->snd_seq, - &dp_ct_limit_genl_family, 0, cmd); - - if (!*ovs_reply_header) { - nlmsg_free(skb); - return ERR_PTR(-EMSGSIZE); - } - (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; - - return skb; -} - -static bool check_zone_id(int zone_id, u16 *pzone) -{ - if (zone_id >= 0 && zone_id <= 65535) { - *pzone = (u16)zone_id; - return true; - } - return false; -} - -static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit, - struct ovs_ct_limit_info *info) -{ - struct ovs_zone_limit *zone_limit; - int rem; - u16 zone; - - rem = NLA_ALIGN(nla_len(nla_zone_limit)); - zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); - - while (rem >= sizeof(*zone_limit)) { - if (unlikely(zone_limit->zone_id == - OVS_ZONE_LIMIT_DEFAULT_ZONE)) { - ovs_lock(); - info->default_limit = zone_limit->limit; - ovs_unlock(); - } else if (unlikely(!check_zone_id( - zone_limit->zone_id, &zone))) { - OVS_NLERR(true, "zone id is out of range"); - } else { - struct ovs_ct_limit *ct_limit; - - ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL); - if (!ct_limit) - return -ENOMEM; - - ct_limit->zone = zone; - ct_limit->limit = zone_limit->limit; - - ovs_lock(); - ct_limit_set(info, ct_limit); - ovs_unlock(); - } - rem -= NLA_ALIGN(sizeof(*zone_limit)); - zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + - NLA_ALIGN(sizeof(*zone_limit))); - } - - if (rem) - OVS_NLERR(true, "set zone limit has %d unknown bytes", rem); - - return 0; -} - -static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit, - struct ovs_ct_limit_info *info) -{ - struct ovs_zone_limit *zone_limit; - int rem; - u16 zone; - - rem = NLA_ALIGN(nla_len(nla_zone_limit)); - zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); - - while (rem >= sizeof(*zone_limit)) { - if (unlikely(zone_limit->zone_id == - OVS_ZONE_LIMIT_DEFAULT_ZONE)) { - ovs_lock(); - info->default_limit = OVS_CT_LIMIT_DEFAULT; - ovs_unlock(); - } else if (unlikely(!check_zone_id( - zone_limit->zone_id, &zone))) { - OVS_NLERR(true, "zone id is out of range"); - } else { - ovs_lock(); - ct_limit_del(info, zone); - ovs_unlock(); - } - rem -= NLA_ALIGN(sizeof(*zone_limit)); - zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + - NLA_ALIGN(sizeof(*zone_limit))); - } - - if (rem) - OVS_NLERR(true, "del zone limit has %d unknown bytes", rem); - - return 0; -} - -static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info, - struct sk_buff *reply) -{ - struct ovs_zone_limit zone_limit; - int err; - - zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE; - zone_limit.limit = info->default_limit; - err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); - if (err) - return err; - - return 0; -} - -static int __ovs_ct_limit_get_zone_limit(struct net *net, - struct nf_conncount_data *data, - u16 zone_id, u32 limit, - struct sk_buff *reply) -{ - struct nf_conntrack_zone ct_zone; - struct ovs_zone_limit zone_limit; - u32 conncount_key = zone_id; - - zone_limit.zone_id = zone_id; - zone_limit.limit = limit; - nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); - - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, - &ct_zone); - return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); -} - -static int ovs_ct_limit_get_zone_limit(struct net *net, - struct nlattr *nla_zone_limit, - struct ovs_ct_limit_info *info, - struct sk_buff *reply) -{ - struct ovs_zone_limit *zone_limit; - int rem, err; - u32 limit; - u16 zone; - - rem = NLA_ALIGN(nla_len(nla_zone_limit)); - zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); - - while (rem >= sizeof(*zone_limit)) { - if (unlikely(zone_limit->zone_id == - OVS_ZONE_LIMIT_DEFAULT_ZONE)) { - err = ovs_ct_limit_get_default_limit(info, reply); - if (err) - return err; - } else if (unlikely(!check_zone_id(zone_limit->zone_id, - &zone))) { - OVS_NLERR(true, "zone id is out of range"); - } else { - rcu_read_lock(); - limit = ct_limit_get(info, zone); - rcu_read_unlock(); - - err = __ovs_ct_limit_get_zone_limit( - net, info->data, zone, limit, reply); - if (err) - return err; - } - rem -= NLA_ALIGN(sizeof(*zone_limit)); - zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + - NLA_ALIGN(sizeof(*zone_limit))); - } - - if (rem) - OVS_NLERR(true, "get zone limit has %d unknown bytes", rem); - - return 0; -} - -static int ovs_ct_limit_get_all_zone_limit(struct net *net, - struct ovs_ct_limit_info *info, - struct sk_buff *reply) -{ - struct ovs_ct_limit *ct_limit; - struct hlist_head *head; - int i, err = 0; - - err = ovs_ct_limit_get_default_limit(info, reply); - if (err) - return err; - - rcu_read_lock(); - for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { - head = &info->limits[i]; - hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { - err = __ovs_ct_limit_get_zone_limit(net, info->data, - ct_limit->zone, ct_limit->limit, reply); - if (err) - goto exit_err; - } - } - -exit_err: - rcu_read_unlock(); - return err; -} - -static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct sk_buff *reply; - struct ovs_header *ovs_reply_header; - struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); - struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; - int err; - - reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET, - &ovs_reply_header); - if (IS_ERR(reply)) - return PTR_ERR(reply); - - if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { - err = -EINVAL; - goto exit_err; - } - - err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], - ct_limit_info); - if (err) - goto exit_err; - - static_branch_enable(&ovs_ct_limit_enabled); - - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -exit_err: - nlmsg_free(reply); - return err; -} - -static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct sk_buff *reply; - struct ovs_header *ovs_reply_header; - struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); - struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; - int err; - - reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL, - &ovs_reply_header); - if (IS_ERR(reply)) - return PTR_ERR(reply); - - if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { - err = -EINVAL; - goto exit_err; - } - - err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], - ct_limit_info); - if (err) - goto exit_err; - - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -exit_err: - nlmsg_free(reply); - return err; -} - -static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct nlattr *nla_reply; - struct sk_buff *reply; - struct ovs_header *ovs_reply_header; - struct net *net = sock_net(skb->sk); - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; - int err; - - reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET, - &ovs_reply_header); - if (IS_ERR(reply)) - return PTR_ERR(reply); - - nla_reply = nla_nest_start_noflag(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); - if (!nla_reply) { - err = -EMSGSIZE; - goto exit_err; - } - - if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { - err = ovs_ct_limit_get_zone_limit( - net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info, - reply); - if (err) - goto exit_err; - } else { - err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info, - reply); - if (err) - goto exit_err; - } - - nla_nest_end(reply, nla_reply); - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -exit_err: - nlmsg_free(reply); - return err; -} - -static struct genl_ops ct_limit_genl_ops[] = { - { .cmd = OVS_CT_LIMIT_CMD_SET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = ct_limit_policy, -#endif - .doit = ovs_ct_limit_cmd_set, - }, - { .cmd = OVS_CT_LIMIT_CMD_DEL, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = ct_limit_policy, -#endif - .doit = ovs_ct_limit_cmd_del, - }, - { .cmd = OVS_CT_LIMIT_CMD_GET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = 0, /* OK for unprivileged users. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = ct_limit_policy, -#endif - .doit = ovs_ct_limit_cmd_get, - }, -}; - -static const struct genl_multicast_group ovs_ct_limit_multicast_group = { - .name = OVS_CT_LIMIT_MCGROUP, -}; - -struct genl_family dp_ct_limit_genl_family __ro_after_init = { - .hdrsize = sizeof(struct ovs_header), - .name = OVS_CT_LIMIT_FAMILY, - .version = OVS_CT_LIMIT_VERSION, - .maxattr = OVS_CT_LIMIT_ATTR_MAX, -#ifndef HAVE_GENL_OPS_POLICY - .policy = ct_limit_policy, -#endif - .netnsok = true, - .parallel_ops = true, - .ops = ct_limit_genl_ops, - .n_ops = ARRAY_SIZE(ct_limit_genl_ops), - .mcgrps = &ovs_ct_limit_multicast_group, - .n_mcgrps = 1, - .module = THIS_MODULE, -}; -#endif - -int ovs_ct_init(struct net *net) -{ - unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - if (nf_connlabels_get(net, n_bits - 1)) { - ovs_net->xt_label = false; - OVS_NLERR(true, "Failed to set connlabel length"); - } else { - ovs_net->xt_label = true; - } - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) - return ovs_ct_limit_init(net, ovs_net); -#else - return 0; -#endif -} - -void ovs_ct_exit(struct net *net) -{ - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) - ovs_ct_limit_exit(net, ovs_net); -#endif - - if (ovs_net->xt_label) - nf_connlabels_put(net); -} - -#endif /* CONFIG_NF_CONNTRACK */ diff --git a/datapath/conntrack.h b/datapath/conntrack.h deleted file mode 100644 index 5b4b34c19..000000000 --- a/datapath/conntrack.h +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#ifndef OVS_CONNTRACK_H -#define OVS_CONNTRACK_H 1 - -#include <linux/version.h> -#include "flow.h" - -struct ovs_conntrack_info; -struct ovs_ct_limit_info; -enum ovs_key_attr; - -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -int ovs_ct_init(struct net *); -void ovs_ct_exit(struct net *); -bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); -int ovs_ct_copy_action(struct net *, const struct nlattr *, - const struct sw_flow_key *, struct sw_flow_actions **, - bool log); -int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); - -int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, - const struct ovs_conntrack_info *); -int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); - -void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); -int ovs_ct_put_key(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, struct sk_buff *skb); -void ovs_ct_free_action(const struct nlattr *a); - -#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ - OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ - OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \ - OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) -#else -#include <linux/errno.h> - -static inline int ovs_ct_init(struct net *net) { return 0; } - -static inline void ovs_ct_exit(struct net *net) { } - -static inline bool ovs_ct_verify(struct net *net, int attr) -{ - return false; -} - -static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, - const struct sw_flow_key *key, - struct sw_flow_actions **acts, bool log) -{ - return -ENOTSUPP; -} - -static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, - struct sk_buff *skb) -{ - return -ENOTSUPP; -} - -static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, - struct sw_flow_key *key, - const struct ovs_conntrack_info *info) -{ - kfree_skb(skb); - return -ENOTSUPP; -} - -static inline int ovs_ct_clear(struct sk_buff *skb, - struct sw_flow_key *key) -{ - return -ENOTSUPP; -} - -static inline void ovs_ct_fill_key(const struct sk_buff *skb, - struct sw_flow_key *key) -{ - key->ct_state = 0; - key->ct_zone = 0; - key->ct.mark = 0; - memset(&key->ct.labels, 0, sizeof(key->ct.labels)); - /* Clear 'ct_orig_proto' to mark the non-existence of original - * direction key fields. - */ - key->ct_orig_proto = 0; -} - -static inline int ovs_ct_put_key(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, - struct sk_buff *skb) -{ - return 0; -} - -static inline void ovs_ct_free_action(const struct nlattr *a) { } - -#define CT_SUPPORTED_MASK 0 -#endif - -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) -extern struct genl_family dp_ct_limit_genl_family; -#endif -#endif /* ovs_conntrack.h */ diff --git a/datapath/datapath.c b/datapath/datapath.c deleted file mode 100644 index b88d16107..000000000 --- a/datapath/datapath.c +++ /dev/null @@ -1,2707 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/if_arp.h> -#include <linux/if_vlan.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/jhash.h> -#include <linux/delay.h> -#include <linux/time.h> -#include <linux/etherdevice.h> -#include <linux/genetlink.h> -#include <linux/kernel.h> -#include <linux/kthread.h> -#include <linux/mutex.h> -#include <linux/percpu.h> -#include <linux/rcupdate.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/version.h> -#include <linux/ethtool.h> -#include <linux/wait.h> -#include <asm/div64.h> -#include <linux/highmem.h> -#include <linux/netfilter_bridge.h> -#include <linux/netfilter_ipv4.h> -#include <linux/inetdevice.h> -#include <linux/list.h> -#include <linux/openvswitch.h> -#include <linux/rculist.h> -#include <linux/dmi.h> -#include <net/genetlink.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/nsh.h> - -#include "datapath.h" -#include "conntrack.h" -#include "flow.h" -#include "flow_table.h" -#include "flow_netlink.h" -#include "meter.h" -#include "gso.h" -#include "vport-internal_dev.h" -#include "vport-netdev.h" - -unsigned int ovs_net_id __read_mostly; - -static struct genl_family dp_packet_genl_family; -static struct genl_family dp_flow_genl_family; -static struct genl_family dp_datapath_genl_family; - -static const struct nla_policy flow_policy[]; - -static const struct genl_multicast_group ovs_dp_flow_multicast_group = { - .name = OVS_FLOW_MCGROUP, -}; - -static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { - .name = OVS_DATAPATH_MCGROUP, -}; - -const struct genl_multicast_group ovs_dp_vport_multicast_group = { - .name = OVS_VPORT_MCGROUP, -}; - -/* Check if need to build a reply message. - * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. - */ -static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, - unsigned int group) -{ - return info->nlhdr->nlmsg_flags & NLM_F_ECHO || - genl_has_listeners(family, genl_info_net(info), group); -} - -static void ovs_notify(struct genl_family *family, - const struct genl_multicast_group *grp, - struct sk_buff *skb, struct genl_info *info) -{ - genl_notify(family, skb, info, GROUP_ID(grp), GFP_KERNEL); -} - -/** - * DOC: Locking: - * - * All writes e.g. Writes to device state (add/remove datapath, port, set - * operations on vports, etc.), Writes to other state (flow table - * modifications, set miscellaneous datapath parameters, etc.) are protected - * by ovs_lock. - * - * Reads are protected by RCU. - * - * There are a few special cases (mostly stats) that have their own - * synchronization but they nest under all of above and don't interact with - * each other. - * - * The RTNL lock nests inside ovs_mutex. - */ - -static DEFINE_MUTEX(ovs_mutex); - -void ovs_lock(void) -{ - mutex_lock(&ovs_mutex); -} - -void ovs_unlock(void) -{ - mutex_unlock(&ovs_mutex); -} - -#ifdef CONFIG_LOCKDEP -int lockdep_ovsl_is_held(void) -{ - if (debug_locks) - return lockdep_is_held(&ovs_mutex); - else - return 1; -} -#endif - -static int queue_gso_packets(struct datapath *dp, struct sk_buff *, - const struct sw_flow_key *, - const struct dp_upcall_info *, - uint32_t cutlen); -static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, - const struct sw_flow_key *, - const struct dp_upcall_info *, - uint32_t cutlen); - -/* Must be called with rcu_read_lock or ovs_mutex. */ -const char *ovs_dp_name(const struct datapath *dp) -{ - struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return ovs_vport_name(vport); -} - -static int get_dpifindex(const struct datapath *dp) -{ - struct vport *local; - int ifindex; - - rcu_read_lock(); - - local = ovs_vport_rcu(dp, OVSP_LOCAL); - if (local) - ifindex = local->dev->ifindex; - else - ifindex = 0; - - rcu_read_unlock(); - - return ifindex; -} - -static void destroy_dp_rcu(struct rcu_head *rcu) -{ - struct datapath *dp = container_of(rcu, struct datapath, rcu); - - ovs_flow_tbl_destroy(&dp->table); - free_percpu(dp->stats_percpu); - kfree(dp->ports); - ovs_meters_exit(dp); - kfree(dp); -} - -static struct hlist_head *vport_hash_bucket(const struct datapath *dp, - u16 port_no) -{ - return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; -} - -/* Called with ovs_mutex or RCU read lock. */ -struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) -{ - struct vport *vport; - struct hlist_head *head; - - head = vport_hash_bucket(dp, port_no); - hlist_for_each_entry_rcu(vport, head, dp_hash_node) { - if (vport->port_no == port_no) - return vport; - } - return NULL; -} - -/* Called with ovs_mutex. */ -static struct vport *new_vport(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = ovs_vport_add(parms); - if (!IS_ERR(vport)) { - struct datapath *dp = parms->dp; - struct hlist_head *head = vport_hash_bucket(dp, vport->port_no); - - hlist_add_head_rcu(&vport->dp_hash_node, head); - } - return vport; -} - -void ovs_dp_detach_port(struct vport *p) -{ - ASSERT_OVSL(); - - /* First drop references to device. */ - hlist_del_rcu(&p->dp_hash_node); - - /* Then destroy it. */ - ovs_vport_del(p); -} - -/* Must be called with rcu_read_lock. */ -void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) -{ - const struct vport *p = OVS_CB(skb)->input_vport; - struct datapath *dp = p->dp; - struct sw_flow *flow; - struct sw_flow_actions *sf_acts; - struct dp_stats_percpu *stats; - u64 *stats_counter; - u32 n_mask_hit; - int error; - - stats = this_cpu_ptr(dp->stats_percpu); - - /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), - &n_mask_hit); - if (unlikely(!flow)) { - struct dp_upcall_info upcall; - - memset(&upcall, 0, sizeof(upcall)); - upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); - upcall.mru = OVS_CB(skb)->mru; - error = ovs_dp_upcall(dp, skb, key, &upcall, 0); - if (unlikely(error)) - kfree_skb(skb); - else - consume_skb(skb); - stats_counter = &stats->n_missed; - goto out; - } - - ovs_flow_stats_update(flow, key->tp.flags, skb); - sf_acts = rcu_dereference(flow->sf_acts); - error = ovs_execute_actions(dp, skb, sf_acts, key); - if (unlikely(error)) - net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", - ovs_dp_name(dp), error); - - stats_counter = &stats->n_hit; - -out: - /* Update datapath statistics. */ - u64_stats_update_begin(&stats->syncp); - (*stats_counter)++; - stats->n_mask_hit += n_mask_hit; - u64_stats_update_end(&stats->syncp); -} - -int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, - const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info, - uint32_t cutlen) -{ - struct dp_stats_percpu *stats; - int err; - - if (upcall_info->portid == 0) { - err = -ENOTCONN; - goto err; - } - - if (!skb_is_gso(skb)) - err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); - else - err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); - if (err) - goto err; - - return 0; - -err: - stats = this_cpu_ptr(dp->stats_percpu); - - u64_stats_update_begin(&stats->syncp); - stats->n_lost++; - u64_stats_update_end(&stats->syncp); - - return err; -} - -static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, - const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info, - uint32_t cutlen) -{ -#ifdef HAVE_SKB_GSO_UDP - unsigned int gso_type = skb_shinfo(skb)->gso_type; - struct sw_flow_key later_key; -#endif - struct sk_buff *segs, *nskb; - struct ovs_skb_cb ovs_cb; - int err; - - ovs_cb = *OVS_CB(skb); - segs = __skb_gso_segment(skb, NETIF_F_SG, false); - *OVS_CB(skb) = ovs_cb; - if (IS_ERR(segs)) - return PTR_ERR(segs); - if (segs == NULL) - return -EINVAL; -#ifdef HAVE_SKB_GSO_UDP - if (gso_type & SKB_GSO_UDP) { - /* The initial flow key extracted by ovs_flow_key_extract() - * in this case is for a first fragment, so we need to - * properly mark later fragments. - */ - later_key = *key; - later_key.ip.frag = OVS_FRAG_TYPE_LATER; - } -#endif - /* Queue all of the segments. */ - skb_list_walk_safe(segs, skb, nskb) { - *OVS_CB(skb) = ovs_cb; -#ifdef HAVE_SKB_GSO_UDP - if (gso_type & SKB_GSO_UDP && skb != segs) - key = &later_key; -#endif - err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); - if (err) - break; - - } - - /* Free all of the segments. */ - skb_list_walk_safe(segs, skb, nskb) { - if (err) - kfree_skb(skb); - else - consume_skb(skb); - } - return err; -} - -static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, - unsigned int hdrlen, int actions_attrlen) -{ - size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ - + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ - + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ - - /* OVS_PACKET_ATTR_USERDATA */ - if (upcall_info->userdata) - size += NLA_ALIGN(upcall_info->userdata->nla_len); - - /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ - if (upcall_info->egress_tun_info) - size += nla_total_size(ovs_tun_key_attr_size()); - - /* OVS_PACKET_ATTR_ACTIONS */ - if (upcall_info->actions_len) - size += nla_total_size(actions_attrlen); - - /* OVS_PACKET_ATTR_MRU */ - if (upcall_info->mru) - size += nla_total_size(sizeof(upcall_info->mru)); - - return size; -} - -static void pad_packet(struct datapath *dp, struct sk_buff *skb) -{ - if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { - size_t plen = NLA_ALIGN(skb->len) - skb->len; - - if (plen > 0) - skb_put_zero(skb, plen); - } -} - -static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, - const struct sw_flow_key *key, - const struct dp_upcall_info *upcall_info, - uint32_t cutlen) -{ - struct ovs_header *upcall; - struct sk_buff *nskb = NULL; - struct sk_buff *user_skb = NULL; /* to be queued to userspace */ - struct nlattr *nla; - size_t len; - unsigned int hlen; - int err, dp_ifindex; - u64 hash; - - dp_ifindex = get_dpifindex(dp); - if (!dp_ifindex) - return -ENODEV; - - if (skb_vlan_tag_present(skb)) { - nskb = skb_clone(skb, GFP_ATOMIC); - if (!nskb) - return -ENOMEM; - - nskb = __vlan_hwaccel_push_inside(nskb); - if (!nskb) - return -ENOMEM; - - skb = nskb; - } - - if (nla_attr_size(skb->len) > USHRT_MAX) { - err = -EFBIG; - goto out; - } - - /* Complete checksum if needed */ - if (skb->ip_summed == CHECKSUM_PARTIAL && - (err = skb_csum_hwoffload_help(skb, 0))) - goto out; - - /* Older versions of OVS user space enforce alignment of the last - * Netlink attribute to NLA_ALIGNTO which would require extensive - * padding logic. Only perform zerocopy if padding is not required. - */ - if (dp->user_features & OVS_DP_F_UNALIGNED) - hlen = skb_zerocopy_headlen(skb); - else - hlen = skb->len; - - len = upcall_msg_size(upcall_info, hlen - cutlen, - OVS_CB(skb)->acts_origlen); - user_skb = genlmsg_new(len, GFP_ATOMIC); - if (!user_skb) { - err = -ENOMEM; - goto out; - } - - upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, - 0, upcall_info->cmd); - if (!upcall) { - err = -EINVAL; - goto out; - } - upcall->dp_ifindex = dp_ifindex; - - err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); - if (err) - goto out; - - if (upcall_info->userdata) - __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, - nla_len(upcall_info->userdata), - nla_data(upcall_info->userdata)); - - - if (upcall_info->egress_tun_info) { - nla = nla_nest_start_noflag(user_skb, - OVS_PACKET_ATTR_EGRESS_TUN_KEY); - if (!nla) { - err = -EMSGSIZE; - goto out; - } - err = ovs_nla_put_tunnel_info(user_skb, - upcall_info->egress_tun_info); - if (err) - goto out; - - nla_nest_end(user_skb, nla); - } - - if (upcall_info->actions_len) { - nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); - if (!nla) { - err = -EMSGSIZE; - goto out; - } - err = ovs_nla_put_actions(upcall_info->actions, - upcall_info->actions_len, - user_skb); - if (!err) - nla_nest_end(user_skb, nla); - else - nla_nest_cancel(user_skb, nla); - } - - /* Add OVS_PACKET_ATTR_MRU */ - if (upcall_info->mru && - nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { - err = -ENOBUFS; - goto out; - } - - /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ - if (cutlen > 0 && - nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { - err = -ENOBUFS; - goto out; - } - - /* Add OVS_PACKET_ATTR_HASH */ - hash = skb_get_hash_raw(skb); -#ifdef HAVE_SW_HASH - if (skb->sw_hash) - hash |= OVS_PACKET_HASH_SW_BIT; -#endif - - if (skb->l4_hash) - hash |= OVS_PACKET_HASH_L4_BIT; - - if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { - err = -ENOBUFS; - goto out; - } - - /* Only reserve room for attribute header, packet data is added - * in skb_zerocopy() - */ - if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { - err = -ENOBUFS; - goto out; - } - nla->nla_len = nla_attr_size(skb->len - cutlen); - - err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); - if (err) - goto out; - - /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ - pad_packet(dp, user_skb); - - ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; - - err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); - user_skb = NULL; -out: - if (err) - skb_tx_error(skb); - kfree_skb(user_skb); - kfree_skb(nskb); - return err; -} - -static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) -{ - struct ovs_header *ovs_header = info->userhdr; - struct net *net = sock_net(skb->sk); - struct nlattr **a = info->attrs; - struct sw_flow_actions *acts; - struct sk_buff *packet; - struct sw_flow *flow; - struct sw_flow_actions *sf_acts; - struct datapath *dp; - struct vport *input_vport; - u16 mru = 0; - u64 hash; - int len; - int err; - bool log = !a[OVS_PACKET_ATTR_PROBE]; - - err = -EINVAL; - if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || - !a[OVS_PACKET_ATTR_ACTIONS]) - goto err; - - len = nla_len(a[OVS_PACKET_ATTR_PACKET]); - packet = __dev_alloc_skb(NET_IP_ALIGN + len, GFP_KERNEL); - err = -ENOMEM; - if (!packet) - goto err; - skb_reserve(packet, NET_IP_ALIGN); - - nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); - - /* Set packet's mru */ - if (a[OVS_PACKET_ATTR_MRU]) { - mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); - packet->ignore_df = 1; - } - OVS_CB(packet)->mru = mru; - - if (a[OVS_PACKET_ATTR_HASH]) { - hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); - - __skb_set_hash(packet, hash & 0xFFFFFFFFULL, - !!(hash & OVS_PACKET_HASH_SW_BIT), - !!(hash & OVS_PACKET_HASH_L4_BIT)); - } - - /* Build an sw_flow for sending this packet. */ - flow = ovs_flow_alloc(); - err = PTR_ERR(flow); - if (IS_ERR(flow)) - goto err_kfree_skb; - - err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], - packet, &flow->key, log); - if (err) - goto err_flow_free; - - err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], - &flow->key, &acts, log); - if (err) - goto err_flow_free; - - rcu_assign_pointer(flow->sf_acts, acts); - packet->priority = flow->key.phy.priority; - packet->mark = flow->key.phy.skb_mark; - - rcu_read_lock(); - dp = get_dp_rcu(net, ovs_header->dp_ifindex); - err = -ENODEV; - if (!dp) - goto err_unlock; - - input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); - if (!input_vport) - input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); - - if (!input_vport) - goto err_unlock; - - packet->dev = input_vport->dev; - OVS_CB(packet)->input_vport = input_vport; - sf_acts = rcu_dereference(flow->sf_acts); - - local_bh_disable(); - err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); - local_bh_enable(); - rcu_read_unlock(); - - ovs_flow_free(flow, false); - return err; - -err_unlock: - rcu_read_unlock(); -err_flow_free: - ovs_flow_free(flow, false); -err_kfree_skb: - kfree_skb(packet); -err: - return err; -} - -static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { - [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, - [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, - [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, - [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, - [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, - [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, -}; - -static struct genl_ops dp_packet_genl_ops[] = { - { .cmd = OVS_PACKET_CMD_EXECUTE, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = packet_policy, -#endif - .doit = ovs_packet_cmd_execute - } -}; - -static struct genl_family dp_packet_genl_family __ro_after_init = { - .hdrsize = sizeof(struct ovs_header), - .name = OVS_PACKET_FAMILY, - .version = OVS_PACKET_VERSION, - .maxattr = OVS_PACKET_ATTR_MAX, -#ifndef HAVE_GENL_OPS_POLICY - .policy = packet_policy, -#endif - .netnsok = true, - .parallel_ops = true, - .ops = dp_packet_genl_ops, - .n_ops = ARRAY_SIZE(dp_packet_genl_ops), - .module = THIS_MODULE, -}; - -static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, - struct ovs_dp_megaflow_stats *mega_stats) -{ - int i; - - memset(mega_stats, 0, sizeof(*mega_stats)); - - stats->n_flows = ovs_flow_tbl_count(&dp->table); - mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); - - stats->n_hit = stats->n_missed = stats->n_lost = 0; - - for_each_possible_cpu(i) { - const struct dp_stats_percpu *percpu_stats; - struct dp_stats_percpu local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(dp->stats_percpu, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->n_hit += local_stats.n_hit; - stats->n_missed += local_stats.n_missed; - stats->n_lost += local_stats.n_lost; - mega_stats->n_mask_hit += local_stats.n_mask_hit; - } -} - -static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) -{ - return ovs_identifier_is_ufid(sfid) && - !(ufid_flags & OVS_UFID_F_OMIT_KEY); -} - -static bool should_fill_mask(uint32_t ufid_flags) -{ - return !(ufid_flags & OVS_UFID_F_OMIT_MASK); -} - -static bool should_fill_actions(uint32_t ufid_flags) -{ - return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); -} - -static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, - const struct sw_flow_id *sfid, - uint32_t ufid_flags) -{ - size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); - - /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback - * see ovs_nla_put_identifier() - */ - if (sfid && ovs_identifier_is_ufid(sfid)) - len += nla_total_size(sfid->ufid_len); - else - len += nla_total_size(ovs_key_attr_size()); - - /* OVS_FLOW_ATTR_KEY */ - if (!sfid || should_fill_key(sfid, ufid_flags)) - len += nla_total_size(ovs_key_attr_size()); - - /* OVS_FLOW_ATTR_MASK */ - if (should_fill_mask(ufid_flags)) - len += nla_total_size(ovs_key_attr_size()); - - /* OVS_FLOW_ATTR_ACTIONS */ - if (should_fill_actions(ufid_flags)) - len += nla_total_size(acts->orig_len); - - return len - + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ - + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ - + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ -} - -/* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, - struct sk_buff *skb) -{ - struct ovs_flow_stats stats; - __be16 tcp_flags; - unsigned long used; - - ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); - - if (used && - nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used), - OVS_FLOW_ATTR_PAD)) - return -EMSGSIZE; - - if (stats.n_packets && - nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, - sizeof(struct ovs_flow_stats), &stats, - OVS_FLOW_ATTR_PAD)) - return -EMSGSIZE; - - if ((u8)ntohs(tcp_flags) && - nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) - return -EMSGSIZE; - - return 0; -} - -/* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, - struct sk_buff *skb, int skb_orig_len) -{ - struct nlattr *start; - int err; - - /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if - * this is the first flow to be dumped into 'skb'. This is unusual for - * Netlink but individual action lists can be longer than - * NLMSG_GOODSIZE and thus entirely undumpable if we didn't do this. - * The userspace caller can always fetch the actions separately if it - * really wants them. (Most userspace callers in fact don't care.) - * - * This can only fail for dump operations because the skb is always - * properly sized for single flows. - */ - start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); - if (start) { - const struct sw_flow_actions *sf_acts; - - sf_acts = rcu_dereference_ovsl(flow->sf_acts); - err = ovs_nla_put_actions(sf_acts->actions, - sf_acts->actions_len, skb); - - if (!err) - nla_nest_end(skb, start); - else { - if (skb_orig_len) - return err; - - nla_nest_cancel(skb, start); - } - } else if (skb_orig_len) { - return -EMSGSIZE; - } - - return 0; -} - -/* Called with ovs_mutex or RCU read lock. */ -static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, - struct sk_buff *skb, u32 portid, - u32 seq, u32 flags, u8 cmd, u32 ufid_flags) -{ - const int skb_orig_len = skb->len; - struct ovs_header *ovs_header; - int err; - - ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, - flags, cmd); - if (!ovs_header) - return -EMSGSIZE; - - ovs_header->dp_ifindex = dp_ifindex; - - err = ovs_nla_put_identifier(flow, skb); - if (err) - goto error; - - if (should_fill_key(&flow->id, ufid_flags)) { - err = ovs_nla_put_masked_key(flow, skb); - if (err) - goto error; - } - - if (should_fill_mask(ufid_flags)) { - err = ovs_nla_put_mask(flow, skb); - if (err) - goto error; - } - - err = ovs_flow_cmd_fill_stats(flow, skb); - if (err) - goto error; - - if (should_fill_actions(ufid_flags)) { - err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); - if (err) - goto error; - } - - genlmsg_end(skb, ovs_header); - return 0; - -error: - genlmsg_cancel(skb, ovs_header); - return err; -} - -/* May not be called with RCU read lock. */ -static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, - const struct sw_flow_id *sfid, - struct genl_info *info, - bool always, - uint32_t ufid_flags) -{ - struct sk_buff *skb; - size_t len; - - if (!always && !ovs_must_notify(&dp_flow_genl_family, info, - GROUP_ID(&ovs_dp_flow_multicast_group))) - return NULL; - - len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); - skb = genlmsg_new(len, GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - return skb; -} - -/* Called with ovs_mutex. */ -static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, - int dp_ifindex, - struct genl_info *info, u8 cmd, - bool always, u32 ufid_flags) -{ - struct sk_buff *skb; - int retval; - - skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), - &flow->id, info, always, ufid_flags); - if (IS_ERR_OR_NULL(skb)) - return skb; - - retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, - info->snd_portid, info->snd_seq, 0, - cmd, ufid_flags); - if (WARN_ON_ONCE(retval < 0)) { - kfree_skb(skb); - skb = ERR_PTR(retval); - } - return skb; -} - -static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct sw_flow *flow = NULL, *new_flow; - struct sw_flow_mask mask; - struct sk_buff *reply; - struct datapath *dp; - struct sw_flow_actions *acts; - struct sw_flow_match match; - u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int error; - bool log = !a[OVS_FLOW_ATTR_PROBE]; - - /* Must have key and actions. */ - error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR(log, "Flow key attr not present in new flow."); - goto error; - } - if (!a[OVS_FLOW_ATTR_ACTIONS]) { - OVS_NLERR(log, "Flow actions attr not present in new flow."); - goto error; - } - - /* Most of the time we need to allocate a new flow, do it before - * locking. - */ - new_flow = ovs_flow_alloc(); - if (IS_ERR(new_flow)) { - error = PTR_ERR(new_flow); - goto error; - } - - /* Extract key. */ - ovs_match_init(&match, &new_flow->key, false, &mask); - error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], - a[OVS_FLOW_ATTR_MASK], log); - if (error) - goto err_kfree_flow; - - /* Extract flow identifier. */ - error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &new_flow->key, log); - if (error) - goto err_kfree_flow; - - /* unmasked key is needed to match when ufid is not used. */ - if (ovs_identifier_is_key(&new_flow->id)) - match.key = new_flow->id.unmasked_key; - - ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); - - /* Validate actions. */ - error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], - &new_flow->key, &acts, log); - if (error) { - OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; - } - - reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, - ufid_flags); - if (IS_ERR(reply)) { - error = PTR_ERR(reply); - goto err_kfree_acts; - } - - ovs_lock(); - dp = get_dp(net, ovs_header->dp_ifindex); - if (unlikely(!dp)) { - error = -ENODEV; - goto err_unlock_ovs; - } - - /* Check if this is a duplicate flow */ - if (ovs_identifier_is_ufid(&new_flow->id)) - flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); - if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); - if (likely(!flow)) { - rcu_assign_pointer(new_flow->sf_acts, acts); - - /* Put flow in bucket. */ - error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); - if (unlikely(error)) { - acts = NULL; - goto err_unlock_ovs; - } - - if (unlikely(reply)) { - error = ovs_flow_cmd_fill_info(new_flow, - ovs_header->dp_ifindex, - reply, info->snd_portid, - info->snd_seq, 0, - OVS_FLOW_CMD_NEW, - ufid_flags); - BUG_ON(error < 0); - } - ovs_unlock(); - } else { - struct sw_flow_actions *old_acts; - - /* Bail out if we're not allowed to modify an existing flow. - * We accept NLM_F_CREATE in place of the intended NLM_F_EXCL - * because Generic Netlink treats the latter as a dump - * request. We also accept NLM_F_EXCL in case that bug ever - * gets fixed. - */ - if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE - | NLM_F_EXCL))) { - error = -EEXIST; - goto err_unlock_ovs; - } - /* The flow identifier has to be the same for flow updates. - * Look for any overlapping flow. - */ - if (unlikely(!ovs_flow_cmp(flow, &match))) { - if (ovs_identifier_is_key(&flow->id)) - flow = ovs_flow_tbl_lookup_exact(&dp->table, - &match); - else /* UFID matches but key is different */ - flow = NULL; - if (!flow) { - error = -ENOENT; - goto err_unlock_ovs; - } - } - /* Update actions. */ - old_acts = ovsl_dereference(flow->sf_acts); - rcu_assign_pointer(flow->sf_acts, acts); - - if (unlikely(reply)) { - error = ovs_flow_cmd_fill_info(flow, - ovs_header->dp_ifindex, - reply, info->snd_portid, - info->snd_seq, 0, - OVS_FLOW_CMD_NEW, - ufid_flags); - BUG_ON(error < 0); - } - ovs_unlock(); - - ovs_nla_free_flow_actions_rcu(old_acts); - ovs_flow_free(new_flow, false); - } - - if (reply) - ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info); - return 0; - -err_unlock_ovs: - ovs_unlock(); - kfree_skb(reply); -err_kfree_acts: - ovs_nla_free_flow_actions(acts); -err_kfree_flow: - ovs_flow_free(new_flow, false); -error: - return error; -} - -/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ -static noinline_for_stack struct sw_flow_actions *get_flow_actions(struct net *net, - const struct nlattr *a, - const struct sw_flow_key *key, - const struct sw_flow_mask *mask, - bool log) -{ - struct sw_flow_actions *acts; - struct sw_flow_key masked_key; - int error; - - ovs_flow_mask_key(&masked_key, key, true, mask); - error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); - if (error) { - OVS_NLERR(log, - "Actions may not be safe on all matching packets"); - return ERR_PTR(error); - } - - return acts; -} - -/* Factor out match-init and action-copy to avoid - * "Wframe-larger-than=1024" warning. Because mask is only - * used to get actions, we new a function to save some - * stack space. - * - * If there are not key and action attrs, we return 0 - * directly. In the case, the caller will also not use the - * match as before. If there is action attr, we try to get - * actions and save them to *acts. Before returning from - * the function, we reset the match->mask pointer. Because - * we should not to return match object with dangling reference - * to mask. - * */ -static noinline_for_stack int -ovs_nla_init_match_and_action(struct net *net, - struct sw_flow_match *match, - struct sw_flow_key *key, - struct nlattr **a, - struct sw_flow_actions **acts, - bool log) -{ - struct sw_flow_mask mask; - int error = 0; - - if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(match, key, true, &mask); - error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY], - a[OVS_FLOW_ATTR_MASK], log); - if (error) - goto error; - } - - if (a[OVS_FLOW_ATTR_ACTIONS]) { - if (!a[OVS_FLOW_ATTR_KEY]) { - OVS_NLERR(log, - "Flow key attribute not present in set flow."); - error = -EINVAL; - goto error; - } - - *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, - &mask, log); - if (IS_ERR(*acts)) { - error = PTR_ERR(*acts); - goto error; - } - } - - /* On success, error is 0. */ -error: - match->mask = NULL; - return error; -} - -static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) -{ - struct net *net = sock_net(skb->sk); - struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key; - struct sw_flow *flow; - struct sk_buff *reply = NULL; - struct datapath *dp; - struct sw_flow_actions *old_acts = NULL, *acts = NULL; - struct sw_flow_match match; - struct sw_flow_id sfid; - u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int error = 0; - bool log = !a[OVS_FLOW_ATTR_PROBE]; - bool ufid_present; - - ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); - if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) { - OVS_NLERR(log, - "Flow set message rejected, Key attribute missing."); - return -EINVAL; - } - - error = ovs_nla_init_match_and_action(net, &match, &key, a, - &acts, log); - if (error) - goto error; - - if (acts) { - /* Can allocate before locking if have acts. */ - reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, - ufid_flags); - if (IS_ERR(reply)) { - error = PTR_ERR(reply); - goto err_kfree_acts; - } - } - - ovs_lock(); - dp = get_dp(net, ovs_header->dp_ifindex); - if (unlikely(!dp)) { - error = -ENODEV; - goto err_unlock_ovs; - } - /* Check that the flow exists. */ - if (ufid_present) - flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); - else - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); - if (unlikely(!flow)) { - error = -ENOENT; - goto err_unlock_ovs; - } - - /* Update actions, if present. */ - if (likely(acts)) { - old_acts = ovsl_dereference(flow->sf_acts); - rcu_assign_pointer(flow->sf_acts, acts); - - if (unlikely(reply)) { - error = ovs_flow_cmd_fill_info(flow, - ovs_header->dp_ifindex, - reply, info->snd_portid, - info->snd_seq, 0, - OVS_FLOW_CMD_SET, - ufid_flags); - BUG_ON(error < 0); - } - } else { - /* Could not alloc without acts before locking. */ - reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, - info, OVS_FLOW_CMD_SET, false, - ufid_flags); - - if (unlikely(IS_ERR(reply))) { - error = PTR_ERR(reply); - goto err_unlock_ovs; - } - } - - /* Clear stats. */ - if (a[OVS_FLOW_ATTR_CLEAR]) - ovs_flow_stats_clear(flow); - ovs_unlock(); - - if (reply) - ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info); - if (old_acts) - ovs_nla_free_flow_actions_rcu(old_acts); - - return 0; - -err_unlock_ovs: - ovs_unlock(); - kfree_skb(reply); -err_kfree_acts: - ovs_nla_free_flow_actions(acts); -error: - return error; -} - -static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct net *net = sock_net(skb->sk); - struct sw_flow_key key; - struct sk_buff *reply; - struct sw_flow *flow; - struct datapath *dp; - struct sw_flow_match match; - struct sw_flow_id ufid; - u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int err = 0; - bool log = !a[OVS_FLOW_ATTR_PROBE]; - bool ufid_present; - - ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); - if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, true, NULL); - err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, - log); - } else if (!ufid_present) { - OVS_NLERR(log, - "Flow get message rejected, Key attribute missing."); - err = -EINVAL; - } - if (err) - return err; - - ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { - err = -ENODEV; - goto unlock; - } - - if (ufid_present) - flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); - else - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); - if (!flow) { - err = -ENOENT; - goto unlock; - } - - reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, - OVS_FLOW_CMD_GET, true, ufid_flags); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - goto unlock; - } - - ovs_unlock(); - return genlmsg_reply(reply, info); -unlock: - ovs_unlock(); - return err; -} - -static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct net *net = sock_net(skb->sk); - struct sw_flow_key key; - struct sk_buff *reply; - struct sw_flow *flow = NULL; - struct datapath *dp; - struct sw_flow_match match; - struct sw_flow_id ufid; - u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - int err; - bool log = !a[OVS_FLOW_ATTR_PROBE]; - bool ufid_present; - - ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); - if (a[OVS_FLOW_ATTR_KEY]) { - ovs_match_init(&match, &key, true, NULL); - err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], - NULL, log); - if (unlikely(err)) - return err; - } - - ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (unlikely(!dp)) { - err = -ENODEV; - goto unlock; - } - - if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { - err = ovs_flow_tbl_flush(&dp->table); - goto unlock; - } - - if (ufid_present) - flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); - else - flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); - if (unlikely(!flow)) { - err = -ENOENT; - goto unlock; - } - - ovs_flow_tbl_remove(&dp->table, flow); - ovs_unlock(); - - reply = ovs_flow_cmd_alloc_info(rcu_dereference_raw(flow->sf_acts), - &flow->id, info, false, ufid_flags); - - if (likely(reply)) { - if (!IS_ERR(reply)) { - rcu_read_lock(); /*To keep RCU checker happy. */ - err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, - reply, info->snd_portid, - info->snd_seq, 0, - OVS_FLOW_CMD_DEL, - ufid_flags); - rcu_read_unlock(); - if (WARN_ON_ONCE(err < 0)) { - kfree_skb(reply); - goto out_free; - } - ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info); - } else { - genl_set_err(&dp_flow_genl_family, sock_net(skb->sk), 0, - GROUP_ID(&ovs_dp_flow_multicast_group), PTR_ERR(reply)); - - } - } - -out_free: - ovs_flow_free(flow, true); - return 0; -unlock: - ovs_unlock(); - return err; -} - -static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct nlattr *a[__OVS_FLOW_ATTR_MAX]; - struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); - struct table_instance *ti; - struct datapath *dp; - u32 ufid_flags; - int err; - - err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, - OVS_FLOW_ATTR_MAX, flow_policy, NULL); - if (err) - return err; - ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); - - rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { - rcu_read_unlock(); - return -ENODEV; - } - - ti = rcu_dereference(dp->table.ti); - for (;;) { - struct sw_flow *flow; - u32 bucket, obj; - - bucket = cb->args[0]; - obj = cb->args[1]; - flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); - if (!flow) - break; - - if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_FLOW_CMD_GET, ufid_flags) < 0) - break; - - cb->args[0] = bucket; - cb->args[1] = obj; - } - rcu_read_unlock(); - return skb->len; -} - -static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { - [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, - [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, - [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, - [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, -}; - -static const struct genl_ops dp_flow_genl_ops[] = { - { .cmd = OVS_FLOW_CMD_NEW, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = flow_policy, -#endif - .doit = ovs_flow_cmd_new - }, - { .cmd = OVS_FLOW_CMD_DEL, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = flow_policy, -#endif - .doit = ovs_flow_cmd_del - }, - { .cmd = OVS_FLOW_CMD_GET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = 0, /* OK for unprivileged users. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = flow_policy, -#endif - .doit = ovs_flow_cmd_get, - .dumpit = ovs_flow_cmd_dump - }, - { .cmd = OVS_FLOW_CMD_SET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = flow_policy, -#endif - .doit = ovs_flow_cmd_set, - }, -}; - -static struct genl_family dp_flow_genl_family __ro_after_init = { - .hdrsize = sizeof(struct ovs_header), - .name = OVS_FLOW_FAMILY, - .version = OVS_FLOW_VERSION, - .maxattr = OVS_FLOW_ATTR_MAX, -#ifndef HAVE_GENL_OPS_POLICY - .policy = flow_policy, -#endif - .netnsok = true, - .parallel_ops = true, - .ops = dp_flow_genl_ops, - .n_ops = ARRAY_SIZE(dp_flow_genl_ops), - .mcgrps = &ovs_dp_flow_multicast_group, - .n_mcgrps = 1, - .module = THIS_MODULE, -}; - -static size_t ovs_dp_cmd_msg_size(void) -{ - size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); - - msgsize += nla_total_size(IFNAMSIZ); - msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); - msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); - msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ - - return msgsize; -} - -/* Called with ovs_mutex. */ -static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, - u32 portid, u32 seq, u32 flags, u8 cmd) -{ - struct ovs_header *ovs_header; - struct ovs_dp_stats dp_stats; - struct ovs_dp_megaflow_stats dp_megaflow_stats; - int err; - - ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, - flags, cmd); - if (!ovs_header) - goto error; - - ovs_header->dp_ifindex = get_dpifindex(dp); - - err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); - if (err) - goto nla_put_failure; - - get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); - if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), - &dp_stats, OVS_DP_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, - sizeof(struct ovs_dp_megaflow_stats), - &dp_megaflow_stats, OVS_DP_ATTR_PAD)) - goto nla_put_failure; - - if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) - goto nla_put_failure; - - genlmsg_end(skb, ovs_header); - return 0; - -nla_put_failure: - genlmsg_cancel(skb, ovs_header); -error: - return -EMSGSIZE; -} - -static struct sk_buff *ovs_dp_cmd_alloc_info(void) -{ - return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); -} - -/* Called with rcu_read_lock or ovs_mutex. */ -static struct datapath *lookup_datapath(struct net *net, - const struct ovs_header *ovs_header, - struct nlattr *a[OVS_DP_ATTR_MAX + 1]) -{ - struct datapath *dp; - - if (!a[OVS_DP_ATTR_NAME]) - dp = get_dp(net, ovs_header->dp_ifindex); - else { - struct vport *vport; - - vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); - dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; - } - return dp ? dp : ERR_PTR(-ENODEV); -} - -static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) -{ - struct datapath *dp; - - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); - if (IS_ERR(dp)) - return; - - WARN(dp->user_features, "Dropping previously announced user features\n"); - dp->user_features = 0; -} - -DEFINE_STATIC_KEY_FALSE(tc_recirc_sharing_support); - -static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) -{ - u32 user_features = 0; - - if (a[OVS_DP_ATTR_USER_FEATURES]) { - user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); - - if (user_features & ~(OVS_DP_F_VPORT_PIDS | - OVS_DP_F_UNALIGNED | - OVS_DP_F_TC_RECIRC_SHARING)) - return -EOPNOTSUPP; - -#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - if (user_features & OVS_DP_F_TC_RECIRC_SHARING) - return -EOPNOTSUPP; -#endif - } - - dp->user_features = user_features; - - if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) - static_branch_enable(&tc_recirc_sharing_support); - else - static_branch_disable(&tc_recirc_sharing_support); - - return 0; -} - -static int ovs_dp_stats_init(struct datapath *dp) -{ - dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); - if (!dp->stats_percpu) - return -ENOMEM; - - return 0; -} - -static int ovs_dp_vport_init(struct datapath *dp) -{ - int i; - - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) - return -ENOMEM; - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); - - return 0; -} - -static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct vport_parms parms; - struct sk_buff *reply; - struct datapath *dp; - struct vport *vport; - struct ovs_net *ovs_net; - int err; - - err = -EINVAL; - if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) - goto err; - - reply = ovs_dp_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - err = -ENOMEM; - dp = kzalloc(sizeof(*dp), GFP_KERNEL); - if (dp == NULL) - goto err_destroy_reply; - - ovs_dp_set_net(dp, sock_net(skb->sk)); - - /* Allocate table. */ - err = ovs_flow_tbl_init(&dp->table); - if (err) - goto err_destroy_dp; - - err = ovs_dp_stats_init(dp); - if (err) - goto err_destroy_table; - - err = ovs_dp_vport_init(dp); - if (err) - goto err_destroy_stats; - - err = ovs_meters_init(dp); - if (err) - goto err_destroy_ports; - - /* Set up our datapath device. */ - parms.name = nla_data(a[OVS_DP_ATTR_NAME]); - parms.type = OVS_VPORT_TYPE_INTERNAL; - parms.options = NULL; - parms.dp = dp; - parms.port_no = OVSP_LOCAL; - parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - - err = ovs_dp_change(dp, a); - if (err) - goto err_destroy_meters; - - /* So far only local changes have been made, now need the lock. */ - ovs_lock(); - - vport = new_vport(&parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - if (err == -EBUSY) - err = -EEXIST; - - if (err == -EEXIST) { - /* An outdated user space instance that does not understand - * the concept of user_features has attempted to create a new - * datapath and is likely to reuse it. Drop all user features. - */ - if (info->genlhdr->version < OVS_DP_VER_FEATURES) - ovs_dp_reset_user_features(skb, info); - } - - ovs_unlock(); - goto err_destroy_meters; - } - - err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_DP_CMD_NEW); - BUG_ON(err < 0); - - ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); - list_add_tail_rcu(&dp->list_node, &ovs_net->dps); - - ovs_unlock(); - - ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info); - return 0; - -err_destroy_meters: - ovs_meters_exit(dp); -err_destroy_ports: - kfree(dp->ports); -err_destroy_stats: - free_percpu(dp->stats_percpu); -err_destroy_table: - ovs_flow_tbl_destroy(&dp->table); -err_destroy_dp: - kfree(dp); -err_destroy_reply: - kfree_skb(reply); -err: - return err; -} - -/* Called with ovs_mutex. */ -static void __dp_destroy(struct datapath *dp) -{ - int i; - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - struct vport *vport; - struct hlist_node *n; - - hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) - if (vport->port_no != OVSP_LOCAL) - ovs_dp_detach_port(vport); - } - - list_del_rcu(&dp->list_node); - - /* OVSP_LOCAL is datapath internal port. We need to make sure that - * all ports in datapath are destroyed first before freeing datapath. - */ - ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); - - /* RCU destroy the flow table */ - call_rcu(&dp->rcu, destroy_dp_rcu); -} - -static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *reply; - struct datapath *dp; - int err; - - reply = ovs_dp_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); - err = PTR_ERR(dp); - if (IS_ERR(dp)) - goto err_unlock_free; - - err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_DP_CMD_DEL); - BUG_ON(err < 0); - - __dp_destroy(dp); - ovs_unlock(); - - ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info); - return 0; - -err_unlock_free: - ovs_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *reply; - struct datapath *dp; - int err; - - reply = ovs_dp_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); - err = PTR_ERR(dp); - if (IS_ERR(dp)) - goto err_unlock_free; - - err = ovs_dp_change(dp, info->attrs); - if (err) - goto err_unlock_free; - - err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_DP_CMD_GET); - BUG_ON(err < 0); - - ovs_unlock(); - - ovs_notify(&dp_datapath_genl_family, &ovs_dp_datapath_multicast_group, reply, info); - return 0; - -err_unlock_free: - ovs_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *reply; - struct datapath *dp; - int err; - - reply = ovs_dp_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); - if (IS_ERR(dp)) { - err = PTR_ERR(dp); - goto err_unlock_free; - } - err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_DP_CMD_GET); - BUG_ON(err < 0); - ovs_unlock(); - - return genlmsg_reply(reply, info); - -err_unlock_free: - ovs_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); - struct datapath *dp; - int skip = cb->args[0]; - int i = 0; - - ovs_lock(); - list_for_each_entry(dp, &ovs_net->dps, list_node) { - if (i >= skip && - ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_DP_CMD_GET) < 0) - break; - i++; - } - ovs_unlock(); - - cb->args[0] = i; - - return skb->len; -} - -static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { - [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, - [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, -}; - -static const struct genl_ops dp_datapath_genl_ops[] = { - { .cmd = OVS_DP_CMD_NEW, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = datapath_policy, -#endif - .doit = ovs_dp_cmd_new - }, - { .cmd = OVS_DP_CMD_DEL, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = datapath_policy, -#endif - .doit = ovs_dp_cmd_del - }, - { .cmd = OVS_DP_CMD_GET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = 0, /* OK for unprivileged users. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = datapath_policy, -#endif - .doit = ovs_dp_cmd_get, - .dumpit = ovs_dp_cmd_dump - }, - { .cmd = OVS_DP_CMD_SET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = datapath_policy, -#endif - .doit = ovs_dp_cmd_set, - }, -}; - -static struct genl_family dp_datapath_genl_family __ro_after_init = { - .hdrsize = sizeof(struct ovs_header), - .name = OVS_DATAPATH_FAMILY, - .version = OVS_DATAPATH_VERSION, - .maxattr = OVS_DP_ATTR_MAX, -#ifndef HAVE_GENL_OPS_POLICY - .policy = datapath_policy, -#endif - .netnsok = true, - .parallel_ops = true, - .ops = dp_datapath_genl_ops, - .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), - .mcgrps = &ovs_dp_datapath_multicast_group, - .n_mcgrps = 1, - .module = THIS_MODULE, -}; - -/* Called with ovs_mutex or RCU read lock. */ -static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, - struct net *net, u32 portid, u32 seq, - u32 flags, u8 cmd, gfp_t gfp) -{ - struct ovs_header *ovs_header; - struct ovs_vport_stats vport_stats; - int err; - - ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, - flags, cmd); - if (!ovs_header) - return -EMSGSIZE; - - ovs_header->dp_ifindex = get_dpifindex(vport->dp); - - if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || - nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || - nla_put_string(skb, OVS_VPORT_ATTR_NAME, - ovs_vport_name(vport)) || - nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) - goto nla_put_failure; - -#ifdef HAVE_PEERNET2ID_ALLOC - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev), gfp); - - if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; - } - -#endif - ovs_vport_get_stats(vport, &vport_stats); - if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, - sizeof(struct ovs_vport_stats), &vport_stats, - OVS_VPORT_ATTR_PAD)) - goto nla_put_failure; - - if (ovs_vport_get_upcall_portids(vport, skb)) - goto nla_put_failure; - - err = ovs_vport_get_options(vport, skb); - if (err == -EMSGSIZE) - goto error; - - genlmsg_end(skb, ovs_header); - return 0; - -nla_put_failure: - err = -EMSGSIZE; -error: - genlmsg_cancel(skb, ovs_header); - return err; -} - -static struct sk_buff *ovs_vport_cmd_alloc_info(void) -{ - return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); -} - -/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ -struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, - u32 portid, u32 seq, u8 cmd) -{ - struct sk_buff *skb; - int retval; - - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, - GFP_KERNEL); - BUG_ON(retval < 0); - - return skb; -} - -/* Called with ovs_mutex or RCU read lock. */ -static struct vport *lookup_vport(struct net *net, - const struct ovs_header *ovs_header, - struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) -{ - struct datapath *dp; - struct vport *vport; - - if (a[OVS_VPORT_ATTR_IFINDEX]) - return ERR_PTR(-EOPNOTSUPP); - if (a[OVS_VPORT_ATTR_NAME]) { - vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); - if (!vport) - return ERR_PTR(-ENODEV); - if (ovs_header->dp_ifindex && - ovs_header->dp_ifindex != get_dpifindex(vport->dp)) - return ERR_PTR(-ENODEV); - return vport; - } else if (a[OVS_VPORT_ATTR_PORT_NO]) { - u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); - - if (port_no >= DP_MAX_PORTS) - return ERR_PTR(-EFBIG); - - dp = get_dp(net, ovs_header->dp_ifindex); - if (!dp) - return ERR_PTR(-ENODEV); - - vport = ovs_vport_ovsl_rcu(dp, port_no); - if (!vport) - return ERR_PTR(-ENODEV); - return vport; - } else - return ERR_PTR(-EINVAL); - -} - -static unsigned int ovs_get_max_headroom(struct datapath *dp) -{ - unsigned int dev_headroom, max_headroom = 0; - struct net_device *dev; - struct vport *vport; - int i; - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { - dev = vport->dev; - dev_headroom = netdev_get_fwd_headroom(dev); - if (dev_headroom > max_headroom) - max_headroom = dev_headroom; - } - } - - return max_headroom; -} - -/* Called with ovs_mutex */ -static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) -{ - struct vport *vport; - int i; - - dp->max_headroom = new_headroom; - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) - netdev_set_rx_headroom(vport->dev, new_headroom); -} - -static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct vport_parms parms; - struct sk_buff *reply; - struct vport *vport; - struct datapath *dp; - unsigned int new_headroom; - u32 port_no; - int err; - - if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || - !a[OVS_VPORT_ATTR_UPCALL_PID]) - return -EINVAL; - if (a[OVS_VPORT_ATTR_IFINDEX]) - return -EOPNOTSUPP; - - port_no = a[OVS_VPORT_ATTR_PORT_NO] - ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; - if (port_no >= DP_MAX_PORTS) - return -EFBIG; - - reply = ovs_vport_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - ovs_lock(); -restart: - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - err = -ENODEV; - if (!dp) - goto exit_unlock_free; - - if (port_no) { - vport = ovs_vport_ovsl(dp, port_no); - err = -EBUSY; - if (vport) - goto exit_unlock_free; - } else { - for (port_no = 1; ; port_no++) { - if (port_no >= DP_MAX_PORTS) { - err = -EFBIG; - goto exit_unlock_free; - } - vport = ovs_vport_ovsl(dp, port_no); - if (!vport) - break; - } - } - - parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); - parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); - parms.options = a[OVS_VPORT_ATTR_OPTIONS]; - parms.dp = dp; - parms.port_no = port_no; - parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; - - vport = new_vport(&parms); - err = PTR_ERR(vport); - if (IS_ERR(vport)) { - if (err == -EAGAIN) - goto restart; - goto exit_unlock_free; - } - - err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), - info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_NEW, GFP_KERNEL); - BUG_ON(err < 0); - - new_headroom = netdev_get_fwd_headroom(vport->dev); - - if (new_headroom > dp->max_headroom) - ovs_update_headroom(dp, new_headroom); - else - netdev_set_rx_headroom(vport->dev, dp->max_headroom); - - ovs_unlock(); - - ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info); - return 0; - -exit_unlock_free: - ovs_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct sk_buff *reply; - struct vport *vport; - int err; - - reply = ovs_vport_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); - err = PTR_ERR(vport); - if (IS_ERR(vport)) - goto exit_unlock_free; - - if (a[OVS_VPORT_ATTR_TYPE] && - nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { - err = -EINVAL; - goto exit_unlock_free; - } - - if (a[OVS_VPORT_ATTR_OPTIONS]) { - err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); - if (err) - goto exit_unlock_free; - } - - if (a[OVS_VPORT_ATTR_UPCALL_PID]) { - struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; - - err = ovs_vport_set_upcall_portids(vport, ids); - if (err) - goto exit_unlock_free; - } - - err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), - info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_SET, GFP_KERNEL); - BUG_ON(err < 0); - ovs_unlock(); - - ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info); - return 0; - -exit_unlock_free: - ovs_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) -{ - bool update_headroom = false; - struct nlattr **a = info->attrs; - struct sk_buff *reply; - struct datapath *dp; - struct vport *vport; - unsigned int new_headroom; - int err; - - reply = ovs_vport_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); - err = PTR_ERR(vport); - if (IS_ERR(vport)) - goto exit_unlock_free; - - if (vport->port_no == OVSP_LOCAL) { - err = -EINVAL; - goto exit_unlock_free; - } - - err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), - info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_DEL, GFP_KERNEL); - BUG_ON(err < 0); - - /* the vport deletion may trigger dp headroom update */ - dp = vport->dp; - if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) - update_headroom = true; - - netdev_reset_rx_headroom(vport->dev); - ovs_dp_detach_port(vport); - - if (update_headroom) { - new_headroom = ovs_get_max_headroom(dp); - - if (new_headroom < dp->max_headroom) - ovs_update_headroom(dp, new_headroom); - } - ovs_unlock(); - - ovs_notify(&dp_vport_genl_family, &ovs_dp_vport_multicast_group, reply, info); - return 0; - -exit_unlock_free: - ovs_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct sk_buff *reply; - struct vport *vport; - int err; - - reply = ovs_vport_cmd_alloc_info(); - if (!reply) - return -ENOMEM; - - rcu_read_lock(); - vport = lookup_vport(sock_net(skb->sk), ovs_header, a); - err = PTR_ERR(vport); - if (IS_ERR(vport)) - goto exit_unlock_free; - err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), - info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_GET, GFP_ATOMIC); - BUG_ON(err < 0); - rcu_read_unlock(); - - return genlmsg_reply(reply, info); - -exit_unlock_free: - rcu_read_unlock(); - kfree_skb(reply); - return err; -} - -static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); - struct datapath *dp; - int bucket = cb->args[0], skip = cb->args[1]; - int i, j = 0; - - rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { - rcu_read_unlock(); - return -ENODEV; - } - for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { - struct vport *vport; - - j = 0; - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { - if (j >= skip && - ovs_vport_cmd_fill_info(vport, skb, - sock_net(skb->sk), - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI, - OVS_VPORT_CMD_GET, - GFP_ATOMIC) < 0) - goto out; - - j++; - } - skip = 0; - } -out: - rcu_read_unlock(); - - cb->args[0] = i; - cb->args[1] = j; - - return skb->len; -} - -static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { - [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, - [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, - [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, - [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, -}; - -static const struct genl_ops dp_vport_genl_ops[] = { - { .cmd = OVS_VPORT_CMD_NEW, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = vport_policy, -#endif - .doit = ovs_vport_cmd_new - }, - { .cmd = OVS_VPORT_CMD_DEL, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = vport_policy, -#endif - .doit = ovs_vport_cmd_del - }, - { .cmd = OVS_VPORT_CMD_GET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = 0, /* OK for unprivileged users. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = vport_policy, -#endif - .doit = ovs_vport_cmd_get, - .dumpit = ovs_vport_cmd_dump - }, - { .cmd = OVS_VPORT_CMD_SET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = vport_policy, -#endif - .doit = ovs_vport_cmd_set, - }, -}; - -struct genl_family dp_vport_genl_family __ro_after_init = { - .hdrsize = sizeof(struct ovs_header), - .name = OVS_VPORT_FAMILY, - .version = OVS_VPORT_VERSION, - .maxattr = OVS_VPORT_ATTR_MAX, -#ifndef HAVE_GENL_OPS_POLICY - .policy = vport_policy, -#endif - .netnsok = true, - .parallel_ops = true, - .ops = dp_vport_genl_ops, - .n_ops = ARRAY_SIZE(dp_vport_genl_ops), - .mcgrps = &ovs_dp_vport_multicast_group, - .n_mcgrps = 1, - .module = THIS_MODULE, -}; - -static struct genl_family *dp_genl_families[] = { - &dp_datapath_genl_family, - &dp_vport_genl_family, - &dp_flow_genl_family, - &dp_packet_genl_family, - &dp_meter_genl_family, -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) - &dp_ct_limit_genl_family, -#endif -}; - -static void dp_unregister_genl(int n_families) -{ - int i; - - for (i = 0; i < n_families; i++) - genl_unregister_family(dp_genl_families[i]); -} - -static int __init dp_register_genl(void) -{ - int err; - int i; - - for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { - - err = genl_register_family(dp_genl_families[i]); - if (err) - goto error; - } - - return 0; - -error: - dp_unregister_genl(i); - return err; -} - -static int __net_init ovs_init_net(struct net *net) -{ - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - INIT_LIST_HEAD(&ovs_net->dps); - INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); - ovs_netns_frags_init(net); - ovs_netns_frags6_init(net); - return ovs_ct_init(net); -} - -static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, - struct list_head *head) -{ - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - struct datapath *dp; - - list_for_each_entry(dp, &ovs_net->dps, list_node) { - int i; - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - struct vport *vport; - - hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - - if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) - continue; - - if (dev_net(vport->dev) == dnet) - list_add(&vport->detach_list, head); - } - } - } -} - -static void __net_exit ovs_exit_net(struct net *dnet) -{ - struct datapath *dp, *dp_next; - struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); - struct vport *vport, *vport_next; - struct net *net; - LIST_HEAD(head); - - ovs_netns_frags6_exit(dnet); - ovs_netns_frags_exit(dnet); - ovs_lock(); - - ovs_ct_exit(dnet); - - list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) - __dp_destroy(dp); - -#ifdef HAVE_NET_RWSEM - down_read(&net_rwsem); -#else - rtnl_lock(); -#endif - for_each_net(net) - list_vports_from_net(net, dnet, &head); -#ifdef HAVE_NET_RWSEM - up_read(&net_rwsem); -#else - rtnl_unlock(); -#endif - - /* Detach all vports from given namespace. */ - list_for_each_entry_safe(vport, vport_next, &head, detach_list) { - list_del(&vport->detach_list); - ovs_dp_detach_port(vport); - } - - ovs_unlock(); - - cancel_work_sync(&ovs_net->dp_notify_work); -} - -static struct pernet_operations ovs_net_ops = { - .init = ovs_init_net, - .exit = ovs_exit_net, - .id = &ovs_net_id, - .size = sizeof(struct ovs_net), -}; - -static int __init dp_init(void) -{ - int err; - - BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > sizeof_field(struct sk_buff, cb)); - - pr_info("Open vSwitch switching datapath %s\n", VERSION); - - ovs_nsh_init(); - err = action_fifos_init(); - if (err) - goto error; - - err = ovs_internal_dev_rtnl_link_register(); - if (err) - goto error_action_fifos_exit; - - err = ovs_flow_init(); - if (err) - goto error_unreg_rtnl_link; - - err = ovs_vport_init(); - if (err) - goto error_flow_exit; - - err = register_pernet_device(&ovs_net_ops); - if (err) - goto error_vport_exit; - - err = compat_init(); - if (err) - goto error_netns_exit; - - err = register_netdevice_notifier(&ovs_dp_device_notifier); - if (err) - goto error_compat_exit; - - err = ovs_netdev_init(); - if (err) - goto error_unreg_notifier; - - err = dp_register_genl(); - if (err < 0) - goto error_unreg_netdev; - - return 0; - -error_unreg_netdev: - ovs_netdev_exit(); -error_unreg_notifier: - unregister_netdevice_notifier(&ovs_dp_device_notifier); -error_compat_exit: - compat_exit(); -error_netns_exit: - unregister_pernet_device(&ovs_net_ops); -error_vport_exit: - ovs_vport_exit(); -error_flow_exit: - ovs_flow_exit(); -error_unreg_rtnl_link: - ovs_internal_dev_rtnl_link_unregister(); -error_action_fifos_exit: - action_fifos_exit(); -error: - ovs_nsh_cleanup(); - return err; -} - -static void dp_cleanup(void) -{ - dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); - ovs_netdev_exit(); - unregister_netdevice_notifier(&ovs_dp_device_notifier); - compat_exit(); - unregister_pernet_device(&ovs_net_ops); - rcu_barrier(); - ovs_vport_exit(); - ovs_flow_exit(); - ovs_internal_dev_rtnl_link_unregister(); - action_fifos_exit(); - ovs_nsh_cleanup(); -} - -module_init(dp_init); -module_exit(dp_cleanup); - -MODULE_DESCRIPTION("Open vSwitch switching datapath"); -MODULE_LICENSE("GPL"); -MODULE_VERSION(VERSION); -MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); -MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); -MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); -MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); -MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); -MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY); diff --git a/datapath/datapath.h b/datapath/datapath.h deleted file mode 100644 index c377e9b24..000000000 --- a/datapath/datapath.h +++ /dev/null @@ -1,283 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef DATAPATH_H -#define DATAPATH_H 1 - -#include <asm/page.h> -#include <linux/kernel.h> -#include <linux/mutex.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/u64_stats_sync.h> -#include <net/net_namespace.h> -#include <net/ip_tunnels.h> - -#include "compat.h" -#include "flow.h" -#include "flow_table.h" -#include "meter.h" -#include "vport-internal_dev.h" - -#define DP_MAX_PORTS USHRT_MAX -#define DP_VPORT_HASH_BUCKETS 1024 - -/** - * struct dp_stats_percpu - per-cpu packet processing statistics for a given - * datapath. - * @n_hit: Number of received packets for which a matching flow was found in - * the flow table. - * @n_miss: Number of received packets that had no matching flow in the flow - * table. The sum of @n_hit and @n_miss is the number of packets that have - * been received by the datapath. - * @n_lost: Number of received packets that had no matching flow in the flow - * table that could not be sent to userspace (normally due to an overflow in - * one of the datapath's queues). - * @n_mask_hit: Number of masks looked up for flow match. - * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked - * up per packet. - */ -struct dp_stats_percpu { - u64 n_hit; - u64 n_missed; - u64 n_lost; - u64 n_mask_hit; - struct u64_stats_sync syncp; -}; - -/** - * struct datapath - datapath for flow-based packet switching - * @rcu: RCU callback head for deferred destruction. - * @list_node: Element in global 'dps' list. - * @table: flow table. - * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by - * ovs_mutex and RCU. - * @stats_percpu: Per-CPU datapath statistics. - * @net: Reference to net namespace. - * @max_headroom: the maximum headroom of all vports in this datapath; it will - * be used by all the internal vports in this dp. - * - * Context: See the comment on locking at the top of datapath.c for additional - * locking information. - */ -struct datapath { - struct rcu_head rcu; - struct list_head list_node; - - /* Flow table. */ - struct flow_table table; - - /* Switch ports. */ - struct hlist_head *ports; - - /* Stats. */ - struct dp_stats_percpu __percpu *stats_percpu; - - /* Network namespace ref. */ - possible_net_t net; - - u32 user_features; - - u32 max_headroom; - - /* Switch meters. */ - struct hlist_head *meters; -}; - -/** - * struct ovs_skb_cb - OVS data in skb CB - * @input_vport: The original vport packet came in on. This value is cached - * when a packet is received by OVS. - * @mru: The maximum received fragement size; 0 if the packet is not - * fragmented. - * @acts_origlen: The netlink size of the flow actions applied to this skb. - * @cutlen: The number of bytes from the packet end to be removed. - */ -struct ovs_skb_cb { - struct vport *input_vport; - u16 mru; - u16 acts_origlen; - u32 cutlen; -}; -#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) - -/** - * struct dp_upcall - metadata to include with a packet to send to userspace - * @cmd: One of %OVS_PACKET_CMD_*. - * @userdata: If nonnull, its variable-length value is passed to userspace as - * %OVS_PACKET_ATTR_USERDATA. - * @portid: Netlink portid to which packet should be sent. If @portid is 0 - * then no packet is sent and the packet is accounted in the datapath's @n_lost - * counter. - * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. - * @mru: If not zero, Maximum received IP fragment size. - */ -struct dp_upcall_info { - struct ip_tunnel_info *egress_tun_info; - const struct nlattr *userdata; - const struct nlattr *actions; - int actions_len; - u32 portid; - u8 cmd; - u16 mru; -}; - -/** - * struct ovs_net - Per net-namespace data for ovs. - * @dps: List of datapaths to enable dumping them all out. - * Protected by genl_mutex. - */ -struct ovs_net { - struct list_head dps; - struct work_struct dp_notify_work; -#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) - struct ovs_ct_limit_info *ct_limit_info; -#endif - - /* Module reference for configuring conntrack. */ - bool xt_label; - -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct net *net; - struct netns_frags ipv4_frags; - struct netns_frags nf_frags; -#endif -}; - -/** - * enum ovs_pkt_hash_types - hash info to include with a packet - * to send to userspace. - * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. - * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash - * over transport ports. - */ -enum ovs_pkt_hash_types { - OVS_PACKET_HASH_SW_BIT = (1ULL << 32), - OVS_PACKET_HASH_L4_BIT = (1ULL << 33), -}; - -extern unsigned int ovs_net_id; -void ovs_lock(void); -void ovs_unlock(void); - -#ifdef CONFIG_LOCKDEP -int lockdep_ovsl_is_held(void); -#else -#define lockdep_ovsl_is_held() 1 -#endif - -#define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) -#define ovsl_dereference(p) \ - rcu_dereference_protected(p, lockdep_ovsl_is_held()) -#define rcu_dereference_ovsl(p) \ - rcu_dereference_check(p, lockdep_ovsl_is_held()) - -static inline struct net *ovs_dp_get_net(const struct datapath *dp) -{ - return rpl_read_pnet(&dp->net); -} - -static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) -{ - rpl_write_pnet(&dp->net, net); -} - -struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); - -static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return ovs_lookup_vport(dp, port_no); -} - -static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) -{ - WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); - return ovs_lookup_vport(dp, port_no); -} - -static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) -{ - ASSERT_OVSL(); - return ovs_lookup_vport(dp, port_no); -} - -/* Must be called with rcu_read_lock. */ -static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) -{ - struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); - - if (dev) { - struct vport *vport = ovs_internal_dev_get_vport(dev); - - if (vport) - return vport->dp; - } - - return NULL; -} - -/* The caller must hold either ovs_mutex or rcu_read_lock to keep the - * returned dp pointer valid. - */ -static inline struct datapath *get_dp(struct net *net, int dp_ifindex) -{ - struct datapath *dp; - - WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); - rcu_read_lock(); - dp = get_dp_rcu(net, dp_ifindex); - rcu_read_unlock(); - - return dp; -} - -extern struct notifier_block ovs_dp_device_notifier; -extern struct genl_family dp_vport_genl_family; -extern const struct genl_multicast_group ovs_dp_vport_multicast_group; - -DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support); - -void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); -void ovs_dp_detach_port(struct vport *); -int ovs_dp_upcall(struct datapath *, struct sk_buff *, - const struct sw_flow_key *, const struct dp_upcall_info *, - uint32_t cutlen); - -const char *ovs_dp_name(const struct datapath *dp); -struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, - u32 portid, u32 seq, u8 cmd); - -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct sw_flow_actions *, struct sw_flow_key *); - -void ovs_dp_notify_wq(struct work_struct *work); - -int action_fifos_init(void); -void action_fifos_exit(void); - -/* 'KEY' must not have any bits set outside of the 'MASK' */ -#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) -#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) - -#define OVS_NLERR(logging_allowed, fmt, ...) \ -do { \ - if (logging_allowed && net_ratelimit()) \ - pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ -} while (0) -#endif /* datapath.h */ diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c deleted file mode 100644 index 932a37ed2..000000000 --- a/datapath/dp_notify.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/netdevice.h> -#include <net/genetlink.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> - -#include "datapath.h" -#include "vport-internal_dev.h" -#include "vport-netdev.h" - -static void dp_detach_port_notify(struct vport *vport) -{ - struct sk_buff *notify; - struct datapath *dp; - - dp = vport->dp; - notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), - 0, 0, OVS_VPORT_CMD_DEL); - ovs_dp_detach_port(vport); - if (IS_ERR(notify)) { - genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, - GROUP_ID(&ovs_dp_vport_multicast_group), - PTR_ERR(notify)); - return; - } - - genlmsg_multicast_netns(&dp_vport_genl_family, - ovs_dp_get_net(dp), notify, 0, - GROUP_ID(&ovs_dp_vport_multicast_group), - GFP_KERNEL); -} - -void ovs_dp_notify_wq(struct work_struct *work) -{ - struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); - struct datapath *dp; - - ovs_lock(); - list_for_each_entry(dp, &ovs_net->dps, list_node) { - int i; - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - struct vport *vport; - struct hlist_node *n; - - hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) - continue; - - if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) - dp_detach_port_notify(vport); - } - } - } - ovs_unlock(); -} - -static int dp_device_event(struct notifier_block *unused, unsigned long event, - void *ptr) -{ - struct ovs_net *ovs_net; - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct vport *vport = NULL; - - if (!ovs_is_internal_dev(dev)) - vport = ovs_netdev_get_vport(dev); - - if (!vport) - return NOTIFY_DONE; - - if (event == NETDEV_UNREGISTER) { - /* upper_dev_unlink and decrement promisc immediately */ - ovs_netdev_detach_dev(vport); - - /* schedule vport destroy, dev_put and genl notification */ - ovs_net = net_generic(dev_net(dev), ovs_net_id); - queue_work(system_wq, &ovs_net->dp_notify_work); - } - - return NOTIFY_DONE; -} - -struct notifier_block ovs_dp_device_notifier = { - .notifier_call = dp_device_event -}; diff --git a/datapath/flow.c b/datapath/flow.c deleted file mode 100644 index 5a00c238c..000000000 --- a/datapath/flow.c +++ /dev/null @@ -1,972 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/uaccess.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <net/llc_pdu.h> -#include <linux/kernel.h> -#include <linux/jhash.h> -#include <linux/jiffies.h> -#include <linux/llc.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/rcupdate.h> -#include <linux/cpumask.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/mpls.h> -#include <linux/sctp.h> -#include <linux/smp.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <linux/icmpv6.h> -#include <linux/rculist.h> -#include <linux/timekeeping.h> -#include <net/ip.h> -#include <net/ipv6.h> -#include <net/mpls.h> -#include <net/ndisc.h> -#include <net/nsh.h> - -#include "datapath.h" -#include "conntrack.h" -#include "flow.h" -#include "flow_netlink.h" -#include "vport.h" - -u64 ovs_flow_used_time(unsigned long flow_jiffies) -{ - struct timespec64 cur_ts; - u64 cur_ms, idle_ms; - - ktime_get_ts64(&cur_ts); - idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); - cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC + - cur_ts.tv_nsec / NSEC_PER_MSEC; - - return cur_ms - idle_ms; -} - -#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) - -void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, - const struct sk_buff *skb) -{ - struct sw_flow_stats *stats; - unsigned int cpu = smp_processor_id(); - int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - stats = rcu_dereference(flow->stats[cpu]); - - /* Check if already have CPU-specific stats. */ - if (likely(stats)) { - spin_lock(&stats->lock); - /* Mark if we write on the pre-allocated stats. */ - if (cpu == 0 && unlikely(flow->stats_last_writer != cpu)) - flow->stats_last_writer = cpu; - } else { - stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ - spin_lock(&stats->lock); - - /* If the current CPU is the only writer on the - * pre-allocated stats keep using them. - */ - if (unlikely(flow->stats_last_writer != cpu)) { - /* A previous locker may have already allocated the - * stats, so we need to check again. If CPU-specific - * stats were already allocated, we update the pre- - * allocated stats as we have already locked them. - */ - if (likely(flow->stats_last_writer != -1) && - likely(!rcu_access_pointer(flow->stats[cpu]))) { - /* Try to allocate CPU-specific stats. */ - struct sw_flow_stats *new_stats; - - new_stats = - kmem_cache_alloc_node(flow_stats_cache, - GFP_NOWAIT | - __GFP_THISNODE | - __GFP_NOWARN | - __GFP_NOMEMALLOC, - numa_node_id()); - if (likely(new_stats)) { - new_stats->used = jiffies; - new_stats->packet_count = 1; - new_stats->byte_count = len; - new_stats->tcp_flags = tcp_flags; - spin_lock_init(&new_stats->lock); - - rcu_assign_pointer(flow->stats[cpu], - new_stats); - cpumask_set_cpu(cpu, &flow->cpu_used_mask); - goto unlock; - } - } - flow->stats_last_writer = cpu; - } - } - - stats->used = jiffies; - stats->packet_count++; - stats->byte_count += len; - stats->tcp_flags |= tcp_flags; -unlock: - spin_unlock(&stats->lock); -} - -/* Must be called with rcu_read_lock or ovs_mutex. */ -void ovs_flow_stats_get(const struct sw_flow *flow, - struct ovs_flow_stats *ovs_stats, - unsigned long *used, __be16 *tcp_flags) -{ - int cpu; - - *used = 0; - *tcp_flags = 0; - memset(ovs_stats, 0, sizeof(*ovs_stats)); - - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { - struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); - - if (stats) { - /* Local CPU may write on non-local stats, so we must - * block bottom-halves here. - */ - spin_lock_bh(&stats->lock); - if (!*used || time_after(stats->used, *used)) - *used = stats->used; - *tcp_flags |= stats->tcp_flags; - ovs_stats->n_packets += stats->packet_count; - ovs_stats->n_bytes += stats->byte_count; - spin_unlock_bh(&stats->lock); - } - } -} - -/* Called with ovs_mutex. */ -void ovs_flow_stats_clear(struct sw_flow *flow) -{ - int cpu; - - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) { - struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); - - if (stats) { - spin_lock_bh(&stats->lock); - stats->used = 0; - stats->packet_count = 0; - stats->byte_count = 0; - stats->tcp_flags = 0; - spin_unlock_bh(&stats->lock); - } - } -} - -static int check_header(struct sk_buff *skb, int len) -{ - if (unlikely(skb->len < len)) - return -EINVAL; - if (unlikely(!pskb_may_pull(skb, len))) - return -ENOMEM; - return 0; -} - -static bool arphdr_ok(struct sk_buff *skb) -{ - return pskb_may_pull(skb, skb_network_offset(skb) + - sizeof(struct arp_eth_header)); -} - -static int check_iphdr(struct sk_buff *skb) -{ - unsigned int nh_ofs = skb_network_offset(skb); - unsigned int ip_len; - int err; - - err = check_header(skb, nh_ofs + sizeof(struct iphdr)); - if (unlikely(err)) - return err; - - ip_len = ip_hdrlen(skb); - if (unlikely(ip_len < sizeof(struct iphdr) || - skb->len < nh_ofs + ip_len)) - return -EINVAL; - - skb_set_transport_header(skb, nh_ofs + ip_len); - return 0; -} - -static bool tcphdr_ok(struct sk_buff *skb) -{ - int th_ofs = skb_transport_offset(skb); - int tcp_len; - - if (unlikely(!pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr)))) - return false; - - tcp_len = tcp_hdrlen(skb); - if (unlikely(tcp_len < sizeof(struct tcphdr) || - skb->len < th_ofs + tcp_len)) - return false; - - return true; -} - -static bool udphdr_ok(struct sk_buff *skb) -{ - return pskb_may_pull(skb, skb_transport_offset(skb) + - sizeof(struct udphdr)); -} - -static bool sctphdr_ok(struct sk_buff *skb) -{ - return pskb_may_pull(skb, skb_transport_offset(skb) + - sizeof(struct sctphdr)); -} - -static bool icmphdr_ok(struct sk_buff *skb) -{ - return pskb_may_pull(skb, skb_transport_offset(skb) + - sizeof(struct icmphdr)); -} - -static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) -{ - unsigned short frag_off; - unsigned int payload_ofs = 0; - unsigned int nh_ofs = skb_network_offset(skb); - unsigned int nh_len; - struct ipv6hdr *nh; - int err, nexthdr, flags = 0; - - err = check_header(skb, nh_ofs + sizeof(*nh)); - if (unlikely(err)) - return err; - - nh = ipv6_hdr(skb); - - key->ip.proto = NEXTHDR_NONE; - key->ip.tos = ipv6_get_dsfield(nh); - key->ip.ttl = nh->hop_limit; - key->ipv6.label = *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); - key->ipv6.addr.src = nh->saddr; - key->ipv6.addr.dst = nh->daddr; - - nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); - if (flags & IP6_FH_F_FRAG) { - if (frag_off) { - key->ip.frag = OVS_FRAG_TYPE_LATER; - key->ip.proto = nexthdr; - return 0; - } - key->ip.frag = OVS_FRAG_TYPE_FIRST; - } else { - key->ip.frag = OVS_FRAG_TYPE_NONE; - } - - /* Delayed handling of error in ipv6_find_hdr() as it - * always sets flags and frag_off to a valid value which may be - * used to set key->ip.frag above. - */ - if (unlikely(nexthdr < 0)) - return -EPROTO; - - nh_len = payload_ofs - nh_ofs; - skb_set_transport_header(skb, nh_ofs + nh_len); - key->ip.proto = nexthdr; - return nh_len; -} - -static bool icmp6hdr_ok(struct sk_buff *skb) -{ - return pskb_may_pull(skb, skb_transport_offset(skb) + - sizeof(struct icmp6hdr)); -} - -/** - * Parse vlan tag from vlan header. - * Returns ERROR on memory error. - * Returns 0 if it encounters a non-vlan or incomplete packet. - * Returns 1 after successfully parsing vlan tag. - */ -static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, - bool untag_vlan) -{ - struct vlan_head *vh = (struct vlan_head *)skb->data; - - if (likely(!eth_type_vlan(vh->tpid))) - return 0; - - if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16))) - return 0; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) + - sizeof(__be16)))) - return -ENOMEM; - - vh = (struct vlan_head *)skb->data; - key_vh->tci = vh->tci | htons(VLAN_CFI_MASK); - key_vh->tpid = vh->tpid; - - if (unlikely(untag_vlan)) { - int offset = skb->data - skb_mac_header(skb); - u16 tci; - int err; - - __skb_push(skb, offset); - err = __skb_vlan_pop(skb, &tci); - __skb_pull(skb, offset); - if (err) - return err; - __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci); - } else { - __skb_pull(skb, sizeof(struct vlan_head)); - } - return 1; -} - -static void clear_vlan(struct sw_flow_key *key) -{ - key->eth.vlan.tci = 0; - key->eth.vlan.tpid = 0; - key->eth.cvlan.tci = 0; - key->eth.cvlan.tpid = 0; -} - -static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) -{ - int res; - - key->eth.vlan.tci = 0; - key->eth.vlan.tpid = 0; - key->eth.cvlan.tci = 0; - key->eth.cvlan.tpid = 0; - - if (skb_vlan_tag_present(skb)) { - key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK); - key->eth.vlan.tpid = skb->vlan_proto; - } else { - /* Parse outer vlan tag in the non-accelerated case. */ - res = parse_vlan_tag(skb, &key->eth.vlan, true); - if (res <= 0) - return res; - } - - /* Parse inner vlan tag. */ - res = parse_vlan_tag(skb, &key->eth.cvlan, false); - if (res <= 0) - return res; - - return 0; -} - -static __be16 parse_ethertype(struct sk_buff *skb) -{ - struct llc_snap_hdr { - u8 dsap; /* Always 0xAA */ - u8 ssap; /* Always 0xAA */ - u8 ctrl; - u8 oui[3]; - __be16 ethertype; - }; - struct llc_snap_hdr *llc; - __be16 proto; - - proto = *(__be16 *) skb->data; - __skb_pull(skb, sizeof(__be16)); - - if (eth_proto_is_802_3(proto)) - return proto; - - if (skb->len < sizeof(struct llc_snap_hdr)) - return htons(ETH_P_802_2); - - if (unlikely(!pskb_may_pull(skb, sizeof(struct llc_snap_hdr)))) - return htons(0); - - llc = (struct llc_snap_hdr *) skb->data; - if (llc->dsap != LLC_SAP_SNAP || - llc->ssap != LLC_SAP_SNAP || - (llc->oui[0] | llc->oui[1] | llc->oui[2]) != 0) - return htons(ETH_P_802_2); - - __skb_pull(skb, sizeof(struct llc_snap_hdr)); - - if (eth_proto_is_802_3(llc->ethertype)) - return llc->ethertype; - - return htons(ETH_P_802_2); -} - -static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, - int nh_len) -{ - struct icmp6hdr *icmp = icmp6_hdr(skb); - - /* The ICMPv6 type and code fields use the 16-bit transport port - * fields, so we need to store them in 16-bit network byte order. - */ - key->tp.src = htons(icmp->icmp6_type); - key->tp.dst = htons(icmp->icmp6_code); - memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); - - if (icmp->icmp6_code == 0 && - (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmp->icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) { - int icmp_len = skb->len - skb_transport_offset(skb); - struct nd_msg *nd; - int offset; - - /* In order to process neighbor discovery options, we need the - * entire packet. - */ - if (unlikely(icmp_len < sizeof(*nd))) - return 0; - - if (unlikely(skb_linearize(skb))) - return -ENOMEM; - - nd = (struct nd_msg *)skb_transport_header(skb); - key->ipv6.nd.target = nd->target; - - icmp_len -= sizeof(*nd); - offset = 0; - while (icmp_len >= 8) { - struct nd_opt_hdr *nd_opt = - (struct nd_opt_hdr *)(nd->opt + offset); - int opt_len = nd_opt->nd_opt_len * 8; - - if (unlikely(!opt_len || opt_len > icmp_len)) - return 0; - - /* Store the link layer address if the appropriate - * option is provided. It is considered an error if - * the same link layer option is specified twice. - */ - if (nd_opt->nd_opt_type == ND_OPT_SOURCE_LL_ADDR - && opt_len == 8) { - if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) - goto invalid; - ether_addr_copy(key->ipv6.nd.sll, - &nd->opt[offset+sizeof(*nd_opt)]); - } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR - && opt_len == 8) { - if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) - goto invalid; - ether_addr_copy(key->ipv6.nd.tll, - &nd->opt[offset+sizeof(*nd_opt)]); - } - - icmp_len -= opt_len; - offset += opt_len; - } - } - - return 0; - -invalid: - memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); - memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); - memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); - - return 0; -} - -static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) -{ - struct nshhdr *nh; - unsigned int nh_ofs = skb_network_offset(skb); - u8 version, length; - int err; - - err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN); - if (unlikely(err)) - return err; - - nh = nsh_hdr(skb); - version = nsh_get_ver(nh); - length = nsh_hdr_len(nh); - - if (version != 0) - return -EINVAL; - - err = check_header(skb, nh_ofs + length); - if (unlikely(err)) - return err; - - nh = nsh_hdr(skb); - key->nsh.base.flags = nsh_get_flags(nh); - key->nsh.base.ttl = nsh_get_ttl(nh); - key->nsh.base.mdtype = nh->mdtype; - key->nsh.base.np = nh->np; - key->nsh.base.path_hdr = nh->path_hdr; - switch (key->nsh.base.mdtype) { - case NSH_M_TYPE1: - if (length != NSH_M_TYPE1_LEN) - return -EINVAL; - memcpy(key->nsh.context, nh->md1.context, - sizeof(nh->md1)); - break; - case NSH_M_TYPE2: - memset(key->nsh.context, 0, - sizeof(nh->md1)); - break; - default: - return -EINVAL; - } - - return 0; -} - -/** - * key_extract_l3l4 - extracts L3/L4 header information. - * @skb: sk_buff that contains the frame, with skb->data pointing to the - * L3 header - * @key: output flow key - */ -static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) -{ - int error; - - /* Network layer. */ - if (key->eth.type == htons(ETH_P_IP)) { - struct iphdr *nh; - __be16 offset; - - error = check_iphdr(skb); - if (unlikely(error)) { - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv4, 0, sizeof(key->ipv4)); - if (error == -EINVAL) { - skb->transport_header = skb->network_header; - error = 0; - } - return error; - } - - nh = ip_hdr(skb); - key->ipv4.addr.src = nh->saddr; - key->ipv4.addr.dst = nh->daddr; - - key->ip.proto = nh->protocol; - key->ip.tos = nh->tos; - key->ip.ttl = nh->ttl; - - offset = nh->frag_off & htons(IP_OFFSET); - if (offset) { - key->ip.frag = OVS_FRAG_TYPE_LATER; - memset(&key->tp, 0, sizeof(key->tp)); - return 0; - } -#ifdef HAVE_SKB_GSO_UDP - if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) -#else - if (nh->frag_off & htons(IP_MF)) -#endif - key->ip.frag = OVS_FRAG_TYPE_FIRST; - else - key->ip.frag = OVS_FRAG_TYPE_NONE; - - /* Transport layer. */ - if (key->ip.proto == IPPROTO_TCP) { - if (tcphdr_ok(skb)) { - struct tcphdr *tcp = tcp_hdr(skb); - key->tp.src = tcp->source; - key->tp.dst = tcp->dest; - key->tp.flags = TCP_FLAGS_BE16(tcp); - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - - } else if (key->ip.proto == IPPROTO_UDP) { - if (udphdr_ok(skb)) { - struct udphdr *udp = udp_hdr(skb); - key->tp.src = udp->source; - key->tp.dst = udp->dest; - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } else if (key->ip.proto == IPPROTO_SCTP) { - if (sctphdr_ok(skb)) { - struct sctphdr *sctp = sctp_hdr(skb); - key->tp.src = sctp->source; - key->tp.dst = sctp->dest; - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } else if (key->ip.proto == IPPROTO_ICMP) { - if (icmphdr_ok(skb)) { - struct icmphdr *icmp = icmp_hdr(skb); - /* The ICMP type and code fields use the 16-bit - * transport port fields, so we need to store - * them in 16-bit network byte order. - */ - key->tp.src = htons(icmp->type); - key->tp.dst = htons(icmp->code); - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } - - } else if (key->eth.type == htons(ETH_P_ARP) || - key->eth.type == htons(ETH_P_RARP)) { - struct arp_eth_header *arp; - bool arp_available = arphdr_ok(skb); - - arp = (struct arp_eth_header *)skb_network_header(skb); - - if (arp_available && - arp->ar_hrd == htons(ARPHRD_ETHER) && - arp->ar_pro == htons(ETH_P_IP) && - arp->ar_hln == ETH_ALEN && - arp->ar_pln == 4) { - - /* We only match on the lower 8 bits of the opcode. */ - if (ntohs(arp->ar_op) <= 0xff) - key->ip.proto = ntohs(arp->ar_op); - else - key->ip.proto = 0; - - memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); - memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); - ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); - ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); - } else { - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv4, 0, sizeof(key->ipv4)); - } - } else if (eth_p_mpls(key->eth.type)) { - u8 label_count = 1; - - memset(&key->mpls, 0, sizeof(key->mpls)); - skb_set_inner_network_header(skb, skb->mac_len); - while (1) { - __be32 lse; - - error = check_header(skb, skb->mac_len + - label_count * MPLS_HLEN); - if (unlikely(error)) - return 0; - - memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); - - if (label_count <= MPLS_LABEL_DEPTH) - memcpy(&key->mpls.lse[label_count - 1], &lse, - MPLS_HLEN); - - skb_set_inner_network_header(skb, skb->mac_len + - label_count * MPLS_HLEN); - if (lse & htonl(MPLS_LS_S_MASK)) - break; - - label_count++; - } - if (label_count > MPLS_LABEL_DEPTH) - label_count = MPLS_LABEL_DEPTH; - - key->mpls.num_labels_mask = GENMASK(label_count - 1, 0); - } else if (key->eth.type == htons(ETH_P_IPV6)) { - int nh_len; /* IPv6 Header + Extensions */ - - nh_len = parse_ipv6hdr(skb, key); - if (unlikely(nh_len < 0)) { - switch (nh_len) { - case -EINVAL: - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); - /* fall-through */ - case -EPROTO: - skb->transport_header = skb->network_header; - error = 0; - break; - default: - error = nh_len; - } - return error; - } - - if (key->ip.frag == OVS_FRAG_TYPE_LATER) { - memset(&key->tp, 0, sizeof(key->tp)); - return 0; - } -#ifdef HAVE_SKB_GSO_UDP - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) - key->ip.frag = OVS_FRAG_TYPE_FIRST; - -#endif - /* Transport layer. */ - if (key->ip.proto == NEXTHDR_TCP) { - if (tcphdr_ok(skb)) { - struct tcphdr *tcp = tcp_hdr(skb); - key->tp.src = tcp->source; - key->tp.dst = tcp->dest; - key->tp.flags = TCP_FLAGS_BE16(tcp); - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } else if (key->ip.proto == NEXTHDR_UDP) { - if (udphdr_ok(skb)) { - struct udphdr *udp = udp_hdr(skb); - key->tp.src = udp->source; - key->tp.dst = udp->dest; - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } else if (key->ip.proto == NEXTHDR_SCTP) { - if (sctphdr_ok(skb)) { - struct sctphdr *sctp = sctp_hdr(skb); - key->tp.src = sctp->source; - key->tp.dst = sctp->dest; - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } else if (key->ip.proto == NEXTHDR_ICMP) { - if (icmp6hdr_ok(skb)) { - error = parse_icmpv6(skb, key, nh_len); - if (error) - return error; - } else { - memset(&key->tp, 0, sizeof(key->tp)); - } - } - } else if (key->eth.type == htons(ETH_P_NSH)) { - error = parse_nsh(skb, key); - if (error) - return error; - } - return 0; -} - -/** - * key_extract - extracts a flow key from an Ethernet frame. - * @skb: sk_buff that contains the frame, with skb->data pointing to the - * Ethernet header - * @key: output flow key - * - * The caller must ensure that skb->len >= ETH_HLEN. - * - * Returns 0 if successful, otherwise a negative errno value. - * - * Initializes @skb header fields as follows: - * - * - skb->mac_header: the L2 header. - * - * - skb->network_header: just past the L2 header, or just past the - * VLAN header, to the first byte of the L2 payload. - * - * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 - * on output, then just past the IP header, if one is present and - * of a correct length, otherwise the same as skb->network_header. - * For other key->eth.type values it is left untouched. - * - * - skb->protocol: the type of the data starting at skb->network_header. - * Equals to key->eth.type. - */ -static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) -{ - struct ethhdr *eth; - - /* Flags are always used as part of stats */ - key->tp.flags = 0; - - skb_reset_mac_header(skb); - - /* Link layer. */ - clear_vlan(key); - if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) { - if (unlikely(eth_type_vlan(skb->protocol))) - return -EINVAL; - - skb_reset_network_header(skb); - key->eth.type = skb->protocol; - } else { - eth = eth_hdr(skb); - ether_addr_copy(key->eth.src, eth->h_source); - ether_addr_copy(key->eth.dst, eth->h_dest); - - __skb_pull(skb, 2 * ETH_ALEN); - /* We are going to push all headers that we pull, so no need to - * update skb->csum here. - */ - - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; - - key->eth.type = parse_ethertype(skb); - if (unlikely(key->eth.type == htons(0))) - return -ENOMEM; - - /* Multiple tagged packets need to retain TPID to satisfy - * skb_vlan_pop(), which will later shift the ethertype into - * skb->protocol. - */ - if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) - skb->protocol = key->eth.cvlan.tpid; - else - skb->protocol = key->eth.type; - - skb_reset_network_header(skb); - __skb_push(skb, skb->data - skb_mac_header(skb)); - } - - skb_reset_mac_len(skb); - - /* Fill out L3/L4 key info, if any */ - return key_extract_l3l4(skb, key); -} - -/* In the case of conntrack fragment handling it expects L3 headers, - * add a helper. - */ -int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key) -{ - return key_extract_l3l4(skb, key); -} - -int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) -{ - int res; - - res = key_extract(skb, key); - if (!res) - key->mac_proto &= ~SW_FLOW_KEY_INVALID; - - return res; -} - -static int key_extract_mac_proto(struct sk_buff *skb) -{ - switch (skb->dev->type) { - case ARPHRD_ETHER: - return MAC_PROTO_ETHERNET; - case ARPHRD_NONE: - if (skb->protocol == htons(ETH_P_TEB)) - return MAC_PROTO_ETHERNET; - return MAC_PROTO_NONE; - } - WARN_ON_ONCE(1); - return -EINVAL; -} - -int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, - struct sk_buff *skb, struct sw_flow_key *key) -{ -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - struct tc_skb_ext *tc_ext; -#endif - int res, err; - - /* Extract metadata from packet. */ - if (tun_info) { - key->tun_proto = ip_tunnel_info_af(tun_info); - memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); - BUILD_BUG_ON(((1 << (sizeof(tun_info->options_len) * 8)) - 1) > - sizeof(key->tun_opts)); - - if (tun_info->options_len) { - ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), - tun_info); - key->tun_opts_len = tun_info->options_len; - } else { - key->tun_opts_len = 0; - } - } else { - key->tun_proto = 0; - key->tun_opts_len = 0; - memset(&key->tun_key, 0, sizeof(key->tun_key)); - } - - key->phy.priority = skb->priority; - key->phy.in_port = OVS_CB(skb)->input_vport->port_no; - key->phy.skb_mark = skb->mark; - key->ovs_flow_hash = 0; - res = key_extract_mac_proto(skb); - if (res < 0) - return res; - key->mac_proto = res; - -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - if (static_branch_unlikely(&tc_recirc_sharing_support)) { - tc_ext = skb_ext_find(skb, TC_SKB_EXT); - key->recirc_id = tc_ext ? tc_ext->chain : 0; - } else { - key->recirc_id = 0; - } -#else - key->recirc_id = 0; -#endif - - err = key_extract(skb, key); - if (!err) - ovs_ct_fill_key(skb, key); /* Must be after key_extract(). */ - return err; -} - -int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, - struct sk_buff *skb, - struct sw_flow_key *key, bool log) -{ - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - u64 attrs = 0; - int err; - - err = parse_flow_nlattrs(attr, a, &attrs, log); - if (err) - return -EINVAL; - - /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(net, a, attrs, key, log); - if (err) - return err; - - /* key_extract assumes that skb->protocol is set-up for - * layer 3 packets which is the case for other callers, - * in particular packets received from the network stack. - * Here the correct value can be set from the metadata - * extracted above. - * For L2 packet key eth type would be zero. skb protocol - * would be set to correct value later during key-extact. - */ - - skb->protocol = key->eth.type; - err = key_extract(skb, key); - if (err) - return err; - - /* Check that we have conntrack original direction tuple metadata only - * for packets for which it makes sense. Otherwise the key may be - * corrupted due to overlapping key fields. - */ - if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) && - key->eth.type != htons(ETH_P_IP)) - return -EINVAL; - if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) && - (key->eth.type != htons(ETH_P_IPV6) || - sw_flow_key_is_nd(key))) - return -EINVAL; - - return 0; -} diff --git a/datapath/flow.h b/datapath/flow.h deleted file mode 100644 index 584d9f565..000000000 --- a/datapath/flow.h +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Copyright (c) 2007-2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef FLOW_H -#define FLOW_H 1 - -#include <linux/cache.h> -#include <linux/kernel.h> -#include <linux/netlink.h> -#include <linux/openvswitch.h> -#include <linux/spinlock.h> -#include <linux/types.h> -#include <linux/rcupdate.h> -#include <linux/if_ether.h> -#include <linux/in6.h> -#include <linux/jiffies.h> -#include <linux/time.h> -#include <linux/cpumask.h> -#include <net/inet_ecn.h> -#include <net/ip_tunnels.h> -#include <net/dst_metadata.h> -#include <net/nsh.h> - -struct sk_buff; - -enum sw_flow_mac_proto { - MAC_PROTO_NONE = 0, - MAC_PROTO_ETHERNET, -}; -#define SW_FLOW_KEY_INVALID 0x80 -#define MPLS_LABEL_DEPTH 3 - -/* Store options at the end of the array if they are less than the - * maximum size. This allows us to get the benefits of variable length - * matching for small options. - */ -#define TUN_METADATA_OFFSET(opt_len) \ - (sizeof_field(struct sw_flow_key, tun_opts) - opt_len) -#define TUN_METADATA_OPTS(flow_key, opt_len) \ - ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) - -struct ovs_tunnel_info { - struct metadata_dst *tun_dst; -}; - -struct vlan_head { - __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ - __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */ -}; - -#define OVS_SW_FLOW_KEY_METADATA_SIZE \ - (offsetof(struct sw_flow_key, recirc_id) + \ - sizeof_field(struct sw_flow_key, recirc_id)) - -struct ovs_key_nsh { - struct ovs_nsh_key_base base; - __be32 context[NSH_MD1_CONTEXT_SIZE]; -}; - -struct sw_flow_key { - u8 tun_opts[255]; - u8 tun_opts_len; - struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ - struct { - u32 priority; /* Packet QoS priority. */ - u32 skb_mark; /* SKB mark. */ - u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ - } __packed phy; /* Safe when right after 'tun_key'. */ - u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */ - u8 tun_proto; /* Protocol of encapsulating tunnel. */ - u32 ovs_flow_hash; /* Datapath computed hash value. */ - u32 recirc_id; /* Recirculation ID. */ - struct { - u8 src[ETH_ALEN]; /* Ethernet source address. */ - u8 dst[ETH_ALEN]; /* Ethernet destination address. */ - struct vlan_head vlan; - struct vlan_head cvlan; - __be16 type; /* Ethernet frame type. */ - } eth; - /* Filling a hole of two bytes. */ - u8 ct_state; - u8 ct_orig_proto; /* CT original direction tuple IP - * protocol. - */ - union { - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; - }; - u16 ct_zone; /* Conntrack zone. */ - struct { - __be16 src; /* TCP/UDP/SCTP source port. */ - __be16 dst; /* TCP/UDP/SCTP destination port. */ - __be16 flags; /* TCP flags. */ - } tp; - union { - struct { - struct { - __be32 src; /* IP source address. */ - __be32 dst; /* IP destination address. */ - } addr; - union { - struct { - __be32 src; - __be32 dst; - } ct_orig; /* Conntrack original direction fields. */ - struct { - u8 sha[ETH_ALEN]; /* ARP source hardware address. */ - u8 tha[ETH_ALEN]; /* ARP target hardware address. */ - } arp; - }; - } ipv4; - struct { - struct { - struct in6_addr src; /* IPv6 source address. */ - struct in6_addr dst; /* IPv6 destination address. */ - } addr; - __be32 label; /* IPv6 flow label. */ - union { - struct { - struct in6_addr src; - struct in6_addr dst; - } ct_orig; /* Conntrack original direction fields. */ - struct { - struct in6_addr target; /* ND target address. */ - u8 sll[ETH_ALEN]; /* ND source link layer address. */ - u8 tll[ETH_ALEN]; /* ND target link layer address. */ - } nd; - }; - } ipv6; - struct { - u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ - __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ - } mpls; - struct ovs_key_nsh nsh; /* network service header */ - }; - struct { - /* Connection tracking fields not packed above. */ - struct { - __be16 src; /* CT orig tuple tp src port. */ - __be16 dst; /* CT orig tuple tp dst port. */ - } orig_tp; - u32 mark; - struct ovs_key_ct_labels labels; - } ct; - -} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ - -static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key) -{ - return key->eth.type == htons(ETH_P_IPV6) && - key->ip.proto == NEXTHDR_ICMP && - key->tp.dst == 0 && - (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)); -} - -struct sw_flow_key_range { - unsigned short int start; - unsigned short int end; -}; - -struct sw_flow_mask { - int ref_count; - struct rcu_head rcu; - struct sw_flow_key_range range; - struct sw_flow_key key; -}; - -struct sw_flow_match { - struct sw_flow_key *key; - struct sw_flow_key_range range; - struct sw_flow_mask *mask; -}; - -#define MAX_UFID_LENGTH 16 /* 128 bits */ - -struct sw_flow_id { - u32 ufid_len; - union { - u32 ufid[MAX_UFID_LENGTH / 4]; - struct sw_flow_key *unmasked_key; - }; -}; - -struct sw_flow_actions { - struct rcu_head rcu; - size_t orig_len; /* From flow_cmd_new netlink actions size */ - u32 actions_len; - struct nlattr actions[]; -}; - -struct sw_flow_stats { - u64 packet_count; /* Number of packets matched. */ - u64 byte_count; /* Number of bytes matched. */ - unsigned long used; /* Last used time (in jiffies). */ - spinlock_t lock; /* Lock for atomic stats update. */ - __be16 tcp_flags; /* Union of seen TCP flags. */ -}; - -struct sw_flow { - struct rcu_head rcu; - struct { - struct hlist_node node[2]; - u32 hash; - } flow_table, ufid_table; - int stats_last_writer; /* CPU id of the last writer on - * 'stats[0]'. - */ - struct sw_flow_key key; - struct sw_flow_id id; - struct cpumask cpu_used_mask; - struct sw_flow_mask *mask; - struct sw_flow_actions __rcu *sf_acts; - struct sw_flow_stats __rcu *stats[]; /* One for each CPU. First one - * is allocated at flow creation time, - * the rest are allocated on demand - * while holding the 'stats[0].lock'. - */ -}; - -struct arp_eth_header { - __be16 ar_hrd; /* format of hardware address */ - __be16 ar_pro; /* format of protocol address */ - unsigned char ar_hln; /* length of hardware address */ - unsigned char ar_pln; /* length of protocol address */ - __be16 ar_op; /* ARP opcode (command) */ - - /* Ethernet+IPv4 specific members. */ - unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */ - unsigned char ar_sip[4]; /* sender IP address */ - unsigned char ar_tha[ETH_ALEN]; /* target hardware address */ - unsigned char ar_tip[4]; /* target IP address */ -} __packed; - -static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key) -{ - return key->mac_proto & ~SW_FLOW_KEY_INVALID; -} - -static inline u16 __ovs_mac_header_len(u8 mac_proto) -{ - return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0; -} - -static inline u16 ovs_mac_header_len(const struct sw_flow_key *key) -{ - return __ovs_mac_header_len(ovs_key_mac_proto(key)); -} - -static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) -{ - return sfid->ufid_len; -} - -static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid) -{ - return !ovs_identifier_is_ufid(sfid); -} - -void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, - const struct sk_buff *); -void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, - unsigned long *used, __be16 *tcp_flags); -void ovs_flow_stats_clear(struct sw_flow *); -u64 ovs_flow_used_time(unsigned long flow_jiffies); - -/* Update the non-metadata part of the flow key using skb. */ -int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, - struct sk_buff *skb, - struct sw_flow_key *key); -/* Extract key from packet coming from userspace. */ -int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, - struct sk_buff *skb, - struct sw_flow_key *key, bool log); - -#endif /* flow.h */ diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c deleted file mode 100644 index caed44386..000000000 --- a/datapath/flow_netlink.c +++ /dev/null @@ -1,3519 +0,0 @@ -/* - * Copyright (c) 2007-2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/uaccess.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <net/llc_pdu.h> -#include <linux/kernel.h> -#include <linux/jhash.h> -#include <linux/jiffies.h> -#include <linux/llc.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/rcupdate.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/sctp.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <linux/icmpv6.h> -#include <linux/rculist.h> -#include <net/geneve.h> -#include <net/ip.h> -#include <net/ipv6.h> -#include <net/ndisc.h> -#include <net/mpls.h> -#include <net/vxlan.h> -#include <net/tun_proto.h> -#include <net/erspan.h> - -#include "datapath.h" -#include "conntrack.h" -#include "flow.h" -#include "flow_netlink.h" -#include "gso.h" - -struct ovs_len_tbl { - int len; - const struct ovs_len_tbl *next; -}; - -#define OVS_ATTR_NESTED -1 -#define OVS_ATTR_VARIABLE -2 - -static bool actions_may_change_flow(const struct nlattr *actions) -{ - struct nlattr *nla; - int rem; - - nla_for_each_nested(nla, actions, rem) { - u16 action = nla_type(nla); - - switch (action) { - case OVS_ACTION_ATTR_OUTPUT: - case OVS_ACTION_ATTR_RECIRC: - case OVS_ACTION_ATTR_TRUNC: - case OVS_ACTION_ATTR_USERSPACE: - break; - - case OVS_ACTION_ATTR_CT: - case OVS_ACTION_ATTR_CT_CLEAR: - case OVS_ACTION_ATTR_HASH: - case OVS_ACTION_ATTR_POP_ETH: - case OVS_ACTION_ATTR_POP_MPLS: - case OVS_ACTION_ATTR_POP_NSH: - case OVS_ACTION_ATTR_POP_VLAN: - case OVS_ACTION_ATTR_PUSH_ETH: - case OVS_ACTION_ATTR_PUSH_MPLS: - case OVS_ACTION_ATTR_PUSH_NSH: - case OVS_ACTION_ATTR_PUSH_VLAN: - case OVS_ACTION_ATTR_SAMPLE: - case OVS_ACTION_ATTR_SET: - case OVS_ACTION_ATTR_SET_MASKED: - case OVS_ACTION_ATTR_METER: - case OVS_ACTION_ATTR_CHECK_PKT_LEN: - default: - return true; - } - } - return false; -} - -static void update_range(struct sw_flow_match *match, - size_t offset, size_t size, bool is_mask) -{ - struct sw_flow_key_range *range; - size_t start = rounddown(offset, sizeof(long)); - size_t end = roundup(offset + size, sizeof(long)); - - if (!is_mask) - range = &match->range; - else - range = &match->mask->range; - - if (range->start == range->end) { - range->start = start; - range->end = end; - return; - } - - if (range->start > start) - range->start = start; - - if (range->end < end) - range->end = end; -} - -#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ - do { \ - update_range(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) \ - (match)->mask->key.field = value; \ - else \ - (match)->key->field = value; \ - } while (0) - -#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ - do { \ - update_range(match, offset, len, is_mask); \ - if (is_mask) \ - memcpy((u8 *)&(match)->mask->key + offset, value_p, len);\ - else \ - memcpy((u8 *)(match)->key + offset, value_p, len); \ - } while (0) - -#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ - SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ - value_p, len, is_mask) - -#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ - do { \ - update_range(match, offsetof(struct sw_flow_key, field), \ - sizeof((match)->key->field), is_mask); \ - if (is_mask) \ - memset((u8 *)&(match)->mask->key.field, value, \ - sizeof((match)->mask->key.field)); \ - else \ - memset((u8 *)&(match)->key->field, value, \ - sizeof((match)->key->field)); \ - } while (0) - -static bool match_validate(const struct sw_flow_match *match, - u64 key_attrs, u64 mask_attrs, bool log) -{ - u64 key_expected = 0; - u64 mask_allowed = key_attrs; /* At most allow all key attributes */ - - /* The following mask attributes allowed only if they - * pass the validation tests. - */ - mask_allowed &= ~((1ULL << OVS_KEY_ATTR_IPV4) - | (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) - | (1ULL << OVS_KEY_ATTR_IPV6) - | (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) - | (1ULL << OVS_KEY_ATTR_TCP) - | (1ULL << OVS_KEY_ATTR_TCP_FLAGS) - | (1ULL << OVS_KEY_ATTR_UDP) - | (1ULL << OVS_KEY_ATTR_SCTP) - | (1ULL << OVS_KEY_ATTR_ICMP) - | (1ULL << OVS_KEY_ATTR_ICMPV6) - | (1ULL << OVS_KEY_ATTR_ARP) - | (1ULL << OVS_KEY_ATTR_ND) - | (1ULL << OVS_KEY_ATTR_MPLS) - | (1ULL << OVS_KEY_ATTR_NSH)); - - /* Always allowed mask fields. */ - mask_allowed |= ((1ULL << OVS_KEY_ATTR_TUNNEL) - | (1ULL << OVS_KEY_ATTR_IN_PORT) - | (1ULL << OVS_KEY_ATTR_ETHERTYPE)); - - /* Check key attributes. */ - if (match->key->eth.type == htons(ETH_P_ARP) - || match->key->eth.type == htons(ETH_P_RARP)) { - key_expected |= 1ULL << OVS_KEY_ATTR_ARP; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1ULL << OVS_KEY_ATTR_ARP; - } - - if (eth_p_mpls(match->key->eth.type)) { - key_expected |= 1ULL << OVS_KEY_ATTR_MPLS; - if (match->mask && (match->mask->key.eth.type == htons(0xffff))) - mask_allowed |= 1ULL << OVS_KEY_ATTR_MPLS; - } - - if (match->key->eth.type == htons(ETH_P_IP)) { - key_expected |= 1ULL << OVS_KEY_ATTR_IPV4; - if (match->mask && match->mask->key.eth.type == htons(0xffff)) { - mask_allowed |= 1ULL << OVS_KEY_ATTR_IPV4; - mask_allowed |= 1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4; - } - - if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { - if (match->key->ip.proto == IPPROTO_UDP) { - key_expected |= 1ULL << OVS_KEY_ATTR_UDP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1ULL << OVS_KEY_ATTR_UDP; - } - - if (match->key->ip.proto == IPPROTO_SCTP) { - key_expected |= 1ULL << OVS_KEY_ATTR_SCTP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1ULL << OVS_KEY_ATTR_SCTP; - } - - if (match->key->ip.proto == IPPROTO_TCP) { - key_expected |= 1ULL << OVS_KEY_ATTR_TCP; - key_expected |= 1ULL << OVS_KEY_ATTR_TCP_FLAGS; - if (match->mask && (match->mask->key.ip.proto == 0xff)) { - mask_allowed |= 1ULL << OVS_KEY_ATTR_TCP; - mask_allowed |= 1ULL << OVS_KEY_ATTR_TCP_FLAGS; - } - } - - if (match->key->ip.proto == IPPROTO_ICMP) { - key_expected |= 1ULL << OVS_KEY_ATTR_ICMP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1ULL << OVS_KEY_ATTR_ICMP; - } - } - } - - if (match->key->eth.type == htons(ETH_P_IPV6)) { - key_expected |= 1ULL << OVS_KEY_ATTR_IPV6; - if (match->mask && match->mask->key.eth.type == htons(0xffff)) { - mask_allowed |= 1ULL << OVS_KEY_ATTR_IPV6; - mask_allowed |= 1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6; - } - - if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { - if (match->key->ip.proto == IPPROTO_UDP) { - key_expected |= 1ULL << OVS_KEY_ATTR_UDP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1ULL << OVS_KEY_ATTR_UDP; - } - - if (match->key->ip.proto == IPPROTO_SCTP) { - key_expected |= 1ULL << OVS_KEY_ATTR_SCTP; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1ULL << OVS_KEY_ATTR_SCTP; - } - - if (match->key->ip.proto == IPPROTO_TCP) { - key_expected |= 1ULL << OVS_KEY_ATTR_TCP; - key_expected |= 1ULL << OVS_KEY_ATTR_TCP_FLAGS; - if (match->mask && (match->mask->key.ip.proto == 0xff)) { - mask_allowed |= 1ULL << OVS_KEY_ATTR_TCP; - mask_allowed |= 1ULL << OVS_KEY_ATTR_TCP_FLAGS; - } - } - - if (match->key->ip.proto == IPPROTO_ICMPV6) { - key_expected |= 1ULL << OVS_KEY_ATTR_ICMPV6; - if (match->mask && (match->mask->key.ip.proto == 0xff)) - mask_allowed |= 1ULL << OVS_KEY_ATTR_ICMPV6; - - if (match->key->tp.src == - htons(NDISC_NEIGHBOUR_SOLICITATION) || - match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - key_expected |= 1ULL << OVS_KEY_ATTR_ND; - /* Original direction conntrack tuple - * uses the same space as the ND fields - * in the key, so both are not allowed - * at the same time. - */ - mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); - if (match->mask && (match->mask->key.tp.src == htons(0xff))) - mask_allowed |= 1ULL << OVS_KEY_ATTR_ND; - } - } - } - } - - if (match->key->eth.type == htons(ETH_P_NSH)) { - key_expected |= 1 << OVS_KEY_ATTR_NSH; - if (match->mask && - match->mask->key.eth.type == htons(0xffff)) { - mask_allowed |= 1 << OVS_KEY_ATTR_NSH; - } - } - - if ((key_attrs & key_expected) != key_expected) { - /* Key attributes check failed. */ - OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", - (unsigned long long)key_attrs, - (unsigned long long)key_expected); - return false; - } - - if ((mask_attrs & mask_allowed) != mask_attrs) { - /* Mask attributes check failed. */ - OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)", - (unsigned long long)mask_attrs, - (unsigned long long)mask_allowed); - return false; - } - - return true; -} - -size_t ovs_tun_key_attr_size(void) -{ - /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider - * updating this function. - */ - return nla_total_size_64bit(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ - + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ - + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ - /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and - * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with - * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. - */ - + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ -} - -static size_t ovs_nsh_key_attr_size(void) -{ - /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider - * updating this function. - */ - return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */ - /* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are - * mutually exclusive, so the bigger one can cover - * the small one. - */ - + nla_total_size(NSH_CTX_HDRS_MAX_LEN); -} - -size_t ovs_key_attr_size(void) -{ - /* Whenever adding new OVS_KEY_ FIELDS, we should consider - * updating this function. - */ - BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 31); - - return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ - + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ - + ovs_tun_key_attr_size() - + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ - + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ - + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ - + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ - + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */ - + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ - + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ - + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ - + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ - + nla_total_size(0) /* OVS_KEY_ATTR_NSH */ - + ovs_nsh_key_attr_size() - + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ - + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ - + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ -} - -static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, -}; - -static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { - [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, - [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, - [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) }, - [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 }, - [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 }, - [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, - [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, - [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, - [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, - [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, - .next = ovs_vxlan_ext_key_lens }, - [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, - [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, - [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, -}; - -static const struct ovs_len_tbl -ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = { - [OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) }, - [OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) }, - [OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE }, -}; - -/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ -static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { - [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, - [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) }, - [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) }, - [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) }, - [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) }, - [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) }, - [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) }, - [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) }, - [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) }, - [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) }, - [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) }, - [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) }, - [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) }, - [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) }, - [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, - .next = ovs_tunnel_key_lens, }, - [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE }, - [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, - [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, - [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, - [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = { - .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, - [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { - .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, - [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, - .next = ovs_nsh_key_attr_lens, }, -}; - -static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) -{ - return expected_len == attr_len || - expected_len == OVS_ATTR_NESTED || - expected_len == OVS_ATTR_VARIABLE; -} - -static bool is_all_zero(const u8 *fp, size_t size) -{ - int i; - - if (!fp) - return false; - - for (i = 0; i < size; i++) - if (fp[i]) - return false; - - return true; -} - -static int __parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], - u64 *attrsp, bool log, bool nz) -{ - const struct nlattr *nla; - u64 attrs; - int rem; - - attrs = *attrsp; - nla_for_each_nested(nla, attr, rem) { - u16 type = nla_type(nla); - int expected_len; - - if (type > OVS_KEY_ATTR_MAX) { - OVS_NLERR(log, "Key type %d is out of range max %d", - type, OVS_KEY_ATTR_MAX); - return -EINVAL; - } - - if (type == OVS_KEY_ATTR_PACKET_TYPE || - type == OVS_KEY_ATTR_ND_EXTENSIONS || - type == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_NLERR(log, "Key type %d is not supported", type); - return -EINVAL; - } - - if (attrs & (1ULL << type)) { - OVS_NLERR(log, "Duplicate key (type %d).", type); - return -EINVAL; - } - - expected_len = ovs_key_lens[type].len; - if (!check_attr_len(nla_len(nla), expected_len)) { - OVS_NLERR(log, "Key %d has unexpected len %d expected %d", - type, nla_len(nla), expected_len); - return -EINVAL; - } - - if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { - attrs |= 1ULL << type; - a[type] = nla; - } - } - if (rem) { - OVS_NLERR(log, "Message has %d unknown bytes.", rem); - return -EINVAL; - } - - *attrsp = attrs; - return 0; -} - -static int parse_flow_mask_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u64 *attrsp, - bool log) -{ - return __parse_flow_nlattrs(attr, a, attrsp, log, true); -} - -int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], - u64 *attrsp, bool log) -{ - return __parse_flow_nlattrs(attr, a, attrsp, log, false); -} - -static int genev_tun_opt_from_nlattr(const struct nlattr *a, - struct sw_flow_match *match, bool is_mask, - bool log) -{ - unsigned long opt_key_offset; - - if (nla_len(a) > sizeof(match->key->tun_opts)) { - OVS_NLERR(log, "Geneve option length err (len %d, max %zu).", - nla_len(a), sizeof(match->key->tun_opts)); - return -EINVAL; - } - - if (nla_len(a) % 4 != 0) { - OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.", - nla_len(a)); - return -EINVAL; - } - - /* We need to record the length of the options passed - * down, otherwise packets with the same format but - * additional options will be silently matched. - */ - if (!is_mask) { - SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), - false); - } else { - /* This is somewhat unusual because it looks at - * both the key and mask while parsing the - * attributes (and by extension assumes the key - * is parsed first). Normally, we would verify - * that each is the correct length and that the - * attributes line up in the validate function. - * However, that is difficult because this is - * variable length and we won't have the - * information later. - */ - if (match->key->tun_opts_len != nla_len(a)) { - OVS_NLERR(log, "Geneve option len %d != mask len %d", - match->key->tun_opts_len, nla_len(a)); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); - } - - opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), - nla_len(a), is_mask); - return 0; -} - -static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) -{ - struct nlattr *a; - int rem; - unsigned long opt_key_offset; - struct vxlan_metadata opts; - - BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); - - memset(&opts, 0, sizeof(opts)); - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - - if (type > OVS_VXLAN_EXT_MAX) { - OVS_NLERR(log, "VXLAN extension %d out of range max %d", - type, OVS_VXLAN_EXT_MAX); - return -EINVAL; - } - - if (!check_attr_len(nla_len(a), - ovs_vxlan_ext_key_lens[type].len)) { - OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", - type, nla_len(a), - ovs_vxlan_ext_key_lens[type].len); - return -EINVAL; - } - - switch (type) { - case OVS_VXLAN_EXT_GBP: - opts.gbp = nla_get_u32(a); - break; - default: - OVS_NLERR(log, "Unknown VXLAN extension attribute %d", - type); - return -EINVAL; - } - } - if (rem) { - OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", - rem); - return -EINVAL; - } - - if (!is_mask) - SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); - else - SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); - - opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), - is_mask); - return 0; -} - -static int erspan_tun_opt_from_nlattr(const struct nlattr *a, - struct sw_flow_match *match, bool is_mask, - bool log) -{ - unsigned long opt_key_offset; - - BUILD_BUG_ON(sizeof(struct erspan_metadata) > - sizeof(match->key->tun_opts)); - - if (nla_len(a) > sizeof(match->key->tun_opts)) { - OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).", - nla_len(a), sizeof(match->key->tun_opts)); - return -EINVAL; - } - - if (!is_mask) - SW_FLOW_KEY_PUT(match, tun_opts_len, - sizeof(struct erspan_metadata), false); - else - SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); - - opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); - SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), - nla_len(a), is_mask); - return 0; -} - -static int ip_tun_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool log) -{ - bool ttl = false, ipv4 = false, ipv6 = false; - __be16 tun_flags = 0; - int opts_type = 0; - struct nlattr *a; - int rem; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - int err; - - if (type > OVS_TUNNEL_KEY_ATTR_MAX) { - OVS_NLERR(log, "Tunnel attr %d out of range max %d", - type, OVS_TUNNEL_KEY_ATTR_MAX); - return -EINVAL; - } - - if (!check_attr_len(nla_len(a), - ovs_tunnel_key_lens[type].len)) { - OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", - type, nla_len(a), ovs_tunnel_key_lens[type].len); - return -EINVAL; - } - - switch (type) { - case OVS_TUNNEL_KEY_ATTR_ID: - SW_FLOW_KEY_PUT(match, tun_key.tun_id, - nla_get_be64(a), is_mask); - tun_flags |= TUNNEL_KEY; - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, - nla_get_in_addr(a), is_mask); - ipv4 = true; - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, - nla_get_in_addr(a), is_mask); - ipv4 = true; - break; - case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, - nla_get_in6_addr(a), is_mask); - ipv6 = true; - break; - case OVS_TUNNEL_KEY_ATTR_IPV6_DST: - SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, - nla_get_in6_addr(a), is_mask); - ipv6 = true; - break; - case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.tos, - nla_get_u8(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ttl, - nla_get_u8(a), is_mask); - ttl = true; - break; - case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_flags |= TUNNEL_DONT_FRAGMENT; - break; - case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_flags |= TUNNEL_CSUM; - break; - case OVS_TUNNEL_KEY_ATTR_TP_SRC: - SW_FLOW_KEY_PUT(match, tun_key.tp_src, - nla_get_be16(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_TP_DST: - SW_FLOW_KEY_PUT(match, tun_key.tp_dst, - nla_get_be16(a), is_mask); - break; - case OVS_TUNNEL_KEY_ATTR_OAM: - tun_flags |= TUNNEL_OAM; - break; - case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: - if (opts_type) { - OVS_NLERR(log, "Multiple metadata blocks provided"); - return -EINVAL; - } - - err = genev_tun_opt_from_nlattr(a, match, is_mask, log); - if (err) - return err; - - tun_flags |= TUNNEL_GENEVE_OPT; - opts_type = type; - break; - case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: - if (opts_type) { - OVS_NLERR(log, "Multiple metadata blocks provided"); - return -EINVAL; - } - - err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log); - if (err) - return err; - - tun_flags |= TUNNEL_VXLAN_OPT; - opts_type = type; - break; - case OVS_TUNNEL_KEY_ATTR_PAD: - break; - case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - if (opts_type) { - OVS_NLERR(log, "Multiple metadata blocks provided"); - return -EINVAL; - } - - err = erspan_tun_opt_from_nlattr(a, match, is_mask, - log); - if (err) - return err; - - tun_flags |= TUNNEL_ERSPAN_OPT; - opts_type = type; - break; - default: - OVS_NLERR(log, "Unknown IP tunnel attribute %d", - type); - return -EINVAL; - } - } - - SW_FLOW_KEY_PUT(match, tun_key.tun_flags, tun_flags, is_mask); - if (is_mask) - SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); - else - SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? AF_INET6 : AF_INET, - false); - - if (rem > 0) { - OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", - rem); - return -EINVAL; - } - - if (ipv4 && ipv6) { - OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); - return -EINVAL; - } - - if (!is_mask) { - if (!ipv4 && !ipv6) { - OVS_NLERR(log, "IP tunnel dst address not specified"); - return -EINVAL; - } - if (ipv4 && !match->key->tun_key.u.ipv4.dst) { - OVS_NLERR(log, "IPv4 tunnel dst address is zero"); - return -EINVAL; - } - if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { - OVS_NLERR(log, "IPv6 tunnel dst address is zero"); - return -EINVAL; - } - - if (!ttl) { - OVS_NLERR(log, "IP tunnel TTL not specified."); - return -EINVAL; - } - } - - return opts_type; -} - -static int vxlan_opt_to_nlattr(struct sk_buff *skb, - const void *tun_opts, int swkey_tun_opts_len) -{ - const struct vxlan_metadata *opts = tun_opts; - struct nlattr *nla; - - nla = nla_nest_start_noflag(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); - if (!nla) - return -EMSGSIZE; - - if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0) - return -EMSGSIZE; - - nla_nest_end(skb, nla); - return 0; -} - -static int __ip_tun_to_nlattr(struct sk_buff *skb, - const struct ip_tunnel_key *output, - const void *tun_opts, int swkey_tun_opts_len, - unsigned short tun_proto) -{ - if (output->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, - OVS_TUNNEL_KEY_ATTR_PAD)) - return -EMSGSIZE; - switch (tun_proto) { - case AF_INET: - if (output->u.ipv4.src && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->u.ipv4.src)) - return -EMSGSIZE; - if (output->u.ipv4.dst && - nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->u.ipv4.dst)) - return -EMSGSIZE; - break; - case AF_INET6: - if (!ipv6_addr_any(&output->u.ipv6.src) && - nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC, - &output->u.ipv6.src)) - return -EMSGSIZE; - if (!ipv6_addr_any(&output->u.ipv6.dst) && - nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, - &output->u.ipv6.dst)) - return -EMSGSIZE; - break; - } - if (output->tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) - return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) - return -EMSGSIZE; - if (output->tp_src && - nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src)) - return -EMSGSIZE; - if (output->tp_dst && - nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) - return -EMSGSIZE; - if ((output->tun_flags & TUNNEL_OAM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) - return -EMSGSIZE; - if (swkey_tun_opts_len) { - if (output->tun_flags & TUNNEL_GENEVE_OPT && - nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, - swkey_tun_opts_len, tun_opts)) - return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_VXLAN_OPT && - vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) - return -EMSGSIZE; - else if (output->tun_flags & TUNNEL_ERSPAN_OPT && - nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, - swkey_tun_opts_len, tun_opts)) - return -EMSGSIZE; - } - - return 0; -} - -static int ip_tun_to_nlattr(struct sk_buff *skb, - const struct ip_tunnel_key *output, - const void *tun_opts, int swkey_tun_opts_len, - unsigned short tun_proto) -{ - struct nlattr *nla; - int err; - - nla = nla_nest_start_noflag(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - - err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, - tun_proto); - if (err) - return err; - - nla_nest_end(skb, nla); - return 0; -} - -int ovs_nla_put_tunnel_info(struct sk_buff *skb, - struct ip_tunnel_info *tun_info) -{ - return __ip_tun_to_nlattr(skb, &tun_info->key, - ip_tunnel_info_opts(tun_info), - tun_info->options_len, - ip_tunnel_info_af(tun_info)); -} - -static int encode_vlan_from_nlattrs(struct sw_flow_match *match, - const struct nlattr *a[], - bool is_mask, bool inner) -{ - __be16 tci = 0; - __be16 tpid = 0; - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - - if (a[OVS_KEY_ATTR_ETHERTYPE]) - tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (likely(!inner)) { - SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask); - SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask); - } else { - SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask); - SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask); - } - return 0; -} - -static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, - u64 key_attrs, bool inner, - const struct nlattr **a, bool log) -{ - __be16 tci = 0; - - if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && - (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && - eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) { - /* Not a VLAN. */ - return 0; - } - - if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && - (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { - OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN"); - return -EINVAL; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - - if (!(tci & htons(VLAN_CFI_MASK))) { - if (tci) { - OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.", - (inner) ? "C-VLAN" : "VLAN"); - return -EINVAL; - } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { - /* Corner case for truncated VLAN header. */ - OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.", - (inner) ? "C-VLAN" : "VLAN"); - return -EINVAL; - } - } - - return 1; -} - -static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, - u64 key_attrs, bool inner, - const struct nlattr **a, bool log) -{ - __be16 tci = 0; - __be16 tpid = 0; - bool encap_valid = !!(match->key->eth.vlan.tci & - htons(VLAN_CFI_MASK)); - bool i_encap_valid = !!(match->key->eth.cvlan.tci & - htons(VLAN_CFI_MASK)); - - if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { - /* Not a VLAN. */ - return 0; - } - - if ((!inner && !encap_valid) || (inner && !i_encap_valid)) { - OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.", - (inner) ? "C-VLAN" : "VLAN"); - return -EINVAL; - } - - if (a[OVS_KEY_ATTR_VLAN]) - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - - if (a[OVS_KEY_ATTR_ETHERTYPE]) - tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - - if (tpid != htons(0xffff)) { - OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).", - (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); - return -EINVAL; - } - if (!(tci & htons(VLAN_CFI_MASK))) { - OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.", - (inner) ? "C-VLAN" : "VLAN"); - return -EINVAL; - } - - return 1; -} - -static int __parse_vlan_from_nlattrs(struct sw_flow_match *match, - u64 *key_attrs, bool inner, - const struct nlattr **a, bool is_mask, - bool log) -{ - int err; - const struct nlattr *encap; - - if (!is_mask) - err = validate_vlan_from_nlattrs(match, *key_attrs, inner, - a, log); - else - err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner, - a, log); - if (err <= 0) - return err; - - err = encode_vlan_from_nlattrs(match, a, is_mask, inner); - if (err) - return err; - - *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); - *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN); - *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - - encap = a[OVS_KEY_ATTR_ENCAP]; - - if (!is_mask) - err = parse_flow_nlattrs(encap, a, key_attrs, log); - else - err = parse_flow_mask_nlattrs(encap, a, key_attrs, log); - - return err; -} - -static int parse_vlan_from_nlattrs(struct sw_flow_match *match, - u64 *key_attrs, const struct nlattr **a, - bool is_mask, bool log) -{ - int err; - bool encap_valid = false; - - err = __parse_vlan_from_nlattrs(match, key_attrs, false, a, - is_mask, log); - if (err) - return err; - - encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK)); - if (encap_valid) { - err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, - is_mask, log); - if (err) - return err; - } - - return 0; -} - -static int parse_eth_type_from_nlattrs(struct sw_flow_match *match, - u64 *attrs, const struct nlattr **a, - bool is_mask, bool log) -{ - __be16 eth_type; - - eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (is_mask) { - /* Always exact match EtherType. */ - eth_type = htons(0xffff); - } else if (!eth_proto_is_802_3(eth_type)) { - OVS_NLERR(log, "EtherType %x is less than min %x", - ntohs(eth_type), ETH_P_802_3_MIN); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); - *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); - return 0; -} - -static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, - u64 *attrs, const struct nlattr **a, - bool is_mask, bool log) -{ - u8 mac_proto = MAC_PROTO_ETHERNET; - - if (*attrs & (1ULL << OVS_KEY_ATTR_DP_HASH)) { - u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); - - SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_DP_HASH); - } - - if (*attrs & (1ULL << OVS_KEY_ATTR_RECIRC_ID)) { - u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); - - SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_RECIRC_ID); - } - - if (*attrs & (1ULL << OVS_KEY_ATTR_PRIORITY)) { - SW_FLOW_KEY_PUT(match, phy.priority, - nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_PRIORITY); - } - - if (*attrs & (1ULL << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - - if (is_mask) { - in_port = 0xffffffff; /* Always exact match in_port. */ - } else if (in_port >= DP_MAX_PORTS) { - OVS_NLERR(log, "Port %d exceeds max allowable %d", - in_port, DP_MAX_PORTS); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_IN_PORT); - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); - } - - if (*attrs & (1ULL << OVS_KEY_ATTR_SKB_MARK)) { - uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - - SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_SKB_MARK); - } - if (*attrs & (1ULL << OVS_KEY_ATTR_TUNNEL)) { - if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, - is_mask, log) < 0) - return -EINVAL; - *attrs &= ~(1ULL << OVS_KEY_ATTR_TUNNEL); - } - - if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && - ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { - u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); - - if (ct_state & ~CT_SUPPORTED_MASK) { - OVS_NLERR(log, "ct_state flags %08x unsupported", - ct_state); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); - } - if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && - ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { - u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); - - SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); - } - if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && - ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { - u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); - - SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); - } - if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && - ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { - const struct ovs_key_ct_labels *cl; - - cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); - SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels, - sizeof(*cl), is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); - } - if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) { - const struct ovs_key_ct_tuple_ipv4 *ct; - - ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]); - - SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask); - SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); - SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); - SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4); - } - if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) { - const struct ovs_key_ct_tuple_ipv6 *ct; - - ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]); - - SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src, - sizeof(match->key->ipv6.ct_orig.src), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst, - sizeof(match->key->ipv6.ct_orig.dst), - is_mask); - SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); - SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); - SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask); - *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); - } - - /* For layer 3 packets the Ethernet type is provided - * and treated as metadata but no MAC addresses are provided. - */ - if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) && - (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE))) - mac_proto = MAC_PROTO_NONE; - - /* Always exact match mac_proto */ - SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask); - - if (mac_proto == MAC_PROTO_NONE) - return parse_eth_type_from_nlattrs(match, attrs, a, is_mask, - log); - - return 0; -} - -int nsh_hdr_from_nlattr(const struct nlattr *attr, - struct nshhdr *nh, size_t size) -{ - struct nlattr *a; - int rem; - u8 flags = 0; - u8 ttl = 0; - int mdlen = 0; - - /* validate_nsh has check this, so we needn't do duplicate check here - */ - if (size < NSH_BASE_HDR_LEN) - return -ENOBUFS; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_NSH_KEY_ATTR_BASE: { - const struct ovs_nsh_key_base *base = nla_data(a); - - flags = base->flags; - ttl = base->ttl; - nh->np = base->np; - nh->mdtype = base->mdtype; - nh->path_hdr = base->path_hdr; - break; - } - case OVS_NSH_KEY_ATTR_MD1: - mdlen = nla_len(a); - if (mdlen > size - NSH_BASE_HDR_LEN) - return -ENOBUFS; - memcpy(&nh->md1, nla_data(a), mdlen); - break; - - case OVS_NSH_KEY_ATTR_MD2: - mdlen = nla_len(a); - if (mdlen > size - NSH_BASE_HDR_LEN) - return -ENOBUFS; - memcpy(&nh->md2, nla_data(a), mdlen); - break; - - default: - return -EINVAL; - } - } - - /* nsh header length = NSH_BASE_HDR_LEN + mdlen */ - nh->ver_flags_ttl_len = 0; - nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen); - - return 0; -} - -int nsh_key_from_nlattr(const struct nlattr *attr, - struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) -{ - struct nlattr *a; - int rem; - - /* validate_nsh has check this, so we needn't do duplicate check here - */ - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_NSH_KEY_ATTR_BASE: { - const struct ovs_nsh_key_base *base = nla_data(a); - const struct ovs_nsh_key_base *base_mask = base + 1; - - nsh->base = *base; - nsh_mask->base = *base_mask; - break; - } - case OVS_NSH_KEY_ATTR_MD1: { - const struct ovs_nsh_key_md1 *md1 = nla_data(a); - const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; - - memcpy(nsh->context, md1->context, sizeof(*md1)); - memcpy(nsh_mask->context, md1_mask->context, - sizeof(*md1_mask)); - break; - } - case OVS_NSH_KEY_ATTR_MD2: - /* Not supported yet */ - return -ENOTSUPP; - default: - return -EINVAL; - } - } - - return 0; -} - -static int nsh_key_put_from_nlattr(const struct nlattr *attr, - struct sw_flow_match *match, bool is_mask, - bool is_push_nsh, bool log) -{ - struct nlattr *a; - int rem; - bool has_base = false; - bool has_md1 = false; - bool has_md2 = false; - u8 mdtype = 0; - int mdlen = 0; - - if (WARN_ON(is_push_nsh && is_mask)) - return -EINVAL; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - int i; - - if (type > OVS_NSH_KEY_ATTR_MAX) { - OVS_NLERR(log, "nsh attr %d is out of range max %d", - type, OVS_NSH_KEY_ATTR_MAX); - return -EINVAL; - } - - if (!check_attr_len(nla_len(a), - ovs_nsh_key_attr_lens[type].len)) { - OVS_NLERR( - log, - "nsh attr %d has unexpected len %d expected %d", - type, - nla_len(a), - ovs_nsh_key_attr_lens[type].len - ); - return -EINVAL; - } - - switch (type) { - case OVS_NSH_KEY_ATTR_BASE: { - const struct ovs_nsh_key_base *base = nla_data(a); - - has_base = true; - mdtype = base->mdtype; - SW_FLOW_KEY_PUT(match, nsh.base.flags, - base->flags, is_mask); - SW_FLOW_KEY_PUT(match, nsh.base.ttl, - base->ttl, is_mask); - SW_FLOW_KEY_PUT(match, nsh.base.mdtype, - base->mdtype, is_mask); - SW_FLOW_KEY_PUT(match, nsh.base.np, - base->np, is_mask); - SW_FLOW_KEY_PUT(match, nsh.base.path_hdr, - base->path_hdr, is_mask); - break; - } - case OVS_NSH_KEY_ATTR_MD1: { - const struct ovs_nsh_key_md1 *md1 = nla_data(a); - - has_md1 = true; - for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) - SW_FLOW_KEY_PUT(match, nsh.context[i], - md1->context[i], is_mask); - break; - } - case OVS_NSH_KEY_ATTR_MD2: - if (!is_push_nsh) /* Not supported MD type 2 yet */ - return -ENOTSUPP; - - has_md2 = true; - mdlen = nla_len(a); - if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) { - OVS_NLERR( - log, - "Invalid MD length %d for MD type %d", - mdlen, - mdtype - ); - return -EINVAL; - } - break; - default: - OVS_NLERR(log, "Unknown nsh attribute %d", - type); - return -EINVAL; - } - } - - if (rem > 0) { - OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem); - return -EINVAL; - } - - if (has_md1 && has_md2) { - OVS_NLERR( - 1, - "invalid nsh attribute: md1 and md2 are exclusive." - ); - return -EINVAL; - } - - if (!is_mask) { - if ((has_md1 && mdtype != NSH_M_TYPE1) || - (has_md2 && mdtype != NSH_M_TYPE2)) { - OVS_NLERR(1, "nsh attribute has unmatched MD type %d.", - mdtype); - return -EINVAL; - } - - if (is_push_nsh && - (!has_base || (!has_md1 && !has_md2))) { - OVS_NLERR( - 1, - "push_nsh: missing base or metadata attributes" - ); - return -EINVAL; - } - } - - return 0; -} - -static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, - u64 attrs, const struct nlattr **a, - bool is_mask, bool log) -{ - int err; - - err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); - if (err) - return err; - - if (attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) { - const struct ovs_key_ethernet *eth_key; - - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - SW_FLOW_KEY_MEMCPY(match, eth.src, - eth_key->eth_src, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, eth.dst, - eth_key->eth_dst, ETH_ALEN, is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_ETHERNET); - - if (attrs & (1ULL << OVS_KEY_ATTR_VLAN)) { - /* VLAN attribute is always parsed before getting here since it - * may occur multiple times. - */ - OVS_NLERR(log, "VLAN attribute unexpected."); - return -EINVAL; - } - - if (attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE)) { - err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask, - log); - if (err) - return err; - } else if (!is_mask) { - SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); - } - } else if (!match->key->eth.type) { - OVS_NLERR(log, "Either Ethernet header or EtherType is required."); - return -EINVAL; - } - - if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { - const struct ovs_key_ipv4 *ipv4_key; - - ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR(log, "IPv4 frag type %d is out of range max %d", - ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); - return -EINVAL; - } - SW_FLOW_KEY_PUT(match, ip.proto, - ipv4_key->ipv4_proto, is_mask); - SW_FLOW_KEY_PUT(match, ip.tos, - ipv4_key->ipv4_tos, is_mask); - SW_FLOW_KEY_PUT(match, ip.ttl, - ipv4_key->ipv4_ttl, is_mask); - SW_FLOW_KEY_PUT(match, ip.frag, - ipv4_key->ipv4_frag, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.src, - ipv4_key->ipv4_src, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.dst, - ipv4_key->ipv4_dst, is_mask); - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; - - ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { - OVS_NLERR(log, "IPv6 frag type %d is out of range max %d", - ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); - return -EINVAL; - } - - if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) { - OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x)", - ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, ipv6.label, - ipv6_key->ipv6_label, is_mask); - SW_FLOW_KEY_PUT(match, ip.proto, - ipv6_key->ipv6_proto, is_mask); - SW_FLOW_KEY_PUT(match, ip.tos, - ipv6_key->ipv6_tclass, is_mask); - SW_FLOW_KEY_PUT(match, ip.ttl, - ipv6_key->ipv6_hlimit, is_mask); - SW_FLOW_KEY_PUT(match, ip.frag, - ipv6_key->ipv6_frag, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, - ipv6_key->ipv6_src, - sizeof(match->key->ipv6.addr.src), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, - ipv6_key->ipv6_dst, - sizeof(match->key->ipv6.addr.dst), - is_mask); - - attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_ARP)) { - const struct ovs_key_arp *arp_key; - - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - if (!is_mask && (arp_key->arp_op & htons(0xff00))) { - OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).", - arp_key->arp_op); - return -EINVAL; - } - - SW_FLOW_KEY_PUT(match, ipv4.addr.src, - arp_key->arp_sip, is_mask); - SW_FLOW_KEY_PUT(match, ipv4.addr.dst, - arp_key->arp_tip, is_mask); - SW_FLOW_KEY_PUT(match, ip.proto, - ntohs(arp_key->arp_op), is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, - arp_key->arp_sha, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, - arp_key->arp_tha, ETH_ALEN, is_mask); - - attrs &= ~(1ULL << OVS_KEY_ATTR_ARP); - } - - if (attrs & (1 << OVS_KEY_ATTR_NSH)) { - if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match, - is_mask, false, log) < 0) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_NSH); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_MPLS)) { - const struct ovs_key_mpls *mpls_key; - u32 hdr_len; - u32 label_count, label_count_mask, i; - - - mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); - hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]); - label_count = hdr_len / sizeof(struct ovs_key_mpls); - - if (label_count == 0 || label_count > MPLS_LABEL_DEPTH || - hdr_len % sizeof(struct ovs_key_mpls)) - return -EINVAL; - - label_count_mask = GENMASK(label_count - 1, 0); - - for (i = 0 ; i < label_count; i++) - SW_FLOW_KEY_PUT(match, mpls.lse[i], - mpls_key[i].mpls_lse, is_mask); - - SW_FLOW_KEY_PUT(match, mpls.num_labels_mask, - label_count_mask, is_mask); - - - attrs &= ~(1ULL << OVS_KEY_ATTR_MPLS); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_TCP)) { - const struct ovs_key_tcp *tcp_key; - - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); - SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_TCP); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_TCP_FLAGS)) { - SW_FLOW_KEY_PUT(match, tp.flags, - nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), - is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_TCP_FLAGS); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_UDP)) { - const struct ovs_key_udp *udp_key; - - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); - SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_UDP); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_SCTP)) { - const struct ovs_key_sctp *sctp_key; - - sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); - SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); - SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_SCTP); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_ICMP)) { - const struct ovs_key_icmp *icmp_key; - - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - SW_FLOW_KEY_PUT(match, tp.src, - htons(icmp_key->icmp_type), is_mask); - SW_FLOW_KEY_PUT(match, tp.dst, - htons(icmp_key->icmp_code), is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_ICMP); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_ICMPV6)) { - const struct ovs_key_icmpv6 *icmpv6_key; - - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - SW_FLOW_KEY_PUT(match, tp.src, - htons(icmpv6_key->icmpv6_type), is_mask); - SW_FLOW_KEY_PUT(match, tp.dst, - htons(icmpv6_key->icmpv6_code), is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_ICMPV6); - } - - if (attrs & (1ULL << OVS_KEY_ATTR_ND)) { - const struct ovs_key_nd *nd_key; - - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, - nd_key->nd_target, - sizeof(match->key->ipv6.nd.target), - is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, - nd_key->nd_sll, ETH_ALEN, is_mask); - SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, - nd_key->nd_tll, ETH_ALEN, is_mask); - attrs &= ~(1ULL << OVS_KEY_ATTR_ND); - } - - if (attrs != 0) { - OVS_NLERR(log, "Unknown key attributes %llx", - (unsigned long long)attrs); - return -EINVAL; - } - - return 0; -} - -static void nlattr_set(struct nlattr *attr, u8 val, - const struct ovs_len_tbl *tbl) -{ - struct nlattr *nla; - int rem; - - /* The nlattr stream should already have been validated */ - nla_for_each_nested(nla, attr, rem) { - if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) - nlattr_set(nla, val, tbl[nla_type(nla)].next ? : tbl); - else - memset(nla_data(nla), val, nla_len(nla)); - - if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) - *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK; - } -} - -static void mask_set_nlattr(struct nlattr *attr, u8 val) -{ - nlattr_set(attr, val, ovs_key_lens); -} - -/** - * ovs_nla_get_match - parses Netlink attributes into a flow key and - * mask. In case the 'mask' is NULL, the flow is treated as exact match - * flow. Otherwise, it is treated as a wildcarded flow, except the mask - * does not include any don't care bit. - * @net: Used to determine per-namespace field support. - * @match: receives the extracted flow match information. - * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. The fields should of the packet that triggered the creation - * of this flow. - * @mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink - * attribute specifies the mask field of the wildcarded flow. - * @log: Boolean to allow kernel error logging. Normally true, but when - * probing for feature compatibility this should be passed in as false to - * suppress unnecessary error logging. - */ -int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, - const struct nlattr *nla_key, - const struct nlattr *nla_mask, - bool log) -{ - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - struct nlattr *newmask = NULL; - u64 key_attrs = 0; - u64 mask_attrs = 0; - int err; - - err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); - if (err) - return err; - - err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log); - if (err) - return err; - - err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); - if (err) - return err; - - if (match->mask) { - if (!nla_mask) { - /* Create an exact match mask. We need to set to 0xff - * all the 'match->mask' fields that have been touched - * in 'match->key'. We cannot simply memset - * 'match->mask', because padding bytes and fields not - * specified in 'match->key' should be left to 0. - * Instead, we use a stream of netlink attributes, - * copied from 'key' and set to 0xff. - * ovs_key_from_nlattrs() will take care of filling - * 'match->mask' appropriately. - */ - newmask = kmemdup(nla_key, - nla_total_size(nla_len(nla_key)), - GFP_KERNEL); - if (!newmask) - return -ENOMEM; - - mask_set_nlattr(newmask, 0xff); - - /* The userspace does not send tunnel attributes that - * are 0, but we should not wildcard them nonetheless. - */ - if (match->key->tun_proto) - SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, - 0xff, true); - - nla_mask = newmask; - } - - err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log); - if (err) - goto free_newmask; - - SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true); - SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true); - - err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log); - if (err) - goto free_newmask; - - err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, - log); - if (err) - goto free_newmask; - } - - if (!match_validate(match, key_attrs, mask_attrs, log)) - err = -EINVAL; - -free_newmask: - kfree(newmask); - return err; -} - -static size_t get_ufid_len(const struct nlattr *attr, bool log) -{ - size_t len; - - if (!attr) - return 0; - - len = nla_len(attr); - if (len < 1 || len > MAX_UFID_LENGTH) { - OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)", - nla_len(attr), MAX_UFID_LENGTH); - return 0; - } - - return len; -} - -/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID, - * or false otherwise. - */ -bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr, - bool log) -{ - sfid->ufid_len = get_ufid_len(attr, log); - if (sfid->ufid_len) - memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len); - - return sfid->ufid_len; -} - -int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, - const struct sw_flow_key *key, bool log) -{ - struct sw_flow_key *new_key; - - if (ovs_nla_get_ufid(sfid, ufid, log)) - return 0; - - /* If UFID was not provided, use unmasked key. */ - new_key = kmalloc(sizeof(*new_key), GFP_KERNEL); - if (!new_key) - return -ENOMEM; - memcpy(new_key, key, sizeof(*key)); - sfid->unmasked_key = new_key; - - return 0; -} - -u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) -{ - return attr ? nla_get_u32(attr) : 0; -} - -/** - * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. - * @net: Network namespace. - * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack - * metadata. - * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink - * attributes. - * @attrs: Bit mask for the netlink attributes included in @a. - * @log: Boolean to allow kernel error logging. Normally true, but when - * probing for feature compatibility this should be passed in as false to - * suppress unnecessary error logging. - * - * This parses a series of Netlink attributes that form a flow key, which must - * take the same form accepted by flow_from_nlattrs(), but only enough of it to - * get the metadata, that is, the parts of the flow key that cannot be - * extracted from the packet itself. - * - * This must be called before the packet key fields are filled in 'key'. - */ - -int ovs_nla_get_flow_metadata(struct net *net, - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], - u64 attrs, struct sw_flow_key *key, bool log) -{ - struct sw_flow_match match; - - memset(&match, 0, sizeof(match)); - match.key = key; - - key->ct_state = 0; - key->ct_zone = 0; - key->ct_orig_proto = 0; - memset(&key->ct, 0, sizeof(key->ct)); - memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig)); - memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig)); - - key->phy.in_port = DP_MAX_PORTS; - - return metadata_from_nlattrs(net, &match, &attrs, a, false, log); -} - -static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, - bool is_mask) -{ - __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff); - - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci)) - return -EMSGSIZE; - return 0; -} - -static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, - struct sk_buff *skb) -{ - struct nlattr *start; - - start = nla_nest_start_noflag(skb, OVS_KEY_ATTR_NSH); - if (!start) - return -EMSGSIZE; - - if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base)) - goto nla_put_failure; - - if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) { - if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, - sizeof(nsh->context), nsh->context)) - goto nla_put_failure; - } - - /* Don't support MD type 2 yet */ - - nla_nest_end(skb, start); - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static int __ovs_nla_put_key(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, bool is_mask, - struct sk_buff *skb) -{ - struct ovs_key_ethernet *eth_key; - struct nlattr *nla; - struct nlattr *encap = NULL; - struct nlattr *in_encap = NULL; - - if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) - goto nla_put_failure; - - if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) - goto nla_put_failure; - - if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) - goto nla_put_failure; - - if ((swkey->tun_proto || is_mask)) { - const void *opts = NULL; - - if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) - opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); - - if (ip_tun_to_nlattr(skb, &output->tun_key, opts, - swkey->tun_opts_len, swkey->tun_proto)) - goto nla_put_failure; - } - - if (swkey->phy.in_port == DP_MAX_PORTS) { - if (is_mask && (output->phy.in_port == 0xffff)) - if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) - goto nla_put_failure; - } else { - u16 upper_u16; - upper_u16 = !is_mask ? 0 : 0xffff; - - if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, - (upper_u16 << 16) | output->phy.in_port)) - goto nla_put_failure; - } - - if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) - goto nla_put_failure; - - if (ovs_ct_put_key(swkey, output, skb)) - goto nla_put_failure; - - if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) { - nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); - if (!nla) - goto nla_put_failure; - - eth_key = nla_data(nla); - ether_addr_copy(eth_key->eth_src, output->eth.src); - ether_addr_copy(eth_key->eth_dst, output->eth.dst); - - if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { - if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) - goto nla_put_failure; - encap = nla_nest_start_noflag(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.vlan.tci) - goto unencap; - - if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { - if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) - goto nla_put_failure; - in_encap = nla_nest_start_noflag(skb, - OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.cvlan.tci) - goto unencap; - } - } - - if (swkey->eth.type == htons(ETH_P_802_2)) { - /* - * Ethertype 802.2 is represented in the netlink with omitted - * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and - * 0xffff in the mask attribute. Ethertype can also - * be wildcarded. - */ - if (is_mask && output->eth.type) - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, - output->eth.type)) - goto nla_put_failure; - goto unencap; - } - } - - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) - goto nla_put_failure; - - if (eth_type_vlan(swkey->eth.type)) { - /* There are 3 VLAN tags, we don't know anything about the rest - * of the packet, so truncate here. - */ - WARN_ON_ONCE(!(encap && in_encap)); - goto unencap; - } - - if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ipv4 *ipv4_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); - if (!nla) - goto nla_put_failure; - ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = output->ipv4.addr.src; - ipv4_key->ipv4_dst = output->ipv4.addr.dst; - ipv4_key->ipv4_proto = output->ip.proto; - ipv4_key->ipv4_tos = output->ip.tos; - ipv4_key->ipv4_ttl = output->ip.ttl; - ipv4_key->ipv4_frag = output->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ipv6 *ipv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); - if (!nla) - goto nla_put_failure; - ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, - sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, - sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = output->ipv6.label; - ipv6_key->ipv6_proto = output->ip.proto; - ipv6_key->ipv6_tclass = output->ip.tos; - ipv6_key->ipv6_hlimit = output->ip.ttl; - ipv6_key->ipv6_frag = output->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_NSH)) { - if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) - goto nla_put_failure; - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { - struct ovs_key_arp *arp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); - if (!nla) - goto nla_put_failure; - arp_key = nla_data(nla); - memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = output->ipv4.addr.src; - arp_key->arp_tip = output->ipv4.addr.dst; - arp_key->arp_op = htons(output->ip.proto); - ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); - ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); - } else if (eth_p_mpls(swkey->eth.type)) { - u8 num_labels, i; - struct ovs_key_mpls *mpls_key; - - num_labels = hweight_long(output->mpls.num_labels_mask); - nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, - num_labels * sizeof(*mpls_key)); - if (!nla) - goto nla_put_failure; - - mpls_key = nla_data(nla); - for (i = 0; i < num_labels; i++) - mpls_key[i].mpls_lse = output->mpls.lse[i]; - } - - if ((swkey->eth.type == htons(ETH_P_IP) || - swkey->eth.type == htons(ETH_P_IPV6)) && - swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - - if (swkey->ip.proto == IPPROTO_TCP) { - struct ovs_key_tcp *tcp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); - if (!nla) - goto nla_put_failure; - tcp_key = nla_data(nla); - tcp_key->tcp_src = output->tp.src; - tcp_key->tcp_dst = output->tp.dst; - if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, - output->tp.flags)) - goto nla_put_failure; - } else if (swkey->ip.proto == IPPROTO_UDP) { - struct ovs_key_udp *udp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); - if (!nla) - goto nla_put_failure; - udp_key = nla_data(nla); - udp_key->udp_src = output->tp.src; - udp_key->udp_dst = output->tp.dst; - } else if (swkey->ip.proto == IPPROTO_SCTP) { - struct ovs_key_sctp *sctp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); - if (!nla) - goto nla_put_failure; - sctp_key = nla_data(nla); - sctp_key->sctp_src = output->tp.src; - sctp_key->sctp_dst = output->tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IP) && - swkey->ip.proto == IPPROTO_ICMP) { - struct ovs_key_icmp *icmp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); - if (!nla) - goto nla_put_failure; - icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(output->tp.src); - icmp_key->icmp_code = ntohs(output->tp.dst); - } else if (swkey->eth.type == htons(ETH_P_IPV6) && - swkey->ip.proto == IPPROTO_ICMPV6) { - struct ovs_key_icmpv6 *icmpv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, - sizeof(*icmpv6_key)); - if (!nla) - goto nla_put_failure; - icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(output->tp.src); - icmpv6_key->icmpv6_code = ntohs(output->tp.dst); - - if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { - struct ovs_key_nd *nd_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); - if (!nla) - goto nla_put_failure; - nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &output->ipv6.nd.target, - sizeof(nd_key->nd_target)); - ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); - ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); - } - } - } - -unencap: - if (in_encap) - nla_nest_end(skb, in_encap); - if (encap) - nla_nest_end(skb, encap); - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -int ovs_nla_put_key(const struct sw_flow_key *swkey, - const struct sw_flow_key *output, int attr, bool is_mask, - struct sk_buff *skb) -{ - int err; - struct nlattr *nla; - - nla = nla_nest_start_noflag(skb, attr); - if (!nla) - return -EMSGSIZE; - err = __ovs_nla_put_key(swkey, output, is_mask, skb); - if (err) - return err; - nla_nest_end(skb, nla); - - return 0; -} - -/* Called with ovs_mutex or RCU read lock. */ -int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb) -{ - if (ovs_identifier_is_ufid(&flow->id)) - return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len, - flow->id.ufid); - - return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key, - OVS_FLOW_ATTR_KEY, false, skb); -} - -/* Called with ovs_mutex or RCU read lock. */ -int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb) -{ - return ovs_nla_put_key(&flow->key, &flow->key, - OVS_FLOW_ATTR_KEY, false, skb); -} - -/* Called with ovs_mutex or RCU read lock. */ -int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) -{ - return ovs_nla_put_key(&flow->key, &flow->mask->key, - OVS_FLOW_ATTR_MASK, true, skb); -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,9,0) -#define MAX_ACTIONS_BUFSIZE (16 * 1024) -#else -#define MAX_ACTIONS_BUFSIZE (32 * 1024) -#endif - -static struct sw_flow_actions *nla_alloc_flow_actions(int size) -{ - struct sw_flow_actions *sfa; - - WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); - - sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); - if (!sfa) - return ERR_PTR(-ENOMEM); - - sfa->actions_len = 0; - return sfa; -} - -static void ovs_nla_free_set_action(const struct nlattr *a) -{ - const struct nlattr *ovs_key = nla_data(a); - struct ovs_tunnel_info *ovs_tun; - - switch (nla_type(ovs_key)) { - case OVS_KEY_ATTR_TUNNEL_INFO: - ovs_tun = nla_data(ovs_key); - ovs_dst_release((struct dst_entry *)ovs_tun->tun_dst); - break; - } -} - -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) -{ - const struct nlattr *a; - int rem; - - if (!sf_acts) - return; - - nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { - switch (nla_type(a)) { - case OVS_ACTION_ATTR_SET: - ovs_nla_free_set_action(a); - break; - case OVS_ACTION_ATTR_CT: - ovs_ct_free_action(a); - break; - } - } - - kfree(sf_acts); -} - -static void __ovs_nla_free_flow_actions(struct rcu_head *head) -{ - ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); -} - -/* Schedules 'sf_acts' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) -{ - call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); -} - -static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, - int attr_len, bool log) -{ - - struct sw_flow_actions *acts; - int new_acts_size; - size_t req_size = NLA_ALIGN(attr_len); - int next_offset = offsetof(struct sw_flow_actions, actions) + - (*sfa)->actions_len; - - if (req_size <= (ksize(*sfa) - next_offset)) - goto out; - - new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); - - if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) { - OVS_NLERR(log, "Flow action size exceeds max %u", - MAX_ACTIONS_BUFSIZE); - return ERR_PTR(-EMSGSIZE); - } - new_acts_size = MAX_ACTIONS_BUFSIZE; - } - - acts = nla_alloc_flow_actions(new_acts_size); - if (IS_ERR(acts)) - return (void *)acts; - - memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); - acts->actions_len = (*sfa)->actions_len; - acts->orig_len = (*sfa)->orig_len; - kfree(*sfa); - *sfa = acts; - -out: - (*sfa)->actions_len += req_size; - return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); -} - -static struct nlattr *__add_action(struct sw_flow_actions **sfa, - int attrtype, void *data, int len, bool log) -{ - struct nlattr *a; - - a = reserve_sfa_size(sfa, nla_attr_size(len), log); - if (IS_ERR(a)) - return a; - - a->nla_type = attrtype; - a->nla_len = nla_attr_size(len); - - if (data) - memcpy(nla_data(a), data, len); - memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); - - return a; -} - -int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, - int len, bool log) -{ - struct nlattr *a; - - a = __add_action(sfa, attrtype, data, len, log); - - return PTR_ERR_OR_ZERO(a); -} - -static inline int add_nested_action_start(struct sw_flow_actions **sfa, - int attrtype, bool log) -{ - int used = (*sfa)->actions_len; - int err; - - err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); - if (err) - return err; - - return used; -} - -static inline void add_nested_action_end(struct sw_flow_actions *sfa, - int st_offset) -{ - struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + - st_offset); - - a->nla_len = sfa->actions_len - st_offset; -} - -static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log); - -static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log, bool last) -{ - const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; - const struct nlattr *probability, *actions; - const struct nlattr *a; - int rem, start, err; - struct sample_arg arg; - - memset(attrs, 0, sizeof(attrs)); - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) - return -EINVAL; - attrs[type] = a; - } - if (rem) - return -EINVAL; - - probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; - if (!probability || nla_len(probability) != sizeof(u32)) - return -EINVAL; - - actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; - if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) - return -EINVAL; - - /* validation done, copy sample action. */ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); - if (start < 0) - return start; - - /* When both skb and flow may be changed, put the sample - * into a deferred fifo. On the other hand, if only skb - * may be modified, the actions can be executed in place. - * - * Do this analysis at the flow installation time. - * Set 'clone_action->exec' to true if the actions can be - * executed without being deferred. - * - * If the sample is the last action, it can always be excuted - * rather than deferred. - */ - arg.exec = last || !actions_may_change_flow(actions); - arg.probability = nla_get_u32(probability); - - err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg), - log); - if (err) - return err; - - err = __ovs_nla_copy_actions(net, actions, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); - - if (err) - return err; - - add_nested_action_end(*sfa, start); - - return 0; -} - -static int validate_and_copy_clone(struct net *net, - const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log, bool last) -{ - int start, err; - u32 exec; - - if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) - return -EINVAL; - - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log); - if (start < 0) - return start; - - exec = last || !actions_may_change_flow(attr); - - err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec, - sizeof(exec), log); - if (err) - return err; - - err = __ovs_nla_copy_actions(net, attr, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); - if (err) - return err; - - add_nested_action_end(*sfa, start); - - return 0; -} - -void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, - bool reset_key, - struct sw_flow_mask *mask) -{ - memset(match, 0, sizeof(*match)); - match->key = key; - match->mask = mask; - - if (reset_key) - memset(key, 0, sizeof(*key)); - - if (mask) { - memset(&mask->key, 0, sizeof(mask->key)); - mask->range.start = mask->range.end = 0; - } -} - -static int validate_geneve_opts(struct sw_flow_key *key) -{ - struct geneve_opt *option; - int opts_len = key->tun_opts_len; - bool crit_opt = false; - - option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len); - while (opts_len > 0) { - int len; - - if (opts_len < sizeof(*option)) - return -EINVAL; - - len = sizeof(*option) + option->length * 4; - if (len > opts_len) - return -EINVAL; - - crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); - - option = (struct geneve_opt *)((u8 *)option + len); - opts_len -= len; - } - - key->tun_key.tun_flags |= crit_opt ? TUNNEL_CRIT_OPT : 0; - - return 0; -} - -static int validate_and_copy_set_tun(const struct nlattr *attr, - struct sw_flow_actions **sfa, bool log) -{ - struct sw_flow_match match; - struct sw_flow_key key; - struct metadata_dst *tun_dst; - struct ip_tunnel_info *tun_info; - struct ovs_tunnel_info *ovs_tun; - struct nlattr *a; - int err = 0, start, opts_type; - __be16 dst_opt_type; - - dst_opt_type = 0; - ovs_match_init(&match, &key, true, NULL); - opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); - if (opts_type < 0) - return opts_type; - - if (key.tun_opts_len) { - switch (opts_type) { - case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: - err = validate_geneve_opts(&key); - if (err < 0) - return err; - dst_opt_type = TUNNEL_GENEVE_OPT; - break; - case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: - dst_opt_type = TUNNEL_VXLAN_OPT; - break; - case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: - dst_opt_type = TUNNEL_ERSPAN_OPT; - break; - } - } - - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); - if (start < 0) - return start; - - tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL, - GFP_KERNEL); - - if (!tun_dst) - return -ENOMEM; - - err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); - if (err) { - dst_release((struct dst_entry *)tun_dst); - return err; - } - a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*ovs_tun), log); - if (IS_ERR(a)) { - ovs_dst_release((struct dst_entry *)tun_dst); - return PTR_ERR(a); - } - - ovs_tun = nla_data(a); - ovs_tun->tun_dst = tun_dst; - - tun_info = &tun_dst->u.tun_info; - tun_info->mode = IP_TUNNEL_INFO_TX; - if (key.tun_proto == AF_INET6) - tun_info->mode |= IP_TUNNEL_INFO_IPV6; - tun_info->key = key.tun_key; - - /* We need to store the options in the action itself since - * everything else will go away after flow setup. We can append - * it to tun_info and then point there. - */ - ip_tunnel_info_opts_set(tun_info, - TUN_METADATA_OPTS(&key, key.tun_opts_len), - key.tun_opts_len, dst_opt_type); - add_nested_action_end(*sfa, start); - - return err; -} - -static bool validate_nsh(const struct nlattr *attr, bool is_mask, - bool is_push_nsh, bool log) -{ - struct sw_flow_match match; - struct sw_flow_key key; - int ret = 0; - - ovs_match_init(&match, &key, true, NULL); - ret = nsh_key_put_from_nlattr(attr, &match, is_mask, - is_push_nsh, log); - return !ret; -} - -/* Return false if there are any non-masked bits set. - * Mask follows data immediately, before any netlink padding. - */ -static bool validate_masked(u8 *data, int len) -{ - u8 *mask = data + len; - - while (len--) - if (*data++ & ~*mask++) - return false; - - return true; -} - -static int validate_set(const struct nlattr *a, - const struct sw_flow_key *flow_key, - struct sw_flow_actions **sfa, bool *skip_copy, - u8 mac_proto, __be16 eth_type, bool masked, bool log) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - size_t key_len; - - /* There can be only one key in a action */ - if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) - return -EINVAL; - - key_len = nla_len(ovs_key); - if (masked) - key_len /= 2; - - if (key_type > OVS_KEY_ATTR_MAX || - !check_attr_len(key_len, ovs_key_lens[key_type].len)) - return -EINVAL; - - if (masked && !validate_masked(nla_data(ovs_key), key_len)) - return -EINVAL; - - switch (key_type) { - case OVS_KEY_ATTR_PRIORITY: - case OVS_KEY_ATTR_SKB_MARK: - case OVS_KEY_ATTR_CT_MARK: - case OVS_KEY_ATTR_CT_LABELS: - break; - - case OVS_KEY_ATTR_ETHERNET: - if (mac_proto != MAC_PROTO_ETHERNET) - return -EINVAL; - break; - - case OVS_KEY_ATTR_TUNNEL: { - int err; - -#ifndef USE_UPSTREAM_TUNNEL - if (eth_p_mpls(eth_type)) - return -EINVAL; -#endif - if (masked) - return -EINVAL; /* Masked tunnel set not supported. */ - - *skip_copy = true; - err = validate_and_copy_set_tun(a, sfa, log); - if (err) - return err; - break; - } - case OVS_KEY_ATTR_IPV4: { - const struct ovs_key_ipv4 *ipv4_key; - - if (eth_type != htons(ETH_P_IP)) - return -EINVAL; - - ipv4_key = nla_data(ovs_key); - - if (masked) { - const struct ovs_key_ipv4 *mask = ipv4_key + 1; - - /* Non-writeable fields. */ - if (mask->ipv4_proto || mask->ipv4_frag) - return -EINVAL; - } else { - if (ipv4_key->ipv4_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv4_key->ipv4_frag != flow_key->ip.frag) - return -EINVAL; - } - break; - } - case OVS_KEY_ATTR_IPV6: { - const struct ovs_key_ipv6 *ipv6_key; - - if (eth_type != htons(ETH_P_IPV6)) - return -EINVAL; - - ipv6_key = nla_data(ovs_key); - - if (masked) { - const struct ovs_key_ipv6 *mask = ipv6_key + 1; - - /* Non-writeable fields. */ - if (mask->ipv6_proto || mask->ipv6_frag) - return -EINVAL; - - /* Invalid bits in the flow label mask? */ - if (ntohl(mask->ipv6_label) & 0xFFF00000) - return -EINVAL; - } else { - if (ipv6_key->ipv6_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv6_key->ipv6_frag != flow_key->ip.frag) - return -EINVAL; - } - if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) - return -EINVAL; - - break; - } - case OVS_KEY_ATTR_TCP: - if ((eth_type != htons(ETH_P_IP) && - eth_type != htons(ETH_P_IPV6)) || - flow_key->ip.proto != IPPROTO_TCP) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_UDP: - if ((eth_type != htons(ETH_P_IP) && - eth_type != htons(ETH_P_IPV6)) || - flow_key->ip.proto != IPPROTO_UDP) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_MPLS: - if (!eth_p_mpls(eth_type)) - return -EINVAL; - break; - - case OVS_KEY_ATTR_SCTP: - if ((eth_type != htons(ETH_P_IP) && - eth_type != htons(ETH_P_IPV6)) || - flow_key->ip.proto != IPPROTO_SCTP) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_NSH: - if (eth_type != htons(ETH_P_NSH)) - return -EINVAL; - if (!validate_nsh(nla_data(a), masked, false, log)) - return -EINVAL; - break; - - default: - return -EINVAL; - } - - /* Convert non-masked non-tunnel set actions to masked set actions. */ - if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) { - int start, len = key_len * 2; - struct nlattr *at; - - *skip_copy = true; - - start = add_nested_action_start(sfa, - OVS_ACTION_ATTR_SET_TO_MASKED, - log); - if (start < 0) - return start; - - at = __add_action(sfa, key_type, NULL, len, log); - if (IS_ERR(at)) - return PTR_ERR(at); - - memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */ - memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */ - /* Clear non-writeable bits from otherwise writeable fields. */ - if (key_type == OVS_KEY_ATTR_IPV6) { - struct ovs_key_ipv6 *mask = nla_data(at) + key_len; - - mask->ipv6_label &= htonl(0x000FFFFF); - } - add_nested_action_end(*sfa, start); - } - - return 0; -} - -static int validate_userspace(const struct nlattr *attr) -{ - static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { - [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, - [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, - [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 }, - }; - struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; - int error; - - error = nla_parse_nested_deprecated(a, OVS_USERSPACE_ATTR_MAX, attr, - userspace_policy, NULL); - if (error) - return error; - - if (!a[OVS_USERSPACE_ATTR_PID] || - !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) - return -EINVAL; - - return 0; -} - -static const struct nla_policy cpl_policy[OVS_CHECK_PKT_LEN_ATTR_MAX + 1] = { - [OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] = {.type = NLA_U16 }, - [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER] = {.type = NLA_NESTED }, - [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL] = {.type = NLA_NESTED }, -}; - -static int validate_and_copy_check_pkt_len(struct net *net, - const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, - bool log, bool last) -{ - const struct nlattr *acts_if_greater, *acts_if_lesser_eq; - struct nlattr *a[OVS_CHECK_PKT_LEN_ATTR_MAX + 1]; - struct check_pkt_len_arg arg; - int nested_acts_start; - int start, err; - - err = nla_parse_deprecated_strict(a, OVS_CHECK_PKT_LEN_ATTR_MAX, - nla_data(attr), nla_len(attr), - cpl_policy, NULL); - if (err) - return err; - - if (!a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] || - !nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN])) - return -EINVAL; - - acts_if_lesser_eq = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL]; - acts_if_greater = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER]; - - /* Both the nested action should be present. */ - if (!acts_if_greater || !acts_if_lesser_eq) - return -EINVAL; - - /* validation done, copy the nested actions. */ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CHECK_PKT_LEN, - log); - if (start < 0) - return start; - - arg.pkt_len = nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN]); - arg.exec_for_lesser_equal = - last || !actions_may_change_flow(acts_if_lesser_eq); - arg.exec_for_greater = - last || !actions_may_change_flow(acts_if_greater); - - err = ovs_nla_add_action(sfa, OVS_CHECK_PKT_LEN_ATTR_ARG, &arg, - sizeof(arg), log); - if (err) - return err; - - nested_acts_start = add_nested_action_start(sfa, - OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, log); - if (nested_acts_start < 0) - return nested_acts_start; - - err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); - - if (err) - return err; - - add_nested_action_end(*sfa, nested_acts_start); - - nested_acts_start = add_nested_action_start(sfa, - OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, log); - if (nested_acts_start < 0) - return nested_acts_start; - - err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); - - if (err) - return err; - - add_nested_action_end(*sfa, nested_acts_start); - add_nested_action_end(*sfa, start); - return 0; -} - -static int copy_action(const struct nlattr *from, - struct sw_flow_actions **sfa, bool log) -{ - int totlen = NLA_ALIGN(from->nla_len); - struct nlattr *to; - - to = reserve_sfa_size(sfa, from->nla_len, log); - if (IS_ERR(to)) - return PTR_ERR(to); - - memcpy(to, from, totlen); - return 0; -} - -static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log) -{ - u8 mac_proto = ovs_key_mac_proto(key); - const struct nlattr *a; - int rem, err; - - nla_for_each_nested(a, attr, rem) { - /* Expected argument lengths, (u32)-1 for variable length. */ - static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { - [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), - [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), - [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, - [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), - [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), - [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), - [OVS_ACTION_ATTR_POP_VLAN] = 0, - [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), - [OVS_ACTION_ATTR_CT] = (u32)-1, - [OVS_ACTION_ATTR_CT_CLEAR] = 0, - [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), - [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), - [OVS_ACTION_ATTR_POP_ETH] = 0, - [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, - [OVS_ACTION_ATTR_POP_NSH] = 0, - [OVS_ACTION_ATTR_METER] = sizeof(u32), - [OVS_ACTION_ATTR_CLONE] = (u32)-1, - [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, - }; - const struct ovs_action_push_vlan *vlan; - int type = nla_type(a); - bool skip_copy; - - if (type > OVS_ACTION_ATTR_MAX || - (action_lens[type] != nla_len(a) && - action_lens[type] != (u32)-1)) - return -EINVAL; - - skip_copy = false; - switch (type) { - case OVS_ACTION_ATTR_UNSPEC: - return -EINVAL; - - case OVS_ACTION_ATTR_USERSPACE: - err = validate_userspace(a); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_OUTPUT: - if (nla_get_u32(a) >= DP_MAX_PORTS) - return -EINVAL; - break; - - case OVS_ACTION_ATTR_TRUNC: { - const struct ovs_action_trunc *trunc = nla_data(a); - - if (trunc->max_len < ETH_HLEN) - return -EINVAL; - break; - } - - case OVS_ACTION_ATTR_HASH: { - const struct ovs_action_hash *act_hash = nla_data(a); - - switch (act_hash->hash_alg) { - case OVS_HASH_ALG_L4: - break; - default: - return -EINVAL; - } - - break; - } - - case OVS_ACTION_ATTR_POP_VLAN: - if (mac_proto != MAC_PROTO_ETHERNET) - return -EINVAL; - vlan_tci = htons(0); - break; - - case OVS_ACTION_ATTR_PUSH_VLAN: - if (mac_proto != MAC_PROTO_ETHERNET) - return -EINVAL; - vlan = nla_data(a); - if (!eth_type_vlan(vlan->vlan_tpid)) - return -EINVAL; - if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK))) - return -EINVAL; - vlan_tci = vlan->vlan_tci; - break; - - case OVS_ACTION_ATTR_RECIRC: - break; - - case OVS_ACTION_ATTR_PUSH_MPLS: { - const struct ovs_action_push_mpls *mpls = nla_data(a); - - if (!eth_p_mpls(mpls->mpls_ethertype)) - return -EINVAL; - /* Prohibit push MPLS other than to a white list - * for packets that have a known tag order. - */ - if (vlan_tci & htons(VLAN_CFI_MASK) || - (eth_type != htons(ETH_P_IP) && - eth_type != htons(ETH_P_IPV6) && - eth_type != htons(ETH_P_ARP) && - eth_type != htons(ETH_P_RARP) && - !eth_p_mpls(eth_type))) - return -EINVAL; - eth_type = mpls->mpls_ethertype; - mpls_label_count++; - break; - } - - case OVS_ACTION_ATTR_POP_MPLS: { - __be16 proto; - if (vlan_tci & htons(VLAN_CFI_MASK) || - !eth_p_mpls(eth_type)) - return -EINVAL; - - /* Disallow subsequent L2.5+ set actions and mpls_pop - * actions once the last MPLS label in the packet is - * popped as there is no check here to ensure that - * the new eth type is valid and thus set actions could - * write off the end of the packet or otherwise corrupt - * it. - * - * Support for these actions is planned using packet - * recirculation. - */ - proto = nla_get_be16(a); - mpls_label_count--; - - if (!eth_p_mpls(proto) || !mpls_label_count) - eth_type = htons(0); - else - eth_type = proto; - break; - } - case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, - &skip_copy, mac_proto, eth_type, - false, log); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SET_MASKED: - err = validate_set(a, key, sfa, - &skip_copy, mac_proto, eth_type, - true, log); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: { - bool last = nla_is_last(a, rem); - - err = validate_and_copy_sample(net, a, key, sfa, - eth_type, vlan_tci, - mpls_label_count, - log, last); - if (err) - return err; - skip_copy = true; - break; - } - - case OVS_ACTION_ATTR_CT: - err = ovs_ct_copy_action(net, a, key, sfa, log); - if (err) - return err; - skip_copy = true; - break; - - case OVS_ACTION_ATTR_CT_CLEAR: - break; - - case OVS_ACTION_ATTR_PUSH_ETH: - /* Disallow pushing an Ethernet header if one - * is already present */ - if (mac_proto != MAC_PROTO_NONE) - return -EINVAL; - mac_proto = MAC_PROTO_ETHERNET; - break; - - case OVS_ACTION_ATTR_POP_ETH: - if (mac_proto != MAC_PROTO_ETHERNET) - return -EINVAL; - if (vlan_tci & htons(VLAN_CFI_MASK)) - return -EINVAL; - mac_proto = MAC_PROTO_NONE; - break; - - case OVS_ACTION_ATTR_PUSH_NSH: - if (mac_proto != MAC_PROTO_ETHERNET) { - u8 next_proto; - - next_proto = tun_p_from_eth_p(eth_type); - if (!next_proto) - return -EINVAL; - } - mac_proto = MAC_PROTO_NONE; - if (!validate_nsh(nla_data(a), false, true, true)) - return -EINVAL; - break; - - case OVS_ACTION_ATTR_POP_NSH: { - __be16 inner_proto; - - if (eth_type != htons(ETH_P_NSH)) - return -EINVAL; - inner_proto = tun_p_to_eth_p(key->nsh.base.np); - if (!inner_proto) - return -EINVAL; - if (key->nsh.base.np == TUN_P_ETHERNET) - mac_proto = MAC_PROTO_ETHERNET; - else - mac_proto = MAC_PROTO_NONE; - break; - } - - case OVS_ACTION_ATTR_METER: - /* Non-existent meters are simply ignored. */ - break; - - case OVS_ACTION_ATTR_CLONE: { - bool last = nla_is_last(a, rem); - - err = validate_and_copy_clone(net, a, key, sfa, - eth_type, vlan_tci, - mpls_label_count, - log, last); - if (err) - return err; - skip_copy = true; - break; - } - - case OVS_ACTION_ATTR_CHECK_PKT_LEN: { - bool last = nla_is_last(a, rem); - - err = validate_and_copy_check_pkt_len(net, a, key, sfa, - eth_type, - vlan_tci, log, - mpls_label_count, - last); - if (err) - return err; - skip_copy = true; - break; - } - - default: - OVS_NLERR(log, "Unknown Action type %d", type); - return -EINVAL; - } - if (!skip_copy) { - err = copy_action(a, sfa, log); - if (err) - return err; - } - } - - if (rem > 0) - return -EINVAL; - - return 0; -} - -/* 'key' must be the masked key. */ -int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, bool log) -{ - int err; - u32 mpls_label_count = 0; - - *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE)); - if (IS_ERR(*sfa)) - return PTR_ERR(*sfa); - - if (eth_p_mpls(key->eth.type)) - mpls_label_count = hweight_long(key->mpls.num_labels_mask); - - (*sfa)->orig_len = nla_len(attr); - err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, - key->eth.vlan.tci, mpls_label_count, log); - if (err) - ovs_nla_free_flow_actions(*sfa); - - return err; -} - -static int sample_action_to_attr(const struct nlattr *attr, - struct sk_buff *skb) -{ - struct nlattr *start, *ac_start = NULL, *sample_arg; - int err = 0, rem = nla_len(attr); - const struct sample_arg *arg; - struct nlattr *actions; - - start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SAMPLE); - if (!start) - return -EMSGSIZE; - - sample_arg = nla_data(attr); - arg = nla_data(sample_arg); - actions = nla_next(sample_arg, &rem); - - if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) { - err = -EMSGSIZE; - goto out; - } - - ac_start = nla_nest_start_noflag(skb, OVS_SAMPLE_ATTR_ACTIONS); - if (!ac_start) { - err = -EMSGSIZE; - goto out; - } - - err = ovs_nla_put_actions(actions, rem, skb); - -out: - if (err) { - nla_nest_cancel(skb, ac_start); - nla_nest_cancel(skb, start); - } else { - nla_nest_end(skb, ac_start); - nla_nest_end(skb, start); - } - - return err; -} - -static int clone_action_to_attr(const struct nlattr *attr, - struct sk_buff *skb) -{ - struct nlattr *start; - int err = 0, rem = nla_len(attr); - - start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CLONE); - if (!start) - return -EMSGSIZE; - - err = ovs_nla_put_actions(nla_data(attr), rem, skb); - - if (err) - nla_nest_cancel(skb, start); - else - nla_nest_end(skb, start); - - return err; -} - -static int check_pkt_len_action_to_attr(const struct nlattr *attr, - struct sk_buff *skb) -{ - struct nlattr *start, *ac_start = NULL; - const struct check_pkt_len_arg *arg; - const struct nlattr *a, *cpl_arg; - int err = 0, rem = nla_len(attr); - - start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CHECK_PKT_LEN); - if (!start) - return -EMSGSIZE; - - /* The first nested attribute in 'attr' is always - * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. - */ - cpl_arg = nla_data(attr); - arg = nla_data(cpl_arg); - - if (nla_put_u16(skb, OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, arg->pkt_len)) { - err = -EMSGSIZE; - goto out; - } - - /* Second nested attribute in 'attr' is always - * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. - */ - a = nla_next(cpl_arg, &rem); - ac_start = nla_nest_start_noflag(skb, - OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL); - if (!ac_start) { - err = -EMSGSIZE; - goto out; - } - - err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); - if (err) { - nla_nest_cancel(skb, ac_start); - goto out; - } else { - nla_nest_end(skb, ac_start); - } - - /* Third nested attribute in 'attr' is always - * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER. - */ - a = nla_next(a, &rem); - ac_start = nla_nest_start_noflag(skb, - OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); - if (!ac_start) { - err = -EMSGSIZE; - goto out; - } - - err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); - if (err) { - nla_nest_cancel(skb, ac_start); - goto out; - } else { - nla_nest_end(skb, ac_start); - } - - nla_nest_end(skb, start); - return 0; - -out: - nla_nest_cancel(skb, start); - return err; -} - -static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - struct nlattr *start; - int err; - - switch (key_type) { - case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); - struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; - - start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET); - if (!start) - return -EMSGSIZE; - - err = ip_tun_to_nlattr(skb, &tun_info->key, - ip_tunnel_info_opts(tun_info), - tun_info->options_len, - ip_tunnel_info_af(tun_info)); - if (err) - return err; - nla_nest_end(skb, start); - break; - } - default: - if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) - return -EMSGSIZE; - break; - } - - return 0; -} - -static int masked_set_action_to_set_action_attr(const struct nlattr *a, - struct sk_buff *skb) -{ - const struct nlattr *ovs_key = nla_data(a); - struct nlattr *nla; - size_t key_len = nla_len(ovs_key) / 2; - - /* Revert the conversion we did from a non-masked set action to - * masked set action. - */ - nla = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET); - if (!nla) - return -EMSGSIZE; - - if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key))) - return -EMSGSIZE; - - nla_nest_end(skb, nla); - return 0; -} - -int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) -{ - const struct nlattr *a; - int rem, err; - - nla_for_each_attr(a, attr, len, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_ACTION_ATTR_SET: - err = set_action_to_attr(a, skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SET_TO_MASKED: - err = masked_set_action_to_set_action_attr(a, skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: - err = sample_action_to_attr(a, skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_CT: - err = ovs_ct_action_to_attr(nla_data(a), skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_CLONE: - err = clone_action_to_attr(a, skb); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_CHECK_PKT_LEN: - err = check_pkt_len_action_to_attr(a, skb); - if (err) - return err; - break; - - default: - if (nla_put(skb, type, nla_len(a), nla_data(a))) - return -EMSGSIZE; - break; - } - } - - return 0; -} diff --git a/datapath/flow_netlink.h b/datapath/flow_netlink.h deleted file mode 100644 index e10df2b5c..000000000 --- a/datapath/flow_netlink.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - - -#ifndef FLOW_NETLINK_H -#define FLOW_NETLINK_H 1 - -#include <linux/kernel.h> -#include <linux/netlink.h> -#include <linux/openvswitch.h> -#include <linux/spinlock.h> -#include <linux/types.h> -#include <linux/rcupdate.h> -#include <linux/if_ether.h> -#include <linux/in6.h> -#include <linux/jiffies.h> -#include <linux/time.h> - -#include <net/inet_ecn.h> -#include <net/ip_tunnels.h> - -#include "flow.h" - -size_t ovs_tun_key_attr_size(void); -size_t ovs_key_attr_size(void); - -void ovs_match_init(struct sw_flow_match *match, - struct sw_flow_key *key, bool reset_key, - struct sw_flow_mask *mask); - -int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, - int attr, bool is_mask, struct sk_buff *); -int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], - u64 *attrsp, bool log); -int ovs_nla_get_flow_metadata(struct net *net, - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], - u64 attrs, struct sw_flow_key *key, bool log); - -int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); - -int ovs_nla_get_match(struct net *, struct sw_flow_match *, - const struct nlattr *key, const struct nlattr *mask, - bool log); -int ovs_nla_put_tunnel_info(struct sk_buff *skb, - struct ip_tunnel_info *tun_info); - -bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); -int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, - const struct sw_flow_key *key, bool log); -u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); - -int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, - struct sw_flow_actions **sfa, bool log); -int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len, bool log); -int ovs_nla_put_actions(const struct nlattr *attr, - int len, struct sk_buff *skb); - -void ovs_nla_free_flow_actions(struct sw_flow_actions *); -void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); - -int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, - struct ovs_key_nsh *nsh_mask); -int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, - size_t size); - -#endif /* flow_netlink.h */ diff --git a/datapath/flow_table.c b/datapath/flow_table.c deleted file mode 100644 index 650338fb0..000000000 --- a/datapath/flow_table.c +++ /dev/null @@ -1,988 +0,0 @@ -/* - * Copyright (c) 2007-2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include "flow.h" -#include "datapath.h" -#include <linux/uaccess.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <net/llc_pdu.h> -#include <linux/kernel.h> -#include <linux/jhash.h> -#include <linux/jiffies.h> -#include <linux/llc.h> -#include <linux/module.h> -#include <linux/in.h> -#include <linux/rcupdate.h> -#include <linux/cpumask.h> -#include <linux/if_arp.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/sctp.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/icmp.h> -#include <linux/icmpv6.h> -#include <linux/rculist.h> -#include <net/ip.h> -#include <net/ipv6.h> -#include <net/ndisc.h> - -#include "flow_netlink.h" - -#define TBL_MIN_BUCKETS 1024 -#define MASK_ARRAY_SIZE_MIN 16 -#define REHASH_INTERVAL (10 * 60 * HZ) - -#define MC_HASH_SHIFT 8 -#define MC_HASH_ENTRIES (1u << MC_HASH_SHIFT) -#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) - -static struct kmem_cache *flow_cache; -struct kmem_cache *flow_stats_cache __read_mostly; - -static u16 range_n_bytes(const struct sw_flow_key_range *range) -{ - return range->end - range->start; -} - -void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - bool full, const struct sw_flow_mask *mask) -{ - int start = full ? 0 : mask->range.start; - int len = full ? sizeof *dst : range_n_bytes(&mask->range); - const long *m = (const long *)((const u8 *)&mask->key + start); - const long *s = (const long *)((const u8 *)src + start); - long *d = (long *)((u8 *)dst + start); - int i; - - /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, - * if 'full' is false the memory outside of the 'mask->range' is left - * uninitialized. This can be used as an optimization when further - * operations on 'dst' only use contents within 'mask->range'. - */ - for (i = 0; i < len; i += sizeof(long)) - *d++ = *s++ & *m++; -} - -struct sw_flow *ovs_flow_alloc(void) -{ - struct sw_flow *flow; - struct sw_flow_stats *stats; - - flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL); - if (!flow) - return ERR_PTR(-ENOMEM); - - flow->stats_last_writer = -1; - - /* Initialize the default stat node. */ - stats = kmem_cache_alloc_node(flow_stats_cache, - GFP_KERNEL | __GFP_ZERO, - node_online(0) ? 0 : NUMA_NO_NODE); - if (!stats) - goto err; - - spin_lock_init(&stats->lock); - - RCU_INIT_POINTER(flow->stats[0], stats); - - cpumask_set_cpu(0, &flow->cpu_used_mask); - - return flow; -err: - kmem_cache_free(flow_cache, flow); - return ERR_PTR(-ENOMEM); -} - -int ovs_flow_tbl_count(const struct flow_table *table) -{ - return table->count; -} - -static void flow_free(struct sw_flow *flow) -{ - int cpu; - - if (ovs_identifier_is_key(&flow->id)) - kfree(flow->id.unmasked_key); - if (flow->sf_acts) - ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) - if (flow->stats[cpu]) - kmem_cache_free(flow_stats_cache, - rcu_dereference_raw(flow->stats[cpu])); - kmem_cache_free(flow_cache, flow); -} - -static void rcu_free_flow_callback(struct rcu_head *rcu) -{ - struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - - flow_free(flow); -} - -void ovs_flow_free(struct sw_flow *flow, bool deferred) -{ - if (!flow) - return; - - if (deferred) - call_rcu(&flow->rcu, rcu_free_flow_callback); - else - flow_free(flow); -} - -static void __table_instance_destroy(struct table_instance *ti) -{ - kvfree(ti->buckets); - kfree(ti); -} - -static struct table_instance *table_instance_alloc(int new_size) -{ - struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); - int i; - - if (!ti) - return NULL; - - ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head), - GFP_KERNEL); - if (!ti->buckets) { - kfree(ti); - return NULL; - } - - for (i = 0; i < new_size; i++) - INIT_HLIST_HEAD(&ti->buckets[i]); - - ti->n_buckets = new_size; - ti->node_ver = 0; - ti->keep_flows = false; - get_random_bytes(&ti->hash_seed, sizeof(u32)); - - return ti; -} - -static void mask_array_rcu_cb(struct rcu_head *rcu) -{ - struct mask_array *ma = container_of(rcu, struct mask_array, rcu); - - kfree(ma); -} - -static struct mask_array *tbl_mask_array_alloc(int size) -{ - struct mask_array *new; - - size = max(MASK_ARRAY_SIZE_MIN, size); - new = kzalloc(sizeof(struct mask_array) + - sizeof(struct sw_flow_mask *) * size, GFP_KERNEL); - if (!new) - return NULL; - - new->count = 0; - new->max = size; - - return new; -} - -static int tbl_mask_array_realloc(struct flow_table *tbl, int size) -{ - struct mask_array *old; - struct mask_array *new; - - new = tbl_mask_array_alloc(size); - if (!new) - return -ENOMEM; - - old = ovsl_dereference(tbl->mask_array); - if (old) { - int i, count = 0; - - for (i = 0; i < old->max; i++) { - if (ovsl_dereference(old->masks[i])) - new->masks[count++] = old->masks[i]; - } - - new->count = count; - } - rcu_assign_pointer(tbl->mask_array, new); - - if (old) - call_rcu(&old->rcu, mask_array_rcu_cb); - - return 0; -} - -static int tbl_mask_array_add_mask(struct flow_table *tbl, - struct sw_flow_mask *new) -{ - struct mask_array *ma = ovsl_dereference(tbl->mask_array); - int err, ma_count = READ_ONCE(ma->count); - - if (ma_count >= ma->max) { - err = tbl_mask_array_realloc(tbl, ma->max + - MASK_ARRAY_SIZE_MIN); - if (err) - return err; - - ma = ovsl_dereference(tbl->mask_array); - } - - BUG_ON(ovsl_dereference(ma->masks[ma_count])); - - rcu_assign_pointer(ma->masks[ma_count], new); - WRITE_ONCE(ma->count, ma_count +1); - - return 0; -} - -static void tbl_mask_array_del_mask(struct flow_table *tbl, - struct sw_flow_mask *mask) -{ - struct mask_array *ma = ovsl_dereference(tbl->mask_array); - int i, ma_count = READ_ONCE(ma->count); - - /* Remove the deleted mask pointers from the array */ - for (i = 0; i < ma_count; i++) { - if (mask == ovsl_dereference(ma->masks[i])) - goto found; - } - - BUG(); - return; - -found: - WRITE_ONCE(ma->count, ma_count -1); - - rcu_assign_pointer(ma->masks[i], ma->masks[ma_count -1]); - RCU_INIT_POINTER(ma->masks[ma_count -1], NULL); - - kfree_rcu(mask, rcu); - - /* Shrink the mask array if necessary. */ - if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && - ma_count <= (ma->max / 3)) - tbl_mask_array_realloc(tbl, ma->max / 2); -} - -/* Remove 'mask' from the mask list, if it is not needed any more. */ -static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) -{ - if (mask) { - /* ovs-lock is required to protect mask-refcount and - * mask list. - */ - ASSERT_OVSL(); - BUG_ON(!mask->ref_count); - mask->ref_count--; - - if (!mask->ref_count) - tbl_mask_array_del_mask(tbl, mask); - } -} - -int ovs_flow_tbl_init(struct flow_table *table) -{ - struct table_instance *ti, *ufid_ti; - struct mask_array *ma; - - table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) * - MC_HASH_ENTRIES, __alignof__(struct mask_cache_entry)); - if (!table->mask_cache) - return -ENOMEM; - - ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); - if (!ma) - goto free_mask_cache; - - ti = table_instance_alloc(TBL_MIN_BUCKETS); - if (!ti) - goto free_mask_array; - - ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); - if (!ufid_ti) - goto free_ti; - - rcu_assign_pointer(table->ti, ti); - rcu_assign_pointer(table->ufid_ti, ufid_ti); - rcu_assign_pointer(table->mask_array, ma); - table->last_rehash = jiffies; - table->count = 0; - table->ufid_count = 0; - return 0; - -free_ti: - __table_instance_destroy(ti); -free_mask_array: - kfree(ma); -free_mask_cache: - free_percpu(table->mask_cache); - return -ENOMEM; -} - -static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) -{ - struct table_instance *ti = container_of(rcu, struct table_instance, rcu); - - __table_instance_destroy(ti); -} - -static void table_instance_flow_free(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - struct sw_flow *flow, - bool count) -{ - hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); - if (count) - table->count--; - - if (ovs_identifier_is_ufid(&flow->id)) { - hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); - - if (count) - table->ufid_count--; - } - - flow_mask_remove(table, flow->mask); -} - -static void table_instance_destroy(struct flow_table *table, - struct table_instance *ti, - struct table_instance *ufid_ti, - bool deferred) -{ - int i; - - if (!ti) - return; - - BUG_ON(!ufid_ti); - if (ti->keep_flows) - goto skip_flows; - - for (i = 0; i < ti->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = &ti->buckets[i]; - struct hlist_node *n; - - hlist_for_each_entry_safe(flow, n, head, - flow_table.node[ti->node_ver]) { - - table_instance_flow_free(table, ti, ufid_ti, - flow, false); - ovs_flow_free(flow, deferred); - } - } - -skip_flows: - if (deferred) { - call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); - } else { - __table_instance_destroy(ti); - __table_instance_destroy(ufid_ti); - } -} - -/* No need for locking this function is called from RCU callback or - * error path. - */ -void ovs_flow_tbl_destroy(struct flow_table *table) -{ - struct table_instance *ti = rcu_dereference_raw(table->ti); - struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); - - free_percpu(table->mask_cache); - kfree(rcu_dereference_raw(table->mask_array)); - table_instance_destroy(table, ti, ufid_ti, false); -} - -struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, - u32 *bucket, u32 *last) -{ - struct sw_flow *flow; - struct hlist_head *head; - int ver; - int i; - - ver = ti->node_ver; - while (*bucket < ti->n_buckets) { - i = 0; - head = &ti->buckets[*bucket]; - hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { - if (i < *last) { - i++; - continue; - } - *last = i + 1; - return flow; - } - (*bucket)++; - *last = 0; - } - - return NULL; -} - -static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) -{ - hash = jhash_1word(hash, ti->hash_seed); - return &ti->buckets[hash & (ti->n_buckets - 1)]; -} - -static void table_instance_insert(struct table_instance *ti, - struct sw_flow *flow) -{ - struct hlist_head *head; - - head = find_bucket(ti, flow->flow_table.hash); - hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); -} - -static void ufid_table_instance_insert(struct table_instance *ti, - struct sw_flow *flow) -{ - struct hlist_head *head; - - head = find_bucket(ti, flow->ufid_table.hash); - hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); -} - -static void flow_table_copy_flows(struct table_instance *old, - struct table_instance *new, bool ufid) -{ - int old_ver; - int i; - - old_ver = old->node_ver; - new->node_ver = !old_ver; - - /* Insert in new table. */ - for (i = 0; i < old->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = &old->buckets[i]; - - if (ufid) - hlist_for_each_entry_rcu(flow, head, - ufid_table.node[old_ver]) - ufid_table_instance_insert(new, flow); - else - hlist_for_each_entry_rcu(flow, head, - flow_table.node[old_ver]) - table_instance_insert(new, flow); - } - - old->keep_flows = true; -} - -static struct table_instance *table_instance_rehash(struct table_instance *ti, - int n_buckets, bool ufid) -{ - struct table_instance *new_ti; - - new_ti = table_instance_alloc(n_buckets); - if (!new_ti) - return NULL; - - flow_table_copy_flows(ti, new_ti, ufid); - - return new_ti; -} - -int ovs_flow_tbl_flush(struct flow_table *flow_table) -{ - struct table_instance *old_ti, *new_ti; - struct table_instance *old_ufid_ti, *new_ufid_ti; - - new_ti = table_instance_alloc(TBL_MIN_BUCKETS); - if (!new_ti) - return -ENOMEM; - new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); - if (!new_ufid_ti) - goto err_free_ti; - - old_ti = ovsl_dereference(flow_table->ti); - old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); - - rcu_assign_pointer(flow_table->ti, new_ti); - rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); - flow_table->last_rehash = jiffies; - flow_table->count = 0; - flow_table->ufid_count = 0; - - table_instance_destroy(flow_table, old_ti, old_ufid_ti, true); - return 0; - -err_free_ti: - __table_instance_destroy(new_ti); - return -ENOMEM; -} - -static u32 flow_hash(const struct sw_flow_key *key, - const struct sw_flow_key_range *range) -{ - const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); - - /* Make sure number of hash bytes are multiple of u32. */ - int hash_u32s = range_n_bytes(range) >> 2; - - return jhash2(hash_key, hash_u32s, 0); -} - -static int flow_key_start(const struct sw_flow_key *key) -{ - if (key->tun_proto) - return 0; - else - return rounddown(offsetof(struct sw_flow_key, phy), - sizeof(long)); -} - -static bool cmp_key(const struct sw_flow_key *key1, - const struct sw_flow_key *key2, - int key_start, int key_end) -{ - const long *cp1 = (const long *)((const u8 *)key1 + key_start); - const long *cp2 = (const long *)((const u8 *)key2 + key_start); - long diffs = 0; - int i; - - for (i = key_start; i < key_end; i += sizeof(long)) - diffs |= *cp1++ ^ *cp2++; - - return diffs == 0; -} - -static bool flow_cmp_masked_key(const struct sw_flow *flow, - const struct sw_flow_key *key, - const struct sw_flow_key_range *range) -{ - return cmp_key(&flow->key, key, range->start, range->end); -} - -static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, - const struct sw_flow_match *match) -{ - struct sw_flow_key *key = match->key; - int key_start = flow_key_start(key); - int key_end = match->range.end; - - BUG_ON(ovs_identifier_is_ufid(&flow->id)); - return cmp_key(flow->id.unmasked_key, key, key_start, key_end); -} - -static struct sw_flow *masked_flow_lookup(struct table_instance *ti, - const struct sw_flow_key *unmasked, - const struct sw_flow_mask *mask, - u32 *n_mask_hit) -{ - struct sw_flow *flow; - struct hlist_head *head; - u32 hash; - struct sw_flow_key masked_key; - - ovs_flow_mask_key(&masked_key, unmasked, false, mask); - hash = flow_hash(&masked_key, &mask->range); - head = find_bucket(ti, hash); - (*n_mask_hit)++; - hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) { - if (flow->mask == mask && flow->flow_table.hash == hash && - flow_cmp_masked_key(flow, &masked_key, &mask->range)) - return flow; - } - return NULL; -} - -/* Flow lookup does full lookup on flow table. It starts with - * mask from index passed in *index. - */ -static struct sw_flow *flow_lookup(struct flow_table *tbl, - struct table_instance *ti, - const struct mask_array *ma, - const struct sw_flow_key *key, - u32 *n_mask_hit, - u32 *index) -{ - struct sw_flow *flow; - struct sw_flow_mask *mask; - int i; - - if (likely(*index < ma->max)) { - mask = rcu_dereference_ovsl(ma->masks[*index]); - if (mask) { - flow = masked_flow_lookup(ti, key, mask, n_mask_hit); - if (flow) - return flow; - } - } - - for (i = 0; i < ma->max; i++) { - - if (i == *index) - continue; - - mask = rcu_dereference_ovsl(ma->masks[i]); - if (unlikely(!mask)) - break; - - flow = masked_flow_lookup(ti, key, mask, n_mask_hit); - if (flow) { /* Found */ - *index = i; - return flow; - } - } - - return NULL; -} - -/* - * mask_cache maps flow to probable mask. This cache is not tightly - * coupled cache, It means updates to mask list can result in inconsistent - * cache entry in mask cache. - * This is per cpu cache and is divided in MC_HASH_SEGS segments. - * In case of a hash collision the entry is hashed in next segment. - */ -struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, - const struct sw_flow_key *key, - u32 skb_hash, - u32 *n_mask_hit) -{ - struct mask_array *ma = rcu_dereference(tbl->mask_array); - struct table_instance *ti = rcu_dereference(tbl->ti); - struct mask_cache_entry *entries, *ce; - struct sw_flow *flow; - u32 hash; - int seg; - - *n_mask_hit = 0; - if (unlikely(!skb_hash)) { - u32 mask_index = 0; - - return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index); - } - - /* Pre and post recirulation flows usually have the same skb_hash - * value. To avoid hash collisions, rehash the 'skb_hash' with - * 'recirc_id'. */ - if (key->recirc_id) - skb_hash = jhash_1word(skb_hash, key->recirc_id); - - ce = NULL; - hash = skb_hash; - entries = this_cpu_ptr(tbl->mask_cache); - - /* Find the cache entry 'ce' to operate on. */ - for (seg = 0; seg < MC_HASH_SEGS; seg++) { - int index = hash & (MC_HASH_ENTRIES - 1); - struct mask_cache_entry *e; - - e = &entries[index]; - if (e->skb_hash == skb_hash) { - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, - &e->mask_index); - if (!flow) - e->skb_hash = 0; - return flow; - } - - if (!ce || e->skb_hash < ce->skb_hash) - ce = e; /* A better replacement cache candidate. */ - - hash >>= MC_HASH_SHIFT; - } - - /* Cache miss, do full lookup. */ - flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index); - if (flow) - ce->skb_hash = skb_hash; - - return flow; -} - -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, - const struct sw_flow_key *key) -{ - struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); - struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); - u32 __always_unused n_mask_hit; - u32 index = 0; - - return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index); -} - -struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, - const struct sw_flow_match *match) -{ - struct mask_array *ma = ovsl_dereference(tbl->mask_array); - int i; - - /* Always called under ovs-mutex. */ - for (i = 0; i < ma->max; i++) { - struct table_instance *ti = ovsl_dereference(tbl->ti); - u32 __always_unused n_mask_hit; - struct sw_flow_mask *mask; - struct sw_flow *flow; - - mask = ovsl_dereference(ma->masks[i]); - if (!mask) - continue; - flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); - if (flow && ovs_identifier_is_key(&flow->id) && - ovs_flow_cmp_unmasked_key(flow, match)) - return flow; - } - return NULL; -} - -static u32 ufid_hash(const struct sw_flow_id *sfid) -{ - return jhash(sfid->ufid, sfid->ufid_len, 0); -} - -static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, - const struct sw_flow_id *sfid) -{ - if (flow->id.ufid_len != sfid->ufid_len) - return false; - - return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); -} - -bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) -{ - if (ovs_identifier_is_ufid(&flow->id)) - return flow_cmp_masked_key(flow, match->key, &match->range); - - return ovs_flow_cmp_unmasked_key(flow, match); -} - -struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, - const struct sw_flow_id *ufid) -{ - struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); - struct sw_flow *flow; - struct hlist_head *head; - u32 hash; - - hash = ufid_hash(ufid); - head = find_bucket(ti, hash); - hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) { - if (flow->ufid_table.hash == hash && - ovs_flow_cmp_ufid(flow, ufid)) - return flow; - } - return NULL; -} - -int ovs_flow_tbl_num_masks(const struct flow_table *table) -{ - struct mask_array *ma; - - ma = rcu_dereference_ovsl(table->mask_array); - return READ_ONCE(ma->count); -} - -static struct table_instance *table_instance_expand(struct table_instance *ti, - bool ufid) -{ - return table_instance_rehash(ti, ti->n_buckets * 2, ufid); -} - -/* Must be called with OVS mutex held. */ -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) -{ - struct table_instance *ti = ovsl_dereference(table->ti); - struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); - - BUG_ON(table->count == 0); - table_instance_flow_free(table, ti, ufid_ti, flow, true); -} - -static struct sw_flow_mask *mask_alloc(void) -{ - struct sw_flow_mask *mask; - - mask = kmalloc(sizeof(*mask), GFP_KERNEL); - if (mask) - mask->ref_count = 1; - - return mask; -} - -static bool mask_equal(const struct sw_flow_mask *a, - const struct sw_flow_mask *b) -{ - const u8 *a_ = (const u8 *)&a->key + a->range.start; - const u8 *b_ = (const u8 *)&b->key + b->range.start; - - return (a->range.end == b->range.end) - && (a->range.start == b->range.start) - && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); -} - -static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, - const struct sw_flow_mask *mask) -{ - struct mask_array *ma; - int i; - - ma = ovsl_dereference(tbl->mask_array); - for (i = 0; i < ma->max; i++) { - struct sw_flow_mask *t; - - t = ovsl_dereference(ma->masks[i]); - if (t && mask_equal(mask, t)) - return t; - } - - return NULL; -} - -/* Add 'mask' into the mask list, if it is not already there. */ -static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, - const struct sw_flow_mask *new) -{ - struct sw_flow_mask *mask; - - mask = flow_mask_find(tbl, new); - if (!mask) { - /* Allocate a new mask if none exsits. */ - mask = mask_alloc(); - if (!mask) - return -ENOMEM; - - mask->key = new->key; - mask->range = new->range; - - /* Add mask to mask-list. */ - if (tbl_mask_array_add_mask(tbl, mask)) { - kfree(mask); - return -ENOMEM; - } - - } else { - BUG_ON(!mask->ref_count); - mask->ref_count++; - } - - flow->mask = mask; - return 0; -} - -/* Must be called with OVS mutex held. */ -static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) -{ - struct table_instance *new_ti = NULL; - struct table_instance *ti; - - flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); - ti = ovsl_dereference(table->ti); - table_instance_insert(ti, flow); - table->count++; - - /* Expand table, if necessary, to make room. */ - if (table->count > ti->n_buckets) - new_ti = table_instance_expand(ti, false); - else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) - new_ti = table_instance_rehash(ti, ti->n_buckets, false); - - if (new_ti) { - rcu_assign_pointer(table->ti, new_ti); - call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - table->last_rehash = jiffies; - } -} - -/* Must be called with OVS mutex held. */ -static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) -{ - struct table_instance *ti; - - flow->ufid_table.hash = ufid_hash(&flow->id); - ti = ovsl_dereference(table->ufid_ti); - ufid_table_instance_insert(ti, flow); - table->ufid_count++; - - /* Expand table, if necessary, to make room. */ - if (table->ufid_count > ti->n_buckets) { - struct table_instance *new_ti; - - new_ti = table_instance_expand(ti, true); - if (new_ti) { - rcu_assign_pointer(table->ufid_ti, new_ti); - call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); - } - } -} - -/* Must be called with OVS mutex held. */ -int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - const struct sw_flow_mask *mask) -{ - int err; - - err = flow_mask_insert(table, flow, mask); - if (err) - return err; - flow_key_insert(table, flow); - if (ovs_identifier_is_ufid(&flow->id)) - flow_ufid_insert(table, flow); - - return 0; -} - -/* Initializes the flow module. - * Returns zero if successful or a negative error code. - */ -int ovs_flow_init(void) -{ - BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); - BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); - - flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) - + (nr_cpu_ids - * sizeof(struct sw_flow_stats *)), - 0, 0, NULL); - if (flow_cache == NULL) - return -ENOMEM; - - flow_stats_cache - = kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats), - 0, SLAB_HWCACHE_ALIGN, NULL); - if (flow_stats_cache == NULL) { - kmem_cache_destroy(flow_cache); - flow_cache = NULL; - return -ENOMEM; - } - - return 0; -} - -/* Uninitializes the flow module. */ -void ovs_flow_exit(void) -{ - kmem_cache_destroy(flow_stats_cache); - kmem_cache_destroy(flow_cache); -} diff --git a/datapath/flow_table.h b/datapath/flow_table.h deleted file mode 100644 index 1a76886b5..000000000 --- a/datapath/flow_table.h +++ /dev/null @@ -1,102 +0,0 @@ -/* - * Copyright (c) 2007-2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef FLOW_TABLE_H -#define FLOW_TABLE_H 1 - -#include <linux/kernel.h> -#include <linux/netlink.h> -#include <linux/openvswitch.h> -#include <linux/spinlock.h> -#include <linux/types.h> -#include <linux/rcupdate.h> -#include <linux/if_ether.h> -#include <linux/in6.h> -#include <linux/jiffies.h> -#include <linux/time.h> - -#include <net/inet_ecn.h> -#include <net/ip_tunnels.h> - -#include "flow.h" - -struct mask_cache_entry { - u32 skb_hash; - u32 mask_index; -}; - -struct mask_array { - struct rcu_head rcu; - int count, max; - struct sw_flow_mask __rcu *masks[]; -}; - -struct table_instance { - struct hlist_head *buckets; - unsigned int n_buckets; - struct rcu_head rcu; - int node_ver; - u32 hash_seed; - bool keep_flows; -}; - -struct flow_table { - struct table_instance __rcu *ti; - struct table_instance __rcu *ufid_ti; - struct mask_cache_entry __percpu *mask_cache; - struct mask_array __rcu *mask_array; - unsigned long last_rehash; - unsigned int count; - unsigned int ufid_count; -}; - -extern struct kmem_cache *flow_stats_cache; - -int ovs_flow_init(void); -void ovs_flow_exit(void); - -struct sw_flow *ovs_flow_alloc(void); -void ovs_flow_free(struct sw_flow *, bool deferred); - -int ovs_flow_tbl_init(struct flow_table *); -int ovs_flow_tbl_count(const struct flow_table *table); -void ovs_flow_tbl_destroy(struct flow_table *table); -int ovs_flow_tbl_flush(struct flow_table *flow_table); - -int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - const struct sw_flow_mask *mask); -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); -int ovs_flow_tbl_num_masks(const struct flow_table *table); -struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, - u32 *bucket, u32 *idx); -struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, - const struct sw_flow_key *, - u32 skb_hash, - u32 *n_mask_hit); -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, - const struct sw_flow_key *); -struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, - const struct sw_flow_match *match); -struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, - const struct sw_flow_id *); - -bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); - -void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, - bool full, const struct sw_flow_mask *mask); -#endif /* flow_table.h */ diff --git a/datapath/linux/.gitignore b/datapath/linux/.gitignore deleted file mode 100644 index 8e9d781b1..000000000 --- a/datapath/linux/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -/Kbuild -/Makefile -/Makefile.main -/Module.markers -/kcompat.h -/modules.order -/tmp -/*.c diff --git a/datapath/linux/Kbuild.in b/datapath/linux/Kbuild.in deleted file mode 100644 index 395b0cbc0..000000000 --- a/datapath/linux/Kbuild.in +++ /dev/null @@ -1,27 +0,0 @@ -# -*- makefile -*- -export builddir = @abs_builddir@ -export srcdir = @abs_srcdir@ -export top_srcdir = @abs_top_srcdir@ -export VERSION = @VERSION@ - -include $(srcdir)/../Modules.mk -include $(srcdir)/Modules.mk - -ccflags-y := -DVERSION=\"$(VERSION)\" -ccflags-y += -I$(srcdir)/.. -ccflags-y += -I$(builddir)/.. -ccflags-y += -g -ccflags-y += -include $(builddir)/kcompat.h - -# These include directories have to go before -I$(KSRC)/include. -# NOSTDINC_FLAGS just happens to be a variable that goes in the -# right place, even though it's conceptually incorrect. -NOSTDINC_FLAGS += -include $(builddir)/kcompat.h -I$(top_srcdir)/include -I$(srcdir)/compat -I$(srcdir)/compat/include - -obj-m := $(subst _,-,$(patsubst %,%.o,$(build_modules))) - -define module_template -$(1)-y = $$(notdir $$(patsubst %.c,%.o,$($(1)_sources))) -endef - -$(foreach module,$(build_multi_modules),$(eval $(call module_template,$(module)))) diff --git a/datapath/linux/Makefile.in b/datapath/linux/Makefile.in deleted file mode 100644 index efc1663e4..000000000 --- a/datapath/linux/Makefile.in +++ /dev/null @@ -1,9 +0,0 @@ -ifeq ($(KERNELRELEASE),) -# We're being called directly by running make in this directory. -include Makefile.main -else -# We're being included by the Linux kernel build system -include Kbuild -endif - - diff --git a/datapath/linux/Makefile.main.in b/datapath/linux/Makefile.main.in deleted file mode 100644 index 6db4aa3ab..000000000 --- a/datapath/linux/Makefile.main.in +++ /dev/null @@ -1,107 +0,0 @@ -# -*- makefile -*- -export builddir = @abs_builddir@ -export srcdir = @abs_srcdir@ -export top_srcdir = @abs_top_srcdir@ -export KSRC = @KBUILD@ -export VERSION = @VERSION@ - -include $(srcdir)/../Modules.mk -include $(srcdir)/Modules.mk - -default: $(build_links) - -$(foreach s,$(sort $(foreach m,$(build_modules),$($(m)_sources))), \ - $(eval $(notdir $(s)): ; ln -s $(srcdir)/../$(s) $@)) - -all: default -distdir: clean -install: -install-data: -install-exec: -uninstall: -install-dvi: -install-html: -install-info: -install-ps: -install-pdf: -installdirs: -check: all -installcheck: -mostlyclean: -clean: - rm -f *.o *.ko *.mod.* .*.gcno .*.d .*.cmd kcompat.h.new \ - .cache.mk Module.symvers modules.order .tmp_versions/*.mod - for d in $(build_links); do if test -h $$d; then rm $$d; fi; done -distclean: clean - rm -f kcompat.h -maintainer-clean: distclean -dvi: -pdf: -ps: -info: -html: -tags: -TAGS: - -ifneq ($(KSRC),) - -ifeq (/lib/modules/$(shell uname -r)/source, $(KSRC)) - KOBJ := /lib/modules/$(shell uname -r)/build -else - KOBJ := $(KSRC) -endif - -VERSION_FILE := $(KOBJ)/include/linux/version.h -ifeq (,$(wildcard $(VERSION_FILE))) - VERSION_FILE := $(KOBJ)/include/generated/uapi/linux/version.h - ifeq (,$(wildcard $(VERSION_FILE))) - $(error Linux kernel source not configured - missing version.h) - endif -endif - -CONFIG_FILE := $(KSRC)/include/generated/autoconf.h -ifeq (,$(wildcard $(CONFIG_FILE))) - CONFIG_FILE := $(KSRC)/include/linux/autoconf.h - ifeq (,$(wildcard $(CONFIG_FILE))) - $(error Linux kernel source not configured - missing autoconf.h) - endif -endif - -default: - $(MAKE) -C $(KSRC) $(if @KARCH@,ARCH=@KARCH@) M=$(builddir) modules - -modules_install: - $(MAKE) -C $(KSRC) $(if @KARCH@,ARCH=@KARCH@) M=$(builddir) modules_install - /sbin/depmod `sed -n 's/#define UTS_RELEASE "\([^"]*\)"/\1/p' $(KSRC)/include/generated/utsrelease.h` -endif - -# Much of the kernel build system in this file is derived from Intel's -# e1000 distribution, with the following license: - -################################################################################ -# -# Intel PRO/1000 Linux driver -# Copyright(c) 1999 - 2007, 2009 Intel Corporation. -# -# This program is free software; you can redistribute it and/or modify it -# under the terms and conditions of the GNU General Public License, -# version 2, as published by the Free Software Foundation. -# -# This program is distributed in the hope it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -# more details. -# -# You should have received a copy of the GNU General Public License along with -# this program; if not, write to the Free Software Foundation, Inc., -# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. -# -# The full GNU General Public License is included in this distribution in -# the file called "COPYING". -# -# Contact Information: -# Linux NICS <linux.nics@intel.com> -# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> -# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497 -# -################################################################################ diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk deleted file mode 100644 index 372243988..000000000 --- a/datapath/linux/Modules.mk +++ /dev/null @@ -1,123 +0,0 @@ -openvswitch_sources += \ - linux/compat/dev-openvswitch.c \ - linux/compat/dst_cache.c \ - linux/compat/exthdrs_core.c \ - linux/compat/geneve.c \ - linux/compat/gre.c \ - linux/compat/gso.c \ - linux/compat/genetlink-openvswitch.c \ - linux/compat/inet_fragment.c \ - linux/compat/ip_gre.c \ - linux/compat/ip_fragment.c \ - linux/compat/ip_output.c \ - linux/compat/ip_tunnel.c \ - linux/compat/ip_tunnels_core.c \ - linux/compat/ip6_output.c \ - linux/compat/ip6_gre.c \ - linux/compat/ip6_tunnel.c \ - linux/compat/lisp.c \ - linux/compat/netdevice.c \ - linux/compat/nf_conncount.c \ - linux/compat/nf_conntrack_core.c \ - linux/compat/nf_conntrack_proto.c \ - linux/compat/nf_conntrack_reasm.c \ - linux/compat/nf_conntrack_timeout.c \ - linux/compat/reciprocal_div.c \ - linux/compat/skbuff-openvswitch.c \ - linux/compat/socket.c \ - linux/compat/stt.c \ - linux/compat/udp.c \ - linux/compat/udp_tunnel.c \ - linux/compat/vxlan.c \ - linux/compat/utils.c -openvswitch_headers += \ - linux/compat/gso.h \ - linux/compat/include/linux/percpu.h \ - linux/compat/include/linux/bug.h \ - linux/compat/include/linux/cache.h \ - linux/compat/include/linux/compiler.h \ - linux/compat/include/linux/compiler-gcc.h \ - linux/compat/include/linux/cpumask.h \ - linux/compat/include/linux/err.h \ - linux/compat/include/linux/etherdevice.h \ - linux/compat/include/linux/genetlink.h \ - linux/compat/include/linux/if.h \ - linux/compat/include/linux/if_ether.h \ - linux/compat/include/linux/if_link.h \ - linux/compat/include/linux/if_vlan.h \ - linux/compat/include/linux/in.h \ - linux/compat/include/linux/jiffies.h \ - linux/compat/include/linux/kconfig.h \ - linux/compat/include/linux/kernel.h \ - linux/compat/include/net/lisp.h \ - linux/compat/include/linux/list.h \ - linux/compat/include/linux/mpls.h \ - linux/compat/include/linux/net.h \ - linux/compat/include/linux/random.h \ - linux/compat/include/linux/netdevice.h \ - linux/compat/include/linux/netdev_features.h \ - linux/compat/include/linux/netfilter_ipv6.h \ - linux/compat/include/linux/netlink.h \ - linux/compat/include/linux/openvswitch.h \ - linux/compat/include/linux/rculist.h \ - linux/compat/include/linux/rcupdate.h \ - linux/compat/include/linux/reciprocal_div.h \ - linux/compat/include/linux/rtnetlink.h \ - linux/compat/include/linux/skbuff.h \ - linux/compat/include/linux/static_key.h \ - linux/compat/include/linux/stddef.h \ - linux/compat/include/linux/types.h \ - linux/compat/include/linux/u64_stats_sync.h \ - linux/compat/include/linux/udp.h \ - linux/compat/include/linux/workqueue.h \ - linux/compat/include/linux/timekeeping.h \ - linux/compat/include/net/checksum.h \ - linux/compat/include/net/dst.h \ - linux/compat/include/net/dst_cache.h \ - linux/compat/include/net/dst_metadata.h \ - linux/compat/include/net/genetlink.h \ - linux/compat/include/net/geneve.h \ - linux/compat/include/net/gre.h \ - linux/compat/include/net/inet_ecn.h \ - linux/compat/include/net/inet_frag.h \ - linux/compat/include/net/inetpeer.h \ - linux/compat/include/net/ip.h \ - linux/compat/include/net/ip_tunnels.h \ - linux/compat/include/net/ip6_fib.h \ - linux/compat/include/net/ip6_route.h \ - linux/compat/include/net/ip6_tunnel.h \ - linux/compat/include/net/ipv6.h \ - linux/compat/include/net/ipv6_frag.h \ - linux/compat/include/net/mpls.h \ - linux/compat/include/net/net_namespace.h \ - linux/compat/include/net/netlink.h \ - linux/compat/include/net/protocol.h \ - linux/compat/include/net/route.h \ - linux/compat/include/net/rtnetlink.h \ - linux/compat/include/net/udp.h \ - linux/compat/include/net/udp_tunnel.h \ - linux/compat/include/net/sock.h \ - linux/compat/include/net/stt.h \ - linux/compat/include/net/vrf.h \ - linux/compat/include/net/tun_proto.h \ - linux/compat/include/net/nsh.h \ - linux/compat/include/net/vxlan.h \ - linux/compat/include/net/netfilter/nf_conntrack.h \ - linux/compat/include/net/netfilter/nf_conntrack_core.h \ - linux/compat/include/net/netfilter/nf_conntrack_count.h \ - linux/compat/include/net/netfilter/nf_conntrack_expect.h \ - linux/compat/include/net/netfilter/nf_conntrack_helper.h \ - linux/compat/include/net/netfilter/nf_conntrack_labels.h \ - linux/compat/include/net/netfilter/nf_conntrack_seqadj.h \ - linux/compat/include/net/netfilter/nf_conntrack_timeout.h \ - linux/compat/include/net/netfilter/nf_conntrack_zones.h \ - linux/compat/include/net/netfilter/nf_nat.h \ - linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h \ - linux/compat/include/net/sctp/checksum.h \ - linux/compat/include/net/erspan.h \ - linux/compat/include/uapi/linux/netfilter.h \ - linux/compat/include/linux/mm.h \ - linux/compat/include/linux/netfilter.h \ - linux/compat/include/linux/overflow.h \ - linux/compat/include/linux/rbtree.h -EXTRA_DIST += linux/compat/build-aux/export-check-allow-list diff --git a/datapath/linux/compat/build-aux/export-check-allow-list b/datapath/linux/compat/build-aux/export-check-allow-list deleted file mode 100644 index 1178f46ee..000000000 --- a/datapath/linux/compat/build-aux/export-check-allow-list +++ /dev/null @@ -1 +0,0 @@ -pskb_expand_head
\ No newline at end of file diff --git a/datapath/linux/compat/dev-openvswitch.c b/datapath/linux/compat/dev-openvswitch.c deleted file mode 100644 index 56e1a5b68..000000000 --- a/datapath/linux/compat/dev-openvswitch.c +++ /dev/null @@ -1,83 +0,0 @@ -#include <linux/if_bridge.h> -#include <linux/netdevice.h> -#include <linux/version.h> -#include <net/rtnetlink.h> - -#include "gso.h" -#include "vport.h" -#include "vport-internal_dev.h" -#include "vport-netdev.h" - -#ifndef HAVE_DEV_DISABLE_LRO - -#ifdef NETIF_F_LRO -#include <linux/ethtool.h> - -/** - * dev_disable_lro - disable Large Receive Offload on a device - * @dev: device - * - * Disable Large Receive Offload (LRO) on a net device. Must be - * called under RTNL. This is needed if received packets may be - * forwarded to another interface. - */ -void dev_disable_lro(struct net_device *dev) -{ - if (dev->ethtool_ops && dev->ethtool_ops->get_flags && - dev->ethtool_ops->set_flags) { - u32 flags = dev->ethtool_ops->get_flags(dev); - if (flags & ETH_FLAG_LRO) { - flags &= ~ETH_FLAG_LRO; - dev->ethtool_ops->set_flags(dev, flags); - } - } - WARN_ON(dev->features & NETIF_F_LRO); -} -#else -void dev_disable_lro(struct net_device *dev) { } -#endif /* NETIF_F_LRO */ - -#endif /* HAVE_DEV_DISABLE_LRO */ - -int rpl_rtnl_delete_link(struct net_device *dev) -{ - const struct rtnl_link_ops *ops; - LIST_HEAD(list_kill); - - ops = dev->rtnl_link_ops; - if (!ops || !ops->dellink) - return -EOPNOTSUPP; - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - - return 0; -} -EXPORT_SYMBOL_GPL(rpl_rtnl_delete_link); - -#ifndef USE_UPSTREAM_TUNNEL -int ovs_dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) -{ - struct ip_tunnel_info *info; - struct vport *vport; - - if (!SKB_SETUP_FILL_METADATA_DST(skb)) - return -ENOMEM; - - vport = ovs_netdev_get_vport(dev); - if (!vport) - return -EINVAL; - - if (!vport->ops->fill_metadata_dst) - return -EINVAL; - - info = skb_tunnel_info(skb); - if (!info) - return -ENOMEM; - if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX))) - return -EINVAL; - - return vport->ops->fill_metadata_dst(dev, skb); -} -EXPORT_SYMBOL_GPL(ovs_dev_fill_metadata_dst); -#endif diff --git a/datapath/linux/compat/dst_cache.c b/datapath/linux/compat/dst_cache.c deleted file mode 100644 index 45990cba7..000000000 --- a/datapath/linux/compat/dst_cache.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * net/core/dst_cache.c - dst entry cache - * - * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef USE_BUILTIN_DST_CACHE -#include <linux/kernel.h> -#include <linux/percpu.h> -#include <net/dst_cache.h> -#include <net/route.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ip6_fib.h> -#endif -#include <uapi/linux/in.h> - -#ifndef USE_UPSTREAM_TUNNEL -struct dst_cache_pcpu { - unsigned long refresh_ts; - struct dst_entry *dst; - u32 cookie; - union { - struct in_addr in_saddr; - struct in6_addr in6_saddr; - }; -}; - -static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache, - struct dst_entry *dst, u32 cookie) -{ - dst_release(dst_cache->dst); - if (dst) - dst_hold(dst); - - dst_cache->cookie = cookie; - dst_cache->dst = dst; -} - -static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, - struct dst_cache_pcpu *idst) -{ - struct dst_entry *dst; - - dst = idst->dst; - if (!dst) - goto fail; - - /* the cache already hold a dst reference; it can't go away */ - dst_hold(dst); - - if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) || - (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { - dst_cache_per_cpu_dst_set(idst, NULL, 0); - dst_release(dst); - goto fail; - } - return dst; - -fail: - idst->refresh_ts = jiffies; - return NULL; -} - -struct dst_entry *rpl_dst_cache_get(struct dst_cache *dst_cache) -{ - if (!dst_cache->cache) - return NULL; - - return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache)); -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_get); - -struct rtable *rpl_dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr) -{ - struct dst_cache_pcpu *idst; - struct dst_entry *dst; - - if (!dst_cache->cache) - return NULL; - - idst = this_cpu_ptr(dst_cache->cache); - dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) - return NULL; - - *saddr = idst->in_saddr.s_addr; - return container_of(dst, struct rtable, dst); -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_get_ip4); - -void rpl_dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, - __be32 saddr) -{ - struct dst_cache_pcpu *idst; - - if (!dst_cache->cache) - return; - - idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(idst, dst, 0); - idst->in_saddr.s_addr = saddr; -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_set_ip4); - -#if IS_ENABLED(CONFIG_IPV6) -void rpl_dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, - const struct in6_addr *addr) -{ - struct dst_cache_pcpu *idst; - - if (!dst_cache->cache) - return; - - idst = this_cpu_ptr(dst_cache->cache); - dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst, - rt6_get_cookie((struct rt6_info *)dst)); - idst->in6_saddr = *addr; -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_set_ip6); - -struct dst_entry *rpl_dst_cache_get_ip6(struct dst_cache *dst_cache, - struct in6_addr *saddr) -{ - struct dst_cache_pcpu *idst; - struct dst_entry *dst; - - if (!dst_cache->cache) - return NULL; - - idst = this_cpu_ptr(dst_cache->cache); - dst = dst_cache_per_cpu_get(dst_cache, idst); - if (!dst) - return NULL; - - *saddr = idst->in6_saddr; - return dst; -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_get_ip6); - -#endif - -int rpl_dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp) -{ - dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu, - gfp | __GFP_ZERO); - if (!dst_cache->cache) - return -ENOMEM; - - dst_cache_reset(dst_cache); - return 0; -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_init); - -void rpl_dst_cache_destroy(struct dst_cache *dst_cache) -{ - int i; - - if (!dst_cache->cache) - return; - - for_each_possible_cpu(i) - dst_release(per_cpu_ptr(dst_cache->cache, i)->dst); - - free_percpu(dst_cache->cache); -} -EXPORT_SYMBOL_GPL(rpl_dst_cache_destroy); -#endif /*USE_UPSTREAM_TUNNEL */ -#endif /* USE_BUILTIN_DST_CACHE */ diff --git a/datapath/linux/compat/exthdrs_core.c b/datapath/linux/compat/exthdrs_core.c deleted file mode 100644 index 697f9d082..000000000 --- a/datapath/linux/compat/exthdrs_core.c +++ /dev/null @@ -1,129 +0,0 @@ -#include <linux/ipv6.h> -#include <linux/version.h> -#include <net/ipv6.h> - -#ifndef HAVE_IP6_FH_F_SKIP_RH -/* - * find the offset to specified header or the protocol number of last header - * if target < 0. "last header" is transport protocol header, ESP, or - * "No next header". - * - * Note that *offset is used as input/output parameter. an if it is not zero, - * then it must be a valid offset to an inner IPv6 header. This can be used - * to explore inner IPv6 header, eg. ICMPv6 error messages. - * - * If target header is found, its offset is set in *offset and return protocol - * number. Otherwise, return -1. - * - * If the first fragment doesn't contain the final protocol header or - * NEXTHDR_NONE it is considered invalid. - * - * Note that non-1st fragment is special case that "the protocol number - * of last header" is "next header" field in Fragment header. In this case, - * *offset is meaningless and fragment offset is stored in *fragoff if fragoff - * isn't NULL. - * - * if flags is not NULL and it's a fragment, then the frag flag - * IP6_FH_F_FRAG will be set. If it's an AH header, the - * IP6_FH_F_AUTH flag is set and target < 0, then this function will - * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this - * function will skip all those routing headers, where segements_left was 0. - */ -int rpl_ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, - int target, unsigned short *fragoff, int *flags) -{ - unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - unsigned int len; - bool found; - - if (fragoff) - *fragoff = 0; - - if (*offset) { - struct ipv6hdr _ip6, *ip6; - - ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6); - if (!ip6 || (ip6->version != 6)) { - printk(KERN_ERR "IPv6 header not found\n"); - return -EBADMSG; - } - start = *offset + sizeof(struct ipv6hdr); - nexthdr = ip6->nexthdr; - } - len = skb->len - start; - - do { - struct ipv6_opt_hdr _hdr, *hp; - unsigned int hdrlen; - found = (nexthdr == target); - - if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) { - if (target < 0 || found) - break; - return -ENOENT; - } - - hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); - if (hp == NULL) - return -EBADMSG; - - if (nexthdr == NEXTHDR_ROUTING) { - struct ipv6_rt_hdr _rh, *rh; - - rh = skb_header_pointer(skb, start, sizeof(_rh), - &_rh); - if (rh == NULL) - return -EBADMSG; - - if (flags && (*flags & IP6_FH_F_SKIP_RH) && - rh->segments_left == 0) - found = false; - } - - if (nexthdr == NEXTHDR_FRAGMENT) { - unsigned short _frag_off; - __be16 *fp; - - if (flags) /* Indicate that this is a fragment */ - *flags |= IP6_FH_F_FRAG; - fp = skb_header_pointer(skb, - start+offsetof(struct frag_hdr, - frag_off), - sizeof(_frag_off), - &_frag_off); - if (fp == NULL) - return -EBADMSG; - - _frag_off = ntohs(*fp) & ~0x7; - if (_frag_off) { - if (target < 0 && - ((!ipv6_ext_hdr(hp->nexthdr)) || - hp->nexthdr == NEXTHDR_NONE)) { - if (fragoff) - *fragoff = _frag_off; - return hp->nexthdr; - } - return -ENOENT; - } - hdrlen = 8; - } else if (nexthdr == NEXTHDR_AUTH) { - if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0)) - break; - hdrlen = (hp->hdrlen + 2) << 2; - } else - hdrlen = ipv6_optlen(hp); - - if (!found) { - nexthdr = hp->nexthdr; - len -= hdrlen; - start += hdrlen; - } - } while (!found); - - *offset = start; - return nexthdr; -} -EXPORT_SYMBOL_GPL(rpl_ipv6_find_hdr); - -#endif diff --git a/datapath/linux/compat/genetlink-openvswitch.c b/datapath/linux/compat/genetlink-openvswitch.c deleted file mode 100644 index 5b0ecfa8d..000000000 --- a/datapath/linux/compat/genetlink-openvswitch.c +++ /dev/null @@ -1,55 +0,0 @@ -#include <net/genetlink.h> -#include <linux/version.h> - -#ifndef HAVE_GENL_NOTIFY_TAKES_FAMILY -int rpl___genl_register_family(struct rpl_genl_family *f) -{ - int err; - - f->compat_family.id = f->id; - f->compat_family.hdrsize = f->hdrsize; - strncpy(f->compat_family.name, f->name, GENL_NAMSIZ); - f->compat_family.version = f->version; - f->compat_family.maxattr = f->maxattr; - f->compat_family.netnsok = f->netnsok; -#ifdef HAVE_PARALLEL_OPS - f->compat_family.parallel_ops = f->parallel_ops; -#endif - err = genl_register_family_with_ops(&f->compat_family, - (struct genl_ops *) f->ops, f->n_ops); - if (err) - goto error; - - if (f->mcgrps) { - /* Need to Fix GROUP_ID() for more than one group. */ - BUG_ON(f->n_mcgrps > 1); - err = genl_register_mc_group(&f->compat_family, - (struct genl_multicast_group *) f->mcgrps); - if (err) - goto error; - } -error: - return err; - -} -EXPORT_SYMBOL_GPL(rpl___genl_register_family); -#endif /* HAVE_GENL_NOTIFY_TAKES_FAMILY */ - -#ifdef HAVE_GENL_NOTIFY_TAKES_NET - -#undef genl_notify - -void rpl_genl_notify(struct genl_family *family, struct sk_buff *skb, - struct genl_info *info, u32 group, gfp_t flags) -{ - struct net *net = genl_info_net(info); - u32 portid = info->snd_portid; - struct nlmsghdr *nlh = info->nlhdr; - -#ifdef HAVE_GENL_NOTIFY_TAKES_FAMILY - genl_notify(family, skb, net, portid, group, nlh, flags); -#else - genl_notify(skb, net, portid, group, nlh, flags); -#endif -} -#endif /* HAVE_GENL_NOTIFY_TAKES_NET */ diff --git a/datapath/linux/compat/geneve.c b/datapath/linux/compat/geneve.c deleted file mode 100644 index 02c6403e6..000000000 --- a/datapath/linux/compat/geneve.c +++ /dev/null @@ -1,1854 +0,0 @@ -/* - * GENEVE: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2015 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/hash.h> -#include <linux/if_link.h> -#include <linux/if_vlan.h> - -#include <net/addrconf.h> -#include <net/dst_cache.h> -#include <net/dst_metadata.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/rtnetlink.h> -#include <net/geneve.h> -#include <net/protocol.h> -#include <net/udp_tunnel.h> -#include <net/ip6_route.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/addrconf.h> -#include <net/ip6_tunnel.h> -#include <net/ip6_checksum.h> -#endif - - -#include "gso.h" -#include "vport-netdev.h" -#include "compat.h" - -#ifndef USE_UPSTREAM_TUNNEL - -#define GENEVE_NETDEV_VER "0.6" - -#define GENEVE_UDP_PORT 6081 - -#define GENEVE_N_VID (1u << 24) -#define GENEVE_VID_MASK (GENEVE_N_VID - 1) - -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) - -#define GENEVE_VER 0 -#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head geneve_list; - struct list_head sock_list; -}; - -static int geneve_net_id; - -union geneve_addr { - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - struct sockaddr sa; -}; - -static union geneve_addr geneve_remote_unspec = { .sa.sa_family = AF_UNSPEC, }; - -/* Pseudo network device */ -struct geneve_dev { - struct hlist_node hlist; /* vni hash table */ - struct net *net; /* netns for packet i/o */ - struct net_device *dev; /* netdev for geneve tunnel */ - struct geneve_sock __rcu *sock4; /* IPv4 socket used for geneve tunnel */ -#if IS_ENABLED(CONFIG_IPV6) - struct geneve_sock __rcu *sock6; /* IPv6 socket used for geneve tunnel */ -#endif - u8 vni[3]; /* virtual network ID for tunnel */ - u8 ttl; /* TTL override */ - u8 tos; /* TOS override */ - union geneve_addr remote; /* IP address for link partner */ - struct list_head next; /* geneve's per namespace list */ - __be32 label; /* IPv6 flowlabel override */ - __be16 dst_port; - bool collect_md; - u32 flags; - struct dst_cache dst_cache; -}; - -/* Geneve device flags */ -#define GENEVE_F_UDP_ZERO_CSUM_TX BIT(0) -#define GENEVE_F_UDP_ZERO_CSUM6_TX BIT(1) -#define GENEVE_F_UDP_ZERO_CSUM6_RX BIT(2) - -struct geneve_sock { - bool collect_md; - struct list_head list; - struct socket *sock; - struct rcu_head rcu; - int refcnt; - struct hlist_head vni_list[VNI_HASH_SIZE]; - u32 flags; -#ifdef HAVE_UDP_OFFLOAD - struct udp_offload udp_offloads; -#endif -}; - -static inline __u32 geneve_net_vni_hash(u8 vni[3]) -{ - __u32 vnid; - - vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2]; - return hash_32(vnid, VNI_HASH_BITS); -} - -static __be64 vni_to_tunnel_id(const __u8 *vni) -{ -#ifdef __BIG_ENDIAN - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -#else - return (__force __be64)(((__force u64)vni[0] << 40) | - ((__force u64)vni[1] << 48) | - ((__force u64)vni[2] << 56)); -#endif -} - -static sa_family_t geneve_get_sk_family(struct geneve_sock *gs) -{ - return gs->sock->sk->sk_family; -} - -static struct geneve_dev *geneve_lookup(struct geneve_sock *gs, - __be32 addr, u8 vni[]) -{ - struct hlist_head *vni_list_head; - struct geneve_dev *geneve; - __u32 hash; - - /* Find the device for this VNI */ - hash = geneve_net_vni_hash(vni); - vni_list_head = &gs->vni_list[hash]; - hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { - if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && - addr == geneve->remote.sin.sin_addr.s_addr) - return geneve; - } - return NULL; -} - -#if IS_ENABLED(CONFIG_IPV6) -static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs, - struct in6_addr addr6, u8 vni[]) -{ - struct hlist_head *vni_list_head; - struct geneve_dev *geneve; - __u32 hash; - - /* Find the device for this VNI */ - hash = geneve_net_vni_hash(vni); - vni_list_head = &gs->vni_list[hash]; - hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { - if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && - ipv6_addr_equal(&addr6, &geneve->remote.sin6.sin6_addr)) - return geneve; - } - return NULL; -} -#endif - -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - -static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs, - struct sk_buff *skb) -{ - u8 *vni; - __be32 addr; - static u8 zero_vni[3]; -#if IS_ENABLED(CONFIG_IPV6) - static struct in6_addr zero_addr6; -#endif - - if (geneve_get_sk_family(gs) == AF_INET) { - struct iphdr *iph; - - iph = ip_hdr(skb); /* outer IP header... */ - - if (gs->collect_md) { - vni = zero_vni; - addr = 0; - } else { - vni = geneve_hdr(skb)->vni; - addr = iph->saddr; - } - - return geneve_lookup(gs, addr, vni); -#if IS_ENABLED(CONFIG_IPV6) - } else if (geneve_get_sk_family(gs) == AF_INET6) { - struct ipv6hdr *ip6h; - struct in6_addr addr6; - - ip6h = ipv6_hdr(skb); /* outer IPv6 header... */ - - if (gs->collect_md) { - vni = zero_vni; - addr6 = zero_addr6; - } else { - vni = geneve_hdr(skb)->vni; - addr6 = ip6h->saddr; - } - - return geneve6_lookup(gs, addr6, vni); -#endif - } - return NULL; -} - -/* geneve receive/decap routine */ -static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs, - struct sk_buff *skb) -{ - struct genevehdr *gnvh = geneve_hdr(skb); - struct metadata_dst *tun_dst = NULL; - struct pcpu_sw_netstats *stats; - int err = 0; - void *oiph; - union { - struct metadata_dst dst; - char buf[sizeof(struct metadata_dst) + 256]; - } buf; - - if (ip_tunnel_collect_metadata() || gs->collect_md) { - __be16 flags; - - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (gnvh->oam ? TUNNEL_OAM : 0) | - (gnvh->critical ? TUNNEL_CRIT_OPT : 0); - - tun_dst = &buf.dst; - ovs_udp_tun_rx_dst(tun_dst, - skb, geneve_get_sk_family(gs), flags, - vni_to_tunnel_id(gnvh->vni), - gnvh->opt_len * 4); - if (!tun_dst) - goto drop; - /* Update tunnel dst according to Geneve options. */ - ip_tunnel_info_opts_set(&tun_dst->u.tun_info, - gnvh->options, gnvh->opt_len * 4, - TUNNEL_GENEVE_OPT); - } else { - /* Drop packets w/ critical options, - * since we don't support any... - */ - if (gnvh->critical) - goto drop; - } - - skb_reset_mac_header(skb); - skb->protocol = eth_type_trans(skb, geneve->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - if (tun_dst) - ovs_skb_dst_set(skb, &tun_dst->dst); - - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) - goto drop; - - oiph = skb_network_header(skb); - skb_reset_network_header(skb); - - if (geneve_get_sk_family(gs) == AF_INET) - err = IP_ECN_decapsulate(oiph, skb); -#if IS_ENABLED(CONFIG_IPV6) - else - err = IP6_ECN_decapsulate(oiph, skb); -#endif - if (unlikely(err > 1)) { - ++geneve->dev->stats.rx_frame_errors; - ++geneve->dev->stats.rx_errors; - goto drop; - } - - stats = this_cpu_ptr(geneve->dev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - - netdev_port_receive(skb, skb_tunnel_info(skb)); - return; -drop: - /* Consume bad packet */ - kfree_skb(skb); -} - -/* Setup stats when device is created */ -static int geneve_init(struct net_device *dev) -{ - struct geneve_dev *geneve = netdev_priv(dev); - int err; - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - err = dst_cache_init(&geneve->dst_cache, GFP_KERNEL); - if (err) { - free_percpu(dev->tstats); - return err; - } - - return 0; -} - -static void geneve_uninit(struct net_device *dev) -{ - struct geneve_dev *geneve = netdev_priv(dev); - - dst_cache_destroy(&geneve->dst_cache); - free_percpu(dev->tstats); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_dev *geneve; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto drop; - - /* Return packets with reserved bits set */ - geneveh = geneve_hdr(skb); - if (unlikely(geneveh->ver != GENEVE_VER)) - goto drop; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - -#if IS_ENABLED(CONFIG_IPV6) -#ifdef OVS_CHECK_UDP_TUNNEL_ZERO_CSUM - if (geneve_get_sk_family(gs) == AF_INET6 && - !udp_hdr(skb)->check && - !(gs->flags & GENEVE_F_UDP_ZERO_CSUM6_RX)) { - udp6_csum_zero_error(skb); - goto drop; - } -#endif -#endif - geneve = geneve_lookup_skb(gs, skb); - if (!geneve) - goto drop; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB), - !net_eq(geneve->net, dev_net(geneve->dev)))) - goto drop; - - geneve_rx(geneve, gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port, u32 flags) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - udp_conf.ipv6_v6only = 1; - udp_conf.use_udp6_rx_checksums = - !(flags & GENEVE_F_UDP_ZERO_CSUM6_RX); - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct net_device *dev; - struct sock *sk = gs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = geneve_get_sk_family(gs); - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(sock_net(sk), &gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { -#ifdef HAVE_NDO_ADD_GENEVE_PORT - __be16 port = inet_sk(sk)->inet_sport; - - if (dev->netdev_ops->ndo_add_geneve_port) - dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, - port); -#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) - struct udp_tunnel_info ti; - ti.type = UDP_TUNNEL_TYPE_GENEVE; - ti.sa_family = sa_family; - ti.port = inet_sk(sk)->inet_sport; - - if (dev->netdev_ops->ndo_udp_tunnel_add) - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); -#endif - } - rcu_read_unlock(); -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct net_device *dev; - struct sock *sk = gs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = geneve_get_sk_family(gs); - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { -#ifdef HAVE_NDO_ADD_GENEVE_PORT - __be16 port = inet_sk(sk)->inet_sport; - - if (dev->netdev_ops->ndo_del_geneve_port) - dev->netdev_ops->ndo_del_geneve_port(dev, sa_family, - port); -#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) - struct udp_tunnel_info ti; - ti.type = UDP_TUNNEL_TYPE_GENEVE; - ti.port = inet_sk(sk)->inet_sport; - ti.sa_family = sa_family; - - if (dev->netdev_ops->ndo_udp_tunnel_del) - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); -#endif - } - - rcu_read_unlock(); - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -#if defined(HAVE_UDP_OFFLOAD) || \ - defined(HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE) - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -#else -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -#endif -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) - goto out_unlock; - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - flush = 0; - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF -static int geneve_gro_complete(struct sk_buff *skb, int nhoff) -#else -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -#endif -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - - skb_set_inner_mac_header(skb, nhoff + gh_len); - return err; -} -#endif - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - bool ipv6, u32 flags) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - int h; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port, flags); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - for (h = 0; h < VNI_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&gs->vni_list[h]); - - /* Initialize the geneve udp offloads structure */ -#ifdef HAVE_UDP_OFFLOAD - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; -#endif - - geneve_notify_add_rx_port(gs); - /* Mark socket as an encapsulation socket */ - memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); - tunnel_cfg.sk_user_data = gs; - tunnel_cfg.encap_type = 1; -#ifdef HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE - tunnel_cfg.gro_receive = geneve_gro_receive; - tunnel_cfg.gro_complete = geneve_gro_complete; -#endif - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - list_add(&gs->list, &gn->sock_list); - return gs; -} - -static void __geneve_sock_release(struct geneve_sock *gs) -{ - if (!gs || --gs->refcnt) - return; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); -} - -static void geneve_sock_release(struct geneve_dev *geneve) -{ - struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4); -#if IS_ENABLED(CONFIG_IPV6) - struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6); - - rcu_assign_pointer(geneve->sock6, NULL); -#endif - - rcu_assign_pointer(geneve->sock4, NULL); - synchronize_net(); - - __geneve_sock_release(gs4); -#if IS_ENABLED(CONFIG_IPV6) - __geneve_sock_release(gs6); -#endif -} - -static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, - sa_family_t family, - __be16 dst_port) -{ - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == dst_port && - geneve_get_sk_family(gs) == family) { - return gs; - } - } - return NULL; -} - -static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6) -{ - struct net *net = geneve->net; - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - __u32 hash; - - gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->dst_port); - if (gs) { - gs->refcnt++; - goto out; - } - - gs = geneve_socket_create(net, geneve->dst_port, ipv6, geneve->flags); - if (IS_ERR(gs)) - return PTR_ERR(gs); - -out: - gs->collect_md = geneve->collect_md; - gs->flags = geneve->flags; -#if IS_ENABLED(CONFIG_IPV6) - if (ipv6) - rcu_assign_pointer(geneve->sock6, gs); - else -#endif - rcu_assign_pointer(geneve->sock4, gs); - - hash = geneve_net_vni_hash(geneve->vni); - hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]); - return 0; -} - -static int geneve_open(struct net_device *dev) -{ - struct geneve_dev *geneve = netdev_priv(dev); - bool ipv6 = geneve->remote.sa.sa_family == AF_INET6; - bool metadata = geneve->collect_md; - int ret = 0; - -#if IS_ENABLED(CONFIG_IPV6) - if (ipv6 || metadata) - ret = geneve_sock_add(geneve, true); -#endif - - if (!ret && (!ipv6 || metadata)) - ret = geneve_sock_add(geneve, false); - if (ret < 0) - geneve_sock_release(geneve); - - return ret; -} - -static int geneve_stop(struct net_device *dev) -{ - struct geneve_dev *geneve = netdev_priv(dev); - - if (!hlist_unhashed(&geneve->hlist)) - hlist_del_rcu(&geneve->hlist); - geneve_sock_release(geneve); - return 0; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -static int push_vlan_tag(struct sk_buff *skb) -{ - if (skb_vlan_tag_present(skb)) { - __be16 vlan_proto = skb->vlan_proto; - int err; - - err = __vlan_insert_tag(skb, skb->vlan_proto, - skb_vlan_tag_get(skb)); - - if (unlikely(err)) - return err; - skb->vlan_tci = 0; - skb->protocol = vlan_proto; - } - return 0; -} - -static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - u32 flags, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM_TX); - - skb_scrub_packet(skb, xnet); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) - goto free_rt; - - err = push_vlan_tag(skb); - if (unlikely(err)) - goto free_rt; - - err = udp_tunnel_handle_offloads(skb, udp_sum); - if (err) - goto free_rt; - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - return 0; - -free_rt: - ip_rt_put(rt); - return err; -} - -#if IS_ENABLED(CONFIG_IPV6) -static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - u32 flags, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - bool udp_sum = !(flags & GENEVE_F_UDP_ZERO_CSUM6_TX); - - skb_scrub_packet(skb, xnet); - - min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct ipv6hdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) - goto free_dst; - - err = push_vlan_tag(skb); - if (unlikely(err)) - goto free_dst; - - err = udp_tunnel_handle_offloads(skb, udp_sum); - if (err) - goto free_dst; - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - return 0; - -free_dst: - dst_release(dst); - return err; -} -#endif - -static struct rtable *geneve_get_v4_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl4, - struct ip_tunnel_info *info, - __be16 dport, __be16 sport) -{ - bool use_cache = ip_tunnel_dst_cache_usable(skb, info); - struct geneve_dev *geneve = netdev_priv(dev); - struct dst_cache *dst_cache; - struct rtable *rt = NULL; - __u8 tos; - - if (!rcu_dereference(geneve->sock4)) - return ERR_PTR(-EIO); - - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_mark = skb->mark; - fl4->flowi4_proto = IPPROTO_UDP; - fl4->fl4_dport = dport; - fl4->fl4_sport = sport; - - if (info) { - fl4->daddr = info->key.u.ipv4.dst; - fl4->saddr = info->key.u.ipv4.src; - fl4->flowi4_tos = RT_TOS(info->key.tos); - dst_cache = &info->dst_cache; - } else { - tos = geneve->tos; - if (tos == 1) { - const struct iphdr *iip = ip_hdr(skb); - - tos = ip_tunnel_get_dsfield(iip, skb); - use_cache = false; - } - - fl4->flowi4_tos = RT_TOS(tos); - fl4->daddr = geneve->remote.sin.sin_addr.s_addr; - dst_cache = &geneve->dst_cache; - } - - if (use_cache) { - rt = dst_cache_get_ip4(dst_cache, &fl4->saddr); - if (rt) - return rt; - } - - rt = ip_route_output_key(geneve->net, fl4); - if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr); - return ERR_PTR(-ENETUNREACH); - } - if (rt->dst.dev == dev) { /* is this necessary? */ - netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr); - ip_rt_put(rt); - return ERR_PTR(-ELOOP); - } - if (use_cache) - dst_cache_set_ip4(dst_cache, &rt->dst, fl4->saddr); - return rt; -} - -#if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, - struct net_device *dev, - struct flowi6 *fl6, - struct ip_tunnel_info *info, - __be16 dport, __be16 sport) -{ - bool use_cache = ip_tunnel_dst_cache_usable(skb, info); - struct geneve_dev *geneve = netdev_priv(dev); - struct dst_entry *dst = NULL; - struct dst_cache *dst_cache; - struct geneve_sock *gs6; - __u8 prio; - - gs6 = rcu_dereference(geneve->sock6); - if (!gs6) - return ERR_PTR(-EIO); - - memset(fl6, 0, sizeof(*fl6)); - fl6->flowi6_mark = skb->mark; - fl6->flowi6_proto = IPPROTO_UDP; - fl6->fl6_dport = dport; - fl6->fl6_sport = sport; - - if (info) { - fl6->daddr = info->key.u.ipv6.dst; - fl6->saddr = info->key.u.ipv6.src; - fl6->flowlabel = ip6_make_flowinfo(RT_TOS(info->key.tos), - info->key.label); - dst_cache = &info->dst_cache; - } else { - prio = geneve->tos; - if (prio == 1) { - const struct iphdr *iip = ip_hdr(skb); - - prio = ip_tunnel_get_dsfield(iip, skb); - use_cache = false; - } - - fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio), - geneve->label); - fl6->daddr = geneve->remote.sin6.sin6_addr; - dst_cache = &geneve->dst_cache; - } - - if (use_cache) { - dst = dst_cache_get_ip6(dst_cache, &fl6->saddr); - if (dst) - return dst; - } - -#if defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) && defined(HAVE_IPV6_DST_LOOKUP_FLOW) -#ifdef HAVE_IPV6_DST_LOOKUP_FLOW_NET - dst = ipv6_stub->ipv6_dst_lookup_flow(geneve->net, gs6->sock->sk, fl6, - NULL); -#else - dst = ipv6_stub->ipv6_dst_lookup_flow(gs6->sock->sk, fl6, - NULL); -#endif - if (IS_ERR(dst)) { -#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW_NET) - if (ipv6_stub->ipv6_dst_lookup_flow(geneve->net, gs6->sock->sk, &dst, - fl6)) { -#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW) - if (ipv6_stub->ipv6_dst_lookup_flow(gs6->sock->sk, &dst, fl6)) { -#elif defined(HAVE_IPV6_DST_LOOKUP_NET) - if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { -#elif defined(HAVE_IPV6_STUB) - if (ipv6_stub->ipv6_dst_lookup(gs6->sock->sk, &dst, fl6)) { -#else - if (ip6_dst_lookup(gs6->sock->sk, &dst, fl6)) { -#endif - netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr); - return ERR_PTR(-ENETUNREACH); - } - if (dst->dev == dev) { /* is this necessary? */ - netdev_dbg(dev, "circular route to %pI6\n", &fl6->daddr); - dst_release(dst); - return ERR_PTR(-ELOOP); - } - - if (use_cache) - dst_cache_set_ip6(dst_cache, dst, &fl6->saddr); - return dst; -} -#endif - -/* Convert 64 bit tunnel ID to 24 bit VNI. */ -static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) -{ -#ifdef __BIG_ENDIAN - vni[0] = (__force __u8)(tun_id >> 16); - vni[1] = (__force __u8)(tun_id >> 8); - vni[2] = (__force __u8)tun_id; -#else - vni[0] = (__force __u8)((__force u64)tun_id >> 40); - vni[1] = (__force __u8)((__force u64)tun_id >> 48); - vni[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -static netdev_tx_t geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, - struct ip_tunnel_info *info) -{ - struct geneve_dev *geneve = netdev_priv(dev); - struct geneve_sock *gs4; - struct rtable *rt = NULL; - const struct iphdr *iip; /* interior IP header */ - int err = -EINVAL; - struct flowi4 fl4; - __u8 tos, ttl; - __be16 sport; - __be16 df; - bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); - u32 flags = geneve->flags; - - gs4 = rcu_dereference(geneve->sock4); - if (!gs4) - goto tx_error; - - if (geneve->collect_md) { - if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { - netdev_dbg(dev, "no tunnel metadata\n"); - goto tx_error; - } - if (info && ip_tunnel_info_af(info) != AF_INET) - goto tx_error; - } - - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); - rt = geneve_get_v4_rt(skb, dev, &fl4, info, geneve->dst_port, sport); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto tx_error; - } - - skb_reset_mac_header(skb); - - iip = ip_hdr(skb); - - if (info) { - const struct ip_tunnel_key *key = &info->key; - u8 *opts = NULL; - u8 vni[3]; - - tunnel_id_to_vni(key->tun_id, vni); - if (info->options_len) - opts = ip_tunnel_info_opts(info); - - if (key->tun_flags & TUNNEL_CSUM) - flags &= ~GENEVE_F_UDP_ZERO_CSUM_TX; - else - flags |= GENEVE_F_UDP_ZERO_CSUM_TX; - - err = geneve_build_skb(rt, skb, key->tun_flags, vni, - info->options_len, opts, flags, xnet); - if (unlikely(err)) - goto tx_error; - - tos = ip_tunnel_ecn_encap(key->tos, iip, skb); - ttl = key->ttl; - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - } else { - err = geneve_build_skb(rt, skb, 0, geneve->vni, - 0, NULL, flags, xnet); - if (unlikely(err)) - goto tx_error; - - tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb); - ttl = geneve->ttl; - if (!ttl && IN_MULTICAST(ntohl(fl4.daddr))) - ttl = 1; - ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - df = 0; - } - udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr, - tos, ttl, df, sport, geneve->dst_port, - !net_eq(geneve->net, dev_net(geneve->dev)), - !!(flags & GENEVE_F_UDP_ZERO_CSUM_TX)); - - return NETDEV_TX_OK; - -tx_error: - dev_kfree_skb(skb); - - if (err == -ELOOP) - dev->stats.collisions++; - else if (err == -ENETUNREACH) - dev->stats.tx_carrier_errors++; - - dev->stats.tx_errors++; - return NETDEV_TX_OK; -} - -#if IS_ENABLED(CONFIG_IPV6) -static netdev_tx_t geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, - struct ip_tunnel_info *info) -{ - struct geneve_dev *geneve = netdev_priv(dev); - struct dst_entry *dst = NULL; - const struct iphdr *iip; /* interior IP header */ - struct geneve_sock *gs6; - int err = -EINVAL; - struct flowi6 fl6; - __u8 prio, ttl; - __be16 sport; - __be32 label; - bool xnet = !net_eq(geneve->net, dev_net(geneve->dev)); - u32 flags = geneve->flags; - - gs6 = rcu_dereference(geneve->sock6); - if (!gs6) - goto tx_error; - - if (geneve->collect_md) { - if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) { - netdev_dbg(dev, "no tunnel metadata\n"); - goto tx_error; - } - } - - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); - dst = geneve_get_v6_dst(skb, dev, &fl6, info, geneve->dst_port, sport); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - goto tx_error; - } - - skb_reset_mac_header(skb); - - iip = ip_hdr(skb); - - if (info) { - const struct ip_tunnel_key *key = &info->key; - u8 *opts = NULL; - u8 vni[3]; - - tunnel_id_to_vni(key->tun_id, vni); - if (info->options_len) - opts = ip_tunnel_info_opts(info); - - if (key->tun_flags & TUNNEL_CSUM) - flags &= ~GENEVE_F_UDP_ZERO_CSUM6_TX; - else - flags |= GENEVE_F_UDP_ZERO_CSUM6_TX; - - err = geneve6_build_skb(dst, skb, key->tun_flags, vni, - info->options_len, opts, - flags, xnet); - if (unlikely(err)) - goto tx_error; - - prio = ip_tunnel_ecn_encap(key->tos, iip, skb); - ttl = key->ttl; - label = info->key.label; - } else { - err = geneve6_build_skb(dst, skb, 0, geneve->vni, - 0, NULL, flags, xnet); - if (unlikely(err)) - goto tx_error; - - prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel), - iip, skb); - ttl = geneve->ttl; - if (!ttl && ipv6_addr_is_multicast(&fl6.daddr)) - ttl = 1; - ttl = ttl ? : ip6_dst_hoplimit(dst); - label = geneve->label; - } - udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev, - &fl6.saddr, &fl6.daddr, prio, ttl, label, - sport, geneve->dst_port, - !!(flags & GENEVE_F_UDP_ZERO_CSUM6_TX)); - return NETDEV_TX_OK; - -tx_error: - dev_kfree_skb(skb); - - if (err == -ELOOP) - dev->stats.collisions++; - else if (err == -ENETUNREACH) - dev->stats.tx_carrier_errors++; - - dev->stats.tx_errors++; - return NETDEV_TX_OK; -} -#endif - -netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct geneve_dev *geneve = netdev_priv(dev); - struct ip_tunnel_info *info = NULL; - - if (geneve->collect_md) - info = skb_tunnel_info(skb); - -#if IS_ENABLED(CONFIG_IPV6) - if ((info && ip_tunnel_info_af(info) == AF_INET6) || - (!info && geneve->remote.sa.sa_family == AF_INET6)) - return geneve6_xmit_skb(skb, dev, info); -#endif - return geneve_xmit_skb(skb, dev, info); -} -EXPORT_SYMBOL_GPL(rpl_geneve_xmit); - -static netdev_tx_t geneve_dev_xmit(struct sk_buff *skb, struct net_device *dev) -{ - /* Drop All packets coming from networking stack. OVS-CB is - * not initialized for these packets. - */ - - dev_kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -} - -static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool strict) -{ - struct geneve_dev *geneve = netdev_priv(dev); - /* The max_mtu calculation does not take account of GENEVE - * options, to avoid excluding potentially valid - * configurations. - */ - int max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len; - - if (geneve->remote.sa.sa_family == AF_INET6) - max_mtu -= sizeof(struct ipv6hdr); - else - max_mtu -= sizeof(struct iphdr); - - if (new_mtu < 68) - return -EINVAL; - - if (new_mtu > max_mtu) { - if (strict) - return -EINVAL; - - new_mtu = max_mtu; - } - - dev->mtu = new_mtu; - return 0; -} - -static int geneve_change_mtu(struct net_device *dev, int new_mtu) -{ - return __geneve_change_mtu(dev, new_mtu, true); -} - -int ovs_geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) -{ - struct ip_tunnel_info *info = skb_tunnel_info(skb); - struct geneve_dev *geneve = netdev_priv(dev); - struct rtable *rt; - struct flowi4 fl4; - __be16 sport; -#if IS_ENABLED(CONFIG_IPV6) - struct dst_entry *dst; - struct flowi6 fl6; -#endif - - sport = udp_flow_src_port(geneve->net, skb, - 1, USHRT_MAX, true); - - if (ip_tunnel_info_af(info) == AF_INET) { - rt = geneve_get_v4_rt(skb, dev, &fl4, info, geneve->dst_port, sport); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - ip_rt_put(rt); - info->key.u.ipv4.src = fl4.saddr; -#if IS_ENABLED(CONFIG_IPV6) - } else if (ip_tunnel_info_af(info) == AF_INET6) { - dst = geneve_get_v6_dst(skb, dev, &fl6, info, geneve->dst_port, sport); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - dst_release(dst); - info->key.u.ipv6.src = fl6.saddr; -#endif - } else { - return -EINVAL; - } - - info->key.tp_src = sport; - info->key.tp_dst = geneve->dst_port; - return 0; -} -EXPORT_SYMBOL_GPL(ovs_geneve_fill_metadata_dst); - -static const struct net_device_ops geneve_netdev_ops = { - .ndo_init = geneve_init, - .ndo_uninit = geneve_uninit, - .ndo_open = geneve_open, - .ndo_stop = geneve_stop, - .ndo_start_xmit = geneve_dev_xmit, - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = geneve_change_mtu, -#else - .ndo_change_mtu = geneve_change_mtu, -#endif - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = geneve_fill_metadata_dst, -#endif -}; - -static void geneve_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *drvinfo) -{ - strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version)); - strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver)); -} - -static const struct ethtool_ops geneve_ethtool_ops = { - .get_drvinfo = geneve_get_drvinfo, - .get_link = ethtool_op_get_link, -}; - -/* Info for udev, that this is a virtual tunnel endpoint */ -static struct device_type geneve_type = { - .name = "geneve", -}; - -/* Calls the ndo_add_geneve_port or ndo_udp_tunnel_add of the caller - * in order to supply the listening GENEVE udp ports. Callers are - * expected to implement the ndo_add_geneve_port. - */ -static void geneve_push_rx_ports(struct net_device *dev) -{ -#ifdef HAVE_NDO_ADD_GENEVE_PORT - struct net *net = dev_net(dev); - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - sa_family_t sa_family; - struct sock *sk; - __be16 port; - - if (!dev->netdev_ops->ndo_add_geneve_port) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(gs, &gn->sock_list, list) { - sk = gs->sock->sk; - sa_family = sk->sk_family; - port = inet_sk(sk)->inet_sport; - dev->netdev_ops->ndo_add_geneve_port(dev, sa_family, port); - } - rcu_read_unlock(); -#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) - struct net *net = dev_net(dev); - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct sock *sk; - - if (!dev->netdev_ops->ndo_udp_tunnel_add) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(gs, &gn->sock_list, list) { - struct udp_tunnel_info ti; - ti.type = UDP_TUNNEL_TYPE_GENEVE; - sk = gs->sock->sk; - ti.port = inet_sk(sk)->inet_sport; - ti.sa_family = sk->sk_family; - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); - } - rcu_read_unlock(); -#endif -} - -/* Initialize the device structure. */ -static void geneve_setup(struct net_device *dev) -{ - ether_setup(dev); - - dev->netdev_ops = &geneve_netdev_ops; - dev->ethtool_ops = &geneve_ethtool_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; -#endif - - SET_NETDEV_DEVTYPE(dev, &geneve_type); - - dev->features |= NETIF_F_LLTX; - dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; - dev->features |= NETIF_F_RXCSUM; - dev->features |= NETIF_F_GSO_SOFTWARE; - - dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - -#if 0 - netif_keep_dst(dev); -#endif - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; - eth_hw_addr_random(dev); -} - -static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { - [IFLA_GENEVE_ID] = { .type = NLA_U32 }, - [IFLA_GENEVE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, - [IFLA_GENEVE_REMOTE6] = { .len = sizeof(struct in6_addr) }, - [IFLA_GENEVE_TTL] = { .type = NLA_U8 }, - [IFLA_GENEVE_TOS] = { .type = NLA_U8 }, - [IFLA_GENEVE_LABEL] = { .type = NLA_U32 }, - [IFLA_GENEVE_PORT] = { .type = NLA_U16 }, - [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG }, - [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, - [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, - [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, -}; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) - return -EINVAL; - - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) - return -EADDRNOTAVAIL; - } - - if (!data) - return -EINVAL; - - if (data[IFLA_GENEVE_ID]) { - __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]); - - if (vni >= GENEVE_VID_MASK) - return -ERANGE; - } - - return 0; -} - -static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, - __be16 dst_port, - union geneve_addr *remote, - u8 vni[], - bool *tun_on_same_port, - bool *tun_collect_md) -{ - struct geneve_dev *geneve, *t; - - *tun_on_same_port = false; - *tun_collect_md = false; - t = NULL; - list_for_each_entry(geneve, &gn->geneve_list, next) { - if (geneve->dst_port == dst_port) { - *tun_collect_md = geneve->collect_md; - *tun_on_same_port = true; - } - if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && - !memcmp(remote, &geneve->remote, sizeof(geneve->remote)) && - dst_port == geneve->dst_port) - t = geneve; - } - return t; -} - -static int geneve_configure(struct net *net, struct net_device *dev, - union geneve_addr *remote, - __u32 vni, __u8 ttl, __u8 tos, __be32 label, - __be16 dst_port, bool metadata, u32 flags) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_dev *t, *geneve = netdev_priv(dev); - bool tun_collect_md, tun_on_same_port; - int err, encap_len; - - if (!remote) - return -EINVAL; - if (metadata && - (remote->sa.sa_family != AF_UNSPEC || vni || tos || ttl || label)) - return -EINVAL; - - geneve->net = net; - geneve->dev = dev; - - geneve->vni[0] = (vni & 0x00ff0000) >> 16; - geneve->vni[1] = (vni & 0x0000ff00) >> 8; - geneve->vni[2] = vni & 0x000000ff; - - if ((remote->sa.sa_family == AF_INET && - IN_MULTICAST(ntohl(remote->sin.sin_addr.s_addr))) || - (remote->sa.sa_family == AF_INET6 && - ipv6_addr_is_multicast(&remote->sin6.sin6_addr))) - return -EINVAL; - if (label && remote->sa.sa_family != AF_INET6) - return -EINVAL; - - geneve->remote = *remote; - - geneve->ttl = ttl; - geneve->tos = tos; - geneve->label = label; - geneve->dst_port = dst_port; - geneve->collect_md = metadata; - geneve->flags = flags; - - t = geneve_find_dev(gn, dst_port, remote, geneve->vni, - &tun_on_same_port, &tun_collect_md); - if (t) - return -EBUSY; - - /* make enough headroom for basic scenario */ - encap_len = GENEVE_BASE_HLEN + ETH_HLEN; - if (remote->sa.sa_family == AF_INET) - encap_len += sizeof(struct iphdr); - else - encap_len += sizeof(struct ipv6hdr); - dev->needed_headroom = encap_len + ETH_HLEN; - - if (metadata) { - if (tun_on_same_port) - return -EPERM; - } else { - if (tun_collect_md) - return -EPERM; - } - - dst_cache_reset(&geneve->dst_cache); - - err = register_netdevice(dev); - if (err) - return err; - - list_add(&geneve->next, &gn->geneve_list); - return 0; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int geneve_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - __be16 dst_port = htons(GENEVE_UDP_PORT); - __u8 ttl = 0, tos = 0; - bool metadata = false; - union geneve_addr remote = geneve_remote_unspec; - __be32 label = 0; - __u32 vni = 0; - u32 flags = 0; - - if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6]) - return -EINVAL; - - if (data[IFLA_GENEVE_REMOTE]) { - remote.sa.sa_family = AF_INET; - remote.sin.sin_addr.s_addr = - nla_get_in_addr(data[IFLA_GENEVE_REMOTE]); - } - - if (data[IFLA_GENEVE_REMOTE6]) { - if (!IS_ENABLED(CONFIG_IPV6)) - return -EPFNOSUPPORT; - - remote.sa.sa_family = AF_INET6; - remote.sin6.sin6_addr = - nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]); - - if (ipv6_addr_type(&remote.sin6.sin6_addr) & - IPV6_ADDR_LINKLOCAL) { - netdev_dbg(dev, "link-local remote is unsupported\n"); - return -EINVAL; - } - } - - if (data[IFLA_GENEVE_ID]) - vni = nla_get_u32(data[IFLA_GENEVE_ID]); - - if (data[IFLA_GENEVE_TTL]) - ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); - - if (data[IFLA_GENEVE_TOS]) - tos = nla_get_u8(data[IFLA_GENEVE_TOS]); - - if (data[IFLA_GENEVE_LABEL]) - label = nla_get_be32(data[IFLA_GENEVE_LABEL]) & - IPV6_FLOWLABEL_MASK; - - if (data[IFLA_GENEVE_PORT]) - dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]); - - if (data[IFLA_GENEVE_COLLECT_METADATA]) - metadata = true; - - if (data[IFLA_GENEVE_UDP_CSUM] && - !nla_get_u8(data[IFLA_GENEVE_UDP_CSUM])) - flags |= GENEVE_F_UDP_ZERO_CSUM_TX; - - if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] && - nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX])) - flags |= GENEVE_F_UDP_ZERO_CSUM6_TX; - - if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] && - nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX])) - flags |= GENEVE_F_UDP_ZERO_CSUM6_RX; - - return geneve_configure(net, dev, &remote, vni, ttl, tos, label, - dst_port, metadata, flags); -} - -static void geneve_dellink(struct net_device *dev, struct list_head *head) -{ - struct geneve_dev *geneve = netdev_priv(dev); - - list_del(&geneve->next); - unregister_netdevice_queue(dev, head); -} - -static size_t geneve_get_size(const struct net_device *dev) -{ - return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */ - nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */ - nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */ - nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */ - nla_total_size(sizeof(__be32)) + /* IFLA_GENEVE_LABEL */ - nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */ - nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */ - nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ - nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ - nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ - 0; -} - -static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct geneve_dev *geneve = netdev_priv(dev); - __u32 vni; - - vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2]; - if (nla_put_u32(skb, IFLA_GENEVE_ID, vni)) - goto nla_put_failure; - - if (geneve->remote.sa.sa_family == AF_INET) { - if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE, - geneve->remote.sin.sin_addr.s_addr)) - goto nla_put_failure; -#if IS_ENABLED(CONFIG_IPV6) - } else { - if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6, - &geneve->remote.sin6.sin6_addr)) - goto nla_put_failure; -#endif - } - - if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) || - nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos) || - nla_put_be32(skb, IFLA_GENEVE_LABEL, geneve->label)) - goto nla_put_failure; - - if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port)) - goto nla_put_failure; - - if (geneve->collect_md) { - if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA)) - goto nla_put_failure; - } - - if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM, - !(geneve->flags & GENEVE_F_UDP_ZERO_CSUM_TX)) || - nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX, - !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_TX)) || - nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, - !!(geneve->flags & GENEVE_F_UDP_ZERO_CSUM6_RX))) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static struct rtnl_link_ops geneve_link_ops __read_mostly = { - .kind = "ovs_geneve", - .maxtype = IFLA_GENEVE_MAX, - .policy = geneve_policy, - .priv_size = sizeof(struct geneve_dev), - .setup = geneve_setup, - .validate = geneve_validate, - .newlink = geneve_newlink, - .dellink = geneve_dellink, - .get_size = geneve_get_size, - .fill_info = geneve_fill_info, -}; - -struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - LIST_HEAD(list_kill); - int err; - - memset(tb, 0, sizeof(tb)); - dev = rtnl_create_link(net, name, name_assign_type, - &geneve_link_ops, tb); - if (IS_ERR(dev)) - return dev; - - err = geneve_configure(net, dev, &geneve_remote_unspec, - 0, 0, 0, 0, htons(dst_port), true, - GENEVE_F_UDP_ZERO_CSUM6_RX); - if (err) { - free_netdev(dev); - return ERR_PTR(err); - } - - /* openvswitch users expect packet sizes to be unrestricted, - * so set the largest MTU we can. - */ - err = __geneve_change_mtu(dev, IP_MAX_MTU, false); - if (err) - goto err; - - err = rtnl_configure_link(dev, NULL); - if (err < 0) - goto err; - - return dev; - - err: - geneve_dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(rpl_geneve_dev_create_fb); - -static int geneve_netdevice_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - if (event == NETDEV_OFFLOAD_PUSH_GENEVE) - geneve_push_rx_ports(dev); - - return NOTIFY_DONE; -} - -static struct notifier_block geneve_notifier_block __read_mostly = { - .notifier_call = geneve_netdevice_event, -}; - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->geneve_list); - INIT_LIST_HEAD(&gn->sock_list); - return 0; -} - -static void __net_exit geneve_exit_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_dev *geneve, *next; - struct net_device *dev, *aux; - LIST_HEAD(list); - - rtnl_lock(); - - /* gather any geneve devices that were moved into this ns */ - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &geneve_link_ops) - unregister_netdevice_queue(dev, &list); - - /* now gather any other geneve devices that were created in this ns */ - list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) { - /* If geneve->dev is in the same netns, it was already added - * to the list by the previous loop. - */ - if (!net_eq(dev_net(geneve->dev), net)) - unregister_netdevice_queue(geneve->dev, &list); - } - - /* unregister the devices gathered above */ - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .exit = geneve_exit_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -int rpl_geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - goto out1; - - rc = register_netdevice_notifier(&geneve_notifier_block); - if (rc) - goto out2; - - rc = rtnl_link_register(&geneve_link_ops); - if (rc) - goto out3; - - pr_info("Geneve tunneling driver\n"); - return 0; - -out3: - unregister_netdevice_notifier(&geneve_notifier_block); -out2: - unregister_pernet_subsys(&geneve_net_ops); -out1: - pr_err("Error while initializing GENEVE %d\n", rc); - return rc; -} - -void rpl_geneve_cleanup_module(void) -{ - rtnl_link_unregister(&geneve_link_ops); - unregister_netdevice_notifier(&geneve_notifier_block); - unregister_pernet_subsys(&geneve_net_ops); -} - -#endif diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c deleted file mode 100644 index e57528f80..000000000 --- a/datapath/linux/compat/gre.c +++ /dev/null @@ -1,239 +0,0 @@ -/* - * Copyright (c) 2007-2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/version.h> -#include <linux/kconfig.h> -#include <linux/module.h> -#include <linux/if.h> -#include <linux/if_tunnel.h> -#include <linux/icmp.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/spinlock.h> - -#include <net/gre.h> -#include <net/icmp.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/xfrm.h> - -#include "gso.h" - -#ifndef USE_UPSTREAM_TUNNEL -#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX) - -static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; - -int rpl_gre_add_protocol(const struct gre_protocol *proto, u8 version) -{ - if (version >= GREPROTO_MAX) - return -EINVAL; - - return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ? - 0 : -EBUSY; -} -EXPORT_SYMBOL_GPL(rpl_gre_add_protocol); - -int rpl_gre_del_protocol(const struct gre_protocol *proto, u8 version) -{ - int ret; - - if (version >= GREPROTO_MAX) - return -EINVAL; - - ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ? - 0 : -EBUSY; - - if (ret) - return ret; - - synchronize_rcu(); - return 0; -} -EXPORT_SYMBOL_GPL(rpl_gre_del_protocol); - -static int gre_rcv(struct sk_buff *skb) -{ - const struct gre_protocol *proto; - u8 ver; - int ret; - - if (!pskb_may_pull(skb, 12)) - goto drop; - - ver = skb->data[1]&0x7f; - if (ver >= GREPROTO_MAX) - goto drop; - - rcu_read_lock(); - proto = rcu_dereference(gre_proto[ver]); - if (!proto || !proto->handler) - goto drop_unlock; - ret = proto->handler(skb); - rcu_read_unlock(); - return ret; - -drop_unlock: - rcu_read_unlock(); -drop: - kfree_skb(skb); - return NET_RX_DROP; -} - -static void gre_err(struct sk_buff *skb, u32 info) -{ - const struct gre_protocol *proto; - const struct iphdr *iph = (const struct iphdr *)skb->data; - u8 ver = skb->data[(iph->ihl<<2) + 1]&0x7f; - - if (ver >= GREPROTO_MAX) - return; - - rcu_read_lock(); - proto = rcu_dereference(gre_proto[ver]); - if (proto && proto->err_handler) - proto->err_handler(skb, info); - rcu_read_unlock(); -} - -static const struct net_protocol net_gre_protocol = { - .handler = gre_rcv, - .err_handler = gre_err, - .netns_ok = 1, -}; - -int rpl_gre_init(void) -{ - pr_info("GRE over IPv4 demultiplexor driver\n"); - - if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { - pr_err("can't add protocol\n"); - return -EEXIST; - } - return 0; -} -EXPORT_SYMBOL_GPL(rpl_gre_init); - -void rpl_gre_exit(void) -{ - inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); -} -EXPORT_SYMBOL_GPL(rpl_gre_exit); - -void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(tpi->flags); - greh->protocol = tpi->proto; - - if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (tpi->flags&TUNNEL_SEQ) { - *ptr = tpi->seq; - ptr--; - } - if (tpi->flags&TUNNEL_KEY) { - *ptr = tpi->key; - ptr--; - } - if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } -} -EXPORT_SYMBOL_GPL(rpl_gre_build_header); - -/* Fills in tpi and returns header length to be pulled. */ -int rpl_gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err, __be16 proto, int nhs) -{ - const struct gre_base_hdr *greh; - __be32 *options; - int hdr_len; - - if (unlikely(!pskb_may_pull(skb, nhs + sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)(skb->data + nhs); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - hdr_len = gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, nhs + hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)(skb->data + nhs); - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { - *csum_err = true; - return -EINVAL; - } - - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else { - tpi->key = 0; - } - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else { - tpi->seq = 0; - } - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IPv4/IPv6 - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = proto; - if ((*(u8 *)options & 0xF0) != 0x40) - hdr_len += 4; - } - tpi->hdr_len = hdr_len; - return hdr_len; -} -EXPORT_SYMBOL(rpl_gre_parse_header); - -#endif /* CONFIG_NET_IPGRE_DEMUX */ -#endif /* USE_UPSTREAM_TUNNEL */ diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c deleted file mode 100644 index 65da5d876..000000000 --- a/datapath/linux/compat/gso.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2007-2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/version.h> - -#include <linux/module.h> -#include <linux/if.h> -#include <linux/if_tunnel.h> -#include <linux/if_vlan.h> -#include <linux/icmp.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/kernel.h> -#include <linux/kmod.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/spinlock.h> - -#include <net/gre.h> -#include <net/icmp.h> -#include <net/mpls.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/xfrm.h> - -#include "gso.h" - -#ifdef OVS_USE_COMPAT_GSO_SEGMENTATION -/* Strictly this is not needed and will be optimised out - * as this code is guarded by if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0). - * It is here to make things explicit should the compatibility - * code be extended in some way prior extending its life-span - * beyond v3.19. - */ -static bool supports_mpls_gso(void) -{ -/* MPLS GSO was introduced in v3.11, however it was not correctly - * activated using mpls_features until v3.19. */ -#ifdef OVS_USE_COMPAT_GSO_SEGMENTATION - return true; -#else - return false; -#endif -} - -int rpl_dev_queue_xmit(struct sk_buff *skb) -{ -#undef dev_queue_xmit - int err = -ENOMEM; - bool mpls; - - mpls = false; - - /* Avoid traversing any VLAN tags that are present to determine if - * the ethtype is MPLS. Instead compare the mac_len (end of L2) and - * skb_network_offset() (beginning of L3) whose inequality will - * indicate the presence of an MPLS label stack. */ - if (skb->mac_len != skb_network_offset(skb) && !supports_mpls_gso()) - mpls = true; - - if (mpls) { - int features; - - features = netif_skb_features(skb); - - /* As of v3.11 the kernel provides an mpls_features field in - * struct net_device which allows devices to advertise which - * features its supports for MPLS. This value defaults to - * NETIF_F_SG and as of v3.19. - * - * This compatibility code is intended for kernels older - * than v3.19 that do not support MPLS GSO and do not - * use mpls_features. Thus this code uses NETIF_F_SG - * directly in place of mpls_features. - */ - if (mpls) - features &= NETIF_F_SG; - - if (netif_needs_gso(skb, features)) { - struct sk_buff *nskb; - - nskb = skb_gso_segment(skb, features); - if (!nskb) { - if (unlikely(skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) - goto drop; - - skb_shinfo(skb)->gso_type &= ~SKB_GSO_DODGY; - goto xmit; - } - - if (IS_ERR(nskb)) { - err = PTR_ERR(nskb); - goto drop; - } - consume_skb(skb); - skb = nskb; - - do { - nskb = skb->next; - skb->next = NULL; - err = dev_queue_xmit(skb); - skb = nskb; - } while (skb); - - return err; - } - } -xmit: - return dev_queue_xmit(skb); - -drop: - kfree_skb(skb); - return err; -} -EXPORT_SYMBOL_GPL(rpl_dev_queue_xmit); -#endif /* OVS_USE_COMPAT_GSO_SEGMENTATION */ - -#ifndef USE_UPSTREAM_TUNNEL_GSO -static __be16 __skb_network_protocol(struct sk_buff *skb) -{ - __be16 type = skb->protocol; - int vlan_depth = ETH_HLEN; - - while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } - - if (eth_p_mpls(type)) - type = ovs_skb_get_inner_protocol(skb); - - return type; -} - -static struct sk_buff *tnl_skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, - bool tx_path, - sa_family_t sa_family) -{ - void *iph = skb_network_header(skb); - int pkt_hlen = skb_inner_network_offset(skb); /* inner l2 + tunnel hdr. */ - int mac_offset = skb_inner_mac_offset(skb); - int outer_l3_offset = skb_network_offset(skb); - int outer_l4_offset = skb_transport_offset(skb); - struct sk_buff *skb1 = skb; - struct dst_entry *dst = skb_dst(skb); - struct sk_buff *segs; - __be16 proto = skb->protocol; - char cb[sizeof(skb->cb)]; - - BUILD_BUG_ON(sizeof(struct ovs_gso_cb) > sizeof_field(struct sk_buff, cb)); - OVS_GSO_CB(skb)->ipv6 = (sa_family == AF_INET6); - /* setup whole inner packet to get protocol. */ - __skb_pull(skb, mac_offset); - skb->protocol = __skb_network_protocol(skb); - - /* setup l3 packet to gso, to get around segmentation bug on older kernel.*/ - __skb_pull(skb, (pkt_hlen - mac_offset)); - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - - /* From 3.9 kernel skb->cb is used by skb gso. Therefore - * make copy of it to restore it back. */ - memcpy(cb, skb->cb, sizeof(cb)); - - /* We are handling offloads by segmenting l3 packet, so - * no need to call OVS compat segmentation function. */ - -#ifdef HAVE___SKB_GSO_SEGMENT -#undef __skb_gso_segment - segs = __skb_gso_segment(skb, 0, tx_path); -#else -#undef skb_gso_segment - segs = skb_gso_segment(skb, 0); -#endif - - if (!segs || IS_ERR(segs)) - goto free; - - skb = segs; - while (skb) { - __skb_push(skb, pkt_hlen); - skb_reset_mac_header(skb); - skb_set_network_header(skb, outer_l3_offset); - skb_set_transport_header(skb, outer_l4_offset); - skb->mac_len = 0; - - memcpy(skb_network_header(skb), iph, pkt_hlen); - memcpy(skb->cb, cb, sizeof(cb)); - - skb->protocol = proto; - if (skb->next) - dst = dst_clone(dst); - - skb_dst_set(skb, dst); - OVS_GSO_CB(skb)->fix_segment(skb); - - skb = skb->next; - } -free: - consume_skb(skb1); - return segs; -} - -static int output_ip(struct sk_buff *skb) -{ - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - -#undef ip_local_out - return ip_local_out(skb); -} - -int rpl_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - if (!OVS_GSO_CB(skb)->fix_segment) - return output_ip(skb); - - /* This bit set can confuse some drivers on old kernel. */ - skb->encapsulation = 0; - - if (skb_is_gso(skb)) { - int ret; - int id; - - skb = tnl_skb_gso_segment(skb, 0, false, AF_INET); - if (!skb || IS_ERR(skb)) - return NET_XMIT_DROP; - - id = ntohs(ip_hdr(skb)->id); - do { - struct sk_buff *next_skb = skb->next; - - skb->next = NULL; - ip_hdr(skb)->id = htons(id++); - - ret = output_ip(skb); - skb = next_skb; - } while (skb); - return ret; - } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - int err; - - err = skb_checksum_help(skb); - if (unlikely(err)) - return NET_XMIT_DROP; - } - - return output_ip(skb); -} -EXPORT_SYMBOL_GPL(rpl_ip_local_out); - -static int output_ipv6(struct sk_buff *skb) -{ - memset(IP6CB(skb), 0, sizeof (*IP6CB(skb))); -#undef ip6_local_out - return ip6_local_out(skb); -} - -int rpl_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - if (!OVS_GSO_CB(skb)->fix_segment) - return output_ipv6(skb); - - /* This bit set can confuse some drivers on old kernel. */ - skb->encapsulation = 0; - - if (skb_is_gso(skb)) { - int ret; - - skb = tnl_skb_gso_segment(skb, 0, false, AF_INET6); - if (!skb || IS_ERR(skb)) - return NET_XMIT_DROP; - - do { - struct sk_buff *next_skb = skb->next; - - skb->next = NULL; - ret = output_ipv6(skb); - skb = next_skb; - } while (skb); - return ret; - } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - int err; - - err = skb_checksum_help(skb); - if (unlikely(err)) - return NET_XMIT_DROP; - } - - return output_ipv6(skb); -} -EXPORT_SYMBOL_GPL(rpl_ip6_local_out); -#endif /* USE_UPSTREAM_TUNNEL_GSO */ diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h deleted file mode 100644 index 20109406a..000000000 --- a/datapath/linux/compat/gso.h +++ /dev/null @@ -1,214 +0,0 @@ -#ifndef __LINUX_GSO_WRAPPER_H -#define __LINUX_GSO_WRAPPER_H - -#include <linux/version.h> -#include "datapath.h" - -typedef void (*gso_fix_segment_t)(struct sk_buff *); - -struct ovs_gso_cb { - struct ovs_skb_cb dp_cb; -#ifndef USE_UPSTREAM_TUNNEL - struct metadata_dst *tun_dst; -#endif -#ifndef USE_UPSTREAM_TUNNEL_GSO - gso_fix_segment_t fix_segment; - bool ipv6; -#endif -#ifndef HAVE_INNER_PROTOCOL - __be16 inner_protocol; -#endif -#ifndef USE_UPSTREAM_TUNNEL - /* Keep original tunnel info during userspace action execution. */ - struct metadata_dst *fill_md_dst; -#endif -}; -#define OVS_GSO_CB(skb) ((struct ovs_gso_cb *)(skb)->cb) - - -#ifndef USE_UPSTREAM_TUNNEL_GSO -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <net/protocol.h> - -static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) -{ - OVS_GSO_CB(skb)->fix_segment = NULL; -#ifndef USE_UPSTREAM_TUNNEL - OVS_GSO_CB(skb)->tun_dst = NULL; -#endif -} -#else -static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb) -{ -#ifndef USE_UPSTREAM_TUNNEL - OVS_GSO_CB(skb)->tun_dst = NULL; -#endif -} -#endif - -#ifndef HAVE_INNER_PROTOCOL -static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) -{ - OVS_GSO_CB(skb)->inner_protocol = htons(0); -} - -static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb, - __be16 ethertype) -{ - OVS_GSO_CB(skb)->inner_protocol = ethertype; -} - -static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb) -{ - return OVS_GSO_CB(skb)->inner_protocol; -} - -#else - -static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) -{ - /* Nothing to do. The inner_protocol is either zero or - * has been set to a value by another user. - * Either way it may be considered initialised. - */ -} - -static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb) -{ - return skb->inner_protocol; -} - -#ifdef ENCAP_TYPE_ETHER -#define ovs_skb_set_inner_protocol skb_set_inner_protocol -#else -static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb, - __be16 ethertype) -{ - skb->inner_protocol = ethertype; -} -#endif /* ENCAP_TYPE_ETHER */ -#endif /* HAVE_INNER_PROTOCOL */ - -#define skb_inner_mac_offset rpl_skb_inner_mac_offset -static inline int skb_inner_mac_offset(const struct sk_buff *skb) -{ - return skb_inner_mac_header(skb) - skb->data; -} - -#ifndef USE_UPSTREAM_TUNNEL_GSO -#define ip_local_out rpl_ip_local_out -int rpl_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); - -#define ip6_local_out rpl_ip6_local_out -int rpl_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); -#else - -static inline int rpl_ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -#ifdef HAVE_IP_LOCAL_OUT_TAKES_NET - /* net and sk parameters are added at same time. */ - return ip_local_out(net, sk, skb); -#else - return ip_local_out(skb); -#endif -} -#define ip_local_out rpl_ip_local_out - -static inline int rpl_ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) -{ - memset(IP6CB(skb), 0, sizeof (*IP6CB(skb))); -#ifdef HAVE_IP_LOCAL_OUT_TAKES_NET - return ip6_local_out(net, sk, skb); -#else - return ip6_local_out(skb); -#endif -} -#define ip6_local_out rpl_ip6_local_out - -#endif /* USE_UPSTREAM_TUNNEL_GSO */ - -#ifndef USE_UPSTREAM_TUNNEL -/* We need two separate functions to manage different dst in this case. - * First is dst_entry and second is tunnel-dst. - * So define ovs_* separate functions for tun_dst. - */ -static inline void ovs_skb_dst_set(struct sk_buff *skb, void *dst) -{ - OVS_GSO_CB(skb)->tun_dst = (void *)dst; -} - -static inline struct ip_tunnel_info *ovs_skb_tunnel_info(struct sk_buff *skb) -{ - if (likely(OVS_GSO_CB(skb)->tun_dst)) - return &OVS_GSO_CB(skb)->tun_dst->u.tun_info; - else - return NULL; -} - -static inline void ovs_skb_dst_drop(struct sk_buff *skb) -{ - OVS_GSO_CB(skb)->tun_dst = NULL; -} - -static inline void ovs_dst_hold(void *dst) -{ -} - -static inline void ovs_dst_release(struct dst_entry *dst) -{ - struct metadata_dst *tun_dst = (struct metadata_dst *) dst; - - dst_cache_destroy(&tun_dst->u.tun_info.dst_cache); - kfree(dst); -} - -#else -#define ovs_skb_dst_set skb_dst_set -#define ovs_skb_dst_drop skb_dst_drop -#define ovs_dst_hold dst_hold -#define ovs_dst_release dst_release -#endif - -#ifndef USE_UPSTREAM_TUNNEL -#define SKB_INIT_FILL_METADATA_DST(skb) OVS_GSO_CB(skb)->fill_md_dst = NULL; - -#define SKB_RESTORE_FILL_METADATA_DST(skb) do { \ - if (OVS_GSO_CB(skb)->fill_md_dst) { \ - kfree(OVS_GSO_CB(skb)->tun_dst); \ - OVS_GSO_CB(skb)->tun_dst = OVS_GSO_CB(skb)->fill_md_dst; \ - } \ -} while (0) - - -#define SKB_SETUP_FILL_METADATA_DST(skb) ({ \ - struct metadata_dst *new_md_dst; \ - struct metadata_dst *md_dst; \ - int md_size; \ - int ret = 1; \ - \ - SKB_RESTORE_FILL_METADATA_DST(skb); \ - new_md_dst = kmalloc(sizeof(struct metadata_dst) + 256, GFP_ATOMIC); \ - if (new_md_dst) { \ - md_dst = OVS_GSO_CB(skb)->tun_dst; \ - md_size = new_md_dst->u.tun_info.options_len; \ - memcpy(&new_md_dst->u.tun_info, &md_dst->u.tun_info, \ - sizeof(struct ip_tunnel_info) + md_size); \ - \ - OVS_GSO_CB(skb)->fill_md_dst = md_dst; \ - OVS_GSO_CB(skb)->tun_dst = new_md_dst; \ - ret = 1; \ - } else { \ - ret = 0; \ - } \ - ret; \ -}) - -#else -#define SKB_INIT_FILL_METADATA_DST(skb) do {} while(0) -#define SKB_SETUP_FILL_METADATA_DST(skb) (true) -#define SKB_RESTORE_FILL_METADATA_DST(skb) do {} while(0) -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/bug.h b/datapath/linux/compat/include/linux/bug.h deleted file mode 100644 index 6538a22fc..000000000 --- a/datapath/linux/compat/include/linux/bug.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __LINUX_BUG_WRAPPER_H -#define __LINUX_BUG_WRAPPER_H 1 - -#include_next <linux/bug.h> - -#ifdef __CHECKER__ -#ifndef BUILD_BUG_ON_INVALID -#define BUILD_BUG_ON_INVALID(e) (0) -#endif - -#endif /* __CHECKER__ */ - -#endif diff --git a/datapath/linux/compat/include/linux/cache.h b/datapath/linux/compat/include/linux/cache.h deleted file mode 100644 index c8a6710b3..000000000 --- a/datapath/linux/compat/include/linux/cache.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef __LINUX_CACHE_WRAPPER_H -#define __LINUX_CACHE_WRAPPER_H 1 - -#include_next <linux/cache.h> - -/* Upstream commit c74ba8b3480d ("arch: Introduce post-init read-only memory") - * introduced the __ro_after_init attribute, however it wasn't applied to - * generic netlink sockets until commit 34158151d2aa ("netfilter: cttimeout: - * use nf_ct_iterate_cleanup_net to unlink timeout objs"). Using it on - * genetlink before the latter commit leads to crash on module unload. - * For kernels < 4.10, define it as empty. */ -#ifdef HAVE_GENL_FAMILY_LIST -#ifdef __ro_after_init -#undef __ro_after_init -#endif /* #ifdef __ro_after_init */ -#define __ro_after_init -#else -#ifndef __ro_after_init -#define __ro_after_init -#endif /* #ifndef __ro_after_init */ -#endif /* #ifdef HAVE_GENL_FAMILY_LIST */ - -#endif diff --git a/datapath/linux/compat/include/linux/compiler-gcc.h b/datapath/linux/compat/include/linux/compiler-gcc.h deleted file mode 100644 index 39d2e0198..000000000 --- a/datapath/linux/compat/include/linux/compiler-gcc.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __LINUX_COMPILER_H -#if 0 -/* Disable this check - it no longer makes sense with so many backports - * due to spectre mitigation - */ -#ifndef HAVE_LINUX_COMPILER_TYPES_H -#error "Please don't include <linux/compiler-gcc.h> directly, include <linux/compiler.h> instead." -#endif -#endif -#endif - -#include_next <linux/compiler-gcc.h> - -#ifndef __packed -#define __packed __attribute__((packed)) -#endif - -#ifndef __always_unused -#define __always_unused __attribute__((unused)) -#endif diff --git a/datapath/linux/compat/include/linux/compiler.h b/datapath/linux/compat/include/linux/compiler.h deleted file mode 100644 index 59b506fd4..000000000 --- a/datapath/linux/compat/include/linux/compiler.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef __LINUX_COMPILER_WRAPPER_H -#define __LINUX_COMPILER_WRAPPER_H 1 - -#include_next <linux/compiler.h> - -#ifndef __percpu -#define __percpu -#endif - -#ifndef __rcu -#define __rcu -#endif - -#ifndef READ_ONCE -#define READ_ONCE(x) (x) -#endif - -#ifndef WRITE_ONCE -#define WRITE_ONCE(x, val) \ -do { \ - *(volatile typeof(x) *)&(x) = (val); \ -} while (0) -#endif - - -#endif diff --git a/datapath/linux/compat/include/linux/cpumask.h b/datapath/linux/compat/include/linux/cpumask.h deleted file mode 100644 index 48c73aa8f..000000000 --- a/datapath/linux/compat/include/linux/cpumask.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __LINUX_CPUMASK_WRAPPER_H -#define __LINUX_CPUMASK_WRAPPER_H - -#include_next <linux/cpumask.h> - -/* for_each_cpu was renamed for_each_possible_cpu in 2.6.18. */ -#ifndef for_each_possible_cpu -#define for_each_possible_cpu for_each_cpu -#endif - -#endif /* linux/cpumask.h wrapper */ diff --git a/datapath/linux/compat/include/linux/err.h b/datapath/linux/compat/include/linux/err.h deleted file mode 100644 index 321386c21..000000000 --- a/datapath/linux/compat/include/linux/err.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef __LINUX_ERR_WRAPPER_H -#define __LINUX_ERR_WRAPPER_H 1 - -#include_next <linux/err.h> - -#ifndef HAVE_ERR_CAST -/** - * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type - * @ptr: The pointer to cast. - * - * Explicitly cast an error-valued pointer to another pointer type in such a - * way as to make it clear that's what's going on. - */ -static inline void *ERR_CAST(const void *ptr) -{ - /* cast away the const */ - return (void *) ptr; -} -#endif /* HAVE_ERR_CAST */ - -#ifndef HAVE_IS_ERR_OR_NULL -static inline bool __must_check IS_ERR_OR_NULL(__force const void *ptr) -{ - return !ptr || IS_ERR_VALUE((unsigned long)ptr); -} -#endif - -#ifndef HAVE_PTR_ERR_OR_ZERO -static inline int __must_check PTR_ERR_OR_ZERO(__force const void *ptr) -{ - if (IS_ERR(ptr)) - return PTR_ERR(ptr); - else - return 0; -} -#endif -#endif diff --git a/datapath/linux/compat/include/linux/etherdevice.h b/datapath/linux/compat/include/linux/etherdevice.h deleted file mode 100644 index 4b2707455..000000000 --- a/datapath/linux/compat/include/linux/etherdevice.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __LINUX_ETHERDEVICE_WRAPPER_H -#define __LINUX_ETHERDEVICE_WRAPPER_H 1 - -#include <linux/version.h> -#include_next <linux/etherdevice.h> - -#ifndef HAVE_ETHER_ADDR_COPY -static inline void ether_addr_copy(u8 *dst, const u8 *src) -{ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - *(u32 *)dst = *(const u32 *)src; - *(u16 *)(dst + 4) = *(const u16 *)(src + 4); -#else - u16 *a = (u16 *)dst; - const u16 *b = (const u16 *)src; - - a[0] = b[0]; - a[1] = b[1]; - a[2] = b[2]; -#endif -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0) -#define eth_proto_is_802_3 rpl_eth_proto_is_802_3 -static inline bool eth_proto_is_802_3(__be16 proto) -{ -#ifndef __BIG_ENDIAN - /* if CPU is little endian mask off bits representing LSB */ - proto &= htons(0xFF00); -#endif - /* cast both to u16 and compare since LSB can be ignored */ - return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); -} -#endif - -#define ether_addr_equal rpl_ether_addr_equal -static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) -{ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) - u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | - ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); - - return fold == 0; -#else - const u16 *a = (const u16 *)addr1; - const u16 *b = (const u16 *)addr2; - - return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; -#endif -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) -#define eth_gro_receive rpl_eth_gro_receive -struct sk_buff **rpl_eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb); - -#define eth_gro_complete rpl_eth_gro_complete -int rpl_eth_gro_complete(struct sk_buff *skb, int nhoff); -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/genetlink.h b/datapath/linux/compat/include/linux/genetlink.h deleted file mode 100644 index 3b85f3865..000000000 --- a/datapath/linux/compat/include/linux/genetlink.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _UAPI__LINUX_GENERIC_NETLINK_WRAPPER_H -#define _UAPI__LINUX_GENERIC_NETLINK_WRAPPER_H - -#include_next <linux/genetlink.h> - -#ifndef GENL_UNS_ADMIN_PERM -#define GENL_UNS_ADMIN_PERM GENL_ADMIN_PERM -#endif - -#ifdef GENL_ID_GENERATE -#if GENL_ID_GENERATE != 0 -#error "GENL_ID_GENERATE is assumed to be zero" -#endif -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/if.h b/datapath/linux/compat/include/linux/if.h deleted file mode 100644 index 3beb61df1..000000000 --- a/datapath/linux/compat/include/linux/if.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __LINUX_IF_WRAPPER_H -#define __LINUX_IF_WRAPPER_H 1 - -#include_next <linux/if.h> - -#endif diff --git a/datapath/linux/compat/include/linux/if_ether.h b/datapath/linux/compat/include/linux/if_ether.h deleted file mode 100644 index 8dff938b7..000000000 --- a/datapath/linux/compat/include/linux/if_ether.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __LINUX_IF_ETHER_WRAPPER_H -#define __LINUX_IF_ETHER_WRAPPER_H 1 - -#include_next <linux/if_ether.h> - -#ifndef ETH_MIN_MTU -#define ETH_MIN_MTU 68 /* Min IPv4 MTU per RFC791 */ -#endif - -#ifndef ETH_MAX_MTU -#define ETH_MAX_MTU 0xFFFFU /* 65535, same as IP_MAX_MTU */ -#endif - -#ifndef ETH_P_802_3_MIN -#define ETH_P_802_3_MIN 0x0600 -#endif - -#ifndef ETH_P_8021AD -#define ETH_P_8021AD 0x88A8 /* 802.1ad Service VLAN */ -#endif - -#ifndef ETH_P_NSH -#define ETH_P_NSH 0x894F /* Network Service Header */ -#endif - -#ifndef ETH_P_ERSPAN -#define ETH_P_ERSPAN 0x88BE /* ERSPAN TYPE II */ -#endif - -#ifndef ETH_P_ERSPAN2 -#define ETH_P_ERSPAN2 0x22EB /* ERSPAN version 2 (type III) */ -#endif - -#define inner_eth_hdr rpl_inner_eth_hdr -static inline struct ethhdr *inner_eth_hdr(const struct sk_buff *skb) -{ - return (struct ethhdr *)skb_inner_mac_header(skb); -} -#endif diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h deleted file mode 100644 index bd77e33d3..000000000 --- a/datapath/linux/compat/include/linux/if_link.h +++ /dev/null @@ -1,171 +0,0 @@ -#ifndef _LINUX_IF_LINK_WRAPPER_H -#define _LINUX_IF_LINK_WRAPPER_H - -#include_next<linux/if_link.h> - -/* GENEVE section */ -enum { -#define IFLA_GENEVE_UNSPEC rpl_IFLA_GENEVE_UNSPEC - IFLA_GENEVE_UNSPEC, - -#define IFLA_GENEVE_ID rpl_IFLA_GENEVE_ID - IFLA_GENEVE_ID, - -#define IFLA_GENEVE_REMOTE rpl_IFLA_GENEVE_REMOTE - IFLA_GENEVE_REMOTE, - -#define IFLA_GENEVE_TTL rpl_IFLA_GENEVE_TTL - IFLA_GENEVE_TTL, - -#define IFLA_GENEVE_TOS rpl_IFLA_GENEVE_TOS - IFLA_GENEVE_TOS, - -#define IFLA_GENEVE_PORT rpl_IFLA_GENEVE_PORT - IFLA_GENEVE_PORT, /* destination port */ - -#define IFLA_GENEVE_COLLECT_METADATA rpl_IFLA_GENEVE_COLLECT_METADATA - IFLA_GENEVE_COLLECT_METADATA, - -#define IFLA_GENEVE_REMOTE6 rpl_IFLA_GENEVE_REMOTE6 - IFLA_GENEVE_REMOTE6, - -#define IFLA_GENEVE_UDP_CSUM rpl_IFLA_GENEVE_UDP_CSUM - IFLA_GENEVE_UDP_CSUM, - -#define IFLA_GENEVE_UDP_ZERO_CSUM6_TX rpl_IFLA_GENEVE_UDP_ZERO_CSUM6_TX - IFLA_GENEVE_UDP_ZERO_CSUM6_TX, - -#define IFLA_GENEVE_UDP_ZERO_CSUM6_RX rpl_IFLA_GENEVE_UDP_ZERO_CSUM6_RX - IFLA_GENEVE_UDP_ZERO_CSUM6_RX, - -#define IFLA_GENEVE_LABEL rpl_IFLA_GENEVE_LABEL - IFLA_GENEVE_LABEL, - -#define __IFLA_GENEVE_MAX rpl__IFLA_GENEVE_MAX - __IFLA_GENEVE_MAX -}; -#undef IFLA_GENEVE_MAX -#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) - -/* STT section */ -enum { - IFLA_STT_PORT, /* destination port */ - __IFLA_STT_MAX -}; -#define IFLA_STT_MAX (__IFLA_STT_MAX - 1) - -/* LISP section */ -enum { - IFLA_LISP_PORT, /* destination port */ - __IFLA_LISP_MAX -}; -#define IFLA_LISP_MAX (__IFLA_LISP_MAX - 1) - -/* VXLAN section */ -enum { -#define IFLA_VXLAN_UNSPEC rpl_IFLA_VXLAN_UNSPEC - IFLA_VXLAN_UNSPEC, -#define IFLA_VXLAN_ID rpl_IFLA_VXLAN_ID - IFLA_VXLAN_ID, -#define IFLA_VXLAN_GROUP rpl_IFLA_VXLAN_GROUP - IFLA_VXLAN_GROUP, /* group or remote address */ -#define IFLA_VXLAN_LINK rpl_IFLA_VXLAN_LINK - IFLA_VXLAN_LINK, -#define IFLA_VXLAN_LOCAL rpl_IFLA_VXLAN_LOCAL - IFLA_VXLAN_LOCAL, -#define IFLA_VXLAN_TTL rpl_IFLA_VXLAN_TTL - IFLA_VXLAN_TTL, -#define IFLA_VXLAN_TOS rpl_IFLA_VXLAN_TOS - IFLA_VXLAN_TOS, -#define IFLA_VXLAN_LEARNING rpl_IFLA_VXLAN_LEARNING - IFLA_VXLAN_LEARNING, -#define IFLA_VXLAN_AGEING rpl_IFLA_VXLAN_AGEING - IFLA_VXLAN_AGEING, -#define IFLA_VXLAN_LIMIT rpl_IFLA_VXLAN_LIMIT - IFLA_VXLAN_LIMIT, -#define IFLA_VXLAN_PORT_RANGE rpl_IFLA_VXLAN_PORT_RANGE - IFLA_VXLAN_PORT_RANGE, /* source port */ -#define IFLA_VXLAN_PROXY rpl_IFLA_VXLAN_PROXY - IFLA_VXLAN_PROXY, -#define IFLA_VXLAN_RSC rpl_IFLA_VXLAN_RSC - IFLA_VXLAN_RSC, -#define IFLA_VXLAN_L2MISS rpl_IFLA_VXLAN_L2MISS - IFLA_VXLAN_L2MISS, -#define IFLA_VXLAN_L3MISS rpl_IFLA_VXLAN_L3MISS - IFLA_VXLAN_L3MISS, -#define IFLA_VXLAN_PORT rpl_IFLA_VXLAN_PORT - IFLA_VXLAN_PORT, /* destination port */ -#define IFLA_VXLAN_GROUP6 rpl_IFLA_VXLAN_GROUP6 - IFLA_VXLAN_GROUP6, -#define IFLA_VXLAN_LOCAL6 rpl_IFLA_VXLAN_LOCAL6 - IFLA_VXLAN_LOCAL6, -#define IFLA_VXLAN_UDP_CSUM rpl_IFLA_VXLAN_UDP_CSUM - IFLA_VXLAN_UDP_CSUM, -#define IFLA_VXLAN_UDP_ZERO_CSUM6_TX rpl_IFLA_VXLAN_UDP_ZERO_CSUM6_TX - IFLA_VXLAN_UDP_ZERO_CSUM6_TX, -#define IFLA_VXLAN_UDP_ZERO_CSUM6_RX rpl_IFLA_VXLAN_UDP_ZERO_CSUM6_RX - IFLA_VXLAN_UDP_ZERO_CSUM6_RX, -#define IFLA_VXLAN_REMCSUM_TX rpl_IFLA_VXLAN_REMCSUM_TX - IFLA_VXLAN_REMCSUM_TX, -#define IFLA_VXLAN_REMCSUM_RX rpl_IFLA_VXLAN_REMCSUM_RX - IFLA_VXLAN_REMCSUM_RX, -#define IFLA_VXLAN_GBP rpl_IFLA_VXLAN_GBP - IFLA_VXLAN_GBP, -#define IFLA_VXLAN_REMCSUM_NOPARTIAL rpl_IFLA_VXLAN_REMCSUM_NOPARTIAL - IFLA_VXLAN_REMCSUM_NOPARTIAL, -#define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA - IFLA_VXLAN_COLLECT_METADATA, -#define IFLA_VXLAN_LABEL rpl_IFLA_VXLAN_LABEL - IFLA_VXLAN_LABEL, -#define IFLA_VXLAN_GPE rpl_IFLA_VXLAN_GPE - IFLA_VXLAN_GPE, - -#define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX - __IFLA_VXLAN_MAX -}; - -#undef IFLA_VXLAN_MAX -#define IFLA_VXLAN_MAX (rpl___IFLA_VXLAN_MAX - 1) - -#define ifla_vxlan_port_range rpl_ifla_vxlan_port_range -struct ifla_vxlan_port_range { - __be16 low; - __be16 high; -}; - -#ifndef HAVE_RTNL_LINK_STATS64 -/* The main device statistics structure */ -struct rtnl_link_stats64 { - __u64 rx_packets; /* total packets received */ - __u64 tx_packets; /* total packets transmitted */ - __u64 rx_bytes; /* total bytes received */ - __u64 tx_bytes; /* total bytes transmitted */ - __u64 rx_errors; /* bad packets received */ - __u64 tx_errors; /* packet transmit problems */ - __u64 rx_dropped; /* no space in linux buffers */ - __u64 tx_dropped; /* no space available in linux */ - __u64 multicast; /* multicast packets received */ - __u64 collisions; - - /* detailed rx_errors: */ - __u64 rx_length_errors; - __u64 rx_over_errors; /* receiver ring buff overflow */ - __u64 rx_crc_errors; /* recved pkt with crc error */ - __u64 rx_frame_errors; /* recv'd frame alignment error */ - __u64 rx_fifo_errors; /* recv'r fifo overrun */ - __u64 rx_missed_errors; /* receiver missed packet */ - - /* detailed tx_errors */ - __u64 tx_aborted_errors; - __u64 tx_carrier_errors; - __u64 tx_fifo_errors; - __u64 tx_heartbeat_errors; - __u64 tx_window_errors; - - /* for cslip etc */ - __u64 rx_compressed; - __u64 tx_compressed; -}; -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/if_vlan.h b/datapath/linux/compat/include/linux/if_vlan.h deleted file mode 100644 index 3ed7522c7..000000000 --- a/datapath/linux/compat/include/linux/if_vlan.h +++ /dev/null @@ -1,306 +0,0 @@ -#ifndef __LINUX_IF_VLAN_WRAPPER_H -#define __LINUX_IF_VLAN_WRAPPER_H 1 - -#include <linux/skbuff.h> -#include <linux/version.h> -#include_next <linux/if_vlan.h> - -#ifndef HAVE_VLAN_INSERT_TAG_SET_PROTO -/* - * The behavior of __vlan_put_tag()/vlan_insert_tag_set_proto() has changed - * over time: - * - * - In 2.6.26 and earlier, it adjusted both MAC and network header - * pointers. (The latter didn't make any sense.) - * - * - In 2.6.27 and 2.6.28, it did not adjust any header pointers at all. - * - * - In 2.6.29 and later, it adjusts the MAC header pointer only. - * - * - In 3.19 and later, it was renamed to vlan_insert_tag_set_proto() - * - * This is the version from 2.6.33. We unconditionally substitute this version - * to avoid the need to guess whether the version in the kernel tree is - * acceptable. - */ -#define vlan_insert_tag_set_proto(skb, proto, vlan_tci) \ - rpl_vlan_insert_tag_set_proto(skb, proto, vlan_tci) -static inline struct sk_buff *rpl_vlan_insert_tag_set_proto(struct sk_buff *skb, - __be16 vlan_proto, - u16 vlan_tci) -{ - struct vlan_ethhdr *veth; - - if (skb_cow_head(skb, VLAN_HLEN) < 0) { - kfree_skb(skb); - return NULL; - } - veth = (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN); - - /* Move the mac addresses to the beginning of the new header. */ - memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); - skb->mac_header -= VLAN_HLEN; - - /* first, the ethernet type */ - veth->h_vlan_proto = vlan_proto; - - /* now, the TCI */ - veth->h_vlan_TCI = htons(vlan_tci); - - skb->protocol = vlan_proto; - - return skb; -} -#endif - -#ifndef HAVE_VLAN_HWACCEL_CLEAR_TAG -/** - * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info - * @skb: skbuff to clear - * - * Clears the VLAN information from @skb - */ -#define __vlan_hwaccel_clear_tag rpl_vlan_hwaccel_clear_tag -static inline void rpl_vlan_hwaccel_clear_tag(struct sk_buff *skb) -{ -#ifdef HAVE_SKBUFF_VLAN_PRESENT - skb->vlan_present = 0; -#else - skb->vlan_tci = 0; - skb->vlan_proto = 0; -#endif -} -#endif - -#ifndef HAVE_VLAN_HWACCEL_PUSH_INSIDE - -/* - * __vlan_hwaccel_push_inside - pushes vlan tag to the payload - * @skb: skbuff to tag - * - * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. - * - * Following the skb_unshare() example, in case of error, the calling function - * doesn't have to worry about freeing the original skb. - */ -static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) -{ - skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, - vlan_tx_tag_get(skb)); - if (likely(skb)) - skb->vlan_tci = 0; - return skb; -} -/* - * vlan_hwaccel_push_inside - pushes vlan tag to the payload - * @skb: skbuff to tag - * - * Checks is tag is present in @skb->vlan_tci and if it is, it pushes the - * VLAN tag from @skb->vlan_tci inside to the payload. - * - * Following the skb_unshare() example, in case of error, the calling function - * doesn't have to worry about freeing the original skb. - */ -static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb) -{ - if (vlan_tx_tag_present(skb)) - skb = __vlan_hwaccel_push_inside(skb); - return skb; -} -#endif - -#ifndef HAVE_ETH_TYPE_VLAN -/** - * eth_type_vlan - check for valid vlan ether type. - * @ethertype: ether type to check - * - * Returns true if the ether type is a vlan ether type. - */ -static inline bool eth_type_vlan(__be16 ethertype) -{ - switch (ethertype) { - case htons(ETH_P_8021Q): - case htons(ETH_P_8021AD): - return true; - default: - return false; - } -} -#endif - -/* All of these were introduced in a single commit preceding 2.6.33, so - * presumably all of them or none of them are present. */ -#ifndef VLAN_PRIO_MASK -#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ -#define VLAN_PRIO_SHIFT 13 -#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ -#define VLAN_TAG_PRESENT VLAN_CFI_MASK -#endif - -#ifndef HAVE_VLAN_SET_ENCAP_PROTO -static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) -{ - __be16 proto; - unsigned char *rawp; - - /* - * Was a VLAN packet, grab the encapsulated protocol, which the layer - * three protocols care about. - */ - - proto = vhdr->h_vlan_encapsulated_proto; - if (ntohs(proto) >= 1536) { - skb->protocol = proto; - return; - } - - rawp = skb->data; - if (*(unsigned short *) rawp == 0xFFFF) - /* - * This is a magic hack to spot IPX packets. Older Novell - * breaks the protocol design and runs IPX over 802.3 without - * an 802.2 LLC layer. We look for FFFF which isn't a used - * 802.2 SSAP/DSAP. This won't work for fault tolerant netware - * but does for the rest. - */ - skb->protocol = htons(ETH_P_802_3); - else - /* - * Real 802.2 LLC - */ - skb->protocol = htons(ETH_P_802_2); -} -#endif - -#ifndef HAVE___VLAN_INSERT_TAG -/* Kernels which don't have __vlan_insert_tag() also don't have skb->vlan_proto - * so ignore the proto paramter. - */ -#define __vlan_insert_tag(skb, proto, tci) rpl_vlan_insert_tag(skb, tci) -static inline int rpl_vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci) -{ - struct vlan_ethhdr *veth; - - if (skb_cow_head(skb, VLAN_HLEN) < 0) - return -ENOMEM; - - veth = (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN); - - /* Move the mac addresses to the beginning of the new header. */ - memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN); - skb->mac_header -= VLAN_HLEN; - - /* first, the ethernet type */ - veth->h_vlan_proto = htons(ETH_P_8021Q); - - /* now, the TCI */ - veth->h_vlan_TCI = htons(vlan_tci); - - return 0; -} -#endif - -#ifndef skb_vlan_tag_present -#define skb_vlan_tag_present(skb) vlan_tx_tag_present(skb) -#define skb_vlan_tag_get(skb) vlan_tx_tag_get(skb) -#endif - -#ifndef HAVE_VLAN_GET_PROTOCOL - -static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type, - int *depth) -{ - unsigned int vlan_depth = skb->mac_len; - - /* if type is 802.1Q/AD then the header should already be - * present at mac_len - VLAN_HLEN (if mac_len > 0), or at - * ETH_HLEN otherwise - */ - if (eth_type_vlan(type)) { - if (vlan_depth) { - if (WARN_ON(vlan_depth < VLAN_HLEN)) - return 0; - vlan_depth -= VLAN_HLEN; - } else { - vlan_depth = ETH_HLEN; - } - do { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, - vlan_depth + VLAN_HLEN))) - return 0; - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } while (eth_type_vlan(type)); - } - - if (depth) - *depth = vlan_depth; - - return type; -} - -/** - * vlan_get_protocol - get protocol EtherType. - * @skb: skbuff to query - * - * Returns the EtherType of the packet, regardless of whether it is - * vlan encapsulated (normal or hardware accelerated) or not. - */ -static inline __be16 vlan_get_protocol(struct sk_buff *skb) -{ - return __vlan_get_protocol(skb, skb->protocol, NULL); -} - -#endif - -#ifndef HAVE_SKB_VLAN_TAGGED -/** - * skb_vlan_tagged - check if skb is vlan tagged. - * @skb: skbuff to query - * - * Returns true if the skb is tagged, regardless of whether it is hardware - * accelerated or not. - */ -static inline bool skb_vlan_tagged(const struct sk_buff *skb) -{ - if (!skb_vlan_tag_present(skb) && - likely(!eth_type_vlan(skb->protocol))) - return false; - - return true; -} - -/** - * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. - * @skb: skbuff to query - * - * Returns true if the skb is tagged with multiple vlan headers, regardless - * of whether it is hardware accelerated or not. - */ -static inline bool skb_vlan_tagged_multi(const struct sk_buff *skb) -{ - __be16 protocol = skb->protocol; - - if (!skb_vlan_tag_present(skb)) { - struct vlan_ethhdr *veh; - - if (likely(!eth_type_vlan(protocol))) - return false; - - veh = (struct vlan_ethhdr *)skb->data; - protocol = veh->h_vlan_encapsulated_proto; - } - - if (!eth_type_vlan(protocol)) - return false; - - return true; -} - -#endif /* HAVE_SKB_VLAN_TAGGED */ - -#endif /* linux/if_vlan.h wrapper */ diff --git a/datapath/linux/compat/include/linux/in.h b/datapath/linux/compat/include/linux/in.h deleted file mode 100644 index 78f8d7731..000000000 --- a/datapath/linux/compat/include/linux/in.h +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef __LINUX_IN_WRAPPER_H -#define __LINUX_IN_WRAPPER_H 1 - -#include_next <linux/in.h> - -#include <linux/module.h> -#ifndef HAVE_PROTO_PORTS_OFFSET -static inline int proto_ports_offset(int proto) -{ - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: /* SPI */ - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - return 0; - case IPPROTO_AH: /* SPI */ - return 4; - default: - return -EINVAL; - } -} -#endif - -#ifndef HAVE_IPV4_IS_MULTICAST - -static inline bool ipv4_is_loopback(__be32 addr) -{ - return (addr & htonl(0xff000000)) == htonl(0x7f000000); -} - -static inline bool ipv4_is_multicast(__be32 addr) -{ - return (addr & htonl(0xf0000000)) == htonl(0xe0000000); -} - -static inline bool ipv4_is_local_multicast(__be32 addr) -{ - return (addr & htonl(0xffffff00)) == htonl(0xe0000000); -} - -static inline bool ipv4_is_lbcast(__be32 addr) -{ - /* limited broadcast */ - return addr == htonl(INADDR_BROADCAST); -} - -static inline bool ipv4_is_zeronet(__be32 addr) -{ - return (addr & htonl(0xff000000)) == htonl(0x00000000); -} - -#endif /* !HAVE_IPV4_IS_MULTICAST */ - -#endif diff --git a/datapath/linux/compat/include/linux/jiffies.h b/datapath/linux/compat/include/linux/jiffies.h deleted file mode 100644 index 642eacec7..000000000 --- a/datapath/linux/compat/include/linux/jiffies.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef __LINUX_JIFFIES_WRAPPER_H -#define __LINUX_JIFFIES_WRAPPER_H 1 - -#include_next <linux/jiffies.h> - -#include <linux/version.h> - -/* Same as above, but does so with platform independent 64bit types. - * These must be used when utilizing jiffies_64 (i.e. return value of - * get_jiffies_64() */ - -#ifndef time_after64 -#define time_after64(a, b) \ - (typecheck(__u64, a) && \ - typecheck(__u64, b) && \ - ((__s64)(b) - (__s64)(a) < 0)) -#endif - -#ifndef time_before64 -#define time_before64(a, b) time_after64(b, a) -#endif - -#ifndef time_after_eq64 -#define time_after_eq64(a, b) \ - (typecheck(__u64, a) && \ - typecheck(__u64, b) && \ - ((__s64)(a) - (__s64)(b) >= 0)) -#endif - -#ifndef time_before_eq64 -#define time_before_eq64(a, b) time_after_eq64(b, a) -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/kconfig.h b/datapath/linux/compat/include/linux/kconfig.h deleted file mode 100644 index d3fa57a6b..000000000 --- a/datapath/linux/compat/include/linux/kconfig.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef __LINUX_KCONFIG_WRAPPER_H -#define __LINUX_KCONFIG_WRAPPER_H - -#include <linux/version.h> - -#ifndef IS_ENABLED - -/* - * Helper macros to use CONFIG_ options in C/CPP expressions. Note that - * these only work with boolean and tristate options. - */ - -/* - * Getting something that works in C and CPP for an arg that may or may - * not be defined is tricky. Here, if we have "#define CONFIG_BOOGER 1" - * we match on the placeholder define, insert the "0," for arg1 and generate - * the triplet (0, 1, 0). Then the last step cherry picks the 2nd arg (a one). - * When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when - * the last step cherry picks the 2nd arg, we get a zero. - */ -#define __ARG_PLACEHOLDER_1 0, -#define config_enabled(cfg) _config_enabled(cfg) -#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value) -#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0) -#define ___config_enabled(__ignored, val, ...) val - -/* - * IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm', - * 0 otherwise. - * - */ -#define IS_ENABLED(option) \ - (config_enabled(option) || config_enabled(option##_MODULE)) - -/* - * IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0 - * otherwise. For boolean options, this is equivalent to - * IS_ENABLED(CONFIG_FOO). - */ -#define IS_BUILTIN(option) config_enabled(option) - -/* - * IS_MODULE(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'm', 0 - * otherwise. - */ -#define IS_MODULE(option) config_enabled(option##_MODULE) - -#endif /* IS_ENABLED */ -#endif /* __LINUX_KCONFIG_WRAPER_H */ diff --git a/datapath/linux/compat/include/linux/kernel.h b/datapath/linux/compat/include/linux/kernel.h deleted file mode 100644 index 106b5940a..000000000 --- a/datapath/linux/compat/include/linux/kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __KERNEL_H_WRAPPER -#define __KERNEL_H_WRAPPER 1 - -#include_next <linux/kernel.h> -#ifndef HAVE_LOG2_H -#include <linux/log2.h> -#endif - -#include <linux/version.h> - -#ifndef USHRT_MAX -#define USHRT_MAX ((u16)(~0U)) -#define SHRT_MAX ((s16)(USHRT_MAX>>1)) -#define SHRT_MIN ((s16)(-SHRT_MAX - 1)) -#endif - -#ifndef DIV_ROUND_UP -#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -#endif - -#ifndef rounddown -#define rounddown(x, y) ( \ -{ \ - typeof(x) __x = (x); \ - __x - (__x % (y)); \ -} \ -) -#endif - -/* U32_MAX was introduced in include/linux/kernel.h after version 3.14. */ -#ifndef U32_MAX -#define U32_MAX ((u32)~0U) -#endif - -#ifndef sizeof_field -#define sizeof_field(t, f) (sizeof(((t*)0)->f)) -#endif - -#endif /* linux/kernel.h */ diff --git a/datapath/linux/compat/include/linux/list.h b/datapath/linux/compat/include/linux/list.h deleted file mode 100644 index 4234c17ce..000000000 --- a/datapath/linux/compat/include/linux/list.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __LINUX_LIST_WRAPPER_H -#define __LINUX_LIST_WRAPPER_H 1 - -#include_next <linux/list.h> - -#ifndef hlist_entry_safe -#define hlist_entry_safe(ptr, type, member) \ - ({ typeof(ptr) ____ptr = (ptr); \ - ____ptr ? hlist_entry(____ptr, type, member) : NULL; \ - }) - -#undef hlist_for_each_entry -#define hlist_for_each_entry(pos, head, member) \ - for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\ - pos; \ - pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member)) - -#undef hlist_for_each_entry_safe -#define hlist_for_each_entry_safe(pos, n, head, member) \ - for (pos = hlist_entry_safe((head)->first, typeof(*pos), member);\ - pos && ({ n = pos->member.next; 1; }); \ - pos = hlist_entry_safe(n, typeof(*pos), member)) - -#endif - -#ifndef list_first_entry_or_null -#define list_first_entry_or_null(ptr, type, member) \ - (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/mm.h b/datapath/linux/compat/include/linux/mm.h deleted file mode 100644 index 681f3db89..000000000 --- a/datapath/linux/compat/include/linux/mm.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef OVS_MM_H -#define OVS_MM_H - -#include <linux/overflow.h> - -#ifndef HAVE_KVMALLOC_ARRAY -#ifndef HAVE_KVMALLOC_NODE -extern void *vmalloc_node(unsigned long size, int node); -#define kvmalloc_node(a, b, c) vmalloc_node(a, c) -#else -extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -#endif /* HAVE_KVMALLOC_NODE */ -static inline void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} - -static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; - - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; - - return kvmalloc(bytes, flags); -} - -static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} - -#endif -#include_next <linux/mm.h> -#endif /* OVS_MM_H */ - diff --git a/datapath/linux/compat/include/linux/mpls.h b/datapath/linux/compat/include/linux/mpls.h deleted file mode 100644 index ab99ebc30..000000000 --- a/datapath/linux/compat/include/linux/mpls.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _UAPI_MPLS_WRAPPER_H -#define _UAPI_MPLS_WRAPPER_H - - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) -#include_next <linux/mpls.h> -#else - -#include <linux/types.h> -#include <asm/byteorder.h> - -/* Reference: RFC 5462, RFC 3032 - * - * 0 1 2 3 - * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Label | TC |S| TTL | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Label: Label Value, 20 bits - * TC: Traffic Class field, 3 bits - * S: Bottom of Stack, 1 bit - * TTL: Time to Live, 8 bits - */ - -struct mpls_label { - __be32 entry; -}; - -#define MPLS_LS_LABEL_MASK 0xFFFFF000 -#define MPLS_LS_LABEL_SHIFT 12 -#define MPLS_LS_TC_MASK 0x00000E00 -#define MPLS_LS_TC_SHIFT 9 -#define MPLS_LS_S_MASK 0x00000100 -#define MPLS_LS_S_SHIFT 8 -#define MPLS_LS_TTL_MASK 0x000000FF -#define MPLS_LS_TTL_SHIFT 0 -#endif - -#endif /* _UAPI_MPLS_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/net.h b/datapath/linux/compat/include/linux/net.h deleted file mode 100644 index 2a6903d0a..000000000 --- a/datapath/linux/compat/include/linux/net.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __LINUX_NET_WRAPPER_H -#define __LINUX_NET_WRAPPER_H 1 - -#include_next <linux/net.h> -#include <linux/types.h> - -#ifndef net_ratelimited_function -#define net_ratelimited_function(function, ...) \ -do { \ - if (net_ratelimit()) \ - function(__VA_ARGS__); \ -} while (0) - -#define net_emerg_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_emerg, fmt, ##__VA_ARGS__) -#define net_alert_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_alert, fmt, ##__VA_ARGS__) -#define net_crit_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_crit, fmt, ##__VA_ARGS__) -#define net_err_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_err, fmt, ##__VA_ARGS__) -#define net_notice_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_notice, fmt, ##__VA_ARGS__) -#define net_warn_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) -#define net_info_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#define net_dbg_ratelimited(fmt, ...) \ - net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) -#endif - -#ifndef net_get_random_once -#define __net_get_random_once rpl___net_get_random_once -bool rpl___net_get_random_once(void *buf, int nbytes, bool *done, - atomic_t *done_key); - -#define ___NET_RANDOM_STATIC_KEY_INIT ATOMIC_INIT(0) - - -#define net_get_random_once(buf, nbytes) \ -({ \ - bool ___ret = false; \ - static bool ___done = false; \ - static atomic_t ___done_key = \ - ___NET_RANDOM_STATIC_KEY_INIT; \ - if (!atomic_read(&___done_key)) \ - ___ret = __net_get_random_once(buf, \ - nbytes, \ - &___done, \ - &___done_key); \ - ___ret; \ -}) -#endif - -#ifndef HAVE_SOCK_CREATE_KERN_NET -int ovs_sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res); -void ovs_sock_release(struct socket *sock); -#define sock_create_kern ovs_sock_create_kern -#define sock_release ovs_sock_release -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/netdev_features.h b/datapath/linux/compat/include/linux/netdev_features.h deleted file mode 100644 index 411f2949b..000000000 --- a/datapath/linux/compat/include/linux/netdev_features.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef __LINUX_NETDEV_FEATURES_WRAPPER_H -#define __LINUX_NETDEV_FEATURES_WRAPPER_H - -#include_next <linux/netdev_features.h> - -#ifndef NETIF_F_GSO_GRE -#define NETIF_F_GSO_GRE 0 -#endif - -#ifndef NETIF_F_GSO_GRE_CSUM -#define NETIF_F_GSO_GRE_CSUM 0 -#else -#define HAVE_NETIF_F_GSO_GRE_CSUM -#endif - -#ifndef NETIF_F_GSO_IPIP -#define NETIF_F_GSO_IPIP 0 -#endif - -#ifndef NETIF_F_GSO_SIT -#define NETIF_F_GSO_SIT 0 -#endif - -#ifndef NETIF_F_CSUM_MASK -#define NETIF_F_CSUM_MASK 0 -#endif - -#ifndef NETIF_F_GSO_UDP_TUNNEL -#define NETIF_F_GSO_UDP_TUNNEL 0 -#else -#define HAVE_NETIF_F_GSO_UDP_TUNNEL 0 -#endif - -#ifndef NETIF_F_GSO_UDP_TUNNEL_CSUM -#define NETIF_F_GSO_UDP_TUNNEL_CSUM 0 -#define SKB_GSO_UDP_TUNNEL_CSUM 0 -#endif - -#ifndef NETIF_F_GSO_MPLS -#define NETIF_F_GSO_MPLS 0 -#endif - -#ifndef NETIF_F_HW_VLAN_STAG_TX -#define NETIF_F_HW_VLAN_STAG_TX 0 -#endif - -#ifndef NETIF_F_GSO_TUNNEL_REMCSUM -#define NETIF_F_GSO_TUNNEL_REMCSUM 0 -#define SKB_GSO_TUNNEL_REMCSUM 0 -#else -/* support for REM_CSUM is added in 3.19 but API are not defined - * till 4.0, so turn on REMSUM support on kernel 4.0 onwards. - */ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) -#define HAVE_NETIF_F_GSO_TUNNEL_REMCSUM -#endif -#endif - -#ifndef NETIF_F_RXCSUM -#define NETIF_F_RXCSUM 0 -#endif - -#ifndef NETIF_F_GSO_ENCAP_ALL -#define NETIF_F_GSO_ENCAP_ALL (NETIF_F_GSO_GRE | \ - NETIF_F_GSO_GRE_CSUM | \ - NETIF_F_GSO_IPIP | \ - NETIF_F_GSO_SIT | \ - NETIF_F_GSO_UDP_TUNNEL | \ - NETIF_F_GSO_UDP_TUNNEL_CSUM | \ - NETIF_F_GSO_MPLS) -#endif - -#ifndef HAVE_NETIF_F_GSO_GRE_CSUM -#define SKB_GSO_GRE_CSUM 0 -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h deleted file mode 100644 index 126ff23cf..000000000 --- a/datapath/linux/compat/include/linux/netdevice.h +++ /dev/null @@ -1,336 +0,0 @@ -#ifndef __LINUX_NETDEVICE_WRAPPER_H -#define __LINUX_NETDEVICE_WRAPPER_H 1 - -#include_next <linux/netdevice.h> -#include <linux/if_bridge.h> - -struct net; - -#include <linux/version.h> - -#ifndef IFF_TX_SKB_SHARING -#define IFF_TX_SKB_SHARING 0 -#endif - -#ifndef IFF_OVS_DATAPATH -#define IFF_OVS_DATAPATH 0 -#else -#define HAVE_OVS_DATAPATH -#endif - -#ifndef IFF_LIVE_ADDR_CHANGE -#define IFF_LIVE_ADDR_CHANGE 0 -#endif - -#ifndef IFF_OPENVSWITCH -#define IFF_OPENVSWITCH 0 -#endif - -#ifndef to_net_dev -#define to_net_dev(class) container_of(class, struct net_device, NETDEV_DEV_MEMBER) -#endif - -#ifndef HAVE_NET_NAME_UNKNOWN -#undef alloc_netdev -#define NET_NAME_UNKNOWN 0 -#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \ - alloc_netdev_mq(sizeof_priv, name, setup, 1) -#endif - -#ifndef HAVE_DEV_DISABLE_LRO -extern void dev_disable_lro(struct net_device *dev); -#endif - -#ifndef HAVE_DEV_GET_BY_INDEX_RCU -static inline struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex) -{ - struct net_device *dev; - - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); - read_unlock(&dev_base_lock); - - return dev; -} -#endif - -#ifndef NETIF_F_FSO -#define NETIF_F_FSO 0 -#endif - -#ifndef HAVE_NETDEV_FEATURES_T -typedef u32 netdev_features_t; -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) -#define OVS_USE_COMPAT_GSO_SEGMENTATION -#endif - -#ifdef OVS_USE_COMPAT_GSO_SEGMENTATION -/* define compat version to handle MPLS segmentation offload. */ -#define __skb_gso_segment rpl__skb_gso_segment -struct sk_buff *rpl__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, - bool tx_path); - -#define skb_gso_segment rpl_skb_gso_segment -static inline -struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, netdev_features_t features) -{ - return rpl__skb_gso_segment(skb, features, true); -} -#endif - -#ifdef HAVE_NETIF_NEEDS_GSO_NETDEV -#define netif_needs_gso rpl_netif_needs_gso -static inline bool netif_needs_gso(struct sk_buff *skb, - netdev_features_t features) -{ - return skb_is_gso(skb) && (!skb_gso_ok(skb, features) || - unlikely((skb->ip_summed != CHECKSUM_PARTIAL) && - (skb->ip_summed != CHECKSUM_UNNECESSARY))); -} -#endif - -#ifndef HAVE_NETDEV_MASTER_UPPER_DEV_LINK_PRIV -#ifndef HAVE_NETDEV_MASTER_UPPER_DEV_LINK_RH -static inline int rpl_netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, - void *upper_priv, - void *upper_info, void *extack) -{ - return netdev_master_upper_dev_link(dev, upper_dev); -} -#define netdev_master_upper_dev_link rpl_netdev_master_upper_dev_link -#else /* #ifndef HAVE_NETDEV_MASTER_UPPER_DEV_LINK_RH */ -static inline int rpl_netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, - void *upper_priv, - void *upper_info, void *extack) -{ - return netdev_master_upper_dev_link(dev, upper_dev, - upper_priv, upper_info); -} -#undef netdev_master_upper_dev_link -#define netdev_master_upper_dev_link rpl_netdev_master_upper_dev_link -#endif /* #else HAVE_NETDEV_MASTER_UPPER_DEV_LINK_RH */ -#else /* #ifndef HAVE_NETDEV_MASTER_UPPER_DEV_LINK_PRIV */ -#ifndef HAVE_UPPER_DEV_LINK_EXTACK -static inline int rpl_netdev_master_upper_dev_link(struct net_device *dev, - struct net_device *upper_dev, - void *upper_priv, - void *upper_info, void *extack) -{ - return netdev_master_upper_dev_link(dev, upper_dev, upper_priv, - upper_info); -} -#define netdev_master_upper_dev_link rpl_netdev_master_upper_dev_link -#endif /* #ifndef HAVE_UPPER_DEV_LINK_EXTACK */ -#endif /* #else HAVE_NETDEV_MASTER_UPPER_DEV_LINK_PRIV */ - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0) -#define dev_queue_xmit rpl_dev_queue_xmit -int rpl_dev_queue_xmit(struct sk_buff *skb); -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0) -static inline struct net_device *rpl_netdev_notifier_info_to_dev(void *info) -{ - return info; -} -#define netdev_notifier_info_to_dev rpl_netdev_notifier_info_to_dev -#endif - -#ifndef HAVE_PCPU_SW_NETSTATS -#define pcpu_sw_netstats pcpu_tstats -#endif - -#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0) -/* Use compat version for all redhas releases */ -#undef netdev_alloc_pcpu_stats -#endif - -#ifndef netdev_alloc_pcpu_stats -#define netdev_alloc_pcpu_stats(type) \ -({ \ - typeof(type) __percpu *pcpu_stats = alloc_percpu(type); \ - if (pcpu_stats) { \ - int ____i; \ - for_each_possible_cpu(____i) { \ - typeof(type) *stat; \ - stat = per_cpu_ptr(pcpu_stats, ____i); \ - u64_stats_init(&stat->syncp); \ - } \ - } \ - pcpu_stats; \ -}) -#endif - -#ifndef HAVE_DEV_RECURSION_LEVEL -static inline bool dev_recursion_level(void) { return false; } -#endif - -#ifndef NET_NAME_USER -#define NET_NAME_USER 3 -#endif - -#ifndef HAVE_GRO_REMCSUM -struct gro_remcsum { -}; - -#define skb_gro_remcsum_init(grc) -#define skb_gro_remcsum_cleanup(a1, a2) -#else -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) - -#define skb_gro_remcsum_process rpl_skb_gro_remcsum_process -static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - unsigned int off, size_t hdrlen, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) -{ - __wsum delta; - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); - - BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); - - if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; - return ptr; - } - - ptr = skb_gro_header_fast(skb, off); - if (skb_gro_header_hard(skb, off + plen)) { - ptr = skb_gro_header_slow(skb, off + plen, off); - if (!ptr) - return NULL; - } - - delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, - start, offset); - - /* Adjust skb->csum since we changed the packet */ - NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - - grc->offset = off + hdrlen + offset; - grc->delta = delta; - - return ptr; -} -#endif -#endif - -#ifndef HAVE_RTNL_LINK_STATS64 -#define dev_get_stats rpl_dev_get_stats -struct rtnl_link_stats64 *rpl_dev_get_stats(struct net_device *dev, - struct rtnl_link_stats64 *storage); -#endif - -#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0) -/* Only required on RHEL 6. */ -#define dev_get_stats dev_get_stats64 -#endif - -#ifndef netdev_dbg -#define netdev_dbg(__dev, format, args...) \ -do { \ - printk(KERN_DEBUG "%s ", __dev->name); \ - printk(KERN_DEBUG format, ##args); \ -} while (0) -#endif - -#ifndef netdev_info -#define netdev_info(__dev, format, args...) \ -do { \ - printk(KERN_INFO "%s ", __dev->name); \ - printk(KERN_INFO format, ##args); \ -} while (0) - -#endif - -#ifndef USE_UPSTREAM_TUNNEL -#define dev_fill_metadata_dst ovs_dev_fill_metadata_dst -int ovs_dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); -#endif - -#ifndef NETDEV_OFFLOAD_PUSH_VXLAN -#define NETDEV_OFFLOAD_PUSH_VXLAN 0x001C -#endif - -#ifndef NETDEV_OFFLOAD_PUSH_GENEVE -#define NETDEV_OFFLOAD_PUSH_GENEVE 0x001D -#endif - -#ifndef HAVE_IFF_PHONY_HEADROOM - -#define IFF_PHONY_HEADROOM 0 -static inline unsigned netdev_get_fwd_headroom(struct net_device *dev) -{ - return 0; -} - -static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr) -{ -} - -/* set the device rx headroom to the dev's default */ -static inline void netdev_reset_rx_headroom(struct net_device *dev) -{ -} - -#endif - -#ifdef IFF_NO_QUEUE -#define HAVE_IFF_NO_QUEUE -#else -#define IFF_NO_QUEUE 0 -#endif - -#ifndef HAVE_SKB_CSUM_HWOFFLOAD_HELP -static inline int skb_csum_hwoffload_help(struct sk_buff *skb, - const netdev_features_t features) -{ - /* It's less accurate to approximate to this for older kernels, but - * it was sufficient for a long time. If you care about ensuring that - * upstream commit 7529390d08f0 has the same effect on older kernels, - * consider backporting the following commits: - * b72b5bf6a8fc ("net: introduce skb_crc32c_csum_help") - * 43c26a1a4593 ("net: more accurate checksumming in validate_xmit_skb()") - */ - return skb_checksum_help(skb); -} -#endif - -#ifndef HAVE_SKB_GSO_ERROR_UNWIND -static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, - int pulled_hlen, u16 mac_offset, - int mac_len) -{ - skb->protocol = protocol; - skb->encapsulation = 1; - skb_push(skb, pulled_hlen); - skb_reset_transport_header(skb); - skb->mac_header = mac_offset; - skb->network_header = skb->mac_header + mac_len; - skb->mac_len = mac_len; -} -#endif - -#ifndef HAVE_NETIF_KEEP_DST -static inline void netif_keep_dst(struct net_device *dev) -{ -} -#endif - -#ifndef HAVE_DEV_CHANGE_FLAGS_TAKES_EXTACK -static inline int rpl_dev_change_flags(struct net_device *dev, - unsigned int flags, - struct netlink_ext_ack *extack) -{ - return dev_change_flags(dev, flags); -} -#define dev_change_flags rpl_dev_change_flags -#endif - -#endif /* __LINUX_NETDEVICE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/netfilter.h b/datapath/linux/compat/include/linux/netfilter.h deleted file mode 100644 index a6ed6172d..000000000 --- a/datapath/linux/compat/include/linux/netfilter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NETFILTER_WRAPPER_H -#define __NETFILTER_WRAPPER_H - -#include_next <linux/netfilter.h> - -#if !defined(HAVE_NF_HOOK_STATE) || !defined(HAVE_NF_HOOK_STATE_NET) -struct rpl_nf_hook_state { - unsigned int hook; - u_int8_t pf; - struct net_device *in; - struct net_device *out; - struct sock *sk; - struct net *net; - int (*okfn)(struct net *, struct sock *, struct sk_buff *); -}; -#define nf_hook_state rpl_nf_hook_state -#endif - -#endif /* __NETFILTER_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/netfilter_ipv6.h b/datapath/linux/compat/include/linux/netfilter_ipv6.h deleted file mode 100644 index 8d896fbc5..000000000 --- a/datapath/linux/compat/include/linux/netfilter_ipv6.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef __NETFILTER_IPV6_WRAPPER_H -#define __NETFILTER_IPV6_WRAPPER_H 1 - -#include_next <linux/netfilter_ipv6.h> - -#include <linux/version.h> -#include <net/ip.h> /* For OVS_VPORT_OUTPUT_PARAMS */ -#include <net/ip6_route.h> - -#ifndef HAVE_NF_IPV6_OPS_FRAGMENT -/* Try to minimise changes required to the actions.c code for calling IPv6 - * fragmentation. We can keep the fragment() API mostly the same, except that - * the callback parameter needs to be in the form that older kernels accept. - * We don't backport the other ipv6_ops as they're currently unused by OVS. */ -struct ovs_nf_ipv6_ops { - int (*fragment)(struct sock *sk, struct sk_buff *skb, - int (*output)(OVS_VPORT_OUTPUT_PARAMS)); -}; -#define nf_ipv6_ops ovs_nf_ipv6_ops - -static struct ovs_nf_ipv6_ops ovs_ipv6_ops = { - .fragment = ip6_fragment, -}; - -static inline struct ovs_nf_ipv6_ops *ovs_nf_get_ipv6_ops(void) -{ - return &ovs_ipv6_ops; -} -#define nf_get_ipv6_ops ovs_nf_get_ipv6_ops - -#endif /* HAVE_NF_IPV6_OPS_FRAGMENT */ -#endif /* __NETFILTER_IPV6_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/netlink.h b/datapath/linux/compat/include/linux/netlink.h deleted file mode 100644 index a64de4ff8..000000000 --- a/datapath/linux/compat/include/linux/netlink.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __LINUX_NETLINK_WRAPPER_H -#define __LINUX_NETLINK_WRAPPER_H 1 - -#include <linux/skbuff.h> -#include_next <linux/netlink.h> - -#ifndef NLA_TYPE_MASK -#define NLA_F_NESTED (1 << 15) -#define NLA_F_NET_BYTEORDER (1 << 14) -#define NLA_TYPE_MASK (~(NLA_F_NESTED | NLA_F_NET_BYTEORDER)) -#endif - -#include <net/netlink.h> - -#ifndef NLMSG_DEFAULT_SIZE -#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN) -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/overflow.h b/datapath/linux/compat/include/linux/overflow.h deleted file mode 100644 index 13ae6cf6a..000000000 --- a/datapath/linux/compat/include/linux/overflow.h +++ /dev/null @@ -1,313 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR MIT */ -#if defined(HAVE_OVERFLOW_H) && defined(HAVE_STRUCT_SIZE) -#include_next <linux/overflow.h> -#else -#ifndef __LINUX_OVERFLOW_H -#define __LINUX_OVERFLOW_H - -#include <linux/compiler.h> - -/* - * In the fallback code below, we need to compute the minimum and - * maximum values representable in a given type. These macros may also - * be useful elsewhere, so we provide them outside the - * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. - * - * It would seem more obvious to do something like - * - * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) - * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) - * - * Unfortunately, the middle expressions, strictly speaking, have - * undefined behaviour, and at least some versions of gcc warn about - * the type_max expression (but not if -fsanitize=undefined is in - * effect; in that case, the warning is deferred to runtime...). - * - * The slightly excessive casting in type_min is to make sure the - * macros also produce sensible values for the exotic type _Bool. [The - * overflow checkers only almost work for _Bool, but that's - * a-feature-not-a-bug, since people shouldn't be doing arithmetic on - * _Bools. Besides, the gcc builtins don't allow _Bool* as third - * argument.] - * - * Idea stolen from - * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - - * credit to Christian Biere. - */ -#define is_signed_type(type) (((type)(-1)) < (type)1) -#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) -#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) -#define type_min(T) ((T)((T)-type_max(T)-(T)1)) - - -#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW -/* - * For simplicity and code hygiene, the fallback code below insists on - * a, b and *d having the same type (similar to the min() and max() - * macros), whereas gcc's type-generic overflow checkers accept - * different types. Hence we don't just make check_add_overflow an - * alias for __builtin_add_overflow, but add type checks similar to - * below. - */ -#define check_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_add_overflow(__a, __b, __d); \ -}) - -#define check_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_sub_overflow(__a, __b, __d); \ -}) - -#define check_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - __builtin_mul_overflow(__a, __b, __d); \ -}) - -#else - - -/* Checking for unsigned overflow is relatively easy without causing UB. */ -#define __unsigned_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a + __b; \ - *__d < __a; \ -}) -#define __unsigned_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a - __b; \ - __a < __b; \ -}) -/* - * If one of a or b is a compile-time constant, this avoids a division. - */ -#define __unsigned_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = __a * __b; \ - __builtin_constant_p(__b) ? \ - __b > 0 && __a > type_max(typeof(__a)) / __b : \ - __a > 0 && __b > type_max(typeof(__b)) / __a; \ -}) - -/* - * For signed types, detecting overflow is much harder, especially if - * we want to avoid UB. But the interface of these macros is such that - * we must provide a result in *d, and in fact we must produce the - * result promised by gcc's builtins, which is simply the possibly - * wrapped-around value. Fortunately, we can just formally do the - * operations in the widest relevant unsigned type (u64) and then - * truncate the result - gcc is smart enough to generate the same code - * with and without the (u64) casts. - */ - -/* - * Adding two signed integers can overflow only if they have the same - * sign, and overflow has happened iff the result has the opposite - * sign. - */ -#define __signed_add_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a + (u64)__b; \ - (((~(__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Subtraction is similar, except that overflow can now happen only - * when the signs are opposite. In this case, overflow has happened if - * the result has the opposite sign of a. - */ -#define __signed_sub_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a - (u64)__b; \ - ((((__a ^ __b)) & (*__d ^ __a)) \ - & type_min(typeof(__a))) != 0; \ -}) - -/* - * Signed multiplication is rather hard. gcc always follows C99, so - * division is truncated towards 0. This means that we can write the - * overflow check like this: - * - * (a > 0 && (b > MAX/a || b < MIN/a)) || - * (a < -1 && (b > MIN/a || b < MAX/a) || - * (a == -1 && b == MIN) - * - * The redundant casts of -1 are to silence an annoying -Wtype-limits - * (included in -Wextra) warning: When the type is u8 or u16, the - * __b_c_e in check_mul_overflow obviously selects - * __unsigned_mul_overflow, but unfortunately gcc still parses this - * code and warns about the limited range of __b. - */ - -#define __signed_mul_overflow(a, b, d) ({ \ - typeof(a) __a = (a); \ - typeof(b) __b = (b); \ - typeof(d) __d = (d); \ - typeof(a) __tmax = type_max(typeof(a)); \ - typeof(a) __tmin = type_min(typeof(a)); \ - (void) (&__a == &__b); \ - (void) (&__a == __d); \ - *__d = (u64)__a * (u64)__b; \ - (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ - (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ - (__b == (typeof(__b))-1 && __a == __tmin); \ -}) - - -#define check_add_overflow(a, b, d) \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_add_overflow(a, b, d), \ - __unsigned_add_overflow(a, b, d)) - -#define check_sub_overflow(a, b, d) \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_sub_overflow(a, b, d), \ - __unsigned_sub_overflow(a, b, d)) - -#define check_mul_overflow(a, b, d) \ - __builtin_choose_expr(is_signed_type(typeof(a)), \ - __signed_mul_overflow(a, b, d), \ - __unsigned_mul_overflow(a, b, d)) - - -#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ - -/** check_shl_overflow() - Calculate a left-shifted value and check overflow - * - * @a: Value to be shifted - * @s: How many bits left to shift - * @d: Pointer to where to store the result - * - * Computes *@d = (@a << @s) - * - * Returns true if '*d' cannot hold the result or when 'a << s' doesn't - * make sense. Example conditions: - * - 'a << s' causes bits to be lost when stored in *d. - * - 's' is garbage (e.g. negative) or so large that the result of - * 'a << s' is guaranteed to be 0. - * - 'a' is negative. - * - 'a << s' sets the sign bit, if any, in '*d'. - * - * '*d' will hold the results of the attempted shift, but is not - * considered "safe for use" if false is returned. - */ -#define check_shl_overflow(a, s, d) ({ \ - typeof(a) _a = a; \ - typeof(s) _s = s; \ - typeof(d) _d = d; \ - u64 _a_full = _a; \ - unsigned int _to_shift = \ - _s >= 0 && _s < 8 * sizeof(*d) ? _s : 0; \ - *_d = (_a_full << _to_shift); \ - (_to_shift != _s || *_d < 0 || _a < 0 || \ - (*_d >> _to_shift) != _a); \ -}) - -/** - * array_size() - Calculate size of 2-dimensional array. - * - * @a: dimension one - * @b: dimension two - * - * Calculates size of 2-dimensional array: @a * @b. - * - * Returns: number of bytes needed to represent the array or SIZE_MAX on - * overflow. - */ -static inline __must_check size_t array_size(size_t a, size_t b) -{ - size_t bytes; - - if (check_mul_overflow(a, b, &bytes)) - return SIZE_MAX; - - return bytes; -} - -/** - * array3_size() - Calculate size of 3-dimensional array. - * - * @a: dimension one - * @b: dimension two - * @c: dimension three - * - * Calculates size of 3-dimensional array: @a * @b * @c. - * - * Returns: number of bytes needed to represent the array or SIZE_MAX on - * overflow. - */ -static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) -{ - size_t bytes; - - if (check_mul_overflow(a, b, &bytes)) - return SIZE_MAX; - if (check_mul_overflow(bytes, c, &bytes)) - return SIZE_MAX; - - return bytes; -} - -static inline __must_check size_t __ab_c_size(size_t n, size_t size, size_t c) -{ - size_t bytes; - - if (check_mul_overflow(n, size, &bytes)) - return SIZE_MAX; - if (check_add_overflow(bytes, c, &bytes)) - return SIZE_MAX; - - return bytes; -} - -/** - * struct_size() - Calculate size of structure with trailing array. - * @p: Pointer to the structure. - * @member: Name of the array member. - * @n: Number of elements in the array. - * - * Calculates size of memory needed for structure @p followed by an - * array of @n @member elements. - * - * Return: number of bytes needed or SIZE_MAX on overflow. - */ -#define struct_size(p, member, n) \ - __ab_c_size(n, \ - sizeof(*(p)->member) + __must_be_array((p)->member),\ - sizeof(*(p))) - -#endif /* __LINUX_OVERFLOW_H */ -#endif /* defined(HAVE_OVERFLOW_H) && defined(HAVE_STRUCT_SIZE) */ diff --git a/datapath/linux/compat/include/linux/percpu.h b/datapath/linux/compat/include/linux/percpu.h deleted file mode 100644 index a039142e2..000000000 --- a/datapath/linux/compat/include/linux/percpu.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __LINUX_PERCPU_WRAPPER_H -#define __LINUX_PERCPU_WRAPPER_H 1 - -#include_next <linux/percpu.h> - -#if !defined this_cpu_ptr -#define this_cpu_ptr(ptr) per_cpu_ptr(ptr, smp_processor_id()) -#endif - -#if !defined this_cpu_read -#define this_cpu_read(ptr) percpu_read(ptr) -#endif - -#if !defined this_cpu_inc -#define this_cpu_inc(ptr) percpu_add(ptr, 1) -#endif - -#if !defined this_cpu_dec -#define this_cpu_dec(ptr) percpu_sub(ptr, 1) -#endif - -#ifndef alloc_percpu_gfp -#define NEED_ALLOC_PERCPU_GFP - -void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); - -#define alloc_percpu_gfp(type, gfp) \ - (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ - __alignof__(type), gfp) -#endif - - -#endif diff --git a/datapath/linux/compat/include/linux/random.h b/datapath/linux/compat/include/linux/random.h deleted file mode 100644 index 5c088a2d8..000000000 --- a/datapath/linux/compat/include/linux/random.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef __LINUX_RANDOM_WRAPPER_H -#define __LINUX_RANDOM_WRAPPER_H 1 - -#include_next <linux/random.h> - -#ifndef HAVE_PRANDOM_U32 -#define prandom_u32() random32() -#endif - -#ifndef HAVE_PRANDOM_U32_MAX -static inline u32 prandom_u32_max(u32 ep_ro) -{ - return (u32)(((u64) prandom_u32() * ep_ro) >> 32); -} -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/rbtree.h b/datapath/linux/compat/include/linux/rbtree.h deleted file mode 100644 index dbf20ff0e..000000000 --- a/datapath/linux/compat/include/linux/rbtree.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __LINUX_RBTREE_WRAPPER_H -#define __LINUX_RBTREE_WRAPPER_H 1 - -#include_next <linux/rbtree.h> - -#ifndef HAVE_RBTREE_RB_LINK_NODE_RCU -#include <linux/rcupdate.h> - -static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, - struct rb_node **rb_link) -{ - node->__rb_parent_color = (unsigned long)parent; - node->rb_left = node->rb_right = NULL; - - rcu_assign_pointer(*rb_link, node); -} -#endif - -#endif /* __LINUX_RBTREE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/rculist.h b/datapath/linux/compat/include/linux/rculist.h deleted file mode 100644 index 40fd5e171..000000000 --- a/datapath/linux/compat/include/linux/rculist.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __LINUX_RCULIST_WRAPPER_H -#define __LINUX_RCULIST_WRAPPER_H - -#include_next <linux/rculist.h> - -#ifndef hlist_first_rcu -#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) -#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) -#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) -#endif - -/* - * Check during list traversal that we are within an RCU reader - */ - -#define check_arg_count_one(dummy) - -#ifdef CONFIG_PROVE_RCU_LIST -#define __list_check_rcu(dummy, cond, extra...) \ - ({ \ - check_arg_count_one(extra); \ - RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(), \ - "RCU-list traversed in non-reader section!"); \ - }) -#else -#define __list_check_rcu(dummy, cond, extra...) \ - ({ check_arg_count_one(extra); }) -#endif - -#undef hlist_for_each_entry_rcu -#define hlist_for_each_entry_rcu(pos, head, member, cond...) \ - for (__list_check_rcu(dummy, ## cond, 0), \ - pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ - typeof(*(pos)), member); \ - pos; \ - pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ - &(pos)->member)), typeof(*(pos)), member)) - -#endif diff --git a/datapath/linux/compat/include/linux/rcupdate.h b/datapath/linux/compat/include/linux/rcupdate.h deleted file mode 100644 index 85e3c3b76..000000000 --- a/datapath/linux/compat/include/linux/rcupdate.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef __RCUPDATE_WRAPPER_H -#define __RCUPDATE_WRAPPER_H 1 - -#include_next <linux/rcupdate.h> - -#ifndef rcu_dereference_check -#define rcu_dereference_check(p, c) rcu_dereference(p) -#endif - -#ifndef rcu_dereference_protected -#define rcu_dereference_protected(p, c) (p) -#endif - -#ifndef rcu_dereference_raw -#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) -#endif - -#ifndef rcu_access_pointer -#define rcu_access_pointer(p) rcu_dereference(p) -#endif - -#ifndef HAVE_RCU_READ_LOCK_HELD -static inline int rcu_read_lock_held(void) -{ - return 1; -} -#endif - -#ifndef RCU_INITIALIZER -#define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) -#endif - -#ifndef RCU_INIT_POINTER -#define RCU_INIT_POINTER(p, v) \ - do { \ - p = RCU_INITIALIZER(v); \ - } while (0) - -#endif - -#endif /* linux/rcupdate.h wrapper */ diff --git a/datapath/linux/compat/include/linux/reciprocal_div.h b/datapath/linux/compat/include/linux/reciprocal_div.h deleted file mode 100644 index f50d8e4ee..000000000 --- a/datapath/linux/compat/include/linux/reciprocal_div.h +++ /dev/null @@ -1,37 +0,0 @@ -#ifndef _LINUX_RECIPROCAL_DIV_WRAPPER_H -#define _LINUX_RECIPROCAL_DIV_WRAPPER_H 1 - -#include <linux/types.h> - -/* - * This algorithm is based on the paper "Division by Invariant - * Integers Using Multiplication" by Torbjörn Granlund and Peter - * L. Montgomery. - * - * The assembler implementation from Agner Fog, which this code is - * based on, can be found here: - * http://www.agner.org/optimize/asmlib.zip - * - * This optimization for A/B is helpful if the divisor B is mostly - * runtime invariant. The reciprocal of B is calculated in the - * slow-path with reciprocal_value(). The fast-path can then just use - * a much faster multiplication operation with a variable dividend A - * to calculate the division A/B. - */ - -#define reciprocal_value rpl_reciprocal_value -struct reciprocal_value { - u32 m; - u8 sh1, sh2; -}; - -struct reciprocal_value rpl_reciprocal_value(u32 d); - -#define reciprocal_divide rpl_reciprocal_divide -static inline u32 rpl_reciprocal_divide(u32 a, struct reciprocal_value R) -{ - u32 t = (u32)(((u64)a * R.m) >> 32); - return (t + ((a - t) >> R.sh1)) >> R.sh2; -} - -#endif /* _LINUX_RECIPROCAL_DIV_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/rtnetlink.h b/datapath/linux/compat/include/linux/rtnetlink.h deleted file mode 100644 index cd1e1a0c0..000000000 --- a/datapath/linux/compat/include/linux/rtnetlink.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef __RTNETLINK_WRAPPER_H -#define __RTNETLINK_WRAPPER_H 1 - -#include_next <linux/rtnetlink.h> - -#ifndef HAVE_LOCKDEP_RTNL_IS_HELD -#ifdef CONFIG_PROVE_LOCKING -static inline int lockdep_rtnl_is_held(void) -{ - return 1; -} -#endif -#endif - -#ifndef rcu_dereference_rtnl -/** - * rcu_dereference_rtnl - rcu_dereference with debug checking - * @p: The pointer to read, prior to dereferencing - * - * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() - * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() - */ -#define rcu_dereference_rtnl(p) \ - rcu_dereference_check(p, rcu_read_lock_held() || \ - lockdep_rtnl_is_held()) -#endif - -#ifndef rtnl_dereference -/** - * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL - * @p: The pointer to read, prior to dereferencing - * - * Return the value of the specified RCU-protected pointer, but omit - * both the smp_read_barrier_depends() and the ACCESS_ONCE(), because - * caller holds RTNL. - */ -#define rtnl_dereference(p) \ - rcu_dereference_protected(p, lockdep_rtnl_is_held()) -#endif - -#endif /* linux/rtnetlink.h wrapper */ diff --git a/datapath/linux/compat/include/linux/skbuff.h b/datapath/linux/compat/include/linux/skbuff.h deleted file mode 100644 index 396a5e406..000000000 --- a/datapath/linux/compat/include/linux/skbuff.h +++ /dev/null @@ -1,491 +0,0 @@ -#ifndef __LINUX_SKBUFF_WRAPPER_H -#define __LINUX_SKBUFF_WRAPPER_H 1 - -#include <linux/version.h> -#include <linux/types.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) -/* This should be before skbuff.h to make sure that we rewrite - * the calls there. */ -struct sk_buff; - -int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, - gfp_t gfp_mask); -#define pskb_expand_head rpl_pskb_expand_head -#endif - -#include_next <linux/skbuff.h> -#include <linux/jhash.h> - -#ifndef HAVE_IGNORE_DF_RENAME -#define ignore_df local_df -#endif - - -#ifndef HAVE_NULL_COMPUTE_PSEUDO -static inline __wsum null_compute_pseudo(struct sk_buff *skb, int proto) -{ - return 0; -} -#endif - -#ifndef HAVE_SKB_CHECKSUM_CONVERT -static inline bool __skb_checksum_convert_check(struct sk_buff *skb) -{ -#ifdef HAVE_SKBUFF_CSUM_VALID - return (skb->ip_summed == CHECKSUM_NONE && skb->csum_valid); -#else - return skb->ip_summed == CHECKSUM_NONE; -#endif -} - -static inline void __skb_checksum_convert(struct sk_buff *skb, - __sum16 check, __wsum pseudo) -{ - skb->csum = ~pseudo; - skb->ip_summed = CHECKSUM_COMPLETE; -} - -#define skb_checksum_try_convert(skb, proto, check, compute_pseudo) \ -do { \ - if (__skb_checksum_convert_check(skb)) \ - __skb_checksum_convert(skb, check, \ - compute_pseudo(skb, proto)); \ -} while (0) - -#endif - -#ifndef SKB_CHECKSUM_SIMPLE_VALIDATE - -#ifndef __skb_checksum_validate -#define __skb_checksum_validate(skb, proto, complete, \ - zero_okay, check, compute_pseudo) \ -({ \ - __sum16 __ret = 0; \ - __ret; \ -}) -#endif - -#define skb_checksum_simple_validate(skb) \ - __skb_checksum_validate(skb, 0, true, false, 0, null_compute_pseudo) -#endif - -#ifndef HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET -static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb, - const int offset, void *to, - const unsigned int len) -{ - memcpy(to, skb->data + offset, len); -} - -static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb, - const int offset, - const void *from, - const unsigned int len) -{ - memcpy(skb->data + offset, from, len); -} - -#endif /* !HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET */ - -#ifndef HAVE_SKB_INNER_TRANSPORT_OFFSET -static inline int skb_inner_transport_offset(const struct sk_buff *skb) -{ - return skb_inner_transport_header(skb) - skb->data; -} -#endif - -#ifndef HAVE_SKB_RESET_TAIL_POINTER -static inline void skb_reset_tail_pointer(struct sk_buff *skb) -{ - skb->tail = skb->data; -} -#endif -/* - * The networking layer reserves some headroom in skb data (via - * dev_alloc_skb). This is used to avoid having to reallocate skb data when - * the header has to grow. In the default case, if the header has to grow - * 16 bytes or less we avoid the reallocation. - * - * Unfortunately this headroom changes the DMA alignment of the resulting - * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive - * on some architectures. An architecture can override this value, - * perhaps setting it to a cacheline in size (since that will maintain - * cacheline alignment of the DMA). It must be a power of 2. - * - * Various parts of the networking layer expect at least 16 bytes of - * headroom, you should not reduce this. - */ -#ifndef NET_SKB_PAD -#define NET_SKB_PAD 16 -#endif - -#ifndef HAVE_SKB_COW_HEAD -static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, - int cloned) -{ - int delta = 0; - - if (headroom < NET_SKB_PAD) - headroom = NET_SKB_PAD; - if (headroom > skb_headroom(skb)) - delta = headroom - skb_headroom(skb); - - if (delta || cloned) - return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, - GFP_ATOMIC); - return 0; -} - -static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) -{ - return __skb_cow(skb, headroom, skb_header_cloned(skb)); -} -#endif /* !HAVE_SKB_COW_HEAD */ - -#ifndef HAVE_SKB_DST_ACCESSOR_FUNCS -static inline struct dst_entry *skb_dst(const struct sk_buff *skb) -{ - return (struct dst_entry *)skb->dst; -} - -static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) -{ - skb->dst = dst; -} - -static inline struct rtable *skb_rtable(const struct sk_buff *skb) -{ - return (struct rtable *)skb->dst; -} -#endif - -#ifndef CHECKSUM_PARTIAL -#define CHECKSUM_PARTIAL CHECKSUM_HW -#endif -#ifndef CHECKSUM_COMPLETE -#define CHECKSUM_COMPLETE CHECKSUM_HW -#endif - -#ifndef HAVE_SKB_WARN_LRO -#ifndef NETIF_F_LRO -static inline bool skb_warn_if_lro(const struct sk_buff *skb) -{ - return false; -} -#else -extern void __skb_warn_lro_forwarding(const struct sk_buff *skb); - -static inline bool skb_warn_if_lro(const struct sk_buff *skb) -{ - /* LRO sets gso_size but not gso_type, whereas if GSO is really - * wanted then gso_type will be set. */ - struct skb_shared_info *shinfo = skb_shinfo(skb); - if (shinfo->gso_size != 0 && unlikely(shinfo->gso_type == 0)) { - __skb_warn_lro_forwarding(skb); - return true; - } - return false; -} -#endif /* NETIF_F_LRO */ -#endif /* HAVE_SKB_WARN_LRO */ - -#ifndef HAVE_CONSUME_SKB -#define consume_skb kfree_skb -#endif - -#ifndef HAVE_SKB_FRAG_PAGE -#include <linux/mm.h> - -static inline struct page *skb_frag_page(const skb_frag_t *frag) -{ - return frag->page; -} - -static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page) -{ - frag->page = page; -} -static inline void skb_frag_size_set(skb_frag_t *frag, unsigned int size) -{ - frag->size = size; -} -static inline void __skb_frag_ref(skb_frag_t *frag) -{ - get_page(skb_frag_page(frag)); -} -static inline void __skb_frag_unref(skb_frag_t *frag) -{ - put_page(skb_frag_page(frag)); -} - -static inline void skb_frag_ref(struct sk_buff *skb, int f) -{ - __skb_frag_ref(&skb_shinfo(skb)->frags[f]); -} - -static inline void skb_frag_unref(struct sk_buff *skb, int f) -{ - __skb_frag_unref(&skb_shinfo(skb)->frags[f]); -} - -#endif - -#ifndef HAVE_SKB_RESET_MAC_LEN -static inline void skb_reset_mac_len(struct sk_buff *skb) -{ - skb->mac_len = skb->network_header - skb->mac_header; -} -#endif - -#ifndef HAVE_SKB_UNCLONE -static inline int skb_unclone(struct sk_buff *skb, gfp_t pri) -{ - might_sleep_if(pri & __GFP_WAIT); - - if (skb_cloned(skb)) - return pskb_expand_head(skb, 0, 0, pri); - - return 0; -} -#endif - -#ifndef HAVE_SKB_ORPHAN_FRAGS -static inline int skb_orphan_frags(struct sk_buff *skb, gfp_t gfp_mask) -{ - return 0; -} -#endif - -#ifndef HAVE_SKB_GET_HASH -#define skb_get_hash skb_get_rxhash -#endif /* HAVE_SKB_GET_HASH */ - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) -#define skb_zerocopy_headlen rpl_skb_zerocopy_headlen -unsigned int rpl_skb_zerocopy_headlen(const struct sk_buff *from); -#endif - -#ifndef HAVE_SKB_ZEROCOPY -#define skb_zerocopy rpl_skb_zerocopy -int rpl_skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, - int hlen); -#endif - -#ifndef HAVE_SKB_CLEAR_HASH -static inline void skb_clear_hash(struct sk_buff *skb) -{ -#ifdef HAVE_RXHASH - skb->rxhash = 0; -#endif - skb->l4_hash = 0; -} -#endif - -#ifndef HAVE_SKB_HAS_FRAG_LIST -#define skb_has_frag_list skb_has_frags -#endif - -#ifndef HAVE___SKB_FILL_PAGE_DESC -static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, - struct page *page, int off, int size) -{ - skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - __skb_frag_set_page(frag, page); - frag->page_offset = off; - skb_frag_size_set(frag, size); -} -#endif - -#ifndef HAVE_SKB_ENSURE_WRITABLE -#define skb_ensure_writable rpl_skb_ensure_writable -int rpl_skb_ensure_writable(struct sk_buff *skb, int write_len); -#endif - -#ifndef HAVE___SKB_VLAN_POP -#define __skb_vlan_pop rpl___skb_vlan_pop -int rpl___skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci); -#endif - -#ifndef HAVE_SKB_VLAN_POP -#define skb_vlan_pop rpl_skb_vlan_pop -int rpl_skb_vlan_pop(struct sk_buff *skb); -#endif - -#ifndef HAVE_SKB_VLAN_PUSH -#define skb_vlan_push rpl_skb_vlan_push -int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci); -#endif - -#ifndef HAVE_KFREE_SKB_LIST -void rpl_kfree_skb_list(struct sk_buff *segs); -#define kfree_skb_list rpl_kfree_skb_list -#endif - -#ifndef HAVE_SKB_CHECKSUM_START_OFFSET -static inline int skb_checksum_start_offset(const struct sk_buff *skb) -{ - return skb->csum_start - skb_headroom(skb); -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) -#define skb_postpull_rcsum rpl_skb_postpull_rcsum -static inline void skb_postpull_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); - else if (skb->ip_summed == CHECKSUM_PARTIAL && - skb_checksum_start_offset(skb) < 0) - skb->ip_summed = CHECKSUM_NONE; -} - -#define skb_pull_rcsum rpl_skb_pull_rcsum -static inline unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len) -{ - unsigned char *data = skb->data; - - BUG_ON(len > skb->len); - __skb_pull(skb, len); - skb_postpull_rcsum(skb, data, len); - return skb->data; -} - -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) -#define skb_scrub_packet rpl_skb_scrub_packet -void rpl_skb_scrub_packet(struct sk_buff *skb, bool xnet); -#endif - -#define skb_pop_mac_header rpl_skb_pop_mac_header -static inline void skb_pop_mac_header(struct sk_buff *skb) -{ - skb->mac_header = skb->network_header; -} - -#ifndef HAVE_SKB_CLEAR_HASH_IF_NOT_L4 -static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) -{ - if (!skb->l4_hash) - skb_clear_hash(skb); -} -#endif - -#ifndef HAVE_SKB_POSTPUSH_RCSUM -static inline void skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) -{ - /* For performing the reverse operation to skb_postpull_rcsum(), - * we can instead of ... - * - * skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); - * - * ... just use this equivalent version here to save a few - * instructions. Feeding csum of 0 in csum_partial() and later - * on adding skb->csum is equivalent to feed skb->csum in the - * first place. - */ - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_partial(start, len, skb->csum); -} -#endif - -#define skb_checksum_start rpl_skb_checksum_start -static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) -{ - return skb->head + skb->csum_start; -} - -#ifndef HAVE_LCO_CSUM -static inline __wsum lco_csum(struct sk_buff *skb) -{ - unsigned char *csum_start = skb_checksum_start(skb); - unsigned char *l4_hdr = skb_transport_header(skb); - __wsum partial; - - /* Start with complement of inner checksum adjustment */ - partial = ~csum_unfold(*(__force __sum16 *)(csum_start + - skb->csum_offset)); - - /* Add in checksum of our headers (incl. outer checksum - * adjustment filled in by caller) and return result. - */ - return csum_partial(l4_hdr, csum_start - l4_hdr, partial); -} -#endif - -#ifndef HAVE_SKB_NFCT -static inline struct nf_conntrack *skb_nfct(const struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - return skb->nfct; -#else - return NULL; -#endif -} -#endif - -#ifndef HAVE_SKB_PUT_ZERO -static inline void *skb_put_zero(struct sk_buff *skb, unsigned int len) -{ - void *tmp = skb_put(skb, len); - - memset(tmp, 0, len); - - return tmp; -} -#endif - -#ifndef HAVE_SKB_GSO_IPXIP6 -#define SKB_GSO_IPXIP6 (1 << 10) -#endif - -#ifndef HAVE_SKB_SET_INNER_IPPROTO -static inline void skb_set_inner_ipproto(struct sk_buff *skb, - __u8 ipproto) -{ -} -#endif - -#ifndef HAVE_NF_RESET_CT -#define nf_reset_ct nf_reset -#endif - -#ifndef HAVE___SKB_SET_HASH -static inline void -__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) -{ -#ifdef HAVE_RXHASH - skb->rxhash = hash; -#else - skb->hash = hash; -#endif - skb->l4_hash = is_l4; -#ifdef HAVE_SW_HASH - skb->sw_hash = is_sw; -#endif -} -#endif - -#ifndef HAVE_SKB_GET_HASH_RAW -static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) -{ -#ifdef HAVE_RXHASH - return skb->rxhash; -#else - return skb->hash; -#endif -} -#endif - -#ifndef skb_list_walk_safe -/* Iterate through singly-linked GSO fragments of an skb. */ -#define skb_list_walk_safe(first, skb, next_skb) \ - for ((skb) = (first), (next_skb) = (skb) ? (skb)->next : NULL; (skb); \ - (skb) = (next_skb), (next_skb) = (skb) ? (skb)->next : NULL) -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/static_key.h b/datapath/linux/compat/include/linux/static_key.h deleted file mode 100644 index 432feccb9..000000000 --- a/datapath/linux/compat/include/linux/static_key.h +++ /dev/null @@ -1,86 +0,0 @@ -#ifndef _STATIC_KEY_WRAPPER_H -#define _STATIC_KEY_WRAPPER_H - -#include <linux/atomic.h> -#include_next <linux/static_key.h> -#ifndef HAVE_UPSTREAM_STATIC_KEY -/* - * This backport is based on upstream net-next commit 11276d5306b8 - * ("locking/static_keys: Add a new static_key interface"). - * - * For kernel that does not support the new static key interface, - * we do not backport the jump label support but the fall back version - * of static key that is simply a conditional branch. - */ - -struct static_key_true { - struct static_key key; -}; - -struct static_key_false { - struct static_key key; -}; - -#define rpl_STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) } -#define rpl_STATIC_KEY_INIT_FALSE { .enabled = ATOMIC_INIT(0) } - -#define rpl_STATIC_KEY_TRUE_INIT \ - (struct static_key_true) { .key = rpl_STATIC_KEY_INIT_TRUE, } -#define rpl_STATIC_KEY_FALSE_INIT \ - (struct static_key_false){ .key = rpl_STATIC_KEY_INIT_FALSE, } - -#define rpl_DEFINE_STATIC_KEY_TRUE(name) \ - struct static_key_true name = rpl_STATIC_KEY_TRUE_INIT - -#define rpl_DEFINE_STATIC_KEY_FALSE(name) \ - struct static_key_false name = rpl_STATIC_KEY_FALSE_INIT - -static inline int rpl_static_key_count(struct static_key *key) -{ - return atomic_read(&key->enabled); -} - -static inline void rpl_static_key_enable(struct static_key *key) -{ - int count = rpl_static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (!count) - static_key_slow_inc(key); -} - -static inline void rpl_static_key_disable(struct static_key *key) -{ - int count = rpl_static_key_count(key); - - WARN_ON_ONCE(count < 0 || count > 1); - - if (count) - static_key_slow_dec(key); -} - -#ifdef HAVE_DEFINE_STATIC_KEY -#undef DEFINE_STATIC_KEY_TRUE -#undef DEFINE_STATIC_KEY_FALSE -#endif - -#define DEFINE_STATIC_KEY_TRUE rpl_DEFINE_STATIC_KEY_TRUE -#define DEFINE_STATIC_KEY_FALSE rpl_DEFINE_STATIC_KEY_FALSE - -#define static_branch_likely(x) likely(static_key_enabled(&(x)->key)) -#define static_branch_unlikely(x) unlikely(static_key_enabled(&(x)->key)) - -#define static_branch_enable(x) rpl_static_key_enable(&(x)->key) -#define static_branch_disable(x) rpl_static_key_disable(&(x)->key) - -#ifndef HAVE_DECLARE_STATIC_KEY -#define DECLARE_STATIC_KEY_TRUE(name) \ - extern struct static_key_true name -#define DECLARE_STATIC_KEY_FALSE(name) \ - extern struct static_key_false name -#endif - -#endif /* HAVE_UPSTREAM_STATIC_KEY */ - -#endif /* _STATIC_KEY_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/stddef.h b/datapath/linux/compat/include/linux/stddef.h deleted file mode 100644 index 5b44c0dee..000000000 --- a/datapath/linux/compat/include/linux/stddef.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef __LINUX_STDDEF_WRAPPER_H -#define __LINUX_STDDEF_WRAPPER_H 1 - -#include_next <linux/stddef.h> - -#ifdef __KERNEL__ - -#ifndef offsetofend -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) -#endif - -#endif /* __KERNEL__ */ - -#endif diff --git a/datapath/linux/compat/include/linux/timekeeping.h b/datapath/linux/compat/include/linux/timekeeping.h deleted file mode 100644 index 3a3b18331..000000000 --- a/datapath/linux/compat/include/linux/timekeeping.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _LINUX_TIMEKEEPING_WRAPPER_H -#define _LINUX_TIMEKEEPING_WRAPPER_H - -#ifndef HAVE_KTIME_GET_TS64 -#define ktime_get_ts64 ktime_get_ts -#define timespec64 timespec -#else -#include_next <linux/timekeeping.h> -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/types.h b/datapath/linux/compat/include/linux/types.h deleted file mode 100644 index a58623e70..000000000 --- a/datapath/linux/compat/include/linux/types.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef __LINUX_TYPES_WRAPPER_H -#define __LINUX_TYPES_WRAPPER_H 1 - -#include_next <linux/types.h> - -#ifndef HAVE_CSUM_TYPES -typedef __u16 __bitwise __sum16; -typedef __u32 __bitwise __wsum; -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/u64_stats_sync.h b/datapath/linux/compat/include/linux/u64_stats_sync.h deleted file mode 100644 index 9342f73d0..000000000 --- a/datapath/linux/compat/include/linux/u64_stats_sync.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef _LINUX_U64_STATS_SYNC_WRAPPER_H -#define _LINUX_U64_STATS_SYNC_WRAPPER_H - -#include <linux/version.h> - -#if defined(HAVE_U64_STATS_FETCH_BEGIN_IRQ) && \ - LINUX_VERSION_CODE >= KERNEL_VERSION(3,13,0) -#include_next <linux/u64_stats_sync.h> -#else - -/* - * To properly implement 64bits network statistics on 32bit and 64bit hosts, - * we provide a synchronization point, that is a noop on 64bit or UP kernels. - * - * Key points : - * 1) Use a seqcount on SMP 32bits, with low overhead. - * 2) Whole thing is a noop on 64bit arches or UP kernels. - * 3) Write side must ensure mutual exclusion or one seqcount update could - * be lost, thus blocking readers forever. - * If this synchronization point is not a mutex, but a spinlock or - * spinlock_bh() or disable_bh() : - * 3.1) Write side should not sleep. - * 3.2) Write side should not allow preemption. - * 3.3) If applicable, interrupts should be disabled. - * - * 4) If reader fetches several counters, there is no guarantee the whole values - * are consistent (remember point 1) : this is a noop on 64bit arches anyway) - * - * 5) readers are allowed to sleep or be preempted/interrupted : They perform - * pure reads. But if they have to fetch many values, it's better to not allow - * preemptions/interruptions to avoid many retries. - * - * 6) If counter might be written by an interrupt, readers should block interrupts. - * (On UP, there is no seqcount_t protection, a reader allowing interrupts could - * read partial values) - * - * 7) For irq or softirq uses, readers can use u64_stats_fetch_begin_irq() and - * u64_stats_fetch_retry_irq() helpers - * - * Usage : - * - * Stats producer (writer) should use following template granted it already got - * an exclusive access to counters (a lock is already taken, or per cpu - * data is used [in a non preemptable context]) - * - * spin_lock_bh(...) or other synchronization to get exclusive access - * ... - * u64_stats_update_begin(&stats->syncp); - * stats->bytes64 += len; // non atomic operation - * stats->packets64++; // non atomic operation - * u64_stats_update_end(&stats->syncp); - * - * While a consumer (reader) should use following template to get consistent - * snapshot for each variable (but no guarantee on several ones) - * - * u64 tbytes, tpackets; - * unsigned int start; - * - * do { - * start = u64_stats_fetch_begin(&stats->syncp); - * tbytes = stats->bytes64; // non atomic operation - * tpackets = stats->packets64; // non atomic operation - * } while (u64_stats_fetch_retry(&stats->syncp, start)); - * - * - * Example of use in drivers/net/loopback.c, using per_cpu containers, - * in BH disabled context. - */ -#include <linux/seqlock.h> - -struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_t seq; -#endif -}; - -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) -# define u64_stats_init(syncp) seqcount_init(syncp.seq) -#else -# define u64_stats_init(syncp) do { } while (0) -#endif - -static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - write_seqcount_begin(&syncp->seq); -#endif -} - -static inline void u64_stats_update_end(struct u64_stats_sync *syncp) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - write_seqcount_end(&syncp->seq); -#endif -} - -static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_begin(&syncp->seq); -#else -#if BITS_PER_LONG==32 - preempt_disable(); -#endif - return 0; -#endif -} - -static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, - unsigned int start) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_retry(&syncp->seq, start); -#else -#if BITS_PER_LONG==32 - preempt_enable(); -#endif - return false; -#endif -} - -/* - * In case irq handlers can update u64 counters, readers can use following helpers - * - SMP 32bit arches use seqcount protection, irq safe. - * - UP 32bit must disable irqs. - * - 64bit have no problem atomically reading u64 values, irq safe. - */ -static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_begin(&syncp->seq); -#else -#if BITS_PER_LONG==32 - local_irq_disable(); -#endif - return 0; -#endif -} - -static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_retry(&syncp->seq, start); -#else -#if BITS_PER_LONG==32 - local_irq_enable(); -#endif - return false; -#endif -} - -#endif /* !HAVE_U64_STATS_FETCH_BEGIN_IRQ || kernel < 3.13 */ - -#endif /* _LINUX_U64_STATS_SYNC_WRAPPER_H */ diff --git a/datapath/linux/compat/include/linux/udp.h b/datapath/linux/compat/include/linux/udp.h deleted file mode 100644 index 22e57d4c0..000000000 --- a/datapath/linux/compat/include/linux/udp.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __LINUX_UDP_WRAPPER_H -#define __LINUX_UDP_WRAPPER_H 1 - -#include_next <linux/udp.h> -#include <linux/ipv6.h> - -#ifndef HAVE_NO_CHECK6_TX -static inline void udp_set_no_check6_tx(struct sock *sk, bool val) -{ -#ifdef HAVE_SK_NO_CHECK_TX - sk->sk_no_check_tx = val; -#endif -} - -static inline void udp_set_no_check6_rx(struct sock *sk, bool val) -{ -#ifdef HAVE_SK_NO_CHECK_TX - sk->sk_no_check_rx = val; -#else - /* since netwroking stack is not checking for zero UDP checksum - * check it in OVS module. */ - #define OVS_CHECK_UDP_TUNNEL_ZERO_CSUM -#endif -} -#endif - -#ifdef OVS_CHECK_UDP_TUNNEL_ZERO_CSUM -#define udp6_csum_zero_error rpl_udp6_csum_zero_error - -void rpl_udp6_csum_zero_error(struct sk_buff *skb); -#endif - -#endif diff --git a/datapath/linux/compat/include/linux/workqueue.h b/datapath/linux/compat/include/linux/workqueue.h deleted file mode 100644 index ed573c226..000000000 --- a/datapath/linux/compat/include/linux/workqueue.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __LINUX_WORKQUEUE_WRAPPER_H -#define __LINUX_WORKQUEUE_WRAPPER_H 1 - -#include_next <linux/workqueue.h> - -#endif diff --git a/datapath/linux/compat/include/net/checksum.h b/datapath/linux/compat/include/net/checksum.h deleted file mode 100644 index d1f1125d1..000000000 --- a/datapath/linux/compat/include/net/checksum.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef __NET_CHECKSUM_WRAPPER_H -#define __NET_CHECKSUM_WRAPPER_H 1 - -#include_next <net/checksum.h> - -#ifndef HAVE_CSUM_UNFOLD -static inline __wsum csum_unfold(__sum16 n) -{ - return (__force __wsum)n; -} -#endif /* !HAVE_CSUM_UNFOLD */ - -/* Workaround for debugging included in certain versions of XenServer. It only - * applies to 32-bit x86. - */ -#if defined(HAVE_CSUM_COPY_DBG) && defined(CONFIG_X86_32) -#define csum_and_copy_to_user(src, dst, len, sum, err_ptr) \ - csum_and_copy_to_user(src, dst, len, sum, NULL, err_ptr) -#endif - -#ifndef HAVE_CSUM_REPLACE4 -static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) -{ - __be32 diff[] = { ~from, to }; - - *sum = csum_fold(csum_partial((char *)diff, sizeof(diff), ~csum_unfold(*sum))); -} - -static inline void csum_replace2(__sum16 *sum, __be16 from, __be16 to) -{ - csum_replace4(sum, (__force __be32)from, (__force __be32)to); -} -#endif - -#ifndef CSUM_MANGLED_0 -#define CSUM_MANGLED_0 ((__force __sum16)0xffff) -#endif - -#endif /* checksum.h */ diff --git a/datapath/linux/compat/include/net/dst.h b/datapath/linux/compat/include/net/dst.h deleted file mode 100644 index af78a6ca6..000000000 --- a/datapath/linux/compat/include/net/dst.h +++ /dev/null @@ -1,77 +0,0 @@ -#ifndef __NET_DST_WRAPPER_H -#define __NET_DST_WRAPPER_H 1 - -#include <linux/version.h> -#include_next <net/dst.h> - -#ifndef HAVE_SKB_DST_ACCESSOR_FUNCS - -static inline void skb_dst_drop(struct sk_buff *skb) -{ - if (skb->dst) - dst_release(skb_dst(skb)); - skb->dst = NULL; -} - -#endif - -#ifndef DST_OBSOLETE_NONE -#define DST_OBSOLETE_NONE 0 -#endif - -#ifndef DST_NOCOUNT -#define DST_NOCOUNT 0 -#endif - -#if !defined(HAVE___SKB_DST_COPY) -static inline void __skb_dst_copy(struct sk_buff *nskb, unsigned long refdst) -{ - nskb->_skb_refdst = refdst; - if (!(nskb->_skb_refdst & SKB_DST_NOREF)) - dst_clone(skb_dst(nskb)); -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) -static const u32 rpl_dst_default_metrics[RTAX_MAX + 1] = { - /* This initializer is needed to force linker to place this variable - * into const section. Otherwise it might end into bss section. - * We really want to avoid false sharing on this variable, and catch - * any writes on it. - */ - [RTAX_MAX] = 0xdeadbeef, -}; -#define dst_default_metrics rpl_dst_default_metrics - -static inline void rpl_dst_init(struct dst_entry *dst, struct dst_ops *ops, - struct net_device *dev, int initial_ref, - int initial_obsolete, unsigned short flags) -{ - /* XXX: It's easier to handle compatibility by zeroing, as we can - * refer to fewer fields. Do that here. - */ - memset(dst, 0, sizeof *dst); - - dst->dev = dev; - if (dev) - dev_hold(dev); - dst->ops = ops; - dst_init_metrics(dst, dst_default_metrics, true); - dst->path = dst; - dst->input = dst_discard; -#ifndef HAVE_DST_DISCARD_SK - dst->output = dst_discard; -#else - dst->output = dst_discard_sk; -#endif - dst->obsolete = initial_obsolete; - atomic_set(&dst->__refcnt, initial_ref); - dst->lastuse = jiffies; - dst->flags = flags; - if (!(flags & DST_NOCOUNT)) - dst_entries_add(ops, 1); -} -#define dst_init rpl_dst_init -#endif - -#endif diff --git a/datapath/linux/compat/include/net/dst_cache.h b/datapath/linux/compat/include/net/dst_cache.h deleted file mode 100644 index 6084d4eea..000000000 --- a/datapath/linux/compat/include/net/dst_cache.h +++ /dev/null @@ -1,114 +0,0 @@ -#ifndef _NET_DST_CACHE_WRAPPER_H -#define _NET_DST_CACHE_WRAPPER_H - -#ifdef USE_BUILTIN_DST_CACHE -#include_next <net/dst_cache.h> -#else - -#include <linux/jiffies.h> -#include <net/dst.h> -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ip6_fib.h> -#endif - -#ifdef USE_UPSTREAM_TUNNEL -#include_next <net/dst_cache.h> - -#else -struct dst_cache { - struct dst_cache_pcpu __percpu *cache; - unsigned long reset_ts; -}; - -/** - * dst_cache_get - perform cache lookup - * @dst_cache: the cache - * - * The caller should use dst_cache_get_ip4() if it need to retrieve the - * source address to be used when xmitting to the cached dst. - * local BH must be disabled. - */ -#define rpl_dst_cache_get dst_cache_get -struct dst_entry *rpl_dst_cache_get(struct dst_cache *dst_cache); - -/** - * dst_cache_get_ip4 - perform cache lookup and fetch ipv4 source address - * @dst_cache: the cache - * @saddr: return value for the retrieved source address - * - * local BH must be disabled. - */ -#define rpl_dst_cache_get_ip4 dst_cache_get_ip4 -struct rtable *rpl_dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr); - -/** - * dst_cache_set_ip4 - store the ipv4 dst into the cache - * @dst_cache: the cache - * @dst: the entry to be cached - * @saddr: the source address to be stored inside the cache - * - * local BH must be disabled. - */ -#define rpl_dst_cache_set_ip4 dst_cache_set_ip4 -void rpl_dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst, - __be32 saddr); - -#if IS_ENABLED(CONFIG_IPV6) - -/** - * dst_cache_set_ip6 - store the ipv6 dst into the cache - * @dst_cache: the cache - * @dst: the entry to be cached - * @saddr: the source address to be stored inside the cache - * - * local BH must be disabled. - */ -#define rpl_dst_cache_set_ip6 dst_cache_set_ip6 -void rpl_dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst, - const struct in6_addr *addr); - -/** - * dst_cache_get_ip6 - perform cache lookup and fetch ipv6 source address - * @dst_cache: the cache - * @saddr: return value for the retrieved source address - * - * local BH must be disabled. - */ -#define rpl_dst_cache_get_ip6 dst_cache_get_ip6 -struct dst_entry *rpl_dst_cache_get_ip6(struct dst_cache *dst_cache, - struct in6_addr *saddr); -#endif - -/** - * dst_cache_reset - invalidate the cache contents - * @dst_cache: the cache - * - * This do not free the cached dst to avoid races and contentions. - * the dst will be freed on later cache lookup. - */ -static inline void dst_cache_reset(struct dst_cache *dst_cache) -{ - dst_cache->reset_ts = jiffies; -} - -/** - * dst_cache_init - initialize the cache, allocating the required storage - * @dst_cache: the cache - * @gfp: allocation flags - */ -#define rpl_dst_cache_init dst_cache_init -int rpl_dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp); - -/** - * dst_cache_destroy - empty the cache and free the allocated storage - * @dst_cache: the cache - * - * No synchronization is enforced: it must be called only when the cache - * is unsed. - */ -#define rpl_dst_cache_destroy dst_cache_destroy -void rpl_dst_cache_destroy(struct dst_cache *dst_cache); - -#endif /* USE_UPSTREAM_TUNNEL */ -#endif /* USE_BUILTIN_DST_CACHE */ -#endif diff --git a/datapath/linux/compat/include/net/dst_metadata.h b/datapath/linux/compat/include/net/dst_metadata.h deleted file mode 100644 index 4ffafccce..000000000 --- a/datapath/linux/compat/include/net/dst_metadata.h +++ /dev/null @@ -1,269 +0,0 @@ -#ifndef __NET_DST_METADATA_WRAPPER_H -#define __NET_DST_METADATA_WRAPPER_H 1 - -#ifdef USE_UPSTREAM_TUNNEL -#include_next <net/dst_metadata.h> -#else -#include <linux/skbuff.h> - -#include <net/dsfield.h> -#include <net/dst.h> -#include <net/ipv6.h> -#include <net/ip_tunnels.h> - -enum metadata_type { - METADATA_IP_TUNNEL, - METADATA_HW_PORT_MUX, -}; - -struct hw_port_info { - struct net_device *lower_dev; - u32 port_id; -}; - -struct metadata_dst { - struct dst_entry dst; - enum metadata_type type; - union { - struct ip_tunnel_info tun_info; - struct hw_port_info port_info; - } u; -}; - -#ifndef DST_METADATA -#define DST_METADATA 0x0080 -#endif - -extern struct dst_ops md_dst_ops; - -static void rpl__metadata_dst_init(struct metadata_dst *md_dst, - enum metadata_type type, u8 optslen) - -{ - struct dst_entry *dst; - - dst = &md_dst->dst; - dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, - DST_METADATA | DST_NOCOUNT); - -#if 0 - /* unused in OVS */ - dst->input = dst_md_discard; - dst->output = dst_md_discard_out; -#endif - memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); - md_dst->type = type; -} - -static struct -metadata_dst *__rpl_metadata_dst_alloc(u8 optslen, - enum metadata_type type, - gfp_t flags) -{ - struct metadata_dst *md_dst; - - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); - if (!md_dst) - return NULL; - - rpl__metadata_dst_init(md_dst, type, optslen); - - return md_dst; -} -static inline struct metadata_dst *rpl_tun_rx_dst(int md_size) -{ - struct metadata_dst *tun_dst; - - tun_dst = __rpl_metadata_dst_alloc(md_size, METADATA_IP_TUNNEL, - GFP_ATOMIC); - if (!tun_dst) - return NULL; - - tun_dst->u.tun_info.options_len = 0; - tun_dst->u.tun_info.mode = 0; - return tun_dst; -} -static inline struct metadata_dst *rpl__ip_tun_set_dst(__be32 saddr, - __be32 daddr, - __u8 tos, __u8 ttl, - __be16 tp_dst, - __be16 flags, - __be64 tunnel_id, - int md_size) -{ - struct metadata_dst *tun_dst; - - tun_dst = rpl_tun_rx_dst(md_size); - if (!tun_dst) - return NULL; - - ip_tunnel_key_init(&tun_dst->u.tun_info.key, - saddr, daddr, tos, ttl, - 0, 0, tp_dst, tunnel_id, flags); - return tun_dst; -} - -static inline struct metadata_dst *rpl_ip_tun_rx_dst(struct sk_buff *skb, - __be16 flags, - __be64 tunnel_id, - int md_size) -{ - const struct iphdr *iph = ip_hdr(skb); - - return rpl__ip_tun_set_dst(iph->saddr, iph->daddr, iph->tos, iph->ttl, - 0, flags, tunnel_id, md_size); -} - -static inline -struct metadata_dst *rpl__ipv6_tun_set_dst(const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 tos, __u8 ttl, - __be16 tp_dst, - __be32 label, - __be16 flags, - __be64 tunnel_id, - int md_size) -{ - struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; - - tun_dst = rpl_tun_rx_dst(md_size); - if (!tun_dst) - return NULL; - - info = &tun_dst->u.tun_info; - info->mode = IP_TUNNEL_INFO_IPV6; - info->key.tun_flags = flags; - info->key.tun_id = tunnel_id; - info->key.tp_src = 0; - info->key.tp_dst = tp_dst; - - info->key.u.ipv6.src = *saddr; - info->key.u.ipv6.dst = *daddr; - - info->key.tos = tos; - info->key.ttl = ttl; - info->key.label = label; - - return tun_dst; -} - -static inline struct metadata_dst *rpl_ipv6_tun_rx_dst(struct sk_buff *skb, - __be16 flags, - __be64 tunnel_id, - int md_size) -{ - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - - return rpl__ipv6_tun_set_dst(&ip6h->saddr, &ip6h->daddr, - ipv6_get_dsfield(ip6h), ip6h->hop_limit, - 0, ip6_flowlabel(ip6h), flags, tunnel_id, - md_size); -} - -static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) -{ - struct dst_entry *dst; - - dst = &md_dst->dst; - -#if 0 - dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, - DST_METADATA | DST_NOCACHE | DST_NOCOUNT); - - dst->input = dst_md_discard; - dst->output = dst_md_discard_out; -#endif - - memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); -} - -static inline struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) -{ - struct metadata_dst *md_dst; - - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); - if (!md_dst) - return NULL; - - __metadata_dst_init(md_dst, optslen); - return md_dst; -} - -#define skb_tunnel_info ovs_skb_tunnel_info - -static inline void ovs_tun_rx_dst(struct metadata_dst *md_dst, int optslen) -{ - /* No need to allocate for OVS backport case. */ -#if 0 - struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; - - tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC); - if (!tun_dst) - return NULL; -#endif - __metadata_dst_init(md_dst, optslen); -} - -static inline void ovs_ip_tun_rx_dst(struct metadata_dst *md_dst, - struct sk_buff *skb, __be16 flags, - __be64 tunnel_id, int md_size) -{ - const struct iphdr *iph = ip_hdr(skb); - - ovs_tun_rx_dst(md_dst, md_size); - ip_tunnel_key_init(&md_dst->u.tun_info.key, - iph->saddr, iph->daddr, iph->tos, iph->ttl, 0, - 0, 0, tunnel_id, flags); -} - -static inline void ovs_ipv6_tun_rx_dst(struct metadata_dst *md_dst, - struct sk_buff *skb, - __be16 flags, - __be64 tunnel_id, - int md_size) -{ - struct ip_tunnel_info *info = &md_dst->u.tun_info; - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - - ovs_tun_rx_dst(md_dst, md_size); - info->mode = IP_TUNNEL_INFO_IPV6; - info->key.tun_flags = flags; - info->key.tun_id = tunnel_id; - info->key.tp_src = 0; - info->key.tp_dst = 0; - - info->key.u.ipv6.src = ip6h->saddr; - info->key.u.ipv6.dst = ip6h->daddr; - - info->key.tos = ipv6_get_dsfield(ip6h); - info->key.ttl = ip6h->hop_limit; - info->key.label = ip6_flowlabel(ip6h); -} - -#endif /* USE_UPSTREAM_TUNNEL */ - -void ovs_ip_tunnel_rcv(struct net_device *dev, struct sk_buff *skb, - struct metadata_dst *tun_dst); - -static inline struct metadata_dst * -rpl_metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags) -{ -#if defined(HAVE_METADATA_DST_ALLOC_WITH_METADATA_TYPE) && defined(USE_UPSTREAM_TUNNEL) - return metadata_dst_alloc(optslen, type, flags); -#else - return metadata_dst_alloc(optslen, flags); -#endif -} -#define metadata_dst_alloc rpl_metadata_dst_alloc - -static inline bool rpl_skb_valid_dst(const struct sk_buff *skb) -{ - struct dst_entry *dst = skb_dst(skb); - - return dst && !(dst->flags & DST_METADATA); -} -#define skb_valid_dst rpl_skb_valid_dst - -#endif /* __NET_DST_METADATA_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/erspan.h b/datapath/linux/compat/include/net/erspan.h deleted file mode 100644 index 4a6a8f240..000000000 --- a/datapath/linux/compat/include/net/erspan.h +++ /dev/null @@ -1,342 +0,0 @@ -#ifndef USE_UPSTREAM_TUNNEL -#ifndef __LINUX_ERSPAN_H -#define __LINUX_ERSPAN_H - -/* - * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes - * 0 1 2 3 - * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |0|0|0|1|0|00000|000000000|00000| Protocol Type for ERSPAN | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Sequence Number (increments per packet per session) | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Note that in the above GRE header [RFC1701] out of the C, R, K, S, - * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The - * other fields are set to zero, so only a sequence number follows. - * - * ERSPAN Version 1 (Type II) header (8 octets [42:49]) - * 0 1 2 3 - * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Ver | VLAN | COS | En|T| Session ID | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Reserved | Index | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * - * ERSPAN Version 2 (Type III) header (12 octets [42:49]) - * 0 1 2 3 - * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Ver | VLAN | COS |BSO|T| Session ID | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Timestamp | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | SGT |P| FT | Hw ID |D|Gra|O| - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Platform Specific SubHeader (8 octets, optional) - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Platf ID | Platform Specific Info | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Platform Specific Info | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB - */ - -/* #include <uapi/linux/erspan.h> */ -/* Just insert uapi/linux/erspan.h here since - * we don't pull in uapi to compat - */ -/* ERSPAN version 2 metadata header */ -struct erspan_md2 { - __be32 timestamp; - __be16 sgt; /* security group tag */ -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 hwid_upper:2, - ft:5, - p:1; - __u8 o:1, - gra:2, - dir:1, - hwid:4; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 p:1, - ft:5, - hwid_upper:2; - __u8 hwid:4, - dir:1, - gra:2, - o:1; -#else -#error "Please fix <asm/byteorder.h>" -#endif -}; - -struct erspan_metadata { - int version; - union { - __be32 index; /* Version 1 (type II)*/ - struct erspan_md2 md2; /* Version 2 (type III) */ - } u; -}; - -#define ERSPAN_VERSION 0x1 /* ERSPAN type II */ -#define VER_MASK 0xf000 -#define VLAN_MASK 0x0fff -#define COS_MASK 0xe000 -#define EN_MASK 0x1800 -#define T_MASK 0x0400 -#define ID_MASK 0x03ff -#define INDEX_MASK 0xfffff - -#define ERSPAN_VERSION2 0x2 /* ERSPAN type III*/ -#define BSO_MASK EN_MASK -#define SGT_MASK 0xffff0000 -#define P_MASK 0x8000 -#define FT_MASK 0x7c00 -#define HWID_MASK 0x03f0 -#define DIR_MASK 0x0008 -#define GRA_MASK 0x0006 -#define O_MASK 0x0001 - -#define HWID_OFFSET 4 -#define DIR_OFFSET 3 - -enum erspan_encap_type { - ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ - ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ - ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ - ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag perserved in frame */ -}; - -#define ERSPAN_V1_MDSIZE 4 -#define ERSPAN_V2_MDSIZE 8 - -struct erspan_base_hdr { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u8 vlan_upper:4, - ver:4; - __u8 vlan:8; - __u8 session_id_upper:2, - t:1, - en:2, - cos:3; - __u8 session_id:8; -#elif defined(__BIG_ENDIAN_BITFIELD) - __u8 ver: 4, - vlan_upper:4; - __u8 vlan:8; - __u8 cos:3, - en:2, - t:1, - session_id_upper:2; - __u8 session_id:8; -#else -#error "Please fix <asm/byteorder.h>" -#endif -}; - -static inline void set_session_id(struct erspan_base_hdr *ershdr, u16 id) -{ - ershdr->session_id = id & 0xff; - ershdr->session_id_upper = (id >> 8) & 0x3; -} - -static inline u16 get_session_id(const struct erspan_base_hdr *ershdr) -{ - return (ershdr->session_id_upper << 8) + ershdr->session_id; -} - -static inline void set_vlan(struct erspan_base_hdr *ershdr, u16 vlan) -{ - ershdr->vlan = vlan & 0xff; - ershdr->vlan_upper = (vlan >> 8) & 0xf; -} - -static inline u16 get_vlan(const struct erspan_base_hdr *ershdr) -{ - return (ershdr->vlan_upper << 8) + ershdr->vlan; -} - -static inline void set_hwid(struct erspan_md2 *md2, u8 hwid) -{ - md2->hwid = hwid & 0xf; - md2->hwid_upper = (hwid >> 4) & 0x3; -} - -static inline u8 get_hwid(const struct erspan_md2 *md2) -{ - return (md2->hwid_upper << 4) + md2->hwid; -} - -static inline int erspan_hdr_len(int version) -{ - return sizeof(struct erspan_base_hdr) + - (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); -} - -static inline u8 tos_to_cos(u8 tos) -{ - u8 dscp, cos; - - dscp = tos >> 2; - cos = dscp >> 3; - return cos; -} - -static inline void erspan_build_header(struct sk_buff *skb, - u32 id, u32 index, - bool truncate, bool is_ipv4) -{ - struct ethhdr *eth = (struct ethhdr *)skb->data; - enum erspan_encap_type enc_type; - struct erspan_base_hdr *ershdr; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - u8 tos; - __be32 *idx; - - tos = is_ipv4 ? ip_hdr(skb)->tos : - (ipv6_hdr(skb)->priority << 4) + - (ipv6_hdr(skb)->flow_lbl[0] >> 4); - - enc_type = ERSPAN_ENCAP_NOVLAN; - - /* If mirrored packet has vlan tag, extract tci and - * perserve vlan header in the mirrored frame. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - enc_type = ERSPAN_ENCAP_INFRAME; - } - - skb_push(skb, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); - ershdr = (struct erspan_base_hdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); - - /* Build base header */ - ershdr->ver = ERSPAN_VERSION; - ershdr->cos = tos_to_cos(tos); - ershdr->en = enc_type; - ershdr->t = truncate; - set_vlan(ershdr, vlan_tci); - set_session_id(ershdr, id); - - /* Build metadata */ - idx = (__be32 *)(ershdr + 1); - *idx = htonl(index & INDEX_MASK); -} - -/* ERSPAN GRA: timestamp granularity - * 00b --> granularity = 100 microseconds - * 01b --> granularity = 100 nanoseconds - * 10b --> granularity = IEEE 1588 - * Here we only support 100 microseconds. - */ -static inline __be32 erspan_get_timestamp(void) -{ - u64 h_usecs; - ktime_t kt; - - kt = ktime_get_real(); - h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC); - - /* ERSPAN base header only has 32-bit, - * so it wraps around 4 days. - */ - return htonl((u32)h_usecs); -} - -/* ERSPAN BSO (Bad/Short/Oversized), see RFC1757 - * 00b --> Good frame with no error, or unknown integrity - * 01b --> Payload is a Short Frame - * 10b --> Payload is an Oversized Frame - * 11b --> Payload is a Bad Frame with CRC or Alignment Error - */ -enum erspan_bso { - BSO_NOERROR = 0x0, - BSO_SHORT = 0x1, - BSO_OVERSIZED = 0x2, - BSO_BAD = 0x3, -}; - -static inline u8 erspan_detect_bso(struct sk_buff *skb) -{ - /* BSO_BAD is not handled because the frame CRC - * or alignment error information is in FCS. - */ - if (skb->len < ETH_ZLEN) - return BSO_SHORT; - - if (skb->len > ETH_FRAME_LEN) - return BSO_OVERSIZED; - - return BSO_NOERROR; -} - -static inline void erspan_build_header_v2(struct sk_buff *skb, - u32 id, u8 direction, u16 hwid, - bool truncate, bool is_ipv4) -{ - struct ethhdr *eth = (struct ethhdr *)skb->data; - struct erspan_base_hdr *ershdr; - struct erspan_md2 *md2; - struct qtag_prefix { - __be16 eth_type; - __be16 tci; - } *qp; - u16 vlan_tci = 0; - u8 gra = 0; /* 100 usec */ - u8 bso = truncate; /* Bad/Short/Oversized */ - u8 sgt = 0; - u8 tos; - - tos = is_ipv4 ? ip_hdr(skb)->tos : - (ipv6_hdr(skb)->priority << 4) + - (ipv6_hdr(skb)->flow_lbl[0] >> 4); - - /* Unlike v1, v2 does not have En field, - * so only extract vlan tci field. - */ - if (eth->h_proto == htons(ETH_P_8021Q)) { - qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); - vlan_tci = ntohs(qp->tci); - } - - bso = erspan_detect_bso(skb); - skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); - ershdr = (struct erspan_base_hdr *)skb->data; - memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); - - /* Build base header */ - ershdr->ver = ERSPAN_VERSION2; - ershdr->cos = tos_to_cos(tos); - ershdr->en = bso; - ershdr->t = truncate; - set_vlan(ershdr, vlan_tci); - set_session_id(ershdr, id); - - /* Build metadata */ - md2 = (struct erspan_md2 *)(ershdr + 1); - md2->timestamp = erspan_get_timestamp(); - md2->sgt = htons(sgt); - md2->p = 1; - md2->ft = 0; - md2->dir = direction; - md2->gra = gra; - md2->o = 0; - set_hwid(md2, hwid); -} - -#endif -#else -#include_next <net/erspan.h> -#endif diff --git a/datapath/linux/compat/include/net/genetlink.h b/datapath/linux/compat/include/net/genetlink.h deleted file mode 100644 index 602ce38d3..000000000 --- a/datapath/linux/compat/include/net/genetlink.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef __NET_GENERIC_NETLINK_WRAPPER_H -#define __NET_GENERIC_NETLINK_WRAPPER_H 1 - -#include <linux/version.h> -#include <linux/netlink.h> -#include <net/net_namespace.h> -#include_next <net/genetlink.h> - -#ifndef HAVE_GENL_NOTIFY_TAKES_FAMILY -struct rpl_genl_family { - struct genl_family compat_family; - unsigned int id; - unsigned int hdrsize; - char name[GENL_NAMSIZ]; - unsigned int version; - unsigned int maxattr; - bool netnsok; - bool parallel_ops; - int (*pre_doit)(const struct genl_ops *ops, - struct sk_buff *skb, - struct genl_info *info); - void (*post_doit)(const struct genl_ops *ops, - struct sk_buff *skb, - struct genl_info *info); - struct nlattr ** attrbuf; /* private */ - const struct genl_ops * ops; /* private */ - const struct genl_multicast_group *mcgrps; /* private */ - unsigned int n_ops; /* private */ - unsigned int n_mcgrps; /* private */ - unsigned int mcgrp_offset; /* private */ - struct list_head family_list; /* private */ - struct module *module; -}; - -#define genl_family rpl_genl_family -static inline void *rpl_genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, - struct genl_family *family, int flags, u8 cmd) -{ - return genlmsg_put(skb, portid, seq, &family->compat_family, flags, cmd); -} - -#define genlmsg_put rpl_genlmsg_put - -static inline int rpl_genl_unregister_family(struct genl_family *family) -{ - return genl_unregister_family(&family->compat_family); -} -#define genl_unregister_family rpl_genl_unregister_family - -#define genl_set_err rpl_genl_set_err -static inline int genl_set_err(struct genl_family *family, struct net *net, - u32 portid, u32 group, int code) -{ -#ifdef HAVE_VOID_NETLINK_SET_ERR - netlink_set_err(net->genl_sock, portid, group, code); - return 0; -#else - return netlink_set_err(net->genl_sock, portid, group, code); -#endif -} - -#define genlmsg_multicast_netns rpl_genlmsg_multicast_netns -static inline int genlmsg_multicast_netns(struct genl_family *family, - struct net *net, struct sk_buff *skb, - u32 portid, unsigned int group, gfp_t flags) -{ - return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); -} - - -#define __genl_register_family rpl___genl_register_family -int rpl___genl_register_family(struct genl_family *family); - -#define genl_register_family rpl_genl_register_family -static inline int rpl_genl_register_family(struct genl_family *family) -{ - family->module = THIS_MODULE; - return rpl___genl_register_family(family); -} -#endif - -#ifdef HAVE_GENL_NOTIFY_TAKES_NET -#define genl_notify rpl_genl_notify -void rpl_genl_notify(struct genl_family *family, struct sk_buff *skb, - struct genl_info *info , u32 group, gfp_t flags); -#endif - -#ifndef HAVE_GENL_HAS_LISTENERS -static inline int genl_has_listeners(struct genl_family *family, - struct net *net, unsigned int group) -{ -#ifdef HAVE_MCGRP_OFFSET - if (WARN_ON_ONCE(group >= family->n_mcgrps)) - return -EINVAL; - group = family->mcgrp_offset + group; -#endif - return netlink_has_listeners(net->genl_sock, group); -} -#else - -#ifndef HAVE_GENL_HAS_LISTENERS_TAKES_NET -static inline int rpl_genl_has_listeners(struct genl_family *family, - struct net *net, unsigned int group) -{ -#ifdef HAVE_GENL_NOTIFY_TAKES_FAMILY - return genl_has_listeners(family, net->genl_sock, group); -#else - return genl_has_listeners(&family->compat_family, net->genl_sock, group); -#endif -} - -#define genl_has_listeners rpl_genl_has_listeners -#endif - -#endif /* HAVE_GENL_HAS_LISTENERS */ - -#ifndef HAVE_NETLINK_EXT_ACK -struct netlink_ext_ack; - -static inline int rpl_genlmsg_parse(const struct nlmsghdr *nlh, - const struct genl_family *family, - struct nlattr *tb[], int maxtype, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) -{ -#ifdef HAVE_GENLMSG_PARSE - return genlmsg_parse(nlh, family, tb, maxtype, policy); -#else - return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, - policy); -#endif -} -#define genlmsg_parse rpl_genlmsg_parse -#endif - -#endif /* genetlink.h */ diff --git a/datapath/linux/compat/include/net/geneve.h b/datapath/linux/compat/include/net/geneve.h deleted file mode 100644 index d9c9f0bf7..000000000 --- a/datapath/linux/compat/include/net/geneve.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef __NET_GENEVE_WRAPPER_H -#define __NET_GENEVE_WRAPPER_H 1 - -#ifdef CONFIG_INET -#include <net/udp_tunnel.h> -#endif - - -#ifdef USE_UPSTREAM_TUNNEL -#include_next <net/geneve.h> - -static inline int rpl_geneve_init_module(void) -{ - return 0; -} -static inline void rpl_geneve_cleanup_module(void) -{} - -#define geneve_xmit dev_queue_xmit - -#ifdef CONFIG_INET -#ifndef HAVE_NAME_ASSIGN_TYPE -static inline struct net_device *rpl_geneve_dev_create_fb( - struct net *net, const char *name, u8 name_assign_type, u16 dst_port) { - return geneve_dev_create_fb(net, name, dst_port); -} -#define geneve_dev_create_fb rpl_geneve_dev_create_fb -#endif -#endif - -#else -/* Geneve Header: - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |Ver| Opt Len |O|C| Rsvd. | Protocol Type | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Virtual Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Variable Length Options | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Option Header: - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Option Class | Type |R|R|R| Length | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Variable Option Data | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - */ - -struct geneve_opt { - __be16 opt_class; - u8 type; -#ifdef __LITTLE_ENDIAN_BITFIELD - u8 length:5; - u8 r3:1; - u8 r2:1; - u8 r1:1; -#else - u8 r1:1; - u8 r2:1; - u8 r3:1; - u8 length:5; -#endif - u8 opt_data[]; -}; - -#define GENEVE_CRIT_OPT_TYPE (1 << 7) - -struct genevehdr { -#ifdef __LITTLE_ENDIAN_BITFIELD - u8 opt_len:6; - u8 ver:2; - u8 rsvd1:6; - u8 critical:1; - u8 oam:1; -#else - u8 ver:2; - u8 opt_len:6; - u8 oam:1; - u8 critical:1; - u8 rsvd1:6; -#endif - __be16 proto_type; - u8 vni[3]; - u8 rsvd2; - struct geneve_opt options[]; -}; - -#ifdef CONFIG_INET -#define geneve_dev_create_fb rpl_geneve_dev_create_fb -struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port); -#endif /*ifdef CONFIG_INET */ - -int rpl_geneve_init_module(void); -void rpl_geneve_cleanup_module(void); - -#define geneve_xmit rpl_geneve_xmit -netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb); - -#endif -#define geneve_init_module rpl_geneve_init_module -#define geneve_cleanup_module rpl_geneve_cleanup_module - -#define geneve_fill_metadata_dst ovs_geneve_fill_metadata_dst -int ovs_geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); - -#endif /*ifdef__NET_GENEVE_H */ diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h deleted file mode 100644 index 57293b6c2..000000000 --- a/datapath/linux/compat/include/net/gre.h +++ /dev/null @@ -1,191 +0,0 @@ -#ifndef __LINUX_GRE_WRAPPER_H -#define __LINUX_GRE_WRAPPER_H - -#include <linux/version.h> -#include <linux/skbuff.h> -#include <net/ip_tunnels.h> - -#ifdef USE_UPSTREAM_TUNNEL -#include_next <net/gre.h> - -static inline int rpl_ipgre_init(void) -{ - return 0; -} -static inline void rpl_ipgre_fini(void) -{} - -static inline int rpl_ip6gre_init(void) -{ - return 0; -} - -static inline void rpl_ip6gre_fini(void) -{} - -static inline int rpl_ip6_tunnel_init(void) -{ - return 0; -} - -static inline void rpl_ip6_tunnel_cleanup(void) -{ -} - -static inline int rpl_gre_init(void) -{ - return 0; -} - -static inline void rpl_gre_exit(void) -{ -} - -#define gre_fb_xmit dev_queue_xmit - -#ifdef CONFIG_INET -#ifndef HAVE_NAME_ASSIGN_TYPE -static inline struct net_device *rpl_gretap_fb_dev_create( - struct net *net, const char *name, u8 name_assign_type) { - return gretap_fb_dev_create(net, name); -} -#define gretap_fb_dev_create rpl_gretap_fb_dev_create -#endif -#endif - -#else -#include_next <net/gre.h> - -#ifndef HAVE_GRE_CALC_HLEN -static inline int gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags & TUNNEL_CSUM) - addend += 4; - if (o_flags & TUNNEL_KEY) - addend += 4; - if (o_flags & TUNNEL_SEQ) - addend += 4; - return addend; -} - -#define ip_gre_calc_hlen gre_calc_hlen -#else -#ifdef HAVE_IP_GRE_CALC_HLEN -#define gre_calc_hlen ip_gre_calc_hlen -#endif -#endif - -#define tnl_flags_to_gre_flags rpl_tnl_flags_to_gre_flags -static inline __be16 rpl_tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} - -#define gre_flags_to_tnl_flags rpl_gre_flags_to_tnl_flags -static inline __be16 rpl_gre_flags_to_tnl_flags(__be16 flags) -{ - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; -} -#define gre_tnl_flags_to_gre_flags rpl_gre_tnl_flags_to_gre_flags -static inline __be16 rpl_gre_tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} - -#define gre_build_header rpl_gre_build_header -void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len); - -int rpl_ipgre_init(void); -void rpl_ipgre_fini(void); -int rpl_ip6gre_init(void); -void rpl_ip6gre_fini(void); -int rpl_ip6_tunnel_init(void); -void rpl_ip6_tunnel_cleanup(void); -int rpl_gre_init(void); -void rpl_gre_exit(void); - -#define gretap_fb_dev_create rpl_gretap_fb_dev_create -struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name, - u8 name_assign_type); - -#define gre_parse_header rpl_gre_parse_header -int rpl_gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err, __be16 proto, int nhs); - -#define gre_fb_xmit rpl_gre_fb_xmit -netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb); - -#define gre_add_protocol rpl_gre_add_protocol -int rpl_gre_add_protocol(const struct gre_protocol *proto, u8 version); -#define gre_del_protocol rpl_gre_del_protocol -int rpl_gre_del_protocol(const struct gre_protocol *proto, u8 version); -#endif /* USE_UPSTREAM_TUNNEL */ - -#define ipgre_init rpl_ipgre_init -#define ipgre_fini rpl_ipgre_fini -#define ip6gre_init rpl_ip6gre_init -#define ip6gre_fini rpl_ip6gre_fini -#define ip6_tunnel_init rpl_ip6_tunnel_init -#define ip6_tunnel_cleanup rpl_ip6_tunnel_cleanup -#define gre_init rpl_gre_init -#define gre_exit rpl_gre_exit - -#define gre_fill_metadata_dst ovs_gre_fill_metadata_dst -int ovs_gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); - - -#endif diff --git a/datapath/linux/compat/include/net/inet_ecn.h b/datapath/linux/compat/include/net/inet_ecn.h deleted file mode 100644 index f0591b322..000000000 --- a/datapath/linux/compat/include/net/inet_ecn.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef _INET_ECN_WRAPPER_H_ -#define _INET_ECN_WRAPPER_H_ - -#include_next <net/inet_ecn.h> - -#define INET_ECN_decapsulate rpl_INET_ECN_decapsulate -static inline int INET_ECN_decapsulate(struct sk_buff *skb, - __u8 outer, __u8 inner) -{ - if (INET_ECN_is_not_ect(inner)) { - switch (outer & INET_ECN_MASK) { - case INET_ECN_NOT_ECT: - return 0; - case INET_ECN_ECT_0: - case INET_ECN_ECT_1: - return 1; - case INET_ECN_CE: - return 2; - } - } - - if (INET_ECN_is_ce(outer)) - INET_ECN_set_ce(skb); - - return 0; -} - -#define IP_ECN_decapsulate rpl_IP_ECN_decapsulate -static inline int IP_ECN_decapsulate(const struct iphdr *oiph, - struct sk_buff *skb) -{ - __u8 inner; - - if (skb->protocol == htons(ETH_P_IP)) - inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else - return 0; - - return INET_ECN_decapsulate(skb, oiph->tos, inner); -} - -#define IP6_ECN_decapsulate rpl_IP6_ECN_decapsulate -static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, - struct sk_buff *skb) -{ - __u8 inner; - - if (skb->protocol == htons(ETH_P_IP)) - inner = ip_hdr(skb)->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - inner = ipv6_get_dsfield(ipv6_hdr(skb)); - else - return 0; - - return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); -} -#endif diff --git a/datapath/linux/compat/include/net/inet_frag.h b/datapath/linux/compat/include/net/inet_frag.h deleted file mode 100644 index 00784da2b..000000000 --- a/datapath/linux/compat/include/net/inet_frag.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef __NET_INET_FRAG_WRAPPER_H -#define __NET_INET_FRAG_WRAPPER_H 1 - -#include <linux/version.h> -#include_next <net/inet_frag.h> - -#ifdef HAVE_INET_FRAGS_LAST_IN -#define q_flags(q) (q->last_in) -#define qp_flags(qp) (qp->q.last_in) -#else -#define q_flags(q) (q->flags) -#define qp_flags(qp) (qp->q.flags) -#endif - -#ifndef HAVE_CORRECT_MRU_HANDLING -#ifndef HAVE_INET_FRAG_EVICTING -static inline bool inet_frag_evicting(struct inet_frag_queue *q) -{ -#ifdef HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR - return !hlist_unhashed(&q->list_evictor); -#else - return (q_flags(q) & INET_FRAG_FIRST_IN) && q->fragments != NULL; -#endif /* HAVE_INET_FRAG_QUEUE_WITH_LIST_EVICTOR */ -} -#endif /* HAVE_INET_FRAG_EVICTING */ -#endif /* HAVE_CORRECT_MRU_HANDLING */ - -/* Upstream commit 3fd588eb90bf ("inet: frag: remove lru list") dropped this - * function, but we call it from our compat code. Provide a noop version. */ -#ifndef HAVE_INET_FRAG_LRU_MOVE -#define inet_frag_lru_move(q) -#endif - -#ifdef HAVE_INET_FRAG_FQDIR -#define netns_frags fqdir -#endif - -#ifndef HAVE_SUB_FRAG_MEM_LIMIT_ARG_STRUCT_NETNS_FRAGS -#ifdef HAVE_FRAG_PERCPU_COUNTER_BATCH -static inline void rpl_sub_frag_mem_limit(struct netns_frags *nf, int i) -{ - __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); -} -#define sub_frag_mem_limit rpl_sub_frag_mem_limit - -static inline void rpl_add_frag_mem_limit(struct netns_frags *nf, int i) -{ - __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); -} -#define add_frag_mem_limit rpl_add_frag_mem_limit -#else /* !frag_percpu_counter_batch */ -static inline void rpl_sub_frag_mem_limit(struct netns_frags *nf, int i) -{ -#ifdef HAVE_INET_FRAG_FQDIR - atomic_long_sub(i, &nf->mem); -#else - atomic_sub(i, &nf->mem); -#endif -} -#define sub_frag_mem_limit rpl_sub_frag_mem_limit - -static inline void rpl_add_frag_mem_limit(struct netns_frags *nf, int i) -{ -#ifdef HAVE_INET_FRAG_FQDIR - atomic_long_add(i, &nf->mem); -#else - atomic_add(i, &nf->mem); -#endif -} -#define add_frag_mem_limit rpl_add_frag_mem_limit -#endif /* frag_percpu_counter_batch */ -#endif - -#ifdef HAVE_VOID_INET_FRAGS_INIT -static inline int rpl_inet_frags_init(struct inet_frags *frags) -{ - inet_frags_init(frags); - return 0; -} -#define inet_frags_init rpl_inet_frags_init -#endif - -#endif /* inet_frag.h */ diff --git a/datapath/linux/compat/include/net/inetpeer.h b/datapath/linux/compat/include/net/inetpeer.h deleted file mode 100644 index c5f5eb12b..000000000 --- a/datapath/linux/compat/include/net/inetpeer.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _NET_INETPEER_WRAPPER_H -#define _NET_INETPEER_WRAPPER_H - -#include_next <net/inetpeer.h> - -#ifndef HAVE_INETPEER_VIF_SUPPORT -static inline struct inet_peer *rpl_inet_getpeer_v4(struct inet_peer_base *base, - __be32 v4daddr, int vif, - int create) -{ - return inet_getpeer_v4(base, v4daddr, create); -} -#define inet_getpeer_v4 rpl_inet_getpeer_v4 -#endif /* HAVE_INETPEER_VIF_SUPPORT */ - -#endif /* _NET_INETPEER_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/ip.h b/datapath/linux/compat/include/net/ip.h deleted file mode 100644 index ad5ac33ee..000000000 --- a/datapath/linux/compat/include/net/ip.h +++ /dev/null @@ -1,143 +0,0 @@ -#ifndef __NET_IP_WRAPPER_H -#define __NET_IP_WRAPPER_H 1 - -#include_next <net/ip.h> - -#include <net/route.h> -#include <linux/version.h> - -#ifndef HAVE_INET_GET_LOCAL_PORT_RANGE_USING_NET -static inline void rpl_inet_get_local_port_range(struct net *net, int *low, - int *high) -{ - inet_get_local_port_range(low, high); -} -#define inet_get_local_port_range rpl_inet_get_local_port_range - -#endif - -#ifndef IPSKB_FRAG_PMTU -#define IPSKB_FRAG_PMTU BIT(6) -#endif - -/* IPv4 datagram length is stored into 16bit field (tot_len) */ -#ifndef IP_MAX_MTU -#define IP_MAX_MTU 0xFFFFU -#endif - -#ifndef HAVE_IP_SKB_DST_MTU -static inline bool rpl_ip_sk_use_pmtu(const struct sock *sk) -{ - return inet_sk(sk)->pmtudisc < IP_PMTUDISC_PROBE; -} -#define ip_sk_use_pmtu rpl_ip_sk_use_pmtu - -static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, - bool forwarding) -{ -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) - struct net *net = dev_net(dst->dev); - - if (net->ipv4.sysctl_ip_fwd_use_pmtu || - dst_metric_locked(dst, RTAX_MTU) || - !forwarding) - return dst_mtu(dst); -#endif - - return min(dst->dev->mtu, IP_MAX_MTU); -} - -static inline unsigned int rpl_ip_skb_dst_mtu(const struct sk_buff *skb) -{ - if (!skb->sk || ip_sk_use_pmtu(skb->sk)) { - bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; - return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); - } else { - return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU); - } -} -#define ip_skb_dst_mtu rpl_ip_skb_dst_mtu -#endif /* HAVE_IP_SKB_DST_MTU */ - -#ifdef HAVE_IP_FRAGMENT_TAKES_SOCK -#ifdef HAVE_IP_LOCAL_OUT_TAKES_NET -#define OVS_VPORT_OUTPUT_PARAMS struct net *net, struct sock *sock, struct sk_buff *skb -#else -#define OVS_VPORT_OUTPUT_PARAMS struct sock *sock, struct sk_buff *skb -#endif -#else -#define OVS_VPORT_OUTPUT_PARAMS struct sk_buff *skb -#endif - -/* Prior to upstream commit d6b915e29f4a ("ip_fragment: don't forward - * defragmented DF packet"), IPCB(skb)->frag_max_size was not always populated - * correctly, which would lead to reassembled packets not being refragmented. - * So, we backport all of ip_defrag() in these cases. - */ -#ifndef HAVE_CORRECT_MRU_HANDLING - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0) -static inline bool ip_defrag_user_in_between(u32 user, - enum ip_defrag_users lower_bond, - enum ip_defrag_users upper_bond) -{ - return user >= lower_bond && user <= upper_bond; -} -#endif /* < v4.2 */ - -int rpl_ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - int (*output)(OVS_VPORT_OUTPUT_PARAMS)); -#define ip_do_fragment rpl_ip_do_fragment - -/* If backporting IP defrag, then init/exit functions need to be called from - * compat_{in,ex}it() to prepare the backported fragmentation cache. In this - * case we declare the functions which are defined in - * datapath/linux/compat/ip_fragment.c. */ -int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user); -#define ip_defrag rpl_ip_defrag -int __init rpl_ipfrag_init(void); -void rpl_ipfrag_fini(void); -void ovs_netns_frags_init(struct net *net); -void ovs_netns_frags_exit(struct net *net); - -#else /* HAVE_CORRECT_MRU_HANDLING */ - -#ifndef HAVE_IP_DO_FRAGMENT_TAKES_NET -static inline int rpl_ip_do_fragment(struct net *net, struct sock *sk, - struct sk_buff *skb, - int (*output)(OVS_VPORT_OUTPUT_PARAMS)) -{ - return ip_do_fragment(sk, skb, output); -} -#define ip_do_fragment rpl_ip_do_fragment -#endif /* IP_DO_FRAGMENT_TAKES_NET */ - -/* We have no good way to detect the presence of upstream commit 8282f27449bf - * ("inet: frag: Always orphan skbs inside ip_defrag()"), but it should be - * always included in kernels 4.5+. */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,5,0) -static inline int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user) -{ - skb_orphan(skb); -#ifndef HAVE_IP_DEFRAG_TAKES_NET - return ip_defrag(skb, user); -#else - return ip_defrag(net, skb, user); -#endif -} -#define ip_defrag rpl_ip_defrag -#endif - -/* If we can use upstream defrag then we can rely on the upstream - * defrag module to init/exit correctly. In this case the calls in - * compat_{in,ex}it() can be no-ops. */ -static inline int rpl_ipfrag_init(void) { return 0; } -static inline void rpl_ipfrag_fini(void) { } -static inline void ovs_netns_frags_init(struct net *net) { } -static inline void ovs_netns_frags_exit(struct net *net) { } -#endif /* HAVE_CORRECT_MRU_HANDLING */ - -#define ipfrag_init rpl_ipfrag_init -#define ipfrag_fini rpl_ipfrag_fini - -#endif diff --git a/datapath/linux/compat/include/net/ip6_fib.h b/datapath/linux/compat/include/net/ip6_fib.h deleted file mode 100644 index 0cc435813..000000000 --- a/datapath/linux/compat/include/net/ip6_fib.h +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Linux INET6 implementation - * - * Authors: - * Pedro Roque <roque@di.fc.ul.pt> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _IP6_FIB_WRAPPER_H -#define _IP6_FIB_WRAPPER_H - -#include_next <net/ip6_fib.h> - -#ifndef HAVE_RT6_GET_COOKIE - -#ifndef RTF_PCPU -#define RTF_PCPU 0x40000000 -#endif - -#ifndef RTF_LOCAL -#define RTF_LOCAL 0x80000000 -#endif - -#define rt6_get_cookie rpl_rt6_get_cookie -static inline u32 rt6_get_cookie(const struct rt6_info *rt) -{ - if (rt->rt6i_flags & RTF_PCPU || -#ifdef HAVE_DST_NOCACHE - (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) -#else - (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from)) -#endif - rt = (struct rt6_info *)(rt->dst.from); - - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; -} -#endif - -#endif diff --git a/datapath/linux/compat/include/net/ip6_route.h b/datapath/linux/compat/include/net/ip6_route.h deleted file mode 100644 index 7c78fd5c6..000000000 --- a/datapath/linux/compat/include/net/ip6_route.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef __NET_IP6_ROUTE_WRAPPER -#define __NET_IP6_ROUTE_WRAPPER - -#include <net/route.h> -#include <net/ip.h> /* For OVS_VPORT_OUTPUT_PARAMS */ -#include <net/ipv6.h> - -#include_next<net/ip6_route.h> - -#ifndef HAVE_NF_IPV6_OPS_FRAGMENT -int rpl_ip6_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(OVS_VPORT_OUTPUT_PARAMS)); -#define ip6_fragment rpl_ip6_fragment -#endif /* HAVE_NF_IPV6_OPS_FRAGMENT */ - -#endif /* _NET_IP6_ROUTE_WRAPPER */ diff --git a/datapath/linux/compat/include/net/ip6_tunnel.h b/datapath/linux/compat/include/net/ip6_tunnel.h deleted file mode 100644 index e0a33a646..000000000 --- a/datapath/linux/compat/include/net/ip6_tunnel.h +++ /dev/null @@ -1,208 +0,0 @@ -#ifndef NET_IP6_TUNNEL_WRAPPER_H -#define NET_IP6_TUNNEL_WRAPPER_H 1 - -#ifdef HAVE_IP6_TNL_PARM_ERSPAN_VER -#include_next <net/ip6_tunnel.h> -#else - -#include <linux/ipv6.h> -#include <linux/netdevice.h> -#include <linux/if_tunnel.h> -#include <linux/ip6_tunnel.h> -#include <net/ip_tunnels.h> -#include <net/dst_cache.h> -#include <net/dst_metadata.h> -#include "gso.h" - -#define IP6TUNNEL_ERR_TIMEO (30*HZ) - -/* capable of sending packets */ -#define IP6_TNL_F_CAP_XMIT 0x10000 -/* capable of receiving packets */ -#define IP6_TNL_F_CAP_RCV 0x20000 -/* determine capability on a per-packet basis */ -#define IP6_TNL_F_CAP_PER_PACKET 0x40000 - -#ifndef IP6_TNL_F_ALLOW_LOCAL_REMOTE -#define IP6_TNL_F_ALLOW_LOCAL_REMOTE 0 -#endif - -struct rpl__ip6_tnl_parm { - char name[IFNAMSIZ]; /* name of tunnel device */ - int link; /* ifindex of underlying L2 interface */ - __u8 proto; /* tunnel protocol */ - __u8 encap_limit; /* encapsulation limit for tunnel */ - __u8 hop_limit; /* hop limit for tunnel */ - bool collect_md; - __be32 flowinfo; /* traffic class and flowlabel for tunnel */ - __u32 flags; /* tunnel flags */ - struct in6_addr laddr; /* local tunnel end-point address */ - struct in6_addr raddr; /* remote tunnel end-point address */ - - __be16 i_flags; - __be16 o_flags; - __be32 i_key; - __be32 o_key; - - __u32 fwmark; - __u32 index; /* ERSPAN type II index */ - __u8 erspan_ver; /* ERSPAN version */ - __u8 dir; /* direction */ - __u16 hwid; /* hwid */ -}; - -#define __ip6_tnl_parm rpl__ip6_tnl_parm - -/* IPv6 tunnel */ -struct rpl_ip6_tnl { - struct rpl_ip6_tnl __rcu *next; /* next tunnel in list */ - struct net_device *dev; /* virtual device associated with tunnel */ - struct net *net; /* netns for packet i/o */ - struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ - struct flowi fl; /* flowi template for xmit */ - struct dst_cache dst_cache; /* cached dst */ - struct gro_cells gro_cells; - - int err_count; - unsigned long err_time; - - /* These fields used only by GRE */ - __u32 i_seqno; /* The last seen seqno */ - __u32 o_seqno; /* The last output seqno */ - int hlen; /* tun_hlen + encap_hlen */ - int tun_hlen; /* Precalculated header length */ - int encap_hlen; /* Encap header length (FOU,GUE) */ - struct ip_tunnel_encap encap; - int mlink; -}; - -#define ip6_tnl rpl_ip6_tnl - -struct rpl_ip6_tnl_encap_ops { - size_t (*encap_hlen)(struct ip_tunnel_encap *e); - int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, - u8 *protocol, struct flowi6 *fl6); -}; - -#define ip6_tnl_encap_ops rpl_ip6_tnl_encap_ops - -#ifdef CONFIG_INET - -#ifndef MAX_IPTUN_ENCAP_OPS -#define MAX_IPTUN_ENCAP_OPS 8 -#endif - -extern const struct ip6_tnl_encap_ops __rcu * - rpl_ip6tun_encaps[MAX_IPTUN_ENCAP_OPS]; - -int rpl_ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, - unsigned int num); -#define ip6_tnl_encap_add_ops rpl_ip6_tnl_encap_add_ops -int rpl_ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, - unsigned int num); -#define ip6_tnl_encap_del_ops rpl_ip6_tnl_encap_del_ops -int rpl_ip6_tnl_encap_setup(struct ip6_tnl *t, - struct ip_tunnel_encap *ipencap); -#define ip6_tnl_encap_setup rpl_ip6_tnl_encap_setup - -#ifndef HAVE_TUNNEL_ENCAP_TYPES -enum tunnel_encap_types { - TUNNEL_ENCAP_NONE, - TUNNEL_ENCAP_FOU, - TUNNEL_ENCAP_GUE, -}; - -#endif -static inline int ip6_encap_hlen(struct ip_tunnel_encap *e) -{ - const struct ip6_tnl_encap_ops *ops; - int hlen = -EINVAL; - - if (e->type == TUNNEL_ENCAP_NONE) - return 0; - - if (e->type >= MAX_IPTUN_ENCAP_OPS) - return -EINVAL; - - rcu_read_lock(); - ops = rcu_dereference(rpl_ip6tun_encaps[e->type]); - if (likely(ops && ops->encap_hlen)) - hlen = ops->encap_hlen(e); - rcu_read_unlock(); - - return hlen; -} - -static inline int ip6_tnl_encap(struct sk_buff *skb, struct ip6_tnl *t, - u8 *protocol, struct flowi6 *fl6) -{ - const struct ip6_tnl_encap_ops *ops; - int ret = -EINVAL; - - if (t->encap.type == TUNNEL_ENCAP_NONE) - return 0; - - if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) - return -EINVAL; - - rcu_read_lock(); - ops = rcu_dereference(rpl_ip6tun_encaps[t->encap.type]); - if (likely(ops && ops->build_header)) - ret = ops->build_header(skb, &t->encap, protocol, fl6); - rcu_read_unlock(); - - return ret; -} - -/* Tunnel encapsulation limit destination sub-option */ - -struct ipv6_tlv_tnl_enc_lim { - __u8 type; /* type-code for option */ - __u8 length; /* option length */ - __u8 encap_limit; /* tunnel encapsulation limit */ -} __packed; - -int rpl_ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, - const struct in6_addr *raddr); -#define ip6_tnl_rcv_ctl rpl_ip6_tnl_rcv_ctl -int rpl_ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, - struct metadata_dst *tun_dst, - bool log_ecn_error); -#define ip6_tnl_rcv rpl_ip6_tnl_rcv -int rpl_ip6_tnl_xmit_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, - const struct in6_addr *raddr); -#define ip6_tnl_xmit_ctl rpl_ip6_tnl_xmit_ctl -int rpl_ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, - struct flowi6 *fl6, int encap_limit, __u32 *pmtu, - __u8 proto); -#define ip6_tnl_xmit rpl_ip6_tnl_xmit -__u16 rpl_ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw); -#define ip6_tnl_parse_tlv_enc_lim rpl_ip6_tnl_parse_tlv_enc_lim -__u32 rpl_ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, - const struct in6_addr *raddr); -#define ip6_tnl_get_cap rpl_ip6_tnl_get_cap -struct net *rpl_ip6_tnl_get_link_net(const struct net_device *dev); -#define ip6_tnl_get_link_net rpl_ip6_tnl_get_link_net -int rpl_ip6_tnl_get_iflink(const struct net_device *dev); -#define ip6_tnl_get_iflink rpl_ip6_tnl_get_iflink -int rpl_ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); -#define ip6_tnl_change_mtu rpl_ip6_tnl_change_mtu - -static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) -{ - int pkt_len, err; - - memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - pkt_len = skb->len - skb_inner_network_offset(skb); - err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); - if (unlikely(net_xmit_eval(err))) - pkt_len = -1; - iptunnel_xmit_stats(dev, pkt_len); -} -#endif - -#endif /* HAVE_IP6_TNL_PARM_ERSPAN_VER */ - -#endif diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h deleted file mode 100644 index 617a753c7..000000000 --- a/datapath/linux/compat/include/net/ip_tunnels.h +++ /dev/null @@ -1,513 +0,0 @@ -#ifndef __NET_IP_TUNNELS_WRAPPER_H -#define __NET_IP_TUNNELS_WRAPPER_H 1 - -#include <linux/version.h> - -#ifdef USE_UPSTREAM_TUNNEL -/* Block all ip_tunnel functions. - * Only function that do not depend on ip_tunnel structure can - * be used. Those needs to be explicitly defined in this header file. */ -#include_next <net/ip_tunnels.h> - -#ifndef TUNNEL_ERSPAN_OPT -#define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) -#endif -#define ovs_ip_tunnel_encap ip_tunnel_encap - -#ifndef HAVE_IP_TUNNEL_INFO_OPTS_SET_FLAGS -static inline void rpl_ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len, - __be16 flags) -{ - memcpy(ip_tunnel_info_opts(info), from, len); - info->options_len = len; - info->key.tun_flags |= flags; -} - -#define ip_tunnel_info_opts_set rpl_ip_tunnel_info_opts_set -#endif - -#else /* USE_UPSTREAM_TUNNEL */ - -#include <linux/if_tunnel.h> -#include <linux/types.h> -#include <net/dsfield.h> -#include <net/dst_cache.h> -#include <net/flow.h> -#include <net/inet_ecn.h> -#include <net/ip.h> -#include <net/rtnetlink.h> -#include <net/gro_cells.h> - -#ifndef MAX_IPTUN_ENCAP_OPS -#define MAX_IPTUN_ENCAP_OPS 8 -#endif - -#ifndef HAVE_TUNNEL_ENCAP_TYPES -enum tunnel_encap_types { - TUNNEL_ENCAP_NONE, - TUNNEL_ENCAP_FOU, - TUNNEL_ENCAP_GUE, -}; - -#define HAVE_TUNNEL_ENCAP_TYPES 1 -#endif - -#define __iptunnel_pull_header rpl___iptunnel_pull_header -int rpl___iptunnel_pull_header(struct sk_buff *skb, int hdr_len, - __be16 inner_proto, bool raw_proto, bool xnet); - -#define iptunnel_pull_header rpl_iptunnel_pull_header -static inline int rpl_iptunnel_pull_header(struct sk_buff *skb, int hdr_len, - __be16 inner_proto, bool xnet) -{ - return rpl___iptunnel_pull_header(skb, hdr_len, inner_proto, false, xnet); -} - -int ovs_iptunnel_handle_offloads(struct sk_buff *skb, - int gso_type_mask, - void (*fix_segment)(struct sk_buff *)); - -/* This is required to compile upstream gre.h. gre_handle_offloads() - * is defined in gre.h and needs iptunnel_handle_offloads(). This provides - * default signature for this function. - * rpl prefix is to make OVS build happy. - */ -#define iptunnel_handle_offloads rpl_iptunnel_handle_offloads -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -struct sk_buff *rpl_iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, - int gso_type_mask); -#else -int rpl_iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, - int gso_type_mask); -#endif - -#define iptunnel_xmit rpl_iptunnel_xmit -void rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, - __be16 df, bool xnet); -#define ip_tunnel_xmit rpl_ip_tunnel_xmit -void rpl_ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, const u8 protocol); - - -#ifndef TUNNEL_CSUM -#define TUNNEL_CSUM __cpu_to_be16(0x01) -#define TUNNEL_ROUTING __cpu_to_be16(0x02) -#define TUNNEL_KEY __cpu_to_be16(0x04) -#define TUNNEL_SEQ __cpu_to_be16(0x08) -#define TUNNEL_STRICT __cpu_to_be16(0x10) -#define TUNNEL_REC __cpu_to_be16(0x20) -#define TUNNEL_VERSION __cpu_to_be16(0x40) -#define TUNNEL_NO_KEY __cpu_to_be16(0x80) -#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100) -#define TUNNEL_OAM __cpu_to_be16(0x0200) -#define TUNNEL_CRIT_OPT __cpu_to_be16(0x0400) -#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800) -#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000) -#define TUNNEL_NOCACHE __cpu_to_be16(0x2000) -#define TUNNEL_ERSPAN_OPT __cpu_to_be16(0x4000) - -#undef TUNNEL_OPTIONS_PRESENT -#define TUNNEL_OPTIONS_PRESENT \ - (TUNNEL_GENEVE_OPT | TUNNEL_VXLAN_OPT | TUNNEL_ERSPAN_OPT) - -struct tnl_ptk_info { - __be16 flags; - __be16 proto; - __be32 key; - __be32 seq; - int hdr_len; -}; - -#define PACKET_RCVD 0 -#define PACKET_REJECT 1 -#define PACKET_NEXT 2 -#endif - -#define IP_TNL_HASH_BITS 7 -#define IP_TNL_HASH_SIZE (1 << IP_TNL_HASH_BITS) - -/* Keep error state on tunnel for 30 sec */ -#define IPTUNNEL_ERR_TIMEO (30*HZ) - -/* Used to memset ip_tunnel padding. */ -#define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) - -/* Used to memset ipv4 address padding. */ -#define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) -#define IP_TUNNEL_KEY_IPV4_PAD_LEN \ - (sizeof_field(struct ip_tunnel_key, u) - \ - sizeof_field(struct ip_tunnel_key, u.ipv4)) - -struct ip_tunnel_key { - __be64 tun_id; - union { - struct { - __be32 src; - __be32 dst; - } ipv4; - struct { - struct in6_addr src; - struct in6_addr dst; - } ipv6; - } u; - __be16 tun_flags; - u8 tos; /* TOS for IPv4, TC for IPv6 */ - u8 ttl; /* TTL for IPv4, HL for IPv6 */ - __be32 label; /* Flow Label for IPv6 */ - __be16 tp_src; - __be16 tp_dst; -}; - -/* Flags for ip_tunnel_info mode. */ -#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ -#define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ - -struct ip_tunnel_info { - struct ip_tunnel_key key; - struct dst_cache dst_cache; - u8 options_len; - u8 mode; -}; - -/* 6rd prefix/relay information */ -#ifdef CONFIG_IPV6_SIT_6RD -struct ip_tunnel_6rd_parm { - struct in6_addr prefix; - __be32 relay_prefix; - u16 prefixlen; - u16 relay_prefixlen; -}; -#endif - -struct ip_tunnel_encap { - u16 type; - u16 flags; - __be16 sport; - __be16 dport; -}; - -struct ip_tunnel_prl_entry { - struct ip_tunnel_prl_entry __rcu *next; - __be32 addr; - u16 flags; - struct rcu_head rcu_head; -}; - -static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info) -{ - return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; -} - -static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info) -{ - return info + 1; -} - -static inline void ip_tunnel_info_opts_get(void *to, - const struct ip_tunnel_info *info) -{ - memcpy(to, info + 1, info->options_len); -} - -static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, - const void *from, int len, - __be16 flags) -{ - memcpy(ip_tunnel_info_opts(info), from, len); - info->options_len = len; - info->key.tun_flags |= flags; -} - -static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, __be32 label, - __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags) -{ - key->tun_id = tun_id; - key->u.ipv4.src = saddr; - key->u.ipv4.dst = daddr; - memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, - 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); - key->tos = tos; - key->ttl = ttl; - key->label = label; - key->tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - key->tp_src = tp_src; - key->tp_dst = tp_dst; - - /* Clear struct padding. */ - if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) - memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, - 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); -} - -#define ip_tunnel_collect_metadata() true - -#undef TUNNEL_NOCACHE -#define TUNNEL_NOCACHE 0 - -static inline bool -ip_tunnel_dst_cache_usable(const struct sk_buff *skb, - const struct ip_tunnel_info *info) -{ - if (skb->mark) - return false; - if (!info) - return true; - if (info->key.tun_flags & TUNNEL_NOCACHE) - return false; - - return true; -} - -#define ip_tunnel_dst rpl_ip_tunnel_dst -struct rpl_ip_tunnel_dst { - struct dst_entry __rcu *dst; - __be32 saddr; -}; - -#define ip_tunnel rpl_ip_tunnel -struct rpl_ip_tunnel { - struct ip_tunnel __rcu *next; - struct hlist_node hash_node; - struct net_device *dev; - struct net *net; /* netns for packet i/o */ - - unsigned long err_time; /* Time when the last ICMP error - * arrived */ - int err_count; /* Number of arrived ICMP errors */ - - /* These four fields used only by GRE */ - u32 i_seqno; /* The last seen seqno */ - u32 o_seqno; /* The last output seqno */ - int tun_hlen; /* Precalculated header length */ - - /* These four fields used only by ERSPAN */ - u32 index; /* ERSPAN type II index */ - u8 erspan_ver; /* ERSPAN version */ - u8 dir; /* ERSPAN direction */ - u16 hwid; /* ERSPAN hardware ID */ - - struct dst_cache dst_cache; - - struct ip_tunnel_parm parms; - - int mlink; - int encap_hlen; /* Encap header length (FOU,GUE) */ - int hlen; /* tun_hlen + encap_hlen */ - struct ip_tunnel_encap encap; - - /* for SIT */ -#ifdef CONFIG_IPV6_SIT_6RD - struct ip_tunnel_6rd_parm ip6rd; -#endif - struct ip_tunnel_prl_entry __rcu *prl; /* potential router list */ - unsigned int prl_count; /* # of entries in PRL */ - unsigned int ip_tnl_net_id; - struct gro_cells gro_cells; - __u32 fwmark; - bool collect_md; - bool ignore_df; -}; - -#define ip_tunnel_net rpl_ip_tunnel_net -struct rpl_ip_tunnel_net { - struct net_device *fb_tunnel_dev; - struct hlist_head tunnels[IP_TNL_HASH_SIZE]; - struct ip_tunnel __rcu *collect_md_tun; -}; - - -struct ip_tunnel_encap_ops { - size_t (*encap_hlen)(struct ip_tunnel_encap *e); - int (*build_header)(struct sk_buff *skb, struct ip_tunnel_encap *e, - const u8 *protocol, struct flowi4 *fl4); -}; - -extern const struct ip_tunnel_encap_ops __rcu * - rpl_iptun_encaps[MAX_IPTUN_ENCAP_OPS]; - -#define ip_encap_hlen rpl_ip_encap_hlen -static inline int rpl_ip_encap_hlen(struct ip_tunnel_encap *e) -{ - const struct ip_tunnel_encap_ops *ops; - int hlen = -EINVAL; - - if (e->type == TUNNEL_ENCAP_NONE) - return 0; - - if (e->type >= MAX_IPTUN_ENCAP_OPS) - return -EINVAL; - - rcu_read_lock(); - ops = rcu_dereference(rpl_iptun_encaps[e->type]); - if (likely(ops && ops->encap_hlen)) - hlen = ops->encap_hlen(e); - rcu_read_unlock(); - - return hlen; -} - -static inline int ovs_ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t, - const u8 *protocol, struct flowi4 *fl4) -{ - const struct ip_tunnel_encap_ops *ops; - int ret = -EINVAL; - - if (t->encap.type == TUNNEL_ENCAP_NONE) - return 0; - - if (t->encap.type >= MAX_IPTUN_ENCAP_OPS) - return -EINVAL; - - rcu_read_lock(); - ops = rcu_dereference(rpl_iptun_encaps[t->encap.type]); - if (likely(ops && ops->build_header)) - ret = ops->build_header(skb, &t->encap, protocol, fl4); - rcu_read_unlock(); - - return ret; -} - -#define ip_tunnel_get_stats64 rpl_ip_tunnel_get_stats64 -#if !defined(HAVE_VOID_NDO_GET_STATS64) && !defined(HAVE_RHEL7_MAX_MTU) -struct rtnl_link_stats64 *rpl_ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot); -#else -void rpl_ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot); -#endif -#define ip_tunnel_get_dsfield rpl_ip_tunnel_get_dsfield -static inline u8 rpl_ip_tunnel_get_dsfield(const struct iphdr *iph, - const struct sk_buff *skb) -{ - if (skb->protocol == htons(ETH_P_IP)) - return iph->tos; - else if (skb->protocol == htons(ETH_P_IPV6)) - return ipv6_get_dsfield((const struct ipv6hdr *)iph); - else - return 0; -} - -#define ip_tunnel_ecn_encap rpl_ip_tunnel_ecn_encap -static inline u8 rpl_ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph, - const struct sk_buff *skb) -{ - u8 inner = ip_tunnel_get_dsfield(iph, skb); - - return INET_ECN_encapsulate(tos, inner); -} - -static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len) -{ - if (pkt_len > 0) { - struct pcpu_sw_netstats *tstats = get_cpu_ptr(dev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += pkt_len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - put_cpu_ptr(tstats); - } else { - struct net_device_stats *err_stats = &dev->stats; - - if (pkt_len < 0) { - err_stats->tx_errors++; - err_stats->tx_aborted_errors++; - } else { - err_stats->tx_dropped++; - } - } -} - -static inline __be64 key32_to_tunnel_id(__be32 key) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)key; -#else - return (__force __be64)((__force u64)key << 32); -#endif -} - -/* Returns the least-significant 32 bits of a __be64. */ -static inline __be32 tunnel_id_to_key32(__be64 tun_id) -{ -#ifdef __BIG_ENDIAN - return (__force __be32)tun_id; -#else - return (__force __be32)((__force u64)tun_id >> 32); -#endif -} - -#define ip_tunnel_init rpl_ip_tunnel_init -int rpl_ip_tunnel_init(struct net_device *dev); - -#define ip_tunnel_uninit rpl_ip_tunnel_uninit -void rpl_ip_tunnel_uninit(struct net_device *dev); - -#define ip_tunnel_change_mtu rpl_ip_tunnel_change_mtu -int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu); - -#define ip_tunnel_newlink rpl_ip_tunnel_newlink -int rpl_ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p); - -#define ip_tunnel_dellink rpl_ip_tunnel_dellink -void rpl_ip_tunnel_dellink(struct net_device *dev, struct list_head *head); - -#define ip_tunnel_init_net rpl_ip_tunnel_init_net -int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, - struct rtnl_link_ops *ops, char *devname); - -#define ip_tunnel_delete_net rpl_ip_tunnel_delete_net -void rpl_ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops); - -#define ip_tunnel_setup rpl_ip_tunnel_setup -void rpl_ip_tunnel_setup(struct net_device *dev, int net_id); - -#define ip_tunnel_get_iflink rpl_ip_tunnel_get_iflink -int rpl_ip_tunnel_get_iflink(const struct net_device *dev); - -#define ip_tunnel_get_link_net rpl_ip_tunnel_get_link_net -struct net *rpl_ip_tunnel_get_link_net(const struct net_device *dev); - -#define __ip_tunnel_change_mtu rpl___ip_tunnel_change_mtu -int rpl___ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict); - -#define ip_tunnel_lookup rpl_ip_tunnel_lookup -struct ip_tunnel *rpl_ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, - __be32 remote, __be32 local, - __be32 key); - -static inline int iptunnel_pull_offloads(struct sk_buff *skb) -{ - if (skb_is_gso(skb)) { - int err; - - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - return err; - skb_shinfo(skb)->gso_type &= ~(NETIF_F_GSO_ENCAP_ALL >> - NETIF_F_GSO_SHIFT); - } - - skb->encapsulation = 0; - return 0; -} -#endif /* USE_UPSTREAM_TUNNEL */ - -#define skb_is_encapsulated ovs_skb_is_encapsulated -bool ovs_skb_is_encapsulated(struct sk_buff *skb); - -#endif /* __NET_IP_TUNNELS_H */ diff --git a/datapath/linux/compat/include/net/ipv6.h b/datapath/linux/compat/include/net/ipv6.h deleted file mode 100644 index 6379457e8..000000000 --- a/datapath/linux/compat/include/net/ipv6.h +++ /dev/null @@ -1,88 +0,0 @@ -#ifndef __NET_IPV6_WRAPPER_H -#define __NET_IPV6_WRAPPER_H 1 - -#include <linux/version.h> - -#include_next <net/ipv6.h> - -#ifndef NEXTHDR_SCTP -#define NEXTHDR_SCTP 132 /* Stream Control Transport Protocol */ -#endif - -#ifndef HAVE_IP6_FH_F_SKIP_RH - -enum { - IP6_FH_F_FRAG = (1 << 0), - IP6_FH_F_AUTH = (1 << 1), - IP6_FH_F_SKIP_RH = (1 << 2), -}; - -/* This function is upstream, but not the version which skips routing - * headers with 0 segments_left. We fixed it when we introduced - * IP6_FH_F_SKIP_RH. - */ -#define ipv6_find_hdr rpl_ipv6_find_hdr -extern int rpl_ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, - int target, unsigned short *fragoff, int *fragflg); -#endif - -#ifndef HAVE___IPV6_ADDR_JHASH -static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 unused) -{ - return ipv6_addr_jhash(a); -} -#endif - -#define ip6_flowlabel rpl_ip6_flowlabel -static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) -{ - return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; -} - -#ifndef HAVE_IP6_MAKE_FLOWLABEL_FL6 -#define ip6_make_flowlabel rpl_ip6_make_flowlabel -static inline __be32 rpl_ip6_make_flowlabel(struct net *net, - struct sk_buff *skb, - __be32 flowlabel, bool autolabel, - struct flowi6 *fl6) -{ -#ifndef HAVE_NETNS_SYSCTL_IPV6_AUTO_FLOWLABELS - if (!flowlabel && autolabel) { -#else - if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { -#endif - u32 hash; - - hash = skb_get_hash(skb); - - /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an - * attacker is leaked. Only lower 20 bits are relevant. - */ - hash ^= hash >> 12; - - flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; - } - - return flowlabel; -} -#endif - -#ifndef IPV6_TCLASS_SHIFT -#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) -#define IPV6_TCLASS_SHIFT 20 -#endif - -#define ip6_tclass rpl_ip6_tclass -static inline u8 ip6_tclass(__be32 flowinfo) -{ - return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; -} - -#define ip6_make_flowinfo rpl_ip6_make_flowinfo -static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) -{ - return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; -} - -#endif diff --git a/datapath/linux/compat/include/net/ipv6_frag.h b/datapath/linux/compat/include/net/ipv6_frag.h deleted file mode 100644 index 5d1cc901b..000000000 --- a/datapath/linux/compat/include/net/ipv6_frag.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef __NET_IPV6_FRAG_WRAPPER_H -#define __NET_IPV6_FRAG_WRAPPER_H - -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) && defined(HAVE_IPV6_FRAG_H) -#include_next <net/ipv6_frag.h> -#endif - -#endif /* __NET_IPV6_FRAG_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/lisp.h b/datapath/linux/compat/include/net/lisp.h deleted file mode 100644 index 6b43c77e2..000000000 --- a/datapath/linux/compat/include/net/lisp.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef __NET_LISP_WRAPPER_H -#define __NET_LISP_WRAPPER_H 1 - -#ifdef CONFIG_INET -#include <net/udp_tunnel.h> -#endif - - -#ifdef CONFIG_INET -#define lisp_dev_create_fb rpl_lisp_dev_create_fb -struct net_device *rpl_lisp_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port); -#endif /*ifdef CONFIG_INET */ - -#define lisp_init_module rpl_lisp_init_module -int rpl_lisp_init_module(void); - -#define lisp_cleanup_module rpl_lisp_cleanup_module -void rpl_lisp_cleanup_module(void); - -#define lisp_xmit rpl_lisp_xmit -netdev_tx_t rpl_lisp_xmit(struct sk_buff *skb); - -#define lisp_fill_metadata_dst ovs_lisp_fill_metadata_dst -int ovs_lisp_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); - -#endif /*ifdef__NET_LISP_H */ diff --git a/datapath/linux/compat/include/net/mpls.h b/datapath/linux/compat/include/net/mpls.h deleted file mode 100644 index 9359a2369..000000000 --- a/datapath/linux/compat/include/net/mpls.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#ifndef _NET_MPLS_WRAPPER_H -#define _NET_MPLS_WRAPPER_H 1 - -#include <linux/if_ether.h> -#include <linux/netdevice.h> - -#define MPLS_HLEN 4 - -struct mpls_shim_hdr { - __be32 label_stack_entry; -}; - -static inline bool eth_p_mpls(__be16 eth_type) -{ - return eth_type == htons(ETH_P_MPLS_UC) || - eth_type == htons(ETH_P_MPLS_MC); -} - -/* Starting from kernel 4.9, commit 48d2ab609b6b ("net: mpls: Fixups for GSO") - * and commit 85de4a2101ac ("openvswitch: use mpls_hdr") introduced - * behavioural changes to mpls_gso kernel module. It now assumes that - * skb_network_header() points to the mpls header and - * skb_inner_network_header() points to the L3 header. However, the old - * mpls_gso kernel module assumes that the skb_network_header() points - * to the L3 header. We shall backport the following function to ensure - * MPLS GSO works properly for kernels older than the one which contains - * these commits. - */ -#ifdef MPLS_HEADER_IS_L3 -static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb) -{ - return (struct mpls_shim_hdr *)skb_network_header(skb); -} -#else -#define mpls_hdr rpl_mpls_hdr -/* - * For non-MPLS skbs this will correspond to the network header. - * For MPLS skbs it will be before the network_header as the MPLS - * label stack lies between the end of the mac header and the network - * header. That is, for MPLS skbs the end of the mac header - * is the top of the MPLS label stack. - */ -static inline struct mpls_shim_hdr *rpl_mpls_hdr(const struct sk_buff *skb) -{ - return (struct mpls_shim_hdr *) (skb_mac_header(skb) + skb->mac_len); -} -#endif - -#endif /* _NET_MPLS_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/net_namespace.h b/datapath/linux/compat/include/net/net_namespace.h deleted file mode 100644 index 427072249..000000000 --- a/datapath/linux/compat/include/net/net_namespace.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __NET_NET_NAMESPACE_WRAPPER_H -#define __NET_NET_NAMESPACE_WRAPPER_H 1 - -#include_next <net/net_namespace.h> - -#ifndef HAVE_POSSIBLE_NET_T -typedef struct { -#ifdef CONFIG_NET_NS - struct net *net; -#endif -} possible_net_t; - -static inline void rpl_write_pnet(possible_net_t *pnet, struct net *net) -{ -#ifdef CONFIG_NET_NS - pnet->net = net; -#endif -} - -static inline struct net *rpl_read_pnet(const possible_net_t *pnet) -{ -#ifdef CONFIG_NET_NS - return pnet->net; -#else - return &init_net; -#endif -} -#else /* Linux >= 4.1 */ -#define rpl_read_pnet read_pnet -#define rpl_write_pnet write_pnet -#endif /* Linux >= 4.1 */ - -#endif /* net/net_namespace.h wrapper */ diff --git a/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h deleted file mode 100644 index c4c0f79ab..000000000 --- a/datapath/linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _NF_DEFRAG_IPV6_WRAPPER_H -#define _NF_DEFRAG_IPV6_WRAPPER_H - -#include <linux/kconfig.h> -#include_next <net/netfilter/ipv6/nf_defrag_ipv6.h> - -/* Upstream commit 029f7f3b8701 ("netfilter: ipv6: nf_defrag: avoid/free clone - * operations") changed the semantics of nf_ct_frag6_gather(), so we need - * to backport for all prior kernels, i.e. kernel < 4.5.0. - * - * Upstream commit 48cac18ecf1d ("ipv6: orphan skbs in reassembly unit") fixes - * a bug that requires all kernels prior to this fix, i.e. kernel < 4.11.0 - * to be backported. - */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) -#define OVS_NF_DEFRAG6_BACKPORT 1 -int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user); -#define nf_ct_frag6_gather rpl_nf_ct_frag6_gather - -/* If backporting IPv6 defrag, then init/exit functions need to be called from - * compat_{in,ex}it() to prepare the backported fragmentation cache. In this - * case we declare the functions which are defined in - * datapath/linux/compat/nf_conntrack_reasm.c. - * - * Otherwise, if we can use upstream defrag then we can rely on the upstream - * nf_defrag_ipv6 module to init/exit correctly. In this case the calls in - * compat_{in,ex}it() can be no-ops. - */ -int __init rpl_nf_ct_frag6_init(void); -void rpl_nf_ct_frag6_cleanup(void); -void ovs_netns_frags6_init(struct net *net); -void ovs_netns_frags6_exit(struct net *net); -#else /* !OVS_NF_DEFRAG6_BACKPORT */ -static inline int __init rpl_nf_ct_frag6_init(void) { return 0; } -static inline void rpl_nf_ct_frag6_cleanup(void) { } -static inline void ovs_netns_frags6_init(struct net *net) { } -static inline void ovs_netns_frags6_exit(struct net *net) { } -#endif /* OVS_NF_DEFRAG6_BACKPORT */ -#define nf_ct_frag6_init rpl_nf_ct_frag6_init -#define nf_ct_frag6_cleanup rpl_nf_ct_frag6_cleanup - -#endif /* __NF_DEFRAG_IPV6_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack.h deleted file mode 100644 index 50db914a3..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _NF_CONNTRACK_WRAPPER_H -#define _NF_CONNTRACK_WRAPPER_H - -#include_next <net/netfilter/nf_conntrack.h> - -#ifndef HAVE_NF_CT_GET_TUPLEPR_TAKES_STRUCT_NET -static inline bool rpl_nf_ct_get_tuplepr(const struct sk_buff *skb, - unsigned int nhoff, - u_int16_t l3num, struct net *net, - struct nf_conntrack_tuple *tuple) -{ - return nf_ct_get_tuplepr(skb, nhoff, l3num, tuple); -} -#define nf_ct_get_tuplepr rpl_nf_ct_get_tuplepr -#endif - -#ifndef HAVE_NF_CT_SET -static inline void -nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) -{ - skb->nfct = &ct->ct_general; - skb->nfctinfo = info; -} -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) -int rpl_nf_ct_netns_get(struct net *net, u8 nfproto); -void rpl_nf_ct_netns_put(struct net *net, u8 nfproto); -#define nf_ct_netns_get rpl_nf_ct_netns_get -#define nf_ct_netns_put rpl_nf_ct_netns_put -#endif - -#endif /* _NF_CONNTRACK_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h deleted file mode 100644 index bc18c56b8..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h +++ /dev/null @@ -1,137 +0,0 @@ -#ifndef _NF_CONNTRACK_CORE_WRAPPER_H -#define _NF_CONNTRACK_CORE_WRAPPER_H - -#include_next <net/netfilter/nf_conntrack_core.h> - -#ifndef HAVE_NF_CT_TMPL_ALLOC_TAKES_STRUCT_ZONE - -#include <net/netfilter/nf_conntrack_zones.h> - -/* Released via destroy_conntrack() */ -static inline struct nf_conn * -rpl_nf_ct_tmpl_alloc(struct net *net, const struct nf_conntrack_zone *zone, - gfp_t flags) -{ - struct nf_conn *tmpl; - - tmpl = kzalloc(sizeof(*tmpl), flags); - if (tmpl == NULL) - return NULL; - - tmpl->status = IPS_TEMPLATE; - write_pnet(&tmpl->ct_net, net); - - if (nf_ct_zone_add(tmpl, flags, zone) < 0) - goto out_free; - - atomic_set(&tmpl->ct_general.use, 0); - - return tmpl; -out_free: - kfree(tmpl); - return NULL; -} -#define nf_ct_tmpl_alloc rpl_nf_ct_tmpl_alloc - -static inline void rpl_nf_ct_tmpl_free(struct nf_conn *tmpl) -{ - nf_ct_ext_destroy(tmpl); - nf_ct_ext_free(tmpl); - kfree(tmpl); -} -#define nf_ct_tmpl_free rpl_nf_ct_tmpl_free - -static inline struct nf_conntrack_tuple_hash * -rpl_nf_conntrack_find_get(struct net *net, - const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) -{ - return nf_conntrack_find_get(net, zone->id, tuple); -} -#define nf_conntrack_find_get rpl_nf_conntrack_find_get -#endif /* HAVE_NF_CT_TMPL_ALLOC_TAKES_STRUCT_ZONE */ - -#ifndef HAVE_NF_CT_GET_TUPLEPR_TAKES_STRUCT_NET -static inline bool rpl_nf_ct_get_tuple(const struct sk_buff *skb, - unsigned int nhoff, - unsigned int dataoff, u_int16_t l3num, - u_int8_t protonum, - struct net *net, - struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_l3proto *l3proto, - const struct nf_conntrack_l4proto *l4proto) -{ - return nf_ct_get_tuple(skb, nhoff, dataoff, l3num, protonum, tuple, - l3proto, l4proto); -} -#define nf_ct_get_tuple rpl_nf_ct_get_tuple -#endif /* HAVE_NF_CT_GET_TUPLEPR_TAKES_STRUCT_NET */ - -#ifdef HAVE_NF_CONN_TIMER - -#ifndef HAVE_NF_CT_DELETE -#include <net/netfilter/nf_conntrack_timestamp.h> -#endif - -static inline bool rpl_nf_ct_delete(struct nf_conn *ct, u32 portid, int report) -{ - if (del_timer(&ct->timeout)) -#ifdef HAVE_NF_CT_DELETE - return nf_ct_delete(ct, portid, report); -#else - { - struct nf_conn_tstamp *tstamp; - - tstamp = nf_conn_tstamp_find(ct); - if (tstamp && tstamp->stop == 0) - tstamp->stop = ktime_to_ns(ktime_get_real()); - - if (!test_bit(IPS_DYING_BIT, &ct->status) && - unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { - /* destroy event was not delivered */ - nf_ct_delete_from_lists(ct); - nf_ct_dying_timeout(ct); - return false; - } - set_bit(IPS_DYING_BIT, &ct->status); - nf_ct_delete_from_lists(ct); - nf_ct_put(ct); - return true; - } -#endif - return false; -} -#define nf_ct_delete rpl_nf_ct_delete -#endif /* HAVE_NF_CONN_TIMER */ - -#ifndef HAVE_NF_CONNTRACK_IN_TAKES_NF_HOOK_STATE -static inline unsigned int -rpl_nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state) -{ - int err; - - /* Repeat if requested, see nf_iterate(). */ - do { - err = nf_conntrack_in(state->net, state->pf, state->hook, skb); - } while (err == NF_REPEAT); - - return err; -} -#define nf_conntrack_in rpl_nf_conntrack_in -#endif /* HAVE_NF_CONNTRACK_IN_TAKES_NF_HOOK_STATE */ - -#ifdef HAVE_NF_CT_INVERT_TUPLEPR -static inline bool rpl_nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig) -{ - return nf_ct_invert_tuplepr(inverse, orig); -} -#else -static inline bool rpl_nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, - const struct nf_conntrack_tuple *orig) -{ - return nf_ct_invert_tuple(inverse, orig); -} -#endif /* HAVE_NF_CT_INVERT_TUPLEPR */ - -#endif /* _NF_CONNTRACK_CORE_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h deleted file mode 100644 index 2143136aa..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_count.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef _NF_CONNTRACK_COUNT_WRAPPER_H -#define _NF_CONNTRACK_COUNT_WRAPPER_H - -#include <linux/list.h> -#include <net/netfilter/nf_conntrack_tuple.h> -#include <net/netfilter/nf_conntrack_zones.h> - -#ifdef HAVE_UPSTREAM_NF_CONNCOUNT -#include_next <net/netfilter/nf_conntrack_count.h> - -static inline int rpl_nf_conncount_modinit(void) -{ - return 0; -} - -static inline void rpl_nf_conncount_modexit(void) -{ -} - -#else -#define CONFIG_NETFILTER_CONNCOUNT 1 -struct nf_conncount_data; - -struct nf_conncount_list { - spinlock_t list_lock; - struct list_head head; /* connections with the same filtering key */ - unsigned int count; /* length of list */ -}; - -struct nf_conncount_data -*rpl_nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen); - -void rpl_nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data); - -unsigned int rpl_nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone); - -#define nf_conncount_init rpl_nf_conncount_init -#define nf_conncount_destroy rpl_nf_conncount_destroy -#define nf_conncount_count rpl_nf_conncount_count - -int rpl_nf_conncount_modinit(void); -void rpl_nf_conncount_modexit(void); -#endif /* HAVE_UPSTREAM_NF_CONNCOUNT */ - -#define nf_conncount_mod_init rpl_nf_conncount_modinit -#define nf_conncount_modexit rpl_nf_conncount_modexit - -#endif /* _NF_CONNTRACK_COUNT_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_expect.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_expect.h deleted file mode 100644 index a13f0ce60..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_expect.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _NF_CONNTRACK_EXPECT_WRAPPER_H -#define _NF_CONNTRACK_EXPECT_WRAPPER_H - -#include_next <net/netfilter/nf_conntrack_expect.h> - -#ifndef HAVE_NF_CT_ZONE_INIT - -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_zones.h> - -static inline struct nf_conntrack_expect * -rpl___nf_ct_expect_find(struct net *net, - const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) -{ - return __nf_ct_expect_find(net, zone->id, tuple); -} -#define __nf_ct_expect_find rpl___nf_ct_expect_find - -#endif /* HAVE_NF_CT_ZONE_INIT */ -#endif /* _NF_CONNTRACK_EXPECT_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_helper.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_helper.h deleted file mode 100644 index 78f97375b..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_helper.h +++ /dev/null @@ -1,39 +0,0 @@ -#ifndef _NF_CONNTRACK_HELPER_WRAPPER_H -#define _NF_CONNTRACK_HELPER_WRAPPER_H - -#include_next <net/netfilter/nf_conntrack_helper.h> - -#ifndef HAVE_NF_CONNTRACK_HELPER_PUT -static inline void nf_conntrack_helper_put(struct nf_conntrack_helper *helper) { - module_put(helper->me); -} -#endif - -#ifndef HAVE_NF_CT_HELPER_EXT_ADD_TAKES_HELPER -static inline struct nf_conn_help * -rpl_nf_ct_helper_ext_add(struct nf_conn *ct, - struct nf_conntrack_helper *helper, gfp_t gfp) -{ - return nf_ct_helper_ext_add(ct, gfp); -} -#define nf_ct_helper_ext_add rpl_nf_ct_helper_ext_add -#endif /* HAVE_NF_CT_HELPER_EXT_ADD_TAKES_HELPER */ - -#ifndef HAVE_NF_NAT_HELPER_TRY_MODULE_GET -static inline int rpl_nf_nat_helper_try_module_get(const char *name, u16 l3num, - u8 protonum) -{ - request_module("ip_nat_%s", name); - return 0; -} -#define nf_nat_helper_try_module_get rpl_nf_nat_helper_try_module_get -#endif /* HAVE_NF_NAT_HELPER_TRY_MODULE_GET */ - -#ifndef HAVE_NF_NAT_HELPER_PUT -void rpl_nf_nat_helper_put(struct nf_conntrack_helper *helper) -{ -} -#define nf_nat_helper_put rpl_nf_nat_helper_put -#endif /* HAVE_NF_NAT_HELPER_PUT */ - -#endif /* _NF_CONNTRACK_HELPER_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_labels.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_labels.h deleted file mode 100644 index 14cb35716..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_labels.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _NF_CONNTRACK_LABELS_WRAPPER_H -#define _NF_CONNTRACK_LABELS_WRAPPER_H - -#include <linux/kconfig.h> -#include <linux/version.h> -#include_next <net/netfilter/nf_conntrack_labels.h> - -#ifndef NF_CT_LABELS_MAX_SIZE -#define NF_CT_LABELS_MAX_SIZE ((XT_CONNLABEL_MAXBIT + 1) / BITS_PER_BYTE) -#endif - -#ifndef HAVE_NF_CONNLABELS_GET_TAKES_BIT -#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) - -/* XXX: This doesn't lock others out from doing the same configuration - * simultaneously. */ -static inline int rpl_nf_connlabels_get(struct net *net, unsigned int bits) -{ -#ifndef HAVE_NF_CONNLABELS_GET - size_t words; - - words = BIT_WORD(bits) + 1; - if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long)) - return -ERANGE; - - net->ct.labels_used++; - if (words > net->ct.label_words) - net->ct.label_words = words; - - return 0; -#else - return nf_connlabels_get(net, bits + 1); -#endif /* HAVE_NF_CONNLABELS_GET */ -} -#define nf_connlabels_get rpl_nf_connlabels_get - -static inline void rpl_nf_connlabels_put(struct net *net) -{ -#ifndef HAVE_NF_CONNLABELS_GET - net->ct.labels_used--; - if (net->ct.labels_used == 0) - net->ct.label_words = 0; -#else - nf_connlabels_put(net); -#endif /* HAVE_NF_CONNLABELS_GET */ -} -#define nf_connlabels_put rpl_nf_connlabels_put - -#else /* CONFIG_NF_CONNTRACK_LABELS */ -#define nf_connlabels_get rpl_nf_connlabels_get -static inline int nf_connlabels_get(struct net *net, unsigned int bits) -{ - return -ERANGE; -} - -#define nf_connlabels_put rpl_nf_connlabels_put -static inline void nf_connlabels_put(struct net *net) { } -#endif /* CONFIG_NF_CONNTRACK_LABELS */ -#endif /* HAVE_NF_CONNLABELS_GET_TAKES_BIT */ - -/* Upstream commit 5a8145f7b222 ("netfilter: labels: don't emit ct event if - * labels were not changed"), released in Linux 4.7, introduced a functional - * change to trigger conntrack event for a label change only when the labels - * actually changed. There is no way we can detect this from the headers, so - * provide replacements that work the same for OVS (where labels size is 128 - * bits == 16 bytes == 4 4-byte words). */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -static int replace_u32(u32 *address, u32 mask, u32 new) -{ - u32 old, tmp; - - do { - old = *address; - tmp = (old & mask) ^ new; - if (old == tmp) - return 0; - } while (cmpxchg(address, old, tmp) != old); - - return 1; -} - -static int rpl_nf_connlabels_replace(struct nf_conn *ct, - const u32 *data, - const u32 *mask, unsigned int words32) -{ - struct nf_conn_labels *labels; - unsigned int i; - int changed = 0; - u32 *dst; - - labels = nf_ct_labels_find(ct); - if (!labels) - return -ENOSPC; - - dst = (u32 *) labels->bits; - for (i = 0; i < words32; i++) - changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); - - if (changed) - nf_conntrack_event_cache(IPCT_LABEL, ct); - - return 0; -} -#define nf_connlabels_replace rpl_nf_connlabels_replace -#endif - -#endif /* _NF_CONNTRACK_LABELS_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h deleted file mode 100644 index b11d1a578..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NF_CONNTRACK_SEQADJ_WRAPPER_H -#define _NF_CONNTRACK_SEQADJ_WRAPPER_H - -#ifdef HAVE_NF_CT_SEQ_ADJUST -#include_next <net/netfilter/nf_conntrack_seqadj.h> -#else - -#include <net/netfilter/nf_nat_helper.h> - -/* TCP sequence number adjustment. Returns 1 on success, 0 on failure */ -static inline int -nf_ct_seq_adjust(struct sk_buff *skb, - struct nf_conn *ct, enum ip_conntrack_info ctinfo, - unsigned int protoff) -{ - typeof(nf_nat_seq_adjust_hook) seq_adjust; - - seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook); - if (!seq_adjust || - !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return 0; - } - - return 1; -} - -#endif /* HAVE_NF_CT_SEQ_ADJUST */ - -#endif /* _NF_CONNTRACK_SEQADJ_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_timeout.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_timeout.h deleted file mode 100644 index 134e72b83..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_timeout.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _NF_CONNTRACK_TIMEOUT_WRAPPER_H -#define _NF_CONNTRACK_TIMEOUT_WRAPPER_H - -#include_next <net/netfilter/nf_conntrack_timeout.h> - -#ifndef HAVE_NF_CT_SET_TIMEOUT - -#ifndef HAVE_NF_CT_TIMEOUT -#define nf_ct_timeout ctnl_timeout -#endif - -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT -int rpl_nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, - const char *timeout_name); -void rpl_nf_ct_destroy_timeout(struct nf_conn *ct); -#else -static inline int rpl_nf_ct_set_timeout(struct net *net, struct nf_conn *ct, - u8 l3num, u8 l4num, - const char *timeout_name) -{ - return -EOPNOTSUPP; -} - -static inline void rpl_nf_ct_destroy_timeout(struct nf_conn *ct) -{ - return; -} -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - -#define nf_ct_set_timeout rpl_nf_ct_set_timeout -#define nf_ct_destroy_timeout rpl_nf_ct_destroy_timeout - -#endif /* HAVE_NF_CT_SET_TIMEOUT */ -#endif /* _NF_CONNTRACK_TIMEOUT_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_zones.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_zones.h deleted file mode 100644 index d46c098c7..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_zones.h +++ /dev/null @@ -1,101 +0,0 @@ -#ifndef _NF_CONNTRACK_ZONES_WRAPPER_H -#define _NF_CONNTRACK_ZONES_WRAPPER_H - -#include <linux/version.h> - -#include_next <net/netfilter/nf_conntrack_zones.h> - -#ifndef HAVE_NF_CT_ZONE_INIT - -#include <linux/kconfig.h> -#include <linux/types.h> -#include <linux/netfilter/nf_conntrack_tuple_common.h> - -#define NF_CT_DEFAULT_ZONE_ID 0 - -#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) -#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) - -#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) - -#define NF_CT_FLAG_MARK 1 - -struct rpl_nf_conntrack_zone { - u16 id; - u8 flags; - u8 dir; -}; -#define nf_conntrack_zone rpl_nf_conntrack_zone - -extern const struct nf_conntrack_zone nf_ct_zone_dflt; - -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -#include <net/netfilter/nf_conntrack_extend.h> - -static inline const struct nf_conntrack_zone * -rpl_nf_ct_zone(const struct nf_conn *ct) -{ - const struct nf_conntrack_zone *nf_ct_zone = NULL; - -#ifdef CONFIG_NF_CONNTRACK_ZONES - nf_ct_zone = nf_ct_ext_find(ct, NF_CT_EXT_ZONE); -#endif - return nf_ct_zone ? nf_ct_zone : &nf_ct_zone_dflt; -} -#define nf_ct_zone rpl_nf_ct_zone - -static inline const struct nf_conntrack_zone * -nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) -{ - zone->id = id; - zone->flags = flags; - zone->dir = dir; - - return zone; -} - -static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags, - const struct nf_conntrack_zone *info) -{ -#ifdef CONFIG_NF_CONNTRACK_ZONES - struct nf_conntrack_zone *nf_ct_zone; - - nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags); - if (!nf_ct_zone) - return -ENOMEM; - - nf_ct_zone_init(nf_ct_zone, info->id, info->dir, - info->flags); -#endif - return 0; -} - -static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, - enum ip_conntrack_dir dir) -{ - return zone->dir & (1 << dir); -} - -static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, - enum ip_conntrack_dir dir) -{ - return nf_ct_zone_matches_dir(zone, dir) ? - zone->id : NF_CT_DEFAULT_ZONE_ID; -} - -static inline bool nf_ct_zone_equal(const struct nf_conn *a, - const struct nf_conntrack_zone *b, - enum ip_conntrack_dir dir) -{ - return nf_ct_zone_id(nf_ct_zone(a), dir) == - nf_ct_zone_id(b, dir); -} - -static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, - const struct nf_conntrack_zone *b) -{ - return nf_ct_zone(a)->id == b->id; -} -#endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */ -#endif /* HAVE_NF_CT_ZONE_INIT */ -#endif /* _NF_CONNTRACK_ZONES_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netfilter/nf_nat.h b/datapath/linux/compat/include/net/netfilter/nf_nat.h deleted file mode 100644 index 773e569cb..000000000 --- a/datapath/linux/compat/include/net/netfilter/nf_nat.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _NF_NAT_WRAPPER_H -#define _NF_NAT_WRAPPER_H - -#include_next <net/netfilter/nf_nat.h> - -#ifndef HAVE_NF_CT_NAT_EXT_ADD - -static inline struct nf_conn_nat * -nf_ct_nat_ext_add(struct nf_conn *ct) -{ - struct nf_conn_nat *nat = nfct_nat(ct); - if (nat) - return nat; - - if (!nf_ct_is_confirmed(ct)) - nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); - - return nat; -} -#endif /* HAVE_NF_CT_NAT_EXT_ADD */ - -#ifndef HAVE_NF_NAT_ALLOC_NULL_BINDING -static inline unsigned int -nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) -{ - /* Force range to this IP; let proto decide mapping for - * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - * Use reply in case it's already been mangled (eg local packet). - */ - union nf_inet_addr ip = - (HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ? - ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : - ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); - struct nf_nat_range range = { - .flags = NF_NAT_RANGE_MAP_IPS, - .min_addr = ip, - .max_addr = ip, - }; - return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); -} - -#endif /* HAVE_NF_NAT_ALLOC_NULL_BINDING */ - -#endif /* _NF_NAT_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/netlink.h b/datapath/linux/compat/include/net/netlink.h deleted file mode 100644 index 84e073974..000000000 --- a/datapath/linux/compat/include/net/netlink.h +++ /dev/null @@ -1,185 +0,0 @@ -#ifndef __NET_NETLINK_WRAPPER_H -#define __NET_NETLINK_WRAPPER_H 1 - -#include <linux/version.h> -#include_next <net/netlink.h> -#include_next <linux/in6.h> - -#ifndef HAVE_NLA_GET_BE16 -/** - * nla_get_be16 - return payload of __be16 attribute - * @nla: __be16 netlink attribute - */ -static inline __be16 nla_get_be16(const struct nlattr *nla) -{ - return *(__be16 *) nla_data(nla); -} -#endif /* !HAVE_NLA_GET_BE16 */ - -#ifndef HAVE_NLA_PUT_BE16 -static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) -{ - return nla_put(skb, attrtype, sizeof(__be16), &value); -} -#endif - -#ifndef HAVE_NLA_PUT_BE32 -static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) -{ - return nla_put(skb, attrtype, sizeof(__be32), &value); -} -#endif - -#ifndef HAVE_NLA_PUT_BE64 -static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value) -{ - return nla_put(skb, attrtype, sizeof(__be64), &value); -} -#endif - -#ifndef nla_for_each_nested -#define nla_for_each_nested(pos, nla, rem) \ - nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) -#endif - -#ifndef HAVE_NLA_FIND_NESTED -static inline struct nlattr *nla_find_nested(struct nlattr *nla, int attrtype) -{ - return nla_find(nla_data(nla), nla_len(nla), attrtype); -} -#endif - -#ifndef HAVE_NLA_IS_LAST -static inline bool nla_is_last(const struct nlattr *nla, int rem) -{ - return nla->nla_len == rem; -} -#endif - -#ifndef HAVE_NLA_PUT_IN_ADDR -static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, - __be32 addr) -{ - return nla_put_be32(skb, attrtype, addr); -} - -static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype, - const struct in6_addr *addr) -{ - return nla_put(skb, attrtype, sizeof(*addr), addr); -} - -static inline __be32 nla_get_in_addr(const struct nlattr *nla) -{ - return *(__be32 *) nla_data(nla); -} - -static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) -{ - struct in6_addr tmp; - - nla_memcpy(&tmp, nla, sizeof(tmp)); - return tmp; -} -#endif - -#ifndef HAVE_NLA_PUT_64BIT -static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) -{ -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - /* The nlattr header is 4 bytes in size, that's why we test - * if the skb->data _is_ aligned. A NOP attribute, plus - * nlattr header for next attribute, will make nla_data() - * 8-byte aligned. - */ - if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) - return true; -#endif - return false; -} - -static inline int nla_align_64bit(struct sk_buff *skb, int padattr) -{ - if (nla_need_padding_for_64bit(skb) && - !nla_reserve(skb, padattr, 0)) - return -EMSGSIZE; - - return 0; -} - -static inline int nla_total_size_64bit(int payload) -{ - return NLA_ALIGN(nla_attr_size(payload)) -#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS - + NLA_ALIGN(nla_attr_size(0)) -#endif - ; -} - -#define nla_put_64bit rpl_nla_put_64bit -int rpl_nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, - const void *data, int padattr); - -#define __nla_put_64bit rpl___nla_put_64bit -void rpl___nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, - const void *data, int padattr); - -#define __nla_reserve_64bit rpl___nla_reserve_64bit -struct nlattr *rpl___nla_reserve_64bit(struct sk_buff *skb, int attrtype, - int attrlen, int padattr); - -static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, - u64 value, int padattr) -{ - return nla_put_64bit(skb, attrtype, sizeof(u64), &value, padattr); -} - -#define nla_put_be64 rpl_nla_put_be64 -static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, - int padattr) -{ - return nla_put_64bit(skb, attrtype, sizeof(__be64), &value, padattr); -} - -#endif - -#ifndef HAVE_NLA_PARSE_DEPRECATED_STRICT -#define nla_parse_nested_deprecated nla_parse_nested -#define nla_parse_deprecated_strict nla_parse -#define genlmsg_parse_deprecated genlmsg_parse - -#ifndef HAVE_NETLINK_EXT_ACK -struct netlink_ext_ack; - -static inline int rpl_nla_parse_nested(struct nlattr *tb[], int maxtype, - const struct nlattr *nla, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) -{ - return nla_parse_nested(tb, maxtype, nla, policy); -} -#undef nla_parse_nested_deprecated -#define nla_parse_nested_deprecated rpl_nla_parse_nested - -static inline int rpl_nla_parse(struct nlattr **tb, int maxtype, - const struct nlattr *head, int len, - const struct nla_policy *policy, - struct netlink_ext_ack *extack) -{ - return nla_parse(tb, maxtype, head, len, policy); -} -#undef nla_parse_deprecated_strict -#define nla_parse_deprecated_strict rpl_nla_parse -#endif -#endif /* HAVE_NLA_PARSE_DEPRECATED_STRICT */ - -#ifndef HAVE_NLA_NEST_START_NOFLAG -static inline struct nlattr *rpl_nla_nest_start_noflag(struct sk_buff *skb, - int attrtype) -{ - return nla_nest_start(skb, attrtype); -} -#define nla_nest_start_noflag rpl_nla_nest_start_noflag -#endif - -#endif /* net/netlink.h */ diff --git a/datapath/linux/compat/include/net/nsh.h b/datapath/linux/compat/include/net/nsh.h deleted file mode 100644 index 76894910c..000000000 --- a/datapath/linux/compat/include/net/nsh.h +++ /dev/null @@ -1,313 +0,0 @@ -#ifndef __NET_NSH_H -#define __NET_NSH_H 1 - -#include <linux/skbuff.h> - -/* - * Network Service Header: - * 0 1 2 3 - * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |Ver|O|U| TTL | Length |U|U|U|U|MD Type| Next Protocol | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Service Path Identifier (SPI) | Service Index | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | | - * ~ Mandatory/Optional Context Headers ~ - * | | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Version: The version field is used to ensure backward compatibility - * going forward with future NSH specification updates. It MUST be set - * to 0x0 by the sender, in this first revision of NSH. Given the - * widespread implementation of existing hardware that uses the first - * nibble after an MPLS label stack for ECMP decision processing, this - * document reserves version 01b and this value MUST NOT be used in - * future versions of the protocol. Please see [RFC7325] for further - * discussion of MPLS-related forwarding requirements. - * - * O bit: Setting this bit indicates an Operations, Administration, and - * Maintenance (OAM) packet. The actual format and processing of SFC - * OAM packets is outside the scope of this specification (see for - * example [I-D.ietf-sfc-oam-framework] for one approach). - * - * The O bit MUST be set for OAM packets and MUST NOT be set for non-OAM - * packets. The O bit MUST NOT be modified along the SFP. - * - * SF/SFF/SFC Proxy/Classifier implementations that do not support SFC - * OAM procedures SHOULD discard packets with O bit set, but MAY support - * a configurable parameter to enable forwarding received SFC OAM - * packets unmodified to the next element in the chain. Forwarding OAM - * packets unmodified by SFC elements that do not support SFC OAM - * procedures may be acceptable for a subset of OAM functions, but can - * result in unexpected outcomes for others, thus it is recommended to - * analyze the impact of forwarding an OAM packet for all OAM functions - * prior to enabling this behavior. The configurable parameter MUST be - * disabled by default. - * - * TTL: Indicates the maximum SFF hops for an SFP. This field is used - * for service plane loop detection. The initial TTL value SHOULD be - * configurable via the control plane; the configured initial value can - * be specific to one or more SFPs. If no initial value is explicitly - * provided, the default initial TTL value of 63 MUST be used. Each SFF - * involved in forwarding an NSH packet MUST decrement the TTL value by - * 1 prior to NSH forwarding lookup. Decrementing by 1 from an incoming - * value of 0 shall result in a TTL value of 63. The packet MUST NOT be - * forwarded if TTL is, after decrement, 0. - * - * All other flag fields, marked U, are unassigned and available for - * future use, see Section 11.2.1. Unassigned bits MUST be set to zero - * upon origination, and MUST be ignored and preserved unmodified by - * other NSH supporting elements. Elements which do not understand the - * meaning of any of these bits MUST NOT modify their actions based on - * those unknown bits. - * - * Length: The total length, in 4-byte words, of NSH including the Base - * Header, the Service Path Header, the Fixed Length Context Header or - * Variable Length Context Header(s). The length MUST be 0x6 for MD - * Type equal to 0x1, and MUST be 0x2 or greater for MD Type equal to - * 0x2. The length of the NSH header MUST be an integer multiple of 4 - * bytes, thus variable length metadata is always padded out to a - * multiple of 4 bytes. - * - * MD Type: Indicates the format of NSH beyond the mandatory Base Header - * and the Service Path Header. MD Type defines the format of the - * metadata being carried. - * - * 0x0 - This is a reserved value. Implementations SHOULD silently - * discard packets with MD Type 0x0. - * - * 0x1 - This indicates that the format of the header includes a fixed - * length Context Header (see Figure 4 below). - * - * 0x2 - This does not mandate any headers beyond the Base Header and - * Service Path Header, but may contain optional variable length Context - * Header(s). The semantics of the variable length Context Header(s) - * are not defined in this document. The format of the optional - * variable length Context Headers is provided in Section 2.5.1. - * - * 0xF - This value is reserved for experimentation and testing, as per - * [RFC3692]. Implementations not explicitly configured to be part of - * an experiment SHOULD silently discard packets with MD Type 0xF. - * - * Next Protocol: indicates the protocol type of the encapsulated data. - * NSH does not alter the inner payload, and the semantics on the inner - * protocol remain unchanged due to NSH service function chaining. - * Please see the IANA Considerations section below, Section 11.2.5. - * - * This document defines the following Next Protocol values: - * - * 0x1: IPv4 - * 0x2: IPv6 - * 0x3: Ethernet - * 0x4: NSH - * 0x5: MPLS - * 0xFE: Experiment 1 - * 0xFF: Experiment 2 - * - * Packets with Next Protocol values not supported SHOULD be silently - * dropped by default, although an implementation MAY provide a - * configuration parameter to forward them. Additionally, an - * implementation not explicitly configured for a specific experiment - * [RFC3692] SHOULD silently drop packets with Next Protocol values 0xFE - * and 0xFF. - * - * Service Path Identifier (SPI): Identifies a service path. - * Participating nodes MUST use this identifier for Service Function - * Path selection. The initial classifier MUST set the appropriate SPI - * for a given classification result. - * - * Service Index (SI): Provides location within the SFP. The initial - * classifier for a given SFP SHOULD set the SI to 255, however the - * control plane MAY configure the initial value of SI as appropriate - * (i.e., taking into account the length of the service function path). - * The Service Index MUST be decremented by a value of 1 by Service - * Functions or by SFC Proxy nodes after performing required services - * and the new decremented SI value MUST be used in the egress packet's - * NSH. The initial Classifier MUST send the packet to the first SFF in - * the identified SFP for forwarding along an SFP. If re-classification - * occurs, and that re-classification results in a new SPI, the - * (re)classifier is, in effect, the initial classifier for the - * resultant SPI. - * - * The SI is used in conjunction the with Service Path Identifier for - * Service Function Path Selection and for determining the next SFF/SF - * in the path. The SI is also valuable when troubleshooting or - * reporting service paths. Additionally, while the TTL field is the - * main mechanism for service plane loop detection, the SI can also be - * used for detecting service plane loops. - * - * When the Base Header specifies MD Type = 0x1, a Fixed Length Context - * Header (16-bytes) MUST be present immediately following the Service - * Path Header. The value of a Fixed Length Context - * Header that carries no metadata MUST be set to zero. - * - * When the base header specifies MD Type = 0x2, zero or more Variable - * Length Context Headers MAY be added, immediately following the - * Service Path Header (see Figure 5). Therefore, Length = 0x2, - * indicates that only the Base Header followed by the Service Path - * Header are present. The optional Variable Length Context Headers - * MUST be of an integer number of 4-bytes. The base header Length - * field MUST be used to determine the offset to locate the original - * packet or frame for SFC nodes that require access to that - * information. - * - * The format of the optional variable length Context Headers - * - * 0 1 2 3 - * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Metadata Class | Type |U| Length | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Variable Metadata | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Metadata Class (MD Class): Defines the scope of the 'Type' field to - * provide a hierarchical namespace. The IANA Considerations - * Section 11.2.4 defines how the MD Class values can be allocated to - * standards bodies, vendors, and others. - * - * Type: Indicates the explicit type of metadata being carried. The - * definition of the Type is the responsibility of the MD Class owner. - * - * Unassigned bit: One unassigned bit is available for future use. This - * bit MUST NOT be set, and MUST be ignored on receipt. - * - * Length: Indicates the length of the variable metadata, in bytes. In - * case the metadata length is not an integer number of 4-byte words, - * the sender MUST add pad bytes immediately following the last metadata - * byte to extend the metadata to an integer number of 4-byte words. - * The receiver MUST round up the length field to the nearest 4-byte - * word boundary, to locate and process the next field in the packet. - * The receiver MUST access only those bytes in the metadata indicated - * by the length field (i.e., actual number of bytes) and MUST ignore - * the remaining bytes up to the nearest 4-byte word boundary. The - * Length may be 0 or greater. - * - * A value of 0 denotes a Context Header without a Variable Metadata - * field. - * - * [0] https://datatracker.ietf.org/doc/draft-ietf-sfc-nsh/ - */ - -/** - * struct nsh_md1_ctx - Keeps track of NSH context data - * @nshc<1-4>: NSH Contexts. - */ -struct nsh_md1_ctx { - __be32 context[4]; -}; - -struct nsh_md2_tlv { - __be16 md_class; - u8 type; - u8 length; - u8 md_value[]; -}; - -struct nshhdr { - __be16 ver_flags_ttl_len; - u8 mdtype; - u8 np; - __be32 path_hdr; - union { - struct nsh_md1_ctx md1; - struct nsh_md2_tlv md2; - }; -}; - -/* Masking NSH header fields. */ -#define NSH_VER_MASK 0xc000 -#define NSH_VER_SHIFT 14 -#define NSH_FLAGS_MASK 0x3000 -#define NSH_FLAGS_SHIFT 12 -#define NSH_TTL_MASK 0x0fc0 -#define NSH_TTL_SHIFT 6 -#define NSH_LEN_MASK 0x003f -#define NSH_LEN_SHIFT 0 - -#define NSH_MDTYPE_MASK 0x0f -#define NSH_MDTYPE_SHIFT 0 - -#define NSH_SPI_MASK 0xffffff00 -#define NSH_SPI_SHIFT 8 -#define NSH_SI_MASK 0x000000ff -#define NSH_SI_SHIFT 0 - -/* MD Type Registry. */ -#define NSH_M_TYPE1 0x01 -#define NSH_M_TYPE2 0x02 -#define NSH_M_EXP1 0xFE -#define NSH_M_EXP2 0xFF - -/* NSH Base Header Length */ -#define NSH_BASE_HDR_LEN 8 - -/* NSH MD Type 1 header Length. */ -#define NSH_M_TYPE1_LEN 24 - -/* NSH header maximum Length. */ -#define NSH_HDR_MAX_LEN 252 - -/* NSH context headers maximum Length. */ -#define NSH_CTX_HDRS_MAX_LEN 244 - -static inline struct nshhdr *nsh_hdr(struct sk_buff *skb) -{ - return (struct nshhdr *)skb_network_header(skb); -} - -static inline u16 nsh_hdr_len(const struct nshhdr *nsh) -{ - return ((ntohs(nsh->ver_flags_ttl_len) & NSH_LEN_MASK) - >> NSH_LEN_SHIFT) << 2; -} - -static inline u8 nsh_get_ver(const struct nshhdr *nsh) -{ - return (ntohs(nsh->ver_flags_ttl_len) & NSH_VER_MASK) - >> NSH_VER_SHIFT; -} - -static inline u8 nsh_get_flags(const struct nshhdr *nsh) -{ - return (ntohs(nsh->ver_flags_ttl_len) & NSH_FLAGS_MASK) - >> NSH_FLAGS_SHIFT; -} - -static inline u8 nsh_get_ttl(const struct nshhdr *nsh) -{ - return (ntohs(nsh->ver_flags_ttl_len) & NSH_TTL_MASK) - >> NSH_TTL_SHIFT; -} - -static inline void __nsh_set_xflag(struct nshhdr *nsh, u16 xflag, u16 xmask) -{ - nsh->ver_flags_ttl_len - = (nsh->ver_flags_ttl_len & ~htons(xmask)) | htons(xflag); -} - -static inline void nsh_set_flags_and_ttl(struct nshhdr *nsh, u8 flags, u8 ttl) -{ - __nsh_set_xflag(nsh, ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK) | - ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK), - NSH_FLAGS_MASK | NSH_TTL_MASK); -} - -static inline void nsh_set_flags_ttl_len(struct nshhdr *nsh, u8 flags, - u8 ttl, u8 len) -{ - len = len >> 2; - __nsh_set_xflag(nsh, ((flags << NSH_FLAGS_SHIFT) & NSH_FLAGS_MASK) | - ((ttl << NSH_TTL_SHIFT) & NSH_TTL_MASK) | - ((len << NSH_LEN_SHIFT) & NSH_LEN_MASK), - NSH_FLAGS_MASK | NSH_TTL_MASK | NSH_LEN_MASK); -} - -int ovs_nsh_init(void); -void ovs_nsh_cleanup(void); - -int ovs_nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh); -int ovs_nsh_pop(struct sk_buff *skb); - -#endif /* __NET_NSH_H */ diff --git a/datapath/linux/compat/include/net/protocol.h b/datapath/linux/compat/include/net/protocol.h deleted file mode 100644 index 0247a26c7..000000000 --- a/datapath/linux/compat/include/net/protocol.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _NET_PROTOCOL_WRAPPER_H -#define _NET_PROTOCOL_WRAPPER_H - -#include_next <net/protocol.h> - -#ifdef HAVE_UDP_OFFLOAD - -#ifndef HAVE_UDP_ADD_OFFLOAD_TAKES_NET -#define udp_add_offload(net, prot) udp_add_offload(prot) -#endif - -#else - -#define udp_add_offload(net, prot) 0 -#define udp_del_offload(prot) do {} while(0) - -#endif /* HAVE_UDP_OFFLOAD */ - -#endif /* _NET_PROTOCOL_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/route.h b/datapath/linux/compat/include/net/route.h deleted file mode 100644 index 9e4a1f18a..000000000 --- a/datapath/linux/compat/include/net/route.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __NET_ROUTE_H_WRAPPER -#define __NET_ROUTE_H_WRAPPER - -#include_next <net/route.h> - -#endif diff --git a/datapath/linux/compat/include/net/rtnetlink.h b/datapath/linux/compat/include/net/rtnetlink.h deleted file mode 100644 index e026cab95..000000000 --- a/datapath/linux/compat/include/net/rtnetlink.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef __NET_RTNETLINK_WRAPPER_H -#define __NET_RTNETLINK_WRAPPER_H -#include_next <net/rtnetlink.h> - -#define rtnl_delete_link rpl_rtnl_delete_link -int rpl_rtnl_delete_link(struct net_device *dev); - -#ifndef HAVE_NAME_ASSIGN_TYPE -#ifdef HAVE_RTNL_CREATE_LINK_SRC_NET -static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname, - unsigned char name_assign_type, - const struct rtnl_link_ops *ops, - struct nlattr *tb[]) -{ - return rtnl_create_link(net, net, (char *)ifname, ops, tb); -} - -#else -static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname, - unsigned char name_assign_type, - const struct rtnl_link_ops *ops, - struct nlattr *tb[]) -{ - return rtnl_create_link(net, (char *)ifname, ops, tb); -} -#endif -#else -/* This function is only defined to avoid warning related to ifname. Some backported - * function did not changed the name to const type. */ -static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname, - unsigned char name_assign_type, - const struct rtnl_link_ops *ops, - struct nlattr *tb[]) -{ -#ifdef HAVE_RTNL_CREATE_LINK_TAKES_EXTACK - return rtnl_create_link(net, (char *) ifname, name_assign_type, ops, tb, NULL); -#else - return rtnl_create_link(net, (char *) ifname, name_assign_type, ops, tb); -#endif -} -#endif - -#define rtnl_create_link rpl_rtnl_create_link -#endif diff --git a/datapath/linux/compat/include/net/sctp/checksum.h b/datapath/linux/compat/include/net/sctp/checksum.h deleted file mode 100644 index 7832abce0..000000000 --- a/datapath/linux/compat/include/net/sctp/checksum.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef __SCTP_CHECKSUM_WRAPPER_H -#define __SCTP_CHECKSUM_WRAPPER_H 1 - -#include_next <net/sctp/checksum.h> - -#ifndef HAVE_SCTP_COMPUTE_CKSUM -static inline __le32 sctp_compute_cksum(const struct sk_buff *skb, - unsigned int offset) -{ - const struct sk_buff *iter; - - __u32 crc32 = sctp_start_cksum(skb->data + offset, - skb_headlen(skb) - offset); - skb_walk_frags(skb, iter) - crc32 = sctp_update_cksum((__u8 *) iter->data, - skb_headlen(iter), crc32); - - /* Open-code sctp_end_cksum() to avoid a sparse warning due to a bug in - * sparse annotations in Linux fixed in 3.10 in commit eee1d5a14 (sctp: - * Correct type and usage of sctp_end_cksum()). */ - return cpu_to_le32(~crc32); -} -#endif - -#endif diff --git a/datapath/linux/compat/include/net/sock.h b/datapath/linux/compat/include/net/sock.h deleted file mode 100644 index 2900704ec..000000000 --- a/datapath/linux/compat/include/net/sock.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __NET_SOCK_WRAPPER_H -#define __NET_SOCK_WRAPPER_H 1 - -#include_next <net/sock.h> - -#ifndef __sk_user_data -#define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) - -#define rcu_dereference_sk_user_data(sk) rcu_dereference(__sk_user_data((sk))) -#define rcu_assign_sk_user_data(sk, ptr) rcu_assign_pointer(__sk_user_data((sk)), ptr) -#endif - -#endif diff --git a/datapath/linux/compat/include/net/stt.h b/datapath/linux/compat/include/net/stt.h deleted file mode 100644 index d2e63d163..000000000 --- a/datapath/linux/compat/include/net/stt.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef __NET_STT_H -#define __NET_STT_H 1 - -#include <linux/kconfig.h> -#include <linux/errno.h> -#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) && IS_ENABLED(CONFIG_NETFILTER) -#include <net/ip_tunnels.h> -#define OVS_STT - -struct stthdr { - __u8 version; - __u8 flags; - __u8 l4_offset; - __u8 reserved; - __be16 mss; - __be16 vlan_tci; - __be64 key; -}; - -/* Padding after the end of the tunnel headers to provide alignment - * for inner packet IP header after 14 byte Ethernet header. - */ -#define STT_ETH_PAD 2 - -#define STT_BASE_HLEN (sizeof(struct stthdr) + STT_ETH_PAD) -#define STT_HEADER_LEN (sizeof(struct tcphdr) + STT_BASE_HLEN) - -static inline struct stthdr *stt_hdr(const struct sk_buff *skb) -{ - return (struct stthdr *)(skb_transport_header(skb) + - sizeof(struct tcphdr)); -} - -struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port); - -netdev_tx_t ovs_stt_xmit(struct sk_buff *skb); - -int ovs_stt_init_module(void); - -void ovs_stt_cleanup_module(void); -#else -static inline int ovs_stt_init_module(void) -{ - return 0; -} - -static inline void ovs_stt_cleanup_module(void) -{} - -static inline struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port) -{ - return ERR_PTR(-EOPNOTSUPP); -} -static inline netdev_tx_t ovs_stt_xmit(struct sk_buff *skb) -{ - BUG(); - return NETDEV_TX_OK; -} -#endif - -#define stt_dev_create_fb ovs_stt_dev_create_fb -#define stt_init_module ovs_stt_init_module -#define stt_cleanup_module ovs_stt_cleanup_module - -#define stt_fill_metadata_dst ovs_stt_fill_metadata_dst -int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); - -#endif /*ifdef__NET_STT_H */ diff --git a/datapath/linux/compat/include/net/tun_proto.h b/datapath/linux/compat/include/net/tun_proto.h deleted file mode 100644 index 2ea3deba4..000000000 --- a/datapath/linux/compat/include/net/tun_proto.h +++ /dev/null @@ -1,49 +0,0 @@ -#ifndef __NET_TUN_PROTO_H -#define __NET_TUN_PROTO_H - -#include <linux/kernel.h> - -/* One byte protocol values as defined by VXLAN-GPE and NSH. These will - * hopefully get a shared IANA registry. - */ -#define TUN_P_IPV4 0x01 -#define TUN_P_IPV6 0x02 -#define TUN_P_ETHERNET 0x03 -#define TUN_P_NSH 0x04 -#define TUN_P_MPLS_UC 0x05 - -static inline __be16 tun_p_to_eth_p(u8 proto) -{ - switch (proto) { - case TUN_P_IPV4: - return htons(ETH_P_IP); - case TUN_P_IPV6: - return htons(ETH_P_IPV6); - case TUN_P_ETHERNET: - return htons(ETH_P_TEB); - case TUN_P_NSH: - return htons(ETH_P_NSH); - case TUN_P_MPLS_UC: - return htons(ETH_P_MPLS_UC); - } - return 0; -} - -static inline u8 tun_p_from_eth_p(__be16 proto) -{ - switch (proto) { - case htons(ETH_P_IP): - return TUN_P_IPV4; - case htons(ETH_P_IPV6): - return TUN_P_IPV6; - case htons(ETH_P_TEB): - return TUN_P_ETHERNET; - case htons(ETH_P_NSH): - return TUN_P_NSH; - case htons(ETH_P_MPLS_UC): - return TUN_P_MPLS_UC; - } - return 0; -} - -#endif diff --git a/datapath/linux/compat/include/net/udp.h b/datapath/linux/compat/include/net/udp.h deleted file mode 100644 index 447999218..000000000 --- a/datapath/linux/compat/include/net/udp.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef __NET_UDP_WRAPPER_H -#define __NET_UDP_WRAPPER_H 1 - -#include <net/ip.h> - -#ifdef inet_get_local_port_range -/* Earlier RHEL7 kernels backport udp_flow_src_port() using an older version of - * inet_get_local_port_range(). */ -#undef inet_get_local_port_range -#include_next <net/udp.h> -#define inet_get_local_port_range rpl_inet_get_local_port_range -#else -#include_next <net/udp.h> -#endif - -#ifndef HAVE_UDP_FLOW_SRC_PORT -static inline __be16 rpl_udp_flow_src_port(struct net *net, struct sk_buff *skb, - int min, int max, bool use_eth) -{ - u32 hash; - - if (min >= max) { - /* Use default range */ - inet_get_local_port_range(net, &min, &max); - } - - hash = skb_get_hash(skb); - if (unlikely(!hash) && use_eth) { - /* Can't find a normal hash, caller has indicated an Ethernet - * packet so use that to compute a hash. - */ - hash = jhash(skb->data, 2 * ETH_ALEN, - (__force u32) skb->protocol); - } - - /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an - * attacker is leaked. Only upper 16 bits are relevant in the - * computation for 16 bit port value. - */ - hash ^= hash << 16; - - return htons((((u64) hash * (max - min)) >> 32) + min); -} - -#define udp_flow_src_port rpl_udp_flow_src_port -#endif - -#ifndef HAVE_UDP_V4_CHECK -static inline __sum16 udp_v4_check(int len, __be32 saddr, - __be32 daddr, __wsum base) -{ - return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); -} -#endif - -#ifndef USE_UPSTREAM_TUNNEL -#define udp_set_csum rpl_udp_set_csum -void rpl_udp_set_csum(bool nocheck, struct sk_buff *skb, - __be32 saddr, __be32 daddr, int len); -#endif -#endif diff --git a/datapath/linux/compat/include/net/udp_tunnel.h b/datapath/linux/compat/include/net/udp_tunnel.h deleted file mode 100644 index 6e4063359..000000000 --- a/datapath/linux/compat/include/net/udp_tunnel.h +++ /dev/null @@ -1,208 +0,0 @@ -#ifndef __NET_UDP_TUNNEL_WRAPPER_H -#define __NET_UDP_TUNNEL_WRAPPER_H - -#include <linux/version.h> -#include <linux/kconfig.h> - -#include <net/addrconf.h> -#include <net/dst_metadata.h> -#include <linux/netdev_features.h> - -#ifdef USE_UPSTREAM_TUNNEL -#include_next <net/udp_tunnel.h> - -#else - -#include <net/addrconf.h> -#include <net/ip_tunnels.h> -#include <net/udp.h> - -struct udp_port_cfg { - u8 family; - - /* Used only for kernel-created sockets */ - union { - struct in_addr local_ip; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr local_ip6; -#endif - }; - - union { - struct in_addr peer_ip; -#if IS_ENABLED(CONFIG_IPV6) - struct in6_addr peer_ip6; -#endif - }; - - __be16 local_udp_port; - __be16 peer_udp_port; - unsigned int use_udp_checksums:1, - use_udp6_tx_checksums:1, - use_udp6_rx_checksums:1, - ipv6_v6only:1; -}; - -#ifdef HAVE_NDO_UDP_TUNNEL_ADD -enum udp_parsable_tunnel_type { - UDP_TUNNEL_TYPE_VXLAN, /* RFC 7348 */ - UDP_TUNNEL_TYPE_GENEVE, /* draft-ietf-nvo3-geneve */ - UDP_TUNNEL_TYPE_VXLAN_GPE, /* draft-ietf-nvo3-vxlan-gpe */ -}; - -struct udp_tunnel_info { - unsigned short type; - sa_family_t sa_family; - __be16 port; -}; -#endif - -#define udp_sock_create4 rpl_udp_sock_create4 -int rpl_udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp); - -#define udp_sock_create6 rpl_udp_sock_create6 -#if IS_ENABLED(CONFIG_IPV6) -int rpl_udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp); -#else -static inline int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) -{ - return -EPFNOSUPPORT; -} -#endif - -#define udp_sock_create rpl_udp_sock_create -static inline int udp_sock_create(struct net *net, - struct udp_port_cfg *cfg, - struct socket **sockp) -{ - if (cfg->family == AF_INET) - return udp_sock_create4(net, cfg, sockp); - - if (cfg->family == AF_INET6) - return udp_sock_create6(net, cfg, sockp); - - return -EPFNOSUPPORT; -} - -typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); -typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); -typedef struct sk_buff **(*udp_tunnel_gro_receive_t)(struct sock *sk, - struct sk_buff **head, - struct sk_buff *skb); -typedef int (*udp_tunnel_gro_complete_t)(struct sock *sk, struct sk_buff *skb, - int nhoff); - -struct udp_tunnel_sock_cfg { - void *sk_user_data; /* user data used by encap_rcv call back */ - /* Used for setting up udp_sock fields, see udp.h for details */ - __u8 encap_type; - udp_tunnel_encap_rcv_t encap_rcv; - udp_tunnel_encap_destroy_t encap_destroy; -#ifdef HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE - udp_tunnel_gro_receive_t gro_receive; - udp_tunnel_gro_complete_t gro_complete; -#endif -}; - -/* Setup the given (UDP) sock to receive UDP encapsulated packets */ -#define setup_udp_tunnel_sock rpl_setup_udp_tunnel_sock -void rpl_setup_udp_tunnel_sock(struct net *net, struct socket *sock, - struct udp_tunnel_sock_cfg *sock_cfg); - -/* Transmit the skb using UDP encapsulation. */ -#define udp_tunnel_xmit_skb rpl_udp_tunnel_xmit_skb -void rpl_udp_tunnel_xmit_skb(struct rtable *rt, - struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, - __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); - - -#define udp_tunnel_sock_release rpl_udp_tunnel_sock_release -void rpl_udp_tunnel_sock_release(struct socket *sock); - -#define udp_tunnel_encap_enable rpl_udp_tunnel_encap_enable -static inline void udp_tunnel_encap_enable(struct socket *sock) -{ -#if IS_ENABLED(CONFIG_IPV6) - if (sock->sk->sk_family == PF_INET6) -#ifdef HAVE_IPV6_STUB - ipv6_stub->udpv6_encap_enable(); -#else - udpv6_encap_enable(); -#endif - else -#endif - udp_encap_enable(); -} - -#if IS_ENABLED(CONFIG_IPV6) -#define udp_tunnel6_xmit_skb rpl_udp_tunnel6_xmit_skb -int rpl_udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, __be16 src_port, - __be16 dst_port, bool nocheck); -#endif - -static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff) -{ - struct udphdr *uh; - - uh = (struct udphdr *)(skb->data + nhoff - sizeof(struct udphdr)); - skb_shinfo(skb)->gso_type |= uh->check ? - SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; -} - -void ovs_udp_gso(struct sk_buff *skb); -void ovs_udp_csum_gso(struct sk_buff *skb); - -static inline int rpl_udp_tunnel_handle_offloads(struct sk_buff *skb, - bool udp_csum) -{ - void (*fix_segment)(struct sk_buff *); - int type = 0; - - type |= udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; -#ifndef USE_UPSTREAM_TUNNEL_GSO - if (!udp_csum) - fix_segment = ovs_udp_gso; - else - fix_segment = ovs_udp_csum_gso; - /* This functuin is not used by vxlan lan tunnel. On older - * udp offload only supports vxlan, therefore fallback to software - * segmentation. - */ - type = 0; -#else - fix_segment = NULL; -#endif - - return ovs_iptunnel_handle_offloads(skb, type, fix_segment); -} - -#define udp_tunnel_handle_offloads rpl_udp_tunnel_handle_offloads -static inline void ovs_udp_tun_rx_dst(struct metadata_dst *md_dst, - struct sk_buff *skb, - unsigned short family, - __be16 flags, __be64 tunnel_id, int md_size) -{ - struct ip_tunnel_info *info = &md_dst->u.tun_info; - - if (family == AF_INET) - ovs_ip_tun_rx_dst(md_dst, skb, flags, tunnel_id, md_size); - else - ovs_ipv6_tun_rx_dst(md_dst, skb, flags, tunnel_id, md_size); - - info->key.tp_src = udp_hdr(skb)->source; - info->key.tp_dst = udp_hdr(skb)->dest; - if (udp_hdr(skb)->check) - info->key.tun_flags |= TUNNEL_CSUM; -} -#endif /* USE_UPSTREAM_TUNNEL */ - -#endif diff --git a/datapath/linux/compat/include/net/vrf.h b/datapath/linux/compat/include/net/vrf.h deleted file mode 100644 index f5b6e8900..000000000 --- a/datapath/linux/compat/include/net/vrf.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * include/net/net_vrf.h - adds vrf dev structure definitions - * Copyright (c) 2015 Cumulus Networks - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __LINUX_NET_VRF_WRAPPER_H -#define __LINUX_NET_VRF_WRAPPER_H - -#include <linux/version.h> - -#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0) -#include_next <net/vrf.h> -#else - -static inline int vrf_master_ifindex_rcu(const struct net_device *dev) -{ - return 0; -} -#endif - -#endif /* __LINUX_NET_VRF_WRAPPER_H */ diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h deleted file mode 100644 index 18f5474d9..000000000 --- a/datapath/linux/compat/include/net/vxlan.h +++ /dev/null @@ -1,444 +0,0 @@ -#ifndef __NET_VXLAN_WRAPPER_H -#define __NET_VXLAN_WRAPPER_H 1 - -#ifdef CONFIG_INET -#include <net/udp_tunnel.h> -#endif - -#ifdef USE_UPSTREAM_TUNNEL -#include_next <net/vxlan.h> - -static inline int rpl_vxlan_init_module(void) -{ - return 0; -} -static inline void rpl_vxlan_cleanup_module(void) -{} - -#define vxlan_xmit dev_queue_xmit - -#ifdef CONFIG_INET -#ifndef HAVE_NAME_ASSIGN_TYPE -static inline struct net_device *rpl_vxlan_dev_create( - struct net *net, const char *name, u8 name_assign_type, - struct vxlan_config *conf) { - return vxlan_dev_create(net, name, conf); -} -#define vxlan_dev_create rpl_vxlan_dev_create -#endif -#endif - -#else /* USE_UPSTREAM_TUNNEL */ - -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/if_vlan.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/udp.h> -#include <net/dst_cache.h> -#include <net/dst_metadata.h> - -#include "compat.h" -#include "gso.h" - -/* VXLAN protocol (RFC 7348) header: - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |R|R|R|R|I|R|R|R| Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * I = VXLAN Network Identifier (VNI) present. - */ -struct vxlanhdr { - __be32 vx_flags; - __be32 vx_vni; -}; - -/* VXLAN header flags. */ -#define VXLAN_HF_VNI cpu_to_be32(BIT(27)) - -#define VXLAN_N_VID (1u << 24) -#define VXLAN_VID_MASK (VXLAN_N_VID - 1) -#define VXLAN_VNI_MASK cpu_to_be32(VXLAN_VID_MASK << 8) -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) - -#define VNI_HASH_BITS 10 -#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) -#define FDB_HASH_BITS 8 -#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) - -/* Remote checksum offload for VXLAN (VXLAN_F_REMCSUM_[RT]X): - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |R|R|R|R|I|R|R|R|R|R|C| Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) |O| Csum start | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * C = Remote checksum offload bit. When set indicates that the - * remote checksum offload data is present. - * - * O = Offset bit. Indicates the checksum offset relative to - * checksum start. - * - * Csum start = Checksum start divided by two. - * - * http://tools.ietf.org/html/draft-herbert-vxlan-rco - */ - -/* VXLAN-RCO header flags. */ -#define VXLAN_HF_RCO cpu_to_be32(BIT(21)) - -/* Remote checksum offload header option */ -#define VXLAN_RCO_MASK cpu_to_be32(0x7f) /* Last byte of vni field */ -#define VXLAN_RCO_UDP cpu_to_be32(0x80) /* Indicate UDP RCO (TCP when not set *) */ -#define VXLAN_RCO_SHIFT 1 /* Left shift of start */ -#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1) -#define VXLAN_MAX_REMCSUM_START (0x7f << VXLAN_RCO_SHIFT) - -/* - * VXLAN Group Based Policy Extension (VXLAN_F_GBP): - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |G|R|R|R|I|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * G = Group Policy ID present. - * - * D = Don't Learn bit. When set, this bit indicates that the egress - * VTEP MUST NOT learn the source address of the encapsulated frame. - * - * A = Indicates that the group policy has already been applied to - * this packet. Policies MUST NOT be applied by devices when the - * A bit is set. - * - * https://tools.ietf.org/html/draft-smith-vxlan-group-policy - */ -struct vxlanhdr_gbp { - u8 vx_flags; -#ifdef __LITTLE_ENDIAN_BITFIELD - u8 reserved_flags1:3, - policy_applied:1, - reserved_flags2:2, - dont_learn:1, - reserved_flags3:1; -#elif defined(__BIG_ENDIAN_BITFIELD) - u8 reserved_flags1:1, - dont_learn:1, - reserved_flags2:2, - policy_applied:1, - reserved_flags3:3; -#else -#error "Please fix <asm/byteorder.h>" -#endif - __be16 policy_id; - __be32 vx_vni; -}; - -/* VXLAN-GBP header flags. */ -#define VXLAN_HF_GBP cpu_to_be32(BIT(31)) - -#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | cpu_to_be32(0xFFFFFF)) - -/* skb->mark mapping - * - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |R|R|R|R|R|R|R|R|R|D|R|R|A|R|R|R| Group Policy ID | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - */ -#define VXLAN_GBP_DONT_LEARN (BIT(6) << 16) -#define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16) -#define VXLAN_GBP_ID_MASK (0xFFFF) - -/* - * VXLAN Generic Protocol Extension (VXLAN_F_GPE): - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |R|R|Ver|I|P|R|O| Reserved |Next Protocol | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | VXLAN Network Identifier (VNI) | Reserved | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - * Ver = Version. Indicates VXLAN GPE protocol version. - * - * P = Next Protocol Bit. The P bit is set to indicate that the - * Next Protocol field is present. - * - * O = OAM Flag Bit. The O bit is set to indicate that the packet - * is an OAM packet. - * - * Next Protocol = This 8 bit field indicates the protocol header - * immediately following the VXLAN GPE header. - * - * https://tools.ietf.org/html/draft-ietf-nvo3-vxlan-gpe-01 - */ - -struct vxlanhdr_gpe { -#if defined(__LITTLE_ENDIAN_BITFIELD) - u8 oam_flag:1, - reserved_flags1:1, - np_applied:1, - instance_applied:1, - version:2, -reserved_flags2:2; -#elif defined(__BIG_ENDIAN_BITFIELD) - u8 reserved_flags2:2, - version:2, - instance_applied:1, - np_applied:1, - reserved_flags1:1, - oam_flag:1; -#endif - u8 reserved_flags3; - u8 reserved_flags4; - u8 next_protocol; - __be32 vx_vni; -}; - -/* VXLAN-GPE header flags. */ -#define VXLAN_HF_VER cpu_to_be32(BIT(29) | BIT(28)) -#define VXLAN_HF_NP cpu_to_be32(BIT(26)) -#define VXLAN_HF_OAM cpu_to_be32(BIT(24)) - -#define VXLAN_GPE_USED_BITS (VXLAN_HF_VER | VXLAN_HF_NP | VXLAN_HF_OAM | \ - cpu_to_be32(0xff)) - -struct vxlan_metadata { - u32 gbp; -}; - -/* per UDP socket information */ -struct vxlan_sock { - struct hlist_node hlist; - struct socket *sock; - struct hlist_head vni_list[VNI_HASH_SIZE]; - atomic_t refcnt; - u32 flags; -#ifdef HAVE_UDP_OFFLOAD - struct udp_offload udp_offloads; -#endif -}; - -union vxlan_addr { - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - struct sockaddr sa; -}; - -struct vxlan_rdst { - union vxlan_addr remote_ip; - __be16 remote_port; - __be32 remote_vni; - u32 remote_ifindex; - struct list_head list; - struct rcu_head rcu; - struct dst_cache dst_cache; -}; - -struct vxlan_config { - union vxlan_addr remote_ip; - union vxlan_addr saddr; - __be32 vni; - int remote_ifindex; - int mtu; - __be16 dst_port; - u16 port_min; - u16 port_max; - u8 tos; - u8 ttl; - __be32 label; - u32 flags; - unsigned long age_interval; - unsigned int addrmax; - bool no_share; -}; - -/* Pseudo network device */ -struct vxlan_dev { - struct hlist_node hlist; /* vni hash table */ - struct list_head next; /* vxlan's per namespace list */ - struct vxlan_sock __rcu *vn4_sock; /* listening socket for IPv4 */ -#if IS_ENABLED(CONFIG_IPV6) - struct vxlan_sock __rcu *vn6_sock; /* listening socket for IPv6 */ -#endif - struct net_device *dev; - struct net *net; /* netns for packet i/o */ - struct vxlan_rdst default_dst; /* default destination */ - u32 flags; /* VXLAN_F_* in vxlan.h */ - - struct timer_list age_timer; - spinlock_t hash_lock; - unsigned int addrcnt; - - struct vxlan_config cfg; - - struct hlist_head fdb_head[FDB_HASH_SIZE]; -}; - -#define VXLAN_F_LEARN 0x01 -#define VXLAN_F_PROXY 0x02 -#define VXLAN_F_RSC 0x04 -#define VXLAN_F_L2MISS 0x08 -#define VXLAN_F_L3MISS 0x10 -#define VXLAN_F_IPV6 0x20 -#define VXLAN_F_UDP_ZERO_CSUM_TX 0x40 -#define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80 -#define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100 -#define VXLAN_F_REMCSUM_TX 0x200 -#define VXLAN_F_REMCSUM_RX 0x400 -#define VXLAN_F_GBP 0x800 -#define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 -#define VXLAN_F_COLLECT_METADATA 0x2000 -#define VXLAN_F_GPE 0x4000 - -/* Flags that are used in the receive path. These flags must match in - * order for a socket to be shareable - */ -#define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \ - VXLAN_F_GPE | \ - VXLAN_F_UDP_ZERO_CSUM6_RX | \ - VXLAN_F_REMCSUM_RX | \ - VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA) - -/* Flags that can be set together with VXLAN_F_GPE. */ -#define VXLAN_F_ALLOWED_GPE (VXLAN_F_GPE | \ - VXLAN_F_IPV6 | \ - VXLAN_F_UDP_ZERO_CSUM_TX | \ - VXLAN_F_UDP_ZERO_CSUM6_TX | \ - VXLAN_F_UDP_ZERO_CSUM6_RX | \ - VXLAN_F_COLLECT_METADATA) - -#define vxlan_dev_create rpl_vxlan_dev_create -struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, - u8 name_assign_type, struct vxlan_config *conf); - -static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, - netdev_features_t features) -{ - u8 l4_hdr = 0; - - if (!skb->encapsulation) - return features; - - switch (vlan_get_protocol(skb)) { - case htons(ETH_P_IP): - l4_hdr = ip_hdr(skb)->protocol; - break; - case htons(ETH_P_IPV6): - l4_hdr = ipv6_hdr(skb)->nexthdr; - break; - default: - return features;; - } - - if ((l4_hdr == IPPROTO_UDP) && ( -#ifdef HAVE_INNER_PROTOCOL_TYPE - skb->inner_protocol_type != ENCAP_TYPE_ETHER || -#endif -#ifdef HAVE_INNER_PROTOCOL - skb->inner_protocol != htons(ETH_P_TEB) || -#endif - (skb_inner_mac_header(skb) - skb_transport_header(skb) != - sizeof(struct udphdr) + sizeof(struct vxlanhdr)) || - (skb->ip_summed != CHECKSUM_NONE && - !can_checksum_protocol(features, inner_eth_hdr(skb)->h_proto)))) - return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); - - return features; -} - -/* IP header + UDP + VXLAN + Ethernet header */ -#define VXLAN_HEADROOM (20 + 8 + 8 + 14) -/* IPv6 header + UDP + VXLAN + Ethernet header */ -#define VXLAN6_HEADROOM (40 + 8 + 8 + 14) - -static inline struct vxlanhdr *vxlan_hdr(struct sk_buff *skb) -{ - return (struct vxlanhdr *)(udp_hdr(skb) + 1); -} - -static inline __be32 vxlan_vni(__be32 vni_field) -{ -#if defined(__BIG_ENDIAN) - return (__force __be32)((__force u32)vni_field >> 8); -#else - return (__force __be32)((__force u32)(vni_field & VXLAN_VNI_MASK) << 8); -#endif -} - -static inline __be32 vxlan_vni_field(__be32 vni) -{ -#if defined(__BIG_ENDIAN) - return (__force __be32)((__force u32)vni << 8); -#else - return (__force __be32)((__force u32)vni >> 8); -#endif -} - -static inline __be32 vxlan_tun_id_to_vni(__be64 tun_id) -{ -#if defined(__BIG_ENDIAN) - return (__force __be32)tun_id; -#else - return (__force __be32)((__force u64)tun_id >> 32); -#endif -} - -static inline __be64 vxlan_vni_to_tun_id(__be32 vni) -{ -#if defined(__BIG_ENDIAN) - return (__force __be64)vni; -#else - return (__force __be64)((u64)(__force u32)vni << 32); -#endif -} - -static inline size_t vxlan_rco_start(__be32 vni_field) -{ - return be32_to_cpu(vni_field & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; -} - -static inline size_t vxlan_rco_offset(__be32 vni_field) -{ - return (vni_field & VXLAN_RCO_UDP) ? - offsetof(struct udphdr, check) : - offsetof(struct tcphdr, check); -} - -static inline __be32 vxlan_compute_rco(unsigned int start, unsigned int offset) -{ - __be32 vni_field = cpu_to_be32(start >> VXLAN_RCO_SHIFT); - - if (offset == offsetof(struct udphdr, check)) - vni_field |= VXLAN_RCO_UDP; - return vni_field; -} - -static inline void vxlan_get_rx_port(struct net_device *netdev) -{ - ASSERT_RTNL(); - call_netdevice_notifiers(NETDEV_OFFLOAD_PUSH_VXLAN, netdev); -} - -static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs) -{ - return vs->sock->sk->sk_family; -} - -int rpl_vxlan_init_module(void); -void rpl_vxlan_cleanup_module(void); - -#define vxlan_fill_metadata_dst ovs_vxlan_fill_metadata_dst -int ovs_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb); - -#define vxlan_xmit rpl_vxlan_xmit -netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb); - -#endif /* USE_UPSTREAM_TUNNEL */ - -#define vxlan_init_module rpl_vxlan_init_module -#define vxlan_cleanup_module rpl_vxlan_cleanup_module - -#endif diff --git a/datapath/linux/compat/include/uapi/linux/netfilter.h b/datapath/linux/compat/include/uapi/linux/netfilter.h deleted file mode 100644 index 56895b17b..000000000 --- a/datapath/linux/compat/include/uapi/linux/netfilter.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _NETFILTER_WRAPPER_H -#define _NETFILTER_WRAPPER_H - -#include_next <uapi/linux/netfilter.h> - -/* - * NFPROTO_INET was introduced in net-next commit 1d49144c0aaa - * ("netfilter: nf_tables: add "inet" table for IPv4/IPv6") in v3.14. - * Define this symbol to support back to v3.10 kernel. */ -#ifndef HAVE_NFPROTO_INET -#define NFPROTO_INET 1 -#endif - -#endif /* _NETFILTER_WRAPPER_H */ diff --git a/datapath/linux/compat/inet_fragment.c b/datapath/linux/compat/inet_fragment.c deleted file mode 100644 index 21736e61a..000000000 --- a/datapath/linux/compat/inet_fragment.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * inet fragments management - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Authors: Pavel Emelyanov <xemul@openvz.org> - * Started as consolidation of ipv4/ip_fragment.c, - * ipv6/reassembly. and ipv6 nf conntrack reassembly - */ - -#ifndef HAVE_CORRECT_MRU_HANDLING - -#include <linux/list.h> -#include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/timer.h> -#include <linux/mm.h> -#include <linux/random.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/slab.h> - -#include <net/sock.h> -#include <net/inet_frag.h> -#include <net/inet_ecn.h> - - -#endif /* !HAVE_CORRECT_MRU_HANDLING */ diff --git a/datapath/linux/compat/ip6_gre.c b/datapath/linux/compat/ip6_gre.c deleted file mode 100644 index 3aa9844b3..000000000 --- a/datapath/linux/compat/ip6_gre.c +++ /dev/null @@ -1,2746 +0,0 @@ -/* - * GRE over IPv6 protocol decoder. - * - * Authors: Dmitry Kozlov (xeb@mail.ru) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#ifndef USE_UPSTREAM_TUNNEL -#include <linux/capability.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/if_arp.h> -#include <linux/init.h> -#include <linux/in6.h> -#include <linux/inetdevice.h> -#include <linux/igmp.h> -#include <linux/netfilter_ipv4.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/hash.h> -#include <linux/if_tunnel.h> -#include <linux/ip6_tunnel.h> - -#include <net/sock.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/icmp.h> -#include <net/protocol.h> -#include <net/addrconf.h> -#include <net/arp.h> -#include <net/checksum.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/xfrm.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/rtnetlink.h> - -#include <net/ipv6.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> -#include <net/ip6_tunnel.h> -#include <net/gre.h> -#include <net/erspan.h> -#include <net/dst_metadata.h> - -#include "vport-netdev.h" - -#define IP6_GRE_HASH_SIZE_SHIFT 5 -#define IP6_GRE_HASH_SIZE (1 << IP6_GRE_HASH_SIZE_SHIFT) - -static unsigned int ip6gre_net_id __read_mostly; -static bool ip6_gre_loaded = false; -struct ip6gre_net { - struct ip6_tnl __rcu *tunnels[4][IP6_GRE_HASH_SIZE]; - - struct ip6_tnl __rcu *collect_md_tun; - struct ip6_tnl __rcu *collect_md_tun_erspan; - struct net_device *fb_tunnel_dev; -}; - -static struct rtnl_link_ops ip6gre_link_ops __read_mostly; -static struct rtnl_link_ops ip6gre_tap_ops __read_mostly; -static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly; -static int ip6gre_tunnel_init(struct net_device *dev); -static void ip6gre_tunnel_setup(struct net_device *dev); -static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t); -static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu); -static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu); - -/* Tunnel hash table */ - -/* - 4 hash tables: - - 3: (remote,local) - 2: (remote,*) - 1: (*,local) - 0: (*,*) - - We require exact key match i.e. if a key is present in packet - it will match only tunnel with the same key; if it is not present, - it will match only keyless tunnel. - - All keysless packets, if not matched configured keyless tunnels - will match fallback tunnel. - */ - -#define HASH_KEY(key) (((__force u32)key^((__force u32)key>>4))&(IP6_GRE_HASH_SIZE - 1)) -static u32 HASH_ADDR(const struct in6_addr *addr) -{ - u32 hash = ipv6_addr_hash(addr); - - return hash_32(hash, IP6_GRE_HASH_SIZE_SHIFT); -} - -#define tunnels_r_l tunnels[3] -#define tunnels_r tunnels[2] -#define tunnels_l tunnels[1] -#define tunnels_wc tunnels[0] - -/* Given src, dst and key, find appropriate for input tunnel. */ - -static struct ip6_tnl *ip6gre_tunnel_lookup(struct net_device *dev, - const struct in6_addr *remote, const struct in6_addr *local, - __be32 key, __be16 gre_proto) -{ - struct net *net = dev_net(dev); - int link = dev->ifindex; - unsigned int h0 = HASH_ADDR(remote); - unsigned int h1 = HASH_KEY(key); - struct ip6_tnl *t, *cand = NULL; - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int dev_type = (gre_proto == htons(ETH_P_TEB) || - gre_proto == htons(ETH_P_ERSPAN) || - gre_proto == htons(ETH_P_ERSPAN2)) ? - ARPHRD_ETHER : ARPHRD_IP6GRE; - int score, cand_score = 4; - - for_each_ip_tunnel_rcu(t, ign->tunnels_r_l[h0 ^ h1]) { - if (!ipv6_addr_equal(local, &t->parms.laddr) || - !ipv6_addr_equal(remote, &t->parms.raddr) || - key != t->parms.i_key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - for_each_ip_tunnel_rcu(t, ign->tunnels_r[h0 ^ h1]) { - if (!ipv6_addr_equal(remote, &t->parms.raddr) || - key != t->parms.i_key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - for_each_ip_tunnel_rcu(t, ign->tunnels_l[h1]) { - if ((!ipv6_addr_equal(local, &t->parms.laddr) && - (!ipv6_addr_equal(local, &t->parms.raddr) || - !ipv6_addr_is_multicast(local))) || - key != t->parms.i_key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - for_each_ip_tunnel_rcu(t, ign->tunnels_wc[h1]) { - if (t->parms.i_key != key || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->dev->type != ARPHRD_IP6GRE && - t->dev->type != dev_type) - continue; - - score = 0; - if (t->parms.link != link) - score |= 1; - if (t->dev->type != dev_type) - score |= 2; - if (score == 0) - return t; - - if (score < cand_score) { - cand = t; - cand_score = score; - } - } - - if (cand) - return cand; - - if (gre_proto == htons(ETH_P_ERSPAN) || - gre_proto == htons(ETH_P_ERSPAN2)) - t = rcu_dereference(ign->collect_md_tun_erspan); - else - t = rcu_dereference(ign->collect_md_tun); - - if (t && t->dev->flags & IFF_UP) - return t; - - dev = ign->fb_tunnel_dev; - if (dev->flags & IFF_UP) - return netdev_priv(dev); - - return NULL; -} - -static struct ip6_tnl __rcu **__ip6gre_bucket(struct ip6gre_net *ign, - const struct __ip6_tnl_parm *p) -{ - const struct in6_addr *remote = &p->raddr; - const struct in6_addr *local = &p->laddr; - unsigned int h = HASH_KEY(p->i_key); - int prio = 0; - - if (!ipv6_addr_any(local)) - prio |= 1; - if (!ipv6_addr_any(remote) && !ipv6_addr_is_multicast(remote)) { - prio |= 2; - h ^= HASH_ADDR(remote); - } - - return &ign->tunnels[prio][h]; -} - -static void ip6gre_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) -{ - if (t->parms.collect_md) - rcu_assign_pointer(ign->collect_md_tun, t); -} - -static void ip6erspan_tunnel_link_md(struct ip6gre_net *ign, struct ip6_tnl *t) -{ - if (t->parms.collect_md) - rcu_assign_pointer(ign->collect_md_tun_erspan, t); -} - -static void ip6gre_tunnel_unlink_md(struct ip6gre_net *ign, struct ip6_tnl *t) -{ - if (t->parms.collect_md) - rcu_assign_pointer(ign->collect_md_tun, NULL); -} - -static void ip6erspan_tunnel_unlink_md(struct ip6gre_net *ign, - struct ip6_tnl *t) -{ - if (t->parms.collect_md) - rcu_assign_pointer(ign->collect_md_tun_erspan, NULL); -} - -static inline struct ip6_tnl __rcu **ip6gre_bucket(struct ip6gre_net *ign, - const struct ip6_tnl *t) -{ - return __ip6gre_bucket(ign, &t->parms); -} - -static void ip6gre_tunnel_link(struct ip6gre_net *ign, struct ip6_tnl *t) -{ - struct ip6_tnl __rcu **tp = ip6gre_bucket(ign, t); - - rcu_assign_pointer(t->next, rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} - -static void ip6gre_tunnel_unlink(struct ip6gre_net *ign, struct ip6_tnl *t) -{ - struct ip6_tnl __rcu **tp; - struct ip6_tnl *iter; - - for (tp = ip6gre_bucket(ign, t); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; - } - } -} - -static struct ip6_tnl *ip6gre_tunnel_find(struct net *net, - const struct __ip6_tnl_parm *parms, - int type) -{ - const struct in6_addr *remote = &parms->raddr; - const struct in6_addr *local = &parms->laddr; - __be32 key = parms->i_key; - int link = parms->link; - struct ip6_tnl *t; - struct ip6_tnl __rcu **tp; - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - - for (tp = __ip6gre_bucket(ign, parms); - (t = rtnl_dereference(*tp)) != NULL; - tp = &t->next) - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr) && - key == t->parms.i_key && - link == t->parms.link && - type == t->dev->type) - break; - - return t; -} - -static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, - const struct __ip6_tnl_parm *parms, int create) -{ - struct ip6_tnl *t, *nt; - struct net_device *dev; - char name[IFNAMSIZ]; - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - - t = ip6gre_tunnel_find(net, parms, ARPHRD_IP6GRE); - if (t && create) - return NULL; - if (t || !create) - return t; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else - strlcpy(name, "ovs-ip6gre%d", IFNAMSIZ); - - dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, - ip6gre_tunnel_setup); - if (!dev) - return NULL; - - dev_net_set(dev, net); - - nt = netdev_priv(dev); - nt->parms = *parms; - dev->rtnl_link_ops = &ip6gre_link_ops; - - nt->dev = dev; - nt->net = dev_net(dev); - - if (register_netdevice(dev) < 0) - goto failed_free; - - ip6gre_tnl_link_config(nt, 1); - - /* Can use a lockless transmit, unless we generate output sequences */ - if (!(nt->parms.o_flags & TUNNEL_SEQ)) - dev->features |= NETIF_F_LLTX; - - dev_hold(dev); - ip6gre_tunnel_link(ign, nt); - return nt; - -failed_free: - free_netdev(dev); - return NULL; -} - -static void ip6erspan_tunnel_uninit(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); - - ip6erspan_tunnel_unlink_md(ign, t); - ip6gre_tunnel_unlink(ign, t); - dst_cache_reset(&t->dst_cache); - dev_put(dev); -} - -static void ip6gre_tunnel_uninit(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id); - - ip6gre_tunnel_unlink_md(ign, t); - ip6gre_tunnel_unlink(ign, t); - dst_cache_reset(&t->dst_cache); - dev_put(dev); -} - - -static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ -#if 0 - struct net *net = dev_net(skb->dev); - const struct gre_base_hdr *greh; - const struct ipv6hdr *ipv6h; - int grehlen = sizeof(*greh); - struct ip6_tnl *t; - int key_off = 0; - __be16 flags; - __be32 key; - - if (!pskb_may_pull(skb, offset + grehlen)) - return; - greh = (const struct gre_base_hdr *)(skb->data + offset); - flags = greh->flags; - if (flags & (GRE_VERSION | GRE_ROUTING)) - return; - if (flags & GRE_CSUM) - grehlen += 4; - if (flags & GRE_KEY) { - key_off = grehlen + offset; - grehlen += 4; - } - - if (!pskb_may_pull(skb, offset + grehlen)) - return; - ipv6h = (const struct ipv6hdr *)skb->data; - greh = (const struct gre_base_hdr *)(skb->data + offset); - key = key_off ? *(__be32 *)(skb->data + key_off) : 0; - - t = ip6gre_tunnel_lookup(skb->dev, &ipv6h->daddr, &ipv6h->saddr, - key, greh->protocol); - if (!t) - return; - - switch (type) { - struct ipv6_tlv_tnl_enc_lim *tel; - __u32 teli; - case ICMPV6_DEST_UNREACH: - net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n", - t->parms.name); - if (code != ICMPV6_PORT_UNREACH) - break; - return; - case ICMPV6_TIME_EXCEED: - if (code == ICMPV6_EXC_HOPLIMIT) { - net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n", - t->parms.name); - break; - } - return; - case ICMPV6_PARAMPROB: - teli = 0; - if (code == ICMPV6_HDR_FIELD) - teli = ip6_tnl_parse_tlv_enc_lim(skb, skb->data); - - if (teli && teli == be32_to_cpu(info) - 2) { - tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; - if (tel->encap_limit == 0) { - net_dbg_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n", - t->parms.name); - } - } else { - net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n", - t->parms.name); - } - return; - case ICMPV6_PKT_TOOBIG: - ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); - return; - case NDISC_REDIRECT: - ip6_redirect(skb, net, skb->dev->ifindex, 0, - sock_net_uid(net, NULL)); - return; - } - - if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO)) - t->err_count++; - else - t->err_count = 1; - t->err_time = jiffies; -#endif -} - -static int ip6gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) -{ - const struct ipv6hdr *ipv6h; - struct ip6_tnl *tunnel; - - ipv6h = ipv6_hdr(skb); - tunnel = ip6gre_tunnel_lookup(skb->dev, - &ipv6h->saddr, &ipv6h->daddr, tpi->key, - tpi->proto); - if (tunnel) { - struct metadata_dst *tun_dst = NULL; - if (tunnel->parms.collect_md) { - __be64 tun_id; - __be16 flags; - - flags = tpi->flags; - tun_id = key32_to_tunnel_id(tpi->key); - - tun_dst = rpl_ipv6_tun_rx_dst(skb, flags, tun_id, 0); - if (!tun_dst) - return PACKET_REJECT; - - } - - ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, false); - kfree(tun_dst); - return PACKET_RCVD; - } - - return PACKET_RCVD; -} - -static int ip6erspan_rcv(struct sk_buff *skb, - struct tnl_ptk_info *tpi, - int gre_hdr_len) -{ - struct erspan_base_hdr *ershdr; - const struct ipv6hdr *ipv6h; - struct erspan_md2 *md2; - struct ip6_tnl *tunnel; - u8 ver; - - if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) - return PACKET_REJECT; - - ipv6h = ipv6_hdr(skb); - ershdr = (struct erspan_base_hdr *)skb->data; - ver = ershdr->ver; - tpi->key = cpu_to_be32(get_session_id(ershdr)); - - tunnel = ip6gre_tunnel_lookup(skb->dev, - &ipv6h->saddr, &ipv6h->daddr, 0, - tpi->proto); - if (tunnel) { - struct metadata_dst *tun_dst = NULL; - int len = erspan_hdr_len(ver); - - if (unlikely(!pskb_may_pull(skb, len))) - return PACKET_REJECT; - - if (__iptunnel_pull_header(skb, len, - htons(ETH_P_TEB), - false, false) < 0) - return PACKET_REJECT; - - if (tunnel->parms.collect_md) { - struct erspan_metadata *pkt_md, *md; - struct ip_tunnel_info *info; - unsigned char *gh; - __be64 tun_id; - __be16 flags; - - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; - tun_id = key32_to_tunnel_id(tpi->key); - - tun_dst = rpl_ipv6_tun_rx_dst(skb, flags, tun_id, - sizeof(*md)); - if (!tun_dst) - return PACKET_REJECT; - - /* skb can be uncloned in __iptunnel_pull_header, so - * old pkt_md is no longer valid and we need to reset - * it - */ - gh = skb_network_header(skb) + - skb_network_header_len(skb); - pkt_md = (struct erspan_metadata *)(gh + gre_hdr_len + - sizeof(*ershdr)); - info = &tun_dst->u.tun_info; - md = ip_tunnel_info_opts(info); - md->version = ver; - md2 = &md->u.md2; - memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : - ERSPAN_V2_MDSIZE); - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; - info->options_len = sizeof(*md); - } - - ip6_tnl_rcv(tunnel, skb, tpi, tun_dst, false); - kfree(tun_dst); - return PACKET_RCVD; - } - - kfree(skb); - return PACKET_RCVD; -} - -static int gre_rcv(struct sk_buff *skb) -{ - struct tnl_ptk_info tpi; - bool csum_err = false; - int hdr_len; - - hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IPV6), 0); - if (hdr_len < 0) - goto drop; - - if (iptunnel_pull_header(skb, hdr_len, tpi.proto, false)) - goto drop; - - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || - tpi.proto == htons(ETH_P_ERSPAN2))) { - if (ip6erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) - return 0; - goto out; - } - - if (ip6gre_rcv(skb, &tpi) == PACKET_RCVD) - return 0; - -out: - icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); -drop: - kfree_skb(skb); - return 0; -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -#include "gso.h" -/* gre_handle_offloads() has different return type on older kernsl. */ -static void gre_nop_fix(struct sk_buff *skb) { } - -static void gre_csum_fix(struct sk_buff *skb) -{ - struct gre_base_hdr *greh; - __be32 *options; - int gre_offset = skb_transport_offset(skb); - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - options = ((__be32 *)greh + 1); - - *options = 0; - *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset, - skb->len - gre_offset, 0)); -} - -#define gre_handle_offloads rpl_gre_handle_offloads -static int rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ - int type = gre_csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE; - gso_fix_segment_t fix_segment; - - if (gre_csum) - fix_segment = gre_csum_fix; - else - fix_segment = gre_nop_fix; - - return ovs_iptunnel_handle_offloads(skb, type, fix_segment); -} -#else -static int gre_handle_offloads(struct sk_buff *skb, bool csum) -{ - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); -} -#endif - -static void prepare_ip6gre_xmit_ipv4(struct sk_buff *skb, - struct net_device *dev, - struct flowi6 *fl6, __u8 *dsfield, - int *encap_limit) -{ - const struct iphdr *iph = ip_hdr(skb); - struct ip6_tnl *t = netdev_priv(dev); - - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - *encap_limit = t->parms.encap_limit; - - memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - *dsfield = ipv4_get_dsfield(iph); - else - *dsfield = ip6_tclass(t->parms.flowinfo); - -#ifndef IP6_TNL_F_USE_ORIG_FWMARK - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6->flowi6_mark = skb->mark; - else - fl6->flowi6_mark = t->parms.fwmark; - - fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); -#endif -} - -static int prepare_ip6gre_xmit_ipv6(struct sk_buff *skb, - struct net_device *dev, - struct flowi6 *fl6, __u8 *dsfield, - int *encap_limit) -{ - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - struct ip6_tnl *t = netdev_priv(dev); - __u16 offset; - - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - - tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - *encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { - *encap_limit = t->parms.encap_limit; - } - - memcpy(fl6, &t->fl.u.ip6, sizeof(*fl6)); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - *dsfield = ipv6_get_dsfield(ipv6h); - else - *dsfield = ip6_tclass(t->parms.flowinfo); - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6->flowlabel |= ip6_flowlabel(ipv6h); - -#ifndef IP6_TNL_F_USE_ORIG_FWMARK - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6->flowi6_mark = skb->mark; - else - fl6->flowi6_mark = t->parms.fwmark; - - fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); -#endif - - return 0; -} - -static netdev_tx_t __gre6_xmit(struct sk_buff *skb, - struct net_device *dev, __u8 dsfield, - struct flowi6 *fl6, int encap_limit, - __u32 *pmtu, __be16 proto) -{ - struct ip6_tnl *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; - __be16 protocol; - - if (dev->header_ops && dev->type == ARPHRD_IP6GRE) - fl6->daddr = ((struct ipv6hdr *)skb->data)->daddr; - else - fl6->daddr = tunnel->parms.raddr; - - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - - if (skb_cow_head(skb, dev->needed_headroom ?: tunnel->hlen)) - return -ENOMEM; - - /* Push GRE header. */ - protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto; - - if (tunnel->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - __be16 flags; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || - !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -EINVAL; - - key = &tun_info->key; - memset(fl6, 0, sizeof(*fl6)); - fl6->flowi6_proto = IPPROTO_GRE; - fl6->daddr = key->u.ipv6.dst; - fl6->flowlabel = key->label; -// FIX ME! -// fl6->flowi6_uid = sock_net_uid(dev_net(dev), NULL); - - dsfield = key->tos; - flags = key->tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - tunnel->tun_hlen = gre_calc_hlen(flags); - - tpi.flags = flags; - tpi.proto = protocol; - tpi.key = tunnel_id_to_key32(key->tun_id); - tpi.seq = htonl(tunnel->o_seqno++); - tpi.hdr_len = tunnel->tun_hlen; - - gre_build_header(skb, &tpi, 8); - } else { - tpi.flags = tunnel->parms.o_flags; - tpi.proto = protocol; - tpi.key = tunnel->parms.o_key; - tpi.seq = htonl(tunnel->o_seqno++); - tpi.hdr_len = tunnel->tun_hlen; - - gre_build_header(skb, &tpi, 8); - } - - return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu, - NEXTHDR_GRE); -} - -static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - int encap_limit = -1; - struct flowi6 fl6; - __u8 dsfield = 0; - __u32 mtu; - int err; - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - if (!t->parms.collect_md) - prepare_ip6gre_xmit_ipv4(skb, dev, &fl6, - &dsfield, &encap_limit); - - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); - if (err) - return -1; - - err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - skb->protocol); - if (err != 0) { - /* XXX: send ICMP error even if DF is not set. */ - if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - return -1; - } - - return 0; -} - -static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - int encap_limit = -1; - struct flowi6 fl6; - __u8 dsfield = 0; - __u32 mtu; - int err; - - if (ipv6_addr_equal(&t->parms.raddr, &ipv6h->saddr)) - return -1; - - if (!t->parms.collect_md && - prepare_ip6gre_xmit_ipv6(skb, dev, &fl6, &dsfield, &encap_limit)) - return -1; - - if (gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM))) - return -1; - - err = __gre6_xmit(skb, dev, dsfield, &fl6, encap_limit, - &mtu, skb->protocol); - if (err != 0) { - if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - return -1; - } - - return 0; -} - -/** - * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own - * @t: the outgoing tunnel device - * @hdr: IPv6 header from the incoming packet - * - * Description: - * Avoid trivial tunneling loop by checking that tunnel exit-point - * doesn't match source of incoming packet. - * - * Return: - * 1 if conflict, - * 0 else - **/ - -static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t, - const struct ipv6hdr *hdr) -{ - return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); -} - -static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - int encap_limit = -1; - struct flowi6 fl6; - __u32 mtu; - int err; - - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - if (!t->parms.collect_md) - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - - err = gre_handle_offloads(skb, !!(t->parms.o_flags & TUNNEL_CSUM)); - if (err) - return err; - - err = __gre6_xmit(skb, dev, 0, &fl6, encap_limit, &mtu, skb->protocol); - - return err; -} - -static netdev_tx_t ip6gre_tunnel_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->dev->stats; - int ret; - - if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) - goto tx_err; - - switch (skb->protocol) { - case htons(ETH_P_IP): - ret = ip6gre_xmit_ipv4(skb, dev); - break; - case htons(ETH_P_IPV6): - ret = ip6gre_xmit_ipv6(skb, dev); - break; - default: - ret = ip6gre_xmit_other(skb, dev); - break; - } - - if (ret < 0) - goto tx_err; - - return NETDEV_TX_OK; - -tx_err: - stats->tx_errors++; - stats->tx_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; -} - -static netdev_tx_t __ip6gre_tunnel_xmit(struct sk_buff *skb) -{ - return ip6gre_tunnel_xmit(skb, skb->dev); -} - -static bool erspan_skb_would_panic(struct sk_buff *skb, int erspan_md_size) -{ - /* check if there is enough headroom in packet, if not - * drop it. Checking for 8 bytes of gre header space + - * erspan base hdr and erspan type specific header. - */ - if (skb_headroom(skb) < (8 + sizeof(struct erspan_base_hdr) + - erspan_md_size)) - return true; - - return false; -} - -static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct dst_entry *dst = skb_dst(skb); - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct net_device_stats *stats; - struct erspan_metadata *md; - struct tnl_ptk_info tpi; - bool truncate = false; - int encap_limit = -1; - __u8 dsfield = false; - struct flowi6 fl6; - int err = -EINVAL; - __be32 tun_id; - __u32 mtu; - int nhoff; - int thoff; - - - /* OVS doesn't support native mode ip6 tunnel traffic so - * take an early exit in that case. */ - if (!t->parms.collect_md) - goto tx_err; - - if (!ip6_tnl_xmit_ctl(t, &t->parms.laddr, &t->parms.raddr)) - goto tx_err; - - if (gre_handle_offloads(skb, false)) - goto tx_err; - - if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); - truncate = true; - } - - nhoff = skb_network_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IP) && - (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) - truncate = true; - - thoff = skb_transport_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IPV6) && - (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) - truncate = true; - - if (skb_cow_head(skb, dev->needed_headroom ? : t->hlen)) - goto tx_err; - - t->parms.o_flags &= ~TUNNEL_KEY; - - tun_info = ovs_skb_tunnel_info(skb); - if (unlikely(!tun_info || - !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -EINVAL; - - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_GRE; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - // fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - - dsfield = key->tos; - md = ip_tunnel_info_opts(tun_info); - if (!md) - goto tx_err; - - if (erspan_skb_would_panic(skb, - md->version == 1 ? - ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE)) - goto tx_err; - - tun_id = tunnel_id_to_key32(key->tun_id); - if (md->version == 1) { - erspan_build_header(skb, - ntohl(tun_id), - ntohl(md->u.index), truncate, - false); - tpi.hdr_len = ERSPAN_V1_MDSIZE; - tpi.proto = htons(ETH_P_ERSPAN); - } else if (md->version == 2) { - erspan_build_header_v2(skb, - ntohl(tun_id), - md->u.md2.dir, - get_hwid(&md->u.md2), - truncate, false); - tpi.hdr_len = ERSPAN_V2_MDSIZE; - tpi.proto = htons(ETH_P_ERSPAN2); - } else { - goto tx_err; - } - - tpi.flags = TUNNEL_SEQ; - tpi.key = 0; - tpi.seq = htonl(t->o_seqno++); - - /* Push GRE header. */ - gre_build_header(skb, &tpi, 8); - - /* TooBig packet may have updated dst->dev's mtu */ - if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) -#ifndef HAVE_DST_OPS_CONFIRM_NEIGH - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu); -#else - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); -#endif - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - NEXTHDR_GRE); - if (err != 0) - goto tx_err; - - return NETDEV_TX_OK; - -tx_err: - stats = &t->dev->stats; - stats->tx_errors++; - stats->tx_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; -} - -static netdev_tx_t __ip6erspan_tunnel_xmit(struct sk_buff *skb) -{ - return ip6erspan_tunnel_xmit(skb, skb->dev); -} - -static void ip6gre_tnl_link_config_common(struct ip6_tnl *t) -{ - struct net_device *dev = t->dev; - struct __ip6_tnl_parm *p = &t->parms; - struct flowi6 *fl6 = &t->fl.u.ip6; - - if (dev->type != ARPHRD_ETHER) { - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); - memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); - } - - /* Set up flowi template */ - fl6->saddr = p->laddr; - fl6->daddr = p->raddr; - fl6->flowi6_oif = p->link; - fl6->flowlabel = 0; - fl6->flowi6_proto = IPPROTO_GRE; - - if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) - fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; - if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; - - p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); - p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); - - if (p->flags&IP6_TNL_F_CAP_XMIT && - p->flags&IP6_TNL_F_CAP_RCV && dev->type != ARPHRD_ETHER) - dev->flags |= IFF_POINTOPOINT; - else - dev->flags &= ~IFF_POINTOPOINT; -} - -static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, - int t_hlen) -{ - const struct __ip6_tnl_parm *p = &t->parms; - struct net_device *dev = t->dev; - - if (p->flags & IP6_TNL_F_CAP_XMIT) { - int strict = (ipv6_addr_type(&p->raddr) & - (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - - struct rt6_info *rt = rt6_lookup(t->net, - &p->raddr, &p->laddr, - p->link, strict); - - if (!rt) - return; - - if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + - t_hlen; - - if (set_mtu) { - dev->mtu = rt->dst.dev->mtu - t_hlen; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; - if (dev->type == ARPHRD_ETHER) - dev->mtu -= ETH_HLEN; - - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; - } - } - ip6_rt_put(rt); - } -} - -static int ip6gre_calc_hlen(struct ip6_tnl *tunnel) -{ - int t_hlen; - - tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags); - tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - - t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; - return t_hlen; -} - -static void ip6gre_tnl_link_config(struct ip6_tnl *t, int set_mtu) -{ - ip6gre_tnl_link_config_common(t); - ip6gre_tnl_link_config_route(t, set_mtu, ip6gre_calc_hlen(t)); -} - -static void ip6gre_tnl_copy_tnl_parm(struct ip6_tnl *t, - const struct __ip6_tnl_parm *p) -{ - t->parms.laddr = p->laddr; - t->parms.raddr = p->raddr; - t->parms.flags = p->flags; - t->parms.hop_limit = p->hop_limit; - t->parms.encap_limit = p->encap_limit; - t->parms.flowinfo = p->flowinfo; - t->parms.link = p->link; - t->parms.proto = p->proto; - t->parms.i_key = p->i_key; - t->parms.o_key = p->o_key; - t->parms.i_flags = p->i_flags; - t->parms.o_flags = p->o_flags; - t->parms.fwmark = p->fwmark; - dst_cache_reset(&t->dst_cache); -} - -static int ip6gre_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, - int set_mtu) -{ - ip6gre_tnl_copy_tnl_parm(t, p); - ip6gre_tnl_link_config(t, set_mtu); - return 0; -} - -static void ip6gre_tnl_parm_from_user(struct __ip6_tnl_parm *p, - const struct ip6_tnl_parm2 *u) -{ - p->laddr = u->laddr; - p->raddr = u->raddr; - p->flags = u->flags; - p->hop_limit = u->hop_limit; - p->encap_limit = u->encap_limit; - p->flowinfo = u->flowinfo; - p->link = u->link; - p->i_key = u->i_key; - p->o_key = u->o_key; - p->i_flags = gre_flags_to_tnl_flags(u->i_flags); - p->o_flags = gre_flags_to_tnl_flags(u->o_flags); - memcpy(p->name, u->name, sizeof(u->name)); -} - -static void ip6gre_tnl_parm_to_user(struct ip6_tnl_parm2 *u, - const struct __ip6_tnl_parm *p) -{ - u->proto = IPPROTO_GRE; - u->laddr = p->laddr; - u->raddr = p->raddr; - u->flags = p->flags; - u->hop_limit = p->hop_limit; - u->encap_limit = p->encap_limit; - u->flowinfo = p->flowinfo; - u->link = p->link; - u->i_key = p->i_key; - u->o_key = p->o_key; - u->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); - u->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); - memcpy(u->name, p->name, sizeof(u->name)); -} - -static int ip6gre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) -{ - int err = 0; - struct ip6_tnl_parm2 p; - struct __ip6_tnl_parm p1; - struct ip6_tnl *t = netdev_priv(dev); - struct net *net = t->net; - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - - memset(&p1, 0, sizeof(p1)); - - switch (cmd) { - case SIOCGETTUNNEL: - if (dev == ign->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - ip6gre_tnl_parm_from_user(&p1, &p); - t = ip6gre_tunnel_locate(net, &p1, 0); - if (!t) - t = netdev_priv(dev); - } - memset(&p, 0, sizeof(p)); - ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - break; - - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)) - goto done; - - if (!(p.i_flags&GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags&GRE_KEY)) - p.o_key = 0; - - ip6gre_tnl_parm_from_user(&p1, &p); - t = ip6gre_tunnel_locate(net, &p1, cmd == SIOCADDTUNNEL); - - if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - t = netdev_priv(dev); - - ip6gre_tunnel_unlink(ign, t); - synchronize_net(); - ip6gre_tnl_change(t, &p1, 1); - ip6gre_tunnel_link(ign, t); - netdev_state_change(dev); - } - } - - if (t) { - err = 0; - - memset(&p, 0, sizeof(p)); - ip6gre_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; - - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == ign->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - ip6gre_tnl_parm_from_user(&p1, &p); - t = ip6gre_tunnel_locate(net, &p1, 0); - if (!t) - goto done; - err = -EPERM; - if (t == netdev_priv(ign->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; - - default: - err = -EINVAL; - } - -done: - return err; -} - -static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, const void *daddr, - const void *saddr, unsigned int len) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h; - __be16 *p; - - ipv6h = (struct ipv6hdr *)skb_push(skb, t->hlen + sizeof(*ipv6h)); - ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, - true, &t->fl.u.ip6)); - ipv6h->hop_limit = t->parms.hop_limit; - ipv6h->nexthdr = NEXTHDR_GRE; - ipv6h->saddr = t->parms.laddr; - ipv6h->daddr = t->parms.raddr; - - p = (__be16 *)(ipv6h + 1); - p[0] = t->parms.o_flags; - p[1] = htons(type); - - /* - * Set the source hardware address. - */ - - if (saddr) - memcpy(&ipv6h->saddr, saddr, sizeof(struct in6_addr)); - if (daddr) - memcpy(&ipv6h->daddr, daddr, sizeof(struct in6_addr)); - if (!ipv6_addr_any(&ipv6h->daddr)) - return t->hlen; - - return -t->hlen; -} - -static const struct header_ops ip6gre_header_ops = { - .create = ip6gre_header, -}; - -static const struct net_device_ops ip6gre_netdev_ops = { - .ndo_init = ip6gre_tunnel_init, - .ndo_uninit = ip6gre_tunnel_uninit, - .ndo_start_xmit = ip6gre_tunnel_xmit, - .ndo_do_ioctl = ip6gre_tunnel_ioctl, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = ip6_tnl_change_mtu, -#else - .ndo_change_mtu = ip6_tnl_change_mtu, -#endif - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_NDO_GET_IFLINK - .ndo_get_iflink = ip6_tnl_get_iflink, -#endif -}; - -#ifdef HAVE_NEEDS_FREE_NETDEV -static void ip6gre_dev_free(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - - dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); -} - -#endif -static void ip6gre_tunnel_setup(struct net_device *dev) -{ - dev->netdev_ops = &ip6gre_netdev_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; - dev->priv_destructor = ip6gre_dev_free; -#endif - - dev->type = ARPHRD_IP6GRE; - - dev->flags |= IFF_NOARP; - dev->addr_len = sizeof(struct in6_addr); - netif_keep_dst(dev); - /* This perm addr will be used as interface identifier by IPv6 */ - dev->addr_assign_type = NET_ADDR_RANDOM; - eth_random_addr(dev->perm_addr); -} - -#define GRE6_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM) - -static void ip6gre_tnl_init_features(struct net_device *dev) -{ - struct ip6_tnl *nt = netdev_priv(dev); - - dev->features |= GRE6_FEATURES; - dev->hw_features |= GRE6_FEATURES; - - if (!(nt->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(nt->parms.o_flags & TUNNEL_CSUM) || - nt->encap.type == TUNNEL_ENCAP_NONE) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } -} - -static int ip6gre_tunnel_init_common(struct net_device *dev) -{ - struct ip6_tnl *tunnel; - int ret; - int t_hlen; - - tunnel = netdev_priv(dev); - - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (ret) { - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; - } - - t_hlen = ip6gre_calc_hlen(tunnel); - dev->mtu = ETH_DATA_LEN - t_hlen; - if (dev->type == ARPHRD_ETHER) - dev->mtu -= ETH_HLEN; - if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; - - if (tunnel->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; - netif_keep_dst(dev); - } - ip6gre_tnl_init_features(dev); - - return 0; -} - -static int ip6gre_tunnel_init(struct net_device *dev) -{ - struct ip6_tnl *tunnel; - int ret; - - ret = ip6gre_tunnel_init_common(dev); - if (ret) - return ret; - - tunnel = netdev_priv(dev); - - if (tunnel->parms.collect_md) - return 0; - - memcpy(dev->dev_addr, &tunnel->parms.laddr, sizeof(struct in6_addr)); - memcpy(dev->broadcast, &tunnel->parms.raddr, sizeof(struct in6_addr)); - - if (ipv6_addr_any(&tunnel->parms.raddr)) - dev->header_ops = &ip6gre_header_ops; - - return 0; -} - -static void ip6gre_fb_tunnel_init(struct net_device *dev) -{ - struct ip6_tnl *tunnel = netdev_priv(dev); - - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - - tunnel->hlen = sizeof(struct ipv6hdr) + 4; - - dev_hold(dev); -} - -static struct inet6_protocol ip6gre_protocol __read_mostly = { - .handler = gre_rcv, - .err_handler = ip6gre_err, - .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, -}; - -static void ip6gre_destroy_tunnels(struct net *net, struct list_head *head) -{ - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - struct net_device *dev, *aux; - int prio; - - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &ip6gre_link_ops || - dev->rtnl_link_ops == &ip6gre_tap_ops || - dev->rtnl_link_ops == &ip6erspan_tap_ops) - unregister_netdevice_queue(dev, head); - - for (prio = 0; prio < 4; prio++) { - int h; - for (h = 0; h < IP6_GRE_HASH_SIZE; h++) { - struct ip6_tnl *t; - - t = rtnl_dereference(ign->tunnels[prio][h]); - - while (t) { - /* If dev is in the same netns, it has already - * been added to the list by the previous loop. - */ - if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, - head); - t = rtnl_dereference(t->next); - } - } - } -} - -static int __net_init ip6gre_init_net(struct net *net) -{ - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - int err; - - ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), - "ovs-ip6gre0", - NET_NAME_UNKNOWN, - ip6gre_tunnel_setup); - if (!ign->fb_tunnel_dev) { - err = -ENOMEM; - goto err_alloc_dev; - } - dev_net_set(ign->fb_tunnel_dev, net); - /* FB netdevice is special: we have one, and only one per netns. - * Allowing to move it to another netns is clearly unsafe. - */ - ign->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - - - ip6gre_fb_tunnel_init(ign->fb_tunnel_dev); - ign->fb_tunnel_dev->rtnl_link_ops = &ip6gre_link_ops; - - err = register_netdev(ign->fb_tunnel_dev); - if (err) - goto err_reg_dev; - - rcu_assign_pointer(ign->tunnels_wc[0], - netdev_priv(ign->fb_tunnel_dev)); - return 0; - -err_reg_dev: - free_netdev(ign->fb_tunnel_dev); -err_alloc_dev: - return err; -} - -static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) -{ - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -enum { -#ifndef HAVE_IFLA_GRE_ENCAP_DPORT - IFLA_GRE_ENCAP_TYPE = IFLA_GRE_FLAGS + 1, - IFLA_GRE_ENCAP_FLAGS, - IFLA_GRE_ENCAP_SPORT, - IFLA_GRE_ENCAP_DPORT, -#endif -#ifndef HAVE_IFLA_GRE_COLLECT_METADATA - IFLA_GRE_COLLECT_METADATA = IFLA_GRE_ENCAP_DPORT + 1, -#endif -#ifndef HAVE_IFLA_GRE_IGNORE_DF - IFLA_GRE_IGNORE_DF = IFLA_GRE_COLLECT_METADATA + 1, -#endif -#ifndef HAVE_IFLA_GRE_FWMARK - IFLA_GRE_FWMARK = IFLA_GRE_IGNORE_DF + 1, -#endif -#ifndef HAVE_IFLA_GRE_ERSPAN_INDEX - IFLA_GRE_ERSPAN_INDEX = IFLA_GRE_FWMARK + 1, -#endif -#ifndef HAVE_IFLA_GRE_ERSPAN_HWID - IFLA_GRE_ERSPAN_VER = IFLA_GRE_ERSPAN_INDEX + 1, - IFLA_GRE_ERSPAN_DIR, - IFLA_GRE_ERSPAN_HWID, -#endif -}; - -#define RPL_IFLA_GRE_MAX (IFLA_GRE_ERSPAN_HWID + 1) - -static struct pernet_operations ip6gre_net_ops = { - .init = ip6gre_init_net, - .exit_batch = ip6gre_exit_batch_net, - .id = &ip6gre_net_id, - .size = sizeof(struct ip6gre_net), -}; -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int rpl_ip6gre_tunnel_validate(struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6gre_tunnel_validate(struct nlattr *tb[], - struct nlattr *data[]) -#endif -{ - __be16 flags; - - if (!data) - return 0; - - flags = 0; - if (data[IFLA_GRE_IFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); - if (data[IFLA_GRE_OFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); - if (flags & (GRE_VERSION|GRE_ROUTING)) - return -EINVAL; - - return 0; -} -#define ip6gre_tunnel_validate rpl_ip6gre_tunnel_validate - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int rpl_ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6gre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - struct in6_addr daddr; - - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) - return -EINVAL; - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) - return -EADDRNOTAVAIL; - } - - if (!data) - goto out; - - if (data[IFLA_GRE_REMOTE]) { - daddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); - if (ipv6_addr_any(&daddr)) - return -EINVAL; - } - -out: -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK - return ip6gre_tunnel_validate(tb, data, extack); -#else - return ip6gre_tunnel_validate(tb, data); -#endif -} -#define ip6gre_tap_validate rpl_ip6gre_tap_validate - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int rpl_ip6erspan_tap_validate(struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6erspan_tap_validate(struct nlattr *tb[], - struct nlattr *data[]) -#endif -{ - __be16 flags = 0; - int ret, ver = 0; - - if (!data) - return 0; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK - ret = ip6gre_tap_validate(tb, data, extack); -#else - ret = ip6gre_tap_validate(tb, data); -#endif - if (ret) - return ret; - - /* ERSPAN should only have GRE sequence and key flag */ - if (data[IFLA_GRE_OFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); - if (data[IFLA_GRE_IFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); - if (!data[IFLA_GRE_COLLECT_METADATA] && - flags != (GRE_SEQ | GRE_KEY)) - return -EINVAL; - - /* ERSPAN Session ID only has 10-bit. Since we reuse - * 32-bit key field as ID, check it's range. - */ - if (data[IFLA_GRE_IKEY] && - (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) - return -EINVAL; - - if (data[IFLA_GRE_OKEY] && - (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) - return -EINVAL; - - if (data[IFLA_GRE_ERSPAN_VER]) { - ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (ver != 1 && ver != 2) - return -EINVAL; - } - - if (ver == 1) { - if (data[IFLA_GRE_ERSPAN_INDEX]) { - u32 index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); - - if (index & ~INDEX_MASK) - return -EINVAL; - } - } else if (ver == 2) { - if (data[IFLA_GRE_ERSPAN_DIR]) { - u16 dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); - - if (dir & ~(DIR_MASK >> DIR_OFFSET)) - return -EINVAL; - } - - if (data[IFLA_GRE_ERSPAN_HWID]) { - u16 hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); - - if (hwid & ~(HWID_MASK >> HWID_OFFSET)) - return -EINVAL; - } - } - - return 0; -} -#define ip6erspan_tap_validate rpl_ip6erspan_tap_validate - -static void ip6gre_netlink_parms(struct nlattr *data[], - struct __ip6_tnl_parm *parms) -{ -#if 0 - /* Do not use in case of OVS - our vport needs to set a parm - * directly and this erases it - */ - memset(parms, 0, sizeof(*parms)); - -#endif - if (!data) - return; - - if (data[IFLA_GRE_LINK]) - parms->link = nla_get_u32(data[IFLA_GRE_LINK]); - - if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_IFLAGS])); - - if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags( - nla_get_be16(data[IFLA_GRE_OFLAGS])); - - if (data[IFLA_GRE_IKEY]) - parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); - - if (data[IFLA_GRE_OKEY]) - parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); - - if (data[IFLA_GRE_LOCAL]) - parms->laddr = nla_get_in6_addr(data[IFLA_GRE_LOCAL]); - - if (data[IFLA_GRE_REMOTE]) - parms->raddr = nla_get_in6_addr(data[IFLA_GRE_REMOTE]); - - if (data[IFLA_GRE_TTL]) - parms->hop_limit = nla_get_u8(data[IFLA_GRE_TTL]); - - if (data[IFLA_GRE_ENCAP_LIMIT]) - parms->encap_limit = nla_get_u8(data[IFLA_GRE_ENCAP_LIMIT]); - - if (data[IFLA_GRE_FLOWINFO]) - parms->flowinfo = nla_get_be32(data[IFLA_GRE_FLOWINFO]); - - if (data[IFLA_GRE_FLAGS]) - parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); - - if (data[IFLA_GRE_FWMARK]) - parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); - - if (data[IFLA_GRE_COLLECT_METADATA]) - parms->collect_md = true; - - parms->erspan_ver = 1; - if (data[IFLA_GRE_ERSPAN_VER]) - parms->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - - if (parms->erspan_ver == 1) { - if (data[IFLA_GRE_ERSPAN_INDEX]) - parms->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); - } else if (parms->erspan_ver == 2) { - if (data[IFLA_GRE_ERSPAN_DIR]) - parms->dir = nla_get_u8(data[IFLA_GRE_ERSPAN_DIR]); - if (data[IFLA_GRE_ERSPAN_HWID]) - parms->hwid = nla_get_u16(data[IFLA_GRE_ERSPAN_HWID]); - } -} - -static int ip6gre_tap_init(struct net_device *dev) -{ - int ret; - - ret = ip6gre_tunnel_init_common(dev); - if (ret) - return ret; - - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - - return 0; -} - -static const struct net_device_ops ip6gre_tap_netdev_ops = { - .ndo_init = ip6gre_tap_init, - .ndo_uninit = ip6gre_tunnel_uninit, - .ndo_start_xmit = ip6gre_tunnel_xmit, - .ndo_set_mac_address = eth_mac_addr, - .ndo_validate_addr = eth_validate_addr, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = ip6_tnl_change_mtu, -#else - .ndo_change_mtu = ip6_tnl_change_mtu, -#endif - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_NDO_GET_IFLINK - .ndo_get_iflink = ip6_tnl_get_iflink, -#endif -}; - -static int ip6erspan_calc_hlen(struct ip6_tnl *tunnel) -{ - int t_hlen; - - tunnel->tun_hlen = 8; - tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - erspan_hdr_len(tunnel->parms.erspan_ver); - - t_hlen = tunnel->hlen + sizeof(struct ipv6hdr); - tunnel->dev->hard_header_len = LL_MAX_HEADER + t_hlen; - return t_hlen; -} - -static int ip6erspan_tap_init(struct net_device *dev) -{ - struct ip6_tnl *tunnel; - int t_hlen; - int ret; - - tunnel = netdev_priv(dev); - - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (ret) { - free_percpu(dev->tstats); - dev->tstats = NULL; - return ret; - } - - t_hlen = ip6erspan_calc_hlen(tunnel); - dev->mtu = ETH_DATA_LEN - t_hlen; - if (dev->type == ARPHRD_ETHER) - dev->mtu -= ETH_HLEN; - if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; - - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - tunnel = netdev_priv(dev); - ip6erspan_tnl_link_config(tunnel, 1); - - return 0; -} - -static const struct net_device_ops ip6erspan_netdev_ops = { - .ndo_init = ip6erspan_tap_init, - .ndo_uninit = ip6erspan_tunnel_uninit, - .ndo_start_xmit = ip6erspan_tunnel_xmit, - .ndo_set_mac_address = eth_mac_addr, - .ndo_validate_addr = eth_validate_addr, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = ip6_tnl_change_mtu, -#else - .ndo_change_mtu = ip6_tnl_change_mtu, -#endif - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_NDO_GET_IFLINK - .ndo_get_iflink = ip6_tnl_get_iflink, -#endif -}; - -static void ip6gre_tap_setup(struct net_device *dev) -{ - - ether_setup(dev); -#ifdef HAVE_NET_DEVICE_MAX_MTU - dev->max_mtu = 0; -#endif - dev->netdev_ops = &ip6gre_tap_netdev_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; - dev->priv_destructor = ip6gre_dev_free; -#endif - - dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - netif_keep_dst(dev); -} - -static bool ip6gre_netlink_encap_parms(struct nlattr *data[], - struct ip_tunnel_encap *ipencap) -{ - bool ret = false; - - memset(ipencap, 0, sizeof(*ipencap)); - - if (!data) - return ret; - - if (data[IFLA_GRE_ENCAP_TYPE]) { - ret = true; - ipencap->type = nla_get_u16(data[IFLA_GRE_ENCAP_TYPE]); - } - - if (data[IFLA_GRE_ENCAP_FLAGS]) { - ret = true; - ipencap->flags = nla_get_u16(data[IFLA_GRE_ENCAP_FLAGS]); - } - - if (data[IFLA_GRE_ENCAP_SPORT]) { - ret = true; - ipencap->sport = nla_get_be16(data[IFLA_GRE_ENCAP_SPORT]); - } - - if (data[IFLA_GRE_ENCAP_DPORT]) { - ret = true; - ipencap->dport = nla_get_be16(data[IFLA_GRE_ENCAP_DPORT]); - } - - return ret; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6gre_newlink_common(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6gre_newlink_common(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - struct ip6_tnl *nt; - struct ip_tunnel_encap ipencap; - int err; - - nt = netdev_priv(dev); - - if (ip6gre_netlink_encap_parms(data, &ipencap)) { - int err = ip6_tnl_encap_setup(nt, &ipencap); - - if (err < 0) - return err; - } - - if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) - eth_hw_addr_random(dev); - - nt->dev = dev; - nt->net = dev_net(dev); - - err = register_netdevice(dev); - if (err) - goto out; - - if (tb[IFLA_MTU]) - ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); - - dev_hold(dev); - -out: - return err; -} -#define ip6gre_newlink_common rpl_ip6gre_newlink_common - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6gre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6gre_net *ign; - int err; - - ip6gre_netlink_parms(data, &nt->parms); - ign = net_generic(net, ip6gre_net_id); - - if (nt->parms.collect_md) { - if (rtnl_dereference(ign->collect_md_tun)) - return -EEXIST; - } else { - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; - } - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); -#else - err = ip6gre_newlink_common(src_net, dev, tb, data); -#endif - if (!err) { - ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]); - ip6gre_tunnel_link_md(ign, nt); - ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); - } - return err; -} - -#define ip6gre_newlink rpl_ip6gre_newlink - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static struct ip6_tnl * -rpl_ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], struct __ip6_tnl_parm *p_p, - struct netlink_ext_ack *extack) -#else -static struct ip6_tnl * -rpl_ip6gre_changelink_common(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], struct __ip6_tnl_parm *p_p) -#endif -{ - struct ip6_tnl *t, *nt = netdev_priv(dev); - struct net *net = nt->net; - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - struct ip_tunnel_encap ipencap; - - if (dev == ign->fb_tunnel_dev) - return ERR_PTR(-EINVAL); - - if (ip6gre_netlink_encap_parms(data, &ipencap)) { - int err = ip6_tnl_encap_setup(nt, &ipencap); - - if (err < 0) - return ERR_PTR(err); - } - - ip6gre_netlink_parms(data, p_p); - - t = ip6gre_tunnel_locate(net, p_p, 0); - - if (t) { - if (t->dev != dev) - return ERR_PTR(-EEXIST); - } else { - t = nt; - } - - return t; -} -#define ip6gre_changelink_common rpl_ip6gre_changelink_common - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6gre_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) -#endif -{ - struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); - struct __ip6_tnl_parm p; - struct ip6_tnl *t; - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - t = ip6gre_changelink_common(dev, tb, data, &p, extack); -#else - t = ip6gre_changelink_common(dev, tb, data, &p); -#endif - if (IS_ERR(t)) - return PTR_ERR(t); - - ip6gre_tunnel_unlink_md(ign, t); - ip6gre_tunnel_unlink(ign, t); - ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]); - ip6gre_tunnel_link_md(ign, t); - ip6gre_tunnel_link(ign, t); - return 0; -} -#define ip6gre_changelink rpl_ip6gre_changelink - -static void ip6gre_dellink(struct net_device *dev, struct list_head *head) -{ - struct net *net = dev_net(dev); - struct ip6gre_net *ign = net_generic(net, ip6gre_net_id); - - if (dev != ign->fb_tunnel_dev) - unregister_netdevice_queue(dev, head); -} - -static size_t ip6gre_get_size(const struct net_device *dev) -{ - return - /* IFLA_GRE_LINK */ - nla_total_size(4) + - /* IFLA_GRE_IFLAGS */ - nla_total_size(2) + - /* IFLA_GRE_OFLAGS */ - nla_total_size(2) + - /* IFLA_GRE_IKEY */ - nla_total_size(4) + - /* IFLA_GRE_OKEY */ - nla_total_size(4) + - /* IFLA_GRE_LOCAL */ - nla_total_size(sizeof(struct in6_addr)) + - /* IFLA_GRE_REMOTE */ - nla_total_size(sizeof(struct in6_addr)) + - /* IFLA_GRE_TTL */ - nla_total_size(1) + - /* IFLA_GRE_ENCAP_LIMIT */ - nla_total_size(1) + - /* IFLA_GRE_FLOWINFO */ - nla_total_size(4) + - /* IFLA_GRE_FLAGS */ - nla_total_size(4) + - /* IFLA_GRE_ENCAP_TYPE */ - nla_total_size(2) + - /* IFLA_GRE_ENCAP_FLAGS */ - nla_total_size(2) + - /* IFLA_GRE_ENCAP_SPORT */ - nla_total_size(2) + - /* IFLA_GRE_ENCAP_DPORT */ - nla_total_size(2) + - /* IFLA_GRE_COLLECT_METADATA */ - nla_total_size(0) + - /* IFLA_GRE_FWMARK */ - nla_total_size(4) + - /* IFLA_GRE_ERSPAN_INDEX */ - nla_total_size(4) + - 0; -} - -static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct __ip6_tnl_parm *p = &t->parms; - - if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, - gre_tnl_flags_to_gre_flags(p->i_flags)) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, - gre_tnl_flags_to_gre_flags(p->o_flags)) || - nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || - nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || - nla_put_in6_addr(skb, IFLA_GRE_LOCAL, &p->laddr) || - nla_put_in6_addr(skb, IFLA_GRE_REMOTE, &p->raddr) || - nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || - nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || - nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || - nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || - nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark) || - nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) - goto nla_put_failure; - - if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, - t->encap.type) || - nla_put_be16(skb, IFLA_GRE_ENCAP_SPORT, - t->encap.sport) || - nla_put_be16(skb, IFLA_GRE_ENCAP_DPORT, - t->encap.dport) || - nla_put_u16(skb, IFLA_GRE_ENCAP_FLAGS, - t->encap.flags)) - goto nla_put_failure; - - if (p->collect_md) { - if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) - goto nla_put_failure; - } - - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, p->erspan_ver)) - goto nla_put_failure; - - if (p->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, p->index)) - goto nla_put_failure; - } else if (p->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, p->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, p->hwid)) - goto nla_put_failure; - } - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static const struct nla_policy ip6gre_policy[RPL_IFLA_GRE_MAX + 1] = { - [IFLA_GRE_LINK] = { .type = NLA_U32 }, - [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, - [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, - [IFLA_GRE_IKEY] = { .type = NLA_U32 }, - [IFLA_GRE_OKEY] = { .type = NLA_U32 }, - [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct ipv6hdr, saddr) }, - [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct ipv6hdr, daddr) }, - [IFLA_GRE_TTL] = { .type = NLA_U8 }, - [IFLA_GRE_ENCAP_LIMIT] = { .type = NLA_U8 }, - [IFLA_GRE_FLOWINFO] = { .type = NLA_U32 }, - [IFLA_GRE_FLAGS] = { .type = NLA_U32 }, - [IFLA_GRE_ENCAP_TYPE] = { .type = NLA_U16 }, - [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, - [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, - [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, - [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, - [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, - [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, - [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, - [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, - [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, -}; - -static void ip6erspan_tap_setup(struct net_device *dev) -{ - ether_setup(dev); - - dev->netdev_ops = &ip6erspan_netdev_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; - dev->priv_destructor = ip6gre_dev_free; -#endif - - dev->features |= NETIF_F_NETNS_LOCAL; - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - netif_keep_dst(dev); -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6erspan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - struct ip6_tnl *nt = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6gre_net *ign; - int err; - - ip6gre_netlink_parms(data, &nt->parms); - ign = net_generic(net, ip6gre_net_id); - - if (nt->parms.collect_md) { - if (rtnl_dereference(ign->collect_md_tun_erspan)) - return -EEXIST; - } else { - if (ip6gre_tunnel_find(net, &nt->parms, dev->type)) - return -EEXIST; - } - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - err = ip6gre_newlink_common(src_net, dev, tb, data, extack); -#else - err = ip6gre_newlink_common(src_net, dev, tb, data); -#endif - if (!err) { - ip6erspan_tnl_link_config(nt, !tb[IFLA_MTU]); - ip6erspan_tunnel_link_md(ign, nt); - ip6gre_tunnel_link(net_generic(net, ip6gre_net_id), nt); - } - return err; -} -#define ip6erspan_newlink rpl_ip6erspan_newlink - -static void ip6erspan_tnl_link_config(struct ip6_tnl *t, int set_mtu) -{ - ip6gre_tnl_link_config_common(t); - ip6gre_tnl_link_config_route(t, set_mtu, ip6erspan_calc_hlen(t)); -} - -static int ip6erspan_tnl_change(struct ip6_tnl *t, - const struct __ip6_tnl_parm *p, int set_mtu) -{ - ip6gre_tnl_copy_tnl_parm(t, p); - ip6erspan_tnl_link_config(t, set_mtu); - return 0; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) -#endif -{ - struct ip6gre_net *ign = net_generic(dev_net(dev), ip6gre_net_id); - struct __ip6_tnl_parm p; - struct ip6_tnl *t; -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - t = ip6gre_changelink_common(dev, tb, data, &p, extack); -#else - t = ip6gre_changelink_common(dev, tb, data, &p); -#endif - if (IS_ERR(t)) - return PTR_ERR(t); - - ip6gre_tunnel_unlink_md(ign, t); - ip6gre_tunnel_unlink(ign, t); - ip6erspan_tnl_change(t, &p, !tb[IFLA_MTU]); - ip6erspan_tunnel_link_md(ign, t); - ip6gre_tunnel_link(ign, t); - return 0; -} -#define ip6erspan_changelink rpl_ip6erspan_changelink - -static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { - .kind = "ip6gre", - .maxtype = RPL_IFLA_GRE_MAX, - .policy = ip6gre_policy, - .priv_size = sizeof(struct ip6_tnl), - .setup = ip6gre_tunnel_setup, - .validate = ip6gre_tunnel_validate, - .newlink = ip6gre_newlink, - .changelink = ip6gre_changelink, - .dellink = ip6gre_dellink, - .get_size = ip6gre_get_size, - .fill_info = ip6gre_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = ip6_tnl_get_link_net, -#endif -}; - -static struct rtnl_link_ops ip6gre_tap_ops __read_mostly = { - .kind = "ip6gretap", - .maxtype = RPL_IFLA_GRE_MAX, - .policy = ip6gre_policy, - .priv_size = sizeof(struct ip6_tnl), - .setup = ip6gre_tap_setup, - .validate = ip6gre_tap_validate, - .newlink = ip6gre_newlink, - .changelink = ip6gre_changelink, - .dellink = ip6gre_dellink, - .get_size = ip6gre_get_size, - .fill_info = ip6gre_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = ip6_tnl_get_link_net, -#endif -}; - -static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = { - .kind = "ip6erspan", - .maxtype = RPL_IFLA_GRE_MAX, - .policy = ip6gre_policy, - .priv_size = sizeof(struct ip6_tnl), - .setup = ip6erspan_tap_setup, - .validate = ip6erspan_tap_validate, - .newlink = ip6erspan_newlink, - .changelink = ip6erspan_changelink, - .dellink = ip6gre_dellink, - .get_size = ip6gre_get_size, - .fill_info = ip6gre_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = ip6_tnl_get_link_net, -#endif -}; - -struct net_device *ip6erspan_fb_dev_create(struct net *net, const char *name, - u8 name_assign_type) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - LIST_HEAD(list_kill); - struct ip6_tnl *t; - int err; - - memset(&tb, 0, sizeof(tb)); - - dev = rtnl_create_link(net, (char *)name, name_assign_type, - &ip6erspan_tap_ops, tb); - if (IS_ERR(dev)) - return dev; - - t = netdev_priv(dev); - t->parms.collect_md = true; - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - err = ip6erspan_newlink(net, dev, tb, NULL, NULL); -#else - err = ip6erspan_newlink(net, dev, tb, NULL); -#endif - if (err < 0) { - free_netdev(dev); - return ERR_PTR(err); - } - - /* openvswitch users expect packet sizes to be unrestricted, - * so set the largest MTU we can. - */ - err = ip6_tnl_change_mtu(dev, 64000); - if (err) - goto out; - - return dev; -out: - ip6gre_dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return ERR_PTR(err); -} - -static struct vport_ops ovs_erspan6_vport_ops; - -static struct vport *erspan6_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct net_device *dev; - struct vport *vport; - int err; - - vport = ovs_vport_alloc(0, &ovs_erspan6_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - rtnl_lock(); - dev = ip6erspan_fb_dev_create(net, parms->name, NET_NAME_USER); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_PTR(err); - } - - rtnl_unlock(); - return vport; -} - -static struct vport *erspan6_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = erspan6_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -#ifndef OVS_VPORT_TYPE_IP6ERSPAN -/* Just until integration */ -#define OVS_VPORT_TYPE_IP6ERSPAN 108 -#endif -static struct vport_ops ovs_erspan6_vport_ops = { - .type = OVS_VPORT_TYPE_IP6ERSPAN, - .create = erspan6_create, - .send = __ip6erspan_tunnel_xmit, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = gre_fill_metadata_dst, -#endif - .destroy = ovs_netdev_tunnel_destroy, -}; - -struct net_device *ip6gre_fb_dev_create(struct net *net, const char *name, - u8 name_assign_type) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - LIST_HEAD(list_kill); - struct ip6_tnl *t; - int err; - - memset(&tb, 0, sizeof(tb)); - - dev = rtnl_create_link(net, (char *)name, name_assign_type, - &ip6gre_tap_ops, tb); - if (IS_ERR(dev)) - return dev; - - t = netdev_priv(dev); - t->parms.collect_md = true; - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - err = ip6gre_newlink(net, dev, tb, NULL, NULL); -#else - err = ip6gre_newlink(net, dev, tb, NULL); -#endif - if (err < 0) { - free_netdev(dev); - return ERR_PTR(err); - } - - /* openvswitch users expect packet sizes to be unrestricted, - * so set the largest MTU we can. - */ - err = ip6_tnl_change_mtu(dev, 64000); - if (err) - goto out; - - return dev; -out: - ip6gre_dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return ERR_PTR(err); -} - -static struct vport_ops ovs_ip6gre_vport_ops; - -static struct vport *ip6gre_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct net_device *dev; - struct vport *vport; - int err; - - vport = ovs_vport_alloc(0, &ovs_ip6gre_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - rtnl_lock(); - dev = ip6gre_fb_dev_create(net, parms->name, NET_NAME_USER); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_PTR(err); - } - - rtnl_unlock(); - return vport; -} - -static struct vport *ip6gre_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = ip6gre_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_ip6gre_vport_ops = { - .type = OVS_VPORT_TYPE_IP6GRE, - .create = ip6gre_create, - .send = __ip6gre_tunnel_xmit, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = gre_fill_metadata_dst, -#endif - .destroy = ovs_netdev_tunnel_destroy, -}; - - -/* - * And now the modules code and kernel interface. - */ - -int rpl_ip6gre_init(void) -{ - int err; - - err = register_pernet_device(&ip6gre_net_ops); - if (err < 0) { - if (err == -EEXIST) - goto ip6_gre_loaded; - else - goto out; - } - - err = inet6_add_protocol(&ip6gre_protocol, IPPROTO_GRE); - if (err < 0) { - pr_info("%s: can't add protocol\n", __func__); - unregister_pernet_device(&ip6gre_net_ops); - /* - * inet6_add_protocol will return a -1 if it fails - * to grab the pointer but the vport initialization - * expects a return value of -EEXIST. Set err to - * -EEXIST here to ensure proper handling. - */ - err = -EEXIST; - goto ip6_gre_loaded; - } - - pr_info("GRE over IPv6 tunneling driver\n"); - ovs_vport_ops_register(&ovs_ip6gre_vport_ops); - ovs_vport_ops_register(&ovs_erspan6_vport_ops); - return err; - -ip6_gre_loaded: - /* Since IPv6 GRE only allows single receiver to be registerd, - * we skip here so only transmit works, see: - * - * commit f9242b6b28d61295f2bf7e8adfb1060b382e5381 - * Author: David S. Miller <davem@davemloft.net> - * Date: Tue Jun 19 18:56:21 2012 -0700 - * - * inet: Sanitize inet{,6} protocol demux. - * - * OVS GRE receive part is disabled. - */ - pr_info("GRE TX only over IPv6 tunneling driver\n"); - ip6_gre_loaded = true; - ovs_vport_ops_register(&ovs_ip6gre_vport_ops); - ovs_vport_ops_register(&ovs_erspan6_vport_ops); -out: - return err; -} - -void rpl_ip6gre_fini(void) -{ - ovs_vport_ops_unregister(&ovs_erspan6_vport_ops); - ovs_vport_ops_unregister(&ovs_ip6gre_vport_ops); - if (!ip6_gre_loaded) { - inet6_del_protocol(&ip6gre_protocol, IPPROTO_GRE); - unregister_pernet_device(&ip6gre_net_ops); - } -} -#endif /* USE_UPSTREAM_TUNNEL */ diff --git a/datapath/linux/compat/ip6_output.c b/datapath/linux/compat/ip6_output.c deleted file mode 100644 index 688884275..000000000 --- a/datapath/linux/compat/ip6_output.c +++ /dev/null @@ -1,470 +0,0 @@ -/* - * Backported from upstream commit 9ef2e965e554 - * ("ipv6: drop frames with attached skb->sk in forwarding") - * - * IPv6 output functions - * Linux INET6 implementation - * - * Authors: - * Pedro Roque <roque@di.fc.ul.pt> - * - * Based on linux/net/ipv4/ip_output.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * A.N.Kuznetsov : airthmetics in fragmentation. - * extension headers are implemented. - * route changes now work. - * ip6_forward does not confuse sniffers. - * etc. - * - * H. von Brand : Added missing #include <linux/string.h> - * Imran Patel : frag id should be in NBO - * Kazunori MIYAZAWA @USAGI - * : add ip6_append_data and related functions - * for datagram xmit - */ - -#include <linux/version.h> - -#ifndef HAVE_NF_IPV6_OPS_FRAGMENT - -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/in6.h> -#include <linux/tcp.h> -#include <linux/random.h> -#include <linux/route.h> -#include <linux/module.h> -#include <linux/slab.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> - -#include <net/sock.h> -#include <net/snmp.h> - -#include <net/ipv6.h> -#include <net/ndisc.h> -#include <net/protocol.h> -#include <net/ip6_route.h> -#include <net/addrconf.h> -#include <net/rawv6.h> -#include <net/icmp.h> -#include <net/xfrm.h> -#include <net/checksum.h> -#include <linux/mroute6.h> - -#define IP_IDENTS_SZ 2048u - -static atomic_t *ip_idents __read_mostly; -static u32 *ip_tstamps __read_mostly; - -int __init ip6_output_init(void); -void ip6_output_exit(void); - -/* In order to protect privacy, we add a perturbation to identifiers - * if one generator is seldom used. This makes hard for an attacker - * to infer how many packets were sent between two points in time. - */ -static u32 rpl_ip_idents_reserve(u32 hash, int segs) -{ - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = ACCESS_ONCE(*p_tstamp); - u32 now = (u32)jiffies; - u32 delta = 0; - - if (old != now && cmpxchg(p_tstamp, old, now) == old) - delta = prandom_u32_max(now - old); - - return atomic_add_return(segs + delta, p_id) - segs; -} - -static u32 rpl___ipv6_select_ident(struct net *net, u32 hashrnd, - const struct in6_addr *dst, - const struct in6_addr *src) -{ - u32 hash, id; - - hash = __ipv6_addr_jhash(dst, hashrnd); - hash = __ipv6_addr_jhash(src, hash); - hash ^= net_hash_mix(net); - - /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, - * set the hight order instead thus minimizing possible future - * collisions. - */ - id = rpl_ip_idents_reserve(hash, 1); - if (unlikely(!id)) - id = 1 << 31; - - return id; -} - -static __be32 rpl_ipv6_select_ident(struct net *net, - const struct in6_addr *daddr, - const struct in6_addr *saddr) -{ - static u32 ip6_idents_hashrnd __read_mostly; - u32 id; - - net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - - id = rpl___ipv6_select_ident(net, ip6_idents_hashrnd, daddr, saddr); - return htonl(id); -} - -static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) -{ - to->pkt_type = from->pkt_type; - to->priority = from->priority; - to->protocol = from->protocol; - skb_dst_drop(to); - skb_dst_set(to, dst_clone(skb_dst(from))); - to->dev = from->dev; - to->mark = from->mark; - -#ifdef CONFIG_NET_SCHED - to->tc_index = from->tc_index; -#endif - nf_copy(to, from); - skb_copy_secmark(to, from); -} - -#ifdef HAVE_IP_FRAGMENT_TAKES_SOCK -#define OUTPUT(skb) output(skb->sk, skb) -#else -#define OUTPUT(skb) output(skb) -#endif - -int ip6_fragment(struct sock *sk, struct sk_buff *skb, - int (*output)(OVS_VPORT_OUTPUT_PARAMS)) -{ - struct sk_buff *frag; - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? - inet6_sk(skb->sk) : NULL; - struct ipv6hdr *tmp_hdr; - struct frag_hdr *fh; - unsigned int mtu, hlen, left, len; - int hroom, troom; - __be32 frag_id; - int ptr, offset = 0, err = 0; - u8 *prevhdr, nexthdr = 0; - struct net *net = dev_net(skb_dst(skb)->dev); - - hlen = ip6_find_1stfragopt(skb, &prevhdr); - nexthdr = *prevhdr; - - mtu = ip6_skb_dst_mtu(skb); - - /* We must not fragment if the socket is set to force MTU discovery - * or if the skb it not generated by a local socket. - */ - if (unlikely(!skb->ignore_df && skb->len > mtu)) - goto fail_toobig; - - if (IP6CB(skb)->frag_max_size) { - if (IP6CB(skb)->frag_max_size > mtu) - goto fail_toobig; - - /* don't send fragments larger than what we received */ - mtu = IP6CB(skb)->frag_max_size; - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - } - - if (np && np->frag_size < mtu) { - if (np->frag_size) - mtu = np->frag_size; - } - mtu -= hlen + sizeof(struct frag_hdr); - - frag_id = rpl_ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, - &ipv6_hdr(skb)->saddr); - - hroom = LL_RESERVED_SPACE(rt->dst.dev); - if (skb_has_frag_list(skb)) { - int first_len = skb_pagelen(skb); - struct sk_buff *frag2; - - if (first_len - hlen > mtu || - ((first_len - hlen) & 7) || - skb_cloned(skb) || - skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) - goto slow_path; - - skb_walk_frags(skb, frag) { - /* Correct geometry. */ - if (frag->len > mtu || - ((frag->len & 7) && frag->next) || - skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) - goto slow_path_clean; - - /* Partially cloned skb? */ - if (skb_shared(frag)) - goto slow_path_clean; - - BUG_ON(frag->sk); - if (skb->sk) { - frag->sk = skb->sk; - frag->destructor = sock_wfree; - } - skb->truesize -= frag->truesize; - } - - err = 0; - offset = 0; - /* BUILD HEADER */ - - *prevhdr = NEXTHDR_FRAGMENT; - tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); - if (!tmp_hdr) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); - err = -ENOMEM; - goto fail; - } - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - - __skb_pull(skb, hlen); - fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr)); - __skb_push(skb, hlen); - skb_reset_network_header(skb); - memcpy(skb_network_header(skb), tmp_hdr, hlen); - - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(IP6_MF); - fh->identification = frag_id; - - first_len = skb_pagelen(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - ipv6_hdr(skb)->payload_len = htons(first_len - - sizeof(struct ipv6hdr)); - - dst_hold(&rt->dst); - - for (;;) { - /* Prepare header of the next frame, - * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr)); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), tmp_hdr, - hlen); - offset += skb->len - hlen - sizeof(struct frag_hdr); - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->frag_off = htons(offset); - if (frag->next) - fh->frag_off |= htons(IP6_MF); - fh->identification = frag_id; - ipv6_hdr(frag)->payload_len = - htons(frag->len - - sizeof(struct ipv6hdr)); - ip6_copy_metadata(frag, skb); - } - - err = OUTPUT(skb); - if (!err) - IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), - IPSTATS_MIB_FRAGCREATES); - - if (err || !frag) - break; - - skb = frag; - frag = skb->next; - skb->next = NULL; - } - - kfree(tmp_hdr); - - if (err == 0) { - IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), - IPSTATS_MIB_FRAGOKS); - ip6_rt_put(rt); - return 0; - } - - kfree_skb_list(frag); - - IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), - IPSTATS_MIB_FRAGFAILS); - ip6_rt_put(rt); - return err; - -slow_path_clean: - skb_walk_frags(skb, frag2) { - if (frag2 == frag) - break; - frag2->sk = NULL; - frag2->destructor = NULL; - skb->truesize += frag2->truesize; - } - } - -slow_path: - if ((skb->ip_summed == CHECKSUM_PARTIAL) && - skb_checksum_help(skb)) - goto fail; - - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - - /* - * Fragment the datagram. - */ - - *prevhdr = NEXTHDR_FRAGMENT; - troom = rt->dst.dev->needed_tailroom; - - /* - * Keep copying data until we run out. - */ - while (left > 0) { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - - /* Allocate buffer */ - frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) + - hroom + troom, GFP_ATOMIC); - if (!frag) { - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); - err = -ENOMEM; - goto fail; - } - - /* - * Set up data on packet - */ - - ip6_copy_metadata(frag, skb); - skb_reserve(frag, hroom); - skb_put(frag, len + hlen + sizeof(struct frag_hdr)); - skb_reset_network_header(frag); - fh = (struct frag_hdr *)(skb_network_header(frag) + hlen); - frag->transport_header = (frag->network_header + hlen + - sizeof(struct frag_hdr)); - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - if (skb->sk) - skb_set_owner_w(frag, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); - - /* - * Build fragment header. - */ - fh->nexthdr = nexthdr; - fh->reserved = 0; - fh->identification = frag_id; - - /* - * Copy a block of the IP datagram. - */ - BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag), - len)); - left -= len; - - fh->frag_off = htons(offset); - if (left > 0) - fh->frag_off |= htons(IP6_MF); - ipv6_hdr(frag)->payload_len = htons(frag->len - - sizeof(struct ipv6hdr)); - - ptr += len; - offset += len; - - /* - * Put this fragment into the sending queue. - */ - err = OUTPUT(frag); - if (err) - goto fail; - - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGCREATES); - } - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGOKS); - consume_skb(skb); - return err; - -fail_toobig: - if (skb->sk && dst_allfrag(skb_dst(skb))) - sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); - - skb->dev = skb_dst(skb)->dev; - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - err = -EMSGSIZE; - -fail: - IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); - return err; -} -#undef OUTPUT - -int __init ip6_output_init(void) -{ - ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); - if (!ip_idents) { - pr_warn("IP: failed to allocate ip_idents\n"); - goto error; - } - - prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); - - ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) { - pr_warn("IP: failed to allocate ip_tstamps\n"); - goto error_ip_idents_free; - } - - return 0; - -error_ip_idents_free: - kfree(ip_idents); -error: - return -ENOMEM; -} - -void ip6_output_exit(void) -{ - kfree(ip_tstamps); - kfree(ip_idents); -} - -#endif /* !HAVE_NF_IPV6_OPS_FRAGMENT */ diff --git a/datapath/linux/compat/ip6_tunnel.c b/datapath/linux/compat/ip6_tunnel.c deleted file mode 100644 index 984a51bfb..000000000 --- a/datapath/linux/compat/ip6_tunnel.c +++ /dev/null @@ -1,2213 +0,0 @@ -/* - * IPv6 tunneling device - * Linux INET6 implementation - * - * Authors: - * Ville Nuorvala <vnuorval@tcs.hut.fi> - * Yasuyuki Kozakai <kozakai@linux-ipv6.org> - * - * Based on: - * linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c - * - * RFC 2473 - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#ifndef USE_UPSTREAM_TUNNEL -#include <linux/module.h> -#include <linux/capability.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/sockios.h> -#include <linux/icmp.h> -#include <linux/if.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/in6.h> -#include <linux/netdevice.h> -#include <linux/if_arp.h> -#include <linux/icmpv6.h> -#include <linux/init.h> -#include <linux/route.h> -#include <linux/rtnetlink.h> -#include <linux/netfilter_ipv6.h> -#include <linux/slab.h> -#include <linux/hash.h> -#include <linux/etherdevice.h> - -#include <linux/uaccess.h> -#include <linux/atomic.h> - -#include <net/icmp.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/ipv6.h> -#include <net/ip6_route.h> -#include <net/addrconf.h> -#include <net/ip6_tunnel.h> -#include <net/xfrm.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include "gso.h" -#include <net/dst_metadata.h> - -#include "vport-netdev.h" - -#define IP6_TUNNEL_HASH_SIZE_SHIFT 5 -#define IP6_TUNNEL_HASH_SIZE (1 << IP6_TUNNEL_HASH_SIZE_SHIFT) - -enum { -#ifndef HAVE_IFLA_IPTUN_ENCAP_TYPE - IFLA_IPTUN_ENCAP_TYPE = IFLA_IPTUN_6RD_RELAY_PREFIXLEN + 1, - IFLA_IPTUN_ENCAP_FLAGS, - IFLA_IPTUN_ENCAP_SPORT, - IFLA_IPTUN_ENCAP_DPORT, -#endif -#ifndef HAVE_IFLA_IPTUN_COLLECT_METADATA - IFLA_IPTUN_COLLECT_METADATA = IFLA_IPTUN_ENCAP_DPORT + 1, -#endif -#ifndef HAVE_IFLA_IPTUN_FWMARK - IFLA_IPTUN_FWMARK = IFLA_IPTUN_COLLECT_METADATA + 1, -#endif - RPL__IFLA_IPTUN_MAX = IFLA_IPTUN_FWMARK + 1, -}; - -#define RPL_IFLA_IPTUN_MAX RPL__IFLA_IPTUN_MAX - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -/* Undef the one from ip_tunnels.h - we need a different one here */ -/* At least I think... */ -#undef iptunnel_handle_offloads -/* gre_handle_offloads() has different return type on older kernsl. */ -static void gre_nop_fix(struct sk_buff *skb) { } - -static void gre_csum_fix(struct sk_buff *skb) -{ - struct gre_base_hdr *greh; - __be32 *options; - int gre_offset = skb_transport_offset(skb); - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - options = ((__be32 *)greh + 1); - - *options = 0; - *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset, - skb->len - gre_offset, 0)); -} - -#define iptunnel_handle_offloads rpl__iptunnel_handle_offloads -static int rpl__iptunnel_handle_offloads(struct sk_buff *skb, bool gre_csum, - int __always_unused ignored) -{ - int type = gre_csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE; - gso_fix_segment_t fix_segment; - - if (gre_csum) - fix_segment = gre_csum_fix; - else - fix_segment = gre_nop_fix; - - return ovs_iptunnel_handle_offloads(skb, type, fix_segment); -} - -#endif -static bool log_ecn_error = true; - -static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) -{ - u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); - - return hash_32(hash, IP6_TUNNEL_HASH_SIZE_SHIFT); -} - -static int ip6_tnl_dev_init(struct net_device *dev); -static void ip6_tnl_dev_setup(struct net_device *dev); -static struct rtnl_link_ops ip6_link_ops __read_mostly; - -static unsigned int ip6_tnl_net_id __read_mostly; -struct ip6_tnl_net { - /* the IPv6 tunnel fallback device */ - struct net_device *fb_tnl_dev; - /* lists for storing tunnels in use */ - struct ip6_tnl __rcu *tnls_r_l[IP6_TUNNEL_HASH_SIZE]; - struct ip6_tnl __rcu *tnls_wc[1]; - struct ip6_tnl __rcu **tnls[2]; - struct ip6_tnl __rcu *collect_md_tun; -}; - -static struct net_device_stats *ip6_get_stats(struct net_device *dev) -{ - struct pcpu_sw_netstats tmp, sum = { 0 }; - int i; - - for_each_possible_cpu(i) { - unsigned int start; - const struct pcpu_sw_netstats *tstats = - per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - tmp.rx_packets = tstats->rx_packets; - tmp.rx_bytes = tstats->rx_bytes; - tmp.tx_packets = tstats->tx_packets; - tmp.tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - sum.rx_packets += tmp.rx_packets; - sum.rx_bytes += tmp.rx_bytes; - sum.tx_packets += tmp.tx_packets; - sum.tx_bytes += tmp.tx_bytes; - } - dev->stats.rx_packets = sum.rx_packets; - dev->stats.rx_bytes = sum.rx_bytes; - dev->stats.tx_packets = sum.tx_packets; - dev->stats.tx_bytes = sum.tx_bytes; - return &dev->stats; -} - -/** - * ip6_tnl_lookup - fetch tunnel matching the end-point addresses - * @remote: the address of the tunnel exit-point - * @local: the address of the tunnel entry-point - * - * Return: - * tunnel matching given end-points if found, - * else fallback tunnel if its device is up, - * else %NULL - **/ - -#define for_each_ip6_tunnel_rcu(start) \ - for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) - -static struct ip6_tnl * -ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) -{ - unsigned int hash = HASH(remote, local); - struct ip6_tnl *t; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - struct in6_addr any; - - for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr) && - (t->dev->flags & IFF_UP)) - return t; - } - - memset(&any, 0, sizeof(any)); - hash = HASH(&any, local); - for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_any(&t->parms.raddr) && - (t->dev->flags & IFF_UP)) - return t; - } - - hash = HASH(remote, &any); - for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[hash]) { - if (ipv6_addr_equal(remote, &t->parms.raddr) && - ipv6_addr_any(&t->parms.laddr) && - (t->dev->flags & IFF_UP)) - return t; - } - - t = rcu_dereference(ip6n->collect_md_tun); - if (t && t->dev->flags & IFF_UP) - return t; - - t = rcu_dereference(ip6n->tnls_wc[0]); - if (t && (t->dev->flags & IFF_UP)) - return t; - - return NULL; -} - -/** - * ip6_tnl_bucket - get head of list matching given tunnel parameters - * @p: parameters containing tunnel end-points - * - * Description: - * ip6_tnl_bucket() returns the head of the list matching the - * &struct in6_addr entries laddr and raddr in @p. - * - * Return: head of IPv6 tunnel list - **/ - -static struct ip6_tnl __rcu ** -ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct __ip6_tnl_parm *p) -{ - const struct in6_addr *remote = &p->raddr; - const struct in6_addr *local = &p->laddr; - unsigned int h = 0; - int prio = 0; - - if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { - prio = 1; - h = HASH(remote, local); - } - return &ip6n->tnls[prio][h]; -} - -/** - * ip6_tnl_link - add tunnel to hash table - * @t: tunnel to be added - **/ - -static void -ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) -{ - struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); - - if (t->parms.collect_md) - rcu_assign_pointer(ip6n->collect_md_tun, t); - rcu_assign_pointer(t->next , rtnl_dereference(*tp)); - rcu_assign_pointer(*tp, t); -} - -/** - * ip6_tnl_unlink - remove tunnel from hash table - * @t: tunnel to be removed - **/ - -static void -ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) -{ - struct ip6_tnl __rcu **tp; - struct ip6_tnl *iter; - - if (t->parms.collect_md) - rcu_assign_pointer(ip6n->collect_md_tun, NULL); - - for (tp = ip6_tnl_bucket(ip6n, &t->parms); - (iter = rtnl_dereference(*tp)) != NULL; - tp = &iter->next) { - if (t == iter) { - rcu_assign_pointer(*tp, t->next); - break; - } - } -} - -#ifdef HAVE_NEEDS_FREE_NETDEV -static void ip6_dev_free(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - - gro_cells_destroy(&t->gro_cells); - dst_cache_destroy(&t->dst_cache); - free_percpu(dev->tstats); -} - -#endif -static int ip6_tnl_create2(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - int err; - - t = netdev_priv(dev); - - dev->rtnl_link_ops = &ip6_link_ops; - err = register_netdevice(dev); - if (err < 0) - goto out; - - strcpy(t->parms.name, dev->name); - - dev_hold(dev); - ip6_tnl_link(ip6n, t); - return 0; - -out: - return err; -} - -/** - * ip6_tnl_create - create a new tunnel - * @p: tunnel parameters - * @pt: pointer to new tunnel - * - * Description: - * Create tunnel matching given parameters. - * - * Return: - * created tunnel or error pointer - **/ - -static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) -{ - struct net_device *dev; - struct ip6_tnl *t; - char name[IFNAMSIZ]; - int err = -ENOMEM; - - if (p->name[0]) - strlcpy(name, p->name, IFNAMSIZ); - else - strlcpy(name, "ovs-ip6tnl%d", IFNAMSIZ); - - dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, - ip6_tnl_dev_setup); - if (!dev) - goto failed; - - dev_net_set(dev, net); - - t = netdev_priv(dev); - t->parms = *p; - t->net = dev_net(dev); - err = ip6_tnl_create2(dev); - if (err < 0) - goto failed_free; - - return t; - -failed_free: - free_netdev(dev); -failed: - return ERR_PTR(err); -} - -/** - * ip6_tnl_locate - find or create tunnel matching given parameters - * @p: tunnel parameters - * @create: != 0 if allowed to create new tunnel if no match found - * - * Description: - * ip6_tnl_locate() first tries to locate an existing tunnel - * based on @parms. If this is unsuccessful, but @create is set a new - * tunnel device is created and registered for use. - * - * Return: - * matching tunnel or error pointer - **/ - -static struct ip6_tnl *ip6_tnl_locate(struct net *net, - struct __ip6_tnl_parm *p, int create) -{ - const struct in6_addr *remote = &p->raddr; - const struct in6_addr *local = &p->laddr; - struct ip6_tnl __rcu **tp; - struct ip6_tnl *t; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - - for (tp = ip6_tnl_bucket(ip6n, p); - (t = rtnl_dereference(*tp)) != NULL; - tp = &t->next) { - if (ipv6_addr_equal(local, &t->parms.laddr) && - ipv6_addr_equal(remote, &t->parms.raddr)) { - if (create) - return ERR_PTR(-EEXIST); - - return t; - } - } - if (!create) - return ERR_PTR(-ENODEV); - return ip6_tnl_create(net, p); -} - -/** - * ip6_tnl_dev_uninit - tunnel device uninitializer - * @dev: the device to be destroyed - * - * Description: - * ip6_tnl_dev_uninit() removes tunnel from its list - **/ - -static void -ip6_tnl_dev_uninit(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct net *net = t->net; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - - if (dev == ip6n->fb_tnl_dev) - RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); - else - ip6_tnl_unlink(ip6n, t); - dst_cache_reset(&t->dst_cache); - dev_put(dev); -} - -/** - * parse_tvl_tnl_enc_lim - handle encapsulation limit option - * @skb: received socket buffer - * - * Return: - * 0 if none was found, - * else index to encapsulation limit - **/ - -__u16 rpl_ip6_tnl_parse_tlv_enc_lim(struct sk_buff *skb, __u8 *raw) -{ - const struct ipv6hdr *ipv6h = (const struct ipv6hdr *)raw; - unsigned int nhoff = raw - skb->data; - unsigned int off = nhoff + sizeof(*ipv6h); - u8 next, nexthdr = ipv6h->nexthdr; - - while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { - struct ipv6_opt_hdr *hdr; - u16 optlen; - - if (!pskb_may_pull(skb, off + sizeof(*hdr))) - break; - - hdr = (struct ipv6_opt_hdr *)(skb->data + off); - if (nexthdr == NEXTHDR_FRAGMENT) { - struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; - if (frag_hdr->frag_off) - break; - optlen = 8; - } else if (nexthdr == NEXTHDR_AUTH) { - optlen = (hdr->hdrlen + 2) << 2; - } else { - optlen = ipv6_optlen(hdr); - } - /* cache hdr->nexthdr, since pskb_may_pull() might - * invalidate hdr - */ - next = hdr->nexthdr; - if (nexthdr == NEXTHDR_DEST) { - u16 i = 2; - - /* Remember : hdr is no longer valid at this point. */ - if (!pskb_may_pull(skb, off + optlen)) - break; - - while (1) { - struct ipv6_tlv_tnl_enc_lim *tel; - - /* No more room for encapsulation limit */ - if (i + sizeof(*tel) > optlen) - break; - - tel = (struct ipv6_tlv_tnl_enc_lim *)(skb->data + off + i); - /* return index of option if found and valid */ - if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && - tel->length == 1) - return i + off - nhoff; - /* else jump to next option */ - if (tel->type) - i += tel->length + 2; - else - i++; - } - } - nexthdr = next; - off += optlen; - } - return 0; -} - -static int -ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - return PACKET_REJECT; -} - -static int -ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, - u8 type, u8 code, int offset, __be32 info) -{ - return PACKET_REJECT; -} - -static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, - const struct ipv6hdr *ipv6h, - struct sk_buff *skb) -{ - __u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK; - - if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) - ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield); - - return IP6_ECN_decapsulate(ipv6h, skb); -} - -static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, - const struct ipv6hdr *ipv6h, - struct sk_buff *skb) -{ - if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) - ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb)); - - return IP6_ECN_decapsulate(ipv6h, skb); -} - -__u32 rpl_ip6_tnl_get_cap(struct ip6_tnl *t, - const struct in6_addr *laddr, - const struct in6_addr *raddr) -{ - struct __ip6_tnl_parm *p = &t->parms; - int ltype = ipv6_addr_type(laddr); - int rtype = ipv6_addr_type(raddr); - __u32 flags = 0; - - if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY) { - flags = IP6_TNL_F_CAP_PER_PACKET; - } else if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && - rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) && - !((ltype|rtype) & IPV6_ADDR_LOOPBACK) && - (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) { - if (ltype&IPV6_ADDR_UNICAST) - flags |= IP6_TNL_F_CAP_XMIT; - if (rtype&IPV6_ADDR_UNICAST) - flags |= IP6_TNL_F_CAP_RCV; - } - return flags; -} - -/* called with rcu_read_lock() */ -int rpl_ip6_tnl_rcv_ctl(struct ip6_tnl *t, - const struct in6_addr *laddr, - const struct in6_addr *raddr) -{ - struct __ip6_tnl_parm *p = &t->parms; - int ret = 0; - struct net *net = t->net; - - if ((p->flags & IP6_TNL_F_CAP_RCV) || - ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && - (rpl_ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) { - struct net_device *ldev = NULL; - - if (p->link) - ldev = dev_get_by_index_rcu(net, p->link); - - if ((ipv6_addr_is_multicast(laddr) || - likely(ipv6_chk_addr(net, laddr, ldev, 0))) && - ((p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) || - likely(!ipv6_chk_addr(net, raddr, NULL, 0)))) - ret = 1; - } - return ret; -} - -static int __ip6_tnl_rcv(struct ip6_tnl *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, - struct metadata_dst *tun_dst, - int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, - const struct ipv6hdr *ipv6h, - struct sk_buff *skb), - bool log_ecn_err) -{ - struct pcpu_sw_netstats *tstats; - - if ((!(tpi->flags & TUNNEL_CSUM) && - (tunnel->parms.i_flags & TUNNEL_CSUM)) || - ((tpi->flags & TUNNEL_CSUM) && - !(tunnel->parms.i_flags & TUNNEL_CSUM))) { - tunnel->dev->stats.rx_crc_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - - if (tunnel->parms.i_flags & TUNNEL_SEQ) { - if (!(tpi->flags & TUNNEL_SEQ) || - (tunnel->i_seqno && - (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) { - tunnel->dev->stats.rx_fifo_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - tunnel->i_seqno = ntohl(tpi->seq) + 1; - } - -#if 0 - /* Warning: All skb pointers will be invalidated! */ - if (tunnel->dev->type == ARPHRD_ETHER) { - if (!pskb_may_pull(skb, ETH_HLEN)) { - tunnel->dev->stats.rx_length_errors++; - tunnel->dev->stats.rx_errors++; - goto drop; - } - - ipv6h = ipv6_hdr(skb); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } else { - skb->dev = tunnel->dev; - } - - skb_reset_network_header(skb); - memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - - __skb_tunnel_rx(skb, tunnel->dev, tunnel->net); - - err = dscp_ecn_decapsulate(tunnel, ipv6h, skb); - if (unlikely(err)) { - if (log_ecn_err) - net_info_ratelimited("non-ECT from %pI6 with DS=%#x\n", - &ipv6h->saddr, - ipv6_get_dsfield(ipv6h)); - if (err > 1) { - ++tunnel->dev->stats.rx_frame_errors; - ++tunnel->dev->stats.rx_errors; - goto drop; - } - } - -#endif - tstats = this_cpu_ptr(tunnel->dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - skb_reset_mac_header(skb); - skb_scrub_packet(skb, false); - skb->protocol = eth_type_trans(skb, tunnel->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); - netdev_port_receive(skb, &tun_dst->u.tun_info); - return 0; - -drop: - /* In OVS case caller will free tun_dst and skb */ -#if 0 - if (tun_dst) - dst_release((struct dst_entry *)tun_dst); - kfree_skb(skb); -#endif - return 0; -} - -int rpl_ip6_tnl_rcv(struct ip6_tnl *t, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, - struct metadata_dst *tun_dst, - bool log_ecn_err) -{ - return __ip6_tnl_rcv(t, skb, tpi, tun_dst, ip6ip6_dscp_ecn_decapsulate, - log_ecn_err); -} - -static const struct tnl_ptk_info tpi_v6 = { - /* no tunnel info required for ipxip6. */ - .proto = htons(ETH_P_IPV6), -}; - -static const struct tnl_ptk_info tpi_v4 = { - /* no tunnel info required for ipxip6. */ - .proto = htons(ETH_P_IP), -}; - -static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, - const struct tnl_ptk_info *tpi, - int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, - const struct ipv6hdr *ipv6h, - struct sk_buff *skb)) -{ - struct ip6_tnl *t; - const struct ipv6hdr *ipv6h = ipv6_hdr(skb); - struct metadata_dst *tun_dst = NULL; - int ret = -1; - - rcu_read_lock(); - t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); - - if (t) { - u8 tproto = READ_ONCE(t->parms.proto); - - if (tproto != ipproto && tproto != 0) - goto drop; - if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) - goto drop; - if (!rpl_ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) - goto drop; - if (iptunnel_pull_header(skb, 0, tpi->proto, false)) - goto drop; - if (t->parms.collect_md) { - ovs_ipv6_tun_rx_dst(tun_dst, skb, 0, 0, 0); - if (!tun_dst) - goto drop; - } - ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate, - log_ecn_error); - } - - rcu_read_unlock(); - - return ret; - -drop: - rcu_read_unlock(); - kfree_skb(skb); - return 0; -} - -static int ip4ip6_rcv(struct sk_buff *skb) -{ - return ipxip6_rcv(skb, IPPROTO_IPIP, &tpi_v4, - ip4ip6_dscp_ecn_decapsulate); -} - -static int ip6ip6_rcv(struct sk_buff *skb) -{ - return ipxip6_rcv(skb, IPPROTO_IPV6, &tpi_v6, - ip6ip6_dscp_ecn_decapsulate); -} - -struct ipv6_tel_txoption { - struct ipv6_txoptions ops; - __u8 dst_opt[8]; -}; - -static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) -{ - memset(opt, 0, sizeof(struct ipv6_tel_txoption)); - - opt->dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT; - opt->dst_opt[3] = 1; - opt->dst_opt[4] = encap_limit; - opt->dst_opt[5] = IPV6_TLV_PADN; - opt->dst_opt[6] = 1; - - opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; - opt->ops.opt_nflen = 8; -} - -/** - * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own - * @t: the outgoing tunnel device - * @hdr: IPv6 header from the incoming packet - * - * Description: - * Avoid trivial tunneling loop by checking that tunnel exit-point - * doesn't match source of incoming packet. - * - * Return: - * 1 if conflict, - * 0 else - **/ - -static inline bool -ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) -{ - return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); -} - -int rpl_ip6_tnl_xmit_ctl(struct ip6_tnl *t, - const struct in6_addr *laddr, - const struct in6_addr *raddr) -{ - struct __ip6_tnl_parm *p = &t->parms; - int ret = 0; - struct net *net = t->net; - - if (t->parms.collect_md) - return 1; - - if ((p->flags & IP6_TNL_F_CAP_XMIT) || - ((p->flags & IP6_TNL_F_CAP_PER_PACKET) && - (rpl_ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_XMIT))) { - struct net_device *ldev = NULL; - - rcu_read_lock(); - if (p->link) - ldev = dev_get_by_index_rcu(net, p->link); - - if (unlikely(!ipv6_chk_addr(net, laddr, ldev, 0))) - pr_warn("%s xmit: Local address not yet configured!\n", - p->name); - else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && - !ipv6_addr_is_multicast(raddr) && - unlikely(ipv6_chk_addr(net, raddr, NULL, 0))) - pr_warn("%s xmit: Routing loop! Remote address found on this node!\n", - p->name); - else - ret = 1; - rcu_read_unlock(); - } - return ret; -} - -static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, - u8 type, struct ipv6_opt_hdr *opt) -{ - struct ipv6_opt_hdr *h = - (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); - - memcpy(h, opt, ipv6_optlen(opt)); - h->nexthdr = *proto; - *proto = type; -} - -void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, - u8 *proto) -{ - if (opt->dst1opt) - ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); -} - -/** - * ip6_tnl_xmit - encapsulate packet and send - * @skb: the outgoing socket buffer - * @dev: the outgoing tunnel device - * @dsfield: dscp code for outer header - * @fl6: flow of tunneled packet - * @encap_limit: encapsulation limit - * @pmtu: Path MTU is stored if packet is too big - * @proto: next header value - * - * Description: - * Build new header and do some sanity checks on the packet before sending - * it. - * - * Return: - * 0 on success - * -1 fail - * %-EMSGSIZE message too big. return mtu in this case. - **/ - -int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, - struct flowi6 *fl6, int encap_limit, __u32 *pmtu, - __u8 proto) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct net *net = t->net; - struct net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h; - struct ipv6_tel_txoption opt; - struct dst_entry *dst = NULL, *ndst = NULL; - struct net_device *tdev; - int mtu; - unsigned int eth_hlen = t->dev->type == ARPHRD_ETHER ? ETH_HLEN : 0; - unsigned int psh_hlen = sizeof(struct ipv6hdr) + t->encap_hlen; - unsigned int max_headroom = psh_hlen; - bool use_cache = false; - u8 hop_limit; - int err = -1; - - if (t->parms.collect_md) { - hop_limit = skb_tunnel_info(skb)->key.ttl; - goto route_lookup; - } else { - hop_limit = t->parms.hop_limit; - } - - /* NBMA tunnel */ - if (ipv6_addr_any(&t->parms.raddr)) { - if (skb->protocol == htons(ETH_P_IPV6)) { - struct in6_addr *addr6; - struct neighbour *neigh; - int addr_type; - - if (!skb_dst(skb)) - goto tx_err_link_failure; - - neigh = dst_neigh_lookup(skb_dst(skb), - &ipv6_hdr(skb)->daddr); - if (!neigh) - goto tx_err_link_failure; - - addr6 = (struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) - addr6 = &ipv6_hdr(skb)->daddr; - - memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); - neigh_release(neigh); - } - } else if (t->parms.proto != 0 && !(t->parms.flags & - (IP6_TNL_F_USE_ORIG_TCLASS | - IP6_TNL_F_USE_ORIG_FWMARK))) { - /* enable the cache only if neither the outer protocol nor the - * routing decision depends on the current inner header value - */ - use_cache = true; - } - - if (use_cache) - dst = dst_cache_get(&t->dst_cache); - - if (!rpl_ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr)) - goto tx_err_link_failure; - - if (!dst) { -route_lookup: - /* add dsfield to flowlabel for route lookup */ - fl6->flowlabel = ip6_make_flowinfo(dsfield, fl6->flowlabel); - - dst = ip6_route_output(net, NULL, fl6); - - if (dst->error) - goto tx_err_link_failure; - dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0); - if (IS_ERR(dst)) { - err = PTR_ERR(dst); - dst = NULL; - goto tx_err_link_failure; - } - if (t->parms.collect_md && - ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, - &fl6->daddr, 0, &fl6->saddr)) - goto tx_err_link_failure; - ndst = dst; - } - - tdev = dst->dev; - - if (tdev == dev) { - stats->collisions++; - net_warn_ratelimited("%s: Local routing loop detected!\n", - t->parms.name); - goto tx_err_dst_release; - } - mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; - if (encap_limit >= 0) { - max_headroom += 8; - mtu -= 8; - } - if (skb->protocol == htons(ETH_P_IPV6)) { - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - } else if (mtu < 576) { - mtu = 576; - } - -// FIX ME -// skb_dst_update_pmtu(skb, mtu); - if (skb->len - t->tun_hlen - eth_hlen > mtu && !skb_is_gso(skb)) { - *pmtu = mtu; - err = -EMSGSIZE; - goto tx_err_dst_release; - } - - if (t->err_count > 0) { - if (time_before(jiffies, - t->err_time + IP6TUNNEL_ERR_TIMEO)) { - t->err_count--; - - dst_link_failure(skb); - } else { - t->err_count = 0; - } - } - - skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom += LL_RESERVED_SPACE(tdev); - - if (skb_headroom(skb) < max_headroom || skb_shared(skb) || - (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { - struct sk_buff *new_skb; - - new_skb = skb_realloc_headroom(skb, max_headroom); - if (!new_skb) - goto tx_err_dst_release; - - if (skb->sk) - skb_set_owner_w(new_skb, skb->sk); - consume_skb(skb); - skb = new_skb; - } - - if (t->parms.collect_md) { - if (t->encap.type != TUNNEL_ENCAP_NONE) - goto tx_err_dst_release; - } else { - if (use_cache && ndst) - dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr); - } - skb_dst_set(skb, dst); - - if (encap_limit >= 0) { - init_tel_txopt(&opt, encap_limit); - ipv6_push_frag_opts(skb, &opt.ops, &proto); - } - hop_limit = hop_limit ? : ip6_dst_hoplimit(dst); - - /* Calculate max headroom for all the headers and adjust - * needed_headroom if necessary. - */ - max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) - + dst->header_len + t->hlen; - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; - - err = ip6_tnl_encap(skb, t, &proto, fl6); - if (err) - return err; - - skb_push(skb, sizeof(struct ipv6hdr)); - skb_reset_network_header(skb); - ipv6h = ipv6_hdr(skb); - ip6_flow_hdr(ipv6h, dsfield, - ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); - ipv6h->hop_limit = hop_limit; - ipv6h->nexthdr = proto; - ipv6h->saddr = fl6->saddr; - ipv6h->daddr = fl6->daddr; - ip6tunnel_xmit(NULL, skb, dev); - return 0; -tx_err_link_failure: - stats->tx_carrier_errors++; - dst_link_failure(skb); -tx_err_dst_release: - dst_release(dst); - return err; -} -EXPORT_SYMBOL(ip6_tnl_xmit); - -static inline int -ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - const struct iphdr *iph = ip_hdr(skb); - int encap_limit = -1; - struct flowi6 fl6; - __u8 dsfield; - __u32 mtu; - u8 tproto; - int err; - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - tproto = READ_ONCE(t->parms.proto); - if (tproto != IPPROTO_IPIP && tproto != 0) - return -1; - - if (t->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -1; - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - dsfield = key->tos; - } else { - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - encap_limit = t->parms.encap_limit; - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - } - -// FIX ME -// fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - - if (iptunnel_handle_offloads(skb, true, SKB_GSO_IPXIP6)) - return -1; - - dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); - - skb_set_inner_ipproto(skb, IPPROTO_IPIP); - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPIP); - if (err != 0) { - /* XXX: send ICMP error even if DF is not set. */ - if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - return -1; - } - - return 0; -} - -static inline int -ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h = ipv6_hdr(skb); - int encap_limit = -1; - __u16 offset; - struct flowi6 fl6; - __u8 dsfield; - __u32 mtu; - u8 tproto; - int err; - - tproto = READ_ONCE(t->parms.proto); - if ((tproto != IPPROTO_IPV6 && tproto != 0) || - ip6_tnl_addr_conflict(t, ipv6h)) - return -1; - - if (t->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -1; - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - dsfield = key->tos; - } else { - offset = rpl_ip6_tnl_parse_tlv_enc_lim(skb, - skb_network_header(skb)); - /* - * ip6_tnl_parse_tlv_enc_lim() might - * have reallocated skb->head - */ - ipv6h = ipv6_hdr(skb); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - - tel = (void *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; - } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { - encap_limit = t->parms.encap_limit; - } - - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - } - -// FIX ME -// fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - - if (iptunnel_handle_offloads(skb, true, SKB_GSO_IPXIP6)) - return -1; - - dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); - - skb_set_inner_ipproto(skb, IPPROTO_IPV6); - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPV6); - if (err != 0) { - if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - return -1; - } - - return 0; -} - -static netdev_tx_t -ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct net_device_stats *stats = &t->dev->stats; - int ret; - - switch (skb->protocol) { - case htons(ETH_P_IP): - ret = ip4ip6_tnl_xmit(skb, dev); - break; - case htons(ETH_P_IPV6): - ret = ip6ip6_tnl_xmit(skb, dev); - break; - default: - goto tx_err; - } - - if (ret < 0) - goto tx_err; - - return NETDEV_TX_OK; - -tx_err: - stats->tx_errors++; - stats->tx_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; -} - -static void ip6_tnl_link_config(struct ip6_tnl *t) -{ - struct net_device *dev = t->dev; - struct __ip6_tnl_parm *p = &t->parms; - struct flowi6 *fl6 = &t->fl.u.ip6; - int t_hlen; - - memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); - memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); - - /* Set up flowi template */ - fl6->saddr = p->laddr; - fl6->daddr = p->raddr; - fl6->flowi6_oif = p->link; - fl6->flowlabel = 0; - - if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) - fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; - if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) - fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; - - p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET); - p->flags |= rpl_ip6_tnl_get_cap(t, &p->laddr, &p->raddr); - - if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) - dev->flags |= IFF_POINTOPOINT; - else - dev->flags &= ~IFF_POINTOPOINT; - - t->tun_hlen = 0; - t->hlen = t->encap_hlen + t->tun_hlen; - t_hlen = t->hlen + sizeof(struct ipv6hdr); - - if (p->flags & IP6_TNL_F_CAP_XMIT) { - int strict = (ipv6_addr_type(&p->raddr) & - (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)); - - struct rt6_info *rt = rt6_lookup(t->net, - &p->raddr, &p->laddr, - p->link, strict); - - if (!rt) - return; - - if (rt->dst.dev) { - dev->hard_header_len = rt->dst.dev->hard_header_len + - t_hlen; - - dev->mtu = rt->dst.dev->mtu - t_hlen; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; - - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; - } - ip6_rt_put(rt); - } -} - -/** - * ip6_tnl_change - update the tunnel parameters - * @t: tunnel to be changed - * @p: tunnel configuration parameters - * - * Description: - * ip6_tnl_change() updates the tunnel parameters - **/ - -static int -ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) -{ - t->parms.laddr = p->laddr; - t->parms.raddr = p->raddr; - t->parms.flags = p->flags; - t->parms.hop_limit = p->hop_limit; - t->parms.encap_limit = p->encap_limit; - t->parms.flowinfo = p->flowinfo; - t->parms.link = p->link; - t->parms.proto = p->proto; - t->parms.fwmark = p->fwmark; - dst_cache_reset(&t->dst_cache); - ip6_tnl_link_config(t); - return 0; -} - -static int ip6_tnl_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) -{ - struct net *net = t->net; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - int err; - - ip6_tnl_unlink(ip6n, t); - synchronize_net(); - err = ip6_tnl_change(t, p); - ip6_tnl_link(ip6n, t); - netdev_state_change(t->dev); - return err; -} - -static int ip6_tnl0_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p) -{ - /* for default tnl0 device allow to change only the proto */ - t->parms.proto = p->proto; - netdev_state_change(t->dev); - return 0; -} - -static void -ip6_tnl_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm *u) -{ - p->laddr = u->laddr; - p->raddr = u->raddr; - p->flags = u->flags; - p->hop_limit = u->hop_limit; - p->encap_limit = u->encap_limit; - p->flowinfo = u->flowinfo; - p->link = u->link; - p->proto = u->proto; - memcpy(p->name, u->name, sizeof(u->name)); -} - -static void -ip6_tnl_parm_to_user(struct ip6_tnl_parm *u, const struct __ip6_tnl_parm *p) -{ - u->laddr = p->laddr; - u->raddr = p->raddr; - u->flags = p->flags; - u->hop_limit = p->hop_limit; - u->encap_limit = p->encap_limit; - u->flowinfo = p->flowinfo; - u->link = p->link; - u->proto = p->proto; - memcpy(u->name, p->name, sizeof(u->name)); -} - -/** - * ip6_tnl_ioctl - configure ipv6 tunnels from userspace - * @dev: virtual device associated with tunnel - * @ifr: parameters passed from userspace - * @cmd: command to be performed - * - * Description: - * ip6_tnl_ioctl() is used for managing IPv6 tunnels - * from userspace. - * - * The possible commands are the following: - * %SIOCGETTUNNEL: get tunnel parameters for device - * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters - * %SIOCCHGTUNNEL: change tunnel parameters to those given - * %SIOCDELTUNNEL: delete tunnel - * - * The fallback device "ovs-ip6tnl0", created during module - * initialization, can be used for creating other tunnel devices. - * - * Return: - * 0 on success, - * %-EFAULT if unable to copy data to or from userspace, - * %-EPERM if current process hasn't %CAP_NET_ADMIN set - * %-EINVAL if passed tunnel parameters are invalid, - * %-EEXIST if changing a tunnel's parameters would cause a conflict - * %-ENODEV if attempting to change or delete a nonexisting device - **/ - -static int -ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) -{ - int err = 0; - struct ip6_tnl_parm p; - struct __ip6_tnl_parm p1; - struct ip6_tnl *t = netdev_priv(dev); - struct net *net = t->net; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - - memset(&p1, 0, sizeof(p1)); - - switch (cmd) { - case SIOCGETTUNNEL: - if (dev == ip6n->fb_tnl_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - ip6_tnl_parm_from_user(&p1, &p); - t = ip6_tnl_locate(net, &p1, 0); - if (IS_ERR(t)) - t = netdev_priv(dev); - } else { - memset(&p, 0, sizeof(p)); - } - ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) { - err = -EFAULT; - } - break; - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - break; - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - break; - err = -EINVAL; - if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP && - p.proto != 0) - break; - ip6_tnl_parm_from_user(&p1, &p); - t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); - if (cmd == SIOCCHGTUNNEL) { - if (!IS_ERR(t)) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else - t = netdev_priv(dev); - if (dev == ip6n->fb_tnl_dev) - err = ip6_tnl0_update(t, &p1); - else - err = ip6_tnl_update(t, &p1); - } - if (!IS_ERR(t)) { - err = 0; - ip6_tnl_parm_to_user(&p, &t->parms); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - err = -EFAULT; - - } else { - err = PTR_ERR(t); - } - break; - case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - break; - - if (dev == ip6n->fb_tnl_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - break; - err = -ENOENT; - ip6_tnl_parm_from_user(&p1, &p); - t = ip6_tnl_locate(net, &p1, 0); - if (IS_ERR(t)) - break; - err = -EPERM; - if (t->dev == ip6n->fb_tnl_dev) - break; - dev = t->dev; - } - err = 0; - unregister_netdevice(dev); - break; - default: - err = -EINVAL; - } - return err; -} - -/** - * ip6_tnl_change_mtu - change mtu manually for tunnel device - * @dev: virtual device associated with tunnel - * @new_mtu: the new mtu - * - * Return: - * 0 on success, - * %-EINVAL if mtu too small - **/ - -int rpl_ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) -{ - struct ip6_tnl *tnl = netdev_priv(dev); - - if (tnl->parms.proto == IPPROTO_IPV6) { - if (new_mtu < IPV6_MIN_MTU) - return -EINVAL; - } else { - if (new_mtu < ETH_MIN_MTU) - return -EINVAL; - } - if (new_mtu > 0xFFF8 - dev->hard_header_len) - return -EINVAL; - dev->mtu = new_mtu; - return 0; -} - -#ifdef HAVE_NDO_GET_IFLINK -int rpl_ip6_tnl_get_iflink(const struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - - return t->parms.link; -} - -#endif -const struct ip6_tnl_encap_ops __rcu * - rpl_ip6tun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; - -int rpl_ip6_tnl_encap_add_ops(const struct ip6_tnl_encap_ops *ops, - unsigned int num) -{ - if (num >= MAX_IPTUN_ENCAP_OPS) - return -ERANGE; - - return !cmpxchg((const struct ip6_tnl_encap_ops **) - &rpl_ip6tun_encaps[num], - NULL, ops) ? 0 : -1; -} - -int rpl_ip6_tnl_encap_del_ops(const struct ip6_tnl_encap_ops *ops, - unsigned int num) -{ - int ret; - - if (num >= MAX_IPTUN_ENCAP_OPS) - return -ERANGE; - - ret = (cmpxchg((const struct ip6_tnl_encap_ops **) - &rpl_ip6tun_encaps[num], - ops, NULL) == ops) ? 0 : -1; - - synchronize_net(); - - return ret; -} - -int rpl_ip6_tnl_encap_setup(struct ip6_tnl *t, - struct ip_tunnel_encap *ipencap) -{ - int hlen; - - memset(&t->encap, 0, sizeof(t->encap)); - - hlen = ip6_encap_hlen(ipencap); - if (hlen < 0) - return hlen; - - t->encap.type = ipencap->type; - t->encap.sport = ipencap->sport; - t->encap.dport = ipencap->dport; - t->encap.flags = ipencap->flags; - - t->encap_hlen = hlen; - t->hlen = t->encap_hlen + t->tun_hlen; - - return 0; -} - -static const struct net_device_ops ip6_tnl_netdev_ops = { - .ndo_init = ip6_tnl_dev_init, - .ndo_uninit = ip6_tnl_dev_uninit, - .ndo_start_xmit = ip6_tnl_start_xmit, - .ndo_do_ioctl = ip6_tnl_ioctl, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = ip6_tnl_change_mtu, -#else - .ndo_change_mtu = ip6_tnl_change_mtu, -#endif - .ndo_get_stats = ip6_get_stats, -#ifdef HAVE_NDO_GET_IFLINK - .ndo_get_iflink = ip6_tnl_get_iflink, -#endif -}; - -#define IPXIPX_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_GSO_SOFTWARE | \ - NETIF_F_HW_CSUM) - -/** - * ip6_tnl_dev_setup - setup virtual tunnel device - * @dev: virtual device associated with tunnel - * - * Description: - * Initialize function pointers and device parameters - **/ - -static void ip6_tnl_dev_setup(struct net_device *dev) -{ - dev->netdev_ops = &ip6_tnl_netdev_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; - dev->priv_destructor = ip6_dev_free; -#endif - - dev->type = ARPHRD_TUNNEL6; - dev->flags |= IFF_NOARP; - dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_LLTX; - netif_keep_dst(dev); - - dev->features |= IPXIPX_FEATURES; - dev->hw_features |= IPXIPX_FEATURES; - - /* This perm addr will be used as interface identifier by IPv6 */ - dev->addr_assign_type = NET_ADDR_RANDOM; - eth_random_addr(dev->perm_addr); -} - - -/** - * ip6_tnl_dev_init_gen - general initializer for all tunnel devices - * @dev: virtual device associated with tunnel - **/ - -static inline int -ip6_tnl_dev_init_gen(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - int ret; - int t_hlen; - - t->dev = dev; - t->net = dev_net(dev); - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - ret = dst_cache_init(&t->dst_cache, GFP_KERNEL); - if (ret) - goto free_stats; - - ret = gro_cells_init(&t->gro_cells, dev); - if (ret) - goto destroy_dst; - - t->tun_hlen = 0; - t->hlen = t->encap_hlen + t->tun_hlen; - t_hlen = t->hlen + sizeof(struct ipv6hdr); - - dev->type = ARPHRD_TUNNEL6; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; - dev->mtu = ETH_DATA_LEN - t_hlen; - if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; -#ifdef HAVE_NET_DEVICE_MAX_MTU - dev->min_mtu = ETH_MIN_MTU; - dev->max_mtu = 0xFFF8 - dev->hard_header_len; -#endif - - return 0; - -destroy_dst: - dst_cache_destroy(&t->dst_cache); -free_stats: - free_percpu(dev->tstats); - dev->tstats = NULL; - - return ret; -} - -/** - * ip6_tnl_dev_init - initializer for all non fallback tunnel devices - * @dev: virtual device associated with tunnel - **/ - -static int ip6_tnl_dev_init(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - int err = ip6_tnl_dev_init_gen(dev); - - if (err) - return err; - ip6_tnl_link_config(t); - if (t->parms.collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; - netif_keep_dst(dev); - } - return 0; -} - -/** - * ip6_fb_tnl_dev_init - initializer for fallback tunnel device - * @dev: fallback device - * - * Return: 0 - **/ - -static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - - t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); - - rcu_assign_pointer(ip6n->tnls_wc[0], t); - return 0; -} - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int rpl_ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6_tnl_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - u8 proto; - - if (!data || !data[IFLA_IPTUN_PROTO]) - return 0; - - proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); - if (proto != IPPROTO_IPV6 && - proto != IPPROTO_IPIP && - proto != 0) - return -EINVAL; - - return 0; -} -#define ip6_tnl_validate rpl_ip6_tnl_validate - -static void ip6_tnl_netlink_parms(struct nlattr *data[], - struct __ip6_tnl_parm *parms) -{ - memset(parms, 0, sizeof(*parms)); - - if (!data) - return; - - if (data[IFLA_IPTUN_LINK]) - parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); - - if (data[IFLA_IPTUN_LOCAL]) - parms->laddr = nla_get_in6_addr(data[IFLA_IPTUN_LOCAL]); - - if (data[IFLA_IPTUN_REMOTE]) - parms->raddr = nla_get_in6_addr(data[IFLA_IPTUN_REMOTE]); - - if (data[IFLA_IPTUN_TTL]) - parms->hop_limit = nla_get_u8(data[IFLA_IPTUN_TTL]); - - if (data[IFLA_IPTUN_ENCAP_LIMIT]) - parms->encap_limit = nla_get_u8(data[IFLA_IPTUN_ENCAP_LIMIT]); - - if (data[IFLA_IPTUN_FLOWINFO]) - parms->flowinfo = nla_get_be32(data[IFLA_IPTUN_FLOWINFO]); - - if (data[IFLA_IPTUN_FLAGS]) - parms->flags = nla_get_u32(data[IFLA_IPTUN_FLAGS]); - - if (data[IFLA_IPTUN_PROTO]) - parms->proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); - - if (data[IFLA_IPTUN_COLLECT_METADATA]) - parms->collect_md = true; - - if (data[IFLA_IPTUN_FWMARK]) - parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); -} - -static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], - struct ip_tunnel_encap *ipencap) -{ - bool ret = false; - - memset(ipencap, 0, sizeof(*ipencap)); - - if (!data) - return ret; - - if (data[IFLA_IPTUN_ENCAP_TYPE]) { - ret = true; - ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); - } - - if (data[IFLA_IPTUN_ENCAP_FLAGS]) { - ret = true; - ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); - } - - if (data[IFLA_IPTUN_ENCAP_SPORT]) { - ret = true; - ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); - } - - if (data[IFLA_IPTUN_ENCAP_DPORT]) { - ret = true; - ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); - } - - return ret; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6_tnl_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - struct ip_tunnel_encap ipencap; - struct ip6_tnl *nt, *t; - int err; - - nt = netdev_priv(dev); - - if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { - err = ip6_tnl_encap_setup(nt, &ipencap); - if (err < 0) - return err; - } - - ip6_tnl_netlink_parms(data, &nt->parms); - - if (nt->parms.collect_md) { - if (rtnl_dereference(ip6n->collect_md_tun)) - return -EEXIST; - } else { - t = ip6_tnl_locate(net, &nt->parms, 0); - if (!IS_ERR(t)) - return -EEXIST; - } - - err = ip6_tnl_create2(dev); - if (!err && tb[IFLA_MTU]) - ip6_tnl_change_mtu(dev, nla_get_u32(tb[IFLA_MTU])); - - return err; -} -#define ip6_tnl_newlink rpl_ip6_tnl_newlink - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int rpl_ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int rpl_ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], - struct nlattr *data[]) -#endif -{ - struct ip6_tnl *t = netdev_priv(dev); - struct __ip6_tnl_parm p; - struct net *net = t->net; - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - struct ip_tunnel_encap ipencap; - - if (dev == ip6n->fb_tnl_dev) - return -EINVAL; - - if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { - int err = ip6_tnl_encap_setup(t, &ipencap); - - if (err < 0) - return err; - } - ip6_tnl_netlink_parms(data, &p); - if (p.collect_md) - return -EINVAL; - - t = ip6_tnl_locate(net, &p, 0); - if (!IS_ERR(t)) { - if (t->dev != dev) - return -EEXIST; - } else - t = netdev_priv(dev); - - return ip6_tnl_update(t, &p); -} -#define ip6_tnl_changelink rpl_ip6_tnl_changelink - -static void ip6_tnl_dellink(struct net_device *dev, struct list_head *head) -{ - struct net *net = dev_net(dev); - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - - if (dev != ip6n->fb_tnl_dev) - unregister_netdevice_queue(dev, head); -} - -static size_t ip6_tnl_get_size(const struct net_device *dev) -{ - return - /* IFLA_IPTUN_LINK */ - nla_total_size(4) + - /* IFLA_IPTUN_LOCAL */ - nla_total_size(sizeof(struct in6_addr)) + - /* IFLA_IPTUN_REMOTE */ - nla_total_size(sizeof(struct in6_addr)) + - /* IFLA_IPTUN_TTL */ - nla_total_size(1) + - /* IFLA_IPTUN_ENCAP_LIMIT */ - nla_total_size(1) + - /* IFLA_IPTUN_FLOWINFO */ - nla_total_size(4) + - /* IFLA_IPTUN_FLAGS */ - nla_total_size(4) + - /* IFLA_IPTUN_PROTO */ - nla_total_size(1) + - /* IFLA_IPTUN_ENCAP_TYPE */ - nla_total_size(2) + - /* IFLA_IPTUN_ENCAP_FLAGS */ - nla_total_size(2) + - /* IFLA_IPTUN_ENCAP_SPORT */ - nla_total_size(2) + - /* IFLA_IPTUN_ENCAP_DPORT */ - nla_total_size(2) + - /* IFLA_IPTUN_COLLECT_METADATA */ - nla_total_size(0) + - /* IFLA_IPTUN_FWMARK */ - nla_total_size(4) + - 0; -} - -static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct ip6_tnl *tunnel = netdev_priv(dev); - struct __ip6_tnl_parm *parm = &tunnel->parms; - - if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || - nla_put_in6_addr(skb, IFLA_IPTUN_LOCAL, &parm->laddr) || - nla_put_in6_addr(skb, IFLA_IPTUN_REMOTE, &parm->raddr) || - nla_put_u8(skb, IFLA_IPTUN_TTL, parm->hop_limit) || - nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || - nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || - nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || - nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || - nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) - goto nla_put_failure; - - if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || - nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || - nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) - goto nla_put_failure; - - if (parm->collect_md) - if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -#ifdef HAVE_GET_LINK_NET -struct net *rpl_ip6_tnl_get_link_net(const struct net_device *dev) -{ - struct ip6_tnl *tunnel = netdev_priv(dev); - - return tunnel->net; -} - -#endif -static const struct nla_policy ip6_tnl_policy[RPL_IFLA_IPTUN_MAX + 1] = { - [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, - [IFLA_IPTUN_LOCAL] = { .len = sizeof(struct in6_addr) }, - [IFLA_IPTUN_REMOTE] = { .len = sizeof(struct in6_addr) }, - [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, - [IFLA_IPTUN_ENCAP_LIMIT] = { .type = NLA_U8 }, - [IFLA_IPTUN_FLOWINFO] = { .type = NLA_U32 }, - [IFLA_IPTUN_FLAGS] = { .type = NLA_U32 }, - [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, - [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, - [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, - [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, - [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, - [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, - [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, -}; - -static struct rtnl_link_ops ip6_link_ops __read_mostly = { - .kind = "ip6tnl", - .maxtype = RPL_IFLA_IPTUN_MAX, - .policy = ip6_tnl_policy, - .priv_size = sizeof(struct ip6_tnl), - .setup = ip6_tnl_dev_setup, - .validate = ip6_tnl_validate, - .newlink = ip6_tnl_newlink, - .changelink = ip6_tnl_changelink, - .dellink = ip6_tnl_dellink, - .get_size = ip6_tnl_get_size, - .fill_info = ip6_tnl_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = ip6_tnl_get_link_net, -#endif -}; - -static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { - .handler = ip4ip6_rcv, - .err_handler = ip4ip6_err, - .priority = 1, -}; - -static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { - .handler = ip6ip6_rcv, - .err_handler = ip6ip6_err, - .priority = 1, -}; - -static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) -{ - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - struct net_device *dev, *aux; - int h; - struct ip6_tnl *t; - - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &ip6_link_ops) - unregister_netdevice_queue(dev, list); - - for (h = 0; h < IP6_TUNNEL_HASH_SIZE; h++) { - t = rtnl_dereference(ip6n->tnls_r_l[h]); - while (t) { - /* If dev is in the same netns, it has already - * been added to the list by the previous loop. - */ - if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, list); - t = rtnl_dereference(t->next); - } - } -} - -static int __net_init ip6_tnl_init_net(struct net *net) -{ - struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - struct ip6_tnl *t = NULL; - int err; - - ip6n->tnls[0] = ip6n->tnls_wc; - ip6n->tnls[1] = ip6n->tnls_r_l; - - err = -ENOMEM; - ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ovs-ip6tnl0", - NET_NAME_UNKNOWN, ip6_tnl_dev_setup); - - if (!ip6n->fb_tnl_dev) - goto err_alloc_dev; - dev_net_set(ip6n->fb_tnl_dev, net); - ip6n->fb_tnl_dev->rtnl_link_ops = &ip6_link_ops; - /* FB netdevice is special: we have one, and only one per netns. - * Allowing to move it to another netns is clearly unsafe. - */ - ip6n->fb_tnl_dev->features |= NETIF_F_NETNS_LOCAL; - - err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); - if (err < 0) - goto err_register; - - err = register_netdev(ip6n->fb_tnl_dev); - if (err < 0) - goto err_register; - - t = netdev_priv(ip6n->fb_tnl_dev); - - strcpy(t->parms.name, ip6n->fb_tnl_dev->name); - return 0; - -err_register: - free_netdev(ip6n->fb_tnl_dev); -err_alloc_dev: - return err; -} - -static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) -{ - struct net *net; - LIST_HEAD(list); - - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations ip6_tnl_net_ops = { - .init = ip6_tnl_init_net, - .exit_batch = ip6_tnl_exit_batch_net, - .id = &ip6_tnl_net_id, - .size = sizeof(struct ip6_tnl_net), -}; - -/** - * ip6_tunnel_init - register protocol and reserve needed resources - * - * Return: 0 on success - **/ - -int rpl_ip6_tunnel_init(void) -{ - int err; - -#if 0 - if (!ipv6_mod_enabled()) - return -EOPNOTSUPP; -#endif - err = register_pernet_device(&ip6_tnl_net_ops); - if (err < 0) { - pr_err("%s: can't register ip6_tnl pernet device\n", - __func__); - goto out_pernet; - } - - err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET); - if (err < 0) { - pr_err("%s: can't register ip4ip6\n", __func__); - goto out_ip4ip6; - } - - err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6); - if (err < 0) { - pr_err("%s: can't register ip6ip6\n", __func__); - goto out_ip6ip6; - } - - err = rtnl_link_register(&ip6_link_ops); - if (err < 0) { - pr_err("%s: can't register ip6_lin_ops\n", - __func__); - goto rtnl_link_failed; - } - return 0; - -rtnl_link_failed: - xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); -out_ip6ip6: - xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); -out_ip4ip6: - unregister_pernet_device(&ip6_tnl_net_ops); -out_pernet: - return err; -} - -/** - * ip6_tunnel_cleanup - free resources and unregister protocol - **/ - -void rpl_ip6_tunnel_cleanup(void) -{ - rtnl_link_unregister(&ip6_link_ops); - if (xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET)) - pr_info("%s: can't deregister ip4ip6\n", __func__); - - if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) - pr_info("%s: can't deregister ip6ip6\n", __func__); - - unregister_pernet_device(&ip6_tnl_net_ops); -} - -#endif /* USE_UPSTREAM_TUNNEL */ diff --git a/datapath/linux/compat/ip_fragment.c b/datapath/linux/compat/ip_fragment.c deleted file mode 100644 index f910b99b4..000000000 --- a/datapath/linux/compat/ip_fragment.c +++ /dev/null @@ -1,831 +0,0 @@ -/* - * IP fragmentation backport, heavily based on linux/net/ipv4/ip_fragment.c, - * copied from Linux 192132b9a034 net: Add support for VRFs to inetpeer cache - * - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * The IP fragmentation functionality. - * - * Authors: Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG> - * Alan Cox <alan@lxorguk.ukuu.org.uk> - * - * Fixes: - * Alan Cox : Split from ip.c , see ip_input.c for history. - * David S. Miller : Begin massive cleanup... - * Andi Kleen : Add sysctls. - * xxxx : Overlapfrag bug. - * Ultima : ip_expire() kernel panic. - * Bill Hawes : Frag accounting and evictor fixes. - * John McDonald : 0 length frag bug. - * Alexey Kuznetsov: SMP races, threading, cleanup. - * Patrick McHardy : LRU queue of frag heads for evictor. - */ - -#include <linux/version.h> - -#ifndef HAVE_CORRECT_MRU_HANDLING - -#define pr_fmt(fmt) "IPv4: " fmt - -#include <linux/compiler.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/mm.h> -#include <linux/jiffies.h> -#include <linux/skbuff.h> -#include <linux/list.h> -#include <linux/ip.h> -#include <linux/icmp.h> -#include <linux/netdevice.h> -#include <linux/jhash.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <net/route.h> -#include <net/dst.h> -#include <net/sock.h> -#include <net/ip.h> -#include <net/icmp.h> -#include <net/checksum.h> -#include <net/inetpeer.h> -#include <net/inet_frag.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/inet.h> -#include <linux/netfilter_ipv4.h> -#include <net/inet_ecn.h> -#include <net/vrf.h> -#include <net/netfilter/ipv4/nf_defrag_ipv4.h> -#include <net/netns/generic.h> -#include "datapath.h" - -/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 - * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c - * as well. Or notify me, at least. --ANK - */ - -static int sysctl_ipfrag_max_dist __read_mostly = 64; -static const char ip_frag_cache_name[] = "ovs-frag4"; - -struct ipfrag_skb_cb -{ - struct inet_skb_parm h; - int offset; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - -/* Describe an entry in the "incomplete datagrams" queue. */ -struct ipq { - struct inet_frag_queue q; - - u32 user; - __be32 saddr; - __be32 daddr; - __be16 id; - u8 protocol; - u8 ecn; /* RFC3168 support */ - u16 max_df_size; /* largest frag with DF set seen */ - int iif; - int vif; /* VRF device index */ - unsigned int rid; - struct inet_peer *peer; -}; - -static u8 ip4_frag_ecn(u8 tos) -{ - return 1 << (tos & INET_ECN_MASK); -} - -static struct inet_frags ip4_frags; - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, - struct net_device *dev); - -struct ip4_create_arg { - struct iphdr *iph; - u32 user; - int vif; -}; - -static struct netns_frags *get_netns_frags_from_net(struct net *net) -{ -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - return &(ovs_net->ipv4_frags); -#else - return &(net->ipv4.frags); -#endif -} - -static struct net *get_net_from_netns_frags(struct netns_frags *frags) -{ - struct net *net; -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct ovs_net *ovs_net; - - ovs_net = container_of(frags, struct ovs_net, ipv4_frags); - net = ovs_net->net; -#else - net = container_of(frags, struct net, ipv4.frags); -#endif - return net; -} - -void ovs_netns_frags_init(struct net *net) -{ -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - ovs_net->ipv4_frags.high_thresh = 4 * 1024 * 1024; - ovs_net->ipv4_frags.low_thresh = 3 * 1024 * 1024; - ovs_net->ipv4_frags.timeout = IP_FRAG_TIME; - inet_frags_init_net(&(ovs_net->ipv4_frags)); - ovs_net->net = net; -#endif -} - -void ovs_netns_frags_exit(struct net *net) -{ - struct netns_frags *frags; - - frags = get_netns_frags_from_net(net); - inet_frags_exit_net(frags, &ip4_frags); -} - -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) -{ - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); - return jhash_3words((__force u32)id << 16 | prot, - (__force u32)saddr, (__force u32)daddr, - ip4_frags.rnd); -} -/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers") - * shifted this logic into inet_fragment, but prior kernels still need this. - */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) -#define ipqhashfn(a, b, c, d) (ipqhashfn(a, b, c, d) & (INETFRAGS_HASHSZ - 1)) -#endif - -#ifdef HAVE_INET_FRAGS_CONST -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) -#else -static unsigned int ip4_hashfn(struct inet_frag_queue *q) -#endif -{ - const struct ipq *ipq; - - ipq = container_of(q, struct ipq, q); - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); -} - -#ifdef HAVE_INET_FRAGS_CONST -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) -#else -static bool ip4_frag_match(struct inet_frag_queue *q, void *a) -#endif -{ - const struct ipq *qp; - const struct ip4_create_arg *arg = a; - - qp = container_of(q, struct ipq, q); - return qp->id == arg->iph->id && - qp->saddr == arg->iph->saddr && - qp->daddr == arg->iph->daddr && - qp->protocol == arg->iph->protocol && - qp->user == arg->user && - qp->vif == arg->vif; -} - -#ifdef HAVE_INET_FRAGS_CONST -static void ip4_frag_init(struct inet_frag_queue *q, const void *a) -#else -static void ip4_frag_init(struct inet_frag_queue *q, void *a) -#endif -{ - struct ipq *qp = container_of(q, struct ipq, q); - struct net *net = get_net_from_netns_frags(q->net); - - const struct ip4_create_arg *arg = a; - - qp->protocol = arg->iph->protocol; - qp->id = arg->iph->id; - qp->ecn = ip4_frag_ecn(arg->iph->tos); - qp->saddr = arg->iph->saddr; - qp->daddr = arg->iph->daddr; - qp->vif = arg->vif; - qp->user = arg->user; - qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : - NULL; -} - -static void ip4_frag_free(struct inet_frag_queue *q) -{ - struct ipq *qp; - - qp = container_of(q, struct ipq, q); - if (qp->peer) - inet_putpeer(qp->peer); -} - - -/* Destruction primitives. */ - -static void ipq_put(struct ipq *ipq) -{ - inet_frag_put(&ipq->q, &ip4_frags); -} - -/* Kill ipq entry. It is not destroyed immediately, - * because caller (and someone more) holds reference count. - */ -static void ipq_kill(struct ipq *ipq) -{ - inet_frag_kill(&ipq->q, &ip4_frags); -} - -static bool frag_expire_skip_icmp(u32 user) -{ - return user == IP_DEFRAG_AF_PACKET || - ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN, - __IP_DEFRAG_CONNTRACK_IN_END) || - ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN, - __IP_DEFRAG_CONNTRACK_BRIDGE_IN); -} - -/* - * Oops, a fragment queue timed out. Kill it and send an ICMP reply. - */ -static void ip_expire(unsigned long arg) -{ - struct ipq *qp; - struct net *net; - - qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); - net = get_net_from_netns_frags(qp->q.net); - - spin_lock(&qp->q.lock); - - if (qp_flags(qp) & INET_FRAG_COMPLETE) - goto out; - - ipq_kill(qp); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - - if (!inet_frag_evicting(&qp->q)) { - struct sk_buff *head = qp->q.fragments; - const struct iphdr *iph; - int err; - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); - - if (!(qp_flags(qp) & INET_FRAG_FIRST_IN) || !qp->q.fragments) - goto out; - - rcu_read_lock(); - head->dev = dev_get_by_index_rcu(net, qp->iif); - if (!head->dev) - goto out_rcu_unlock; - - /* skb has no dst, perform route lookup again */ - iph = ip_hdr(head); - err = ip_route_input_noref(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (err) - goto out_rcu_unlock; - - /* Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (frag_expire_skip_icmp(qp->user) && - (skb_rtable(head)->rt_type != RTN_LOCAL)) - goto out_rcu_unlock; - - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); -out_rcu_unlock: - rcu_read_unlock(); - } -out: - spin_unlock(&qp->q.lock); - ipq_put(qp); -} - -#ifdef HAVE_INET_FRAG_EVICTOR -/* Memory limiting on fragments. Evictor trashes the oldest - * fragment queue until we are back under the threshold. - * - * Necessary for kernels earlier than v3.17. Replaced in commit - * b13d3cbfb8e8 ("inet: frag: move eviction of queues to work queue"). - */ -static void ip_evictor(struct net *net) -{ - int evicted; - struct netns_frags *frags; - - frags = get_netns_frags_from_net(net); - evicted = inet_frag_evictor(frags, &ip4_frags, false); - if (evicted) - IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); -} -#endif - -/* Find the correct entry in the "incomplete datagrams" queue for - * this IP datagram, and create new one, if nothing is found. - */ -static struct ipq *ip_find(struct net *net, struct iphdr *iph, - u32 user, int vif) -{ - struct inet_frag_queue *q; - struct ip4_create_arg arg; - unsigned int hash; - struct netns_frags *frags; - - arg.iph = iph; - arg.user = user; - arg.vif = vif; - -#ifdef HAVE_INET_FRAGS_WITH_RWLOCK - read_lock(&ip4_frags.lock); -#endif - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); - - frags = get_netns_frags_from_net(net); - q = inet_frag_find(frags, &ip4_frags, &arg, hash); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); - return NULL; - } - return container_of(q, struct ipq, q); -} - -/* Is the fragment too far ahead to be part of ipq? */ -static int ip_frag_too_far(struct ipq *qp) -{ - struct inet_peer *peer = qp->peer; - unsigned int max = sysctl_ipfrag_max_dist; - unsigned int start, end; - - int rc; - - if (!peer || !max) - return 0; - - start = qp->rid; - end = atomic_inc_return(&peer->rid); - qp->rid = end; - - rc = qp->q.fragments && (end - start) > max; - - if (rc) { - struct net *net; - - net = get_net_from_netns_frags(qp->q.net); - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - } - - return rc; -} - -static int ip_frag_reinit(struct ipq *qp) -{ - struct sk_buff *fp; - unsigned int sum_truesize = 0; - - if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { - atomic_inc(&qp->q.refcnt); - return -ETIMEDOUT; - } - - fp = qp->q.fragments; - do { - struct sk_buff *xp = fp->next; - - sum_truesize += fp->truesize; - kfree_skb(fp); - fp = xp; - } while (fp); - sub_frag_mem_limit(qp->q.net, sum_truesize); - - qp_flags(qp) = 0; - qp->q.len = 0; - qp->q.meat = 0; - qp->q.fragments = NULL; - qp->q.fragments_tail = NULL; - qp->iif = 0; - qp->ecn = 0; - - return 0; -} - -/* Add new segment to existing queue. */ -static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) -{ - struct sk_buff *prev, *next; - struct net_device *dev; - unsigned int fragsize; - int flags, offset; - int ihl, end; - int err = -ENOENT; - u8 ecn; - - if (qp_flags(qp) & INET_FRAG_COMPLETE) - goto err; - - if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && - unlikely(ip_frag_too_far(qp)) && - unlikely(err = ip_frag_reinit(qp))) { - ipq_kill(qp); - goto err; - } - - ecn = ip4_frag_ecn(ip_hdr(skb)->tos); - offset = ntohs(ip_hdr(skb)->frag_off); - flags = offset & ~IP_OFFSET; - offset &= IP_OFFSET; - offset <<= 3; /* offset is in 8-byte chunks */ - ihl = ip_hdrlen(skb); - - /* Determine the position of this fragment. */ - end = offset + skb->len - skb_network_offset(skb) - ihl; - err = -EINVAL; - - /* Is this the final fragment? */ - if ((flags & IP_MF) == 0) { - /* If we already have some bits beyond end - * or have different end, the segment is corrupted. - */ - if (end < qp->q.len || - ((qp_flags(qp) & INET_FRAG_LAST_IN) && end != qp->q.len)) - goto err; - qp_flags(qp) |= INET_FRAG_LAST_IN; - qp->q.len = end; - } else { - if (end&7) { - end &= ~7; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - if (end > qp->q.len) { - /* Some bits beyond end -> corruption. */ - if (qp_flags(qp) & INET_FRAG_LAST_IN) - goto err; - qp->q.len = end; - } - } - if (end == offset) - goto err; - - err = -ENOMEM; - if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) - goto err; - - err = pskb_trim_rcsum(skb, end - offset); - if (err) - goto err; - - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = qp->q.fragments_tail; - if (!prev || FRAG_CB(prev)->offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = qp->q.fragments; next != NULL; next = next->next) { - if (FRAG_CB(next)->offset >= offset) - break; /* bingo! */ - prev = next; - } - -found: - /* We found where to put this one. Check for overlap with - * preceding fragment, and, if needed, align things so that - * any overlaps are eliminated. - */ - if (prev) { - int i = (FRAG_CB(prev)->offset + prev->len) - offset; - - if (i > 0) { - offset += i; - err = -EINVAL; - if (end <= offset) - goto err; - err = -ENOMEM; - if (!pskb_pull(skb, i)) - goto err; - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; - } - } - - err = -ENOMEM; - - while (next && FRAG_CB(next)->offset < end) { - int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ - - if (i < next->len) { - /* Eat head of the next overlapped fragment - * and leave the loop. The next ones cannot overlap. - */ - if (!pskb_pull(next, i)) - goto err; - FRAG_CB(next)->offset += i; - qp->q.meat -= i; - if (next->ip_summed != CHECKSUM_UNNECESSARY) - next->ip_summed = CHECKSUM_NONE; - break; - } else { - struct sk_buff *free_it = next; - - /* Old fragment is completely overridden with - * new one drop it. - */ - next = next->next; - - if (prev) - prev->next = next; - else - qp->q.fragments = next; - - qp->q.meat -= free_it->len; - sub_frag_mem_limit(qp->q.net, free_it->truesize); - kfree_skb(free_it); - } - } - - FRAG_CB(skb)->offset = offset; - - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - qp->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - qp->q.fragments = skb; - - dev = skb->dev; - if (dev) { - qp->iif = dev->ifindex; - skb->dev = NULL; - } - qp->q.stamp = skb->tstamp; - qp->q.meat += skb->len; - qp->ecn |= ecn; - add_frag_mem_limit(qp->q.net, skb->truesize); - if (offset == 0) - qp_flags(qp) |= INET_FRAG_FIRST_IN; - - fragsize = skb->len + ihl; - - if (fragsize > qp->q.max_size) - qp->q.max_size = fragsize; - - if (ip_hdr(skb)->frag_off & htons(IP_DF) && - fragsize > qp->max_df_size) - qp->max_df_size = fragsize; - - if (qp_flags(qp) == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - qp->q.meat == qp->q.len) { - unsigned long orefdst = skb->_skb_refdst; - - skb->_skb_refdst = 0UL; - err = ip_frag_reasm(qp, prev, dev); - skb->_skb_refdst = orefdst; - return err; - } - - skb_dst_drop(skb); - inet_frag_lru_move(&qp->q); - return -EINPROGRESS; - -err: - kfree_skb(skb); - return err; -} - - -/* Build a new IP datagram from all its fragments. */ - -static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, - struct net_device *dev) -{ - struct net *net = get_net_from_netns_frags(qp->q.net); - struct iphdr *iph; - struct sk_buff *fp, *head = qp->q.fragments; - int len; - int ihlen; - int err; - u8 ecn; - - ipq_kill(qp); - - ecn = ip_frag_ecn_table[qp->ecn]; - if (unlikely(ecn == 0xff)) { - err = -EINVAL; - goto out_fail; - } - /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); - if (!fp) - goto out_nomem; - - fp->next = head->next; - if (!fp->next) - qp->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, qp->q.fragments); - head->next = qp->q.fragments->next; - - consume_skb(qp->q.fragments); - qp->q.fragments = head; - } - - WARN_ON(!head); - WARN_ON(FRAG_CB(head)->offset != 0); - - /* Allocate a new buffer for the datagram. */ - ihlen = ip_hdrlen(head); - len = ihlen + qp->q.len; - - err = -E2BIG; - if (len > 65535) - goto out_oversize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_nomem; - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_nomem; - clone->next = head->next; - head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); - } - - skb_shinfo(head)->frag_list = head->next; - skb_push(head, head->data - skb_network_header(head)); - - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - } - sub_frag_mem_limit(qp->q.net, head->truesize); - - head->next = NULL; - head->dev = dev; - head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); - - iph = ip_hdr(head); - iph->tot_len = htons(len); - iph->tos |= ecn; - - /* When we set IP_DF on a refragmented skb we must also force a - * call to ip_fragment to avoid forwarding a DF-skb of size s while - * original sender only sent fragments of size f (where f < s). - * - * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest - * frag seen to avoid sending tiny DF-fragments in case skb was built - * from one very small df-fragment and one large non-df frag. - */ - if (qp->max_df_size == qp->q.max_size) { - IPCB(head)->flags |= IPSKB_FRAG_PMTU; - iph->frag_off = htons(IP_DF); - } else { - iph->frag_off = 0; - } - - ip_send_check(iph); - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); - qp->q.fragments = NULL; - qp->q.fragments_tail = NULL; - return 0; - -out_nomem: - net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp); - err = -ENOMEM; - goto out_fail; -out_oversize: - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); -out_fail: - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - return err; -} - -/* Process an incoming IP datagram fragment. */ -int rpl_ip_defrag(struct net *net, struct sk_buff *skb, u32 user) -{ - struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; - int vif = vrf_master_ifindex_rcu(dev); - struct ipq *qp; - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); - skb_orphan(skb); - -#ifdef HAVE_INET_FRAG_EVICTOR - /* Start by cleaning up the memory. */ - ip_evictor(net); -#endif - - /* Lookup (or create) queue header */ - qp = ip_find(net, ip_hdr(skb), user, vif); - if (qp) { - int ret; - - spin_lock(&qp->q.lock); - - ret = ip_frag_queue(qp, skb); - - spin_unlock(&qp->q.lock); - ipq_put(qp); - return ret; - } - - IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - kfree_skb(skb); - return -ENOMEM; -} - -#ifdef HAVE_DEFRAG_ENABLE_TAKES_NET -static int __net_init ipv4_frags_init_net(struct net *net) -{ - return nf_defrag_ipv4_enable(net); -} -#endif - -static void __net_exit ipv4_frags_exit_net(struct net *net) -{ -} - -static struct pernet_operations ip4_frags_ops = { -#ifdef HAVE_DEFRAG_ENABLE_TAKES_NET - .init = ipv4_frags_init_net, -#endif - .exit = ipv4_frags_exit_net, -}; - -int __init rpl_ipfrag_init(void) -{ -#ifndef HAVE_DEFRAG_ENABLE_TAKES_NET - nf_defrag_ipv4_enable(); -#endif - register_pernet_subsys(&ip4_frags_ops); - ip4_frags.hashfn = ip4_hashfn; - ip4_frags.constructor = ip4_frag_init; - ip4_frags.destructor = ip4_frag_free; - ip4_frags.skb_free = NULL; - ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.match = ip4_frag_match; - ip4_frags.frag_expire = ip_expire; -#ifdef HAVE_INET_FRAGS_WITH_FRAGS_WORK - ip4_frags.frags_cache_name = ip_frag_cache_name; -#endif -#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0) - ip4_frags.secret_interval = 10 * 60 * HZ; -#endif - if (inet_frags_init(&ip4_frags)) { - pr_warn("IP: failed to allocate ip4_frags cache\n"); - return -ENOMEM; - } - return 0; -} - -void rpl_ipfrag_fini(void) -{ - inet_frags_fini(&ip4_frags); - unregister_pernet_subsys(&ip4_frags_ops); -} - -#endif /* !HAVE_CORRECT_MRU_HANDLING */ diff --git a/datapath/linux/compat/ip_gre.c b/datapath/linux/compat/ip_gre.c deleted file mode 100644 index c194ffe00..000000000 --- a/datapath/linux/compat/ip_gre.c +++ /dev/null @@ -1,1450 +0,0 @@ -/* - * Linux NET3: GRE over IP protocol decoder. - * - * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#ifndef USE_UPSTREAM_TUNNEL -#include <linux/capability.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/kconfig.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/netdev_features.h> -#include <linux/in.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/if_arp.h> -#include <linux/mroute.h> -#include <linux/if_vlan.h> -#include <linux/init.h> -#include <linux/in6.h> -#include <linux/inetdevice.h> -#include <linux/igmp.h> -#include <linux/netfilter_ipv4.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> - -#include <net/sock.h> -#include <net/ip.h> -#include <net/icmp.h> -#include <net/protocol.h> -#include <net/ip_tunnels.h> -#include <net/arp.h> -#include <net/checksum.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/xfrm.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/rtnetlink.h> -#include <net/gre.h> -#include <net/dst_metadata.h> -#include <net/erspan.h> - -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> -#endif - -#include "gso.h" -#include "vport-netdev.h" - -static int gre_tap_net_id __read_mostly; -static unsigned int erspan_net_id __read_mostly; -static void erspan_build_header(struct sk_buff *skb, - __be32 id, u32 index, - bool truncate, bool is_ipv4); - -static bool ip_gre_loaded = false; - -/* Normally in net/core/dst.c but move it here */ -struct dst_ops md_dst_ops = { - .family = AF_UNSPEC, -}; - -#ifndef ip_gre_calc_hlen -#define ip_gre_calc_hlen gre_calc_hlen -#endif - -static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, - int gre_hdr_len) -{ - struct net *net = dev_net(skb->dev); - struct metadata_dst *tun_dst = NULL; - struct erspan_base_hdr *ershdr; - struct erspan_metadata *pkt_md; - struct ip_tunnel_net *itn; - struct ip_tunnel *tunnel; - const struct iphdr *iph; - struct erspan_md2 *md2; - int ver; - int len; - - itn = net_generic(net, erspan_net_id); - len = gre_hdr_len + sizeof(*ershdr); - - /* Check based hdr len */ - if (unlikely(!pskb_may_pull(skb, len))) - return PACKET_REJECT; - - iph = ip_hdr(skb); - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - ver = ershdr->ver; - - /* The original GRE header does not have key field, - * Use ERSPAN 10-bit session ID as key. - */ - tpi->key = cpu_to_be32(get_session_id(ershdr)); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags, - iph->saddr, iph->daddr, tpi->key); - - if (tunnel) { - len = gre_hdr_len + erspan_hdr_len(ver); - if (unlikely(!pskb_may_pull(skb, len))) - return PACKET_REJECT; - - ershdr = (struct erspan_base_hdr *)skb->data; - pkt_md = (struct erspan_metadata *)(ershdr + 1); - - if (__iptunnel_pull_header(skb, - len, - htons(ETH_P_TEB), - false, false) < 0) - goto drop; - - if (tunnel->collect_md) { - struct ip_tunnel_info *info; - struct erspan_metadata *md; - __be64 tun_id; - __be16 flags; - - tpi->flags |= TUNNEL_KEY; - flags = tpi->flags; - tun_id = key32_to_tunnel_id(tpi->key); - - tun_dst = rpl_ip_tun_rx_dst(skb, flags, tun_id, sizeof(*md)); - if (!tun_dst) - return PACKET_REJECT; - - md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - md->version = ver; - md2 = &md->u.md2; - memcpy(md2, pkt_md, ver == 1 ? ERSPAN_V1_MDSIZE : - ERSPAN_V2_MDSIZE); - - info = &tun_dst->u.tun_info; - info->key.tun_flags |= TUNNEL_ERSPAN_OPT; - info->options_len = sizeof(*md); - } - - skb_reset_mac_header(skb); - ovs_ip_tunnel_rcv(tunnel->dev, skb, tun_dst); - kfree(tun_dst); - return PACKET_RCVD; - } -drop: - kfree_skb(skb); - return PACKET_RCVD; -} - - -static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) -{ - struct metadata_dst tun_dst; - const struct iphdr *iph; - struct ip_tunnel *tunnel; - - iph = ip_hdr(skb); - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags, - iph->saddr, iph->daddr, tpi->key); - - if (tunnel) { - if (__iptunnel_pull_header(skb, hdr_len, tpi->proto, - raw_proto, false) < 0) - goto drop; - - if (tunnel->dev->type != ARPHRD_NONE) - skb_pop_mac_header(skb); - else - skb_reset_mac_header(skb); - if (tunnel->collect_md) { - __be16 flags; - __be64 tun_id; - - flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); - tun_id = key32_to_tunnel_id(tpi->key); - ovs_ip_tun_rx_dst(&tun_dst, skb, flags, tun_id, 0); - } - - ovs_ip_tunnel_rcv(tunnel->dev, skb, &tun_dst); - return PACKET_RCVD; - } - return PACKET_NEXT; - -drop: - kfree_skb(skb); - return PACKET_RCVD; -} - - -static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct net *net = dev_net(skb->dev); - struct ip_tunnel_net *itn; - int res; - - if (tpi->proto == htons(ETH_P_TEB)) - itn = net_generic(net, gre_tap_net_id); - else if (tpi->proto == htons(ETH_P_ERSPAN) || - tpi->proto == htons(ETH_P_ERSPAN2)) - itn = net_generic(net, erspan_net_id); - else - return PACKET_RCVD; - - res = __ipgre_rcv(skb, tpi, itn, hdr_len, false); - - return res; -} - -static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, - __be16 proto) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; - - tpi.flags = tunnel->parms.o_flags; - tpi.proto = proto; - tpi.key = tunnel->parms.o_key; - if (tunnel->parms.o_flags & TUNNEL_SEQ) - tunnel->o_seqno++; - tpi.seq = htonl(tunnel->o_seqno); - - /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->hlen); - - ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); -} - -static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *unused_tpi) -{ - struct tnl_ptk_info tpi; - bool csum_err = false; - int hdr_len; - - hdr_len = gre_parse_header(skb, &tpi, &csum_err, htons(ETH_P_IP), 0); - if (hdr_len < 0) - goto drop; - - if (unlikely(tpi.proto == htons(ETH_P_ERSPAN) || - tpi.proto == htons(ETH_P_ERSPAN2))) { - if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) - return 0; - goto drop; - } - - if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) - return 0; -drop: - - kfree_skb(skb); - return 0; -} - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -#include "gso.h" -/* gre_handle_offloads() has different return type on older kernsl. */ -static void gre_nop_fix(struct sk_buff *skb) { } - -static void gre_csum_fix(struct sk_buff *skb) -{ - struct gre_base_hdr *greh; - __be32 *options; - int gre_offset = skb_transport_offset(skb); - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - options = ((__be32 *)greh + 1); - - *options = 0; - *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset, - skb->len - gre_offset, 0)); -} - -#define gre_handle_offloads rpl_gre_handle_offloads -static int rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum) -{ - int type = gre_csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE; - gso_fix_segment_t fix_segment; - - if (gre_csum) - fix_segment = gre_csum_fix; - else - fix_segment = gre_nop_fix; - - return ovs_iptunnel_handle_offloads(skb, type, fix_segment); -} -#else -static int gre_handle_offloads(struct sk_buff *skb, bool csum) -{ - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); -} -#endif - -static bool is_gre_gso(struct sk_buff *skb) -{ - return skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE | SKB_GSO_GRE_CSUM); -} - -static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, - __be16 proto, __be32 key, __be32 seq) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(flags); - greh->protocol = proto; - - if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (flags & TUNNEL_SEQ) { - *ptr = seq; - ptr--; - } - if (flags & TUNNEL_KEY) { - *ptr = key; - ptr--; - } - if (flags & TUNNEL_CSUM && !is_gre_gso(skb)) { - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } - ovs_skb_set_inner_protocol(skb, proto); -} - -static struct rtable *gre_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - const struct ip_tunnel_key *key) -{ - struct net *net = dev_net(dev); - - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = skb->mark; - fl->flowi4_proto = IPPROTO_GRE; - - return ip_route_output_key(net, fl); -} - -static struct rtable *prepare_fb_xmit(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - int tunnel_hlen) -{ - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct rtable *rt = NULL; - int min_headroom; - bool use_cache; - int err; - - tun_info = skb_tunnel_info(skb); - key = &tun_info->key; - use_cache = ip_tunnel_dst_cache_usable(skb, tun_info); - - if (use_cache) - rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl->saddr); - if (!rt) { - rt = gre_get_rt(skb, dev, fl, key); - if (IS_ERR(rt)) - goto err_free_skb; - if (use_cache) - dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst, - fl->saddr); - } - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - return rt; - -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return NULL; -} - -netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct flowi4 fl; - struct rtable *rt; - int min_headroom; - int tunnel_hlen; - __be16 df, flags; - int err; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET)) - goto err_free_skb; - - key = &tun_info->key; - - rt = gre_get_rt(skb, dev, &fl, key); - if (IS_ERR(rt)) - goto err_free_skb; - - tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - if (skb_vlan_tag_present(skb)) { - skb = __vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) { - err = -ENOMEM; - goto err_free_rt; - } - } - - /* Push Tunnel header. */ - err = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); - if (err) - goto err_free_rt; - - flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); - build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), - tunnel_id_to_key32(tun_info->key.tun_id), 0); - - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); - return NETDEV_TX_OK; - -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -} -EXPORT_SYMBOL(rpl_gre_fb_xmit); - -static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, - __be16 proto) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - struct erspan_metadata *md; - struct rtable *rt = NULL; - struct tnl_ptk_info tpi; - bool truncate = false; - struct flowi4 fl; - int tunnel_hlen; - int version; - __be16 df; - int nhoff; - int thoff; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET)) - goto err_free_skb; - - key = &tun_info->key; - if (!(tun_info->key.tun_flags & TUNNEL_ERSPAN_OPT)) - goto err_free_rt; - md = ip_tunnel_info_opts(tun_info); - if (!md) - goto err_free_rt; - - /* ERSPAN has fixed 8 byte GRE header */ - version = md->version; - tunnel_hlen = 8 + erspan_hdr_len(version); - - rt = prepare_fb_xmit(skb, dev, &fl, tunnel_hlen); - if (!rt) - return; - - if (gre_handle_offloads(skb, false)) - goto err_free_rt; - - if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); - truncate = true; - } - - nhoff = skb_network_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IP) && - (ntohs(ip_hdr(skb)->tot_len) > skb->len - nhoff)) - truncate = true; - - thoff = skb_transport_header(skb) - skb_mac_header(skb); - if (skb->protocol == htons(ETH_P_IPV6) && - (ntohs(ipv6_hdr(skb)->payload_len) > skb->len - thoff)) - truncate = true; - - if (version == 1) { - erspan_build_header(skb, ntohl(tunnel_id_to_key32(key->tun_id)), - ntohl(md->u.index), truncate, true); - tpi.hdr_len = ERSPAN_V1_MDSIZE; - tpi.proto = htons(ETH_P_ERSPAN); - } else if (version == 2) { - erspan_build_header_v2(skb, - ntohl(tunnel_id_to_key32(key->tun_id)), - md->u.md2.dir, - get_hwid(&md->u.md2), - truncate, true); - tpi.hdr_len = ERSPAN_V2_MDSIZE; - tpi.proto = htons(ETH_P_ERSPAN2); - } else { - goto err_free_rt; - } - - tpi.flags = TUNNEL_SEQ; - tpi.key = tunnel_id_to_key32(key->tun_id); - tpi.seq = htonl(tunnel->o_seqno++); - - gre_build_header(skb, &tpi, 8); - - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - - iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->tos, key->ttl, df, false); - return; - -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; -} - -#define GRE_FEATURES (NETIF_F_SG | \ - NETIF_F_FRAGLIST | \ - NETIF_F_HIGHDMA | \ - NETIF_F_HW_CSUM | \ - NETIF_F_NETNS_LOCAL) - -static void __gre_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel; - - tunnel = netdev_priv(dev); - tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags); - tunnel->parms.iph.protocol = IPPROTO_GRE; - - tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; - - dev->features |= GRE_FEATURES; - dev->hw_features |= GRE_FEATURES; - - if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) { - /* TCP offload with GRE SEQ is not supported, nor - * can we support 2 levels of outer headers requiring - * an update. - */ - if (!(tunnel->parms.o_flags & TUNNEL_CSUM) || - (tunnel->encap.type == TUNNEL_ENCAP_NONE)) { - dev->features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - } - - /* Can use a lockless transmit, unless we generate - * output sequences - */ - dev->features |= NETIF_F_LLTX; - } -} - -static int __gre_rcv(struct sk_buff *skb) -{ - return gre_rcv(skb, NULL); -} - -void __gre_err(struct sk_buff *skb, u32 info) -{ - pr_warn("%s: GRE receive error\n", __func__); -} - -static const struct gre_protocol ipgre_protocol = { - .handler = __gre_rcv, - .err_handler = __gre_err, -}; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - __be16 flags; - - if (!data) - return 0; - - flags = 0; - if (data[IFLA_GRE_IFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); - if (data[IFLA_GRE_OFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); - if (flags & (GRE_VERSION|GRE_ROUTING)) - return -EINVAL; - - return 0; -} - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - __be32 daddr; - - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) - return -EINVAL; - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) - return -EADDRNOTAVAIL; - } - - if (!data) - goto out; - - if (data[IFLA_GRE_REMOTE]) { - memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4); - if (!daddr) - return -EINVAL; - } - -out: -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK - return ipgre_tunnel_validate(tb, data, NULL); -#else - return ipgre_tunnel_validate(tb, data); -#endif -} - -enum { -#ifndef HAVE_IFLA_GRE_ENCAP_DPORT - IFLA_GRE_ENCAP_TYPE = IFLA_GRE_FLAGS + 1, - IFLA_GRE_ENCAP_FLAGS, - IFLA_GRE_ENCAP_SPORT, - IFLA_GRE_ENCAP_DPORT, -#endif -#ifndef HAVE_IFLA_GRE_COLLECT_METADATA - IFLA_GRE_COLLECT_METADATA = IFLA_GRE_ENCAP_DPORT + 1, -#endif -#ifndef HAVE_IFLA_GRE_IGNORE_DF - IFLA_GRE_IGNORE_DF = IFLA_GRE_COLLECT_METADATA + 1, -#endif -#ifndef HAVE_IFLA_GRE_FWMARK - IFLA_GRE_FWMARK = IFLA_GRE_IGNORE_DF + 1, -#endif -#ifndef HAVE_IFLA_GRE_ERSPAN_INDEX - IFLA_GRE_ERSPAN_INDEX = IFLA_GRE_FWMARK + 1, -#endif -#ifndef HAVE_IFLA_GRE_ERSPAN_HWID - IFLA_GRE_ERSPAN_VER = IFLA_GRE_ERSPAN_INDEX + 1, - IFLA_GRE_ERSPAN_DIR, - IFLA_GRE_ERSPAN_HWID, -#endif -}; - -#define RPL_IFLA_GRE_MAX (IFLA_GRE_ERSPAN_HWID + 1) - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int erspan_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - __be16 flags = 0; - int ret; - - if (!data) - return 0; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK - ret = ipgre_tap_validate(tb, data, NULL); -#else - ret = ipgre_tap_validate(tb, data); -#endif - if (ret) - return ret; - - /* ERSPAN should only have GRE sequence and key flag */ - if (data[IFLA_GRE_OFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); - if (data[IFLA_GRE_IFLAGS]) - flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); - if (!data[IFLA_GRE_COLLECT_METADATA] && - flags != (GRE_SEQ | GRE_KEY)) - return -EINVAL; - - /* ERSPAN Session ID only has 10-bit. Since we reuse - * 32-bit key field as ID, check it's range. - */ - if (data[IFLA_GRE_OKEY] && - (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) - return -EINVAL; - - return 0; -} - -static int ipgre_netlink_parms(struct net_device *dev, - struct nlattr *data[], - struct nlattr *tb[], - struct ip_tunnel_parm *parms) -{ - struct ip_tunnel *t = netdev_priv(dev); - - memset(parms, 0, sizeof(*parms)); - - parms->iph.protocol = IPPROTO_GRE; - - if (!data) - return 0; - - if (data[IFLA_GRE_LINK]) - parms->link = nla_get_u32(data[IFLA_GRE_LINK]); - - if (data[IFLA_GRE_IFLAGS]) - parms->i_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_IFLAGS])); - - if (data[IFLA_GRE_OFLAGS]) - parms->o_flags = gre_flags_to_tnl_flags(nla_get_be16(data[IFLA_GRE_OFLAGS])); - - if (data[IFLA_GRE_IKEY]) - parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]); - - if (data[IFLA_GRE_OKEY]) - parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]); - - if (data[IFLA_GRE_LOCAL]) - parms->iph.saddr = nla_get_in_addr(data[IFLA_GRE_LOCAL]); - - if (data[IFLA_GRE_REMOTE]) - parms->iph.daddr = nla_get_in_addr(data[IFLA_GRE_REMOTE]); - - if (data[IFLA_GRE_TTL]) - parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]); - - if (data[IFLA_GRE_TOS]) - parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]); - - if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) { - if (t->ignore_df) - return -EINVAL; - parms->iph.frag_off = htons(IP_DF); - } - - if (data[IFLA_GRE_COLLECT_METADATA]) { - t->collect_md = true; - if (dev->type == ARPHRD_IPGRE) - dev->type = ARPHRD_NONE; - } - - if (data[IFLA_GRE_IGNORE_DF]) { - if (nla_get_u8(data[IFLA_GRE_IGNORE_DF]) - && (parms->iph.frag_off & htons(IP_DF))) - return -EINVAL; - t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); - } - - if (data[IFLA_GRE_ERSPAN_INDEX]) { - t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); - - if (t->index & ~INDEX_MASK) - return -EINVAL; - } - - return 0; -} - -static int gre_tap_init(struct net_device *dev) -{ - __gre_tunnel_init(dev); - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - - return ip_tunnel_init(dev); -} - -static netdev_tx_t gre_dev_xmit(struct sk_buff *skb, struct net_device *dev) -{ - /* Drop All packets coming from networking stack. OVS-CB is - * not initialized for these packets. - */ - - dev_kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -} - -static netdev_tx_t erspan_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - bool truncate = false; - - if (tunnel->collect_md) { - erspan_fb_xmit(skb, dev, skb->protocol); - return NETDEV_TX_OK; - } - - if (gre_handle_offloads(skb, false)) - goto free_skb; - - if (skb_cow_head(skb, dev->needed_headroom)) - goto free_skb; - - if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); - truncate = true; - } - - /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) - erspan_build_header(skb, ntohl(tunnel->parms.o_key), - tunnel->index, - truncate, true); - else if (tunnel->erspan_ver == 2) - erspan_build_header_v2(skb, ntohl(tunnel->parms.o_key), - tunnel->dir, tunnel->hwid, - truncate, true); - else - goto free_skb; - - tunnel->parms.o_flags &= ~TUNNEL_KEY; - __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); - return NETDEV_TX_OK; - -free_skb: - kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -} - -static netdev_tx_t __erspan_fb_xmit(struct sk_buff *skb) -{ - erspan_fb_xmit(skb, skb->dev, skb->protocol); - return NETDEV_TX_OK; -} - -int ovs_gre_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) -{ - struct ip_tunnel_info *info = skb_tunnel_info(skb); - struct rtable *rt; - struct flowi4 fl4; - - if (ip_tunnel_info_af(info) != AF_INET) - return -EINVAL; - - rt = gre_get_rt(skb, dev, &fl4, &info->key); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - ip_rt_put(rt); - info->key.u.ipv4.src = fl4.saddr; - return 0; -} -EXPORT_SYMBOL_GPL(ovs_gre_fill_metadata_dst); - -static int erspan_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - - tunnel->tun_hlen = 8; - tunnel->parms.iph.protocol = IPPROTO_GRE; - tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + - erspan_hdr_len(tunnel->erspan_ver); - - dev->features |= GRE_FEATURES; - dev->hw_features |= GRE_FEATURES; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - netif_keep_dst(dev); - - return ip_tunnel_init(dev); -} - -static const struct net_device_ops gre_tap_netdev_ops = { - .ndo_init = gre_tap_init, - .ndo_uninit = rpl_ip_tunnel_uninit, - .ndo_start_xmit = gre_dev_xmit, - .ndo_set_mac_address = eth_mac_addr, - .ndo_validate_addr = eth_validate_addr, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = ip_tunnel_change_mtu, -#else - .ndo_change_mtu = ip_tunnel_change_mtu, -#endif - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_NDO_GET_IFLINK - .ndo_get_iflink = rpl_ip_tunnel_get_iflink, -#endif -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = gre_fill_metadata_dst, -#endif -}; - -static const struct net_device_ops erspan_netdev_ops = { - .ndo_init = erspan_tunnel_init, - .ndo_uninit = rpl_ip_tunnel_uninit, - .ndo_start_xmit = erspan_xmit, - .ndo_set_mac_address = eth_mac_addr, - .ndo_validate_addr = eth_validate_addr, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = ip_tunnel_change_mtu, -#else - .ndo_change_mtu = ip_tunnel_change_mtu, -#endif - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_NDO_GET_IFLINK - .ndo_get_iflink = rpl_ip_tunnel_get_iflink, -#endif -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = gre_fill_metadata_dst, -#endif -}; - -static void ipgre_tap_setup(struct net_device *dev) -{ - ether_setup(dev); -#ifdef HAVE_NET_DEVICE_MAX_MTU - dev->max_mtu = 0; -#endif - dev->netdev_ops = &gre_tap_netdev_ops; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - ip_tunnel_setup(dev, gre_tap_net_id); -} - -static void erspan_setup(struct net_device *dev) -{ - struct ip_tunnel *t = netdev_priv(dev); - - eth_hw_addr_random(dev); - ether_setup(dev); -#ifdef HAVE_NET_DEVICE_MAX_MTU - dev->max_mtu = 0; -#endif - dev->netdev_ops = &erspan_netdev_ops; - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - ip_tunnel_setup(dev, erspan_net_id); - t->erspan_ver = 1; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int ipgre_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - struct ip_tunnel_parm p; - int err; - - ipgre_netlink_parms(dev, data, tb, &p); - err = ip_tunnel_newlink(dev, tb, &p); - return err; - -} - -static size_t ipgre_get_size(const struct net_device *dev) -{ - return - /* IFLA_GRE_LINK */ - nla_total_size(4) + - /* IFLA_GRE_IFLAGS */ - nla_total_size(2) + - /* IFLA_GRE_OFLAGS */ - nla_total_size(2) + - /* IFLA_GRE_IKEY */ - nla_total_size(4) + - /* IFLA_GRE_OKEY */ - nla_total_size(4) + - /* IFLA_GRE_LOCAL */ - nla_total_size(4) + - /* IFLA_GRE_REMOTE */ - nla_total_size(4) + - /* IFLA_GRE_TTL */ - nla_total_size(1) + - /* IFLA_GRE_TOS */ - nla_total_size(1) + - /* IFLA_GRE_PMTUDISC */ - nla_total_size(1) + - /* IFLA_GRE_ENCAP_TYPE */ - nla_total_size(2) + - /* IFLA_GRE_ENCAP_FLAGS */ - nla_total_size(2) + - /* IFLA_GRE_ENCAP_SPORT */ - nla_total_size(2) + - /* IFLA_GRE_ENCAP_DPORT */ - nla_total_size(2) + - /* IFLA_GRE_COLLECT_METADATA */ - nla_total_size(0) + - /* IFLA_GRE_ERSPAN_INDEX */ - nla_total_size(4) + - /* IFLA_GRE_ERSPAN_VER */ - nla_total_size(1) + - /* IFLA_GRE_ERSPAN_DIR */ - nla_total_size(1) + - /* IFLA_GRE_ERSPAN_HWID */ - nla_total_size(2) + - 0; -} - -static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct ip_tunnel *t = netdev_priv(dev); - struct ip_tunnel_parm *p = &t->parms; - - if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) || - nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) || - nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) || - nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) || - nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) || - nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) || - nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) || - nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || - nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || - nla_put_u8(skb, IFLA_GRE_PMTUDISC, - !!(p->iph.frag_off & htons(IP_DF)))) - goto nla_put_failure; - - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) - goto nla_put_failure; - - if (t->erspan_ver == 1) { - if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) - goto nla_put_failure; - } else if (t->erspan_ver == 2) { - if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) - goto nla_put_failure; - if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) - goto nla_put_failure; - } - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static const struct nla_policy ipgre_policy[RPL_IFLA_GRE_MAX + 1] = { - [IFLA_GRE_LINK] = { .type = NLA_U32 }, - [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, - [IFLA_GRE_OFLAGS] = { .type = NLA_U16 }, - [IFLA_GRE_IKEY] = { .type = NLA_U32 }, - [IFLA_GRE_OKEY] = { .type = NLA_U32 }, - [IFLA_GRE_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, - [IFLA_GRE_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, - [IFLA_GRE_TTL] = { .type = NLA_U8 }, - [IFLA_GRE_TOS] = { .type = NLA_U8 }, - [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 }, - [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, - [IFLA_GRE_ERSPAN_VER] = { .type = NLA_U8 }, - [IFLA_GRE_ERSPAN_DIR] = { .type = NLA_U8 }, - [IFLA_GRE_ERSPAN_HWID] = { .type = NLA_U16 }, -}; - -static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { - .kind = "ovs_gretap", - .maxtype = RPL_IFLA_GRE_MAX, - .policy = ipgre_policy, - .priv_size = sizeof(struct ip_tunnel), - .setup = ipgre_tap_setup, - .validate = ipgre_tap_validate, - .newlink = ipgre_newlink, - .dellink = ip_tunnel_dellink, - .get_size = ipgre_get_size, - .fill_info = ipgre_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = ip_tunnel_get_link_net, -#endif -}; - -static struct rtnl_link_ops erspan_link_ops __read_mostly = { - .kind = "erspan", - .maxtype = RPL_IFLA_GRE_MAX, - .policy = ipgre_policy, - .priv_size = sizeof(struct ip_tunnel), - .setup = erspan_setup, - .validate = erspan_validate, - .newlink = ipgre_newlink, - .dellink = ip_tunnel_dellink, - .get_size = ipgre_get_size, - .fill_info = ipgre_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = ip_tunnel_get_link_net, -#endif -}; - -struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name, - u8 name_assign_type) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - LIST_HEAD(list_kill); - struct ip_tunnel *t; - int err; - - memset(&tb, 0, sizeof(tb)); - - dev = rtnl_create_link(net, (char *)name, name_assign_type, - &ipgre_tap_ops, tb); - if (IS_ERR(dev)) - return dev; - - t = netdev_priv(dev); - t->collect_md = true; - /* Configure flow based GRE device. */ -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - err = ipgre_newlink(net, dev, tb, NULL, NULL); -#else - err = ipgre_newlink(net, dev, tb, NULL); -#endif - if (err < 0) { - free_netdev(dev); - return ERR_PTR(err); - } - - /* openvswitch users expect packet sizes to be unrestricted, - * so set the largest MTU we can. - */ - err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); - if (err) - goto out; - - return dev; -out: - ip_tunnel_dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(rpl_gretap_fb_dev_create); - -static int __net_init erspan_init_net(struct net *net) -{ - return ip_tunnel_init_net(net, erspan_net_id, - &erspan_link_ops, NULL); -} - -static void __net_exit erspan_exit_net(struct net *net) -{ - struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); - - ip_tunnel_delete_net(itn, &erspan_link_ops); -} - -static struct pernet_operations erspan_net_ops = { - .init = erspan_init_net, - .exit = erspan_exit_net, - .id = &erspan_net_id, - .size = sizeof(struct ip_tunnel_net), -}; - -static int __net_init ipgre_tap_init_net(struct net *net) -{ - return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "ovs-gretap0"); -} - -static void __net_exit ipgre_tap_exit_net(struct net *net) -{ - struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id); - - ip_tunnel_delete_net(itn, &ipgre_tap_ops); -} - -static struct pernet_operations ipgre_tap_net_ops = { - .init = ipgre_tap_init_net, - .exit = ipgre_tap_exit_net, - .id = &gre_tap_net_id, - .size = sizeof(struct ip_tunnel_net), -}; - -static struct net_device *erspan_fb_dev_create(struct net *net, - const char *name, - u8 name_assign_type) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - LIST_HEAD(list_kill); - struct ip_tunnel *t; - int err; - - memset(&tb, 0, sizeof(tb)); - - dev = rtnl_create_link(net, (char *)name, name_assign_type, - &erspan_link_ops, tb); - if (IS_ERR(dev)) - return dev; - - t = netdev_priv(dev); - t->collect_md = true; - /* Configure flow based GRE device. */ -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS - err = ipgre_newlink(net, dev, tb, NULL, NULL); -#else - err = ipgre_newlink(net, dev, tb, NULL); -#endif - if (err < 0) { - free_netdev(dev); - return ERR_PTR(err); - } - - /* openvswitch users expect packet sizes to be unrestricted, - * so set the largest MTU we can. - */ - err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false); - if (err) - goto out; - - return dev; -out: - ip_tunnel_dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return ERR_PTR(err); -} - -static struct vport_ops ovs_erspan_vport_ops; - -static struct vport *erspan_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct net_device *dev; - struct vport *vport; - int err; - - vport = ovs_vport_alloc(0, &ovs_erspan_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - rtnl_lock(); - dev = erspan_fb_dev_create(net, parms->name, NET_NAME_USER); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_PTR(err); - } - - rtnl_unlock(); - return vport; -} - -static struct vport *erspan_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = erspan_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_erspan_vport_ops = { - .type = OVS_VPORT_TYPE_ERSPAN, - .create = erspan_create, - .send = __erspan_fb_xmit, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = gre_fill_metadata_dst, -#endif - .destroy = ovs_netdev_tunnel_destroy, -}; - -static struct vport_ops ovs_ipgre_vport_ops; - -static struct vport *ipgre_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct net_device *dev; - struct vport *vport; - int err; - - vport = ovs_vport_alloc(0, &ovs_ipgre_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - rtnl_lock(); - dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_PTR(err); - } - - rtnl_unlock(); - return vport; -} - -static struct vport *ipgre_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = ipgre_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_ipgre_vport_ops = { - .type = OVS_VPORT_TYPE_GRE, - .create = ipgre_create, - .send = gre_fb_xmit, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = gre_fill_metadata_dst, -#endif - .destroy = ovs_netdev_tunnel_destroy, -}; - -int rpl_ipgre_init(void) -{ - int err; - - err = register_pernet_device(&ipgre_tap_net_ops); - if (err < 0) { - if (err == -EEXIST) - goto ip_gre_loaded; - else - goto pnet_tap_failed; - } - - err = register_pernet_device(&erspan_net_ops); - if (err < 0) { - if (err == -EEXIST) - goto ip_gre_loaded; - else - goto pnet_erspan_failed; - } - - err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); - if (err < 0) { - pr_info("%s: can't add protocol\n", __func__); - if (err == -EBUSY) { - goto ip_gre_loaded; - } else { - goto add_proto_failed; - } - } - - pr_info("GRE over IPv4 tunneling driver\n"); - ovs_vport_ops_register(&ovs_ipgre_vport_ops); - ovs_vport_ops_register(&ovs_erspan_vport_ops); - return 0; - -ip_gre_loaded: - /* Since GRE only allows single receiver to be registerd, - * we skip here so only gre transmit works, see: - * - * commit 9f57c67c379d88a10e8ad676426fee5ae7341b14 - * Author: Pravin B Shelar <pshelar@nicira.com> - * Date: Fri Aug 7 23:51:52 2015 -0700 - * gre: Remove support for sharing GRE protocol hook - * - * OVS GRE receive part is disabled. - */ - pr_info("GRE TX only over IPv4 tunneling driver\n"); - ip_gre_loaded = true; - ovs_vport_ops_register(&ovs_ipgre_vport_ops); - ovs_vport_ops_register(&ovs_erspan_vport_ops); - return 0; - -add_proto_failed: - unregister_pernet_device(&erspan_net_ops); -pnet_erspan_failed: - unregister_pernet_device(&ipgre_tap_net_ops); -pnet_tap_failed: - pr_err("Error while initializing GRE %d\n", err); - return err; -} - -void rpl_ipgre_fini(void) -{ - ovs_vport_ops_unregister(&ovs_erspan_vport_ops); - ovs_vport_ops_unregister(&ovs_ipgre_vport_ops); - - if (!ip_gre_loaded) { - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); - unregister_pernet_device(&erspan_net_ops); - unregister_pernet_device(&ipgre_tap_net_ops); - } -} - -#endif diff --git a/datapath/linux/compat/ip_output.c b/datapath/linux/compat/ip_output.c deleted file mode 100644 index e2f869f9a..000000000 --- a/datapath/linux/compat/ip_output.c +++ /dev/null @@ -1,418 +0,0 @@ -/* - * IP fragmentation backport, heavily based on linux/net/ipv4/ip_output.c, - * copied from Linux ae7ef81ef000 ("skbuff: introduce skb_gso_validate_mtu") - * - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * The Internet Protocol (IP) output module. - * - * Authors: Ross Biro - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Donald Becker, <becker@super.org> - * Alan Cox, <Alan.Cox@linux.org> - * Richard Underwood - * Stefan Becker, <stefanb@yello.ping.de> - * Jorge Cwik, <jorge@laser.satlink.net> - * Arnt Gulbrandsen, <agulbra@nvg.unit.no> - * Hirokazu Takahashi, <taka@valinux.co.jp> - * - * See ip_input.c for original log - * - * Fixes: - * Alan Cox : Missing nonblock feature in ip_build_xmit. - * Mike Kilburn : htons() missing in ip_build_xmit. - * Bradford Johnson: Fix faulty handling of some frames when - * no route is found. - * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit - * (in case if packet not accepted by - * output firewall rules) - * Mike McLagan : Routing by source - * Alexey Kuznetsov: use new route cache - * Andi Kleen: Fix broken PMTU recovery and remove - * some redundant tests. - * Vitaly E. Lavrov : Transparent proxy revived after year coma. - * Andi Kleen : Replace ip_reply with ip_send_reply. - * Andi Kleen : Split fast and slow ip_build_xmit path - * for decreased register pressure on x86 - * and more readibility. - * Marc Boucher : When call_out_firewall returns FW_QUEUE, - * silently drop skb instead of failing with -EPERM. - * Detlev Wengorz : Copy protocol for fragments. - * Hirokazu Takahashi: HW checksumming for outgoing UDP - * datagrams. - * Hirokazu Takahashi: sendfile() on UDP works now. - */ - -#ifndef HAVE_CORRECT_MRU_HANDLING -#include <asm/uaccess.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/highmem.h> -#include <linux/slab.h> - -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/proc_fs.h> -#include <linux/stat.h> -#include <linux/init.h> - -#include <net/snmp.h> -#include <net/ip.h> -#include <net/protocol.h> -#include <net/route.h> -#include <net/xfrm.h> -#include <linux/skbuff.h> -#include <net/sock.h> -#include <net/arp.h> -#include <net/icmp.h> -#include <net/checksum.h> -#include <net/inetpeer.h> -#include <linux/igmp.h> -#include <linux/netfilter_ipv4.h> -#include <linux/netfilter_bridge.h> -#include <linux/netlink.h> -#include <linux/tcp.h> - -static inline void rpl_ip_options_fragment(struct sk_buff *skb) -{ - unsigned char *optptr = skb_network_header(skb) + sizeof(struct iphdr); - struct ip_options *opt = &(IPCB(skb)->opt); - int l = opt->optlen; - int optlen; - - while (l > 0) { - switch (*optptr) { - case IPOPT_END: - return; - case IPOPT_NOOP: - l--; - optptr++; - continue; - } - optlen = optptr[1]; - if (optlen < 2 || optlen > l) - return; - if (!IPOPT_COPIED(*optptr)) - memset(optptr, IPOPT_NOOP, optlen); - l -= optlen; - optptr += optlen; - } - opt->ts = 0; - opt->rr = 0; - opt->rr_needaddr = 0; - opt->ts_needaddr = 0; - opt->ts_needtime = 0; -} -#define ip_options_fragment rpl_ip_options_fragment - -static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) -{ - to->pkt_type = from->pkt_type; - to->priority = from->priority; - to->protocol = from->protocol; - skb_dst_drop(to); - skb_dst_copy(to, from); - to->dev = from->dev; - to->mark = from->mark; - - /* Copy the flags to each fragment. */ - IPCB(to)->flags = IPCB(from)->flags; - -#ifdef CONFIG_NET_SCHED - to->tc_index = from->tc_index; -#endif - nf_copy(to, from); -#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE) - to->ipvs_property = from->ipvs_property; -#endif - skb_copy_secmark(to, from); -} - -#ifdef HAVE_IP_DO_FRAGMENT_USING_NET -#define OUTPUT(net, sk, skb) output(net, sk, skb) -#elif defined(HAVE_IP_FRAGMENT_TAKES_SOCK) -#define OUTPUT(net, sk, skb) output(sk, skb) -#else -#define OUTPUT(net, sk, skb) output(skb) -#endif - -/* - * This IP datagram is too large to be sent in one piece. Break it up into - * smaller pieces (each of size equal to IP header plus - * a block of the data of the original IP data part) that will yet fit in a - * single device frame, and queue such a frame for sending. - */ - -int rpl_ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - int (*output)(OVS_VPORT_OUTPUT_PARAMS)) -{ - struct iphdr *iph; - int ptr; - struct net_device *dev; - struct sk_buff *skb2; - unsigned int mtu, hlen, left, len, ll_rs; - int offset; - __be16 not_last_frag; - struct rtable *rt = skb_rtable(skb); - int err = 0; - - dev = rt->dst.dev; - - /* for offloaded checksums cleanup checksum before fragmentation */ - if (skb->ip_summed == CHECKSUM_PARTIAL && - (err = skb_checksum_help(skb))) - goto fail; - - /* - * Point into the IP datagram header. - */ - - iph = ip_hdr(skb); - - mtu = ip_skb_dst_mtu(skb); - if (IPCB(skb)->frag_max_size && IPCB(skb)->frag_max_size < mtu) - mtu = IPCB(skb)->frag_max_size; - - /* - * Setup starting values. - */ - - hlen = iph->ihl * 4; - mtu = mtu - hlen; /* Size of data space */ - IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; - - /* When frag_list is given, use it. First, check its validity: - * some transformers could create wrong frag_list or break existing - * one, it is not prohibited. In this case fall back to copying. - * - * LATER: this step can be merged to real generation of fragments, - * we can switch to copy when see the first bad fragment. - */ - if (skb_has_frag_list(skb)) { - struct sk_buff *frag, *frag2; - int first_len = skb_pagelen(skb); - - if (first_len - hlen > mtu || - ((first_len - hlen) & 7) || - ip_is_fragment(iph) || - skb_cloned(skb)) - goto slow_path; - - skb_walk_frags(skb, frag) { - /* Correct geometry. */ - if (frag->len > mtu || - ((frag->len & 7) && frag->next) || - skb_headroom(frag) < hlen) - goto slow_path_clean; - - /* Partially cloned skb? */ - if (skb_shared(frag)) - goto slow_path_clean; - - BUG_ON(frag->sk); - if (skb->sk) { - frag->sk = skb->sk; - frag->destructor = sock_wfree; - } - skb->truesize -= frag->truesize; - } - - /* Everything is OK. Generate! */ - - err = 0; - offset = 0; - frag = skb_shinfo(skb)->frag_list; - skb_frag_list_init(skb); - skb->data_len = first_len - skb_headlen(skb); - skb->len = first_len; - iph->tot_len = htons(first_len); - iph->frag_off = htons(IP_MF); - ip_send_check(iph); - - for (;;) { - /* Prepare header of the next frame, - * before previous one went down. */ - if (frag) { - frag->ip_summed = CHECKSUM_NONE; - skb_reset_transport_header(frag); - __skb_push(frag, hlen); - skb_reset_network_header(frag); - memcpy(skb_network_header(frag), iph, hlen); - iph = ip_hdr(frag); - iph->tot_len = htons(frag->len); - ip_copy_metadata(frag, skb); - if (offset == 0) - ip_options_fragment(frag); - offset += skb->len - hlen; - iph->frag_off = htons(offset>>3); - if (frag->next) - iph->frag_off |= htons(IP_MF); - /* Ready, complete checksum */ - ip_send_check(iph); - } - - err = OUTPUT(net, sk, skb); - - if (!err) - IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); - if (err || !frag) - break; - - skb = frag; - frag = skb->next; - skb->next = NULL; - } - - if (err == 0) { - IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); - return 0; - } - - while (frag) { - skb = frag->next; - kfree_skb(frag); - frag = skb; - } - IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); - return err; - -slow_path_clean: - skb_walk_frags(skb, frag2) { - if (frag2 == frag) - break; - frag2->sk = NULL; - frag2->destructor = NULL; - skb->truesize += frag2->truesize; - } - } - -slow_path: - iph = ip_hdr(skb); - - left = skb->len - hlen; /* Space per frame */ - ptr = hlen; /* Where to start from */ - - ll_rs = LL_RESERVED_SPACE(rt->dst.dev); - - /* - * Fragment the datagram. - */ - - offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; - not_last_frag = iph->frag_off & htons(IP_MF); - - /* - * Keep copying data until we run out. - */ - - while (left > 0) { - len = left; - /* IF: it doesn't fit, use 'mtu' - the data space left */ - if (len > mtu) - len = mtu; - /* IF: we are not sending up to and including the packet end - then align the next start on an eight byte boundary */ - if (len < left) { - len &= ~7; - } - - /* Allocate buffer */ - skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC); - if (!skb2) { - err = -ENOMEM; - goto fail; - } - - /* - * Set up data on packet - */ - - ip_copy_metadata(skb2, skb); - skb_reserve(skb2, ll_rs); - skb_put(skb2, len + hlen); - skb_reset_network_header(skb2); - skb2->transport_header = skb2->network_header + hlen; - - /* - * Charge the memory for the fragment to any owner - * it might possess - */ - - if (skb->sk) - skb_set_owner_w(skb2, skb->sk); - - /* - * Copy the packet header into the new buffer. - */ - - skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen); - - /* - * Copy a block of the IP datagram. - */ - if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len)) - BUG(); - left -= len; - - /* - * Fill in the new header fields. - */ - iph = ip_hdr(skb2); - iph->frag_off = htons((offset >> 3)); - - if (IPCB(skb)->flags & IPSKB_FRAG_PMTU) - iph->frag_off |= htons(IP_DF); - - /* ANK: dirty, but effective trick. Upgrade options only if - * the segment to be fragmented was THE FIRST (otherwise, - * options are already fixed) and make it ONCE - * on the initial skb, so that all the following fragments - * will inherit fixed options. - */ - if (offset == 0) - ip_options_fragment(skb); - - /* - * Added AC : If we are fragmenting a fragment that's not the - * last fragment then keep MF on each bit - */ - if (left > 0 || not_last_frag) - iph->frag_off |= htons(IP_MF); - ptr += len; - offset += len; - - /* - * Put this fragment into the sending queue. - */ - iph->tot_len = htons(len + hlen); - - ip_send_check(iph); - - err = OUTPUT(net, sk, skb2); - if (err) - goto fail; - - IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES); - } - consume_skb(skb); - IP_INC_STATS(net, IPSTATS_MIB_FRAGOKS); - return err; - -fail: - kfree_skb(skb); - IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); - return err; -} -EXPORT_SYMBOL(rpl_ip_do_fragment); - -#endif /* HAVE_CORRECT_MRU_HANDLING */ diff --git a/datapath/linux/compat/ip_tunnel.c b/datapath/linux/compat/ip_tunnel.c deleted file mode 100644 index e7a039358..000000000 --- a/datapath/linux/compat/ip_tunnel.c +++ /dev/null @@ -1,776 +0,0 @@ -/* - * Copyright (c) 2013,2018 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/capability.h> -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/uaccess.h> -#include <linux/skbuff.h> -#include <linux/netdevice.h> -#include <linux/in.h> -#include <linux/tcp.h> -#include <linux/udp.h> -#include <linux/if_arp.h> -#include <linux/mroute.h> -#include <linux/init.h> -#include <linux/in6.h> -#include <linux/inetdevice.h> -#include <linux/igmp.h> -#include <linux/netfilter_ipv4.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <linux/rculist.h> -#include <linux/err.h> - -#include <net/sock.h> -#include <net/ip.h> -#include <net/icmp.h> -#include <net/protocol.h> -#include <net/ip_tunnels.h> -#include <net/arp.h> -#include <net/checksum.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/xfrm.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/rtnetlink.h> -#include <net/dst_metadata.h> - -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/ip6_fib.h> -#include <net/ip6_route.h> -#endif - -#include "compat.h" - -#ifndef USE_UPSTREAM_TUNNEL -const struct ip_tunnel_encap_ops __rcu * - rpl_iptun_encaps[MAX_IPTUN_ENCAP_OPS] __read_mostly; - -static unsigned int rpl_ip_tunnel_hash(__be32 key, __be32 remote) -{ - return hash_32((__force u32)key ^ (__force u32)remote, - IP_TNL_HASH_BITS); -} - -static bool rpl_ip_tunnel_key_match(const struct ip_tunnel_parm *p, - __be16 flags, __be32 key) -{ - if (p->i_flags & TUNNEL_KEY) { - if (flags & TUNNEL_KEY) - return key == p->i_key; - else - /* key expected, none present */ - return false; - } else - return !(flags & TUNNEL_KEY); -} - -static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn, - struct ip_tunnel_parm *parms) -{ - unsigned int h; - __be32 remote; - __be32 i_key = parms->i_key; - - if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr)) - remote = parms->iph.daddr; - else - remote = 0; - - if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI)) - i_key = 0; - - h = rpl_ip_tunnel_hash(i_key, remote); - return &itn->tunnels[h]; -} - -static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) -{ - struct hlist_head *head = ip_bucket(itn, &t->parms); - - if (t->collect_md) - rcu_assign_pointer(itn->collect_md_tun, t); - hlist_add_head_rcu(&t->hash_node, head); -} - -static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) -{ - if (t->collect_md) - rcu_assign_pointer(itn->collect_md_tun, NULL); - hlist_del_init_rcu(&t->hash_node); -} - -static struct net_device *__ip_tunnel_create(struct net *net, - const struct rtnl_link_ops *ops, - struct ip_tunnel_parm *parms) -{ - int err; - struct ip_tunnel *tunnel; - struct net_device *dev; - char name[IFNAMSIZ]; - - if (parms->name[0]) - strlcpy(name, parms->name, IFNAMSIZ); - else { - if (strlen(ops->kind) > (IFNAMSIZ - 3)) { - err = -E2BIG; - goto failed; - } - strlcpy(name, ops->kind, IFNAMSIZ); - strncat(name, "%d", 2); - } - - ASSERT_RTNL(); - dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup); - if (!dev) { - err = -ENOMEM; - goto failed; - } - dev_net_set(dev, net); - - dev->rtnl_link_ops = ops; - - tunnel = netdev_priv(dev); - tunnel->parms = *parms; - tunnel->net = net; - - err = register_netdevice(dev); - if (err) - goto failed_free; - - return dev; - -failed_free: - free_netdev(dev); -failed: - return ERR_PTR(err); -} - -static inline void init_tunnel_flow(struct flowi4 *fl4, - int proto, - __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif) -{ - memset(fl4, 0, sizeof(*fl4)); - fl4->flowi4_oif = oif; - fl4->daddr = daddr; - fl4->saddr = saddr; - fl4->flowi4_tos = tos; - fl4->flowi4_proto = proto; - fl4->fl4_gre_key = key; -} - -static int ip_tunnel_bind_dev(struct net_device *dev) -{ - struct net_device *tdev = NULL; - struct ip_tunnel *tunnel = netdev_priv(dev); - const struct iphdr *iph; - int hlen = LL_MAX_HEADER; - int mtu = ETH_DATA_LEN; - int t_hlen = tunnel->hlen + sizeof(struct iphdr); - - iph = &tunnel->parms.iph; - - /* Guess output device to choose reasonable mtu and needed_headroom */ - if (iph->daddr) { - struct flowi4 fl4; - struct rtable *rt; - - init_tunnel_flow(&fl4, iph->protocol, iph->daddr, - iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link); - rt = ip_route_output_key(tunnel->net, &fl4); - - if (!IS_ERR(rt)) { - tdev = rt->dst.dev; - ip_rt_put(rt); - } - if (dev->type != ARPHRD_ETHER) - dev->flags |= IFF_POINTOPOINT; - - dst_cache_reset(&tunnel->dst_cache); - } - - if (!tdev && tunnel->parms.link) - tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); - - if (tdev) { - hlen = tdev->hard_header_len + tdev->needed_headroom; - mtu = tdev->mtu; - } - - dev->needed_headroom = t_hlen + hlen; - mtu -= (dev->hard_header_len + t_hlen); - - if (mtu < 68) - mtu = 68; - - return mtu; -} - -int rpl___ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - int t_hlen = tunnel->hlen + sizeof(struct iphdr); - int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen; - - if (new_mtu < 68) - return -EINVAL; - - if (new_mtu > max_mtu) { - if (strict) - return -EINVAL; - - new_mtu = max_mtu; - } - - dev->mtu = new_mtu; - return 0; -} - -int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu) -{ - return rpl___ip_tunnel_change_mtu(dev, new_mtu, true); -} - -static int rpl_tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, - struct rtable *rt, __be16 df, - const struct iphdr *inner_iph) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len; - int mtu; - - if (df) - mtu = dst_mtu(&rt->dst) - dev->hard_header_len - - sizeof(struct iphdr) - tunnel->hlen; - else - mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - - if (skb_valid_dst(skb)) -#ifndef HAVE_DST_OPS_CONFIRM_NEIGH - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu); -#else - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), - NULL, skb, mtu, false); -#endif - - if (skb->protocol == htons(ETH_P_IP)) { - if (!skb_is_gso(skb) && - (inner_iph->frag_off & htons(IP_DF)) && - mtu < pkt_size) { - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - return -E2BIG; - } - } -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { - struct rt6_info *rt6; - - rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) : - NULL; - - if (rt6 && mtu < dst_mtu(skb_dst(skb)) && - mtu >= IPV6_MIN_MTU) { - if ((tunnel->parms.iph.daddr && - !ipv4_is_multicast(tunnel->parms.iph.daddr)) || - rt6->rt6i_dst.plen == 128) { - rt6->rt6i_flags |= RTF_MODIFIED; - dst_metric_set(skb_dst(skb), RTAX_MTU, mtu); - } - } - - if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU && - mtu < pkt_size) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - return -E2BIG; - } - } -#endif - return 0; -} - -void rpl_ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, - const struct iphdr *tnl_params, const u8 protocol) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - const struct iphdr *inner_iph; - struct flowi4 fl4; - u8 tos, ttl; - __be16 df; - struct rtable *rt; /* Route to the other host */ - unsigned int max_headroom; /* The extra header space needed */ - __be32 dst; - bool connected; - - inner_iph = (const struct iphdr *)skb_inner_network_header(skb); - connected = (tunnel->parms.iph.daddr != 0); - - dst = tnl_params->daddr; - if (dst == 0) { - /* NBMA tunnel */ - - if (skb_dst(skb) == NULL) { - dev->stats.tx_fifo_errors++; - goto tx_error; - } - - if (skb->protocol == htons(ETH_P_IP)) { - rt = skb_rtable(skb); - dst = rt_nexthop(rt, inner_iph->daddr); - } -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) { - const struct in6_addr *addr6; - struct neighbour *neigh; - bool do_tx_error_icmp; - int addr_type; - - neigh = dst_neigh_lookup(skb_dst(skb), - &ipv6_hdr(skb)->daddr); - if (neigh == NULL) - goto tx_error; - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) == 0) - do_tx_error_icmp = true; - else { - do_tx_error_icmp = false; - dst = addr6->s6_addr32[3]; - } - neigh_release(neigh); - if (do_tx_error_icmp) - goto tx_error_icmp; - } -#endif - else - goto tx_error; - - connected = false; - } - - tos = tnl_params->tos; - if (tos & 0x1) { - tos &= ~0x1; - if (skb->protocol == htons(ETH_P_IP)) { - tos = inner_iph->tos; - connected = false; - } else if (skb->protocol == htons(ETH_P_IPV6)) { - tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); - connected = false; - } - } - - init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); - - if (ovs_ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) - goto tx_error; - - rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) : - NULL; - - if (!rt) { - rt = ip_route_output_key(tunnel->net, &fl4); - - if (IS_ERR(rt)) { - dev->stats.tx_carrier_errors++; - goto tx_error; - } - if (connected) - dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, - fl4.saddr); - } - - if (rt->dst.dev == dev) { - ip_rt_put(rt); - dev->stats.collisions++; - goto tx_error; - } - - if (rpl_tnl_update_pmtu(dev, skb, rt, - tnl_params->frag_off, inner_iph)) { - ip_rt_put(rt); - goto tx_error; - } - - if (tunnel->err_count > 0) { - if (time_before(jiffies, - tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { - tunnel->err_count--; - - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - dst_link_failure(skb); - } else - tunnel->err_count = 0; - } - - tos = ip_tunnel_ecn_encap(tos, inner_iph, skb); - ttl = tnl_params->ttl; - if (ttl == 0) { - if (skb->protocol == htons(ETH_P_IP)) - ttl = inner_iph->ttl; -#if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) - ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit; -#endif - else - ttl = ip4_dst_hoplimit(&rt->dst); - } - - df = tnl_params->frag_off; - if (skb->protocol == htons(ETH_P_IP)) - df |= (inner_iph->frag_off&htons(IP_DF)); - - max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr) - + rt->dst.header_len; - if (max_headroom > dev->needed_headroom) - dev->needed_headroom = max_headroom; - - if (skb_cow_head(skb, dev->needed_headroom)) { - ip_rt_put(rt); - dev->stats.tx_dropped++; - kfree_skb(skb); - return; - } - - iptunnel_xmit(skb->sk, rt, skb, fl4.saddr, fl4.daddr, protocol, - tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); - - return; - -#if IS_ENABLED(CONFIG_IPV6) -tx_error_icmp: - dst_link_failure(skb); -#endif -tx_error: - dev->stats.tx_errors++; - kfree_skb(skb); -} -EXPORT_SYMBOL_GPL(rpl_ip_tunnel_xmit); - -static void ip_tunnel_dev_free(struct net_device *dev) -{ - free_percpu(dev->tstats); -#ifndef HAVE_NEEDS_FREE_NETDEV - free_netdev(dev); -#endif -} - -void rpl_ip_tunnel_dellink(struct net_device *dev, struct list_head *head) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct ip_tunnel_net *itn; - - itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); - - if (itn->fb_tunnel_dev != dev) { - ip_tunnel_del(itn, netdev_priv(dev)); - unregister_netdevice_queue(dev, head); - } -} - -int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id, - struct rtnl_link_ops *ops, char *devname) -{ - struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id); - struct ip_tunnel_parm parms; - unsigned int i; - - for (i = 0; i < IP_TNL_HASH_SIZE; i++) - INIT_HLIST_HEAD(&itn->tunnels[i]); - - if (!ops) { - itn->fb_tunnel_dev = NULL; - return 0; - } - - memset(&parms, 0, sizeof(parms)); - if (devname) - strlcpy(parms.name, devname, IFNAMSIZ); - - rtnl_lock(); - itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms); - /* FB netdevice is special: we have one, and only one per netns. - * * Allowing to move it to another netns is clearly unsafe. - * */ - if (!IS_ERR(itn->fb_tunnel_dev)) { - itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; - itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev); - ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev)); - } - rtnl_unlock(); - - return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev); -} - -static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head, - struct rtnl_link_ops *ops) -{ - struct net *net = dev_net(itn->fb_tunnel_dev); - struct net_device *dev, *aux; - int h; - - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == ops) - unregister_netdevice_queue(dev, head); - - for (h = 0; h < IP_TNL_HASH_SIZE; h++) { - struct ip_tunnel *t; - struct hlist_node *n; - struct hlist_head *thead = &itn->tunnels[h]; - - hlist_for_each_entry_safe(t, n, thead, hash_node) - /* If dev is in the same netns, it has already - * been added to the list by the previous loop. - */ - if (!net_eq(dev_net(t->dev), net)) - unregister_netdevice_queue(t->dev, head); - } -} - -void rpl_ip_tunnel_delete_net(struct ip_tunnel_net *itn, - struct rtnl_link_ops *ops) -{ - LIST_HEAD(list); - - rtnl_lock(); - ip_tunnel_destroy(itn, &list, ops); - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -int rpl_ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p) -{ - struct ip_tunnel *nt; - struct net *net = dev_net(dev); - struct ip_tunnel_net *itn; - int mtu; - int err; - - nt = netdev_priv(dev); - itn = net_generic(net, nt->ip_tnl_net_id); - - if (nt->collect_md) { - if (rtnl_dereference(itn->collect_md_tun)) - return -EEXIST; - } else { - return -EOPNOTSUPP; - } - - nt->net = net; - nt->parms = *p; - err = register_netdevice(dev); - if (err) - goto out; - - if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS]) - eth_hw_addr_random(dev); - - mtu = ip_tunnel_bind_dev(dev); - if (!tb[IFLA_MTU]) - dev->mtu = mtu; - - ip_tunnel_add(itn, nt); -out: - return err; -} - -int rpl_ip_tunnel_init(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct iphdr *iph = &tunnel->parms.iph; - int err; - -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = ip_tunnel_dev_free; -#else - dev->needs_free_netdev = true; - dev->priv_destructor = ip_tunnel_dev_free; -#endif - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); - if (err) { - free_percpu(dev->tstats); - return err; - } - - err = gro_cells_init(&tunnel->gro_cells, dev); - if (err) { - dst_cache_destroy(&tunnel->dst_cache); - free_percpu(dev->tstats); - return err; - } - - tunnel->dev = dev; - tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); - iph->version = 4; - iph->ihl = 5; - - if (tunnel->collect_md) { - dev->features |= NETIF_F_NETNS_LOCAL; - netif_keep_dst(dev); - } - return 0; -} - -void rpl_ip_tunnel_uninit(struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - struct net *net = tunnel->net; - struct ip_tunnel_net *itn; - - itn = net_generic(net, tunnel->ip_tnl_net_id); - if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(itn, netdev_priv(dev)); -} - -/* Do least required initialization, rest of init is done in tunnel_init call */ -void rpl_ip_tunnel_setup(struct net_device *dev, int net_id) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - - tunnel->ip_tnl_net_id = net_id; -} - -int rpl_ip_tunnel_get_iflink(const struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - - return tunnel->parms.link; -} - -struct net *rpl_ip_tunnel_get_link_net(const struct net_device *dev) -{ - struct ip_tunnel *tunnel = netdev_priv(dev); - - return tunnel->net; -} - -struct ip_tunnel *rpl_ip_tunnel_lookup(struct ip_tunnel_net *itn, - int link, __be16 flags, - __be32 remote, __be32 local, - __be32 key) -{ - unsigned int hash; - struct ip_tunnel *t, *cand = NULL; - struct hlist_head *head; - - hash = rpl_ip_tunnel_hash(key, remote); - head = &itn->tunnels[hash]; - - hlist_for_each_entry_rcu(t, head, hash_node) { - if (local != t->parms.iph.saddr || - remote != t->parms.iph.daddr || - !(t->dev->flags & IFF_UP)) - continue; - - if (!rpl_ip_tunnel_key_match(&t->parms, flags, key)) - continue; - - if (t->parms.link == link) - return t; - else - cand = t; - } - - hlist_for_each_entry_rcu(t, head, hash_node) { - if (remote != t->parms.iph.daddr || - t->parms.iph.saddr != 0 || - !(t->dev->flags & IFF_UP)) - continue; - - if (!rpl_ip_tunnel_key_match(&t->parms, flags, key)) - continue; - - if (t->parms.link == link) - return t; - else if (!cand) - cand = t; - } - - hash = rpl_ip_tunnel_hash(key, 0); - head = &itn->tunnels[hash]; - - hlist_for_each_entry_rcu(t, head, hash_node) { - if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) && - (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) - continue; - - if (!(t->dev->flags & IFF_UP)) - continue; - - if (!rpl_ip_tunnel_key_match(&t->parms, flags, key)) - continue; - - if (t->parms.link == link) - return t; - else if (!cand) - cand = t; - } - - if (flags & TUNNEL_NO_KEY) - goto skip_key_lookup; - - hlist_for_each_entry_rcu(t, head, hash_node) { - if (t->parms.i_key != key || - t->parms.iph.saddr != 0 || - t->parms.iph.daddr != 0 || - !(t->dev->flags & IFF_UP)) - continue; - - if (t->parms.link == link) - return t; - else if (!cand) - cand = t; - } - -skip_key_lookup: - if (cand) - return cand; - - t = rcu_dereference(itn->collect_md_tun); - if (t) - return t; - - if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(itn->fb_tunnel_dev); - - - return NULL; -} -EXPORT_SYMBOL_GPL(rpl_ip_tunnel_lookup); - -#endif diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c deleted file mode 100644 index a3b1f7fc1..000000000 --- a/datapath/linux/compat/ip_tunnels_core.c +++ /dev/null @@ -1,330 +0,0 @@ -/* - * Copyright (c) 2007-2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/if_vlan.h> -#include <linux/in.h> -#include <linux/in_route.h> -#include <linux/inetdevice.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/kernel.h> -#include <linux/version.h> -#include <linux/workqueue.h> -#include <linux/rculist.h> -#include <net/ip_tunnels.h> -#include <net/ip6_tunnel.h> -#include <net/route.h> -#include <net/xfrm.h> - -#include "compat.h" -#include "gso.h" -#include "vport-netdev.h" - -#ifndef USE_UPSTREAM_TUNNEL -void rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 proto, __u8 tos, __u8 ttl, - __be16 df, bool xnet) -{ - struct net_device *dev = skb->dev; - int pkt_len = skb->len - skb_inner_network_offset(skb); - struct iphdr *iph; - int err; - - skb_scrub_packet(skb, xnet); - - skb_clear_hash(skb); - skb_dst_set(skb, &rt->dst); - -#if 0 - /* Do not clear ovs_skb_cb. It will be done in gso code. */ - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); -#endif - - /* Push down and install the IP header. */ - __skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - - iph = ip_hdr(skb); - - iph->version = 4; - iph->ihl = sizeof(struct iphdr) >> 2; - iph->frag_off = df; - iph->protocol = proto; - iph->tos = tos; - iph->daddr = dst; - iph->saddr = src; - iph->ttl = ttl; - -#ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY - __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); -#elif defined(HAVE_IP_SELECT_IDENT_USING_NET) - __ip_select_ident(dev_net(rt->dst.dev), iph, - skb_shinfo(skb)->gso_segs ?: 1); -#else - __ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1); -#endif - - err = ip_local_out(dev_net(rt->dst.dev), sk, skb); - if (unlikely(net_xmit_eval(err))) - pkt_len = 0; - iptunnel_xmit_stats(dev, pkt_len); -} -EXPORT_SYMBOL_GPL(rpl_iptunnel_xmit); - -int ovs_iptunnel_handle_offloads(struct sk_buff *skb, - int gso_type_mask, - void (*fix_segment)(struct sk_buff *)) -{ - int err; - - if (likely(!skb_is_encapsulated(skb))) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } else if (skb_is_gso(skb)) { - err = -ENOSYS; - goto error; - } - - if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - goto error; - skb_shinfo(skb)->gso_type |= gso_type_mask; - -#ifndef USE_UPSTREAM_TUNNEL_GSO - if (gso_type_mask) - fix_segment = NULL; - - OVS_GSO_CB(skb)->fix_segment = fix_segment; -#endif - return 0; - } - - if (skb->ip_summed != CHECKSUM_PARTIAL) { - skb->ip_summed = CHECKSUM_NONE; - skb->encapsulation = 0; - } - - return 0; -error: - return err; -} -EXPORT_SYMBOL_GPL(ovs_iptunnel_handle_offloads); - - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -struct sk_buff *rpl_iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, - int gso_type_mask) -#else -int rpl_iptunnel_handle_offloads(struct sk_buff *skb, - bool csum_help, - int gso_type_mask) -#endif -{ - int err; - - if (likely(!skb->encapsulation)) { - skb_reset_inner_headers(skb); - skb->encapsulation = 1; - } - - if (skb_is_gso(skb)) { - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - goto error; - skb_shinfo(skb)->gso_type |= gso_type_mask; - goto out; - } - - /* If packet is not gso and we are resolving any partial checksum, - * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL - * on the outer header without confusing devices that implement - * NETIF_F_IP_CSUM with encapsulation. - */ - if (csum_help) - skb->encapsulation = 0; - - if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) { - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } else if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_NONE; - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) -out: - return skb; -error: - kfree_skb(skb); - return ERR_PTR(err); -#else -out: -error: - return 0; -#endif -} -EXPORT_SYMBOL_GPL(rpl_iptunnel_handle_offloads); - -int rpl___iptunnel_pull_header(struct sk_buff *skb, int hdr_len, - __be16 inner_proto, bool raw_proto, bool xnet) -{ - if (unlikely(!pskb_may_pull(skb, hdr_len))) - return -ENOMEM; - - skb_pull_rcsum(skb, hdr_len); - - if (!raw_proto && inner_proto == htons(ETH_P_TEB)) { - struct ethhdr *eh; - - if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) - return -ENOMEM; - - eh = (struct ethhdr *)skb->data; - if (likely(eth_proto_is_802_3(eh->h_proto))) - skb->protocol = eh->h_proto; - else - skb->protocol = htons(ETH_P_802_2); - - } else { - skb->protocol = inner_proto; - } - - skb_clear_hash_if_not_l4(skb); - skb->vlan_tci = 0; - skb_set_queue_mapping(skb, 0); - skb_scrub_packet(skb, xnet); - - return iptunnel_pull_offloads(skb); -} -EXPORT_SYMBOL_GPL(rpl___iptunnel_pull_header); -#endif /* USE_UPSTREAM_TUNNEL */ - -bool ovs_skb_is_encapsulated(struct sk_buff *skb) -{ - /* checking for inner protocol should be sufficient on newer kernel, but - * old kernel just set encapsulation bit. - */ - return ovs_skb_get_inner_protocol(skb) || skb->encapsulation; -} -EXPORT_SYMBOL_GPL(ovs_skb_is_encapsulated); - -/* derived from ip_tunnel_rcv(). */ -void ovs_ip_tunnel_rcv(struct net_device *dev, struct sk_buff *skb, - struct metadata_dst *tun_dst) -{ - struct pcpu_sw_netstats *tstats; - - tstats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)dev->tstats); - u64_stats_update_begin(&tstats->syncp); - tstats->rx_packets++; - tstats->rx_bytes += skb->len; - u64_stats_update_end(&tstats->syncp); - - skb_reset_mac_header(skb); - skb_scrub_packet(skb, false); - skb->protocol = eth_type_trans(skb, dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); - -#ifndef USE_UPSTREAM_TUNNEL - netdev_port_receive(skb, &tun_dst->u.tun_info); -#else - netif_rx(skb); -#endif -} - -#ifndef HAVE_PCPU_SW_NETSTATS -#define netdev_stats_to_stats64 rpl_netdev_stats_to_stats64 -static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, - const struct net_device_stats *netdev_stats) -{ -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*stats64)); -#else - size_t i, n = sizeof(*stats64) / sizeof(u64); - const unsigned long *src = (const unsigned long *)netdev_stats; - u64 *dst = (u64 *)stats64; - - BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) != - sizeof(*stats64) / sizeof(u64)); - for (i = 0; i < n; i++) - dst[i] = src[i]; -#endif -} -#endif - -#if !defined(HAVE_VOID_NDO_GET_STATS64) && !defined(HAVE_RHEL7_MAX_MTU) -struct rtnl_link_stats64 *rpl_ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -#else -void rpl_ip_tunnel_get_stats64(struct net_device *dev, - struct rtnl_link_stats64 *tot) -#endif -{ - int i; - - netdev_stats_to_stats64(tot, &dev->stats); - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *tstats = - per_cpu_ptr((struct pcpu_sw_netstats __percpu *)dev->tstats, i); - u64 rx_packets, rx_bytes, tx_packets, tx_bytes; - unsigned int start; - - do { - start = u64_stats_fetch_begin_irq(&tstats->syncp); - rx_packets = tstats->rx_packets; - tx_packets = tstats->tx_packets; - rx_bytes = tstats->rx_bytes; - tx_bytes = tstats->tx_bytes; - } while (u64_stats_fetch_retry_irq(&tstats->syncp, start)); - - tot->rx_packets += rx_packets; - tot->tx_packets += tx_packets; - tot->rx_bytes += rx_bytes; - tot->tx_bytes += tx_bytes; - } - -#if !defined(HAVE_VOID_NDO_GET_STATS64) && !defined(HAVE_RHEL7_MAX_MTU) - return tot; -#endif -} - -void rpl_ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) -{ - int pkt_len, err; - - pkt_len = skb->len - skb_inner_network_offset(skb); -#ifdef HAVE_IP6_LOCAL_OUT_SK - err = ip6_local_out_sk(sk, skb); -#else - err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); -#endif - if (net_xmit_eval(err)) - pkt_len = -1; - - iptunnel_xmit_stats(dev, pkt_len); -} -EXPORT_SYMBOL_GPL(rpl_ip6tunnel_xmit); diff --git a/datapath/linux/compat/lisp.c b/datapath/linux/compat/lisp.c deleted file mode 100644 index 49c60f4ed..000000000 --- a/datapath/linux/compat/lisp.c +++ /dev/null @@ -1,816 +0,0 @@ -/* - * Copyright (c) 2015 Nicira, Inc. - * Copyright (c) 2013 Cisco Systems, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/version.h> - -#include <linux/etherdevice.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/module.h> -#include <linux/rculist.h> -#include <linux/udp.h> - -#include <net/icmp.h> -#include <net/ip.h> -#include <net/lisp.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/route.h> -#include <net/udp.h> -#include <net/udp_tunnel.h> -#include <net/xfrm.h> - -#include "datapath.h" -#include "gso.h" -#include "vport.h" -#include "vport-netdev.h" - -#define LISP_UDP_PORT 4341 -#define LISP_NETDEV_VER "0.1" -static int lisp_net_id; - -/* Pseudo network device */ -struct lisp_dev { - struct net *net; /* netns for packet i/o */ - struct net_device *dev; /* netdev for lisp tunnel */ - struct socket __rcu *sock; - __be16 dst_port; - struct list_head next; -}; - -/* per-network namespace private data for this module */ -struct lisp_net { - struct list_head lisp_list; -}; - -/* - * LISP encapsulation header: - * - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * |N|L|E|V|I|flags| Nonce/Map-Version | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * | Instance ID/Locator Status Bits | - * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - * - */ - -/** - * struct lisphdr - LISP header - * @nonce_present: Flag indicating the presence of a 24 bit nonce value. - * @locator_status_bits_present: Flag indicating the presence of Locator Status - * Bits (LSB). - * @solicit_echo_nonce: Flag indicating the use of the echo noncing mechanism. - * @map_version_present: Flag indicating the use of mapping versioning. - * @instance_id_present: Flag indicating the presence of a 24 bit Instance ID. - * @reserved_flags: 3 bits reserved for future flags. - * @nonce: 24 bit nonce value. - * @map_version: 24 bit mapping version. - * @locator_status_bits: Locator Status Bits: 32 bits when instance_id_present - * is not set, 8 bits when it is. - * @instance_id: 24 bit Instance ID - */ -struct lisphdr { -#ifdef __LITTLE_ENDIAN_BITFIELD - __u8 reserved_flags:3; - __u8 instance_id_present:1; - __u8 map_version_present:1; - __u8 solicit_echo_nonce:1; - __u8 locator_status_bits_present:1; - __u8 nonce_present:1; -#else - __u8 nonce_present:1; - __u8 locator_status_bits_present:1; - __u8 solicit_echo_nonce:1; - __u8 map_version_present:1; - __u8 instance_id_present:1; - __u8 reserved_flags:3; -#endif - union { - __u8 nonce[3]; - __u8 map_version[3]; - } u1; - union { - __be32 locator_status_bits; - struct { - __u8 instance_id[3]; - __u8 locator_status_bits; - } word2; - } u2; -}; - -#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr)) -#define LISP_MAX_MTU (IP_MAX_MTU - LISP_HLEN - sizeof(struct iphdr)) - -static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb) -{ - return (struct lisphdr *)(udp_hdr(skb) + 1); -} - -/* Convert 64 bit tunnel ID to 24 bit Instance ID. */ -static void tunnel_id_to_instance_id(__be64 tun_id, __u8 *iid) -{ - -#ifdef __BIG_ENDIAN - iid[0] = (__force __u8)(tun_id >> 16); - iid[1] = (__force __u8)(tun_id >> 8); - iid[2] = (__force __u8)tun_id; -#else - iid[0] = (__force __u8)((__force u64)tun_id >> 40); - iid[1] = (__force __u8)((__force u64)tun_id >> 48); - iid[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit Instance ID to 64 bit tunnel ID. */ -static __be64 instance_id_to_tunnel_id(__u8 *iid) -{ -#ifdef __BIG_ENDIAN - return (iid[0] << 16) | (iid[1] << 8) | iid[2]; -#else - return (__force __be64)(((__force u64)iid[0] << 40) | - ((__force u64)iid[1] << 48) | - ((__force u64)iid[2] << 56)); -#endif -} - -/* Compute source UDP port for outgoing packet. - * Currently we use the flow hash. - */ -static u16 get_src_port(struct net *net, struct sk_buff *skb) -{ - u32 hash = skb_get_hash(skb); - unsigned int range; - int high; - int low; - - if (!hash) { - if (skb->protocol == htons(ETH_P_IP)) { - struct iphdr *iph; - int size = (sizeof(iph->saddr) * 2) / sizeof(u32); - - iph = (struct iphdr *) skb_network_header(skb); - hash = jhash2((const u32 *)&iph->saddr, size, 0); - } else if (skb->protocol == htons(ETH_P_IPV6)) { - struct ipv6hdr *ipv6hdr; - - ipv6hdr = (struct ipv6hdr *) skb_network_header(skb); - hash = jhash2((const u32 *)&ipv6hdr->saddr, - (sizeof(struct in6_addr) * 2) / sizeof(u32), 0); - } else { - pr_warn_once("LISP inner protocol is not IP when " - "calculating hash.\n"); - } - } - - inet_get_local_port_range(net, &low, &high); - range = (high - low) + 1; - return (((u64) hash * range) >> 32) + low; -} - -static void lisp_build_header(struct sk_buff *skb, - const struct ip_tunnel_key *tun_key) -{ - struct lisphdr *lisph; - - lisph = (struct lisphdr *)__skb_push(skb, sizeof(struct lisphdr)); - lisph->nonce_present = 0; /* We don't support echo nonce algorithm */ - lisph->locator_status_bits_present = 1; /* Set LSB */ - lisph->solicit_echo_nonce = 0; /* No echo noncing */ - lisph->map_version_present = 0; /* No mapping versioning, nonce instead */ - lisph->instance_id_present = 1; /* Store the tun_id as Instance ID */ - lisph->reserved_flags = 0; /* Reserved flags, set to 0 */ - - lisph->u1.nonce[0] = 0; - lisph->u1.nonce[1] = 0; - lisph->u1.nonce[2] = 0; - - tunnel_id_to_instance_id(tun_key->tun_id, &lisph->u2.word2.instance_id[0]); - lisph->u2.word2.locator_status_bits = 1; -} - -/* Called with rcu_read_lock and BH disabled. */ -static int lisp_rcv(struct sock *sk, struct sk_buff *skb) -{ - struct lisp_dev *lisp_dev; - struct net_device *dev; - struct lisphdr *lisph; - struct iphdr *inner_iph; - struct metadata_dst *tun_dst; -#ifndef USE_UPSTREAM_TUNNEL - struct metadata_dst temp; -#endif - __be64 key; - struct ethhdr *ethh; - __be16 protocol; - - dev = rcu_dereference_sk_user_data(sk); - if (unlikely(!dev)) - goto error; - - lisp_dev = netdev_priv(dev); - if (iptunnel_pull_header(skb, LISP_HLEN, 0, - !net_eq(lisp_dev->net, dev_net(lisp_dev->dev)))) - goto error; - - lisph = lisp_hdr(skb); - - if (lisph->instance_id_present != 1) - key = 0; - else - key = instance_id_to_tunnel_id(&lisph->u2.word2.instance_id[0]); - - /* Save outer tunnel values */ -#ifndef USE_UPSTREAM_TUNNEL - tun_dst = &temp; - ovs_udp_tun_rx_dst(tun_dst, skb, AF_INET, TUNNEL_KEY, key, 0); -#else - tun_dst = udp_tun_rx_dst(skb, AF_INET, TUNNEL_KEY, key, 0); -#endif - /* Drop non-IP inner packets */ - inner_iph = (struct iphdr *)(lisph + 1); - switch (inner_iph->version) { - case 4: - protocol = htons(ETH_P_IP); - break; - case 6: - protocol = htons(ETH_P_IPV6); - break; - default: - goto error; - } - skb->protocol = protocol; - - /* Add Ethernet header */ - ethh = (struct ethhdr *)skb_push(skb, ETH_HLEN); - memset(ethh, 0, ETH_HLEN); - ethh->h_dest[0] = 0x02; - ethh->h_source[0] = 0x02; - ethh->h_proto = protocol; - - ovs_ip_tunnel_rcv(dev, skb, tun_dst); - goto out; - -error: - kfree_skb(skb); -out: - return 0; -} - -static struct rtable *lisp_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - const struct ip_tunnel_key *key) -{ - struct net *net = dev_net(dev); - - /* Route lookup */ - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = skb->mark; - fl->flowi4_proto = IPPROTO_UDP; - - return ip_route_output_key(net, fl); -} - -/* this is to handle the return type change in handle-offload - * functions. - */ -#if !defined(HAVE_UDP_TUNNEL_HANDLE_OFFLOAD_RET_SKB) || !defined(USE_UPSTREAM_TUNNEL) -static struct sk_buff * -__udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) -{ - int err; - - err = udp_tunnel_handle_offloads(skb, udp_csum); - if (err) { - kfree_skb(skb); - return NULL; - } - return skb; -} -#else -#define __udp_tunnel_handle_offloads udp_tunnel_handle_offloads -#endif - -netdev_tx_t rpl_lisp_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct lisp_dev *lisp_dev = netdev_priv(dev); - struct net *net = lisp_dev->net; - int network_offset = skb_network_offset(skb); - struct ip_tunnel_info *info; - struct ip_tunnel_key *tun_key; - __be16 src_port, dst_port; - struct rtable *rt; - int min_headroom; - struct socket *sock; - struct flowi4 fl; - __be16 df; - int err; - - info = skb_tunnel_info(skb); - if (unlikely(!info)) { - err = -EINVAL; - goto error; - } - - sock = rcu_dereference(lisp_dev->sock); - if (!sock) { - err = -EIO; - goto error; - } - - if (skb->protocol != htons(ETH_P_IP) && - skb->protocol != htons(ETH_P_IPV6)) { - err = 0; - goto error; - } - - tun_key = &info->key; - - rt = lisp_get_rt(skb, dev, &fl, tun_key); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + sizeof(struct iphdr) + LISP_HLEN; - - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - /* Reset l2 headers. */ - skb_pull(skb, network_offset); - skb_reset_mac_header(skb); - skb->vlan_tci = 0; - - if (skb_is_gso(skb) && skb_is_encapsulated(skb)) - goto err_free_rt; - - skb = __udp_tunnel_handle_offloads(skb, false); - if (!skb) - return NETDEV_TX_OK; - - src_port = htons(get_src_port(net, skb)); - dst_port = lisp_dev->dst_port; - - lisp_build_header(skb, tun_key); - - skb->ignore_df = 1; - - ovs_skb_set_inner_protocol(skb, skb->protocol); - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - udp_tunnel_xmit_skb(rt, sock->sk, skb, - fl.saddr, tun_key->u.ipv4.dst, - tun_key->tos, tun_key->ttl, - df, src_port, dst_port, false, true); - - return NETDEV_TX_OK; - -err_free_rt: - ip_rt_put(rt); -error: - kfree_skb(skb); - return NETDEV_TX_OK; -} -EXPORT_SYMBOL(rpl_lisp_xmit); - -/* Setup stats when device is created */ -static int lisp_init(struct net_device *dev) -{ - dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; -} - -static void lisp_uninit(struct net_device *dev) -{ - free_percpu(dev->tstats); -} - -static struct socket *create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -static int lisp_open(struct net_device *dev) -{ - struct lisp_dev *lisp = netdev_priv(dev); - struct udp_tunnel_sock_cfg tunnel_cfg; - struct net *net = lisp->net; - struct socket *sock; - - sock = create_sock(net, false, lisp->dst_port); - if (IS_ERR(sock)) - return PTR_ERR(sock); - - rcu_assign_pointer(lisp->sock, sock); - /* Mark socket as an encapsulation socket */ - memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); - tunnel_cfg.sk_user_data = dev; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = lisp_rcv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - return 0; -} - -static int lisp_stop(struct net_device *dev) -{ - struct lisp_dev *lisp = netdev_priv(dev); - struct socket *socket; - - socket = rtnl_dereference(lisp->sock); - if (!socket) - return 0; - - rcu_assign_pointer(lisp->sock, NULL); - - synchronize_net(); - udp_tunnel_sock_release(socket); - return 0; -} - -static netdev_tx_t lisp_dev_xmit(struct sk_buff *skb, struct net_device *dev) -{ -#ifdef USE_UPSTREAM_TUNNEL - return rpl_lisp_xmit(skb); -#else - /* Drop All packets coming from networking stack. OVS-CB is - * not initialized for these packets. - */ - - dev_kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -#endif -} - -static int lisp_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < 68 || new_mtu > LISP_MAX_MTU) - return -EINVAL; - - dev->mtu = new_mtu; - return 0; -} - -static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb, - struct ip_tunnel_info *info, - __be16 sport, __be16 dport) -{ - struct rtable *rt; - struct flowi4 fl4; - - rt = lisp_get_rt(skb, dev, &fl4, &info->key); - if (IS_ERR(rt)) - return PTR_ERR(rt); - ip_rt_put(rt); - - info->key.u.ipv4.src = fl4.saddr; - info->key.tp_src = sport; - info->key.tp_dst = dport; - return 0; -} - -int ovs_lisp_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) -{ - struct lisp_dev *lisp = netdev_priv(dev); - struct net *net = lisp->net; - struct ip_tunnel_info *info = skb_tunnel_info(skb); - __be16 sport, dport; - - sport = htons(get_src_port(net, skb)); - dport = lisp->dst_port; - - if (ip_tunnel_info_af(info) == AF_INET) - return egress_ipv4_tun_info(dev, skb, info, sport, dport); - return -EINVAL; -} -EXPORT_SYMBOL_GPL(ovs_lisp_fill_metadata_dst); - -static const struct net_device_ops lisp_netdev_ops = { - .ndo_init = lisp_init, - .ndo_uninit = lisp_uninit, - .ndo_get_stats64 = ip_tunnel_get_stats64, - .ndo_open = lisp_open, - .ndo_stop = lisp_stop, - .ndo_start_xmit = lisp_dev_xmit, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = lisp_change_mtu, -#else - .ndo_change_mtu = lisp_change_mtu, -#endif - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -#ifdef USE_UPSTREAM_TUNNEL -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = lisp_fill_metadata_dst, -#endif -#endif -}; - -static void lisp_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *drvinfo) -{ - strlcpy(drvinfo->version, LISP_NETDEV_VER, sizeof(drvinfo->version)); - strlcpy(drvinfo->driver, "lisp", sizeof(drvinfo->driver)); -} - -static const struct ethtool_ops lisp_ethtool_ops = { - .get_drvinfo = lisp_get_drvinfo, - .get_link = ethtool_op_get_link, -}; - -/* Info for udev, that this is a virtual tunnel endpoint */ -static struct device_type lisp_type = { - .name = "lisp", -}; - -/* Initialize the device structure. */ -static void lisp_setup(struct net_device *dev) -{ - ether_setup(dev); - - dev->netdev_ops = &lisp_netdev_ops; - dev->ethtool_ops = &lisp_ethtool_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; -#endif - - SET_NETDEV_DEVTYPE(dev, &lisp_type); - - dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL; - dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; - dev->features |= NETIF_F_RXCSUM; - dev->features |= NETIF_F_GSO_SOFTWARE; - - dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; -#ifdef USE_UPSTREAM_TUNNEL - netif_keep_dst(dev); -#endif - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; - eth_hw_addr_random(dev); -} - -static const struct nla_policy lisp_policy[IFLA_LISP_MAX + 1] = { - [IFLA_LISP_PORT] = { .type = NLA_U16 }, -}; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int lisp_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack __always_unused *extack) -#else -static int lisp_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) - return -EINVAL; - - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) - return -EADDRNOTAVAIL; - } - - return 0; -} - -static struct lisp_dev *find_dev(struct net *net, __be16 dst_port) -{ - struct lisp_net *ln = net_generic(net, lisp_net_id); - struct lisp_dev *dev; - - list_for_each_entry(dev, &ln->lisp_list, next) { - if (dev->dst_port == dst_port) - return dev; - } - return NULL; -} - -static int lisp_configure(struct net *net, struct net_device *dev, - __be16 dst_port) -{ - struct lisp_net *ln = net_generic(net, lisp_net_id); - struct lisp_dev *lisp = netdev_priv(dev); - int err; - - lisp->net = net; - lisp->dev = dev; - - lisp->dst_port = dst_port; - - if (find_dev(net, dst_port)) - return -EBUSY; - - err = lisp_change_mtu(dev, LISP_MAX_MTU); - if (err) - return err; - - err = register_netdevice(dev); - if (err) - return err; - - list_add(&lisp->next, &ln->lisp_list); - return 0; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int lisp_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack __always_unused *extack) -#else -static int lisp_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - __be16 dst_port = htons(LISP_UDP_PORT); - - if (data[IFLA_LISP_PORT]) - dst_port = nla_get_be16(data[IFLA_LISP_PORT]); - - return lisp_configure(net, dev, dst_port); -} - -static void lisp_dellink(struct net_device *dev, struct list_head *head) -{ - struct lisp_dev *lisp = netdev_priv(dev); - - list_del(&lisp->next); - unregister_netdevice_queue(dev, head); -} - -static size_t lisp_get_size(const struct net_device *dev) -{ - return nla_total_size(sizeof(__be32)); /* IFLA_LISP_PORT */ -} - -static int lisp_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct lisp_dev *lisp = netdev_priv(dev); - - if (nla_put_be16(skb, IFLA_LISP_PORT, lisp->dst_port)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static struct rtnl_link_ops lisp_link_ops __read_mostly = { - .kind = "lisp", - .maxtype = IFLA_LISP_MAX, - .policy = lisp_policy, - .priv_size = sizeof(struct lisp_dev), - .setup = lisp_setup, - .validate = lisp_validate, - .newlink = lisp_newlink, - .dellink = lisp_dellink, - .get_size = lisp_get_size, - .fill_info = lisp_fill_info, -}; - -struct net_device *rpl_lisp_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - int err; - - memset(tb, 0, sizeof(tb)); - dev = rtnl_create_link(net, (char *) name, name_assign_type, - &lisp_link_ops, tb); - if (IS_ERR(dev)) - return dev; - - err = lisp_configure(net, dev, htons(dst_port)); - if (err) { - free_netdev(dev); - return ERR_PTR(err); - } - return dev; -} -EXPORT_SYMBOL_GPL(rpl_lisp_dev_create_fb); - -static int lisp_init_net(struct net *net) -{ - struct lisp_net *ln = net_generic(net, lisp_net_id); - - INIT_LIST_HEAD(&ln->lisp_list); - return 0; -} - -static void lisp_exit_net(struct net *net) -{ - struct lisp_net *ln = net_generic(net, lisp_net_id); - struct lisp_dev *lisp, *next; - struct net_device *dev, *aux; - LIST_HEAD(list); - - rtnl_lock(); - - /* gather any lisp devices that were moved into this ns */ - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &lisp_link_ops) - unregister_netdevice_queue(dev, &list); - - list_for_each_entry_safe(lisp, next, &ln->lisp_list, next) { - /* If lisp->dev is in the same netns, it was already added - * to the lisp by the previous loop. - */ - if (!net_eq(dev_net(lisp->dev), net)) - unregister_netdevice_queue(lisp->dev, &list); - } - - /* unregister the devices gathered above */ - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations lisp_net_ops = { - .init = lisp_init_net, - .exit = lisp_exit_net, - .id = &lisp_net_id, - .size = sizeof(struct lisp_net), -}; - -int rpl_lisp_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&lisp_net_ops); - if (rc) - goto out1; - - rc = rtnl_link_register(&lisp_link_ops); - if (rc) - goto out2; - - pr_info("LISP tunneling driver\n"); - return 0; -out2: - unregister_pernet_subsys(&lisp_net_ops); -out1: - pr_err("Error while initializing LISP %d\n", rc); - return rc; -} - -void rpl_lisp_cleanup_module(void) -{ - rtnl_link_unregister(&lisp_link_ops); - unregister_pernet_subsys(&lisp_net_ops); -} diff --git a/datapath/linux/compat/netdevice.c b/datapath/linux/compat/netdevice.c deleted file mode 100644 index c0ffbbd31..000000000 --- a/datapath/linux/compat/netdevice.c +++ /dev/null @@ -1,167 +0,0 @@ -#include <linux/netdevice.h> -#include <linux/if_vlan.h> -#include <net/mpls.h> - -#include "gso.h" - -#ifdef OVS_USE_COMPAT_GSO_SEGMENTATION -struct sk_buff *rpl__skb_gso_segment(struct sk_buff *skb, - netdev_features_t features, - bool tx_path) -{ - int vlan_depth = ETH_HLEN; - __be16 type = skb->protocol; - __be16 skb_proto; - struct sk_buff *skb_gso; - - while (type == htons(ETH_P_8021Q)) { - struct vlan_hdr *vh; - - if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return ERR_PTR(-EINVAL); - - vh = (struct vlan_hdr *)(skb->data + vlan_depth); - type = vh->h_vlan_encapsulated_proto; - vlan_depth += VLAN_HLEN; - } - - if (eth_p_mpls(type)) - type = ovs_skb_get_inner_protocol(skb); - - /* this hack needed to get regular skb_gso_segment() */ - skb_proto = skb->protocol; - skb->protocol = type; - -#ifdef HAVE___SKB_GSO_SEGMENT -#undef __skb_gso_segment - skb_gso = __skb_gso_segment(skb, features, tx_path); -#else -#undef skb_gso_segment - skb_gso = skb_gso_segment(skb, features); -#endif - - skb->protocol = skb_proto; - return skb_gso; -} -EXPORT_SYMBOL_GPL(rpl__skb_gso_segment); - -#endif /* OVS_USE_COMPAT_GSO_SEGMENTATION */ - -#ifdef HAVE_UDP_OFFLOAD -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) -struct sk_buff **rpl_eth_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -{ - struct sk_buff *p, **pp = NULL; - struct ethhdr *eh, *eh2; - unsigned int hlen, off_eth; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_eth = skb_gro_offset(skb); - hlen = off_eth + sizeof(*eh); - eh = skb_gro_header_fast(skb, off_eth); - if (skb_gro_header_hard(skb, hlen)) { - eh = skb_gro_header_slow(skb, hlen, off_eth); - if (unlikely(!eh)) - goto out; - } - - flush = 0; - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - eh2 = (struct ethhdr *)(p->data + off_eth); - if (compare_ether_header(eh, eh2)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = eh->h_proto; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (ptype == NULL) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, sizeof(*eh)); - skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -int rpl_eth_gro_complete(struct sk_buff *skb, int nhoff) -{ - struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); - __be16 type = eh->h_proto; - struct packet_offload *ptype; - int err = -ENOSYS; - - if (skb->encapsulation) - skb_set_inner_mac_header(skb, nhoff); - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype != NULL) - err = ptype->callbacks.gro_complete(skb, nhoff + - sizeof(struct ethhdr)); - - rcu_read_unlock(); - return err; -} - -#endif -#endif /* HAVE_UDP_OFFLOAD */ - -#ifndef HAVE_RTNL_LINK_STATS64 -#undef dev_get_stats -struct rtnl_link_stats64 *rpl_dev_get_stats(struct net_device *dev, - struct rtnl_link_stats64 *storage) -{ - const struct net_device_stats *stats = dev_get_stats(dev); - -#define copy(s) storage->s = stats->s - - copy(rx_packets); - copy(tx_packets); - copy(rx_bytes); - copy(tx_bytes); - copy(rx_errors); - copy(tx_errors); - copy(rx_dropped); - copy(tx_dropped); - copy(multicast); - copy(collisions); - - copy(rx_length_errors); - copy(rx_over_errors); - copy(rx_crc_errors); - copy(rx_frame_errors); - copy(rx_fifo_errors); - copy(rx_missed_errors); - - copy(tx_aborted_errors); - copy(tx_carrier_errors); - copy(tx_fifo_errors); - copy(tx_heartbeat_errors); - copy(tx_window_errors); - - copy(rx_compressed); - copy(tx_compressed); - -#undef copy - return storage; -} -#endif diff --git a/datapath/linux/compat/nf_conncount.c b/datapath/linux/compat/nf_conncount.c deleted file mode 100644 index 97bdfb933..000000000 --- a/datapath/linux/compat/nf_conncount.c +++ /dev/null @@ -1,621 +0,0 @@ -/* - * Backported from upstream commit 5c789e131cbb ("netfilter: - * nf_conncount: Add list lock and gc worker, and RCU for init tree search") - * - * count the number of connections matching an arbitrary key. - * - * (C) 2017 Red Hat GmbH - * Author: Florian Westphal <fw@strlen.de> - * - * split from xt_connlimit.c: - * (c) 2000 Gerd Knorr <kraxel@bytesex.org> - * Nov 2002: Martin Bene <martin.bene@icomedias.com>: - * only ignore TIME_WAIT or gone connections - * (C) CC Computer Consultants GmbH, 2007 - */ -#ifndef HAVE_UPSTREAM_NF_CONNCOUNT - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/in.h> -#include <linux/in6.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> -#include <linux/slab.h> -#include <linux/list.h> -#include <linux/rbtree.h> -#include <linux/module.h> -#include <linux/random.h> -#include <linux/skbuff.h> -#include <linux/spinlock.h> -#include <linux/netfilter/nf_conntrack_tcp.h> -#include <linux/netfilter/x_tables.h> -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_count.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_tuple.h> -#include <net/netfilter/nf_conntrack_zones.h> - -#define CONNCOUNT_SLOTS 256U - -#define CONNCOUNT_GC_MAX_NODES 8 -#define MAX_KEYLEN 5 - -/* we will save the tuples of all connections we care about */ -struct nf_conncount_tuple { - struct list_head node; - struct nf_conntrack_tuple tuple; - struct nf_conntrack_zone zone; - int cpu; - u32 jiffies32; -}; - -struct nf_conncount_rb { - struct rb_node node; - struct nf_conncount_list list; - u32 key[MAX_KEYLEN]; - struct rcu_head rcu_head; -}; - -static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; - -struct nf_conncount_data { - unsigned int keylen; - struct rb_root root[CONNCOUNT_SLOTS]; - struct net *net; - struct work_struct gc_work; - unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)]; - unsigned int gc_tree; -}; - -static u_int32_t conncount_rnd __read_mostly; -static struct kmem_cache *conncount_rb_cachep __read_mostly; -static struct kmem_cache *conncount_conn_cachep __read_mostly; - -static inline bool already_closed(const struct nf_conn *conn) -{ - if (nf_ct_protonum(conn) == IPPROTO_TCP) - return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || - conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; - else - return false; -} - -static int key_diff(const u32 *a, const u32 *b, unsigned int klen) -{ - return memcmp(a, b, klen * sizeof(u32)); -} - -static void conn_free(struct nf_conncount_list *list, - struct nf_conncount_tuple *conn) -{ - lockdep_assert_held(&list->list_lock); - - list->count--; - list_del(&conn->node); - - kmem_cache_free(conncount_conn_cachep, conn); -} - -static const struct nf_conntrack_tuple_hash * -find_or_evict(struct net *net, struct nf_conncount_list *list, - struct nf_conncount_tuple *conn) -{ - const struct nf_conntrack_tuple_hash *found; - unsigned long a, b; - int cpu = raw_smp_processor_id(); - u32 age; - - found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); - if (found) - return found; - b = conn->jiffies32; - a = (u32)jiffies; - - /* conn might have been added just before by another cpu and - * might still be unconfirmed. In this case, nf_conntrack_find() - * returns no result. Thus only evict if this cpu added the - * stale entry or if the entry is older than two jiffies. - */ - age = a - b; - if (conn->cpu == cpu || age >= 2) { - conn_free(list, conn); - return ERR_PTR(-ENOENT); - } - - return ERR_PTR(-EAGAIN); -} - -static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - const struct nf_conntrack_tuple_hash *found; - struct nf_conncount_tuple *conn, *conn_n; - struct nf_conn *found_ct; - unsigned int collect = 0; - - /* check the saved connections */ - list_for_each_entry_safe(conn, conn_n, &list->head, node) { - if (collect > CONNCOUNT_GC_MAX_NODES) - break; - - found = find_or_evict(net, list, conn); - if (IS_ERR(found)) { - /* Not found, but might be about to be confirmed */ - if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && - nf_ct_zone_id(&conn->zone, conn->zone.dir) == - nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ - } else { - collect++; - } - continue; - } - - found_ct = nf_ct_tuplehash_to_ctrack(found); - - if (nf_ct_tuple_equal(&conn->tuple, tuple) && - nf_ct_zone_equal(found_ct, zone, zone->dir)) { - /* - * We should not see tuples twice unless someone hooks - * this into a table without "-p tcp --syn". - * - * Attempt to avoid a re-add in this case. - */ - nf_ct_put(found_ct); - return 0; - } else if (already_closed(found_ct)) { - /* - * we do not care about connections which are - * closed already -> ditch it - */ - nf_ct_put(found_ct); - conn_free(list, conn); - collect++; - continue; - } - - nf_ct_put(found_ct); - } - - if (WARN_ON_ONCE(list->count > INT_MAX)) - return -EOVERFLOW; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) - return -ENOMEM; - - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - list_add_tail(&conn->node, &list->head); - list->count++; - return 0; -} - -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - int ret; - - /* check the saved connections */ - spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); - spin_unlock_bh(&list->list_lock); - - return ret; -} - -static void nf_conncount_list_init(struct nf_conncount_list *list) -{ - spin_lock_init(&list->list_lock); - INIT_LIST_HEAD(&list->head); - list->count = 0; -} - -/* Return true if the list is empty. Must be called with BH disabled. */ -static bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) -{ - const struct nf_conntrack_tuple_hash *found; - struct nf_conncount_tuple *conn, *conn_n; - struct nf_conn *found_ct; - unsigned int collected = 0; - bool ret = false; - - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) - return false; - - list_for_each_entry_safe(conn, conn_n, &list->head, node) { - found = find_or_evict(net, list, conn); - if (IS_ERR(found)) { - if (PTR_ERR(found) == -ENOENT) - collected++; - continue; - } - - found_ct = nf_ct_tuplehash_to_ctrack(found); - if (already_closed(found_ct)) { - /* - * we do not care about connections which are - * closed already -> ditch it - */ - nf_ct_put(found_ct); - conn_free(list, conn); - collected++; - continue; - } - - nf_ct_put(found_ct); - if (collected > CONNCOUNT_GC_MAX_NODES) - break; - } - - if (!list->count) - ret = true; - spin_unlock(&list->list_lock); - - return ret; -} - -static void __tree_nodes_free(struct rcu_head *h) -{ - struct nf_conncount_rb *rbconn; - - rbconn = container_of(h, struct nf_conncount_rb, rcu_head); - kmem_cache_free(conncount_rb_cachep, rbconn); -} - -/* caller must hold tree nf_conncount_locks[] lock */ -static void tree_nodes_free(struct rb_root *root, - struct nf_conncount_rb *gc_nodes[], - unsigned int gc_count) -{ - struct nf_conncount_rb *rbconn; - - while (gc_count) { - rbconn = gc_nodes[--gc_count]; - spin_lock(&rbconn->list.list_lock); - if (!rbconn->list.count) { - rb_erase(&rbconn->node, root); - call_rcu(&rbconn->rcu_head, __tree_nodes_free); - } - spin_unlock(&rbconn->list.list_lock); - } -} - -static void schedule_gc_worker(struct nf_conncount_data *data, int tree) -{ - set_bit(tree, data->pending_trees); - schedule_work(&data->gc_work); -} - -static unsigned int -insert_tree(struct net *net, - struct nf_conncount_data *data, - struct rb_root *root, - unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; - struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; - struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - u8 keylen = data->keylen; - bool do_gc = true; - - spin_lock_bh(&nf_conncount_locks[hash]); -restart: - parent = NULL; - rbnode = &(root->rb_node); - while (*rbnode) { - int diff; - rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node); - - parent = *rbnode; - diff = key_diff(key, rbconn->key, keylen); - if (diff < 0) { - rbnode = &((*rbnode)->rb_left); - } else if (diff > 0) { - rbnode = &((*rbnode)->rb_right); - } else { - int ret; - - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); - if (ret) - count = 0; /* hotdrop */ - else - count = rbconn->list.count; - tree_nodes_free(root, gc_nodes, gc_count); - goto out_unlock; - } - - if (gc_count >= ARRAY_SIZE(gc_nodes)) - continue; - - if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) - gc_nodes[gc_count++] = rbconn; - } - - if (gc_count) { - tree_nodes_free(root, gc_nodes, gc_count); - schedule_gc_worker(data, hash); - gc_count = 0; - do_gc = false; - goto restart; - } - - /* expected case: match, insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; - - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } - - conn->tuple = *tuple; - conn->zone = *zone; - memcpy(rbconn->key, key, sizeof(u32) * keylen); - - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; - - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); -out_unlock: - spin_unlock_bh(&nf_conncount_locks[hash]); - return count; -} - -static unsigned int -count_tree(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - struct rb_root *root; - struct rb_node *parent; - struct nf_conncount_rb *rbconn; - unsigned int hash; - u8 keylen = data->keylen; - - hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS; - root = &data->root[hash]; - - parent = rcu_dereference_raw(root->rb_node); - while (parent) { - int diff; - - rbconn = rb_entry(parent, struct nf_conncount_rb, node); - - diff = key_diff(key, rbconn->key, keylen); - if (diff < 0) { - parent = rcu_dereference_raw(parent->rb_left); - } else if (diff > 0) { - parent = rcu_dereference_raw(parent->rb_right); - } else { - int ret; - - if (!tuple) { - nf_conncount_gc_list(net, &rbconn->list); - return rbconn->list.count; - } - - spin_lock_bh(&rbconn->list.list_lock); - /* Node might be about to be free'd. - * We need to defer to insert_tree() in this case. - */ - if (rbconn->list.count == 0) { - spin_unlock_bh(&rbconn->list.list_lock); - break; - } - - /* same source network -> be counted! */ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); - spin_unlock_bh(&rbconn->list.list_lock); - if (ret) - return 0; /* hotdrop */ - else - return rbconn->list.count; - } - } - - if (!tuple) - return 0; - - return insert_tree(net, data, root, hash, key, tuple, zone); -} - -static void tree_gc_worker(struct work_struct *work) -{ - struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work); - struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn; - struct rb_root *root; - struct rb_node *node; - unsigned int tree, next_tree, gc_count = 0; - - tree = data->gc_tree % CONNCOUNT_SLOTS; - root = &data->root[tree]; - - local_bh_disable(); - rcu_read_lock(); - for (node = rb_first(root); node != NULL; node = rb_next(node)) { - rbconn = rb_entry(node, struct nf_conncount_rb, node); - if (nf_conncount_gc_list(data->net, &rbconn->list)) - gc_count++; - } - rcu_read_unlock(); - local_bh_enable(); - - cond_resched(); - - spin_lock_bh(&nf_conncount_locks[tree]); - if (gc_count < ARRAY_SIZE(gc_nodes)) - goto next; /* do not bother */ - - gc_count = 0; - node = rb_first(root); - while (node != NULL) { - rbconn = rb_entry(node, struct nf_conncount_rb, node); - node = rb_next(node); - - if (rbconn->list.count > 0) - continue; - - gc_nodes[gc_count++] = rbconn; - if (gc_count >= ARRAY_SIZE(gc_nodes)) { - tree_nodes_free(root, gc_nodes, gc_count); - gc_count = 0; - } - } - - tree_nodes_free(root, gc_nodes, gc_count); -next: - - clear_bit(tree, data->pending_trees); - - next_tree = (tree + 1) % CONNCOUNT_SLOTS; - next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); - - if (next_tree < CONNCOUNT_SLOTS) { - data->gc_tree = next_tree; - schedule_work(work); - } - - spin_unlock_bh(&nf_conncount_locks[tree]); -} - -/* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. - */ -unsigned int rpl_nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) -{ - return count_tree(net, data, key, tuple, zone); -} -EXPORT_SYMBOL_GPL(rpl_nf_conncount_count); - -struct nf_conncount_data *rpl_nf_conncount_init(struct net *net, unsigned int family, - unsigned int keylen) -{ - struct nf_conncount_data *data; - int ret, i; - - if (keylen % sizeof(u32) || - keylen / sizeof(u32) > MAX_KEYLEN || - keylen == 0) - return ERR_PTR(-EINVAL); - - net_get_random_once(&conncount_rnd, sizeof(conncount_rnd)); - - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); - - ret = nf_ct_netns_get(net, family); - if (ret < 0) { - kfree(data); - return ERR_PTR(ret); - } - - for (i = 0; i < ARRAY_SIZE(data->root); ++i) - data->root[i] = RB_ROOT; - - data->keylen = keylen / sizeof(u32); - data->net = net; - INIT_WORK(&data->gc_work, tree_gc_worker); - - return data; -} -EXPORT_SYMBOL_GPL(rpl_nf_conncount_init); - -static void nf_conncount_cache_free(struct nf_conncount_list *list) -{ - struct nf_conncount_tuple *conn, *conn_n; - - list_for_each_entry_safe(conn, conn_n, &list->head, node) - kmem_cache_free(conncount_conn_cachep, conn); -} - -static void destroy_tree(struct rb_root *r) -{ - struct nf_conncount_rb *rbconn; - struct rb_node *node; - - while ((node = rb_first(r)) != NULL) { - rbconn = rb_entry(node, struct nf_conncount_rb, node); - - rb_erase(node, r); - - nf_conncount_cache_free(&rbconn->list); - - kmem_cache_free(conncount_rb_cachep, rbconn); - } -} - -void rpl_nf_conncount_destroy(struct net *net, unsigned int family, - struct nf_conncount_data *data) -{ - unsigned int i; - - cancel_work_sync(&data->gc_work); - nf_ct_netns_put(net, family); - - for (i = 0; i < ARRAY_SIZE(data->root); ++i) - destroy_tree(&data->root[i]); - - kfree(data); -} -EXPORT_SYMBOL_GPL(rpl_nf_conncount_destroy); - -int rpl_nf_conncount_modinit(void) -{ - int i; - - for (i = 0; i < CONNCOUNT_SLOTS; ++i) - spin_lock_init(&nf_conncount_locks[i]); - - conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", - sizeof(struct nf_conncount_tuple), - 0, 0, NULL); - if (!conncount_conn_cachep) - return -ENOMEM; - - conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", - sizeof(struct nf_conncount_rb), - 0, 0, NULL); - if (!conncount_rb_cachep) { - kmem_cache_destroy(conncount_conn_cachep); - return -ENOMEM; - } - - return 0; -} - -void rpl_nf_conncount_modexit(void) -{ - kmem_cache_destroy(conncount_conn_cachep); - kmem_cache_destroy(conncount_rb_cachep); -} - -#endif /* HAVE_UPSTREAM_NF_CONNCOUNT */ diff --git a/datapath/linux/compat/nf_conntrack_core.c b/datapath/linux/compat/nf_conntrack_core.c deleted file mode 100644 index a7d3d4331..000000000 --- a/datapath/linux/compat/nf_conntrack_core.c +++ /dev/null @@ -1,13 +0,0 @@ -#include <linux/version.h> - -#ifndef HAVE_NF_CT_ZONE_INIT - -#include <net/netfilter/nf_conntrack_zones.h> - -/* Built-in default zone used e.g. by modules. */ -const struct nf_conntrack_zone nf_ct_zone_dflt = { - .id = NF_CT_DEFAULT_ZONE_ID, - .dir = NF_CT_DEFAULT_ZONE_DIR, -}; - -#endif /* HAVE_NF_CT_ZONE_INIT */ diff --git a/datapath/linux/compat/nf_conntrack_proto.c b/datapath/linux/compat/nf_conntrack_proto.c deleted file mode 100644 index fe291dbf2..000000000 --- a/datapath/linux/compat/nf_conntrack_proto.c +++ /dev/null @@ -1,114 +0,0 @@ -#include <linux/types.h> - -#include <net/netfilter/nf_conntrack.h> -#ifdef HAVE_NF_CONNTRACK_L3PROATO_H -#include <net/netfilter/nf_conntrack_l3proto.h> -#endif - -/* - * Upstream net-next commmit 7e35ec0e8044 - * ("netfilter: conntrack: move nf_ct_netns_{get,put}() to core") - * is introduced in v4.15, and it supports NFPROTO_INET in - * nf_ct_netns_{get,put}() that OVS conntrack uses this feature. - * - * However, we only need this feature if the underlying nf_conntrack_l3proto - * supports net_ns_get/put. Thus, we just mock the functions if - * HAVE_NET_NS_GET is false. - */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) -#ifdef HAVE_NET_NS_GET -static int nf_ct_netns_do_get(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - int ret; - - might_sleep(); - - ret = nf_ct_l3proto_try_module_get(nfproto); - if (ret < 0) - return ret; - - /* we already have a reference, can't fail */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (!l3proto->net_ns_get) - return 0; - - ret = l3proto->net_ns_get(net); - if (ret < 0) - nf_ct_l3proto_module_put(nfproto); - - return ret; -} - -int rpl_nf_ct_netns_get(struct net *net, u8 nfproto) -{ - int err; - - if (nfproto == NFPROTO_INET) { - err = nf_ct_netns_do_get(net, NFPROTO_IPV4); - if (err < 0) - goto err1; - err = nf_ct_netns_do_get(net, NFPROTO_IPV6); - if (err < 0) - goto err2; - } else { - err = nf_ct_netns_do_get(net, nfproto); - if (err < 0) - goto err1; - } - return 0; - -err2: - nf_ct_netns_put(net, NFPROTO_IPV4); -err1: - return err; -} -EXPORT_SYMBOL_GPL(rpl_nf_ct_netns_get); - -static void nf_ct_netns_do_put(struct net *net, u8 nfproto) -{ - const struct nf_conntrack_l3proto *l3proto; - - might_sleep(); - - /* same as nf_conntrack_netns_get(), reference assumed */ - rcu_read_lock(); - l3proto = __nf_ct_l3proto_find(nfproto); - rcu_read_unlock(); - - if (WARN_ON(!l3proto)) - return; - - if (l3proto->net_ns_put) - l3proto->net_ns_put(net); - - nf_ct_l3proto_module_put(nfproto); -} - -void rpl_nf_ct_netns_put(struct net *net, uint8_t nfproto) -{ - if (nfproto == NFPROTO_INET) { - nf_ct_netns_do_put(net, NFPROTO_IPV4); - nf_ct_netns_do_put(net, NFPROTO_IPV6); - } else - nf_ct_netns_do_put(net, nfproto); -} -EXPORT_SYMBOL_GPL(rpl_nf_ct_netns_put); - -#else /* !HAVE_NET_NS_GET */ -void rpl_nf_ct_netns_put(struct net *net, uint8_t nfproto) -{ -} -EXPORT_SYMBOL_GPL(rpl_nf_ct_netns_put); - -int rpl_nf_ct_netns_get(struct net *net, u8 nfproto) -{ - return 0; -} -EXPORT_SYMBOL_GPL(rpl_nf_ct_netns_get); - -#endif /* HAVE_NET_NS_GET */ -#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) */ diff --git a/datapath/linux/compat/nf_conntrack_reasm.c b/datapath/linux/compat/nf_conntrack_reasm.c deleted file mode 100644 index 77b4b2548..000000000 --- a/datapath/linux/compat/nf_conntrack_reasm.c +++ /dev/null @@ -1,740 +0,0 @@ -/* - * Backported from upstream commit 5b490047240f - * ("ipv6: Export nf_ct_frag6_gather()") - * - * IPv6 fragment reassembly for connection tracking - * - * Copyright (C)2004 USAGI/WIDE Project - * - * Author: - * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> - * - * Based on: net/ipv6/reassembly.c - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) "IPv6-nf: " fmt - -#include <linux/version.h> - -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/sockios.h> -#include <linux/jiffies.h> -#include <linux/net.h> -#include <linux/list.h> -#include <linux/netdevice.h> -#include <linux/in6.h> -#include <linux/ipv6.h> -#include <linux/icmpv6.h> -#include <linux/random.h> -#include <linux/slab.h> - -#include <net/sock.h> -#include <net/snmp.h> -#include <net/inet_frag.h> - -#include <net/ipv6.h> -#include <net/ipv6_frag.h> -#include <net/protocol.h> -#include <net/transp_v6.h> -#include <net/rawv6.h> -#include <net/ndisc.h> -#include <net/addrconf.h> -#include <net/inet_ecn.h> -#include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv6.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <net/netfilter/ipv6/nf_defrag_ipv6.h> -#include <net/netns/generic.h> -#include "datapath.h" - -#if defined(HAVE_INET_FRAGS_WITH_FRAGS_WORK) || !defined(HAVE_INET_FRAGS_RND) - -static const char nf_frags_cache_name[] = "ovs-frag6"; - -#endif - -#ifdef OVS_NF_DEFRAG6_BACKPORT -struct nf_ct_frag6_skb_cb -{ - struct inet6_skb_parm h; - int offset; -}; - -#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb*)((skb)->cb)) - -static struct inet_frags nf_frags; - -static struct netns_frags *get_netns_frags6_from_net(struct net *net) -{ -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - return &(ovs_net->nf_frags); -#else - return &(net->nf_frag.frags); -#endif -} - -static struct net *get_net_from_netns_frags6(struct netns_frags *frags) -{ - struct net *net; -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct ovs_net *ovs_net; - - ovs_net = container_of(frags, struct ovs_net, nf_frags); - net = ovs_net->net; -#else - net = container_of(frags, struct net, nf_frag.frags); -#endif - return net; -} - -static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) -{ - return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); -} - -#ifdef HAVE_INET_FRAGS_RND -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, - const struct in6_addr *daddr) -{ - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), - (__force u32)id, nf_frags.rnd); -} -/* fb3cfe6e75b9 ("inet: frag: remove hash size assumptions from callers") - * shifted this logic into inet_fragment, but prior kernels still need this. - */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,17,0) -#define nf_hash_frag(a, b, c) (nf_hash_frag(a, b, c) & (INETFRAGS_HASHSZ - 1)) -#endif - -#ifdef HAVE_INET_FRAGS_CONST -static unsigned int nf_hashfn(const struct inet_frag_queue *q) -#else -static unsigned int nf_hashfn(struct inet_frag_queue *q) -#endif -{ - const struct frag_queue *nq; - - nq = container_of(q, struct frag_queue, q); - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); -} - -#endif /* HAVE_INET_FRAGS_RND */ -static void nf_ct_frag6_expire(unsigned long data) -{ - struct frag_queue *fq; - struct net *net; - - fq = container_of((struct inet_frag_queue *)data, struct frag_queue, q); - net = get_net_from_netns_frags6(fq->q.net); - -#ifdef HAVE_INET_FRAGS_RND - ip6_expire_frag_queue(net, fq, &nf_frags); -#else -#ifdef HAVE_IPV6_FRAG_H - ip6frag_expire_frag_queue(net, fq); -#else - ip6_expire_frag_queue(net, fq); -#endif -#endif -} - -#ifdef HAVE_INET_FRAGS_RND -/* Creation primitives. */ -static inline struct frag_queue *fq_find(struct net *net, __be32 id, - u32 user, struct in6_addr *src, - struct in6_addr *dst, u8 ecn) -{ - struct inet_frag_queue *q; - struct ip6_create_arg arg; - unsigned int hash; - struct netns_frags *frags; - - arg.id = id; - arg.user = user; - arg.src = src; - arg.dst = dst; - arg.ecn = ecn; - -#ifdef HAVE_INET_FRAGS_WITH_RWLOCK - read_lock_bh(&nf_frags.lock); -#else - local_bh_disable(); -#endif - hash = nf_hash_frag(id, src, dst); - - frags = get_netns_frags6_from_net(net); - q = inet_frag_find(frags, &nf_frags, &arg, hash); - local_bh_enable(); - if (IS_ERR_OR_NULL(q)) { - inet_frag_maybe_warn_overflow(q, pr_fmt()); - return NULL; - } - return container_of(q, struct frag_queue, q); -} -#else -static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, - const struct ipv6hdr *hdr, int iif) -{ - struct frag_v6_compare_key key = { - .id = id, - .saddr = hdr->saddr, - .daddr = hdr->daddr, - .user = user, - .iif = iif, - }; - struct inet_frag_queue *q; - - q = inet_frag_find(&net->nf_frag.frags, &key); - if (!q) - return NULL; - - return container_of(q, struct frag_queue, q); -} - -#endif /* HAVE_INET_FRAGS_RND */ - -static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - const struct frag_hdr *fhdr, int nhoff) -{ - struct sk_buff *prev, *next; - unsigned int payload_len; - int offset, end; - u8 ecn; - - if (qp_flags(fq) & INET_FRAG_COMPLETE) { - pr_debug("Already completed\n"); - goto err; - } - - payload_len = ntohs(ipv6_hdr(skb)->payload_len); - - offset = ntohs(fhdr->frag_off) & ~0x7; - end = offset + (payload_len - - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); - - if ((unsigned int)end > IPV6_MAXPLEN) { - pr_debug("offset is too large.\n"); - return -1; - } - - ecn = ip6_frag_ecn(ipv6_hdr(skb)); - - if (skb->ip_summed == CHECKSUM_COMPLETE) { - const unsigned char *nh = skb_network_header(skb); - skb->csum = csum_sub(skb->csum, - csum_partial(nh, (u8 *)(fhdr + 1) - nh, - 0)); - } - - /* Is this the final fragment? */ - if (!(fhdr->frag_off & htons(IP6_MF))) { - /* If we already have some bits beyond end - * or have different end, the segment is corrupted. - */ - if (end < fq->q.len || - ((qp_flags(fq) & INET_FRAG_LAST_IN) && end != fq->q.len)) { - pr_debug("already received last fragment\n"); - goto err; - } - qp_flags(fq) |= INET_FRAG_LAST_IN; - fq->q.len = end; - } else { - /* Check if the fragment is rounded to 8 bytes. - * Required by the RFC. - */ - if (end & 0x7) { - /* RFC2460 says always send parameter problem in - * this case. -DaveM - */ - pr_debug("end of fragment not rounded to 8 bytes.\n"); - return -1; - } - if (end > fq->q.len) { - /* Some bits beyond end -> corruption. */ - if (qp_flags(fq) & INET_FRAG_LAST_IN) { - pr_debug("last packet already reached.\n"); - goto err; - } - fq->q.len = end; - } - } - - if (end == offset) - goto err; - - /* Point into the IP datagram 'data' part. */ - if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) { - pr_debug("queue: message is too short.\n"); - goto err; - } - if (pskb_trim_rcsum(skb, end - offset)) { - pr_debug("Can't trim\n"); - goto err; - } - - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = fq->q.fragments_tail; - if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = fq->q.fragments; next != NULL; next = next->next) { - if (NFCT_FRAG6_CB(next)->offset >= offset) - break; /* bingo! */ - prev = next; - } - -found: - /* RFC5722, Section 4: - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments, including those not yet received) MUST be silently - * discarded. - */ - - /* Check for overlap with preceding fragment. */ - if (prev && - (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) - goto discard_fq; - - /* Look for overlap with succeeding segment. */ - if (next && NFCT_FRAG6_CB(next)->offset < end) - goto discard_fq; - - NFCT_FRAG6_CB(skb)->offset = offset; - - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - fq->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - fq->q.fragments = skb; - - if (skb->dev) { - fq->iif = skb->dev->ifindex; - skb->dev = NULL; - } - fq->q.stamp = skb->tstamp; - fq->q.meat += skb->len; - fq->ecn |= ecn; - if (payload_len > fq->q.max_size) - fq->q.max_size = payload_len; - add_frag_mem_limit(fq->q.net, skb->truesize); - - /* The first fragment. - * nhoffset is obtained from the first fragment, of course. - */ - if (offset == 0) { - fq->nhoffset = nhoff; - qp_flags(fq) |= INET_FRAG_FIRST_IN; - } - - inet_frag_lru_move(&fq->q); - return 0; - -discard_fq: -#ifdef HAVE_INET_FRAGS_RND - inet_frag_kill(&fq->q, &nf_frags); -#else - inet_frag_kill(&fq->q); -#endif -err: - return -1; -} - -/* - * Check if this packet is complete. - * - * It is called with locked fq, and caller must check that - * queue is eligible for reassembly i.e. it is not COMPLETE, - * the last and the first frames arrived and all the bits are here. - * - * returns true if *prev skb has been transformed into the reassembled - * skb, false otherwise. - */ -static bool -nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) -{ - struct sk_buff *fp, *head = fq->q.fragments; - int payload_len; - u8 ecn; - -#ifdef HAVE_INET_FRAGS_RND - inet_frag_kill(&fq->q, &nf_frags); -#else - inet_frag_kill(&fq->q); -#endif - - WARN_ON(head == NULL); - WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); - - ecn = ip_frag_ecn_table[fq->ecn]; - if (unlikely(ecn == 0xff)) - return false; - - /* Unfragmented part is taken from the first segment. */ - payload_len = ((head->data - skb_network_header(head)) - - sizeof(struct ipv6hdr) + fq->q.len - - sizeof(struct frag_hdr)); - if (payload_len > IPV6_MAXPLEN) { - net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", - payload_len); - return false; - } - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - return false; - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (clone == NULL) - return false; - - clone->next = head->next; - head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - - add_frag_mem_limit(fq->q.net, clone->truesize); - } - - /* morph head into last received skb: prev. - * - * This allows callers of ipv6 conntrack defrag to continue - * to use the last skb(frag) passed into the reasm engine. - * The last skb frag 'silently' turns into the full reassembled skb. - * - * Since prev is also part of q->fragments we have to clone it first. - */ - if (head != prev) { - struct sk_buff *iter; - - fp = skb_clone(prev, GFP_ATOMIC); - if (!fp) - return false; - - fp->next = prev->next; - - iter = head; - while (iter) { - if (iter->next == prev) { - iter->next = fp; - break; - } - iter = iter->next; - } - - skb_morph(prev, head); - prev->next = head->next; - consume_skb(head); - head = prev; - } - - /* We have to remove fragment header from datagram and to relocate - * header in order to calculate ICV correctly. */ - skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; - memmove(head->head + sizeof(struct frag_hdr), head->head, - (head->data - head->head) - sizeof(struct frag_hdr)); - head->mac_header += sizeof(struct frag_hdr); - head->network_header += sizeof(struct frag_hdr); - - skb_shinfo(head)->frag_list = head->next; - skb_reset_transport_header(head); - skb_push(head, head->data - skb_network_header(head)); - - for (fp=head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - } - sub_frag_mem_limit(fq->q.net, head->truesize); - - head->ignore_df = 1; - head->next = NULL; - head->dev = dev; - head->tstamp = fq->q.stamp; - ipv6_hdr(head)->payload_len = htons(payload_len); - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); - IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; - - /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); - - fq->q.fragments = NULL; - fq->q.fragments_tail = NULL; - - return true; -} - -/* - * find the header just before Fragment Header. - * - * if success return 0 and set ... - * (*prevhdrp): the value of "Next Header Field" in the header - * just before Fragment Header. - * (*prevhoff): the offset of "Next Header Field" in the header - * just before Fragment Header. - * (*fhoff) : the offset of Fragment Header. - * - * Based on ipv6_skip_hdr() in net/ipv6/exthdr.c - * - */ -static int -find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) -{ - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - const int netoff = skb_network_offset(skb); - u8 prev_nhoff = netoff + offsetof(struct ipv6hdr, nexthdr); - int start = netoff + sizeof(struct ipv6hdr); - int len = skb->len - start; - u8 prevhdr = NEXTHDR_IPV6; - - while (nexthdr != NEXTHDR_FRAGMENT) { - struct ipv6_opt_hdr hdr; - int hdrlen; - - if (!ipv6_ext_hdr(nexthdr)) { - return -1; - } - if (nexthdr == NEXTHDR_NONE) { - pr_debug("next header is none\n"); - return -1; - } - if (len < (int)sizeof(struct ipv6_opt_hdr)) { - pr_debug("too short\n"); - return -1; - } - if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) - BUG(); - if (nexthdr == NEXTHDR_AUTH) - hdrlen = (hdr.hdrlen+2)<<2; - else - hdrlen = ipv6_optlen(&hdr); - - prevhdr = nexthdr; - prev_nhoff = start; - - nexthdr = hdr.nexthdr; - len -= hdrlen; - start += hdrlen; - } - - if (len < 0) - return -1; - - *prevhdrp = prevhdr; - *prevhoff = prev_nhoff; - *fhoff = start; - - return 0; -} - -int rpl_nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) -{ - struct net_device *dev = skb->dev; - int fhoff, nhoff, ret; - struct frag_hdr *fhdr; - struct frag_queue *fq; - struct ipv6hdr *hdr; - u8 prevhdr; - struct netns_frags *frags; - - /* Jumbo payload inhibits frag. header */ - if (ipv6_hdr(skb)->payload_len == 0) { - pr_debug("payload len = 0\n"); - return -EINVAL; - } - - if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0) - return -EINVAL; - - if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr))) - return -ENOMEM; - - skb_set_transport_header(skb, fhoff); - hdr = ipv6_hdr(skb); - fhdr = (struct frag_hdr *)skb_transport_header(skb); - -/* See ip_evictor(). */ - frags = get_netns_frags6_from_net(net); -#ifdef HAVE_INET_FRAG_EVICTOR - local_bh_disable(); - inet_frag_evictor(frags, &nf_frags, false); - local_bh_enable(); -#endif - - skb_orphan(skb); -#ifdef HAVE_INET_FRAGS_RND - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, - ip6_frag_ecn(hdr)); -#else - fq = fq_find(net, fhdr->identification, user, hdr, - skb->dev ? skb->dev->ifindex : 0); -#endif - if (fq == NULL) - return -ENOMEM; - - spin_lock_bh(&fq->q.lock); - - if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) { - ret = -EINVAL; - goto out_unlock; - } - - /* after queue has assumed skb ownership, only 0 or -EINPROGRESS - * must be returned. - */ - ret = -EINPROGRESS; - if (qp_flags(fq) == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len && - nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; - -out_unlock: - spin_unlock_bh(&fq->q.lock); -#ifdef HAVE_INET_FRAGS_RND - inet_frag_put(&fq->q, &nf_frags); -#else - inet_frag_put(&fq->q); -#endif - return ret; -} - -#ifdef HAVE_DEFRAG_ENABLE_TAKES_NET -static int nf_ct_net_init(struct net *net) -{ - return nf_defrag_ipv6_enable(net); -} -#endif - -static void nf_ct_net_exit(struct net *net) -{ -} - -void ovs_netns_frags6_init(struct net *net) -{ -#ifdef HAVE_INET_FRAG_LRU_MOVE - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - ovs_net->nf_frags.high_thresh = IPV6_FRAG_HIGH_THRESH; - ovs_net->nf_frags.low_thresh = IPV6_FRAG_LOW_THRESH; - ovs_net->nf_frags.timeout = IPV6_FRAG_TIMEOUT; - - inet_frags_init_net(&(ovs_net->nf_frags)); -#endif -} - -void ovs_netns_frags6_exit(struct net *net) -{ -#ifdef HAVE_INET_FRAGS_RND - struct netns_frags *frags; - - frags = get_netns_frags6_from_net(net); - inet_frags_exit_net(frags, &nf_frags); -#endif -} - -static struct pernet_operations nf_ct_net_ops = { -#ifdef HAVE_DEFRAG_ENABLE_TAKES_NET - .init = nf_ct_net_init, -#endif - .exit = nf_ct_net_exit, -}; - -#ifdef HAVE_IPV6_FRAG_H -static const struct rhashtable_params nfct_rhash_params = { - .head_offset = offsetof(struct inet_frag_queue, node), - .hashfn = ip6frag_key_hashfn, - .obj_hashfn = ip6frag_obj_hashfn, - .obj_cmpfn = ip6frag_obj_cmpfn, - .automatic_shrinking = true, -}; -#endif - -int rpl_nf_ct_frag6_init(void) -{ - int ret = 0; - -#ifndef HAVE_DEFRAG_ENABLE_TAKES_NET - nf_defrag_ipv6_enable(); -#endif -#ifdef HAVE_INET_FRAGS_RND - nf_frags.hashfn = nf_hashfn; - nf_frags.match = ip6_frag_match; - nf_frags.constructor = ip6_frag_init; -#else -#ifdef HAVE_IPV6_FRAG_H - nf_frags.rhash_params = nfct_rhash_params; - nf_frags.constructor = ip6frag_init; -#else - nf_frags.rhash_params = ip6_rhash_params; - nf_frags.constructor = ip6_frag_init; -#endif -#endif /* HAVE_INET_FRAGS_RND */ - nf_frags.destructor = NULL; - nf_frags.qsize = sizeof(struct frag_queue); - nf_frags.frag_expire = nf_ct_frag6_expire; -#if defined(HAVE_INET_FRAGS_WITH_FRAGS_WORK) || !defined(HAVE_INET_FRAGS_RND) - nf_frags.frags_cache_name = nf_frags_cache_name; -#endif -#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0) - nf_frags.secret_interval = 10 * 60 * HZ; -#endif - ret = inet_frags_init(&nf_frags); - if (ret) - goto out; - ret = register_pernet_subsys(&nf_ct_net_ops); - if (ret) - inet_frags_fini(&nf_frags); - -out: - return ret; -} - -void rpl_nf_ct_frag6_cleanup(void) -{ - unregister_pernet_subsys(&nf_ct_net_ops); - inet_frags_fini(&nf_frags); -} - -#endif /* OVS_NF_DEFRAG6_BACKPORT */ diff --git a/datapath/linux/compat/nf_conntrack_timeout.c b/datapath/linux/compat/nf_conntrack_timeout.c deleted file mode 100644 index c02baff57..000000000 --- a/datapath/linux/compat/nf_conntrack_timeout.c +++ /dev/null @@ -1,102 +0,0 @@ -#include <net/netfilter/nf_conntrack.h> -#include <net/netfilter/nf_conntrack_core.h> -#include <net/netfilter/nf_conntrack_extend.h> -#include <net/netfilter/nf_conntrack_timeout.h> - -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT -#ifndef HAVE_NF_CT_SET_TIMEOUT -static void rpl__nf_ct_timeout_put(struct nf_ct_timeout *timeout) -{ - typeof(nf_ct_timeout_put_hook) timeout_put; - - timeout_put = rcu_dereference(nf_ct_timeout_put_hook); - if (timeout_put) - timeout_put(timeout); -} - -int rpl_nf_ct_set_timeout(struct net *net, struct nf_conn *ct, - u8 l3num, u8 l4num, const char *timeout_name) -{ - typeof(nf_ct_timeout_find_get_hook) timeout_find_get; - struct nf_ct_timeout *timeout; - struct nf_conn_timeout *timeout_ext; - const char *errmsg = NULL; - int ret = 0; - - rcu_read_lock(); - timeout_find_get = rcu_dereference(nf_ct_timeout_find_get_hook); - if (!timeout_find_get) { - ret = -ENOENT; - errmsg = "Timeout policy base is empty"; - goto out; - } - -#ifdef HAVE_NF_CT_TIMEOUT_FIND_GET_HOOK_NET - timeout = timeout_find_get(net, timeout_name); -#else - timeout = timeout_find_get(timeout_name); -#endif - if (!timeout) { - ret = -ENOENT; - pr_info_ratelimited("No such timeout policy \"%s\"\n", - timeout_name); - goto out; - } - - if (timeout->l3num != l3num) { - ret = -EINVAL; - pr_info_ratelimited("Timeout policy `%s' can only be used by " - "L%d protocol number %d\n", - timeout_name, 3, timeout->l3num); - goto err_put_timeout; - } - /* Make sure the timeout policy matches any existing protocol tracker, - * otherwise default to generic. - */ - if (timeout->l4proto->l4proto != l4num) { - ret = -EINVAL; - pr_info_ratelimited("Timeout policy `%s' can only be used by " - "L%d protocol number %d\n", - timeout_name, 4, timeout->l4proto->l4proto); - goto err_put_timeout; - } - timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (!timeout_ext) { - ret = -ENOMEM; - goto err_put_timeout; - } - - rcu_read_unlock(); - return ret; - -err_put_timeout: - rpl__nf_ct_timeout_put(timeout); -out: - rcu_read_unlock(); - if (errmsg) - pr_info_ratelimited("%s\n", errmsg); - return ret; -} -EXPORT_SYMBOL_GPL(rpl_nf_ct_set_timeout); - -void rpl_nf_ct_destroy_timeout(struct nf_conn *ct) -{ - struct nf_conn_timeout *timeout_ext; - typeof(nf_ct_timeout_put_hook) timeout_put; - - rcu_read_lock(); - timeout_put = rcu_dereference(nf_ct_timeout_put_hook); - - if (timeout_put) { - timeout_ext = nf_ct_timeout_find(ct); - if (timeout_ext) { - timeout_put(timeout_ext->timeout); - RCU_INIT_POINTER(timeout_ext->timeout, NULL); - } - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(rpl_nf_ct_destroy_timeout); - -#endif /* HAVE_NF_CT_SET_TIMEOUT */ -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ diff --git a/datapath/linux/compat/reciprocal_div.c b/datapath/linux/compat/reciprocal_div.c deleted file mode 100644 index 818502a0f..000000000 --- a/datapath/linux/compat/reciprocal_div.c +++ /dev/null @@ -1,27 +0,0 @@ -#include <linux/kernel.h> -#include <asm/div64.h> -#include <linux/module.h> -#include <linux/reciprocal_div.h> - -/* - * For a description of the algorithm please have a look at - * include/linux/reciprocal_div.h - */ - -struct reciprocal_value rpl_reciprocal_value(u32 d) -{ - struct reciprocal_value R; - u64 m; - int l; - - l = fls(d - 1); - m = ((1ULL << 32) * ((1ULL << l) - d)); - do_div(m, d); - ++m; - R.m = (u32)m; - R.sh1 = min(l, 1); - R.sh2 = max(l - 1, 0); - - return R; -} -EXPORT_SYMBOL_GPL(rpl_reciprocal_value); diff --git a/datapath/linux/compat/skbuff-openvswitch.c b/datapath/linux/compat/skbuff-openvswitch.c deleted file mode 100644 index 4cdeedc58..000000000 --- a/datapath/linux/compat/skbuff-openvswitch.c +++ /dev/null @@ -1,310 +0,0 @@ -#include <linux/module.h> -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/if_vlan.h> -#include <linux/kconfig.h> - -#include "gso.h" - -#if !defined(HAVE_SKB_WARN_LRO) && defined(NETIF_F_LRO) - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -void __skb_warn_lro_forwarding(const struct sk_buff *skb) -{ - if (net_ratelimit()) - pr_warn("%s: received packets cannot be forwarded while LRO is enabled\n", - skb->dev->name); -} - -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) - -static inline bool head_frag(const struct sk_buff *skb) -{ - return skb->head_frag; -} - - /** - * skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy() - * @from: source buffer - * - * Calculates the amount of linear headroom needed in the 'to' skb passed - * into skb_zerocopy(). - */ -unsigned int -rpl_skb_zerocopy_headlen(const struct sk_buff *from) -{ - unsigned int hlen = 0; - - if (!head_frag(from) || - skb_headlen(from) < L1_CACHE_BYTES || - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) - hlen = skb_headlen(from); - - if (skb_has_frag_list(from)) - hlen = from->len; - - return hlen; -} -EXPORT_SYMBOL_GPL(rpl_skb_zerocopy_headlen); - -#ifndef HAVE_SKB_ZEROCOPY -/** - * skb_zerocopy - Zero copy skb to skb - * @to: destination buffer - * @source: source buffer - * @len: number of bytes to copy from source buffer - * @hlen: size of linear headroom in destination buffer - * - * Copies up to `len` bytes from `from` to `to` by creating references - * to the frags in the source buffer. - * - * The `hlen` as calculated by skb_zerocopy_headlen() specifies the - * headroom in the `to` buffer. - * - * Return value: - * 0: everything is OK - * -ENOMEM: couldn't orphan frags of @from due to lack of memory - * -EFAULT: skb_copy_bits() found some problem with skb geometry - */ -int -rpl_skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen) -{ - int i, j = 0; - int plen = 0; /* length of skb->head fragment */ - int ret; - struct page *page; - unsigned int offset; - - BUG_ON(!head_frag(from) && !hlen); - - /* dont bother with small payloads */ - if (len <= skb_tailroom(to)) - return skb_copy_bits(from, 0, skb_put(to, len), len); - - if (hlen) { - ret = skb_copy_bits(from, 0, skb_put(to, hlen), hlen); - if (unlikely(ret)) - return ret; - len -= hlen; - } else { - plen = min_t(int, skb_headlen(from), len); - if (plen) { - page = virt_to_head_page(from->head); - offset = from->data - (unsigned char *)page_address(page); - __skb_fill_page_desc(to, 0, page, offset, plen); - get_page(page); - j = 1; - len -= plen; - } - } - - to->truesize += len + plen; - to->len += len + plen; - to->data_len += len + plen; - - if (unlikely(skb_orphan_frags(from, GFP_ATOMIC))) { - skb_tx_error(from); - return -ENOMEM; - } - - for (i = 0; i < skb_shinfo(from)->nr_frags; i++) { - if (!len) - break; - skb_shinfo(to)->frags[j] = skb_shinfo(from)->frags[i]; - skb_shinfo(to)->frags[j].size = min_t(int, skb_shinfo(to)->frags[j].size, len); - len -= skb_shinfo(to)->frags[j].size; - skb_frag_ref(to, j); - j++; - } - skb_shinfo(to)->nr_frags = j; - - return 0; -} -EXPORT_SYMBOL_GPL(rpl_skb_zerocopy); -#endif -#endif - -#ifndef HAVE_SKB_ENSURE_WRITABLE -int rpl_skb_ensure_writable(struct sk_buff *skb, int write_len) -{ - if (!pskb_may_pull(skb, write_len)) - return -ENOMEM; - - if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) - return 0; - - return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); -} -EXPORT_SYMBOL_GPL(rpl_skb_ensure_writable); -#endif - -#if !defined(HAVE___SKB_VLAN_POP) || !defined(HAVE_SKB_VLAN_POP) -/* remove VLAN header from packet and update csum accordingly. */ -int rpl___skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci) -{ - struct vlan_hdr *vhdr; - unsigned int offset = skb->data - skb_mac_header(skb); - int err; - - __skb_push(skb, offset); - err = skb_ensure_writable(skb, VLAN_ETH_HLEN); - if (unlikely(err)) - goto pull; - - skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN); - - vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); - *vlan_tci = ntohs(vhdr->h_vlan_TCI); - - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); - __skb_pull(skb, VLAN_HLEN); - - vlan_set_encap_proto(skb, vhdr); - skb->mac_header += VLAN_HLEN; - - if (skb_network_offset(skb) < ETH_HLEN) - skb_set_network_header(skb, ETH_HLEN); - - skb_reset_mac_len(skb); -pull: - __skb_pull(skb, offset); - - return err; -} -#endif - -#ifndef HAVE_SKB_VLAN_POP -int rpl_skb_vlan_pop(struct sk_buff *skb) -{ - u16 vlan_tci; - __be16 vlan_proto; - int err; - - if (likely(skb_vlan_tag_present(skb))) { - skb->vlan_tci = 0; - } else { - if (unlikely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) - return 0; - - err = rpl___skb_vlan_pop(skb, &vlan_tci); - if (err) - return err; - } - /* move next vlan tag to hw accel tag */ - if (likely((skb->protocol != htons(ETH_P_8021Q) && - skb->protocol != htons(ETH_P_8021AD)) || - skb->len < VLAN_ETH_HLEN)) - return 0; - - vlan_proto = htons(ETH_P_8021Q); - err = __skb_vlan_pop(skb, &vlan_tci); - if (unlikely(err)) - return err; - - __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); - return 0; -} -EXPORT_SYMBOL_GPL(rpl_skb_vlan_pop); -#endif - -#ifndef HAVE_SKB_VLAN_PUSH -int rpl_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) -{ - if (skb_vlan_tag_present(skb)) { - unsigned int offset = skb->data - skb_mac_header(skb); - int err; - - /* __vlan_insert_tag expect skb->data pointing to mac header. - * So change skb->data before calling it and change back to - * original position later - */ - __skb_push(skb, offset); - err = __vlan_insert_tag(skb, skb->vlan_proto, - skb_vlan_tag_get(skb)); - if (err) - return err; - skb->mac_len += VLAN_HLEN; - __skb_pull(skb, offset); - - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); - } - __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); - return 0; -} -EXPORT_SYMBOL_GPL(rpl_skb_vlan_push); -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0) -int rpl_pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, - gfp_t gfp_mask) -{ - int err; - int inner_mac_offset, inner_nw_offset, inner_transport_offset; - - inner_mac_offset = skb_inner_mac_offset(skb); - inner_nw_offset = skb_inner_network_offset(skb); - inner_transport_offset = skb_inner_transport_offset(skb); - -#undef pskb_expand_head - err = pskb_expand_head(skb, nhead, ntail, gfp_mask); - if (err) - return err; - - skb_set_inner_mac_header(skb, inner_mac_offset); - skb_set_inner_network_header(skb, inner_nw_offset); - skb_set_inner_transport_header(skb, inner_transport_offset); - - return 0; -} -EXPORT_SYMBOL(rpl_pskb_expand_head); - -#endif - -#ifndef HAVE_KFREE_SKB_LIST -void rpl_kfree_skb_list(struct sk_buff *segs) -{ - while (segs) { - struct sk_buff *next = segs->next; - - kfree_skb(segs); - segs = next; - } -} -EXPORT_SYMBOL(rpl_kfree_skb_list); -#endif - -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0) - -#define nf_reset_trace rpl_nf_reset_trace -static void nf_reset_trace(struct sk_buff *skb) -{ -#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES) - skb->nf_trace = 0; -#endif -} - -void rpl_skb_scrub_packet(struct sk_buff *skb, bool xnet) -{ - skb->tstamp.tv64 = 0; - skb->pkt_type = PACKET_HOST; - skb->skb_iif = 0; - skb->ignore_df = 0; - skb_dst_drop(skb); - secpath_reset(skb); - nf_reset(skb); - nf_reset_trace(skb); - - if (!xnet) - return; - - skb_orphan(skb); - skb->mark = 0; -} -#endif diff --git a/datapath/linux/compat/socket.c b/datapath/linux/compat/socket.c deleted file mode 100644 index 7f61e4456..000000000 --- a/datapath/linux/compat/socket.c +++ /dev/null @@ -1,32 +0,0 @@ -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/socket.h> -#include <linux/udp.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <net/ip_tunnels.h> -#include <net/udp.h> -#include <net/udp_tunnel.h> -#include <net/net_namespace.h> - - -#ifndef HAVE_SOCK_CREATE_KERN_NET -#undef sock_create_kern - -int ovs_sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) -{ - int err; - - err = sock_create_kern(family, type, protocol, res); - if (err < 0) - return err; - - sk_change_net((*res)->sk, net); - return err; -} -#undef sk_release_kernel -void ovs_sock_release(struct socket *sock) -{ - sk_release_kernel(sock->sk); -} -#endif diff --git a/datapath/linux/compat/stt.c b/datapath/linux/compat/stt.c deleted file mode 100644 index 39a294764..000000000 --- a/datapath/linux/compat/stt.c +++ /dev/null @@ -1,2129 +0,0 @@ -/* - * Stateless TCP Tunnel (STT) vport. - * - * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/unaligned.h> - -#include <linux/delay.h> -#include <linux/if.h> -#include <linux/if_vlan.h> -#include <linux/ip.h> -#include <linux/ipv6.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/log2.h> -#include <linux/module.h> -#include <linux/net.h> -#include <linux/netfilter.h> -#include <linux/percpu.h> -#include <linux/skbuff.h> -#include <linux/tcp.h> -#include <linux/workqueue.h> - -#include <net/dst_metadata.h> -#include <net/icmp.h> -#include <net/inet_ecn.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/ip6_checksum.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/sock.h> -#include <net/stt.h> -#include <net/tcp.h> -#include <net/udp.h> - -#include "gso.h" -#include "compat.h" - -#define STT_NETDEV_VER "0.1" -#define STT_DST_PORT 7471 - -#ifdef OVS_STT -#ifdef CONFIG_SLUB -/* - * We saw better performance with skipping zero copy in case of SLUB. - * So skip zero copy for SLUB case. - */ -#define SKIP_ZERO_COPY -#endif - -#define STT_VER 0 - -/* @list: Per-net list of STT ports. - * @rcv: The callback is called on STT packet recv, STT reassembly can generate - * multiple packets, in this case first packet has tunnel outer header, rest - * of the packets are inner packet segments with no stt header. - * @rcv_data: user data. - * @sock: Fake TCP socket for the STT port. - */ -struct stt_dev { - struct net_device *dev; - struct net *net; - struct list_head next; - struct list_head up_next; - struct socket *sock; - __be16 dst_port; -}; - -#define STT_CSUM_VERIFIED BIT(0) -#define STT_CSUM_PARTIAL BIT(1) -#define STT_PROTO_IPV4 BIT(2) -#define STT_PROTO_TCP BIT(3) -#define STT_PROTO_TYPES (STT_PROTO_IPV4 | STT_PROTO_TCP) - -#ifdef HAVE_SKB_GSO_UDP -#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_UDP | SKB_GSO_DODGY | \ - SKB_GSO_TCPV6) -#else -#define SUPPORTED_GSO_TYPES (SKB_GSO_TCPV4 | SKB_GSO_DODGY | \ - SKB_GSO_TCPV6) -#endif - -/* The length and offset of a fragment are encoded in the sequence number. - * STT_SEQ_LEN_SHIFT is the left shift needed to store the length. - * STT_SEQ_OFFSET_MASK is the mask to extract the offset. - */ -#define STT_SEQ_LEN_SHIFT 16 -#define STT_SEQ_OFFSET_MASK (BIT(STT_SEQ_LEN_SHIFT) - 1) - -/* The maximum amount of memory used to store packets waiting to be reassembled - * on a given CPU. Once this threshold is exceeded we will begin freeing the - * least recently used fragments. - */ -#define REASM_HI_THRESH (4 * 1024 * 1024) -/* The target for the high memory evictor. Once we have exceeded - * REASM_HI_THRESH, we will continue freeing fragments until we hit - * this limit. - */ -#define REASM_LO_THRESH (3 * 1024 * 1024) -/* The length of time a given packet has to be reassembled from the time the - * first fragment arrives. Once this limit is exceeded it becomes available - * for cleaning. - */ -#define FRAG_EXP_TIME (30 * HZ) -/* Number of hash entries. Each entry has only a single slot to hold a packet - * so if there are collisions, we will drop packets. This is allocated - * per-cpu and each entry consists of struct pkt_frag. - */ -#define FRAG_HASH_SHIFT 8 -#define FRAG_HASH_ENTRIES BIT(FRAG_HASH_SHIFT) -#define FRAG_HASH_SEGS ((sizeof(u32) * 8) / FRAG_HASH_SHIFT) - -#define CLEAN_PERCPU_INTERVAL (30 * HZ) - -struct pkt_key { - __be32 saddr; - __be32 daddr; - __be32 pkt_seq; - u32 mark; -}; - -struct pkt_frag { - struct sk_buff *skbs; - unsigned long timestamp; - struct list_head lru_node; - struct pkt_key key; -}; - -struct stt_percpu { - struct pkt_frag *frag_hash; - struct list_head frag_lru; - unsigned int frag_mem_used; - - /* Protect frags table. */ - spinlock_t lock; -}; - -struct first_frag { - struct sk_buff *last_skb; - unsigned int mem_used; - u16 tot_len; - u16 rcvd_len; - bool set_ecn_ce; -}; - -struct frag_skb_cb { - u16 offset; - - /* Only valid for the first skb in the chain. */ - struct first_frag first; -}; - -#define FRAG_CB(skb) ((struct frag_skb_cb *)(skb)->cb) - -/* per-network namespace private data for this module */ -struct stt_net { - struct list_head stt_list; - struct list_head stt_up_list; /* Devices which are in IFF_UP state. */ - int n_tunnels; -#ifdef HAVE_NF_REGISTER_NET_HOOK - bool nf_hook_reg_done; -#endif -}; - -static int stt_net_id; - -static struct stt_percpu __percpu *stt_percpu_data __read_mostly; -static u32 frag_hash_seed __read_mostly; - -/* Protects sock-hash and refcounts. */ -static DEFINE_MUTEX(stt_mutex); - -static int n_tunnels; -static DEFINE_PER_CPU(u32, pkt_seq_counter); - -static void clean_percpu(struct work_struct *work); -static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu); - -static struct stt_dev *stt_find_up_dev(struct net *net, __be16 port) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - struct stt_dev *stt_dev; - - list_for_each_entry_rcu(stt_dev, &sn->stt_up_list, up_next) { - if (stt_dev->dst_port == port) - return stt_dev; - } - return NULL; -} - -static __be32 ack_seq(void) -{ -#if NR_CPUS <= 65536 - u32 pkt_seq, ack; - - pkt_seq = this_cpu_read(pkt_seq_counter); - ack = pkt_seq << ilog2(NR_CPUS) | smp_processor_id(); - this_cpu_inc(pkt_seq_counter); - - return (__force __be32)ack; -#else -#error "Support for greater than 64k CPUs not implemented" -#endif -} - -static int clear_gso(struct sk_buff *skb) -{ - struct skb_shared_info *shinfo = skb_shinfo(skb); - int err; - - if (shinfo->gso_type == 0 && shinfo->gso_size == 0 && - shinfo->gso_segs == 0) - return 0; - - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - return err; - - shinfo = skb_shinfo(skb); - shinfo->gso_type = 0; - shinfo->gso_size = 0; - shinfo->gso_segs = 0; - return 0; -} - -static void copy_skb_metadata(struct sk_buff *to, struct sk_buff *from) -{ - to->protocol = from->protocol; - to->tstamp = from->tstamp; - to->priority = from->priority; - to->mark = from->mark; - to->vlan_tci = from->vlan_tci; - to->vlan_proto = from->vlan_proto; - skb_copy_secmark(to, from); -} - -static void update_headers(struct sk_buff *skb, bool head, - unsigned int l4_offset, unsigned int hdr_len, - bool ipv4, u32 tcp_seq) -{ - u16 old_len, new_len; - __be32 delta; - struct tcphdr *tcph; - int gso_size; - - if (ipv4) { - struct iphdr *iph = (struct iphdr *)(skb->data + ETH_HLEN); - - old_len = ntohs(iph->tot_len); - new_len = skb->len - ETH_HLEN; - iph->tot_len = htons(new_len); - - ip_send_check(iph); - } else { - struct ipv6hdr *ip6h = (struct ipv6hdr *)(skb->data + ETH_HLEN); - - old_len = ntohs(ip6h->payload_len); - new_len = skb->len - ETH_HLEN - sizeof(struct ipv6hdr); - ip6h->payload_len = htons(new_len); - } - - tcph = (struct tcphdr *)(skb->data + l4_offset); - if (!head) { - tcph->seq = htonl(tcp_seq); - tcph->cwr = 0; - } - - if (skb->next) { - tcph->fin = 0; - tcph->psh = 0; - } - - delta = htonl(~old_len + new_len); - tcph->check = ~csum_fold((__force __wsum)((__force u32)tcph->check + - (__force u32)delta)); - - gso_size = skb_shinfo(skb)->gso_size; - if (gso_size && skb->len - hdr_len <= gso_size) - BUG_ON(clear_gso(skb)); -} - -static bool can_segment(struct sk_buff *head, bool ipv4, bool tcp, bool csum_partial) -{ - /* If no offloading is in use then we don't have enough information - * to process the headers. - */ - if (!csum_partial) - goto linearize; - - /* Handling UDP packets requires IP fragmentation, which means that - * the L4 checksum can no longer be calculated by hardware (since the - * fragments are in different packets. If we have to compute the - * checksum it's faster just to linearize and large UDP packets are - * pretty uncommon anyways, so it's not worth dealing with for now. - */ - if (!tcp) - goto linearize; - - if (ipv4) { - struct iphdr *iph = (struct iphdr *)(head->data + ETH_HLEN); - - /* It's difficult to get the IP IDs exactly right here due to - * varying segment sizes and potentially multiple layers of - * segmentation. IP ID isn't important when DF is set and DF - * is generally set for TCP packets, so just linearize if it's - * not. - */ - if (!(iph->frag_off & htons(IP_DF))) - goto linearize; - } else { - struct ipv6hdr *ip6h = (struct ipv6hdr *)(head->data + ETH_HLEN); - - /* Jumbograms require more processing to update and we'll - * probably never see them, so just linearize. - */ - if (ip6h->payload_len == 0) - goto linearize; - } - return true; - -linearize: - return false; -} - -static int copy_headers(struct sk_buff *head, struct sk_buff *frag, - int hdr_len) -{ - u16 csum_start; - - if (skb_cloned(frag) || skb_headroom(frag) < hdr_len) { - int extra_head = hdr_len - skb_headroom(frag); - - extra_head = extra_head > 0 ? extra_head : 0; - if (unlikely(pskb_expand_head(frag, extra_head, 0, - GFP_ATOMIC))) - return -ENOMEM; - } - - memcpy(__skb_push(frag, hdr_len), head->data, hdr_len); - - csum_start = head->csum_start - skb_headroom(head); - frag->csum_start = skb_headroom(frag) + csum_start; - frag->csum_offset = head->csum_offset; - frag->ip_summed = head->ip_summed; - - skb_shinfo(frag)->gso_size = skb_shinfo(head)->gso_size; - skb_shinfo(frag)->gso_type = skb_shinfo(head)->gso_type; - skb_shinfo(frag)->gso_segs = 0; - - copy_skb_metadata(frag, head); - return 0; -} - -static int skb_list_segment(struct sk_buff *head, bool ipv4, int l4_offset) -{ - struct sk_buff *skb; - struct tcphdr *tcph; - int seg_len; - int hdr_len; - int tcp_len; - u32 seq; - - if (unlikely(!pskb_may_pull(head, l4_offset + sizeof(*tcph)))) - return -ENOMEM; - - tcph = (struct tcphdr *)(head->data + l4_offset); - tcp_len = tcph->doff * 4; - hdr_len = l4_offset + tcp_len; - - if (unlikely((tcp_len < sizeof(struct tcphdr)) || - (head->len < hdr_len))) - return -EINVAL; - - if (unlikely(!pskb_may_pull(head, hdr_len))) - return -ENOMEM; - - tcph = (struct tcphdr *)(head->data + l4_offset); - /* Update header of each segment. */ - seq = ntohl(tcph->seq); - seg_len = skb_pagelen(head) - hdr_len; - - skb = skb_shinfo(head)->frag_list; - skb_shinfo(head)->frag_list = NULL; - head->next = skb; - for (; skb; skb = skb->next) { - int err; - - head->len -= skb->len; - head->data_len -= skb->len; - head->truesize -= skb->truesize; - - seq += seg_len; - seg_len = skb->len; - err = copy_headers(head, skb, hdr_len); - if (err) - return err; - update_headers(skb, false, l4_offset, hdr_len, ipv4, seq); - } - update_headers(head, true, l4_offset, hdr_len, ipv4, 0); - return 0; -} - -#ifndef SKIP_ZERO_COPY -static struct sk_buff *normalize_frag_list(struct sk_buff *head, - struct sk_buff **skbp) -{ - struct sk_buff *skb = *skbp; - struct sk_buff *last; - - do { - struct sk_buff *frags; - - if (skb_shared(skb)) { - struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); - - if (unlikely(!nskb)) - return ERR_PTR(-ENOMEM); - - nskb->next = skb->next; - consume_skb(skb); - skb = nskb; - *skbp = skb; - } - - if (head) { - head->len -= skb->len; - head->data_len -= skb->len; - head->truesize -= skb->truesize; - } - - frags = skb_shinfo(skb)->frag_list; - if (frags) { - int err; - - err = skb_unclone(skb, GFP_ATOMIC); - if (unlikely(err)) - return ERR_PTR(err); - - last = normalize_frag_list(skb, &frags); - if (IS_ERR(last)) - return last; - - skb_shinfo(skb)->frag_list = NULL; - last->next = skb->next; - skb->next = frags; - } else { - last = skb; - } - - skbp = &skb->next; - } while ((skb = skb->next)); - - return last; -} - -/* Takes a linked list of skbs, which potentially contain frag_list - * (whose members in turn potentially contain frag_lists, etc.) and - * converts them into a single linear linked list. - */ -static int straighten_frag_list(struct sk_buff **skbp) -{ - struct sk_buff *err_skb; - - err_skb = normalize_frag_list(NULL, skbp); - if (IS_ERR(err_skb)) - return PTR_ERR(err_skb); - - return 0; -} - -static int coalesce_skb(struct sk_buff **headp) -{ - struct sk_buff *frag, *head, *prev; - int err; - - err = straighten_frag_list(headp); - if (unlikely(err)) - return err; - head = *headp; - - /* Coalesce frag list. */ - prev = head; - for (frag = head->next; frag; frag = frag->next) { - bool headstolen; - int delta; - - if (unlikely(skb_unclone(prev, GFP_ATOMIC))) - return -ENOMEM; - - if (!skb_try_coalesce(prev, frag, &headstolen, &delta)) { - prev = frag; - continue; - } - - prev->next = frag->next; - frag->len = 0; - frag->data_len = 0; - frag->truesize -= delta; - kfree_skb_partial(frag, headstolen); - frag = prev; - } - - if (!head->next) - return 0; - - for (frag = head->next; frag; frag = frag->next) { - head->len += frag->len; - head->data_len += frag->len; - head->truesize += frag->truesize; - } - - skb_shinfo(head)->frag_list = head->next; - head->next = NULL; - return 0; -} -#else -static int coalesce_skb(struct sk_buff **headp) -{ - struct sk_buff *frag, *head = *headp, *next; - int delta = FRAG_CB(head)->first.tot_len - skb_headlen(head); - int err; - - if (unlikely(!head->next)) - return 0; - - err = pskb_expand_head(head, 0, delta, GFP_ATOMIC); - if (unlikely(err)) - return err; - - if (unlikely(!__pskb_pull_tail(head, head->data_len))) - BUG(); - - for (frag = head->next; frag; frag = next) { - skb_copy_bits(frag, 0, skb_put(head, frag->len), frag->len); - next = frag->next; - kfree_skb(frag); - } - - head->next = NULL; - head->truesize = SKB_TRUESIZE(head->len); - return 0; -} -#endif - -static int __try_to_segment(struct sk_buff *skb, bool csum_partial, - bool ipv4, bool tcp, int l4_offset) -{ - if (can_segment(skb, ipv4, tcp, csum_partial)) - return skb_list_segment(skb, ipv4, l4_offset); - else - return skb_linearize(skb); -} - -static int try_to_segment(struct sk_buff *skb) -{ - struct stthdr *stth = stt_hdr(skb); - bool csum_partial = !!(stth->flags & STT_CSUM_PARTIAL); - bool ipv4 = !!(stth->flags & STT_PROTO_IPV4); - bool tcp = !!(stth->flags & STT_PROTO_TCP); - int l4_offset = stth->l4_offset; - - return __try_to_segment(skb, csum_partial, ipv4, tcp, l4_offset); -} - -static int segment_skb(struct sk_buff **headp, bool csum_partial, - bool ipv4, bool tcp, int l4_offset) -{ -#ifndef SKIP_ZERO_COPY - int err; - - err = coalesce_skb(headp); - if (err) - return err; -#endif - - if (skb_shinfo(*headp)->frag_list) - return __try_to_segment(*headp, csum_partial, - ipv4, tcp, l4_offset); - return 0; -} - -static int __push_stt_header(struct sk_buff *skb, __be64 tun_id, - __be16 s_port, __be16 d_port, - __be32 saddr, __be32 dst, - __be16 l3_proto, u8 l4_proto, - int dst_mtu) -{ - int data_len = skb->len + sizeof(struct stthdr) + STT_ETH_PAD; - unsigned short encap_mss; - struct tcphdr *tcph; - struct stthdr *stth; - - skb_push(skb, STT_HEADER_LEN); - skb_reset_transport_header(skb); - tcph = tcp_hdr(skb); - memset(tcph, 0, STT_HEADER_LEN); - stth = stt_hdr(skb); - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - stth->flags |= STT_CSUM_PARTIAL; - - stth->l4_offset = skb->csum_start - - (skb_headroom(skb) + - STT_HEADER_LEN); - - if (l3_proto == htons(ETH_P_IP)) - stth->flags |= STT_PROTO_IPV4; - - if (l4_proto == IPPROTO_TCP) - stth->flags |= STT_PROTO_TCP; - - stth->mss = htons(skb_shinfo(skb)->gso_size); - } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) { - stth->flags |= STT_CSUM_VERIFIED; - } - - stth->vlan_tci = htons(skb->vlan_tci); - skb->vlan_tci = 0; - put_unaligned(tun_id, &stth->key); - - tcph->source = s_port; - tcph->dest = d_port; - tcph->doff = sizeof(struct tcphdr) / 4; - tcph->ack = 1; - tcph->psh = 1; - tcph->window = htons(USHRT_MAX); - tcph->seq = htonl(data_len << STT_SEQ_LEN_SHIFT); - tcph->ack_seq = ack_seq(); - tcph->check = ~tcp_v4_check(skb->len, saddr, dst, 0); - - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct tcphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; - - encap_mss = dst_mtu - sizeof(struct iphdr) - sizeof(struct tcphdr); - if (data_len > encap_mss) { - if (unlikely(skb_unclone(skb, GFP_ATOMIC))) - return -EINVAL; - - skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; - skb_shinfo(skb)->gso_size = encap_mss; - skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(data_len, encap_mss); - } else { - if (unlikely(clear_gso(skb))) - return -EINVAL; - } - return 0; -} - -static struct sk_buff *push_stt_header(struct sk_buff *head, __be64 tun_id, - __be16 s_port, __be16 d_port, - __be32 saddr, __be32 dst, - __be16 l3_proto, u8 l4_proto, - int dst_mtu) -{ - struct sk_buff *skb; - - if (skb_shinfo(head)->frag_list) { - bool ipv4 = (l3_proto == htons(ETH_P_IP)); - bool tcp = (l4_proto == IPPROTO_TCP); - bool csum_partial = (head->ip_summed == CHECKSUM_PARTIAL); - int l4_offset = skb_transport_offset(head); - - /* Need to call skb_orphan() to report currect true-size. - * calling skb_orphan() in this layer is odd but SKB with - * frag-list should not be associated with any socket, so - * skb-orphan should be no-op. */ - skb_orphan(head); - if (unlikely(segment_skb(&head, csum_partial, - ipv4, tcp, l4_offset))) - goto error; - } - - for (skb = head; skb; skb = skb->next) { - if (__push_stt_header(skb, tun_id, s_port, d_port, saddr, dst, - l3_proto, l4_proto, dst_mtu)) - goto error; - } - - return head; -error: - kfree_skb_list(head); - return NULL; -} - -static int stt_can_offload(struct sk_buff *skb, __be16 l3_proto, u8 l4_proto) -{ - if (skb_is_gso(skb) && skb->ip_summed != CHECKSUM_PARTIAL) { - int csum_offset; - __sum16 *csum; - int len; - - if (l4_proto == IPPROTO_TCP) - csum_offset = offsetof(struct tcphdr, check); - else if (l4_proto == IPPROTO_UDP) - csum_offset = offsetof(struct udphdr, check); - else - return 0; - - len = skb->len - skb_transport_offset(skb); - csum = (__sum16 *)(skb_transport_header(skb) + csum_offset); - - if (unlikely(!pskb_may_pull(skb, skb_transport_offset(skb) + - csum_offset + sizeof(*csum)))) - return -EINVAL; - - if (l3_proto == htons(ETH_P_IP)) { - struct iphdr *iph = ip_hdr(skb); - - *csum = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - len, l4_proto, 0); - } else if (l3_proto == htons(ETH_P_IPV6)) { - struct ipv6hdr *ip6h = ipv6_hdr(skb); - - *csum = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, - len, l4_proto, 0); - } else { - return 0; - } - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = csum_offset; - skb->ip_summed = CHECKSUM_PARTIAL; - } - - if (skb->ip_summed == CHECKSUM_PARTIAL) { - /* Assume receiver can only offload TCP/UDP over IPv4/6, - * and require 802.1Q VLANs to be accelerated. - */ - if (l3_proto != htons(ETH_P_IP) && - l3_proto != htons(ETH_P_IPV6)) - return 0; - - if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) - return 0; - - /* L4 offset must fit in a 1-byte field. */ - if (skb->csum_start - skb_headroom(skb) > 255) - return 0; - - if (skb_shinfo(skb)->gso_type & ~SUPPORTED_GSO_TYPES) - return 0; - } - /* Total size of encapsulated packet must fit in 16 bits. */ - if (skb->len + STT_HEADER_LEN + sizeof(struct iphdr) > 65535) - return 0; - - if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) - return 0; - return 1; -} - -static bool need_linearize(const struct sk_buff *skb) -{ - struct skb_shared_info *shinfo = skb_shinfo(skb); - int i; - - if (unlikely(shinfo->frag_list)) - return true; - - /* Generally speaking we should linearize if there are paged frags. - * However, if all of the refcounts are 1 we know nobody else can - * change them from underneath us and we can skip the linearization. - */ - for (i = 0; i < shinfo->nr_frags; i++) - if (unlikely(page_count(skb_frag_page(&shinfo->frags[i])) > 1)) - return true; - - return false; -} - -static struct sk_buff *handle_offloads(struct sk_buff *skb, int min_headroom) -{ - int err; - - if (skb_vlan_tag_present(skb) && skb->vlan_proto != htons(ETH_P_8021Q)) { - - min_headroom += VLAN_HLEN; - if (skb_headroom(skb) < min_headroom) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + 16); - - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto error; - } - - skb = __vlan_hwaccel_push_inside(skb); - if (!skb) { - err = -ENOMEM; - goto error; - } - } - - if (skb_is_gso(skb)) { - struct sk_buff *nskb; - char cb[sizeof(skb->cb)]; - - memcpy(cb, skb->cb, sizeof(cb)); - - nskb = __skb_gso_segment(skb, 0, false); - if (IS_ERR(nskb)) { - err = PTR_ERR(nskb); - goto error; - } - - consume_skb(skb); - skb = nskb; - while (nskb) { - memcpy(nskb->cb, cb, sizeof(cb)); - nskb = nskb->next; - } - } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - /* Pages aren't locked and could change at any time. - * If this happens after we compute the checksum, the - * checksum will be wrong. We linearize now to avoid - * this problem. - */ - if (unlikely(need_linearize(skb))) { - err = __skb_linearize(skb); - if (unlikely(err)) - goto error; - } - - err = skb_checksum_help(skb); - if (unlikely(err)) - goto error; - } - skb->ip_summed = CHECKSUM_NONE; - - return skb; -error: - kfree_skb(skb); - return ERR_PTR(err); -} - -static void skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src, - __be32 dst, __u8 tos, __u8 ttl, __be16 df) -{ - while (skb) { - struct sk_buff *next = skb->next; - - if (next) - dst_clone(&rt->dst); - - skb->next = NULL; - iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP, - tos, ttl, df, false); - - skb = next; - } -} - -static u8 parse_ipv6_l4_proto(struct sk_buff *skb) -{ - unsigned int nh_ofs = skb_network_offset(skb); - int payload_ofs; - struct ipv6hdr *nh; - uint8_t nexthdr; - __be16 frag_off; - - if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct ipv6hdr)))) - return 0; - - nh = ipv6_hdr(skb); - nexthdr = nh->nexthdr; - payload_ofs = (u8 *)(nh + 1) - skb->data; - - payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - if (unlikely(payload_ofs < 0)) - return 0; - - return nexthdr; -} - -static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto) -{ - if (l3_proto == htons(ETH_P_IP)) { - unsigned int nh_ofs = skb_network_offset(skb); - - if (unlikely(!pskb_may_pull(skb, nh_ofs + sizeof(struct iphdr)))) - return 0; - - return ip_hdr(skb)->protocol; - } else if (l3_proto == htons(ETH_P_IPV6)) { - return parse_ipv6_l4_proto(skb); - } - return 0; -} - -static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt, - __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be64 tun_id) -{ - struct ethhdr *eh = eth_hdr(skb); - int ret = 0, min_headroom; - __be16 inner_l3_proto; - u8 inner_l4_proto; - - inner_l3_proto = eh->h_proto; - inner_l4_proto = skb_get_l4_proto(skb, inner_l3_proto); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + STT_HEADER_LEN + sizeof(struct iphdr); - - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - - ret = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(ret)) - goto err_free_rt; - } - - ret = stt_can_offload(skb, inner_l3_proto, inner_l4_proto); - if (ret < 0) - goto err_free_rt; - if (!ret) { - skb = handle_offloads(skb, min_headroom); - if (IS_ERR(skb)) { - ret = PTR_ERR(skb); - skb = NULL; - goto err_free_rt; - } - } - - ret = 0; - while (skb) { - struct sk_buff *next_skb = skb->next; - - skb->next = NULL; - - if (next_skb) - dst_clone(&rt->dst); - - /* Push STT and TCP header. */ - skb = push_stt_header(skb, tun_id, src_port, dst_port, src, - dst, inner_l3_proto, inner_l4_proto, - dst_mtu(&rt->dst)); - if (unlikely(!skb)) { - ip_rt_put(rt); - goto next; - } - - /* Push IP header. */ - skb_list_xmit(rt, skb, src, dst, tos, ttl, df); - -next: - skb = next_skb; - } - - return 0; - -err_free_rt: - ip_rt_put(rt); - kfree_skb(skb); - return ret; -} - -static struct rtable *stt_get_rt(struct sk_buff *skb, - struct net_device *dev, - struct flowi4 *fl, - const struct ip_tunnel_key *key, - __be16 dport, __be16 sport) -{ - struct net *net = dev_net(dev); - - /* Route lookup */ - memset(fl, 0, sizeof(*fl)); - fl->daddr = key->u.ipv4.dst; - fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->tos); - fl->flowi4_mark = skb->mark; - fl->flowi4_proto = IPPROTO_TCP; - fl->fl4_dport = dport; - fl->fl4_sport = sport; - - return ip_route_output_key(net, fl); -} - -netdev_tx_t ovs_stt_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct stt_dev *stt_dev = netdev_priv(dev); - struct net *net = stt_dev->net; - __be16 dport = stt_dev->dst_port; - struct ip_tunnel_key *tun_key; - struct ip_tunnel_info *tun_info; - struct rtable *rt; - struct flowi4 fl; - __be16 sport; - __be16 df; - int err; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->key; - - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - rt = stt_get_rt(skb, dev, &fl, tun_key, dport, sport); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - skb->ignore_df = 1; - - stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst, - tun_key->tos, tun_key->ttl, - df, sport, dport, tun_key->tun_id); - return NETDEV_TX_OK; -error: - kfree_skb(skb); - dev->stats.tx_errors++; - return err; -} -EXPORT_SYMBOL(ovs_stt_xmit); - -static void free_frag(struct stt_percpu *stt_percpu, - struct pkt_frag *frag) -{ - stt_percpu->frag_mem_used -= FRAG_CB(frag->skbs)->first.mem_used; - kfree_skb_list(frag->skbs); - list_del(&frag->lru_node); - frag->skbs = NULL; -} - -static void evict_frags(struct stt_percpu *stt_percpu) -{ - while (!list_empty(&stt_percpu->frag_lru) && - stt_percpu->frag_mem_used > REASM_LO_THRESH) { - struct pkt_frag *frag; - - frag = list_first_entry(&stt_percpu->frag_lru, - struct pkt_frag, - lru_node); - free_frag(stt_percpu, frag); - } -} - -static bool pkt_key_match(struct net *net, - const struct pkt_frag *a, const struct pkt_key *b) -{ - return a->key.saddr == b->saddr && a->key.daddr == b->daddr && - a->key.pkt_seq == b->pkt_seq && a->key.mark == b->mark && - net_eq(dev_net(a->skbs->dev), net); -} - -static u32 pkt_key_hash(const struct net *net, const struct pkt_key *key) -{ - u32 initval = frag_hash_seed ^ (u32)(unsigned long)net ^ key->mark; - - return jhash_3words((__force u32)key->saddr, (__force u32)key->daddr, - (__force u32)key->pkt_seq, initval); -} - -static struct pkt_frag *lookup_frag(struct net *net, - struct stt_percpu *stt_percpu, - const struct pkt_key *key, u32 hash) -{ - struct pkt_frag *frag, *victim_frag = NULL; - int i; - - for (i = 0; i < FRAG_HASH_SEGS; i++) { - frag = &stt_percpu->frag_hash[hash & (FRAG_HASH_ENTRIES - 1)]; - - if (frag->skbs && - time_before(jiffies, frag->timestamp + FRAG_EXP_TIME) && - pkt_key_match(net, frag, key)) - return frag; - - if (!victim_frag || - (victim_frag->skbs && - (!frag->skbs || - time_before(frag->timestamp, victim_frag->timestamp)))) - victim_frag = frag; - - hash >>= FRAG_HASH_SHIFT; - } - - if (victim_frag->skbs) - free_frag(stt_percpu, victim_frag); - - return victim_frag; -} - -#ifdef SKIP_ZERO_COPY -static int __copy_skb(struct sk_buff *to, struct sk_buff *from, - int *delta, bool *headstolen) -{ - int err; - - if (unlikely(to->next)) - return -EINVAL; - - if (unlikely(FRAG_CB(to)->offset)) - return -EINVAL; - - if (unlikely(skb_unclone(to, GFP_ATOMIC))) - return -ENOMEM; - - if (skb_try_coalesce(to, from, headstolen, delta)) - return 0; - - *headstolen = false; - err = pskb_expand_head(to, 0, to->data_len + from->len, GFP_ATOMIC); - if (unlikely(err)) - return err; - - if (unlikely(!__pskb_pull_tail(to, to->data_len))) - BUG(); - - skb_copy_bits(from, 0, skb_put(to, from->len), from->len); - - *delta = from->len; - to->truesize += from->len; - return 0; -} -#else -static int __copy_skb(struct sk_buff *to, struct sk_buff *from, - int *delta, bool *headstolen) -{ - *headstolen = false; - return -EINVAL; -} -#endif - -static struct sk_buff *reassemble(struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - struct tcphdr *tcph = tcp_hdr(skb); - u32 seq = ntohl(tcph->seq); - struct stt_percpu *stt_percpu; - struct sk_buff *last_skb, *copied_skb = NULL; - struct pkt_frag *frag; - struct pkt_key key; - int tot_len, delta = skb->truesize; - bool headstolen; - u32 hash; - - tot_len = seq >> STT_SEQ_LEN_SHIFT; - FRAG_CB(skb)->offset = seq & STT_SEQ_OFFSET_MASK; - - if (unlikely(skb->len == 0)) - goto out_free; - - if (unlikely(FRAG_CB(skb)->offset + skb->len > tot_len)) - goto out_free; - - if (tot_len == skb->len) - goto out; - - key.saddr = iph->saddr; - key.daddr = iph->daddr; - key.pkt_seq = tcph->ack_seq; - key.mark = skb->mark; - hash = pkt_key_hash(dev_net(skb->dev), &key); - - stt_percpu = per_cpu_ptr(stt_percpu_data, smp_processor_id()); - - spin_lock(&stt_percpu->lock); - - if (unlikely(stt_percpu->frag_mem_used + skb->truesize > REASM_HI_THRESH)) - evict_frags(stt_percpu); - - frag = lookup_frag(dev_net(skb->dev), stt_percpu, &key, hash); - if (!frag->skbs) { - frag->skbs = skb; - frag->key = key; - frag->timestamp = jiffies; - FRAG_CB(skb)->first.last_skb = skb; - FRAG_CB(skb)->first.mem_used = skb->truesize; - FRAG_CB(skb)->first.tot_len = tot_len; - FRAG_CB(skb)->first.rcvd_len = skb->len; - FRAG_CB(skb)->first.set_ecn_ce = false; - list_add_tail(&frag->lru_node, &stt_percpu->frag_lru); - stt_percpu->frag_mem_used += skb->truesize; - skb = NULL; - goto unlock; - } - - /* Optimize for the common case where fragments are received in-order - * and not overlapping. - */ - last_skb = FRAG_CB(frag->skbs)->first.last_skb; - if (likely(FRAG_CB(last_skb)->offset + last_skb->len == - FRAG_CB(skb)->offset)) { - - if (!__copy_skb(frag->skbs, skb, &delta, &headstolen)) { - copied_skb = skb; - } else { - last_skb->next = skb; - FRAG_CB(frag->skbs)->first.last_skb = skb; - } - } else { - struct sk_buff *prev = NULL, *next; - - for (next = frag->skbs; next; next = next->next) { - if (FRAG_CB(next)->offset >= FRAG_CB(skb)->offset) - break; - prev = next; - } - - /* Overlapping fragments aren't allowed. We shouldn't start - * before the end of the previous fragment. - */ - if (prev && - FRAG_CB(prev)->offset + prev->len > FRAG_CB(skb)->offset) - goto unlock_free; - - /* We also shouldn't end after the beginning of the next - * fragment. - */ - if (next && - FRAG_CB(skb)->offset + skb->len > FRAG_CB(next)->offset) - goto unlock_free; - - if (prev) { - prev->next = skb; - } else { - FRAG_CB(skb)->first = FRAG_CB(frag->skbs)->first; - frag->skbs = skb; - } - - if (next) - skb->next = next; - else - FRAG_CB(frag->skbs)->first.last_skb = skb; - } - - FRAG_CB(frag->skbs)->first.set_ecn_ce |= INET_ECN_is_ce(iph->tos); - FRAG_CB(frag->skbs)->first.rcvd_len += skb->len; - stt_percpu->frag_mem_used += delta; - FRAG_CB(frag->skbs)->first.mem_used += delta; - - if (FRAG_CB(frag->skbs)->first.tot_len == - FRAG_CB(frag->skbs)->first.rcvd_len) { - struct sk_buff *frag_head = frag->skbs; - - frag_head->tstamp = skb->tstamp; - if (FRAG_CB(frag_head)->first.set_ecn_ce) - INET_ECN_set_ce(frag_head); - - list_del(&frag->lru_node); - stt_percpu->frag_mem_used -= FRAG_CB(frag_head)->first.mem_used; - frag->skbs = NULL; - skb = frag_head; - } else { - list_move_tail(&frag->lru_node, &stt_percpu->frag_lru); - skb = NULL; - } - - if (copied_skb) - kfree_skb_partial(copied_skb, headstolen); - goto unlock; - -unlock_free: - kfree_skb(skb); - skb = NULL; -unlock: - spin_unlock(&stt_percpu->lock); - return skb; -out_free: - kfree_skb(skb); - skb = NULL; -out: - return skb; -} - -static bool validate_checksum(struct sk_buff *skb) -{ - struct iphdr *iph = ip_hdr(skb); - - if (skb_csum_unnecessary(skb)) - return true; - - if (skb->ip_summed == CHECKSUM_COMPLETE && - !tcp_v4_check(skb->len, iph->saddr, iph->daddr, skb->csum)) - return true; - - skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len, - IPPROTO_TCP, 0); - - return __skb_checksum_complete(skb) == 0; -} - -static bool set_offloads(struct sk_buff *skb) -{ - struct stthdr *stth = stt_hdr(skb); - unsigned int gso_type = 0; - int l3_header_size; - int l4_header_size; - u16 csum_offset; - u8 proto_type; - - if (stth->vlan_tci) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), - ntohs(stth->vlan_tci)); - - if (!(stth->flags & STT_CSUM_PARTIAL)) { - if (stth->flags & STT_CSUM_VERIFIED) - skb->ip_summed = CHECKSUM_UNNECESSARY; - else - skb->ip_summed = CHECKSUM_NONE; - - return clear_gso(skb) == 0; - } - - proto_type = stth->flags & STT_PROTO_TYPES; - - switch (proto_type) { - case (STT_PROTO_IPV4 | STT_PROTO_TCP): - /* TCP/IPv4 */ - csum_offset = offsetof(struct tcphdr, check); - gso_type = SKB_GSO_TCPV4; - l3_header_size = sizeof(struct iphdr); - l4_header_size = sizeof(struct tcphdr); - skb->protocol = htons(ETH_P_IP); - break; - case STT_PROTO_TCP: - /* TCP/IPv6 */ - csum_offset = offsetof(struct tcphdr, check); - gso_type = SKB_GSO_TCPV6; - l3_header_size = sizeof(struct ipv6hdr); - l4_header_size = sizeof(struct tcphdr); - skb->protocol = htons(ETH_P_IPV6); - break; - case STT_PROTO_IPV4: - /* UDP/IPv4 */ - csum_offset = offsetof(struct udphdr, check); -#ifdef HAVE_SKB_GSO_UDP - gso_type = SKB_GSO_UDP; -#endif - l3_header_size = sizeof(struct iphdr); - l4_header_size = sizeof(struct udphdr); - skb->protocol = htons(ETH_P_IP); - break; - default: - /* UDP/IPv6 */ - csum_offset = offsetof(struct udphdr, check); -#ifdef HAVE_SKB_GSO_UDP - gso_type = SKB_GSO_UDP; -#endif - l3_header_size = sizeof(struct ipv6hdr); - l4_header_size = sizeof(struct udphdr); - skb->protocol = htons(ETH_P_IPV6); - } - - if (unlikely(stth->l4_offset < ETH_HLEN + l3_header_size)) - return false; - - if (unlikely(!pskb_may_pull(skb, stth->l4_offset + l4_header_size))) - return false; - - stth = stt_hdr(skb); - - skb->csum_start = skb_headroom(skb) + stth->l4_offset; - skb->csum_offset = csum_offset; - skb->ip_summed = CHECKSUM_PARTIAL; - - if (stth->mss) { - if (unlikely(skb_unclone(skb, GFP_ATOMIC))) - return false; - - skb_shinfo(skb)->gso_type = gso_type | SKB_GSO_DODGY; - skb_shinfo(skb)->gso_size = ntohs(stth->mss); - skb_shinfo(skb)->gso_segs = 0; - } else { - if (unlikely(clear_gso(skb))) - return false; - } - - return true; -} - -static void rcv_list(struct net_device *dev, struct sk_buff *skb, - struct metadata_dst *tun_dst) -{ - struct sk_buff *next; - - do { - next = skb->next; - skb->next = NULL; - if (next) { - ovs_dst_hold((struct dst_entry *)tun_dst); - ovs_skb_dst_set(next, (struct dst_entry *)tun_dst); - } - ovs_ip_tunnel_rcv(dev, skb, tun_dst); - } while ((skb = next)); -} - -#ifndef USE_UPSTREAM_TUNNEL -static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb) -{ - struct metadata_dst tun_dst; - - ovs_ip_tun_rx_dst(&tun_dst, skb, TUNNEL_KEY | TUNNEL_CSUM, - get_unaligned(&stt_hdr(skb)->key), 0); - tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source; - tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest; - - rcv_list(stt_dev->dev, skb, &tun_dst); - return 0; -} -#else -static int __stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb) -{ - struct metadata_dst *tun_dst; - __be16 flags; - __be64 tun_id; - - flags = TUNNEL_KEY | TUNNEL_CSUM; - tun_id = get_unaligned(&stt_hdr(skb)->key); - tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); - if (!tun_dst) - return -ENOMEM; - tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source; - tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest; - - rcv_list(stt_dev->dev, skb, tun_dst); - return 0; -} -#endif - -static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb) -{ - int err; - - if (unlikely(!validate_checksum(skb))) - goto drop; - - __skb_pull(skb, sizeof(struct tcphdr)); - skb = reassemble(skb); - if (!skb) - return; - - if (skb->next && coalesce_skb(&skb)) - goto drop; - - err = iptunnel_pull_header(skb, - sizeof(struct stthdr) + STT_ETH_PAD, - htons(ETH_P_TEB), - !net_eq(stt_dev->net, dev_net(stt_dev->dev))); - if (unlikely(err)) - goto drop; - - if (unlikely(stt_hdr(skb)->version != 0)) - goto drop; - - if (unlikely(!set_offloads(skb))) - goto drop; - - if (skb_shinfo(skb)->frag_list && try_to_segment(skb)) - goto drop; - - err = __stt_rcv(stt_dev, skb); - if (err) - goto drop; - return; -drop: - /* Consume bad packet */ - kfree_skb_list(skb); - stt_dev->dev->stats.rx_errors++; -} - -static void tcp_sock_release(struct socket *sock) -{ - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); -} - -static int tcp_sock_create4(struct net *net, __be16 port, - struct socket **sockp) -{ - struct sockaddr_in tcp_addr; - struct socket *sock = NULL; - int err; - - err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err < 0) - goto error; - - memset(&tcp_addr, 0, sizeof(tcp_addr)); - tcp_addr.sin_family = AF_INET; - tcp_addr.sin_addr.s_addr = htonl(INADDR_ANY); - tcp_addr.sin_port = port; - err = kernel_bind(sock, (struct sockaddr *)&tcp_addr, - sizeof(tcp_addr)); - if (err < 0) - goto error; - - *sockp = sock; - return 0; - -error: - if (sock) - tcp_sock_release(sock); - *sockp = NULL; - return err; -} - -static void schedule_clean_percpu(void) -{ - schedule_delayed_work(&clean_percpu_wq, CLEAN_PERCPU_INTERVAL); -} - -static void clean_percpu(struct work_struct *work) -{ - int i; - - for_each_possible_cpu(i) { - struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); - int j; - - for (j = 0; j < FRAG_HASH_ENTRIES; j++) { - struct pkt_frag *frag; - - frag = &stt_percpu->frag_hash[j]; - if (!frag->skbs || - time_before(jiffies, frag->timestamp + FRAG_EXP_TIME)) - continue; - - spin_lock_bh(&stt_percpu->lock); - - if (frag->skbs && - time_after(jiffies, frag->timestamp + FRAG_EXP_TIME)) - free_frag(stt_percpu, frag); - - spin_unlock_bh(&stt_percpu->lock); - } - } - schedule_clean_percpu(); -} - -#ifdef HAVE_NF_HOOKFN_ARG_OPS -#define FIRST_PARAM const struct nf_hook_ops *ops -#else -#ifdef HAVE_NF_HOOKFN_ARG_PRIV -#define FIRST_PARAM void *priv -#else -#define FIRST_PARAM unsigned int hooknum -#endif -#endif - -#ifdef HAVE_NF_HOOK_STATE -#if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0) && RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0) -/* RHEL nfhook hacks. */ -#ifndef __GENKSYMS__ -#define LAST_PARAM const struct net_device *in, const struct net_device *out, \ - const struct nf_hook_state *state -#else -#define LAST_PARAM const struct net_device *in, const struct net_device *out, \ - int (*okfn)(struct sk_buff *) -#endif -#else -#define LAST_PARAM const struct nf_hook_state *state -#endif -#else -#define LAST_PARAM const struct net_device *in, const struct net_device *out, \ - int (*okfn)(struct sk_buff *) -#endif - -static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM) -{ - struct stt_dev *stt_dev; - int ip_hdr_len; - - if (ip_hdr(skb)->protocol != IPPROTO_TCP) - return NF_ACCEPT; - - ip_hdr_len = ip_hdrlen(skb); - if (unlikely(!pskb_may_pull(skb, ip_hdr_len + sizeof(struct tcphdr)))) - return NF_ACCEPT; - - skb_set_transport_header(skb, ip_hdr_len); - - stt_dev = stt_find_up_dev(dev_net(skb->dev), tcp_hdr(skb)->dest); - if (!stt_dev) - return NF_ACCEPT; - - __skb_pull(skb, ip_hdr_len); - stt_rcv(stt_dev, skb); - return NF_STOLEN; -} - -static struct nf_hook_ops nf_hook_ops __read_mostly = { - .hook = nf_ip_hook, -#ifdef HAVE_NF_HOOKS_OPS_OWNER - .owner = THIS_MODULE, -#endif - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = INT_MAX, -}; - -static int stt_start(struct net *net) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - int err; - int i; - - if (n_tunnels) { - n_tunnels++; - return 0; - } - get_random_bytes(&frag_hash_seed, sizeof(u32)); - - stt_percpu_data = alloc_percpu(struct stt_percpu); - if (!stt_percpu_data) { - err = -ENOMEM; - goto error; - } - - for_each_possible_cpu(i) { - struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); - struct pkt_frag *frag_hash; - - spin_lock_init(&stt_percpu->lock); - INIT_LIST_HEAD(&stt_percpu->frag_lru); - get_random_bytes(&per_cpu(pkt_seq_counter, i), sizeof(u32)); - - frag_hash = kvmalloc_array(sizeof(struct pkt_frag), - FRAG_HASH_ENTRIES, - GFP_KERNEL | __GFP_ZERO); - if (!frag_hash) { - err = -ENOMEM; - goto free_percpu; - } - stt_percpu->frag_hash = frag_hash; - } - schedule_clean_percpu(); - n_tunnels++; - - if (sn->n_tunnels) { - sn->n_tunnels++; - return 0; - } -#ifdef HAVE_NF_REGISTER_NET_HOOK - /* On kernel which support per net nf-hook, nf_register_hook() takes - * rtnl-lock, which results in dead lock in stt-dev-create. Therefore - * use this new API. - */ - - if (sn->nf_hook_reg_done) - goto out; - - err = nf_register_net_hook(net, &nf_hook_ops); - if (!err) - sn->nf_hook_reg_done = true; -#else - /* Register STT only on very first STT device addition. */ - if (!list_empty(&nf_hook_ops.list)) - goto out; - - err = nf_register_hook(&nf_hook_ops); -#endif - if (err) - goto dec_n_tunnel; -out: - sn->n_tunnels++; - return 0; - -dec_n_tunnel: - n_tunnels--; -free_percpu: - for_each_possible_cpu(i) { - struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); - - if (stt_percpu->frag_hash) - kvfree(stt_percpu->frag_hash); - } - - free_percpu(stt_percpu_data); - -error: - return err; -} - -static void stt_cleanup(struct net *net) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - int i; - - sn->n_tunnels--; - n_tunnels--; - if (n_tunnels) - return; - - cancel_delayed_work_sync(&clean_percpu_wq); - for_each_possible_cpu(i) { - struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i); - int j; - - for (j = 0; j < FRAG_HASH_ENTRIES; j++) { - struct pkt_frag *frag; - - frag = &stt_percpu->frag_hash[j]; - kfree_skb_list(frag->skbs); - } - - kvfree(stt_percpu->frag_hash); - } - - free_percpu(stt_percpu_data); -} - -static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev) -{ -#ifdef USE_UPSTREAM_TUNNEL - return ovs_stt_xmit(skb); -#else - /* Drop All packets coming from networking stack. OVS-CB is - * not initialized for these packets. - */ - dev_kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -#endif -} - -/* Setup stats when device is created */ -static int stt_init(struct net_device *dev) -{ - dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; -} - -static void stt_uninit(struct net_device *dev) -{ - free_percpu(dev->tstats); -} - -static int stt_open(struct net_device *dev) -{ - struct stt_dev *stt = netdev_priv(dev); - struct net *net = stt->net; - struct stt_net *sn = net_generic(net, stt_net_id); - int err; - - err = stt_start(net); - if (err) - return err; - - err = tcp_sock_create4(net, stt->dst_port, &stt->sock); - if (err) - return err; - list_add_rcu(&stt->up_next, &sn->stt_up_list); - return 0; -} - -static int stt_stop(struct net_device *dev) -{ - struct stt_dev *stt_dev = netdev_priv(dev); - struct net *net = stt_dev->net; - - list_del_rcu(&stt_dev->up_next); - synchronize_net(); - tcp_sock_release(stt_dev->sock); - stt_dev->sock = NULL; - stt_cleanup(net); - return 0; -} - -static int __stt_change_mtu(struct net_device *dev, int new_mtu, bool strict) -{ - int max_mtu = IP_MAX_MTU - STT_HEADER_LEN - sizeof(struct iphdr) - - dev->hard_header_len; - - if (new_mtu < 68) - return -EINVAL; - - if (new_mtu > max_mtu) { - if (strict) - return -EINVAL; - - new_mtu = max_mtu; - } - - dev->mtu = new_mtu; - return 0; -} - -static int stt_change_mtu(struct net_device *dev, int new_mtu) -{ - return __stt_change_mtu(dev, new_mtu, true); -} - -int ovs_stt_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) -{ - struct ip_tunnel_info *info = skb_tunnel_info(skb); - struct stt_dev *stt_dev = netdev_priv(dev); - struct net *net = stt_dev->net; - __be16 dport = stt_dev->dst_port; - __be16 sport; - struct flowi4 fl4; - struct rtable *rt; - - if (ip_tunnel_info_af(info) != AF_INET) - return -EINVAL; - - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - rt = stt_get_rt(skb, dev, &fl4, &info->key, dport, sport); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - ip_rt_put(rt); - - info->key.u.ipv4.src = fl4.saddr; - info->key.tp_src = sport; - info->key.tp_dst = dport; - return 0; -} -EXPORT_SYMBOL_GPL(ovs_stt_fill_metadata_dst); - -static const struct net_device_ops stt_netdev_ops = { - .ndo_init = stt_init, - .ndo_uninit = stt_uninit, - .ndo_open = stt_open, - .ndo_stop = stt_stop, - .ndo_start_xmit = stt_dev_xmit, - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = stt_change_mtu, -#else - .ndo_change_mtu = stt_change_mtu, -#endif - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -#ifdef USE_UPSTREAM_TUNNEL -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = stt_fill_metadata_dst, -#endif -#endif -}; - -static void stt_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *drvinfo) -{ - strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version)); - strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver)); -} - -static const struct ethtool_ops stt_ethtool_ops = { - .get_drvinfo = stt_get_drvinfo, - .get_link = ethtool_op_get_link, -}; - -/* Info for udev, that this is a virtual tunnel endpoint */ -static struct device_type stt_type = { - .name = "stt", -}; - -/* Initialize the device structure. */ -static void stt_setup(struct net_device *dev) -{ - ether_setup(dev); - - dev->netdev_ops = &stt_netdev_ops; - dev->ethtool_ops = &stt_ethtool_ops; -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; -#endif - - SET_NETDEV_DEVTYPE(dev, &stt_type); - - dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL; - dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; - dev->features |= NETIF_F_RXCSUM; - dev->features |= NETIF_F_GSO_SOFTWARE; - - dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - -#ifdef USE_UPSTREAM_TUNNEL - netif_keep_dst(dev); -#endif - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; - eth_hw_addr_random(dev); -} - -static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = { - [IFLA_STT_PORT] = { .type = NLA_U16 }, -}; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int stt_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack __always_unused *extack) -#else -static int stt_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) - return -EINVAL; - - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) - return -EADDRNOTAVAIL; - } - - return 0; -} - -static struct stt_dev *find_dev(struct net *net, __be16 dst_port) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - struct stt_dev *dev; - - list_for_each_entry(dev, &sn->stt_list, next) { - if (dev->dst_port == dst_port) - return dev; - } - return NULL; -} - -static int stt_configure(struct net *net, struct net_device *dev, - __be16 dst_port) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - struct stt_dev *stt = netdev_priv(dev); - int err; - - stt->net = net; - stt->dev = dev; - - stt->dst_port = dst_port; - - if (find_dev(net, dst_port)) - return -EBUSY; - - err = __stt_change_mtu(dev, IP_MAX_MTU, false); - if (err) - return err; - - err = register_netdevice(dev); - if (err) - return err; - - list_add(&stt->next, &sn->stt_list); - return 0; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int stt_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack __always_unused *extack) -#else -static int stt_newlink(struct net *net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - __be16 dst_port = htons(STT_DST_PORT); - - if (data[IFLA_STT_PORT]) - dst_port = nla_get_be16(data[IFLA_STT_PORT]); - - return stt_configure(net, dev, dst_port); -} - -static void stt_dellink(struct net_device *dev, struct list_head *head) -{ - struct stt_dev *stt = netdev_priv(dev); - - list_del(&stt->next); - unregister_netdevice_queue(dev, head); -} - -static size_t stt_get_size(const struct net_device *dev) -{ - return nla_total_size(sizeof(__be32)); /* IFLA_STT_PORT */ -} - -static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - struct stt_dev *stt = netdev_priv(dev); - - if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -static struct rtnl_link_ops stt_link_ops __read_mostly = { - .kind = "stt", - .maxtype = IFLA_STT_MAX, - .policy = stt_policy, - .priv_size = sizeof(struct stt_dev), - .setup = stt_setup, - .validate = stt_validate, - .newlink = stt_newlink, - .dellink = stt_dellink, - .get_size = stt_get_size, - .fill_info = stt_fill_info, -}; - -struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name, - u8 name_assign_type, u16 dst_port) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - int err; - - memset(tb, 0, sizeof(tb)); - dev = rtnl_create_link(net, (char *) name, name_assign_type, - &stt_link_ops, tb); - if (IS_ERR(dev)) - return dev; - - err = stt_configure(net, dev, htons(dst_port)); - if (err) { - free_netdev(dev); - return ERR_PTR(err); - } - return dev; -} -EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb); - -static int stt_init_net(struct net *net) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - - INIT_LIST_HEAD(&sn->stt_list); - INIT_LIST_HEAD(&sn->stt_up_list); -#ifdef HAVE_NF_REGISTER_NET_HOOK - sn->nf_hook_reg_done = false; -#endif - return 0; -} - -static void stt_exit_net(struct net *net) -{ - struct stt_net *sn = net_generic(net, stt_net_id); - struct stt_dev *stt, *next; - struct net_device *dev, *aux; - LIST_HEAD(list); - -#ifdef HAVE_NF_REGISTER_NET_HOOK - /* Ideally this should be done from stt_stop(), But on some kernels - * nf-unreg operation needs RTNL-lock, which can cause deallock. - * So it is done from here. */ - if (sn->nf_hook_reg_done) - nf_unregister_net_hook(net, &nf_hook_ops); -#endif - - rtnl_lock(); - - /* gather any stt devices that were moved into this ns */ - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &stt_link_ops) - unregister_netdevice_queue(dev, &list); - - list_for_each_entry_safe(stt, next, &sn->stt_list, next) { - /* If stt->dev is in the same netns, it was already added - * to the stt by the previous loop. - */ - if (!net_eq(dev_net(stt->dev), net)) - unregister_netdevice_queue(stt->dev, &list); - } - - /* unregister the devices gathered above */ - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations stt_net_ops = { - .init = stt_init_net, - .exit = stt_exit_net, - .id = &stt_net_id, - .size = sizeof(struct stt_net), -}; - -int stt_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&stt_net_ops); - if (rc) - goto out1; - - rc = rtnl_link_register(&stt_link_ops); - if (rc) - goto out2; - -#ifdef HAVE_LIST_IN_NF_HOOK_OPS - INIT_LIST_HEAD(&nf_hook_ops.list); -#endif - pr_info("STT tunneling driver\n"); - return 0; -out2: - unregister_pernet_subsys(&stt_net_ops); -out1: - pr_err("Error while initializing STT %d\n", rc); - return rc; -} - -void stt_cleanup_module(void) -{ -#ifndef HAVE_NF_REGISTER_NET_HOOK - if (!list_empty(&nf_hook_ops.list)) - nf_unregister_hook(&nf_hook_ops); -#endif - rtnl_link_unregister(&stt_link_ops); - unregister_pernet_subsys(&stt_net_ops); -} -#endif diff --git a/datapath/linux/compat/udp.c b/datapath/linux/compat/udp.c deleted file mode 100644 index 38bf332db..000000000 --- a/datapath/linux/compat/udp.c +++ /dev/null @@ -1,46 +0,0 @@ -#include <linux/version.h> - -#ifndef USE_UPSTREAM_TUNNEL - -#include <net/udp.h> - -/* Function to set UDP checksum for an IPv4 UDP packet. This is intended - * for the simple case like when setting the checksum for a UDP tunnel. - */ -void rpl_udp_set_csum(bool nocheck, struct sk_buff *skb, - __be32 saddr, __be32 daddr, int len) -{ - struct udphdr *uh = udp_hdr(skb); - - - if (nocheck) { - uh->check = 0; - } else if (skb_is_gso(skb)) { - uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - uh->check = 0; - uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~udp_v4_check(len, saddr, daddr, 0); - } -} -EXPORT_SYMBOL_GPL(rpl_udp_set_csum); - -#endif /* Linux version < 3.16 */ - -#ifdef OVS_CHECK_UDP_TUNNEL_ZERO_CSUM -void rpl_udp6_csum_zero_error(struct sk_buff *skb) -{ - /* RFC 2460 section 8.1 says that we SHOULD log - * this error. Well, it is reasonable. - */ - net_dbg_ratelimited("IPv6: udp checksum is 0 for [%pI6c]:%u->[%pI6c]:%u\n", - &ipv6_hdr(skb)->saddr, ntohs(udp_hdr(skb)->source), - &ipv6_hdr(skb)->daddr, ntohs(udp_hdr(skb)->dest)); -} -#endif diff --git a/datapath/linux/compat/udp_tunnel.c b/datapath/linux/compat/udp_tunnel.c deleted file mode 100644 index 852069f62..000000000 --- a/datapath/linux/compat/udp_tunnel.c +++ /dev/null @@ -1,292 +0,0 @@ -#include <linux/version.h> - -#ifndef USE_UPSTREAM_TUNNEL - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/socket.h> -#include <linux/udp.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <net/ip_tunnels.h> -#include <net/udp.h> -#include <net/udp_tunnel.h> -#include <net/net_namespace.h> -#include <net/ip6_checksum.h> -#include <net/ip6_tunnel.h> - -#include "gso.h" - -int rpl_udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) -{ - int err; - struct socket *sock = NULL; - struct sockaddr_in udp_addr; - - err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; - - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->local_ip; - udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, - sizeof(udp_addr)); - if (err < 0) - goto error; - - if (cfg->peer_udp_port) { - udp_addr.sin_family = AF_INET; - udp_addr.sin_addr = cfg->peer_ip; - udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, (struct sockaddr *)&udp_addr, - sizeof(udp_addr), 0); - if (err < 0) - goto error; - } -#ifdef HAVE_SK_NO_CHECK_TX - sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; -#endif - *sockp = sock; - return 0; - -error: - if (sock) { - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); - } - *sockp = NULL; - return err; -} -EXPORT_SYMBOL(rpl_udp_sock_create4); - -int rpl_udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, - struct socket **sockp) -{ - struct sockaddr_in6 udp6_addr; - int err; - struct socket *sock = NULL; - - err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); - if (err < 0) - goto error; - - if (cfg->ipv6_v6only) { - int val = 1; - - err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - (char *) &val, sizeof(val)); - if (err < 0) - goto error; - } - - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, - sizeof(udp6_addr)); - if (err < 0) - goto error; - - if (cfg->peer_udp_port) { - udp6_addr.sin6_family = AF_INET6; - memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, - sizeof(udp6_addr.sin6_addr)); - udp6_addr.sin6_port = cfg->peer_udp_port; - err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, - sizeof(udp6_addr), 0); - } - if (err < 0) - goto error; - - udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); - udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); - - *sockp = sock; - return 0; - -error: - if (sock) { - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); - } - *sockp = NULL; - return err; -} -EXPORT_SYMBOL_GPL(rpl_udp_sock_create6); - -void rpl_setup_udp_tunnel_sock(struct net *net, struct socket *sock, - struct udp_tunnel_sock_cfg *cfg) -{ - struct sock *sk = sock->sk; - - /* Disable multicast loopback */ - inet_sk(sk)->mc_loop = 0; - - rcu_assign_sk_user_data(sk, cfg->sk_user_data); - - udp_sk(sk)->encap_type = cfg->encap_type; - udp_sk(sk)->encap_rcv = cfg->encap_rcv; - udp_sk(sk)->encap_destroy = cfg->encap_destroy; -#ifdef HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE - udp_sk(sk)->gro_receive = cfg->gro_receive; - udp_sk(sk)->gro_complete = cfg->gro_complete; -#endif - - udp_tunnel_encap_enable(sock); -} -EXPORT_SYMBOL_GPL(rpl_setup_udp_tunnel_sock); - -void rpl_udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, - struct sk_buff *skb, __be32 src, __be32 dst, - __u8 tos, __u8 ttl, __be16 df, __be16 src_port, - __be16 dst_port, bool xnet, bool nocheck) -{ - struct udphdr *uh; - - __skb_push(skb, sizeof(*uh)); - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - uh->dest = dst_port; - uh->source = src_port; - uh->len = htons(skb->len); - - udp_set_csum(nocheck, skb, src, dst, skb->len); - - iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); -} -EXPORT_SYMBOL_GPL(rpl_udp_tunnel_xmit_skb); - -void rpl_udp_tunnel_sock_release(struct socket *sock) -{ - rcu_assign_sk_user_data(sock->sk, NULL); - kernel_sock_shutdown(sock, SHUT_RDWR); - sock_release(sock); -} -EXPORT_SYMBOL_GPL(rpl_udp_tunnel_sock_release); - -#if IS_ENABLED(CONFIG_IPV6) - -#define udp_v6_check rpl_udp_v6_check -static __sum16 udp_v6_check(int len, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __wsum base) -{ - return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base); -} - -#define udp6_set_csum rpl_udp6_set_csum -static void udp6_set_csum(bool nocheck, struct sk_buff *skb, - const struct in6_addr *saddr, - const struct in6_addr *daddr, int len) -{ - struct udphdr *uh = udp_hdr(skb); - - if (nocheck) - uh->check = 0; - else if (skb_is_gso(skb)) - uh->check = ~udp_v6_check(len, saddr, daddr, 0); - else if (skb->ip_summed == CHECKSUM_PARTIAL) { - uh->check = 0; - uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb)); - if (uh->check == 0) - uh->check = CSUM_MANGLED_0; - } else { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - uh->check = ~udp_v6_check(len, saddr, daddr, 0); - } -} - -#define ip6_flow_hdr rpl_ip6_flow_hdr -static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, - __be32 flowlabel) -{ - *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; -} - -int rpl_udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, struct in6_addr *saddr, - struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, __be16 src_port, - __be16 dst_port, bool nocheck) -{ - struct udphdr *uh; - struct ipv6hdr *ip6h; - - __skb_push(skb, sizeof(*uh)); - skb_reset_transport_header(skb); - uh = udp_hdr(skb); - - uh->dest = dst_port; - uh->source = src_port; - - uh->len = htons(skb->len); - - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED - | IPSKB_REROUTED); - skb_dst_set(skb, dst); - - udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); - - __skb_push(skb, sizeof(*ip6h)); - skb_reset_network_header(skb); - ip6h = ipv6_hdr(skb); - ip6_flow_hdr(ip6h, prio, label); - ip6h->payload_len = htons(skb->len); - ip6h->nexthdr = IPPROTO_UDP; - ip6h->hop_limit = ttl; - ip6h->daddr = *daddr; - ip6h->saddr = *saddr; - - ip6tunnel_xmit(sk, skb, dev); - return 0; -} -#endif - -#ifndef USE_UPSTREAM_TUNNEL_GSO -void ovs_udp_gso(struct sk_buff *skb) -{ - int udp_offset = skb_transport_offset(skb); - struct udphdr *uh; - - uh = udp_hdr(skb); - uh->len = htons(skb->len - udp_offset); -} -EXPORT_SYMBOL_GPL(ovs_udp_gso); - -void ovs_udp_csum_gso(struct sk_buff *skb) -{ - int udp_offset = skb_transport_offset(skb); - - ovs_udp_gso(skb); - - if (!OVS_GSO_CB(skb)->ipv6) { - struct iphdr *iph = ip_hdr(skb); - - /* csum segment if tunnel sets skb with csum. The cleanest way - * to do this just to set it up from scratch. */ - udp_set_csum(false, skb, iph->saddr, iph->daddr, - skb->len - udp_offset); -#if IS_ENABLED(CONFIG_IPV6) - } else { - struct ipv6hdr *ip6h; - - ip6h = ipv6_hdr(skb); - udp6_set_csum(false, skb, &ip6h->saddr, &ip6h->daddr, - skb->len - udp_offset); -#endif - } -} -EXPORT_SYMBOL_GPL(ovs_udp_csum_gso); -#endif /* USE_UPSTREAM_TUNNEL_GSO */ - -#endif diff --git a/datapath/linux/compat/utils.c b/datapath/linux/compat/utils.c deleted file mode 100644 index a4a98ba65..000000000 --- a/datapath/linux/compat/utils.c +++ /dev/null @@ -1,112 +0,0 @@ -#include <linux/module.h> -#include <linux/jiffies.h> -#include <linux/kernel.h> -#include <linux/ctype.h> -#include <linux/inet.h> -#include <linux/mm.h> -#include <linux/net.h> -#include <net/checksum.h> -#include <net/ip.h> -#include <linux/string.h> -#include <linux/types.h> -#include <linux/percpu.h> -#include <linux/init.h> -#include <linux/ratelimit.h> - -#include <net/sock.h> - -#include <asm/byteorder.h> -#include <asm/uaccess.h> - -#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) - -bool rpl___net_get_random_once(void *buf, int nbytes, bool *done, - atomic_t *done_key) -{ - static DEFINE_SPINLOCK(lock); - unsigned long flags; - - spin_lock_irqsave(&lock, flags); - if (*done) { - spin_unlock_irqrestore(&lock, flags); - return false; - } - - get_random_bytes(buf, nbytes); - *done = true; - spin_unlock_irqrestore(&lock, flags); - - atomic_set(done_key, 1); - - return true; -} -EXPORT_SYMBOL_GPL(rpl___net_get_random_once); - -#endif - -#ifdef NEED_ALLOC_PERCPU_GFP -void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) -{ - void __percpu *p; - int i; - - /* older kernel do not allow all GFP flags, specifically atomic - * allocation. - */ - if (gfp & ~(GFP_KERNEL | __GFP_ZERO)) - return NULL; - p = __alloc_percpu(size, align); - if (!p) - return p; - - if (!(gfp & __GFP_ZERO)) - return p; - - for_each_possible_cpu(i) { - void *d; - - d = per_cpu_ptr(p, i); - memset(d, 0, size); - } - return p; -} -#endif - -#ifndef HAVE_NLA_PUT_64BIT -int rpl_nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, - const void *data, int padattr) -{ - size_t len; - - if (nla_need_padding_for_64bit(skb)) - len = nla_total_size_64bit(attrlen); - else - len = nla_total_size(attrlen); - if (unlikely(skb_tailroom(skb) < len)) - return -EMSGSIZE; - - __nla_put_64bit(skb, attrtype, attrlen, data, padattr); - return 0; -} -EXPORT_SYMBOL_GPL(rpl_nla_put_64bit); - -void rpl___nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, - const void *data, int padattr) -{ - struct nlattr *nla; - - nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); - memcpy(nla_data(nla), data, attrlen); -} -EXPORT_SYMBOL_GPL(rpl___nla_put_64bit); - -struct nlattr *rpl___nla_reserve_64bit(struct sk_buff *skb, int attrtype, - int attrlen, int padattr) -{ - if (nla_need_padding_for_64bit(skb)) - nla_align_64bit(skb, padattr); - - return __nla_reserve(skb, attrtype, attrlen); -} -EXPORT_SYMBOL_GPL(rpl___nla_reserve_64bit); -#endif diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c deleted file mode 100644 index e65d955e9..000000000 --- a/datapath/linux/compat/vxlan.c +++ /dev/null @@ -1,2382 +0,0 @@ -/* - * VXLAN: Virtual eXtensible Local Area Network - * - * Copyright (c) 2012-2013 Vyatta Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/skbuff.h> -#include <linux/rculist.h> -#include <linux/netdevice.h> -#include <linux/netdev_features.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/udp.h> -#include <linux/igmp.h> -#include <linux/etherdevice.h> -#include <linux/if_ether.h> -#include <linux/if_vlan.h> -#include <linux/hash.h> -#include <linux/ethtool.h> -#include <net/arp.h> -#include <net/dst_metadata.h> -#include <net/ndisc.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/icmp.h> -#include <net/udp.h> -#include <net/udp_tunnel.h> -#include <net/rtnetlink.h> -#include <net/route.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/protocol.h> - -#if IS_ENABLED(CONFIG_IPV6) -#include <net/ipv6.h> -#include <net/addrconf.h> -#include <net/ip6_tunnel.h> -#include <net/ip6_checksum.h> -#include <net/ip6_route.h> -#endif - -#include <net/tun_proto.h> -#include <net/vxlan.h> -#include "gso.h" -#include "vport-netdev.h" -#include "compat.h" - -#ifndef USE_UPSTREAM_TUNNEL -#define VXLAN_VERSION "0.1" - -#define PORT_HASH_BITS 8 -#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) -#define FDB_AGE_DEFAULT 300 /* 5 min */ -#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ - -/* UDP port for VXLAN traffic. - * The IANA assigned port is 4789, but the Linux default is 8472 - * for compatibility with early adopters. - */ -static unsigned short vxlan_port __read_mostly = 8472; -module_param_named(udp_port, vxlan_port, ushort, 0444); -MODULE_PARM_DESC(udp_port, "Destination UDP port"); - -static int vxlan_net_id; -static struct rtnl_link_ops vxlan_link_ops; - -static const u8 all_zeros_mac[ETH_ALEN + 2]; - -static int vxlan_sock_add(struct vxlan_dev *vxlan); - -/* per-network namespace private data for this module */ -struct vxlan_net { - struct list_head vxlan_list; - struct hlist_head sock_list[PORT_HASH_SIZE]; - spinlock_t sock_lock; -}; - -/* Forwarding table entry */ -struct vxlan_fdb { - struct hlist_node hlist; /* linked list of entries */ - struct rcu_head rcu; - unsigned long updated; /* jiffies */ - unsigned long used; - struct list_head remotes; - u8 eth_addr[ETH_ALEN]; - u16 state; /* see ndm_state */ - u8 flags; /* see ndm_flags */ -}; - -/* salt for hash table */ -static u32 vxlan_salt __read_mostly; - -static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) -{ - return vs->flags & VXLAN_F_COLLECT_METADATA || - ip_tunnel_collect_metadata(); -} - -#if IS_ENABLED(CONFIG_IPV6) -static inline -bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) -{ - if (a->sa.sa_family != b->sa.sa_family) - return false; - if (a->sa.sa_family == AF_INET6) - return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); - else - return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; -} - -static inline bool vxlan_addr_any(const union vxlan_addr *ipa) -{ - if (ipa->sa.sa_family == AF_INET6) - return ipv6_addr_any(&ipa->sin6.sin6_addr); - else - return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); -} - -static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) -{ - if (ipa->sa.sa_family == AF_INET6) - return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); - else - return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); -} - -#else /* !CONFIG_IPV6 */ - -static inline -bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) -{ - return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; -} - -static inline bool vxlan_addr_any(const union vxlan_addr *ipa) -{ - return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); -} - -static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) -{ - return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); -} -#endif - -/* Virtual Network hash table head */ -static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) -{ - return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; -} - -/* Socket hash table head */ -static inline struct hlist_head *vs_head(struct net *net, __be16 port) -{ - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - - return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; -} - -/* Find VXLAN socket based on network namespace, address family and UDP port - * and enabled unshareable flags. - */ -static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, - __be16 port, u32 flags) -{ - struct vxlan_sock *vs; - - flags &= VXLAN_F_RCV_FLAGS; - - hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { - if (inet_sk(vs->sock->sk)->inet_sport == port && - vxlan_get_sk_family(vs) == family && - vs->flags == flags) - return vs; - } - return NULL; -} - -static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, __be32 vni) -{ - struct vxlan_dev *vxlan; - - /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_COLLECT_METADATA) - vni = 0; - - hlist_for_each_entry_rcu(vxlan, vni_head(vs, vni), hlist) { - if (vxlan->default_dst.remote_vni == vni) - return vxlan; - } - - return NULL; -} - -/* Look up VNI in a per net namespace table */ -static struct vxlan_dev *vxlan_find_vni(struct net *net, __be32 vni, - sa_family_t family, __be16 port, - u32 flags) -{ - struct vxlan_sock *vs; - - vs = vxlan_find_sock(net, family, port, flags); - if (!vs) - return NULL; - - return vxlan_vs_find_vni(vs, vni); -} - -static int vxlan_fdb_create(struct vxlan_dev *vxlan, - const u8 *mac, union vxlan_addr *ip, - __u16 state, __u16 flags, - __be16 port, __be32 vni, __u32 ifindex, - __u8 ndm_flags) -{ - return -EINVAL; -} - -static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) -{ - -} - -static inline size_t vxlan_nlmsg_size(void) -{ - return NLMSG_ALIGN(sizeof(struct ndmsg)) - + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ - + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ - + nla_total_size(sizeof(__be16)) /* NDA_PORT */ - + nla_total_size(sizeof(__be32)) /* NDA_VNI */ - + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ - + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ - + nla_total_size(sizeof(struct nda_cacheinfo)); -} - -#ifdef HAVE_UDP_OFFLOAD -#ifdef HAVE_NETIF_F_GSO_TUNNEL_REMCSUM - -static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, - unsigned int off, - struct vxlanhdr *vh, size_t hdrlen, - __be32 vni_field, - struct gro_remcsum *grc, - bool nopartial) -{ - size_t start, offset; - - if (skb->remcsum_offload) - return vh; - - if (!NAPI_GRO_CB(skb)->csum_valid) - return NULL; - - start = vxlan_rco_start(vni_field); - offset = start + vxlan_rco_offset(vni_field); - - vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, - start, offset, grc, nopartial); - - skb->remcsum_offload = 1; - - return vh; -} -#else -static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, - unsigned int off, - struct vxlanhdr *vh, size_t hdrlen, - u32 data, struct gro_remcsum *grc, - bool nopartial) -{ - return NULL; -} -#endif - -#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb) -#else -static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -#endif -{ -#ifdef HAVE_UDP_OFFLOAD_ARG_UOFF - struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock, - udp_offloads); -#else - struct vxlan_sock *vs = NULL; -#endif - struct sk_buff *p, **pp = NULL; - struct vxlanhdr *vh, *vh2; - unsigned int hlen, off_vx; - int flush = 1; - __be32 flags; - struct gro_remcsum grc; - - skb_gro_remcsum_init(&grc); - - off_vx = skb_gro_offset(skb); - hlen = off_vx + sizeof(*vh); - vh = skb_gro_header_fast(skb, off_vx); - if (skb_gro_header_hard(skb, hlen)) { - vh = skb_gro_header_slow(skb, hlen, off_vx); - if (unlikely(!vh)) - goto out; - } - - skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); - - flags = vh->vx_flags; - - if ((flags & VXLAN_HF_RCO) && vs && (vs->flags & VXLAN_F_REMCSUM_RX)) { - vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), - vh->vx_vni, &grc, - !!(vs->flags & - VXLAN_F_REMCSUM_NOPARTIAL)); - - if (!vh) - goto out; - } - - skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - vh2 = (struct vxlanhdr *)(p->data + off_vx); - if (vh->vx_flags != vh2->vx_flags || - vh->vx_vni != vh2->vx_vni) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - pp = eth_gro_receive(head, skb); - flush = 0; - -out: - skb_gro_remcsum_cleanup(skb, &grc); - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) -#else -static int vxlan_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -#endif -{ - /* Sets 'skb->inner_mac_header' since we are always called with - * 'skb->encapsulation' set. - */ - udp_tunnel_gro_complete(skb, nhoff); - - return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); -} -#endif - -/* Notify netdevs that UDP port started listening */ -static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) -{ - struct net_device *dev; - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = vxlan_get_sk_family(vs); - - - if (sa_family == AF_INET) { - int err; - - err = udp_add_offload(net, &vs->udp_offloads); - if (err) - pr_warn("vxlan: udp_add_offload failed with status %d\n", err); - } - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { -#ifdef HAVE_NDO_ADD_VXLAN_PORT - __be16 port = inet_sk(sk)->inet_sport; - - if (dev->netdev_ops->ndo_add_vxlan_port) - dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, - port); -#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) - struct udp_tunnel_info ti; - if (vs->flags & VXLAN_F_GPE) - ti.type = UDP_TUNNEL_TYPE_VXLAN_GPE; - else - ti.type = UDP_TUNNEL_TYPE_VXLAN; - ti.sa_family = sa_family; - ti.port = inet_sk(sk)->inet_sport; - - if (dev->netdev_ops->ndo_udp_tunnel_add) - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); -#endif - } - rcu_read_unlock(); -} - -/* Notify netdevs that UDP port is no more listening */ -static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) -{ - struct net_device *dev; - struct sock *sk = vs->sock->sk; - struct net *net = sock_net(sk); - sa_family_t sa_family = vxlan_get_sk_family(vs); - - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { -#ifdef HAVE_NDO_ADD_VXLAN_PORT - __be16 port = inet_sk(sk)->inet_sport; - - if (dev->netdev_ops->ndo_del_vxlan_port) - dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family, - port); -#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) - struct udp_tunnel_info ti; - if (vs->flags & VXLAN_F_GPE) - ti.type = UDP_TUNNEL_TYPE_VXLAN_GPE; - else - ti.type = UDP_TUNNEL_TYPE_VXLAN; - ti.port = inet_sk(sk)->inet_sport; - ti.sa_family = sa_family; - - if (dev->netdev_ops->ndo_udp_tunnel_del) - dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); -#endif - } - rcu_read_unlock(); - - if (sa_family == AF_INET) { - udp_del_offload(&vs->udp_offloads); - } -} - -/* See if multicast group is already in use by other ID */ -static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) -{ - struct vxlan_dev *vxlan; - struct vxlan_sock *sock4; - struct vxlan_sock *sock6 = NULL; - unsigned short family = dev->default_dst.remote_ip.sa.sa_family; - - sock4 = rtnl_dereference(dev->vn4_sock); - - /* The vxlan_sock is only used by dev, leaving group has - * no effect on other vxlan devices. - */ - if (family == AF_INET && sock4 && atomic_read(&sock4->refcnt) == 1) - return false; -#if IS_ENABLED(CONFIG_IPV6) - sock6 = rtnl_dereference(dev->vn6_sock); - if (family == AF_INET6 && sock6 && atomic_read(&sock6->refcnt) == 1) - return false; -#endif - - list_for_each_entry(vxlan, &vn->vxlan_list, next) { - if (!netif_running(vxlan->dev) || vxlan == dev) - continue; - - if (family == AF_INET && - rtnl_dereference(vxlan->vn4_sock) != sock4) - continue; -#if IS_ENABLED(CONFIG_IPV6) - if (family == AF_INET6 && - rtnl_dereference(vxlan->vn6_sock) != sock6) - continue; -#endif - - if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, - &dev->default_dst.remote_ip)) - continue; - - if (vxlan->default_dst.remote_ifindex != - dev->default_dst.remote_ifindex) - continue; - - return true; - } - - return false; -} - -static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) -{ - struct vxlan_net *vn; - - if (!vs) - return false; - if (!atomic_dec_and_test(&vs->refcnt)) - return false; - - vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); - spin_lock(&vn->sock_lock); - hlist_del_rcu(&vs->hlist); - vxlan_notify_del_rx_port(vs); - spin_unlock(&vn->sock_lock); - - return true; -} - -static void vxlan_sock_release(struct vxlan_dev *vxlan) -{ - struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); -#if IS_ENABLED(CONFIG_IPV6) - struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); - - rcu_assign_pointer(vxlan->vn6_sock, NULL); -#endif - - rcu_assign_pointer(vxlan->vn4_sock, NULL); - synchronize_net(); - - if (__vxlan_sock_release_prep(sock4)) { - udp_tunnel_sock_release(sock4->sock); - kfree(sock4); - } - -#if IS_ENABLED(CONFIG_IPV6) - if (__vxlan_sock_release_prep(sock6)) { - udp_tunnel_sock_release(sock6->sock); - kfree(sock6); - } -#endif -} - -/* Update multicast group membership when first VNI on - * multicast address is brought up - */ -static int vxlan_igmp_join(struct vxlan_dev *vxlan) -{ - return -EINVAL; -} - -/* Inverse of vxlan_igmp_join when last VNI is brought down */ -static int vxlan_igmp_leave(struct vxlan_dev *vxlan) -{ - return -EINVAL; -} - -static bool vxlan_remcsum(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags) -{ -#ifndef USE_UPSTREAM_TUNNEL - return false; -#else - size_t start, offset; - - if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) - goto out; - - start = vxlan_rco_start(unparsed->vx_vni); - offset = start + vxlan_rco_offset(unparsed->vx_vni); - - if (!pskb_may_pull(skb, offset + sizeof(u16))) - return false; - - skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, - !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); -out: - unparsed->vx_flags &= ~VXLAN_HF_RCO; - unparsed->vx_vni &= VXLAN_VNI_MASK; - return true; -#endif -} - -static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, - struct sk_buff *skb, u32 vxflags, - struct vxlan_metadata *md) -{ - struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; - struct metadata_dst *tun_dst; - - if (!(unparsed->vx_flags & VXLAN_HF_GBP)) - goto out; - - md->gbp = ntohs(gbp->policy_id); - - tun_dst = (struct metadata_dst *)skb_dst(skb); - if (tun_dst) { - tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; - tun_dst->u.tun_info.options_len = sizeof(*md); - } - if (gbp->dont_learn) - md->gbp |= VXLAN_GBP_DONT_LEARN; - - if (gbp->policy_applied) - md->gbp |= VXLAN_GBP_POLICY_APPLIED; - - /* In flow-based mode, GBP is carried in dst_metadata */ - if (!(vxflags & VXLAN_F_COLLECT_METADATA)) - skb->mark = md->gbp; -out: - unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; -} - -static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed, - __be16 *protocol, - struct sk_buff *skb, u32 vxflags) -{ - struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed; - - /* Need to have Next Protocol set for interfaces in GPE mode. */ - if (!gpe->np_applied) - return false; - /* "The initial version is 0. If a receiver does not support the - * version indicated it MUST drop the packet. - */ - if (gpe->version != 0) - return false; - /* "When the O bit is set to 1, the packet is an OAM packet and OAM - * processing MUST occur." However, we don't implement OAM - * processing, thus drop the packet. - */ - if (gpe->oam_flag) - return false; - - *protocol = tun_p_to_eth_p(gpe->next_protocol); - if (!*protocol) - return false; - - unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS; - return true; -} - -static bool vxlan_set_mac(struct vxlan_dev *vxlan, - struct vxlan_sock *vs, - struct sk_buff *skb) -{ - return true; -} - -static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, - struct sk_buff *skb) -{ - int err = 0; - - if (vxlan_get_sk_family(vs) == AF_INET) - err = IP_ECN_decapsulate(oiph, skb); -#if IS_ENABLED(CONFIG_IPV6) - else - err = IP6_ECN_decapsulate(oiph, skb); -#endif - return err <= 1; -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) -{ - union { - struct metadata_dst dst; - char buf[sizeof(struct metadata_dst) + sizeof(struct vxlan_metadata)]; - } buf; - - struct pcpu_sw_netstats *stats; - struct vxlan_dev *vxlan; - struct vxlan_sock *vs; - struct vxlanhdr unparsed; - struct vxlan_metadata _md; - struct vxlan_metadata *md = &_md; - __be16 protocol = htons(ETH_P_TEB); - bool raw_proto = false; - void *oiph; - - /* Need UDP and VXLAN header to be present */ - if (!pskb_may_pull(skb, VXLAN_HLEN)) - goto drop; - - unparsed = *vxlan_hdr(skb); - /* VNI flag always required to be set */ - if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { - netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", - ntohl(vxlan_hdr(skb)->vx_flags), - ntohl(vxlan_hdr(skb)->vx_vni)); - /* Return non vxlan pkt */ - goto drop; - } - - unparsed.vx_flags &= ~VXLAN_HF_VNI; - unparsed.vx_vni &= ~VXLAN_VNI_MASK; - - vs = rcu_dereference_sk_user_data(sk); - if (!vs) - goto drop; - -#if IS_ENABLED(CONFIG_IPV6) -#ifdef OVS_CHECK_UDP_TUNNEL_ZERO_CSUM - if (vxlan_get_sk_family(vs) == AF_INET6 && - !udp_hdr(skb)->check && - !(vs->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) { - udp6_csum_zero_error(skb); - goto drop; - } -#endif -#endif - vxlan = vxlan_vs_find_vni(vs, vxlan_vni(vxlan_hdr(skb)->vx_vni)); - if (!vxlan) - goto drop; - - /* For backwards compatibility, only allow reserved fields to be - * used by VXLAN extensions if explicitly requested. - */ - if (vs->flags & VXLAN_F_GPE) { - if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags)) - goto drop; - raw_proto = true; - } - - if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, - !net_eq(vxlan->net, dev_net(vxlan->dev)))) - goto drop; - - if (vxlan_collect_metadata(vs)) { - __be32 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); - struct metadata_dst *tun_dst; - - tun_dst = &buf.dst; - ovs_udp_tun_rx_dst(tun_dst, skb, - vxlan_get_sk_family(vs), TUNNEL_KEY, - vxlan_vni_to_tun_id(vni), sizeof(*md)); - - if (!tun_dst) - goto drop; - - md = ip_tunnel_info_opts(&tun_dst->u.tun_info); - - ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst); - } else { - memset(md, 0, sizeof(*md)); - } - - if (vs->flags & VXLAN_F_REMCSUM_RX) - if (!vxlan_remcsum(&unparsed, skb, vs->flags)) - goto drop; - - if (vs->flags & VXLAN_F_GBP) - vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); - /* Note that GBP and GPE can never be active together. This is - * ensured in vxlan_dev_configure. - */ - - if (unparsed.vx_flags || unparsed.vx_vni) { - /* If there are any unprocessed flags remaining treat - * this as a malformed packet. This behavior diverges from - * VXLAN RFC (RFC7348) which stipulates that bits in reserved - * in reserved fields are to be ignored. The approach here - * maintains compatibility with previous stack code, and also - * is more robust and provides a little more security in - * adding extensions to VXLAN. - */ - goto drop; - } - - if (!raw_proto) { - if (!vxlan_set_mac(vxlan, vs, skb)) - goto drop; - skb_reset_mac_header(skb); - skb->protocol = eth_type_trans(skb, vxlan->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - } else { - skb_reset_mac_header(skb); - skb->dev = vxlan->dev; - skb->pkt_type = PACKET_HOST; - } - - oiph = skb_network_header(skb); - skb_reset_network_header(skb); - - if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; - goto drop; - } - - stats = this_cpu_ptr(vxlan->dev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - - netdev_port_receive(skb, skb_tunnel_info(skb)); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; -} - -static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, - struct vxlan_metadata *md) -{ - struct vxlanhdr_gbp *gbp; - - if (!md->gbp) - return; - - gbp = (struct vxlanhdr_gbp *)vxh; - vxh->vx_flags |= VXLAN_HF_GBP; - - if (md->gbp & VXLAN_GBP_DONT_LEARN) - gbp->dont_learn = 1; - - if (md->gbp & VXLAN_GBP_POLICY_APPLIED) - gbp->policy_applied = 1; - - gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); -} - -static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags, - __be16 protocol) -{ - struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; - - gpe->np_applied = 1; - gpe->next_protocol = tun_p_from_eth_p(protocol); - if (!gpe->next_protocol) - return -EPFNOSUPPORT; - return 0; -} - -static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, - int iphdr_len, __be32 vni, - struct vxlan_metadata *md, u32 vxflags, - bool udp_sum) -{ - void (*fix_segment)(struct sk_buff *); - struct vxlanhdr *vxh; - int min_headroom; - int err; - int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; - __be16 inner_protocol = htons(ETH_P_TEB); - - if ((vxflags & VXLAN_F_REMCSUM_TX) && - skb->ip_summed == CHECKSUM_PARTIAL) { - int csum_start = skb_checksum_start_offset(skb); - - if (csum_start <= VXLAN_MAX_REMCSUM_START && - !(csum_start & VXLAN_RCO_SHIFT_MASK) && - (skb->csum_offset == offsetof(struct udphdr, check) || - skb->csum_offset == offsetof(struct tcphdr, check))) - type |= SKB_GSO_TUNNEL_REMCSUM; - } - - min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len - + VXLAN_HLEN + iphdr_len - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - /* Need space for new headers (invalidates iph ptr) */ - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) - goto out_free; - - if (skb_vlan_tag_present(skb)) - skb = __vlan_hwaccel_push_inside(skb); - if (WARN_ON(!skb)) - return -ENOMEM; - - type |= udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; -#ifndef USE_UPSTREAM_TUNNEL_GSO - fix_segment = !udp_sum ? ovs_udp_gso : ovs_udp_csum_gso; -#else - fix_segment = NULL; -#endif - err = ovs_iptunnel_handle_offloads(skb, type, fix_segment); - if (err) - goto out_free; - - vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); - vxh->vx_flags = VXLAN_HF_VNI; - vxh->vx_vni = vxlan_vni_field(vni); - - if (type & SKB_GSO_TUNNEL_REMCSUM) { - unsigned int start; - - start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); - vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); - vxh->vx_flags |= VXLAN_HF_RCO; - - if (!skb_is_gso(skb)) { - skb->ip_summed = CHECKSUM_NONE; - skb->encapsulation = 0; - } - } - - if (vxflags & VXLAN_F_GBP) - vxlan_build_gbp_hdr(vxh, vxflags, md); - if (vxflags & VXLAN_F_GPE) { - err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol); - if (err < 0) - goto out_free; - inner_protocol = skb->protocol; - } - - ovs_skb_set_inner_protocol(skb, inner_protocol); - return 0; - -out_free: - kfree_skb(skb); - return err; -} - -static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, - struct sk_buff *skb, int oif, u8 tos, - __be32 daddr, __be32 *saddr, - __be16 dport, __be16 sport, - struct dst_cache *dst_cache, - const struct ip_tunnel_info *info) -{ - bool use_cache = (dst_cache && ip_tunnel_dst_cache_usable(skb, info)); - struct rtable *rt = NULL; - struct flowi4 fl4; - - if (tos && !info) - use_cache = false; - if (use_cache) { - rt = dst_cache_get_ip4(dst_cache, saddr); - if (rt) - return rt; - } - - memset(&fl4, 0, sizeof(fl4)); - fl4.flowi4_oif = oif; - fl4.flowi4_tos = RT_TOS(tos); - fl4.flowi4_mark = skb->mark; - fl4.flowi4_proto = IPPROTO_UDP; - fl4.daddr = daddr; - fl4.saddr = *saddr; - fl4.fl4_dport = dport; - fl4.fl4_sport = sport; - - rt = ip_route_output_key(vxlan->net, &fl4); - if (!IS_ERR(rt)) { - *saddr = fl4.saddr; - if (use_cache) - dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); - } - return rt; -} - -#if IS_ENABLED(CONFIG_IPV6) -static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, - struct sk_buff *skb, int oif, u8 tos, - __be32 label, - const struct in6_addr *daddr, - struct in6_addr *saddr, - __be16 dport, __be16 sport, - struct dst_cache *dst_cache, - const struct ip_tunnel_info *info) -{ - struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); - bool use_cache = (dst_cache && ip_tunnel_dst_cache_usable(skb, info)); - struct dst_entry *ndst; - struct flowi6 fl6; -#if !defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) || \ - !defined(HAVE_IPV6_DST_LOOKUP_FLOW) - int err; -#endif - - if (!sock6) - return ERR_PTR(-EIO); - - if (tos && !info) - use_cache = false; - if (use_cache) { - ndst = dst_cache_get_ip6(dst_cache, saddr); - if (ndst) - return ndst; - } - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; - fl6.daddr = *daddr; - fl6.saddr = *saddr; - fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label); - fl6.flowi6_mark = skb->mark; - fl6.flowi6_proto = IPPROTO_UDP; - fl6.fl6_dport = dport; - fl6.fl6_sport = sport; - -#if defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) && defined(HAVE_IPV6_DST_LOOKUP_FLOW) -#ifdef HAVE_IPV6_DST_LOOKUP_FLOW_NET - ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, - &fl6, NULL); -#else - ndst = ipv6_stub->ipv6_dst_lookup_flow(sock6->sock->sk, &fl6, NULL); -#endif - if (unlikely(IS_ERR(ndst))) { -#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW_NET) - err = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, - &ndst, &fl6); -#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW) - err = ipv6_stub->ipv6_dst_lookup_flow(sock6->sock->sk, &ndst, &fl6); -#elif defined(HAVE_IPV6_DST_LOOKUP_NET) - err = ipv6_stub->ipv6_dst_lookup(vxlan->net, sock6->sock->sk, - &ndst, &fl6); -#elif defined(HAVE_IPV6_STUB) - err = ipv6_stub->ipv6_dst_lookup(vxlan->vn6_sock->sock->sk, - &ndst, &fl6); -#else - err = ip6_dst_lookup(vxlan->vn6_sock->sock->sk, &ndst, &fl6); -#endif -#if defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) && defined(HAVE_IPV6_DST_LOOKUP_FLOW) - return ERR_PTR(-ENETUNREACH); - } -#else - if (err < 0) - return ERR_PTR(err); -#endif - - *saddr = fl6.saddr; - if (use_cache) - dst_cache_set_ip6(dst_cache, ndst, saddr); - return ndst; -} -#endif - -/* Bypass encapsulation if the destination is local */ -static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, - struct vxlan_dev *dst_vxlan) -{ - skb->dev->stats.rx_dropped++; - kfree_skb(skb); -} - -static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, - struct vxlan_rdst *rdst, bool did_rsc) -{ - struct dst_cache *dst_cache; - struct ip_tunnel_info *info; - struct vxlan_dev *vxlan = netdev_priv(dev); - struct sock *sk; - struct rtable *rt = NULL; - const struct iphdr *old_iph; - union vxlan_addr *dst; - union vxlan_addr remote_ip, local_ip; - union vxlan_addr *src; - struct vxlan_metadata _md; - struct vxlan_metadata *md = &_md; - __be16 src_port = 0, dst_port; - __be32 vni, label; - __be16 df = 0; - __u8 tos, ttl; - int err; - u32 flags = vxlan->flags; - bool udp_sum = false; - bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); - - info = skb_tunnel_info(skb); - - if (rdst) { - dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; - vni = rdst->remote_vni; - dst = &rdst->remote_ip; - src = &vxlan->cfg.saddr; - dst_cache = &rdst->dst_cache; - } else { - if (!info) { - WARN_ONCE(1, "%s: Missing encapsulation instructions\n", - dev->name); - goto drop; - } - dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; - vni = vxlan_tun_id_to_vni(info->key.tun_id); - remote_ip.sa.sa_family = ip_tunnel_info_af(info); - if (remote_ip.sa.sa_family == AF_INET) { - remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; - local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src; - } else { - remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; - local_ip.sin6.sin6_addr = info->key.u.ipv6.src; - } - dst = &remote_ip; - src = &local_ip; - dst_cache = &info->dst_cache; - } - - if (vxlan_addr_any(dst)) { - if (did_rsc) { - /* short-circuited back to local bridge */ - vxlan_encap_bypass(skb, vxlan, vxlan); - return; - } - goto drop; - } - - old_iph = ip_hdr(skb); - - ttl = vxlan->cfg.ttl; - if (!ttl && vxlan_addr_multicast(dst)) - ttl = 1; - - tos = vxlan->cfg.tos; - if (tos == 1) - tos = ip_tunnel_get_dsfield(old_iph, skb); - - label = vxlan->cfg.label; - src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, - vxlan->cfg.port_max, true); - - if (info) { - ttl = info->key.ttl; - tos = info->key.tos; - label = info->key.label; - udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); - - if (info->options_len && - info->key.tun_flags & TUNNEL_VXLAN_OPT) - md = ip_tunnel_info_opts(info); - } else { - md->gbp = skb->mark; - } - - if (dst->sa.sa_family == AF_INET) { - struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); - - if (!sock4) - goto drop; - sk = sock4->sock->sk; - - rt = vxlan_get_route(vxlan, skb, - rdst ? rdst->remote_ifindex : 0, tos, - dst->sin.sin_addr.s_addr, - &src->sin.sin_addr.s_addr, - dst_port, src_port, - dst_cache, info); - if (IS_ERR(rt)) { - netdev_dbg(dev, "no route to %pI4\n", - &dst->sin.sin_addr.s_addr); - dev->stats.tx_carrier_errors++; - goto tx_error; - } - - if (rt->dst.dev == dev) { - netdev_dbg(dev, "circular route to %pI4\n", - &dst->sin.sin_addr.s_addr); - dev->stats.collisions++; - goto rt_tx_error; - } - - /* Bypass encapsulation if the destination is local */ - if (!info && rt->rt_flags & RTCF_LOCAL && - !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { - struct vxlan_dev *dst_vxlan; - - ip_rt_put(rt); - dst_vxlan = vxlan_find_vni(vxlan->net, vni, - dst->sa.sa_family, dst_port, - vxlan->flags); - if (!dst_vxlan) - goto tx_error; - vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return; - } - - if (!info) - udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); - else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) - df = htons(IP_DF); - - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr), - vni, md, flags, udp_sum); - if (err < 0) - goto xmit_tx_error; - - udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr, - dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum); -#if IS_ENABLED(CONFIG_IPV6) - } else { - struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); - struct dst_entry *ndst; - u32 rt6i_flags; - - if (!sock6) - goto drop; - sk = sock6->sock->sk; - - ndst = vxlan6_get_route(vxlan, skb, - rdst ? rdst->remote_ifindex : 0, tos, - label, &dst->sin6.sin6_addr, - &src->sin6.sin6_addr, - dst_port, src_port, - dst_cache, info); - if (IS_ERR(ndst)) { - netdev_dbg(dev, "no route to %pI6\n", - &dst->sin6.sin6_addr); - dev->stats.tx_carrier_errors++; - goto tx_error; - } - - if (ndst->dev == dev) { - netdev_dbg(dev, "circular route to %pI6\n", - &dst->sin6.sin6_addr); - dst_release(ndst); - dev->stats.collisions++; - goto tx_error; - } - - /* Bypass encapsulation if the destination is local */ - rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; - if (!info && rt6i_flags & RTF_LOCAL && - !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { - struct vxlan_dev *dst_vxlan; - - dst_release(ndst); - dst_vxlan = vxlan_find_vni(vxlan->net, vni, - dst->sa.sa_family, dst_port, - vxlan->flags); - if (!dst_vxlan) - goto tx_error; - vxlan_encap_bypass(skb, vxlan, dst_vxlan); - return; - } - - if (!info) - udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); - - tos = ip_tunnel_ecn_encap(tos, old_iph, skb); - ttl = ttl ? : ip6_dst_hoplimit(ndst); - skb_scrub_packet(skb, xnet); - err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), - vni, md, flags, udp_sum); - if (err < 0) { - dst_release(ndst); - return; - } - udp_tunnel6_xmit_skb(ndst, sk, skb, dev, - &src->sin6.sin6_addr, - &dst->sin6.sin6_addr, tos, ttl, - label, src_port, dst_port, !udp_sum); -#endif - } - - return; - -drop: - dev->stats.tx_dropped++; - goto tx_free; - -xmit_tx_error: - /* skb is already freed. */ - skb = NULL; -rt_tx_error: - ip_rt_put(rt); -tx_error: - dev->stats.tx_errors++; -tx_free: - dev_kfree_skb(skb); -} - -/* Transmit local packets over Vxlan - * - * Outer IP header inherits ECN and DF from inner header. - * Outer UDP destination is the VXLAN assigned port. - * source port is based on hash of flow - */ -netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct vxlan_dev *vxlan = netdev_priv(dev); - const struct ip_tunnel_info *info; - - info = skb_tunnel_info(skb); - skb_reset_mac_header(skb); - if (vxlan->flags & VXLAN_F_COLLECT_METADATA) { - if (info && info->mode & IP_TUNNEL_INFO_TX) { - vxlan_xmit_one(skb, dev, NULL, false); - return NETDEV_TX_OK; - } - } - - dev->stats.tx_dropped++; - kfree_skb(skb); - return NETDEV_TX_OK; -} -EXPORT_SYMBOL_GPL(rpl_vxlan_xmit); - -/* Walk the forwarding table and purge stale entries */ -#ifdef HAVE_INIT_TIMER_DEFERRABLE -static void vxlan_cleanup(unsigned long arg) -{ - struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; -#else -static void vxlan_cleanup(struct timer_list *t) -{ - struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); -#endif - unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; - unsigned int h; - - if (!netif_running(vxlan->dev)) - return; - - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; - - spin_lock_bh(&vxlan->hash_lock); - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - unsigned long timeout; - - if (f->state & NUD_PERMANENT) - continue; - - timeout = f->used + vxlan->cfg.age_interval * HZ; - if (time_before_eq(timeout, jiffies)) { - netdev_dbg(vxlan->dev, - "garbage collect %pM\n", - f->eth_addr); - f->state = NUD_STALE; - vxlan_fdb_destroy(vxlan, f); - } else if (time_before(timeout, next_timer)) - next_timer = timeout; - } - spin_unlock_bh(&vxlan->hash_lock); - } - - mod_timer(&vxlan->age_timer, next_timer); -} - -static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) -{ - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - __be32 vni = vxlan->default_dst.remote_vni; - - spin_lock(&vn->sock_lock); - hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); - spin_unlock(&vn->sock_lock); -} - -/* Setup stats when device is created */ -static int vxlan_init(struct net_device *dev) -{ - dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!dev->tstats) - return -ENOMEM; - - return 0; -} - -static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) -{ -} - -static void vxlan_uninit(struct net_device *dev) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - - vxlan_fdb_delete_default(vxlan); - - free_percpu(dev->tstats); -} - -/* Start ageing timer and join group when device is brought up */ -static int vxlan_open(struct net_device *dev) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - int ret; - - ret = vxlan_sock_add(vxlan); - if (ret < 0) - return ret; - - if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { - ret = vxlan_igmp_join(vxlan); - if (ret == -EADDRINUSE) - ret = 0; - if (ret) { - vxlan_sock_release(vxlan); - return ret; - } - } - - if (vxlan->cfg.age_interval) - mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); - - return ret; -} - -/* Purge the forwarding table */ -static void vxlan_flush(struct vxlan_dev *vxlan) -{ - unsigned int h; - - spin_lock_bh(&vxlan->hash_lock); - for (h = 0; h < FDB_HASH_SIZE; ++h) { - struct hlist_node *p, *n; - hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { - struct vxlan_fdb *f - = container_of(p, struct vxlan_fdb, hlist); - /* the all_zeros_mac entry is deleted at vxlan_uninit */ - if (!is_zero_ether_addr(f->eth_addr)) - vxlan_fdb_destroy(vxlan, f); - } - } - spin_unlock_bh(&vxlan->hash_lock); -} - -/* Cleanup timer and forwarding table on shutdown */ -static int vxlan_stop(struct net_device *dev) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - int ret = 0; - - if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && - !vxlan_group_used(vn, vxlan)) - ret = vxlan_igmp_leave(vxlan); - - del_timer_sync(&vxlan->age_timer); - - vxlan_flush(vxlan); - vxlan_sock_release(vxlan); - - return ret; -} - -/* Stub, nothing needs to be done. */ -static void vxlan_set_multicast_list(struct net_device *dev) -{ -} - -static int __vxlan_change_mtu(struct net_device *dev, - struct net_device *lowerdev, - struct vxlan_rdst *dst, int new_mtu, bool strict) -{ - int max_mtu = IP_MAX_MTU; - - if (lowerdev) - max_mtu = lowerdev->mtu; - - if (dst->remote_ip.sa.sa_family == AF_INET6) - max_mtu -= VXLAN6_HEADROOM; - else - max_mtu -= VXLAN_HEADROOM; - - if (new_mtu < 68) - return -EINVAL; - - if (new_mtu > max_mtu) { - if (strict) - return -EINVAL; - - new_mtu = max_mtu; - } - - dev->mtu = new_mtu; - return 0; -} - -static int vxlan_change_mtu(struct net_device *dev, int new_mtu) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_rdst *dst = &vxlan->default_dst; - struct net_device *lowerdev = __dev_get_by_index(vxlan->net, - dst->remote_ifindex); - return __vxlan_change_mtu(dev, lowerdev, dst, new_mtu, true); -} - -int ovs_vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - struct ip_tunnel_info *info = skb_tunnel_info(skb); - __be16 sport, dport; - - sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, - vxlan->cfg.port_max, true); - dport = info->key.tp_dst ? : vxlan->cfg.dst_port; - - if (ip_tunnel_info_af(info) == AF_INET) { - struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); - struct rtable *rt; - - if (!sock4) - return -EINVAL; - rt = vxlan_get_route(vxlan, skb, 0, info->key.tos, - info->key.u.ipv4.dst, - &info->key.u.ipv4.src, - dport, sport, NULL, info); - if (IS_ERR(rt)) - return PTR_ERR(rt); - ip_rt_put(rt); - } else { -#if IS_ENABLED(CONFIG_IPV6) - struct dst_entry *ndst; - - ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos, - info->key.label, &info->key.u.ipv6.dst, - &info->key.u.ipv6.src, - dport, sport, NULL, info); - if (IS_ERR(ndst)) - return PTR_ERR(ndst); - dst_release(ndst); -#else /* !CONFIG_IPV6 */ - return -EPFNOSUPPORT; -#endif - } - info->key.tp_src = sport; - info->key.tp_dst = dport; - return 0; -} -EXPORT_SYMBOL_GPL(ovs_vxlan_fill_metadata_dst); - -static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev) -{ - /* Drop All packets coming from networking stack. OVS-CB is - * not initialized for these packets. - */ - - dev_kfree_skb(skb); - dev->stats.tx_dropped++; - return NETDEV_TX_OK; -} - -static const struct net_device_ops vxlan_netdev_ether_ops = { - .ndo_init = vxlan_init, - .ndo_uninit = vxlan_uninit, - .ndo_open = vxlan_open, - .ndo_stop = vxlan_stop, - .ndo_start_xmit = vxlan_dev_xmit, - .ndo_get_stats64 = ip_tunnel_get_stats64, - .ndo_set_rx_mode = vxlan_set_multicast_list, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = vxlan_change_mtu, -#else - .ndo_change_mtu = vxlan_change_mtu, -#endif - .ndo_validate_addr = eth_validate_addr, - .ndo_set_mac_address = eth_mac_addr, -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = ovs_vxlan_fill_metadata_dst, -#endif -}; - -static const struct net_device_ops vxlan_netdev_raw_ops = { - .ndo_init = vxlan_init, - .ndo_uninit = vxlan_uninit, - .ndo_open = vxlan_open, - .ndo_stop = vxlan_stop, - .ndo_start_xmit = vxlan_dev_xmit, - .ndo_get_stats64 = ip_tunnel_get_stats64, -#ifdef HAVE_RHEL7_MAX_MTU - .ndo_size = sizeof(struct net_device_ops), - .extended.ndo_change_mtu = vxlan_change_mtu, -#else - .ndo_change_mtu = vxlan_change_mtu, -#endif -#ifdef HAVE_NDO_FILL_METADATA_DST - .ndo_fill_metadata_dst = ovs_vxlan_fill_metadata_dst, -#endif -}; - -/* Info for udev, that this is a virtual tunnel endpoint */ -static struct device_type vxlan_type = { - .name = "vxlan", -}; - -/* Calls the ndo_add_vxlan_port or ndo_udp_tunnel_add of the caller - * in order to supply the listening VXLAN udp ports. Callers are - * expected to implement the ndo_add_vxlan_port. - */ -static void vxlan_push_rx_ports(struct net_device *dev) -{ -#ifdef HAVE_NDO_ADD_VXLAN_PORT - struct vxlan_sock *vs; - struct net *net = dev_net(dev); - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - sa_family_t sa_family; - __be16 port; - unsigned int i; - - if (!dev->netdev_ops->ndo_add_vxlan_port) - return; - - spin_lock(&vn->sock_lock); - for (i = 0; i < PORT_HASH_SIZE; ++i) { - hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { - port = inet_sk(vs->sock->sk)->inet_sport; - sa_family = vxlan_get_sk_family(vs); - dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, - port); - } - } - spin_unlock(&vn->sock_lock); -#elif defined(HAVE_NDO_UDP_TUNNEL_ADD) - struct vxlan_sock *vs; - struct net *net = dev_net(dev); - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - unsigned int i; - - if (!dev->netdev_ops->ndo_udp_tunnel_add) - return; - - spin_lock(&vn->sock_lock); - for (i = 0; i < PORT_HASH_SIZE; ++i) { - hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { - struct udp_tunnel_info ti; - if (vs->flags & VXLAN_F_GPE) - ti.type = UDP_TUNNEL_TYPE_VXLAN_GPE; - else - ti.type = UDP_TUNNEL_TYPE_VXLAN; - ti.port = inet_sk(vs->sock->sk)->inet_sport; - ti.sa_family = vxlan_get_sk_family(vs); - - dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); - } - } - spin_unlock(&vn->sock_lock); -#endif -} - -/* Initialize the device structure. */ -static void vxlan_setup(struct net_device *dev) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - unsigned int h; - - eth_hw_addr_random(dev); - ether_setup(dev); - -#ifndef HAVE_NEEDS_FREE_NETDEV - dev->destructor = free_netdev; -#else - dev->needs_free_netdev = true; -#endif - SET_NETDEV_DEVTYPE(dev, &vxlan_type); - - dev->features |= NETIF_F_LLTX; - dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; - dev->features |= NETIF_F_RXCSUM; - dev->features |= NETIF_F_GSO_SOFTWARE; - - dev->vlan_features = dev->features; - dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; - dev->hw_features |= NETIF_F_GSO_SOFTWARE; - dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; -#if 0 - netif_keep_dst(dev); -#endif - dev->priv_flags |= IFF_NO_QUEUE; - - INIT_LIST_HEAD(&vxlan->next); - spin_lock_init(&vxlan->hash_lock); - -#ifdef HAVE_INIT_TIMER_DEFERRABLE - init_timer_deferrable(&vxlan->age_timer); - vxlan->age_timer.function = vxlan_cleanup; - vxlan->age_timer.data = (unsigned long) vxlan; -#else - timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); -#endif - - vxlan->cfg.dst_port = htons(vxlan_port); - - vxlan->dev = dev; - - for (h = 0; h < FDB_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vxlan->fdb_head[h]); -} - -static void vxlan_ether_setup(struct net_device *dev) -{ - dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - dev->netdev_ops = &vxlan_netdev_ether_ops; -} - -static void vxlan_raw_setup(struct net_device *dev) -{ - dev->header_ops = NULL; - dev->type = ARPHRD_NONE; - dev->hard_header_len = 0; - dev->addr_len = 0; - dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; - dev->netdev_ops = &vxlan_netdev_raw_ops; -} - -static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { - [IFLA_VXLAN_ID] = { .type = NLA_U32 }, - [IFLA_VXLAN_GROUP] = { .len = sizeof_field(struct iphdr, daddr) }, - [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, - [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, - [IFLA_VXLAN_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, - [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, - [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, - [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, - [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, - [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, - [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, - [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, - [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, - [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, - [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, - [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, - [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, - [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, - [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, - [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, - [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, - [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, - [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, - [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, - [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, - [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, - [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, -}; - -#ifdef HAVE_RTNLOP_VALIDATE_WITH_EXTACK -static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - if (tb[IFLA_ADDRESS]) { - if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { - pr_debug("invalid link address (not ethernet)\n"); - return -EINVAL; - } - - if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { - pr_debug("invalid all zero ethernet address\n"); - return -EADDRNOTAVAIL; - } - } - - if (!data) - return -EINVAL; - - if (data[IFLA_VXLAN_ID]) { - __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); - if (id >= VXLAN_VID_MASK) - return -ERANGE; - } - - if (data[IFLA_VXLAN_PORT_RANGE]) { - const struct ifla_vxlan_port_range *p - = nla_data(data[IFLA_VXLAN_PORT_RANGE]); - - if (ntohs(p->high) < ntohs(p->low)) { - pr_debug("port range %u .. %u not valid\n", - ntohs(p->low), ntohs(p->high)); - return -EINVAL; - } - } - - return 0; -} - -static void vxlan_get_drvinfo(struct net_device *netdev, - struct ethtool_drvinfo *drvinfo) -{ - strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); - strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); -} - -static const struct ethtool_ops vxlan_ethtool_ops = { - .get_drvinfo = vxlan_get_drvinfo, - .get_link = ethtool_op_get_link, -}; - -static struct socket *vxlan_create_sock(struct net *net, bool ipv6, - __be16 port, u32 flags) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - udp_conf.use_udp6_rx_checksums = - !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); - udp_conf.ipv6_v6only = 1; - } else { - udp_conf.family = AF_INET; - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, - __be16 port, u32 flags) -{ - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_sock *vs; - struct socket *sock; - unsigned int h; - struct udp_tunnel_sock_cfg tunnel_cfg; - - vs = kzalloc(sizeof(*vs), GFP_KERNEL); - if (!vs) - return ERR_PTR(-ENOMEM); - - for (h = 0; h < VNI_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vs->vni_list[h]); - - sock = vxlan_create_sock(net, ipv6, port, flags); - if (IS_ERR(sock)) { - kfree(vs); - return ERR_CAST(sock); - } - - vs->sock = sock; - atomic_set(&vs->refcnt, 1); - vs->flags = (flags & VXLAN_F_RCV_FLAGS); - -#ifdef HAVE_UDP_OFFLOAD - vs->udp_offloads.port = port; - vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive; - vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete; -#endif - - spin_lock(&vn->sock_lock); - hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); - vxlan_notify_add_rx_port(vs); - spin_unlock(&vn->sock_lock); - - /* Mark socket as an encapsulation socket. */ - memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); - tunnel_cfg.sk_user_data = vs; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = vxlan_rcv; - tunnel_cfg.encap_destroy = NULL; -#ifdef HAVE_UDP_TUNNEL_SOCK_CFG_GRO_RECEIVE - tunnel_cfg.gro_receive = vxlan_gro_receive; - tunnel_cfg.gro_complete = vxlan_gro_complete; -#endif - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - return vs; -} - -static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) -{ - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - struct vxlan_sock *vs = NULL; - - if (!vxlan->cfg.no_share) { - spin_lock(&vn->sock_lock); - vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, - vxlan->cfg.dst_port, vxlan->flags); - if (vs && !atomic_add_unless(&vs->refcnt, 1, 0)) { - spin_unlock(&vn->sock_lock); - return -EBUSY; - } - spin_unlock(&vn->sock_lock); - } - if (!vs) - vs = vxlan_socket_create(vxlan->net, ipv6, - vxlan->cfg.dst_port, vxlan->flags); - if (IS_ERR(vs)) - return PTR_ERR(vs); -#if IS_ENABLED(CONFIG_IPV6) - if (ipv6) - rcu_assign_pointer(vxlan->vn6_sock, vs); - else -#endif - rcu_assign_pointer(vxlan->vn4_sock, vs); - vxlan_vs_add_dev(vs, vxlan); - return 0; -} - -static int vxlan_sock_add(struct vxlan_dev *vxlan) -{ - bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA; - bool ipv6 = vxlan->flags & VXLAN_F_IPV6 || metadata; - bool ipv4 = !ipv6 || metadata; - int ret = 0; - - RCU_INIT_POINTER(vxlan->vn4_sock, NULL); -#if IS_ENABLED(CONFIG_IPV6) - RCU_INIT_POINTER(vxlan->vn6_sock, NULL); - if (ipv6) { - ret = __vxlan_sock_add(vxlan, true); - if (ret < 0 && ret != -EAFNOSUPPORT) - ipv4 = false; - } -#endif - if (ipv4) - ret = __vxlan_sock_add(vxlan, false); - if (ret < 0) - vxlan_sock_release(vxlan); - return ret; -} - -static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, - struct vxlan_config *conf) -{ - struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); - struct vxlan_dev *vxlan = netdev_priv(dev), *tmp; - struct vxlan_rdst *dst = &vxlan->default_dst; - unsigned short needed_headroom = ETH_HLEN; - int err; - bool use_ipv6 = false; - __be16 default_port = vxlan->cfg.dst_port; - struct net_device *lowerdev = NULL; - - if (conf->flags & VXLAN_F_GPE) { - if (conf->flags & ~VXLAN_F_ALLOWED_GPE) - return -EINVAL; - /* For now, allow GPE only together with COLLECT_METADATA. - * This can be relaxed later; in such case, the other side - * of the PtP link will have to be provided. - */ - if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) - return -EINVAL; - - vxlan_raw_setup(dev); - } else { - vxlan_ether_setup(dev); - } - - vxlan->net = src_net; - - dst->remote_vni = conf->vni; - - memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); - - /* Unless IPv6 is explicitly requested, assume IPv4 */ - if (!dst->remote_ip.sa.sa_family) - dst->remote_ip.sa.sa_family = AF_INET; - - if (dst->remote_ip.sa.sa_family == AF_INET6 || - vxlan->cfg.saddr.sa.sa_family == AF_INET6) { - if (!IS_ENABLED(CONFIG_IPV6)) - return -EPFNOSUPPORT; - use_ipv6 = true; - vxlan->flags |= VXLAN_F_IPV6; - } - - if (conf->label && !use_ipv6) { - pr_info("label only supported in use with IPv6\n"); - return -EINVAL; - } - - if (conf->remote_ifindex) { - lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); - dst->remote_ifindex = conf->remote_ifindex; - - if (!lowerdev) { - pr_info("ifindex %d does not exist\n", dst->remote_ifindex); - return -ENODEV; - } - -#if IS_ENABLED(CONFIG_IPV6) - if (use_ipv6) { - struct inet6_dev *idev = __in6_dev_get(lowerdev); - if (idev && idev->cnf.disable_ipv6) { - pr_info("IPv6 is disabled via sysctl\n"); - return -EPERM; - } - } -#endif - - if (!conf->mtu) - dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); - - needed_headroom = lowerdev->hard_header_len; - } - - if (conf->mtu) { - err = __vxlan_change_mtu(dev, lowerdev, dst, conf->mtu, false); - if (err) - return err; - } - - if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) - needed_headroom += VXLAN6_HEADROOM; - else - needed_headroom += VXLAN_HEADROOM; - dev->needed_headroom = needed_headroom; - - memcpy(&vxlan->cfg, conf, sizeof(*conf)); - if (!vxlan->cfg.dst_port) { - if (conf->flags & VXLAN_F_GPE) - vxlan->cfg.dst_port = 4790; /* IANA assigned VXLAN-GPE port */ - else - vxlan->cfg.dst_port = default_port; - } - vxlan->flags |= conf->flags; - - if (!vxlan->cfg.age_interval) - vxlan->cfg.age_interval = FDB_AGE_DEFAULT; - - list_for_each_entry(tmp, &vn->vxlan_list, next) { - if (tmp->cfg.vni == conf->vni && - (tmp->default_dst.remote_ip.sa.sa_family == AF_INET6 || - tmp->cfg.saddr.sa.sa_family == AF_INET6) == use_ipv6 && - tmp->cfg.dst_port == vxlan->cfg.dst_port && - (tmp->flags & VXLAN_F_RCV_FLAGS) == - (vxlan->flags & VXLAN_F_RCV_FLAGS)) - return -EEXIST; - } - - dev->ethtool_ops = &vxlan_ethtool_ops; - - /* create an fdb entry for a valid default destination */ - if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { - err = vxlan_fdb_create(vxlan, all_zeros_mac, - &vxlan->default_dst.remote_ip, - NUD_REACHABLE|NUD_PERMANENT, - NLM_F_EXCL|NLM_F_CREATE, - vxlan->cfg.dst_port, - vxlan->default_dst.remote_vni, - vxlan->default_dst.remote_ifindex, - NTF_SELF); - if (err) - return err; - } - - err = register_netdevice(dev); - if (err) { - vxlan_fdb_delete_default(vxlan); - return err; - } - - list_add(&vxlan->next, &vn->vxlan_list); - - return 0; -} - -#ifdef HAVE_EXT_ACK_IN_RTNL_LINKOPS -static int vxlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[], - struct netlink_ext_ack *extack) -#else -static int vxlan_newlink(struct net *src_net, struct net_device *dev, - struct nlattr *tb[], struct nlattr *data[]) -#endif -{ - pr_info("unsupported operation\n"); - return -EINVAL; -} - -static void vxlan_dellink(struct net_device *dev, struct list_head *head) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); - - spin_lock(&vn->sock_lock); - if (!hlist_unhashed(&vxlan->hlist)) - hlist_del_rcu(&vxlan->hlist); - spin_unlock(&vn->sock_lock); - - list_del(&vxlan->next); - unregister_netdevice_queue(dev, head); -} - -static size_t vxlan_get_size(const struct net_device *dev) -{ - - return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ - nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ - nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ - nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ - nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ - nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ - nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ - nla_total_size(sizeof(struct ifla_vxlan_port_range)) + - nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ - 0; -} - -static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) -{ - const struct vxlan_dev *vxlan = netdev_priv(dev); - const struct vxlan_rdst *dst = &vxlan->default_dst; - struct ifla_vxlan_port_range ports = { - .low = htons(vxlan->cfg.port_min), - .high = htons(vxlan->cfg.port_max), - }; - - if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) - goto nla_put_failure; - - if (!vxlan_addr_any(&dst->remote_ip)) { - if (dst->remote_ip.sa.sa_family == AF_INET) { - if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, - dst->remote_ip.sin.sin_addr.s_addr)) - goto nla_put_failure; -#if IS_ENABLED(CONFIG_IPV6) - } else { - if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, - &dst->remote_ip.sin6.sin6_addr)) - goto nla_put_failure; -#endif - } - } - - if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) - goto nla_put_failure; - - if (!vxlan_addr_any(&vxlan->cfg.saddr)) { - if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { - if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, - vxlan->cfg.saddr.sin.sin_addr.s_addr)) - goto nla_put_failure; -#if IS_ENABLED(CONFIG_IPV6) - } else { - if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, - &vxlan->cfg.saddr.sin6.sin6_addr)) - goto nla_put_failure; -#endif - } - } - - if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || - nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || - nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || - nla_put_u8(skb, IFLA_VXLAN_LEARNING, - !!(vxlan->flags & VXLAN_F_LEARN)) || - nla_put_u8(skb, IFLA_VXLAN_PROXY, - !!(vxlan->flags & VXLAN_F_PROXY)) || - nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) || - nla_put_u8(skb, IFLA_VXLAN_L2MISS, - !!(vxlan->flags & VXLAN_F_L2MISS)) || - nla_put_u8(skb, IFLA_VXLAN_L3MISS, - !!(vxlan->flags & VXLAN_F_L3MISS)) || - nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, - !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) || - nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || - nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || - nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || - nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, - !(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || - nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, - !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || - nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, - !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || - nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, - !!(vxlan->flags & VXLAN_F_REMCSUM_TX)) || - nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, - !!(vxlan->flags & VXLAN_F_REMCSUM_RX))) - goto nla_put_failure; - - if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) - goto nla_put_failure; - - if (vxlan->flags & VXLAN_F_GBP && - nla_put_flag(skb, IFLA_VXLAN_GBP)) - goto nla_put_failure; - - if (vxlan->flags & VXLAN_F_GPE && - nla_put_flag(skb, IFLA_VXLAN_GPE)) - goto nla_put_failure; - - if (vxlan->flags & VXLAN_F_REMCSUM_NOPARTIAL && - nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) - goto nla_put_failure; - - return 0; - -nla_put_failure: - return -EMSGSIZE; -} - -#ifdef HAVE_GET_LINK_NET -static struct net *vxlan_get_link_net(const struct net_device *dev) -{ - struct vxlan_dev *vxlan = netdev_priv(dev); - - return vxlan->net; -} -#endif - -static struct rtnl_link_ops vxlan_link_ops __read_mostly = { - .kind = "ovs_vxlan", - .maxtype = IFLA_VXLAN_MAX, - .policy = vxlan_policy, - .priv_size = sizeof(struct vxlan_dev), - .setup = vxlan_setup, - .validate = vxlan_validate, - .newlink = vxlan_newlink, - .dellink = vxlan_dellink, - .get_size = vxlan_get_size, - .fill_info = vxlan_fill_info, -#ifdef HAVE_GET_LINK_NET - .get_link_net = vxlan_get_link_net, -#endif -}; - -struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name, - u8 name_assign_type, - struct vxlan_config *conf) -{ - struct nlattr *tb[IFLA_MAX + 1]; - struct net_device *dev; - int err; - - memset(&tb, 0, sizeof(tb)); - - dev = rtnl_create_link(net, name, name_assign_type, - &vxlan_link_ops, tb); - if (IS_ERR(dev)) - return dev; - - err = vxlan_dev_configure(net, dev, conf); - if (err < 0) { - free_netdev(dev); - return ERR_PTR(err); - } - - err = rtnl_configure_link(dev, NULL); - if (err < 0) { - LIST_HEAD(list_kill); - - vxlan_dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return ERR_PTR(err); - } - - return dev; -} -EXPORT_SYMBOL_GPL(rpl_vxlan_dev_create); - -static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, - struct net_device *dev) -{ - struct vxlan_dev *vxlan, *next; - LIST_HEAD(list_kill); - - list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { - struct vxlan_rdst *dst = &vxlan->default_dst; - - /* In case we created vxlan device with carrier - * and we loose the carrier due to module unload - * we also need to remove vxlan device. In other - * cases, it's not necessary and remote_ifindex - * is 0 here, so no matches. - */ - if (dst->remote_ifindex == dev->ifindex) - vxlan_dellink(vxlan->dev, &list_kill); - } - - unregister_netdevice_many(&list_kill); -} - -static int vxlan_netdevice_event(struct notifier_block *unused, - unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); - - if (event == NETDEV_UNREGISTER) - vxlan_handle_lowerdev_unregister(vn, dev); - else if (event == NETDEV_OFFLOAD_PUSH_VXLAN) - vxlan_push_rx_ports(dev); - - return NOTIFY_DONE; -} - -static struct notifier_block vxlan_notifier_block __read_mostly = { - .notifier_call = vxlan_netdevice_event, -}; - -static __net_init int vxlan_init_net(struct net *net) -{ - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - unsigned int h; - - INIT_LIST_HEAD(&vn->vxlan_list); - spin_lock_init(&vn->sock_lock); - - for (h = 0; h < PORT_HASH_SIZE; ++h) - INIT_HLIST_HEAD(&vn->sock_list[h]); - - return 0; -} - -static void __net_exit vxlan_exit_net(struct net *net) -{ - struct vxlan_net *vn = net_generic(net, vxlan_net_id); - struct vxlan_dev *vxlan, *next; - struct net_device *dev, *aux; - LIST_HEAD(list); - - rtnl_lock(); - for_each_netdev_safe(net, dev, aux) - if (dev->rtnl_link_ops == &vxlan_link_ops) - unregister_netdevice_queue(dev, &list); - - list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { - /* If vxlan->dev is in the same netns, it has already been added - * to the list by the previous loop. - */ - if (!net_eq(dev_net(vxlan->dev), net)) { - unregister_netdevice_queue(vxlan->dev, &list); - } - } - - unregister_netdevice_many(&list); - rtnl_unlock(); -} - -static struct pernet_operations vxlan_net_ops = { - .init = vxlan_init_net, - .exit = vxlan_exit_net, - .id = &vxlan_net_id, - .size = sizeof(struct vxlan_net), -}; - -int rpl_vxlan_init_module(void) -{ - int rc; - - get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); - - rc = register_pernet_subsys(&vxlan_net_ops); - if (rc) - goto out1; - - rc = register_netdevice_notifier(&vxlan_notifier_block); - if (rc) - goto out2; - - rc = rtnl_link_register(&vxlan_link_ops); - if (rc) - goto out3; - - pr_info("VxLAN tunneling driver\n"); - return 0; -out3: - unregister_netdevice_notifier(&vxlan_notifier_block); -out2: - unregister_pernet_subsys(&vxlan_net_ops); -out1: - pr_err("Error while initializing VxLAN %d\n", rc); - return rc; -} - -void rpl_vxlan_cleanup_module(void) -{ - rtnl_link_unregister(&vxlan_link_ops); - unregister_netdevice_notifier(&vxlan_notifier_block); - unregister_pernet_subsys(&vxlan_net_ops); - /* rcu_barrier() is called by netns */ -} -#endif diff --git a/datapath/meter.c b/datapath/meter.c deleted file mode 100644 index 92c9c3671..000000000 --- a/datapath/meter.c +++ /dev/null @@ -1,639 +0,0 @@ -/* - * Copyright (c) 2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/if.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/kernel.h> -#include <linux/openvswitch.h> -#include <linux/overflow.h> -#include <linux/netlink.h> -#include <linux/rculist.h> - -#include <net/netlink.h> -#include <net/genetlink.h> -#include <linux/mm.h> - -#include "datapath.h" -#include "meter.h" - -#define METER_HASH_BUCKETS 1024 - -static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { - [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, - [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, - [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, - [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, - [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, - [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, - [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, - [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, -}; - -static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { - [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, - [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, - [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, - [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, -}; - -static void ovs_meter_free(struct dp_meter *meter) -{ - if (!meter) - return; - - kfree_rcu(meter, rcu); -} - -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - -/* Call with ovs_mutex or RCU read lock. */ -static struct dp_meter *lookup_meter(const struct datapath *dp, - u32 meter_id) -{ - struct dp_meter *meter; - struct hlist_head *head; - - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node) { - if (meter->id == meter_id) - return meter; - } - return NULL; -} - -static void attach_meter(struct datapath *dp, struct dp_meter *meter) -{ - struct hlist_head *head = meter_hash_bucket(dp, meter->id); - - hlist_add_head_rcu(&meter->dp_hash_node, head); -} - -static void detach_meter(struct dp_meter *meter) -{ - ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); -} - -static struct sk_buff * -ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, - struct ovs_header **ovs_reply_header) -{ - struct sk_buff *skb; - struct ovs_header *ovs_header = info->userhdr; - - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); - if (!skb) - return ERR_PTR(-ENOMEM); - - *ovs_reply_header = genlmsg_put(skb, info->snd_portid, - info->snd_seq, - &dp_meter_genl_family, 0, cmd); - if (!*ovs_reply_header) { - nlmsg_free(skb); - return ERR_PTR(-EMSGSIZE); - } - (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; - - return skb; -} - -static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, - struct dp_meter *meter) -{ - struct nlattr *nla; - struct dp_meter_band *band; - u16 i; - - if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) - goto error; - - if (!meter) - return 0; - - if (nla_put(reply, OVS_METER_ATTR_STATS, - sizeof(struct ovs_flow_stats), &meter->stats) || - nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, - OVS_METER_ATTR_PAD)) - goto error; - - nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); - if (!nla) - goto error; - - band = meter->bands; - - for (i = 0; i < meter->n_bands; ++i, ++band) { - struct nlattr *band_nla; - - band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); - if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, - sizeof(struct ovs_flow_stats), - &band->stats)) - goto error; - nla_nest_end(reply, band_nla); - } - nla_nest_end(reply, nla); - - return 0; -error: - return -EMSGSIZE; -} - -static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) -{ - struct sk_buff *reply; - struct ovs_header *ovs_reply_header; - struct nlattr *nla, *band_nla; - int err; - - reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, - &ovs_reply_header); - if (IS_ERR(reply)) - return PTR_ERR(reply); - - if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || - nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) - goto nla_put_failure; - - nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); - if (!nla) - goto nla_put_failure; - - band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); - if (!band_nla) - goto nla_put_failure; - /* Currently only DROP band type is supported. */ - if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) - goto nla_put_failure; - nla_nest_end(reply, band_nla); - nla_nest_end(reply, nla); - - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -nla_put_failure: - nlmsg_free(reply); - err = -EMSGSIZE; - return err; -} - -#ifndef HAVE_KTIME_GET_NS -#ifndef ktime_to_ns -#define ktime_to_ns(kt) ((kt).tv64) -#endif -static inline u64 ktime_get_ns(void) -{ - return ktime_to_ns(ktime_get()); -} -#endif - -static struct dp_meter *dp_meter_create(struct nlattr **a) -{ - struct nlattr *nla; - int rem; - u16 n_bands = 0; - struct dp_meter *meter; - struct dp_meter_band *band; - int err; - - /* Validate attributes, count the bands. */ - if (!a[OVS_METER_ATTR_BANDS]) - return ERR_PTR(-EINVAL); - - nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) - if (++n_bands > DP_MAX_BANDS) - return ERR_PTR(-EINVAL); - - /* Allocate and set up the meter before locking anything. */ - meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL); - if (!meter) - return ERR_PTR(-ENOMEM); - - meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]); - meter->used = div_u64(ktime_get_ns(), 1000 * 1000); - meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; - meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; - spin_lock_init(&meter->lock); - if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { - meter->stats = *(struct ovs_flow_stats *) - nla_data(a[OVS_METER_ATTR_STATS]); - } - meter->n_bands = n_bands; - - /* Set up meter bands. */ - band = meter->bands; - nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { - struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; - u32 band_max_delta_t; - - err = nla_parse_deprecated_strict((struct nlattr **)&attr, - OVS_BAND_ATTR_MAX, - nla_data(nla), - nla_len(nla), - band_policy, NULL); - if (err) - goto exit_free_meter; - - if (!attr[OVS_BAND_ATTR_TYPE] || - !attr[OVS_BAND_ATTR_RATE] || - !attr[OVS_BAND_ATTR_BURST]) { - err = -EINVAL; - goto exit_free_meter; - } - - band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); - band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); - if (band->rate == 0) { - err = -EINVAL; - goto exit_free_meter; - } - - band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); - /* Figure out max delta_t that is enough to fill any bucket. - * Keep max_delta_t size to the bucket units: - * pkts => 1/1000 packets, kilobits => bits. - * - * Start with a full bucket. - */ - band->bucket = (band->burst_size + band->rate) * 1000; - band_max_delta_t = band->bucket / band->rate; - if (band_max_delta_t > meter->max_delta_t) - meter->max_delta_t = band_max_delta_t; - band++; - } - - return meter; - -exit_free_meter: - kfree(meter); - return ERR_PTR(err); -} - -static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - struct dp_meter *meter, *old_meter; - struct sk_buff *reply; - struct ovs_header *ovs_reply_header; - struct ovs_header *ovs_header = info->userhdr; - struct datapath *dp; - int err; - u32 meter_id; - bool failed; - - if (!a[OVS_METER_ATTR_ID]) { - return -ENODEV; - } - - meter = dp_meter_create(a); - if (IS_ERR_OR_NULL(meter)) - return PTR_ERR(meter); - - reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, - &ovs_reply_header); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - goto exit_free_meter; - } - - ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { - err = -ENODEV; - goto exit_unlock; - } - - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - - /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); - ovs_unlock(); - - /* Build response with the meter_id and stats from - * the old meter, if any. - */ - failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); - WARN_ON(failed); - if (old_meter) { - spin_lock_bh(&old_meter->lock); - if (old_meter->keep_stats) { - err = ovs_meter_cmd_reply_stats(reply, meter_id, - old_meter); - WARN_ON(err); - } - spin_unlock_bh(&old_meter->lock); - ovs_meter_free(old_meter); - } - - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -exit_unlock: - ovs_unlock(); - nlmsg_free(reply); -exit_free_meter: - kfree(meter); - return err; -} - -static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - u32 meter_id; - struct ovs_header *ovs_header = info->userhdr; - struct ovs_header *ovs_reply_header; - struct datapath *dp; - int err; - struct sk_buff *reply; - struct dp_meter *meter; - - if (!a[OVS_METER_ATTR_ID]) - return -EINVAL; - - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - - reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, - &ovs_reply_header); - if (IS_ERR(reply)) - return PTR_ERR(reply); - - ovs_lock(); - - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { - err = -ENODEV; - goto exit_unlock; - } - - /* Locate meter, copy stats. */ - meter = lookup_meter(dp, meter_id); - if (!meter) { - err = -ENOENT; - goto exit_unlock; - } - - spin_lock_bh(&meter->lock); - err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); - spin_unlock_bh(&meter->lock); - if (err) - goto exit_unlock; - - ovs_unlock(); - - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -exit_unlock: - ovs_unlock(); - nlmsg_free(reply); - return err; -} - -static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) -{ - struct nlattr **a = info->attrs; - u32 meter_id; - struct ovs_header *ovs_header = info->userhdr; - struct ovs_header *ovs_reply_header; - struct datapath *dp; - int err; - struct sk_buff *reply; - struct dp_meter *old_meter; - - if (!a[OVS_METER_ATTR_ID]) - return -EINVAL; - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - - reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, - &ovs_reply_header); - if (IS_ERR(reply)) - return PTR_ERR(reply); - - ovs_lock(); - - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { - err = -ENODEV; - goto exit_unlock; - } - - old_meter = lookup_meter(dp, meter_id); - if (old_meter) { - spin_lock_bh(&old_meter->lock); - err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); - WARN_ON(err); - spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); - } - ovs_unlock(); - ovs_meter_free(old_meter); - genlmsg_end(reply, ovs_reply_header); - return genlmsg_reply(reply, info); - -exit_unlock: - ovs_unlock(); - nlmsg_free(reply); - return err; -} - -/* Meter action execution. - * - * Return true 'meter_id' drop band is triggered. The 'skb' should be - * dropped by the caller'. - */ -bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, u32 meter_id) -{ - struct dp_meter *meter; - struct dp_meter_band *band; - long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); - long long int long_delta_ms; - u32 delta_ms; - u32 cost; - int i, band_exceeded_max = -1; - u32 band_exceeded_rate = 0; - - meter = lookup_meter(dp, meter_id); - /* Do not drop the packet when there is no meter. */ - if (!meter) - return false; - - /* Lock the meter while using it. */ - spin_lock(&meter->lock); - - long_delta_ms = (now_ms - meter->used); /* ms */ - - /* Make sure delta_ms will not be too large, so that bucket will not - * wrap around below. - */ - delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) - ? meter->max_delta_t : (u32)long_delta_ms; - - /* Update meter statistics. - */ - meter->used = now_ms; - meter->stats.n_packets += 1; - meter->stats.n_bytes += skb->len; - - /* Bucket rate is either in kilobits per second, or in packets per - * second. We maintain the bucket in the units of either bits or - * 1/1000th of a packet, correspondingly. - * Then, when rate is multiplied with milliseconds, we get the - * bucket units: - * msec * kbps = bits, and - * msec * packets/sec = 1/1000 packets. - * - * 'cost' is the number of bucket units in this packet. - */ - cost = (meter->kbps) ? skb->len * 8 : 1000; - - /* Update all bands and find the one hit with the highest rate. */ - for (i = 0; i < meter->n_bands; ++i) { - long long int max_bucket_size; - - band = &meter->bands[i]; - max_bucket_size = (band->burst_size + band->rate) * 1000LL; - - band->bucket += delta_ms * band->rate; - if (band->bucket > max_bucket_size) - band->bucket = max_bucket_size; - - if (band->bucket >= cost) { - band->bucket -= cost; - } else if (band->rate > band_exceeded_rate) { - band_exceeded_rate = band->rate; - band_exceeded_max = i; - } - } - - if (band_exceeded_max >= 0) { - /* Update band statistics. */ - band = &meter->bands[band_exceeded_max]; - band->stats.n_packets += 1; - band->stats.n_bytes += skb->len; - - /* Drop band triggered, let the caller drop the 'skb'. */ - if (band->type == OVS_METER_BAND_TYPE_DROP) { - spin_unlock(&meter->lock); - return true; - } - } - - spin_unlock(&meter->lock); - return false; -} - -static struct genl_ops dp_meter_genl_ops[] = { - { .cmd = OVS_METER_CMD_FEATURES, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = 0, /* OK for unprivileged users. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = meter_policy, -#endif - .doit = ovs_meter_cmd_features - }, - { .cmd = OVS_METER_CMD_SET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. - */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = meter_policy, -#endif - .doit = ovs_meter_cmd_set, - }, - { .cmd = OVS_METER_CMD_GET, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = 0, /* OK for unprivileged users. */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = meter_policy, -#endif - .doit = ovs_meter_cmd_get, - }, - { .cmd = OVS_METER_CMD_DEL, -#ifdef HAVE_GENL_VALIDATE_FLAGS - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, -#endif - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. - */ -#ifdef HAVE_GENL_OPS_POLICY - .policy = meter_policy, -#endif - .doit = ovs_meter_cmd_del - }, -}; - -static const struct genl_multicast_group ovs_meter_multicast_group = { - .name = OVS_METER_MCGROUP, -}; - -struct genl_family dp_meter_genl_family __ro_after_init = { - .hdrsize = sizeof(struct ovs_header), - .name = OVS_METER_FAMILY, - .version = OVS_METER_VERSION, - .maxattr = OVS_METER_ATTR_MAX, -#ifndef HAVE_GENL_OPS_POLICY - .policy = meter_policy, -#endif - .netnsok = true, - .parallel_ops = true, - .ops = dp_meter_genl_ops, - .n_ops = ARRAY_SIZE(dp_meter_genl_ops), - .mcgrps = &ovs_meter_multicast_group, - .n_mcgrps = 1, - .module = THIS_MODULE, -}; - -int ovs_meters_init(struct datapath *dp) -{ - int i; - - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); - - if (!dp->meters) - return -ENOMEM; - - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); - - return 0; -} - -void ovs_meters_exit(struct datapath *dp) -{ - int i; - - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } - - kfree(dp->meters); -} diff --git a/datapath/meter.h b/datapath/meter.h deleted file mode 100644 index 964ace265..000000000 --- a/datapath/meter.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2017 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ - -#ifndef METER_H -#define METER_H 1 - -#include <linux/init.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/netlink.h> -#include <linux/openvswitch.h> -#include <linux/genetlink.h> -#include <linux/skbuff.h> - -#include "flow.h" -struct datapath; - -#define DP_MAX_BANDS 1 - -struct dp_meter_band { - u32 type; - u32 rate; - u32 burst_size; - u32 bucket; /* 1/1000 packets, or in bits */ - struct ovs_flow_stats stats; -}; - -struct dp_meter { - spinlock_t lock; /* Per meter lock */ - struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ - u32 id; - u16 kbps:1, keep_stats:1; - u16 n_bands; - u32 max_delta_t; - u64 used; - struct ovs_flow_stats stats; - struct dp_meter_band bands[]; -}; - -extern struct genl_family dp_meter_genl_family; -int ovs_meters_init(struct datapath *dp); -void ovs_meters_exit(struct datapath *dp); -bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, u32 meter_id); - -#endif /* meter.h */ diff --git a/datapath/nsh.c b/datapath/nsh.c deleted file mode 100644 index 9e583edbe..000000000 --- a/datapath/nsh.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Network Service Header - * - * Copyright (c) 2017 Red Hat, Inc. -- Jiri Benc <jbenc@redhat.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <net/nsh.h> -#include <net/tun_proto.h> - -int ovs_nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) -{ - struct nshhdr *nh; - size_t length = nsh_hdr_len(pushed_nh); - u8 next_proto; - - if (skb->mac_len) { - next_proto = TUN_P_ETHERNET; - } else { - next_proto = tun_p_from_eth_p(skb->protocol); - if (!next_proto) - return -EAFNOSUPPORT; - } - - /* Add the NSH header */ - if (skb_cow_head(skb, length) < 0) - return -ENOMEM; - - skb_push(skb, length); - nh = (struct nshhdr *)(skb->data); - memcpy(nh, pushed_nh, length); - nh->np = next_proto; - skb_postpush_rcsum(skb, nh, length); - - skb->protocol = htons(ETH_P_NSH); - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_mac_len(skb); - - return 0; -} -EXPORT_SYMBOL_GPL(ovs_nsh_push); - -int ovs_nsh_pop(struct sk_buff *skb) -{ - struct nshhdr *nh; - size_t length; - __be16 inner_proto; - - if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) - return -ENOMEM; - nh = (struct nshhdr *)(skb->data); - length = nsh_hdr_len(nh); - inner_proto = tun_p_to_eth_p(nh->np); - if (!pskb_may_pull(skb, length)) - return -ENOMEM; - - if (!inner_proto) - return -EAFNOSUPPORT; - - skb_pull_rcsum(skb, length); - skb_reset_mac_header(skb); - skb_reset_network_header(skb); - skb_reset_mac_len(skb); - skb->protocol = inner_proto; - - return 0; -} -EXPORT_SYMBOL_GPL(ovs_nsh_pop); - -static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, - netdev_features_t features) -{ - struct sk_buff *segs = ERR_PTR(-EINVAL); - unsigned int nsh_len, mac_len; - __be16 proto; - int nhoff; - - skb_reset_network_header(skb); - - nhoff = skb->network_header - skb->mac_header; - mac_len = skb->mac_len; - - if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) - goto out; - nsh_len = nsh_hdr_len(nsh_hdr(skb)); - if (unlikely(!pskb_may_pull(skb, nsh_len))) - goto out; - - proto = tun_p_to_eth_p(nsh_hdr(skb)->np); - if (!proto) - goto out; - - __skb_pull(skb, nsh_len); - - skb_reset_mac_header(skb); - skb_reset_mac_len(skb); - skb->protocol = proto; - - features &= NETIF_F_SG; - segs = skb_mac_gso_segment(skb, features); - if (IS_ERR_OR_NULL(segs)) { - skb_gso_error_unwind(skb, htons(ETH_P_NSH), nsh_len, - skb->network_header - nhoff, - mac_len); - goto out; - } - - for (skb = segs; skb; skb = skb->next) { - skb->protocol = htons(ETH_P_NSH); - __skb_push(skb, nsh_len); - skb_set_mac_header(skb, -nhoff); - skb->network_header = skb->mac_header + mac_len; - skb->mac_len = mac_len; - } - -out: - return segs; -} - -static struct packet_offload nsh_packet_offload __read_mostly = { - .type = htons(ETH_P_NSH), - .callbacks = { - .gso_segment = nsh_gso_segment, - }, -}; - -int ovs_nsh_init(void) -{ - dev_add_offload(&nsh_packet_offload); - return 0; -} - -void ovs_nsh_cleanup(void) -{ - dev_remove_offload(&nsh_packet_offload); -} diff --git a/datapath/vport-geneve.c b/datapath/vport-geneve.c deleted file mode 100644 index a5b91246f..000000000 --- a/datapath/vport-geneve.c +++ /dev/null @@ -1,147 +0,0 @@ -/* - * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/rculist.h> -#include <linux/udp.h> -#include <linux/if_vlan.h> -#include <linux/module.h> - -#include <net/geneve.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/udp.h> -#include <net/xfrm.h> - -#include "datapath.h" -#include "vport.h" -#include "vport-netdev.h" - -static struct vport_ops ovs_geneve_vport_ops; -/** - * struct geneve_port - Keeps track of open UDP ports - * @dst_port: destination port. - */ -struct geneve_port { - u16 dst_port; -}; - -static inline struct geneve_port *geneve_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -static int geneve_get_options(const struct vport *vport, - struct sk_buff *skb) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->dst_port)) - return -EMSGSIZE; - return 0; -} - -static struct vport *geneve_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct geneve_port *geneve_port; - struct net_device *dev; - struct vport *vport; - struct nlattr *a; - u16 dst_port; - int err; - - if (!options) { - err = -EINVAL; - goto error; - } - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(sizeof(struct geneve_port), - &ovs_geneve_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - geneve_port = geneve_vport(vport); - geneve_port->dst_port = dst_port; - - rtnl_lock(); - dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - goto error; - } - - rtnl_unlock(); - return vport; -error: - return ERR_PTR(err); -} - -static struct vport *geneve_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = geneve_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_geneve_vport_ops = { - .type = OVS_VPORT_TYPE_GENEVE, - .create = geneve_create, - .destroy = ovs_netdev_tunnel_destroy, - .get_options = geneve_get_options, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = geneve_fill_metadata_dst, -#endif - .send = geneve_xmit, -}; - -static int __init ovs_geneve_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_geneve_vport_ops); -} - -static void __exit ovs_geneve_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_geneve_vport_ops); -} - -module_init(ovs_geneve_tnl_init); -module_exit(ovs_geneve_tnl_exit); - -MODULE_DESCRIPTION("OVS: Geneve switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-5"); diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c deleted file mode 100644 index 07a8c19df..000000000 --- a/datapath/vport-gre.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/if.h> -#include <linux/skbuff.h> -#include <linux/ip.h> -#include <linux/if_tunnel.h> -#include <linux/if_vlan.h> -#include <linux/in.h> -#include <linux/in_route.h> -#include <linux/inetdevice.h> -#include <linux/jhash.h> -#include <linux/list.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/workqueue.h> -#include <linux/rculist.h> -#include <net/route.h> -#include <net/xfrm.h> - -#include <net/icmp.h> -#include <net/ip.h> -#include <net/ip_tunnels.h> -#include <net/gre.h> -#include <net/net_namespace.h> -#include <net/netns/generic.h> -#include <net/protocol.h> - -#include "datapath.h" -#include "vport.h" -#include "vport-netdev.h" - -static struct vport_ops ovs_gre_vport_ops; - -static struct vport *gre_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct net_device *dev; - struct vport *vport; - int err; - - vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - rtnl_lock(); - dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_PTR(err); - } - - rtnl_unlock(); - return vport; -} - -static struct vport *gre_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = gre_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_gre_vport_ops = { - .type = OVS_VPORT_TYPE_GRE, - .create = gre_create, - .send = gre_fb_xmit, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = gre_fill_metadata_dst, -#endif - .destroy = ovs_netdev_tunnel_destroy, -}; - -static int __init ovs_gre_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_gre_vport_ops); -} - -static void __exit ovs_gre_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_gre_vport_ops); -} - -module_init(ovs_gre_tnl_init); -module_exit(ovs_gre_tnl_exit); - -MODULE_DESCRIPTION("OVS: GRE switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-3"); diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c deleted file mode 100644 index dbc200231..000000000 --- a/datapath/vport-internal_dev.c +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/if_vlan.h> -#include <linux/kernel.h> -#include <linux/netdevice.h> -#include <linux/etherdevice.h> -#include <linux/ethtool.h> -#include <linux/skbuff.h> - -#include <net/dst.h> -#include <net/xfrm.h> -#include <net/rtnetlink.h> - -#include "datapath.h" -#include "vport-internal_dev.h" -#include "vport-netdev.h" - -struct internal_dev { - struct vport *vport; -}; - -static struct vport_ops ovs_internal_vport_ops; - -static struct internal_dev *internal_dev_priv(struct net_device *netdev) -{ - return netdev_priv(netdev); -} - -/* Called with rcu_read_lock_bh. */ -static netdev_tx_t -internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) -{ - int len, err; - - len = skb->len; - rcu_read_lock(); - err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); - rcu_read_unlock(); - - if (likely(!err)) { - struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); - - u64_stats_update_begin(&tstats->syncp); - tstats->tx_bytes += len; - tstats->tx_packets++; - u64_stats_update_end(&tstats->syncp); - } else { - netdev->stats.tx_errors++; - } - return NETDEV_TX_OK; -} - -static int internal_dev_open(struct net_device *netdev) -{ - netif_start_queue(netdev); - return 0; -} - -static int internal_dev_stop(struct net_device *netdev) -{ - netif_stop_queue(netdev); - return 0; -} - -static void internal_dev_getinfo(struct net_device *netdev, - struct ethtool_drvinfo *info) -{ - strlcpy(info->driver, "openvswitch", sizeof(info->driver)); -} - -static const struct ethtool_ops internal_dev_ethtool_ops = { - .get_drvinfo = internal_dev_getinfo, - .get_link = ethtool_op_get_link, -}; - -#if !defined(HAVE_NET_DEVICE_WITH_MAX_MTU) && !defined(HAVE_RHEL7_MAX_MTU) -static int internal_dev_change_mtu(struct net_device *dev, int new_mtu) -{ - if (new_mtu < ETH_MIN_MTU) { - net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n", - dev->name, new_mtu, ETH_MIN_MTU); - return -EINVAL; - } - - if (new_mtu > ETH_MAX_MTU) { - net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n", - dev->name, new_mtu, ETH_MAX_MTU); - return -EINVAL; - } - - dev->mtu = new_mtu; - return 0; -} -#endif - -static void internal_dev_destructor(struct net_device *dev) -{ - struct vport *vport = ovs_internal_dev_get_vport(dev); - - ovs_vport_free(vport); -#ifndef HAVE_NEEDS_FREE_NETDEV - free_netdev(dev); -#endif -} - -static void -internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) -{ - int i; - - memset(stats, 0, sizeof(*stats)); - stats->rx_errors = dev->stats.rx_errors; - stats->tx_errors = dev->stats.tx_errors; - stats->tx_dropped = dev->stats.tx_dropped; - stats->rx_dropped = dev->stats.rx_dropped; - - for_each_possible_cpu(i) { - const struct pcpu_sw_netstats *percpu_stats; - struct pcpu_sw_netstats local_stats; - unsigned int start; - - percpu_stats = per_cpu_ptr(dev->tstats, i); - - do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; - } -} - -static const struct net_device_ops internal_dev_netdev_ops = { - .ndo_open = internal_dev_open, - .ndo_stop = internal_dev_stop, - .ndo_start_xmit = internal_dev_xmit, - .ndo_set_mac_address = eth_mac_addr, -#if !defined(HAVE_NET_DEVICE_WITH_MAX_MTU) && !defined(HAVE_RHEL7_MAX_MTU) - .ndo_change_mtu = internal_dev_change_mtu, -#endif - .ndo_get_stats64 = (void *)internal_get_stats, -}; - -static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { - .kind = "openvswitch", -}; - -static void do_setup(struct net_device *netdev) -{ - ether_setup(netdev); - -#ifdef HAVE_NET_DEVICE_WITH_MAX_MTU - netdev->max_mtu = ETH_MAX_MTU; -#elif defined(HAVE_RHEL7_MAX_MTU) - netdev->extended->max_mtu = ETH_MAX_MTU; -#endif - netdev->netdev_ops = &internal_dev_netdev_ops; - - netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | - IFF_NO_QUEUE; -#ifndef HAVE_NEEDS_FREE_NETDEV - netdev->destructor = internal_dev_destructor; -#else - netdev->needs_free_netdev = true; - netdev->priv_destructor = internal_dev_destructor; -#endif /* HAVE_NEEDS_FREE_NETDEV */ - netdev->ethtool_ops = &internal_dev_ethtool_ops; - netdev->rtnl_link_ops = &internal_dev_link_ops; - -#ifndef HAVE_IFF_NO_QUEUE - netdev->tx_queue_len = 0; -#endif - - netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | - NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; - - netdev->vlan_features = netdev->features; - netdev->hw_enc_features = netdev->features; - netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - netdev->hw_features = netdev->features & ~NETIF_F_LLTX; - - eth_hw_addr_random(netdev); -} - -static struct vport *internal_dev_create(const struct vport_parms *parms) -{ - struct vport *vport; - struct internal_dev *internal_dev; - int err; - - vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_USER, do_setup); - if (!vport->dev) { - err = -ENOMEM; - goto error_free_vport; - } - vport->dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->dev->tstats) { - err = -ENOMEM; - goto error_free_netdev; - } - - dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(vport->dev); - internal_dev->vport = vport; - - /* Restrict bridge port to current netns. */ - if (vport->port_no == OVSP_LOCAL) - vport->dev->features |= NETIF_F_NETNS_LOCAL; - - rtnl_lock(); - err = register_netdevice(vport->dev); - if (err) - goto error_unlock; - - dev_set_promiscuity(vport->dev, 1); - rtnl_unlock(); - netif_start_queue(vport->dev); - - return vport; - -error_unlock: - rtnl_unlock(); - free_percpu(vport->dev->tstats); -error_free_netdev: - free_netdev(vport->dev); -error_free_vport: - ovs_vport_free(vport); -error: - return ERR_PTR(err); -} - -static void internal_dev_destroy(struct vport *vport) -{ - netif_stop_queue(vport->dev); - rtnl_lock(); - dev_set_promiscuity(vport->dev, -1); - - /* unregister_netdevice() waits for an RCU grace period. */ - unregister_netdevice(vport->dev); - free_percpu(vport->dev->tstats); - rtnl_unlock(); -} - -static netdev_tx_t internal_dev_recv(struct sk_buff *skb) -{ - struct net_device *netdev = skb->dev; - struct pcpu_sw_netstats *stats; - - if (unlikely(!(netdev->flags & IFF_UP))) { - kfree_skb(skb); - netdev->stats.rx_dropped++; - return NETDEV_TX_OK; - } - - skb_dst_drop(skb); - nf_reset_ct(skb); - secpath_reset(skb); - - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, netdev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - stats = this_cpu_ptr(netdev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - - netif_rx(skb); - return NETDEV_TX_OK; -} - -static struct vport_ops ovs_internal_vport_ops = { - .type = OVS_VPORT_TYPE_INTERNAL, - .create = internal_dev_create, - .destroy = internal_dev_destroy, - .send = internal_dev_recv, -}; - -int ovs_is_internal_dev(const struct net_device *netdev) -{ - return netdev->netdev_ops == &internal_dev_netdev_ops; -} - -struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) -{ - if (!ovs_is_internal_dev(netdev)) - return NULL; - - return internal_dev_priv(netdev)->vport; -} - -int ovs_internal_dev_rtnl_link_register(void) -{ - int err; - - err = rtnl_link_register(&internal_dev_link_ops); - if (err < 0) - return err; - - err = ovs_vport_ops_register(&ovs_internal_vport_ops); - if (err < 0) - rtnl_link_unregister(&internal_dev_link_ops); - - return err; -} - -void ovs_internal_dev_rtnl_link_unregister(void) -{ - ovs_vport_ops_unregister(&ovs_internal_vport_ops); - rtnl_link_unregister(&internal_dev_link_ops); -} diff --git a/datapath/vport-internal_dev.h b/datapath/vport-internal_dev.h deleted file mode 100644 index 1b179a190..000000000 --- a/datapath/vport-internal_dev.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (c) 2007-2011 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef VPORT_INTERNAL_DEV_H -#define VPORT_INTERNAL_DEV_H 1 - -#include "datapath.h" -#include "vport.h" - -int ovs_is_internal_dev(const struct net_device *); -struct vport *ovs_internal_dev_get_vport(struct net_device *); -int ovs_internal_dev_rtnl_link_register(void); -void ovs_internal_dev_rtnl_link_unregister(void); - -#endif /* vport-internal_dev.h */ diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c deleted file mode 100644 index 5e2bcda88..000000000 --- a/datapath/vport-lisp.c +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/rculist.h> -#include <linux/udp.h> -#include <linux/if_vlan.h> -#include <linux/module.h> - -#include <net/lisp.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/udp.h> -#include <net/xfrm.h> - -#include "datapath.h" -#include "vport.h" -#include "vport-netdev.h" - -static struct vport_ops ovs_lisp_vport_ops; -/** - * struct lisp_port - Keeps track of open UDP ports - * @dst_port: destination port. - */ -struct lisp_port { - u16 port_no; -}; - -static inline struct lisp_port *lisp_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -static int lisp_get_options(const struct vport *vport, - struct sk_buff *skb) -{ - struct lisp_port *lisp_port = lisp_vport(vport); - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, lisp_port->port_no)) - return -EMSGSIZE; - return 0; -} - -static struct vport *lisp_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct lisp_port *lisp_port; - struct net_device *dev; - struct vport *vport; - struct nlattr *a; - u16 dst_port; - int err; - - if (!options) { - err = -EINVAL; - goto error; - } - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(sizeof(struct lisp_port), - &ovs_lisp_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - lisp_port = lisp_vport(vport); - lisp_port->port_no = dst_port; - - rtnl_lock(); - dev = lisp_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - goto error; - } - - rtnl_unlock(); - return vport; -error: - return ERR_PTR(err); -} - -static struct vport *lisp_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = lisp_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_lisp_vport_ops = { - .type = OVS_VPORT_TYPE_LISP, - .create = lisp_create, - .destroy = ovs_netdev_tunnel_destroy, - .get_options = lisp_get_options, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = lisp_fill_metadata_dst, -#endif - .send = lisp_xmit, -}; - -static int __init ovs_lisp_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_lisp_vport_ops); -} - -static void __exit ovs_lisp_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_lisp_vport_ops); -} - -module_init(ovs_lisp_tnl_init); -module_exit(ovs_lisp_tnl_exit); - -MODULE_DESCRIPTION("OVS: Lisp switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-105"); diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c deleted file mode 100644 index 4eb881671..000000000 --- a/datapath/vport-netdev.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/if_arp.h> -#include <linux/if_bridge.h> -#include <linux/if_vlan.h> -#include <linux/kernel.h> -#include <linux/llc.h> -#include <linux/rtnetlink.h> -#include <linux/skbuff.h> -#include <linux/openvswitch.h> -#include <linux/export.h> - -#include <net/ip_tunnels.h> -#include <net/rtnetlink.h> - -#include "datapath.h" -#include "gso.h" -#include "vport.h" -#include "vport-internal_dev.h" -#include "vport-netdev.h" - -static struct vport_ops ovs_netdev_vport_ops; - -/* Must be called with rcu_read_lock. */ -void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info) -{ - struct vport *vport; - - vport = ovs_netdev_get_vport(skb->dev); - if (unlikely(!vport)) - goto error; - - if (unlikely(skb_warn_if_lro(skb))) - goto error; - - /* Make our own copy of the packet. Otherwise we will mangle the - * packet for anyone who came before us (e.g. tcpdump via AF_PACKET). - */ - skb = skb_share_check(skb, GFP_ATOMIC); - if (unlikely(!skb)) - return; - - if (skb->dev->type == ARPHRD_ETHER) { - skb_push(skb, ETH_HLEN); - skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - } - ovs_vport_receive(vport, skb, tun_info); - return; -error: - kfree_skb(skb); -} - -/* Called with rcu_read_lock and bottom-halves disabled. */ -static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) -{ - struct sk_buff *skb = *pskb; - - if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) - return RX_HANDLER_PASS; - -#ifndef USE_UPSTREAM_TUNNEL - netdev_port_receive(skb, NULL); -#else - netdev_port_receive(skb, skb_tunnel_info(skb)); -#endif - return RX_HANDLER_CONSUMED; -} - -static struct net_device *get_dpdev(const struct datapath *dp) -{ - struct vport *local; - - local = ovs_vport_ovsl(dp, OVSP_LOCAL); - BUG_ON(!local); - return local->dev; -} - -struct vport *ovs_netdev_link(struct vport *vport, const char *name) -{ - int err; - - vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); - if (!vport->dev) { - err = -ENODEV; - goto error_free_vport; - } - - if (vport->dev->flags & IFF_LOOPBACK || - (vport->dev->type != ARPHRD_ETHER && - vport->dev->type != ARPHRD_NONE) || - ovs_is_internal_dev(vport->dev)) { - err = -EINVAL; - goto error_put; - } - - rtnl_lock(); - err = netdev_master_upper_dev_link(vport->dev, - get_dpdev(vport->dp), - NULL, NULL, NULL); - if (err) - goto error_unlock; - - err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, - vport); - if (err) - goto error_master_upper_dev_unlink; - - dev_disable_lro(vport->dev); - dev_set_promiscuity(vport->dev, 1); - vport->dev->priv_flags |= IFF_OVS_DATAPATH; - rtnl_unlock(); - - return vport; - -error_master_upper_dev_unlink: - netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); -error_unlock: - rtnl_unlock(); -error_put: - dev_put(vport->dev); -error_free_vport: - ovs_vport_free(vport); - return ERR_PTR(err); -} -EXPORT_SYMBOL_GPL(ovs_netdev_link); - -static struct vport *netdev_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static void vport_netdev_free(struct rcu_head *rcu) -{ - struct vport *vport = container_of(rcu, struct vport, rcu); - - if (vport->dev) - dev_put(vport->dev); - ovs_vport_free(vport); -} - -void ovs_netdev_detach_dev(struct vport *vport) -{ - ASSERT_RTNL(); - vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(vport->dev); - netdev_upper_dev_unlink(vport->dev, - netdev_master_upper_dev_get(vport->dev)); - dev_set_promiscuity(vport->dev, -1); -} - -static void netdev_destroy(struct vport *vport) -{ - rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) - ovs_netdev_detach_dev(vport); - rtnl_unlock(); - - call_rcu(&vport->rcu, vport_netdev_free); -} - -void ovs_netdev_tunnel_destroy(struct vport *vport) -{ - rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) - ovs_netdev_detach_dev(vport); - - /* We can be invoked by both explicit vport deletion and - * underlying netdev deregistration; delete the link only - * if it's not already shutting down. - */ - if (vport->dev->reg_state == NETREG_REGISTERED) - rtnl_delete_link(vport->dev); - dev_put(vport->dev); - vport->dev = NULL; - rtnl_unlock(); - - call_rcu(&vport->rcu, vport_netdev_free); -} -EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); - -/* Returns null if this device is not attached to a datapath. */ -struct vport *ovs_netdev_get_vport(struct net_device *dev) -{ - if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) - return (struct vport *) - rcu_dereference_rtnl(dev->rx_handler_data); - else - return NULL; -} - -static struct vport_ops ovs_netdev_vport_ops = { - .type = OVS_VPORT_TYPE_NETDEV, - .create = netdev_create, - .destroy = netdev_destroy, - .send = dev_queue_xmit, -}; - -int __init ovs_netdev_init(void) -{ - return ovs_vport_ops_register(&ovs_netdev_vport_ops); -} - -void ovs_netdev_exit(void) -{ - ovs_vport_ops_unregister(&ovs_netdev_vport_ops); -} diff --git a/datapath/vport-netdev.h b/datapath/vport-netdev.h deleted file mode 100644 index 04ad190c9..000000000 --- a/datapath/vport-netdev.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef VPORT_NETDEV_H -#define VPORT_NETDEV_H 1 - -#include <linux/netdevice.h> -#include <linux/rcupdate.h> - -#include "vport.h" - -struct vport *ovs_netdev_get_vport(struct net_device *dev); - -struct vport *ovs_netdev_link(struct vport *vport, const char *name); -void ovs_netdev_detach_dev(struct vport *); - -int __init ovs_netdev_init(void); -void ovs_netdev_exit(void); - -void ovs_netdev_tunnel_destroy(struct vport *vport); - -void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info); - -#endif /* vport_netdev.h */ diff --git a/datapath/vport-stt.c b/datapath/vport-stt.c deleted file mode 100644 index 71bbeda63..000000000 --- a/datapath/vport-stt.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) 2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/net.h> -#include <linux/rculist.h> -#include <linux/udp.h> -#include <linux/if_vlan.h> -#include <linux/module.h> - -#include <net/stt.h> -#include <net/icmp.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/udp.h> -#include <net/xfrm.h> - -#include "datapath.h" -#include "vport.h" -#include "vport-netdev.h" - -#ifdef OVS_STT -static struct vport_ops ovs_stt_vport_ops; -/** - * struct stt_port - Keeps track of open UDP ports - * @dst_port: destination port. - */ -struct stt_port { - u16 port_no; -}; - -static inline struct stt_port *stt_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -static int stt_get_options(const struct vport *vport, - struct sk_buff *skb) -{ - struct stt_port *stt_port = stt_vport(vport); - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, stt_port->port_no)) - return -EMSGSIZE; - return 0; -} - -static struct vport *stt_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct stt_port *stt_port; - struct net_device *dev; - struct vport *vport; - struct nlattr *a; - u16 dst_port; - int err; - - if (!options) { - err = -EINVAL; - goto error; - } - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(sizeof(struct stt_port), - &ovs_stt_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - stt_port = stt_vport(vport); - stt_port->port_no = dst_port; - - rtnl_lock(); - dev = stt_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - goto error; - } - - rtnl_unlock(); - return vport; -error: - return ERR_PTR(err); -} - -static struct vport *stt_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = stt_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_stt_vport_ops = { - .type = OVS_VPORT_TYPE_STT, - .create = stt_create, - .destroy = ovs_netdev_tunnel_destroy, - .get_options = stt_get_options, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = stt_fill_metadata_dst, -#endif - .send = ovs_stt_xmit, -}; - -static int __init ovs_stt_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_stt_vport_ops); -} - -static void __exit ovs_stt_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_stt_vport_ops); -} - -module_init(ovs_stt_tnl_init); -module_exit(ovs_stt_tnl_exit); - -MODULE_DESCRIPTION("OVS: STT switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-106"); -#endif diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c deleted file mode 100644 index 79331c968..000000000 --- a/datapath/vport-vxlan.c +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2015,2017 Nicira, Inc. - * Copyright (c) 2013 Cisco Systems, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/openvswitch.h> -#include <linux/module.h> -#include <net/udp.h> -#include <net/ip_tunnels.h> -#include <net/rtnetlink.h> -#include <net/vxlan.h> - -#include "datapath.h" -#include "vport.h" -#include "vport-netdev.h" - -static struct vport_ops ovs_vxlan_netdev_vport_ops; - -static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) -{ - struct vxlan_dev *vxlan = netdev_priv(vport->dev); - __be16 dst_port = vxlan->cfg.dst_port; - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) - return -EMSGSIZE; - -#ifdef HAVE_VXLAN_DEV_CFG - if (vxlan->cfg.flags & VXLAN_F_GBP) { -#else - if (vxlan->flags & VXLAN_F_GBP) { -#endif - struct nlattr *exts; - - exts = nla_nest_start_noflag(skb, OVS_TUNNEL_ATTR_EXTENSION); - if (!exts) - return -EMSGSIZE; - -#ifdef HAVE_VXLAN_DEV_CFG - if (vxlan->cfg.flags & VXLAN_F_GBP && -#else - if (vxlan->flags & VXLAN_F_GBP && -#endif - nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) - return -EMSGSIZE; - - nla_nest_end(skb, exts); -#ifdef HAVE_VXLAN_DEV_CFG - } else if (vxlan->cfg.flags & VXLAN_F_GPE) { -#else - } else if (vxlan->flags & VXLAN_F_GPE) { -#endif - struct nlattr *exts; - - exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); - if (!exts) - return -EMSGSIZE; - -#ifdef HAVE_VXLAN_DEV_CFG - if (vxlan->cfg.flags & VXLAN_F_GPE && -#else - if (vxlan->flags & VXLAN_F_GPE && -#endif - nla_put_flag(skb, OVS_VXLAN_EXT_GPE)) - return -EMSGSIZE; - - nla_nest_end(skb, exts); - } - - return 0; -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, - [OVS_VXLAN_EXT_GPE] = { .type = NLA_FLAG, }, -}; - -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, - struct vxlan_config *conf) -{ - struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; - int err; - - if (nla_len(attr) < sizeof(struct nlattr)) - return -EINVAL; - - err = nla_parse_nested_deprecated(exts, OVS_VXLAN_EXT_MAX, attr, - exts_policy, NULL); - if (err < 0) - return err; - - if (exts[OVS_VXLAN_EXT_GBP]) - conf->flags |= VXLAN_F_GBP; - else if (exts[OVS_VXLAN_EXT_GPE]) - conf->flags |= VXLAN_F_GPE; - - return 0; -} - -static struct vport *vxlan_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct net_device *dev; - struct vport *vport; - struct nlattr *a; - int err; - struct vxlan_config conf = { - .no_share = true, - .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, - /* Don't restrict the packets that can be sent by MTU */ - .mtu = IP_MAX_MTU, - }; - - if (!options) { - err = -EINVAL; - goto error; - } - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - conf.dst_port = htons(nla_get_u16(a)); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); - if (a) { - err = vxlan_configure_exts(vport, a, &conf); - if (err) { - ovs_vport_free(vport); - goto error; - } - } - - rtnl_lock(); - dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); - if (err < 0) { - rtnl_delete_link(dev); - rtnl_unlock(); - ovs_vport_free(vport); - goto error; - } - - rtnl_unlock(); - return vport; -error: - return ERR_PTR(err); -} - -static struct vport *vxlan_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = vxlan_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return ovs_netdev_link(vport, parms->name); -} - -static struct vport_ops ovs_vxlan_netdev_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_create, - .destroy = ovs_netdev_tunnel_destroy, - .get_options = vxlan_get_options, -#ifndef USE_UPSTREAM_TUNNEL - .fill_metadata_dst = vxlan_fill_metadata_dst, -#endif - .send = vxlan_xmit, -}; - -static int __init ovs_vxlan_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); -} - -static void __exit ovs_vxlan_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); -} - -module_init(ovs_vxlan_tnl_init); -module_exit(ovs_vxlan_tnl_exit); - -MODULE_DESCRIPTION("OVS: VXLAN switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-4"); diff --git a/datapath/vport.c b/datapath/vport.c deleted file mode 100644 index bd62c5612..000000000 --- a/datapath/vport.c +++ /dev/null @@ -1,614 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#include <linux/etherdevice.h> -#include <linux/if.h> -#include <linux/if_vlan.h> -#include <linux/jhash.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/mutex.h> -#include <linux/percpu.h> -#include <linux/rcupdate.h> -#include <linux/rtnetlink.h> -#include <linux/compat.h> -#include <linux/module.h> -#include <linux/if_link.h> -#include <net/net_namespace.h> -#include <net/lisp.h> -#include <net/gre.h> -#include <net/geneve.h> -#include <net/stt.h> -#include <net/vxlan.h> - -#include "datapath.h" -#include "gso.h" -#include "vport.h" -#include "vport-internal_dev.h" - -static LIST_HEAD(vport_ops_list); -static bool compat_gre_loaded = false; -static bool compat_ip6_tunnel_loaded = false; - -/* Protected by RCU read lock for reading, ovs_mutex for writing. */ -static struct hlist_head *dev_table; -#define VPORT_HASH_BUCKETS 1024 - -/** - * ovs_vport_init - initialize vport subsystem - * - * Called at module load time to initialize the vport subsystem. - */ -int ovs_vport_init(void) -{ - int err; - - dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head), - GFP_KERNEL); - if (!dev_table) - return -ENOMEM; - - err = lisp_init_module(); - if (err) - goto err_lisp; - err = gre_init(); - if (err && err != -EEXIST) { - goto err_gre; - } else { - if (err == -EEXIST) { - pr_warn("Cannot take GRE protocol rx entry"\ - "- The GRE/ERSPAN rx feature not supported\n"); - /* continue GRE tx */ - } - - err = ipgre_init(); - if (err && err != -EEXIST) - goto err_ipgre; - compat_gre_loaded = true; - } - err = ip6gre_init(); - if (err && err != -EEXIST) { - goto err_ip6gre; - } else { - if (err == -EEXIST) { - pr_warn("IPv6 GRE/ERSPAN Rx mode is not supported\n"); - goto skip_ip6_tunnel_init; - } - } - - err = ip6_tunnel_init(); - if (err) - goto err_ip6_tunnel; - else - compat_ip6_tunnel_loaded = true; - -skip_ip6_tunnel_init: - err = geneve_init_module(); - if (err) - goto err_geneve; - err = vxlan_init_module(); - if (err) - goto err_vxlan; - err = ovs_stt_init_module(); - if (err) - goto err_stt; - - return 0; - ovs_stt_cleanup_module(); -err_stt: - vxlan_cleanup_module(); -err_vxlan: - geneve_cleanup_module(); -err_geneve: - ip6_tunnel_cleanup(); -err_ip6_tunnel: - ip6gre_fini(); -err_ip6gre: - ipgre_fini(); -err_ipgre: - gre_exit(); -err_gre: - lisp_cleanup_module(); -err_lisp: - kfree(dev_table); - return err; -} - -/** - * ovs_vport_exit - shutdown vport subsystem - * - * Called at module exit time to shutdown the vport subsystem. - */ -void ovs_vport_exit(void) -{ - if (compat_gre_loaded) { - gre_exit(); - ipgre_fini(); - } - ovs_stt_cleanup_module(); - vxlan_cleanup_module(); - geneve_cleanup_module(); - if (compat_ip6_tunnel_loaded) - ip6_tunnel_cleanup(); - ip6gre_fini(); - lisp_cleanup_module(); - kfree(dev_table); -} - -static struct hlist_head *hash_bucket(const struct net *net, const char *name) -{ - unsigned int hash = jhash(name, strlen(name), (unsigned long) net); - return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; -} - -int __ovs_vport_ops_register(struct vport_ops *ops) -{ - int err = -EEXIST; - struct vport_ops *o; - - ovs_lock(); - list_for_each_entry(o, &vport_ops_list, list) - if (ops->type == o->type) - goto errout; - - list_add_tail(&ops->list, &vport_ops_list); - err = 0; -errout: - ovs_unlock(); - return err; -} -EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); - -void ovs_vport_ops_unregister(struct vport_ops *ops) -{ - ovs_lock(); - list_del(&ops->list); - ovs_unlock(); -} -EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); - -/** - * ovs_vport_locate - find a port that has already been created - * - * @name: name of port to find - * - * Must be called with ovs or RCU read lock. - */ -struct vport *ovs_vport_locate(const struct net *net, const char *name) -{ - struct hlist_head *bucket = hash_bucket(net, name); - struct vport *vport; - - hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, ovs_vport_name(vport)) && - net_eq(ovs_dp_get_net(vport->dp), net)) - return vport; - - return NULL; -} - -/** - * ovs_vport_alloc - allocate and initialize new vport - * - * @priv_size: Size of private data area to allocate. - * @ops: vport device ops - * - * Allocate and initialize a new vport defined by @ops. The vport will contain - * a private data area of size @priv_size that can be accessed using - * vport_priv(). vports that are no longer needed should be released with - * vport_free(). - */ -struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, - const struct vport_parms *parms) -{ - struct vport *vport; - size_t alloc_size; - - alloc_size = sizeof(struct vport); - if (priv_size) { - alloc_size = ALIGN(alloc_size, VPORT_ALIGN); - alloc_size += priv_size; - } - - vport = kzalloc(alloc_size, GFP_KERNEL); - if (!vport) - return ERR_PTR(-ENOMEM); - - vport->dp = parms->dp; - vport->port_no = parms->port_no; - vport->ops = ops; - INIT_HLIST_NODE(&vport->dp_hash_node); - - if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { - kfree(vport); - return ERR_PTR(-EINVAL); - } - - return vport; -} -EXPORT_SYMBOL_GPL(ovs_vport_alloc); - -/** - * ovs_vport_free - uninitialize and free vport - * - * @vport: vport to free - * - * Frees a vport allocated with vport_alloc() when it is no longer needed. - * - * The caller must ensure that an RCU grace period has passed since the last - * time @vport was in a datapath. - */ -void ovs_vport_free(struct vport *vport) -{ - /* vport is freed from RCU callback or error path, Therefore - * it is safe to use raw dereference. - */ - kfree(rcu_dereference_raw(vport->upcall_portids)); - kfree(vport); -} -EXPORT_SYMBOL_GPL(ovs_vport_free); - -static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) -{ - struct vport_ops *ops; - - list_for_each_entry(ops, &vport_ops_list, list) - if (ops->type == parms->type) - return ops; - - return NULL; -} - -/** - * ovs_vport_add - add vport device (for kernel callers) - * - * @parms: Information about new vport. - * - * Creates a new vport with the specified configuration (which is dependent on - * device type). ovs_mutex must be held. - */ -struct vport *ovs_vport_add(const struct vport_parms *parms) -{ - struct vport_ops *ops; - struct vport *vport; - - ops = ovs_vport_lookup(parms); - if (ops) { - struct hlist_head *bucket; - - if (!try_module_get(ops->owner)) - return ERR_PTR(-EAFNOSUPPORT); - - vport = ops->create(parms); - if (IS_ERR(vport)) { - module_put(ops->owner); - return vport; - } - - bucket = hash_bucket(ovs_dp_get_net(vport->dp), - ovs_vport_name(vport)); - hlist_add_head_rcu(&vport->hash_node, bucket); - return vport; - } - - if (parms->type == OVS_VPORT_TYPE_GRE && !compat_gre_loaded) { - pr_warn("GRE protocol already loaded!\n"); - return ERR_PTR(-EAFNOSUPPORT); - } - /* Unlock to attempt module load and return -EAGAIN if load - * was successful as we need to restart the port addition - * workflow. - */ - ovs_unlock(); - request_module("vport-type-%d", parms->type); - ovs_lock(); - - if (!ovs_vport_lookup(parms)) - return ERR_PTR(-EAFNOSUPPORT); - else - return ERR_PTR(-EAGAIN); -} - -/** - * ovs_vport_set_options - modify existing vport device (for kernel callers) - * - * @vport: vport to modify. - * @options: New configuration. - * - * Modifies an existing device with the specified configuration (which is - * dependent on device type). ovs_mutex must be held. - */ -int ovs_vport_set_options(struct vport *vport, struct nlattr *options) -{ - if (!vport->ops->set_options) - return -EOPNOTSUPP; - return vport->ops->set_options(vport, options); -} - -/** - * ovs_vport_del - delete existing vport device - * - * @vport: vport to delete. - * - * Detaches @vport from its datapath and destroys it. ovs_mutex must be - * held. - */ -void ovs_vport_del(struct vport *vport) -{ - ASSERT_OVSL(); - - hlist_del_rcu(&vport->hash_node); - module_put(vport->ops->owner); - vport->ops->destroy(vport); -} - -/** - * ovs_vport_get_stats - retrieve device stats - * - * @vport: vport from which to retrieve the stats - * @stats: location to store stats - * - * Retrieves transmit, receive, and error stats for the given device. - * - * Must be called with ovs_mutex or rcu_read_lock. - */ -void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) -{ - const struct rtnl_link_stats64 *dev_stats; - struct rtnl_link_stats64 temp; - - dev_stats = dev_get_stats(vport->dev, &temp); - stats->rx_errors = dev_stats->rx_errors; - stats->tx_errors = dev_stats->tx_errors; - stats->tx_dropped = dev_stats->tx_dropped; - stats->rx_dropped = dev_stats->rx_dropped; - - stats->rx_bytes = dev_stats->rx_bytes; - stats->rx_packets = dev_stats->rx_packets; - stats->tx_bytes = dev_stats->tx_bytes; - stats->tx_packets = dev_stats->tx_packets; -} - -/** - * ovs_vport_get_options - retrieve device options - * - * @vport: vport from which to retrieve the options. - * @skb: sk_buff where options should be appended. - * - * Retrieves the configuration of the given device, appending an - * %OVS_VPORT_ATTR_OPTIONS attribute that in turn contains nested - * vport-specific attributes to @skb. - * - * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room, or another - * negative error code if a real error occurred. If an error occurs, @skb is - * left unmodified. - * - * Must be called with ovs_mutex or rcu_read_lock. - */ -int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) -{ - struct nlattr *nla; - int err; - - if (!vport->ops->get_options) - return 0; - - nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); - if (!nla) - return -EMSGSIZE; - - err = vport->ops->get_options(vport, skb); - if (err) { - nla_nest_cancel(skb, nla); - return err; - } - - nla_nest_end(skb, nla); - return 0; -} - -/** - * ovs_vport_set_upcall_portids - set upcall portids of @vport. - * - * @vport: vport to modify. - * @ids: new configuration, an array of port ids. - * - * Sets the vport's upcall_portids to @ids. - * - * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed - * as an array of U32. - * - * Must be called with ovs_mutex. - */ -int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) -{ - struct vport_portids *old, *vport_portids; - - if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) - return -EINVAL; - - old = ovsl_dereference(vport->upcall_portids); - - vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), - GFP_KERNEL); - if (!vport_portids) - return -ENOMEM; - - vport_portids->n_ids = nla_len(ids) / sizeof(u32); - vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); - nla_memcpy(vport_portids->ids, ids, nla_len(ids)); - - rcu_assign_pointer(vport->upcall_portids, vport_portids); - - if (old) - kfree_rcu(old, rcu); - return 0; -} - -/** - * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. - * - * @vport: vport from which to retrieve the portids. - * @skb: sk_buff where portids should be appended. - * - * Retrieves the configuration of the given vport, appending the - * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall - * portids to @skb. - * - * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. - * If an error occurs, @skb is left unmodified. Must be called with - * ovs_mutex or rcu_read_lock. - */ -int ovs_vport_get_upcall_portids(const struct vport *vport, - struct sk_buff *skb) -{ - struct vport_portids *ids; - - ids = rcu_dereference_ovsl(vport->upcall_portids); - - if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) - return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, - ids->n_ids * sizeof(u32), (void *)ids->ids); - else - return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); -} - -/** - * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. - * - * @vport: vport from which the missed packet is received. - * @skb: skb that the missed packet was received. - * - * Uses the skb_get_hash() to select the upcall portid to send the - * upcall. - * - * Returns the portid of the target socket. Must be called with rcu_read_lock. - */ -u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) -{ - struct vport_portids *ids; - u32 ids_index; - u32 hash; - - ids = rcu_dereference(vport->upcall_portids); - - /* If there is only one portid, select it in the fast-path. */ - if (ids->n_ids == 1) - return ids->ids[0]; - - hash = skb_get_hash(skb); - ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); - return ids->ids[ids_index]; -} - -/** - * ovs_vport_receive - pass up received packet to the datapath for processing - * - * @vport: vport that received the packet - * @skb: skb that was received - * @tun_key: tunnel (if any) that carried packet - * - * Must be called with rcu_read_lock. The packet cannot be shared and - * skb->data should point to the Ethernet header. - */ -int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ip_tunnel_info *tun_info) -{ - struct sw_flow_key key; - int error; - - OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->mru = 0; - OVS_CB(skb)->cutlen = 0; - if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { - u32 mark; - - mark = skb->mark; - skb_scrub_packet(skb, true); - skb->mark = mark; - tun_info = NULL; - } - - ovs_skb_init_inner_protocol(skb); - skb_clear_ovs_gso_cb(skb); - /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_key_extract(tun_info, skb, &key); - if (unlikely(error)) { - kfree_skb(skb); - return error; - } - ovs_dp_process_packet(skb, &key); - return 0; -} - -static int packet_length(const struct sk_buff *skb, - struct net_device *dev) -{ - int length = skb->len - dev->hard_header_len; - - if (!skb_vlan_tag_present(skb) && - eth_type_vlan(skb->protocol)) - length -= VLAN_HLEN; - - /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow - * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none - * account for 802.1ad. e.g. is_skb_forwardable(). - */ - - return length > 0 ? length: 0; -} - -void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) -{ - int mtu = vport->dev->mtu; - - switch (vport->dev->type) { - case ARPHRD_NONE: - if (mac_proto == MAC_PROTO_ETHERNET) { - skb_reset_network_header(skb); - skb_reset_mac_len(skb); - skb->protocol = htons(ETH_P_TEB); - } else if (mac_proto != MAC_PROTO_NONE) { - WARN_ON_ONCE(1); - goto drop; - } - break; - case ARPHRD_ETHER: - if (mac_proto != MAC_PROTO_ETHERNET) - goto drop; - break; - default: - goto drop; - } - - if (unlikely(packet_length(skb, vport->dev) > mtu && - !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - vport->dev->name, - packet_length(skb, vport->dev), mtu); - vport->dev->stats.tx_errors++; - goto drop; - } - - skb->dev = vport->dev; - vport->ops->send(skb); - return; - -drop: - kfree_skb(skb); -} diff --git a/datapath/vport.h b/datapath/vport.h deleted file mode 100644 index d630c34bc..000000000 --- a/datapath/vport.h +++ /dev/null @@ -1,205 +0,0 @@ -/* - * Copyright (c) 2007-2015 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#ifndef VPORT_H -#define VPORT_H 1 - -#include <linux/if_tunnel.h> -#include <linux/list.h> -#include <linux/netlink.h> -#include <linux/openvswitch.h> -#include <linux/reciprocal_div.h> -#include <linux/skbuff.h> -#include <linux/spinlock.h> -#include <linux/u64_stats_sync.h> - -#include "datapath.h" - -struct vport; -struct vport_parms; - -/* The following definitions are for users of the vport subsytem: */ - -int ovs_vport_init(void); -void ovs_vport_exit(void); - -struct vport *ovs_vport_add(const struct vport_parms *); -void ovs_vport_del(struct vport *); - -struct vport *ovs_vport_locate(const struct net *net, const char *name); - -void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); - -int ovs_vport_set_options(struct vport *, struct nlattr *options); -int ovs_vport_get_options(const struct vport *, struct sk_buff *); - -int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); -int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); -u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); - -/** - * struct vport_portids - array of netlink portids of a vport. - * must be protected by rcu. - * @rn_ids: The reciprocal value of @n_ids. - * @rcu: RCU callback head for deferred destruction. - * @n_ids: Size of @ids array. - * @ids: Array storing the Netlink socket pids to be used for packets received - * on this port that miss the flow table. - */ -struct vport_portids { - struct reciprocal_value rn_ids; - struct rcu_head rcu; - u32 n_ids; - u32 ids[]; -}; - -/** - * struct vport - one port within a datapath - * @dev: Pointer to net_device. - * @dp: Datapath to which this port belongs. - * @upcall_portids: RCU protected 'struct vport_portids'. - * @port_no: Index into @dp's @ports array. - * @hash_node: Element in @dev_table hash table in vport.c. - * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. - * @ops: Class structure. - * @detach_list: list used for detaching vport in net-exit call. - * @rcu: RCU callback head for deferred destruction. - */ -struct vport { - struct net_device *dev; - struct datapath *dp; - struct vport_portids __rcu *upcall_portids; - u16 port_no; - - struct hlist_node hash_node; - struct hlist_node dp_hash_node; - const struct vport_ops *ops; - - struct list_head detach_list; - struct rcu_head rcu; -}; - -/** - * struct vport_parms - parameters for creating a new vport - * - * @name: New vport's name. - * @type: New vport's type. - * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if - * none was supplied. - * @dp: New vport's datapath. - * @port_no: New vport's port number. - */ -struct vport_parms { - const char *name; - enum ovs_vport_type type; - struct nlattr *options; - - /* For ovs_vport_alloc(). */ - struct datapath *dp; - u16 port_no; - struct nlattr *upcall_portids; -}; - -/** - * struct vport_ops - definition of a type of virtual port - * - * @type: %OVS_VPORT_TYPE_* value for this type of virtual port. - * @create: Create a new vport configured as specified. On success returns - * a new vport allocated with ovs_vport_alloc(), otherwise an ERR_PTR() value. - * @destroy: Destroys a vport. Must call vport_free() on the vport but not - * before an RCU grace period has elapsed. - * @set_options: Modify the configuration of an existing vport. May be %NULL - * if modification is not supported. - * @get_options: Appends vport-specific attributes for the configuration of an - * existing vport to a &struct sk_buff. May be %NULL for a vport that does not - * have any configuration. - * @send: Send a packet on the device. - * zero for dropped packets or negative for error. - */ -struct vport_ops { - enum ovs_vport_type type; - - /* Called with ovs_mutex. */ - struct vport *(*create)(const struct vport_parms *); - void (*destroy)(struct vport *); - - int (*set_options)(struct vport *, struct nlattr *); - int (*get_options)(const struct vport *, struct sk_buff *); - - netdev_tx_t (*send)(struct sk_buff *skb); -#ifndef USE_UPSTREAM_TUNNEL - int (*fill_metadata_dst)(struct net_device *dev, struct sk_buff *skb); -#endif - struct module *owner; - struct list_head list; -}; - -struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, - const struct vport_parms *); -void ovs_vport_free(struct vport *); - -#define VPORT_ALIGN 8 - -/** - * vport_priv - access private data area of vport - * - * @vport: vport to access - * - * If a nonzero size was passed in priv_size of vport_alloc() a private data - * area was allocated on creation. This allows that area to be accessed and - * used for any purpose needed by the vport implementer. - */ -static inline void *vport_priv(const struct vport *vport) -{ - return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); -} - -/** - * vport_from_priv - lookup vport from private data pointer - * - * @priv: Start of private data area. - * - * It is sometimes useful to translate from a pointer to the private data - * area to the vport, such as in the case where the private data pointer is - * the result of a hash table lookup. @priv must point to the start of the - * private data area. - */ -static inline struct vport *vport_from_priv(void *priv) -{ - return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); -} - -int ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ip_tunnel_info *); - -static inline const char *ovs_vport_name(struct vport *vport) -{ - return vport->dev->name; -} - -int __ovs_vport_ops_register(struct vport_ops *ops); -#define ovs_vport_ops_register(ops) \ - ({ \ - (ops)->owner = THIS_MODULE; \ - __ovs_vport_ops_register(ops); \ - }) - -void ovs_vport_ops_unregister(struct vport_ops *ops); -void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto); - -#endif /* vport.h */ diff --git a/debian/copyright.in b/debian/copyright.in index 64d23795d..ff38792d9 100644 --- a/debian/copyright.in +++ b/debian/copyright.in @@ -81,7 +81,7 @@ Copyright: (c) 1990,1993, The Regents of the University of California License: BSD-3-clause Files: - datapath/linux/compat/include/linux/openvswitch.h + include/linux/openvswitch.h Copyright: (c) 2007-2017 Nicira, Inc. License: Apache-2.0-or-GPL-2.0 diff --git a/include/automake.mk b/include/automake.mk index e982da87d..1e3390ae0 100644 --- a/include/automake.mk +++ b/include/automake.mk @@ -1,6 +1,6 @@ BUILT_SOURCES += include/odp-netlink.h include/odp-netlink-macros.h -include/odp-netlink.h: datapath/linux/compat/include/linux/openvswitch.h \ +include/odp-netlink.h: include/linux/openvswitch.h \ build-aux/extract-odp-netlink-h $(AM_V_GEN)sed -f $(srcdir)/build-aux/extract-odp-netlink-h < $< > $@ diff --git a/include/linux/automake.mk b/include/linux/automake.mk index f857c7e08..cdae5eedc 100644 --- a/include/linux/automake.mk +++ b/include/linux/automake.mk @@ -1,6 +1,7 @@ noinst_HEADERS += \ include/linux/netlink.h \ include/linux/netfilter/nf_conntrack_sctp.h \ + include/linux/openvswitch.h \ include/linux/pkt_cls.h \ include/linux/gen_stats.h \ include/linux/tc_act/tc_mpls.h \ diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/include/linux/openvswitch.h index 8bb5abdc8..8bb5abdc8 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h diff --git a/utilities/docker/debian/build-kernel-modules.sh b/utilities/docker/debian/build-kernel-modules.sh index 872ba1eb8..aaee73ff7 100755 --- a/utilities/docker/debian/build-kernel-modules.sh +++ b/utilities/docker/debian/build-kernel-modules.sh @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -KERNEL_VERSION=$1 +KERNEL_VERSION=host OVS_BRANCH=$2 GITHUB_SRC=$3 @@ -39,18 +39,13 @@ cd ovs config="./configure --localstatedir="/var" --sysconfdir="/etc" --prefix="/usr" --enable-ssl" -if [ $KERNEL_VERSION = "host" ]; then - eval $config -else - withlinux=" --with-linux=/lib/modules/$KERNEL_VERSION/build" - eval $config$withlinux -fi +eval $config -make -j8; make install; make modules_install +make -j8; make install # remove deps to make the container light weight. apt-get remove --purge -y ${build_deps} apt-get autoremove -y --purge cd ..; rm -rf ovs basic_utils="vim kmod net-tools uuid-runtime iproute2" -apt-get install -y ${basic_utils}
\ No newline at end of file +apt-get install -y ${basic_utils} diff --git a/utilities/ovs-dev.py b/utilities/ovs-dev.py index 534c5e7f1..d64e464f4 100755 --- a/utilities/ovs-dev.py +++ b/utilities/ovs-dev.py @@ -106,7 +106,7 @@ def conf(): pass # Directory exists. os.chdir(BUILD_GCC) - _sh(*(configure + ["--with-linux=/lib/modules/%s/build" % uname()])) + _sh(*(configure)) try: _sh("clang --version", check=True) @@ -184,12 +184,9 @@ def tag(): ctags = ['ctags', '-R', '-f', '.tags'] try: - _sh(*(ctags + ['--exclude="datapath/"'])) + _sh(*ctags) except: - try: - _sh(*ctags) # Some versions of ctags don't have --exclude - except: - pass + pass try: _sh('cscope', '-R', '-b') @@ -351,7 +348,7 @@ Basic Configuration: # First install the basic requirements needed to build Open vSwitch. sudo apt-get install git build-essential libtool autoconf pkg-config \\ - libssl-dev gdb libcap-ng-dev linux-headers-`uname -r` + libssl-dev gdb libcap-ng-dev # Next clone the Open vSwitch source. git clone https://github.com/openvswitch/ovs.git %(ovs)s @@ -362,14 +359,6 @@ Basic Configuration: # Build the switch. %(v)s conf make - # Install the kernel module - sudo insmod %(ovs)s/datapath/linux/openvswitch.ko - - # If needed, manually load all required vport modules: - sudo insmod %(ovs)s/datapath/linux/vport-vxlan.ko - sudo insmod %(ovs)s/datapath/linux/vport-geneve.ko - [...] - # Run the switch. %(v)s run |