Diffstat:
-rw-r--r--  FAQ.md                                                 |    2
-rw-r--r--  acinclude.m4                                           |   57
-rw-r--r--  datapath/Modules.mk                                    |    3
-rw-r--r--  datapath/actions.c                                     |   19
-rw-r--r--  datapath/compat.h                                      |   38
-rw-r--r--  datapath/datapath.c                                    |   28
-rw-r--r--  datapath/datapath.h                                    |   12
-rw-r--r--  datapath/dp_notify.c                                   |    5
-rw-r--r--  datapath/flow.c                                        |   16
-rw-r--r--  datapath/flow.h                                        |   81
-rw-r--r--  datapath/flow_netlink.c                                |  151
-rw-r--r--  datapath/flow_netlink.h                                |    8
-rw-r--r--  datapath/flow_table.c                                  |    6
-rw-r--r--  datapath/linux/Modules.mk                              |   11
-rw-r--r--  datapath/linux/compat/dev-openvswitch.c                |   22
-rw-r--r--  datapath/linux/compat/geneve.c                         | 1069
-rw-r--r--  datapath/linux/compat/gre.c                            |  121
-rw-r--r--  datapath/linux/compat/gso.c                            |    2
-rw-r--r--  datapath/linux/compat/gso.h                            |   97
-rw-r--r--  datapath/linux/compat/include/linux/etherdevice.h      |   25
-rw-r--r--  datapath/linux/compat/include/linux/if_link.h          |  151
-rw-r--r--  datapath/linux/compat/include/linux/if_vlan.h          |   52
-rw-r--r--  datapath/linux/compat/include/linux/list.h             |    5
-rw-r--r--  datapath/linux/compat/include/linux/netdev_features.h  |   45
-rw-r--r--  datapath/linux/compat/include/linux/netdevice.h        |  117
-rw-r--r--  datapath/linux/compat/include/linux/skbuff.h           |   28
-rw-r--r--  datapath/linux/compat/include/linux/stddef.h           |    5
-rw-r--r--  datapath/linux/compat/include/net/dst_metadata.h       |   44
-rw-r--r--  datapath/linux/compat/include/net/geneve.h             |   65
-rw-r--r--  datapath/linux/compat/include/net/gre.h                |   90
-rw-r--r--  datapath/linux/compat/include/net/inet_ecn.h           |   59
-rw-r--r--  datapath/linux/compat/include/net/ip6_route.h          |   31
-rw-r--r--  datapath/linux/compat/include/net/ip6_tunnel.h         |   33
-rw-r--r--  datapath/linux/compat/include/net/ip_tunnels.h         |  224
-rw-r--r--  datapath/linux/compat/include/net/lisp.h               |   24
-rw-r--r--  datapath/linux/compat/include/net/net_namespace.h      |    5
-rw-r--r--  datapath/linux/compat/include/net/route.h              |  109
-rw-r--r--  datapath/linux/compat/include/net/rtnetlink.h          |   30
-rw-r--r--  datapath/linux/compat/include/net/stt.h                |   56
-rw-r--r--  datapath/linux/compat/include/net/udp_tunnel.h         |   65
-rw-r--r--  datapath/linux/compat/include/net/vxlan.h              |  271
-rw-r--r--  datapath/linux/compat/ip_gre.c                         |  680
-rw-r--r--  datapath/linux/compat/ip_tunnel.c                      |  285
-rw-r--r--  datapath/linux/compat/ip_tunnels_core.c                |   93
-rw-r--r--  datapath/linux/compat/lisp.c                           |  711
-rw-r--r--  datapath/linux/compat/netdevice.c                      |  119
-rw-r--r--  datapath/linux/compat/skbuff-openvswitch.c             |   32
-rw-r--r--  datapath/linux/compat/stt.c                            |  502
-rw-r--r--  datapath/linux/compat/udp_tunnel.c                     |  100
-rw-r--r--  datapath/linux/compat/vxlan.c                          | 2100
-rw-r--r--  datapath/vport-geneve.c                                |  193
-rw-r--r--  datapath/vport-gre.c                                   |  252
-rw-r--r--  datapath/vport-internal_dev.c                          |  156
-rw-r--r--  datapath/vport-lisp.c                                  |  468
-rw-r--r--  datapath/vport-netdev.c                                |  216
-rw-r--r--  datapath/vport-netdev.h                                |   21
-rw-r--r--  datapath/vport-stt.c                                   |  161
-rw-r--r--  datapath/vport-vxlan.c                                 |  236
-rw-r--r--  datapath/vport-vxlan.h                                 |   11
-rw-r--r--  datapath/vport.c                                       |  324
-rw-r--r--  datapath/vport.h                                       |   81
61 files changed, 7597 insertions, 2426 deletions
diff --git a/FAQ.md b/FAQ.md
index 9280065c5..d99680771 100644
--- a/FAQ.md
+++ b/FAQ.md
@@ -156,7 +156,7 @@ A: The following table lists the Linux kernel versions against which the
| 2.1.x | 2.6.32 to 3.11
| 2.3.x | 2.6.32 to 3.14
| 2.4.x | 2.6.32 to 4.0
-| 2.5.x | 2.6.32 to 4.2
+| 2.5.x | 2.6.32 to 4.3
Open vSwitch userspace should also work with the Linux kernel module
built into Linux 3.3 and later.
diff --git a/acinclude.m4 b/acinclude.m4
index e4846d90a..7c8afaca6 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -134,10 +134,10 @@ AC_DEFUN([OVS_CHECK_LINUX], [
AC_MSG_RESULT([$kversion])
if test "$version" -ge 4; then
- if test "$version" = 4 && test "$patchlevel" -le 2; then
+ if test "$version" = 4 && test "$patchlevel" -le 3; then
: # Linux 4.x
else
- AC_ERROR([Linux kernel in $KBUILD is version $kversion, but version newer than 4.2.x is not supported (please refer to the FAQ for advice)])
+ AC_ERROR([Linux kernel in $KBUILD is version $kversion, but version newer than 4.3.x is not supported (please refer to the FAQ for advice)])
fi
elif test "$version" = 3; then
: # Linux 3.x
@@ -313,15 +313,28 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
mkdir -p datapath/linux
: > datapath/linux/kcompat.h.new
+ echo '#include <linux/version.h>
+#ifndef RHEL_RELEASE_CODE
+#define RHEL_RELEASE_CODE 0
+#define RHEL_RELEASE_VERSION(a, b) 0
+#endif' >> datapath/linux/kcompat.h.new
+
OVS_GREP_IFELSE([$KSRC/arch/x86/include/asm/checksum_32.h], [src_err,],
[OVS_DEFINE([HAVE_CSUM_COPY_DBG])])
+ OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_dst_lookup.*net],
+ [OVS_DEFINE([HAVE_IPV6_DST_LOOKUP_NET])])
+ OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_stub])
+
OVS_GREP_IFELSE([$KSRC/include/linux/err.h], [ERR_CAST])
OVS_GREP_IFELSE([$KSRC/include/linux/err.h], [IS_ERR_OR_NULL])
OVS_GREP_IFELSE([$KSRC/include/linux/etherdevice.h], [eth_hw_addr_random])
OVS_GREP_IFELSE([$KSRC/include/linux/etherdevice.h], [ether_addr_copy])
+ OVS_GREP_IFELSE([$KSRC/include/uapi/linux/if_link.h], [IFLA_GENEVE_TOS])
+ OVS_GREP_IFELSE([$KSRC/include/uapi/linux/if_link.h], [rtnl_link_stats64])
+ OVS_GREP_IFELSE([$KSRC/include/linux/if_link.h], [rtnl_link_stats64])
OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_set_encap_proto])
OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_hwaccel_push_inside])
@@ -329,9 +342,13 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
OVS_GREP_IFELSE([$KSRC/include/linux/in.h], [proto_ports_offset])
OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [__ip_select_ident.*dst_entry],
[OVS_DEFINE([HAVE_IP_SELECT_IDENT_USING_DST_ENTRY])])
+ OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [__ip_select_ident.*net],
+ [OVS_DEFINE([HAVE_IP_SELECT_IDENT_USING_NET])])
+
OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [inet_get_local_port_range.*net],
[OVS_DEFINE([HAVE_INET_GET_LOCAL_PORT_RANGE_USING_NET])])
OVS_GREP_IFELSE([$KSRC/include/net/ip.h], [ip_is_fragment])
+ OVS_GREP_IFELSE([$KSRC/include/net/dst_metadata.h], [metadata_dst])
OVS_GREP_IFELSE([$KSRC/include/linux/net.h], [sock_create_kern.*net],
[OVS_DEFINE([HAVE_SOCK_CREATE_KERN_NET])])
@@ -340,17 +357,33 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [dev_get_by_index_rcu])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [__skb_gso_segment])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [can_checksum_protocol])
+ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [ndo_get_iflink])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netdev_features_t])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [pcpu_sw_netstats])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netdev_rx_handler_register])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [net_device_extended])
OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [rx_handler_func_t.*pskb],
[OVS_DEFINE([HAVE_RX_HANDLER_PSKB])])
+ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [netif_needs_gso.*net_device],
+ [OVS_DEFINE([HAVE_NETIF_NEEDS_GSO_NETDEV])])
+ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [udp_offload])
+ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [udp_offload.*uoff],
+ [OVS_DEFINE([HAVE_UDP_OFFLOAD_ARG_UOFF])])
+ OVS_GREP_IFELSE([$KSRC/include/linux/netdevice.h], [gro_remcsum])
+
+ OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_hook_state])
+ OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_register_net_hook])
OVS_GREP_IFELSE([$KSRC/include/linux/netfilter.h], [nf_hookfn.*nf_hook_ops],
[OVS_DEFINE([HAVE_NF_HOOKFN_ARG_OPS])])
OVS_GREP_IFELSE([$KSRC/include/linux/random.h], [prandom_u32])
+ OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [get_link_net])
+ OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [name_assign_type])
+ OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [rtnl_create_link.*src_net],
+ [OVS_DEFINE([HAVE_RTNL_CREATE_LINK_SRC_NET])])
+ OVS_GREP_IFELSE([$KSRC/include/net/net_namespace.h], [possible_net_t])
+
OVS_GREP_IFELSE([$KSRC/include/linux/rcupdate.h], [rcu_read_lock_held], [],
[OVS_GREP_IFELSE([$KSRC/include/linux/rtnetlink.h],
[rcu_read_lock_held])])
@@ -364,7 +397,12 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [[[^@]]proto_data_valid],
[OVS_DEFINE([HAVE_PROTO_DATA_VALID])])
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_checksum_start_offset])
+ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_protocol])
+ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_mac_header])
+ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_network_header])
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [kfree_skb_list])
+ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_scrub_packet.*xnet],
+ [OVS_DEFINE([HAVE_SKB_SCRUB_PACKET_XNET])])
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [rxhash])
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [u16.*rxhash],
[OVS_DEFINE([HAVE_U16_RXHASH])])
@@ -423,7 +461,9 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
OVS_GREP_IFELSE([$KSRC/include/net/geneve.h], [geneve_hdr])
OVS_GREP_IFELSE([$KSRC/include/net/gre.h], [gre_cisco_register])
+ OVS_GREP_IFELSE([$KSRC/include/net/gre.h], [gre_handle_offloads])
OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [IP6_FH_F_SKIP_RH])
+ OVS_GREP_IFELSE([$KSRC/include/net/ipv6.h], [ip6_local_out_sk])
OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_get_be16])
OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be16])
OVS_GREP_IFELSE([$KSRC/include/net/netlink.h], [nla_put_be32])
@@ -438,7 +478,7 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
[OVS_DEFINE([HAVE_VLAN_BUG_WORKAROUND])])
OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_insert_tag_set_proto])
OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [__vlan_insert_tag])
-
+ OVS_GREP_IFELSE([$KSRC/include/linux/if_vlan.h], [vlan_get_protocol])
OVS_GREP_IFELSE([$KSRC/include/linux/u64_stats_sync.h], [u64_stats_fetch_begin_irq])
@@ -446,17 +486,18 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
[OVS_DEFINE([HAVE_RHEL_OVS_HOOK])])
OVS_GREP_IFELSE([$KSRC/include/net/vxlan.h], [struct vxlan_metadata],
[OVS_DEFINE([HAVE_VXLAN_METADATA])])
+ OVS_GREP_IFELSE([$KSRC/include/net/vxlan.h], [VXLAN_HF_RCO])
OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_flow_src_port],
[OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [inet_get_local_port_range(net],
[OVS_DEFINE([HAVE_UDP_FLOW_SRC_PORT])])])
OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_v4_check])
OVS_GREP_IFELSE([$KSRC/include/net/udp.h], [udp_set_csum])
- OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [ignore_df:1],
+ OVS_GREP_IFELSE([$KSRC/include/net/udp_tunnel.h], [udp_tunnel_gro_complete])
+ OVS_GREP_IFELSE([$KSRC/include/net/udp_tunnel.h], [ipv6_v6only],
+ [OVS_DEFINE([HAVE_UDP_TUNNEL_IPV6])])
+
+ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [ignore_df],
[OVS_DEFINE([HAVE_IGNORE_DF_RENAME])])
- OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [SKB_GSO_GRE_CSUM],
- [OVS_DEFINE([HAVE_SKB_GSO_GRE_CSUM])])
- OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [SKB_GSO_UDP_TUNNEL_CSUM],
- [OVS_DEFINE([HAVE_SKB_GSO_UDP_TUNNEL_CSUM])])
OVS_GREP_IFELSE([$KSRC/include/uapi/linux/netdevice.h], [NET_NAME_UNKNOWN],
[OVS_DEFINE([HAVE_NET_NAME_UNKNOWN])])
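Reviewer note: each OVS_GREP_IFELSE above greps the kernel tree under $KSRC for a symbol and, by default, defines HAVE_<SYMBOL> in the generated datapath/linux/kcompat.h; the compat headers key off those macros. A sketch of what the generated header might contain on a 4.3 tree (illustrative only, contents vary per kernel):

    /* datapath/linux/kcompat.h -- generated, illustrative only */
    #include <linux/version.h>
    #ifndef RHEL_RELEASE_CODE
    #define RHEL_RELEASE_CODE 0
    #define RHEL_RELEASE_VERSION(a, b) 0
    #endif
    #define HAVE_METADATA_DST 1
    #define HAVE_UDP_OFFLOAD 1
    #define HAVE_UDP_TUNNEL_IPV6 1
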
diff --git a/datapath/Modules.mk b/datapath/Modules.mk
index 8dc3415c6..c06eafc1e 100644
--- a/datapath/Modules.mk
+++ b/datapath/Modules.mk
@@ -42,8 +42,7 @@ openvswitch_headers = \
vlan.h \
vport.h \
vport-internal_dev.h \
- vport-netdev.h \
- vport-vxlan.h
+ vport-netdev.h
openvswitch_extras = \
README.md
diff --git a/datapath/actions.c b/datapath/actions.c
index c529bbb9b..f45f61998 100644
--- a/datapath/actions.c
+++ b/datapath/actions.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -618,12 +618,11 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
else
kfree_skb(skb);
}
-
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
struct sw_flow_key *key, const struct nlattr *attr,
const struct nlattr *actions, int actions_len)
{
- struct ovs_tunnel_info info;
+ struct ip_tunnel_info info;
struct dp_upcall_info upcall;
const struct nlattr *a;
int rem;
@@ -650,11 +649,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb,
if (vport) {
int err;
+ upcall.egress_tun_info = &info;
err = ovs_vport_get_egress_tun_info(vport, skb,
- &info);
- if (!err)
- upcall.egress_tun_info = &info;
+ &upcall);
+ if (err)
+ upcall.egress_tun_info = NULL;
}
+
break;
}
@@ -748,7 +749,11 @@ static int execute_set_action(struct sk_buff *skb,
{
/* Only tunnel set execution is supported without a mask. */
if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
- OVS_CB(skb)->egress_tun_info = nla_data(a);
+ struct ovs_tunnel_info *tun = nla_data(a);
+
+ ovs_skb_dst_drop(skb);
+ ovs_dst_hold((struct dst_entry *)tun->tun_dst);
+ ovs_skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);
return 0;
}
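Reviewer note: OVS_CB(skb)->egress_tun_info is gone; a set-tunnel action now attaches its prebuilt metadata_dst directly to the skb, taking one dst reference per packet (dropped again by the normal skb_dst_drop() when the skb is freed). Downstream code reads the key back through the dst; a minimal sketch, with do_encap() as a hypothetical consumer:

    struct ip_tunnel_info *info = skb_tunnel_info(skb);

    if (info && (info->mode & IP_TUNNEL_INFO_TX))
        do_encap(skb, &info->key);   /* hypothetical consumer */
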
diff --git a/datapath/compat.h b/datapath/compat.h
index c827b11aa..a30003f8b 100644
--- a/datapath/compat.h
+++ b/datapath/compat.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -43,41 +43,7 @@
#define inet_sport(sk) (inet_sk(sk)->inet_sport)
#endif
-static inline struct rtable *find_route(struct net *net,
- __be32 *saddr, __be32 daddr,
- u8 ipproto, u8 tos, u32 skb_mark)
-{
- struct rtable *rt;
- /* Tunnel configuration keeps DSCP part of TOS bits, But Linux
- * router expect RT_TOS bits only.
- */
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
- struct flowi fl = { .nl_u = { .ip4_u = {
- .daddr = daddr,
- .saddr = *saddr,
- .tos = RT_TOS(tos) } },
- .mark = skb_mark,
- .proto = ipproto };
-
- if (unlikely(ip_route_output_key(net, &rt, &fl)))
- return ERR_PTR(-EADDRNOTAVAIL);
- *saddr = fl.nl_u.ip4_u.saddr;
- return rt;
-#else
- struct flowi4 fl = { .daddr = daddr,
- .saddr = *saddr,
- .flowi4_tos = RT_TOS(tos),
- .flowi4_mark = skb_mark,
- .flowi4_proto = ipproto };
-
- rt = ip_route_output_key(net, &fl);
- *saddr = fl.saddr;
- return rt;
-#endif
-}
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
static inline bool skb_encapsulation(struct sk_buff *skb)
{
return skb->encapsulation;
diff --git a/datapath/datapath.c b/datapath/datapath.c
index 5f362425e..32561a3ce 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -56,6 +56,7 @@
#include "flow.h"
#include "flow_table.h"
#include "flow_netlink.h"
+#include "gso.h"
#include "vlan.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -178,7 +179,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex)
const char *ovs_dp_name(const struct datapath *dp)
{
struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL);
- return vport->ops->get_name(vport);
+ return ovs_vport_name(vport);
}
static int get_dpifindex(const struct datapath *dp)
@@ -190,7 +191,7 @@ static int get_dpifindex(const struct datapath *dp)
local = ovs_vport_rcu(dp, OVSP_LOCAL);
if (local)
- ifindex = netdev_vport_priv(local)->dev->ifindex;
+ ifindex = local->dev->ifindex;
else
ifindex = 0;
@@ -480,10 +481,12 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
nla_len(upcall_info->userdata),
nla_data(upcall_info->userdata));
+
if (upcall_info->egress_tun_info) {
nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY);
err = ovs_nla_put_egress_tunnel_key(user_skb,
- upcall_info->egress_tun_info);
+ upcall_info->egress_tun_info,
+ upcall_info->egress_tun_opts);
BUG_ON(err);
nla_nest_end(user_skb, nla);
}
@@ -590,7 +593,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
goto err_flow_free;
rcu_assign_pointer(flow->sf_acts, acts);
- OVS_CB(packet)->egress_tun_info = NULL;
packet->priority = flow->key.phy.priority;
packet->mark = flow->key.phy.skb_mark;
@@ -607,6 +609,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (!input_vport)
goto err_unlock;
+ packet->dev = input_vport->dev;
OVS_CB(packet)->input_vport = input_vport;
sf_acts = rcu_dereference(flow->sf_acts);
@@ -1028,7 +1031,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info)
}
ovs_unlock();
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
ovs_flow_free(new_flow, false);
}
@@ -1040,7 +1043,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
err_kfree_flow:
ovs_flow_free(new_flow, false);
error:
@@ -1167,7 +1170,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
if (reply)
ovs_notify(&dp_flow_genl_family, &ovs_dp_flow_multicast_group, reply, info);
if (old_acts)
- ovs_nla_free_flow_actions(old_acts);
+ ovs_nla_free_flow_actions_rcu(old_acts);
return 0;
@@ -1175,7 +1178,7 @@ err_unlock_ovs:
ovs_unlock();
kfree_skb(reply);
err_kfree_acts:
- kfree(acts);
+ ovs_nla_free_flow_actions(acts);
error:
return error;
}
@@ -1810,7 +1813,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) ||
nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) ||
- nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)))
+ nla_put_string(skb, OVS_VPORT_ATTR_NAME,
+ ovs_vport_name(vport)))
goto nla_put_failure;
ovs_vport_get_stats(vport, &vport_stats);
@@ -2228,13 +2232,11 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet,
struct vport *vport;
hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (dev_net(netdev_vport->dev) == dnet)
+ if (dev_net(vport->dev) == dnet)
list_add(&vport->detach_list, head);
}
}
diff --git a/datapath/datapath.h b/datapath/datapath.h
index aca9407a4..aefac6d25 100644
--- a/datapath/datapath.h
+++ b/datapath/datapath.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -26,12 +26,12 @@
#include <linux/skbuff.h>
#include <linux/u64_stats_sync.h>
#include <net/net_namespace.h>
+#include <net/ip_tunnels.h>
#include "compat.h"
#include "flow.h"
#include "flow_table.h"
#include "vlan.h"
-#include "vport.h"
#define DP_MAX_PORTS USHRT_MAX
#define DP_VPORT_HASH_BUCKETS 1024
@@ -95,13 +95,10 @@ struct datapath {
/**
* struct ovs_skb_cb - OVS data in skb CB
- * @egress_tun_info: Tunnel information about this packet on egress path.
- * NULL if the packet is not being tunneled.
* @input_vport: The original vport packet came in on. This value is cached
* when a packet is received by OVS.
*/
struct ovs_skb_cb {
- struct ovs_tunnel_info *egress_tun_info;
struct vport *input_vport;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -117,7 +114,8 @@ struct ovs_skb_cb {
* @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY.
*/
struct dp_upcall_info {
- const struct ovs_tunnel_info *egress_tun_info;
+ struct ip_tunnel_info *egress_tun_info;
+ const void *egress_tun_opts;
const struct nlattr *userdata;
const struct nlattr *actions;
int actions_len;
@@ -129,12 +127,10 @@ struct dp_upcall_info {
* struct ovs_net - Per net-namespace data for ovs.
* @dps: List of datapaths to enable dumping them all out.
* Protected by genl_mutex.
- * @vport_net: Per network namespace data for vport.
*/
struct ovs_net {
struct list_head dps;
struct work_struct dp_notify_work;
- struct vport_net vport_net;
};
extern int ovs_net_id;
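Reviewer note: struct ovs_skb_cb shrinks because egress tunnel metadata no longer travels in the skb control block, while dp_upcall_info gains a separate egress_tun_opts pointer since tunnel options now live out-of-line in the metadata dst. Per the actions.c hunk above, output_userspace() fills it roughly like this:

    struct dp_upcall_info upcall;
    struct ip_tunnel_info info;

    upcall.egress_tun_info = &info;
    if (ovs_vport_get_egress_tun_info(vport, skb, &upcall))
        upcall.egress_tun_info = NULL;   /* no egress tunnel metadata */
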
diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c
index f9a037510..9434c19c7 100644
--- a/datapath/dp_notify.c
+++ b/datapath/dp_notify.c
@@ -60,13 +60,10 @@ void ovs_dp_notify_wq(struct work_struct *work)
struct hlist_node *n;
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
- struct netdev_vport *netdev_vport;
-
if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
continue;
- netdev_vport = netdev_vport_priv(vport);
- if (!(ovs_netdev_get_vport(netdev_vport->dev)))
+ if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH))
dp_detach_port_notify(vport);
}
}
diff --git a/datapath/flow.c b/datapath/flow.c
index 8ef60d134..3375d7b4d 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -48,7 +48,7 @@
#include "datapath.h"
#include "flow.h"
#include "flow_netlink.h"
-
+#include "vport.h"
#include "vlan.h"
u64 ovs_flow_used_time(unsigned long flow_jiffies)
@@ -684,19 +684,21 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key)
return key_extract(skb, key);
}
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb, struct sw_flow_key *key)
{
/* Extract metadata from packet. */
if (tun_info) {
- memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key));
+ if (ip_tunnel_info_af(tun_info) != AF_INET)
+ return -EINVAL;
+ memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key));
BUILD_BUG_ON(((1 << (sizeof(tun_info->options_len) * 8)) - 1) >
sizeof(key->tun_opts));
- if (tun_info->options) {
- memcpy(TUN_METADATA_OPTS(key, tun_info->options_len),
- tun_info->options, tun_info->options_len);
+ if (tun_info->options_len) {
+ ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len),
+ tun_info);
key->tun_opts_len = tun_info->options_len;
} else {
key->tun_opts_len = 0;
diff --git a/datapath/flow.h b/datapath/flow.h
index 2433436d8..1abb2e15a 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -32,31 +32,11 @@
#include <linux/time.h>
#include <linux/flex_array.h>
#include <net/inet_ecn.h>
+#include <net/ip_tunnels.h>
+#include <net/dst_metadata.h>
struct sk_buff;
-/* Used to memset ovs_key_ipv4_tunnel padding. */
-#define OVS_TUNNEL_KEY_SIZE \
- (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \
- FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst))
-
-struct ovs_key_ipv4_tunnel {
- __be64 tun_id;
- __be32 ipv4_src;
- __be32 ipv4_dst;
- __be16 tun_flags;
- u8 ipv4_tos;
- u8 ipv4_ttl;
- __be16 tp_src;
- __be16 tp_dst;
-} __packed __aligned(4); /* Minimize padding. */
-
-struct ovs_tunnel_info {
- struct ovs_key_ipv4_tunnel tunnel;
- const void *options;
- u8 options_len;
-};
-
/* Store options at the end of the array if they are less than the
* maximum size. This allows us to get the benefits of variable length
* matching for small options.
@@ -66,54 +46,9 @@ struct ovs_tunnel_info {
#define TUN_METADATA_OPTS(flow_key, opt_len) \
((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len)))
-static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- __be32 saddr, __be32 daddr,
- u8 tos, u8 ttl,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- tun_info->tunnel.tun_id = tun_id;
- tun_info->tunnel.ipv4_src = saddr;
- tun_info->tunnel.ipv4_dst = daddr;
- tun_info->tunnel.ipv4_tos = tos;
- tun_info->tunnel.ipv4_ttl = ttl;
- tun_info->tunnel.tun_flags = tun_flags;
-
- /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of
- * the upper tunnel are used.
- * E.g: GRE over IPSEC, the tp_src and tp_port are zero.
- */
- tun_info->tunnel.tp_src = tp_src;
- tun_info->tunnel.tp_dst = tp_dst;
-
- /* Clear struct padding. */
- if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE)
- memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE,
- 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE);
-
- tun_info->options = opts;
- tun_info->options_len = opts_len;
-}
-
-static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
- const struct iphdr *iph,
- __be16 tp_src,
- __be16 tp_dst,
- __be64 tun_id,
- __be16 tun_flags,
- const void *opts,
- u8 opts_len)
-{
- __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr,
- iph->tos, iph->ttl,
- tp_src, tp_dst,
- tun_id, tun_flags,
- opts, opts_len);
-}
+struct ovs_tunnel_info {
+ struct metadata_dst *tun_dst;
+};
#define OVS_SW_FLOW_KEY_METADATA_SIZE \
(offsetof(struct sw_flow_key, recirc_id) + \
@@ -122,7 +57,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info,
struct sw_flow_key {
u8 tun_opts[255];
u8 tun_opts_len;
- struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
+ struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
u32 skb_mark; /* SKB mark. */
@@ -273,7 +208,7 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies);
/* Update the non-metadata part of the flow key using skb. */
int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key);
-int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info,
+int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info,
struct sk_buff *skb,
struct sw_flow_key *key);
/* Extract key from packet coming from userspace. */
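Reviewer note: sw_flow_key now embeds the kernel's struct ip_tunnel_key instead of the private ovs_key_ipv4_tunnel, so the IPv4 address fields move under a union (leaving room for IPv6). The renames throughout flow_netlink.c below follow this mapping:

    /* struct ovs_key_ipv4_tunnel  ->  struct ip_tunnel_key */
    tun_key.ipv4_src  ->  tun_key.u.ipv4.src
    tun_key.ipv4_dst  ->  tun_key.u.ipv4.dst
    tun_key.ipv4_tos  ->  tun_key.tos
    tun_key.ipv4_ttl  ->  tun_key.ttl
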
diff --git a/datapath/flow_netlink.c b/datapath/flow_netlink.c
index 3a3492beb..f95aa1436 100644
--- a/datapath/flow_netlink.c
+++ b/datapath/flow_netlink.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2014 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -45,11 +45,12 @@
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/mpls.h>
+#include <net/vxlan.h>
#include "datapath.h"
#include "flow.h"
#include "flow_netlink.h"
-#include "vport-vxlan.h"
+#include "gso.h"
struct ovs_len_tbl {
int len;
@@ -485,7 +486,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr,
struct nlattr *a;
int rem;
unsigned long opt_key_offset;
- struct ovs_vxlan_opts opts;
+ struct vxlan_metadata opts;
BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts));
@@ -568,19 +569,19 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
tun_flags |= TUNNEL_KEY;
break;
case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_src,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src,
nla_get_in_addr(a), is_mask);
break;
case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst,
+ SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst,
nla_get_in_addr(a), is_mask);
break;
case OVS_TUNNEL_KEY_ATTR_TOS:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos,
+ SW_FLOW_KEY_PUT(match, tun_key.tos,
nla_get_u8(a), is_mask);
break;
case OVS_TUNNEL_KEY_ATTR_TTL:
- SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl,
+ SW_FLOW_KEY_PUT(match, tun_key.ttl,
nla_get_u8(a), is_mask);
ttl = true;
break;
@@ -643,7 +644,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
}
if (!is_mask) {
- if (!match->key->tun_key.ipv4_dst) {
+ if (!match->key->tun_key.u.ipv4.dst) {
OVS_NLERR(log, "IPv4 tunnel dst address is zero");
return -EINVAL;
}
@@ -660,7 +661,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr,
static int vxlan_opt_to_nlattr(struct sk_buff *skb,
const void *tun_opts, int swkey_tun_opts_len)
{
- const struct ovs_vxlan_opts *opts = tun_opts;
+ const struct vxlan_metadata *opts = tun_opts;
struct nlattr *nla;
nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS);
@@ -675,22 +676,24 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb,
}
static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
+ const struct ip_tunnel_key *output,
const void *tun_opts, int swkey_tun_opts_len)
{
if (output->tun_flags & TUNNEL_KEY &&
nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id))
return -EMSGSIZE;
- if (output->ipv4_src &&
- nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, output->ipv4_src))
+ if (output->u.ipv4.src &&
+ nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC,
+ output->u.ipv4.src))
return -EMSGSIZE;
- if (output->ipv4_dst &&
- nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->ipv4_dst))
+ if (output->u.ipv4.dst &&
+ nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST,
+ output->u.ipv4.dst))
return -EMSGSIZE;
- if (output->ipv4_tos &&
- nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos))
+ if (output->tos &&
+ nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos))
return -EMSGSIZE;
- if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl))
+ if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl))
return -EMSGSIZE;
if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
@@ -712,8 +715,8 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS,
swkey_tun_opts_len, tun_opts))
return -EMSGSIZE;
- else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
- vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
+ else if (output->tun_flags & TUNNEL_VXLAN_OPT &&
+ vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len))
return -EMSGSIZE;
}
@@ -721,7 +724,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb,
}
static int ipv4_tun_to_nlattr(struct sk_buff *skb,
- const struct ovs_key_ipv4_tunnel *output,
+ const struct ip_tunnel_key *output,
const void *tun_opts, int swkey_tun_opts_len)
{
struct nlattr *nla;
@@ -740,10 +743,11 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb,
}
int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
- const struct ovs_tunnel_info *egress_tun_info)
+ const struct ip_tunnel_info *egress_tun_info,
+ const void *egress_tun_opts)
{
- return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel,
- egress_tun_info->options,
+ return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key,
+ egress_tun_opts,
egress_tun_info->options_len);
}
@@ -860,7 +864,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask);
}
- if (attrs & (1ULL << OVS_KEY_ATTR_IPV4)) {
+ if (attrs & (1 << OVS_KEY_ATTR_IPV4)) {
const struct ovs_key_ipv4 *ipv4_key;
ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]);
@@ -881,7 +885,7 @@ static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs,
ipv4_key->ipv4_src, is_mask);
SW_FLOW_KEY_PUT(match, ipv4.addr.dst,
ipv4_key->ipv4_dst, is_mask);
- attrs &= ~(1ULL << OVS_KEY_ATTR_IPV4);
+ attrs &= ~(1 << OVS_KEY_ATTR_IPV4);
}
if (attrs & (1ULL << OVS_KEY_ATTR_IPV6)) {
@@ -1152,7 +1156,7 @@ int ovs_nla_get_match(struct sw_flow_match *match,
/* The userspace does not send tunnel attributes that
* are 0, but we should not wildcard them nonetheless.
*/
- if (match->key->tun_key.ipv4_dst)
+ if (match->key->tun_key.u.ipv4.dst)
SW_FLOW_KEY_MEMSET_FIELD(match, tun_key,
0xff, true);
@@ -1324,7 +1328,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey,
if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority))
goto nla_put_failure;
- if ((swkey->tun_key.ipv4_dst || is_mask)) {
+ if ((swkey->tun_key.u.ipv4.dst || is_mask)) {
const void *opts = NULL;
if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT)
@@ -1585,20 +1589,49 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log)
return sfa;
}
-/* RCU callback used by ovs_nla_free_flow_actions. */
-static void rcu_free_acts_callback(struct rcu_head *rcu)
+static void ovs_nla_free_set_action(const struct nlattr *a)
{
- struct sw_flow_actions *sf_acts = container_of(rcu,
- struct sw_flow_actions, rcu);
+ const struct nlattr *ovs_key = nla_data(a);
+ struct ovs_tunnel_info *ovs_tun;
+
+ switch (nla_type(ovs_key)) {
+ case OVS_KEY_ATTR_TUNNEL_INFO:
+ ovs_tun = nla_data(ovs_key);
+ ovs_dst_release((struct dst_entry *)ovs_tun->tun_dst);
+ break;
+ }
+}
+
+void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+{
+ const struct nlattr *a;
+ int rem;
+
+ if (!sf_acts)
+ return;
+
+ nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) {
+ switch (nla_type(a)) {
+ case OVS_ACTION_ATTR_SET:
+ ovs_nla_free_set_action(a);
+ break;
+ }
+ }
+
kfree(sf_acts);
}
+static void __ovs_nla_free_flow_actions(struct rcu_head *head)
+{
+ ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu));
+}
+
/* Schedules 'sf_acts' to be freed after the next RCU grace period.
* The caller must hold rcu_read_lock for this to be sensible.
*/
-void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts)
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts)
{
- call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
+ call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions);
}
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
@@ -1794,10 +1827,11 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
{
struct sw_flow_match match;
struct sw_flow_key key;
- struct ovs_tunnel_info *tun_info;
+ struct metadata_dst *tun_dst;
+ struct ip_tunnel_info *tun_info;
+ struct ovs_tunnel_info *ovs_tun;
struct nlattr *a;
- int start, opts_type;
- int err = 0;
+ int err = 0, start, opts_type;
ovs_match_init(&match, &key, NULL);
opts_type = ipv4_tun_from_nlattr(nla_data(attr), &match, false, log);
@@ -1820,27 +1854,31 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (start < 0)
return start;
+ tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL);
+ if (!tun_dst)
+ return -ENOMEM;
+
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
- sizeof(*tun_info) + key.tun_opts_len, log);
- if (IS_ERR(a))
+ sizeof(*ovs_tun), log);
+ if (IS_ERR(a)) {
+ ovs_dst_release((struct dst_entry *)tun_dst);
return PTR_ERR(a);
+ }
- tun_info = nla_data(a);
- tun_info->tunnel = key.tun_key;
- tun_info->options_len = key.tun_opts_len;
+ ovs_tun = nla_data(a);
+ ovs_tun->tun_dst = tun_dst;
- if (tun_info->options_len) {
- /* We need to store the options in the action itself since
- * everything else will go away after flow setup. We can append
- * it to tun_info and then point there.
- */
- memcpy((tun_info + 1),
- TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len);
- tun_info->options = (tun_info + 1);
- } else {
- tun_info->options = NULL;
- }
+ tun_info = &tun_dst->u.tun_info;
+ tun_info->mode = IP_TUNNEL_INFO_TX;
+ tun_info->key = key.tun_key;
+ /* We need to store the options in the action itself since
+ * everything else will go away after flow setup. We can append
+ * it to tun_info and then point there.
+ */
+ ip_tunnel_info_opts_set(tun_info,
+ TUN_METADATA_OPTS(&key, key.tun_opts_len),
+ key.tun_opts_len);
add_nested_action_end(*sfa, start);
return err;
@@ -2225,7 +2263,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr,
err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type,
key->eth.tci, log);
if (err)
- kfree(*sfa);
+ ovs_nla_free_flow_actions(*sfa);
return err;
}
@@ -2275,15 +2313,16 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
switch (key_type) {
case OVS_KEY_ATTR_TUNNEL_INFO: {
- struct ovs_tunnel_info *tun_info = nla_data(ovs_key);
+ struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key);
+ struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info;
start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
if (!start)
return -EMSGSIZE;
- err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel,
+ err = ipv4_tun_to_nlattr(skb, &tun_info->key,
tun_info->options_len ?
- tun_info->options : NULL,
+ ip_tunnel_info_opts(tun_info) : NULL,
tun_info->options_len);
if (err)
return err;
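Reviewer note: validate_and_copy_set_tun() now allocates one metadata_dst per set-tunnel action and stores only the pointer (struct ovs_tunnel_info) in the action list, so the action itself owns a dst reference. The refcounting across this patch, as I read it:

    /* flow setup   */ tun_dst = metadata_dst_alloc(opts_len, GFP_KERNEL);
    /* per packet   */ ovs_dst_hold(&tun_dst->dst);    /* execute_set_action()      */
    /* packet freed */ skb_dst_drop(skb);              /* drops the per-packet ref  */
    /* flow freed   */ ovs_dst_release(&tun_dst->dst); /* ovs_nla_free_set_action() */
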
diff --git a/datapath/flow_netlink.h b/datapath/flow_netlink.h
index 5c3d75bff..140bbe707 100644
--- a/datapath/flow_netlink.h
+++ b/datapath/flow_netlink.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2013 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -54,8 +54,9 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb);
int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key,
const struct nlattr *mask, bool log);
-int ovs_nla_put_egress_tunnel_key(struct sk_buff *,
- const struct ovs_tunnel_info *);
+int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb,
+ const struct ip_tunnel_info *egress_tun_info,
+ const void *egress_tun_opts);
bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log);
int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid,
@@ -69,5 +70,6 @@ int ovs_nla_put_actions(const struct nlattr *attr,
int len, struct sk_buff *skb);
void ovs_nla_free_flow_actions(struct sw_flow_actions *);
+void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *);
#endif /* flow_netlink.h */
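Reviewer note: there are now two ways to free an action list, and picking the wrong one either leaks the tunnel dst references or frees under RCU readers. The datapath.c hunks earlier use them like so:

    ovs_nla_free_flow_actions(acts);      /* error paths: no reader can
                                           * have seen 'acts' yet */
    ovs_nla_free_flow_actions_rcu(acts);  /* replaced actions already
                                           * published; defer to RCU */
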
diff --git a/datapath/flow_table.c b/datapath/flow_table.c
index eeadf8600..b51be69e8 100644
--- a/datapath/flow_table.c
+++ b/datapath/flow_table.c
@@ -45,6 +45,7 @@
#include <net/ndisc.h>
#include "vlan.h"
+#include "flow_netlink.h"
#define TBL_MIN_BUCKETS 1024
#define MASK_ARRAY_SIZE_MIN 16
@@ -151,7 +152,8 @@ static void flow_free(struct sw_flow *flow)
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
- kfree(rcu_dereference_raw(flow->sf_acts));
+ if (flow->sf_acts)
+ ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
for_each_node(node)
if (flow->stats[node])
kmem_cache_free(flow_stats_cache,
@@ -505,7 +507,7 @@ static u32 flow_hash(const struct sw_flow_key *key,
static int flow_key_start(const struct sw_flow_key *key)
{
- if (key->tun_key.ipv4_dst)
+ if (key->tun_key.u.ipv4.dst)
return 0;
else
return rounddown(offsetof(struct sw_flow_key, phy),
diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk
index 96c3d55d7..7e66e14a5 100644
--- a/datapath/linux/Modules.mk
+++ b/datapath/linux/Modules.mk
@@ -7,7 +7,10 @@ openvswitch_sources += \
linux/compat/gre.c \
linux/compat/gso.c \
linux/compat/genetlink-openvswitch.c \
+ linux/compat/ip_gre.c \
+ linux/compat/ip_tunnel.c \
linux/compat/ip_tunnels_core.c \
+ linux/compat/lisp.c \
linux/compat/netdevice.c \
linux/compat/net_namespace.c \
linux/compat/reciprocal_div.c \
@@ -33,6 +36,7 @@ openvswitch_headers += \
linux/compat/include/linux/if.h \
linux/compat/include/linux/if_arp.h \
linux/compat/include/linux/if_ether.h \
+ linux/compat/include/linux/if_link.h \
linux/compat/include/linux/if_vlan.h \
linux/compat/include/linux/in.h \
linux/compat/include/linux/ip.h \
@@ -40,6 +44,7 @@ openvswitch_headers += \
linux/compat/include/linux/jiffies.h \
linux/compat/include/linux/kconfig.h \
linux/compat/include/linux/kernel.h \
+ linux/compat/include/net/lisp.h \
linux/compat/include/linux/list.h \
linux/compat/include/linux/mpls.h \
linux/compat/include/linux/net.h \
@@ -63,17 +68,23 @@ openvswitch_headers += \
linux/compat/include/linux/workqueue.h \
linux/compat/include/net/checksum.h \
linux/compat/include/net/dst.h \
+ linux/compat/include/net/dst_metadata.h \
linux/compat/include/net/flow_keys.h \
linux/compat/include/net/genetlink.h \
linux/compat/include/net/geneve.h \
linux/compat/include/net/gre.h \
+ linux/compat/include/net/inet_ecn.h \
linux/compat/include/net/inet_frag.h \
linux/compat/include/net/ip.h \
linux/compat/include/net/ip_tunnels.h \
+ linux/compat/include/net/ip6_route.h \
+ linux/compat/include/net/ip6_tunnel.h \
linux/compat/include/net/ipv6.h \
linux/compat/include/net/mpls.h \
linux/compat/include/net/net_namespace.h \
linux/compat/include/net/netlink.h \
+ linux/compat/include/net/route.h \
+ linux/compat/include/net/rtnetlink.h \
linux/compat/include/net/udp.h \
linux/compat/include/net/udp_tunnel.h \
linux/compat/include/net/sock.h \
diff --git a/datapath/linux/compat/dev-openvswitch.c b/datapath/linux/compat/dev-openvswitch.c
index 38ec8fe9e..d7d4224a1 100644
--- a/datapath/linux/compat/dev-openvswitch.c
+++ b/datapath/linux/compat/dev-openvswitch.c
@@ -1,6 +1,7 @@
#include <linux/if_bridge.h>
#include <linux/netdevice.h>
#include <linux/version.h>
+#include <net/rtnetlink.h>
#ifndef HAVE_DEV_DISABLE_LRO
@@ -93,3 +94,24 @@ void rpl_netdev_rx_handler_unregister(struct net_device *dev)
EXPORT_SYMBOL_GPL(rpl_netdev_rx_handler_unregister);
#endif
+
+int rpl_rtnl_delete_link(struct net_device *dev)
+{
+ const struct rtnl_link_ops *ops;
+
+ ops = dev->rtnl_link_ops;
+ if (!ops || !ops->dellink)
+ return -EOPNOTSUPP;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+ ops->dellink(dev);
+#else
+ {
+ LIST_HEAD(list_kill);
+
+ ops->dellink(dev, &list_kill);
+ unregister_netdevice_many(&list_kill);
+ }
+#endif
+ return 0;
+}
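Reviewer note: rpl_rtnl_delete_link() papers over the 2.6.34 API change that gave rtnl_link_ops->dellink() a list argument for batched unregistration. A hedged sketch of the intended use (the caller shown is illustrative):

    rtnl_lock();
    err = rpl_rtnl_delete_link(netdev);   /* -EOPNOTSUPP if the device
                                           * has no rtnl_link_ops->dellink */
    rtnl_unlock();
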
diff --git a/datapath/linux/compat/geneve.c b/datapath/linux/compat/geneve.c
index 85cf95f4c..297593ce6 100644
--- a/datapath/linux/compat/geneve.c
+++ b/datapath/linux/compat/geneve.c
@@ -1,122 +1,235 @@
/*
- * Geneve: Generic Network Virtualization Encapsulation
+ * GENEVE: Generic Network Virtualization Encapsulation
*
- * Copyright (c) 2014 Nicira, Inc.
+ * Copyright (c) 2015 Red Hat, Inc.
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/version.h>
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0)
-
#include <linux/kernel.h>
-#include <linux/types.h>
#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/skbuff.h>
-#include <linux/list.h>
#include <linux/netdevice.h>
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/udp.h>
-#include <linux/igmp.h>
#include <linux/etherdevice.h>
-#include <linux/if_ether.h>
-#include <linux/if_vlan.h>
-#include <linux/ethtool.h>
-#include <linux/mutex.h>
-#include <net/arp.h>
-#include <net/ndisc.h>
-#include <net/ip.h>
-#include <net/ip_tunnels.h>
-#include <net/icmp.h>
-#include <net/udp.h>
-#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
+#include <linux/hash.h>
+#include <linux/if_link.h>
+
+#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
#include <net/geneve.h>
#include <net/protocol.h>
-#include <net/udp_tunnel.h>
-#if IS_ENABLED(CONFIG_IPV6)
-#include <net/ipv6.h>
-#include <net/addrconf.h>
-#include <net/ip6_tunnel.h>
-#include <net/ip6_checksum.h>
-#endif
-#include "compat.h"
#include "gso.h"
+#include "vport-netdev.h"
+#include "compat.h"
+
+#ifndef HAVE_METADATA_DST
+#define GENEVE_NETDEV_VER "0.6"
+
+#define GENEVE_UDP_PORT 6081
+
+#define GENEVE_N_VID (1u << 24)
+#define GENEVE_VID_MASK (GENEVE_N_VID - 1)
-static void geneve_build_header(struct genevehdr *geneveh,
- __be16 tun_flags, u8 vni[3],
- u8 options_len, u8 *options)
+#define VNI_HASH_BITS 10
+#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
+
+#define GENEVE_VER 0
+#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
+
+/* per-network namespace private data for this module */
+struct geneve_net {
+ struct list_head geneve_list;
+ struct list_head sock_list;
+};
+
+static int geneve_net_id;
+
+/* Pseudo network device */
+struct geneve_dev {
+ struct hlist_node hlist; /* vni hash table */
+ struct net *net; /* netns for packet i/o */
+ struct net_device *dev; /* netdev for geneve tunnel */
+ struct geneve_sock *sock; /* socket used for geneve tunnel */
+ u8 vni[3]; /* virtual network ID for tunnel */
+ u8 ttl; /* TTL override */
+ u8 tos; /* TOS override */
+ struct sockaddr_in remote; /* IPv4 address for link partner */
+ struct list_head next; /* geneve's per namespace list */
+ __be16 dst_port;
+ bool collect_md;
+};
+
+struct geneve_sock {
+ bool collect_md;
+ struct list_head list;
+ struct socket *sock;
+ struct rcu_head rcu;
+ int refcnt;
+#ifdef HAVE_UDP_OFFLOAD
+ struct udp_offload udp_offloads;
+#endif
+ struct hlist_head vni_list[VNI_HASH_SIZE];
+};
+
+static inline __u32 geneve_net_vni_hash(u8 vni[3])
{
- geneveh->ver = GENEVE_VER;
- geneveh->opt_len = options_len / 4;
- geneveh->oam = !!(tun_flags & TUNNEL_OAM);
- geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
- geneveh->rsvd1 = 0;
- memcpy(geneveh->vni, vni, 3);
- geneveh->proto_type = htons(ETH_P_TEB);
- geneveh->rsvd2 = 0;
+ __u32 vnid;
- memcpy(geneveh->options, options, options_len);
+ vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2];
+ return hash_32(vnid, VNI_HASH_BITS);
}
-/* Transmit a fully formatted Geneve frame.
- *
- * When calling this function. The skb->data should point
- * to the geneve header which is fully formed.
- *
- * This function will add other UDP tunnel headers.
- */
-int rpl_geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
- struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- bool csum, bool xnet)
+static __be64 vni_to_tunnel_id(const __u8 *vni)
{
- struct genevehdr *gnvh;
- int min_headroom;
+#ifdef __BIG_ENDIAN
+ return (vni[0] << 16) | (vni[1] << 8) | vni[2];
+#else
+ return (__force __be64)(((__force u64)vni[0] << 40) |
+ ((__force u64)vni[1] << 48) |
+ ((__force u64)vni[2] << 56));
+#endif
+}
+
+static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
+ __be32 addr, u8 vni[])
+{
+ struct hlist_head *vni_list_head;
+ struct geneve_dev *geneve;
+ __u32 hash;
+
+ /* Find the device for this VNI */
+ hash = geneve_net_vni_hash(vni);
+ vni_list_head = &gs->vni_list[hash];
+ hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
+ if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
+ addr == geneve->remote.sin_addr.s_addr)
+ return geneve;
+ }
+ return NULL;
+}
+
+static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
+{
+ return (struct genevehdr *)(udp_hdr(skb) + 1);
+}
+
+/* geneve receive/decap routine */
+static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+{
+ struct genevehdr *gnvh = geneve_hdr(skb);
+ struct metadata_dst *tun_dst;
+ struct geneve_dev *geneve = NULL;
+#ifdef HAVE_DEV_TSTATS
+ struct pcpu_sw_netstats *stats;
+#endif
+ struct iphdr *iph;
+ u8 *vni;
+ __be32 addr;
int err;
+ union {
+ struct metadata_dst dst;
+ char buf[sizeof(struct metadata_dst) + 256];
+ } buf;
- min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
- + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
- + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+ iph = ip_hdr(skb); /* outer IP header... */
- err = skb_cow_head(skb, min_headroom);
- if (unlikely(err)) {
- kfree_skb(skb);
- return err;
+ if (gs->collect_md) {
+ static u8 zero_vni[3];
+
+ vni = zero_vni;
+ addr = 0;
+ } else {
+ vni = gnvh->vni;
+ addr = iph->saddr;
}
- skb = vlan_hwaccel_push_inside(skb);
- if (unlikely(!skb))
- return -ENOMEM;
+ geneve = geneve_lookup(gs, addr, vni);
+ if (!geneve)
+ goto drop;
- skb = udp_tunnel_handle_offloads(skb, csum, (opt_len == 0));
- if (IS_ERR(skb))
- return PTR_ERR(skb);
+ if (ip_tunnel_collect_metadata() || gs->collect_md) {
+ __be16 flags;
- gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
- geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
+ flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
+ (gnvh->oam ? TUNNEL_OAM : 0) |
+ (gnvh->critical ? TUNNEL_CRIT_OPT : 0);
- ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ tun_dst = &buf.dst;
+ ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, AF_INET, flags,
+ vni_to_tunnel_id(gnvh->vni), gnvh->opt_len * 4);
+ /* Update tunnel dst according to Geneve options. */
+ ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
+ gnvh->options, gnvh->opt_len * 4);
+ } else {
+ /* Drop packets w/ critical options,
+ * since we don't support any...
+ */
+ tun_dst = NULL;
+ if (gnvh->critical)
+ goto drop;
+ }
+
+ skb_reset_mac_header(skb);
+ skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
+ skb->protocol = eth_type_trans(skb, geneve->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+ if (tun_dst)
+ ovs_skb_dst_set(skb, &tun_dst->dst);
+ else
+ goto drop;
+ /* Ignore packet loops (and multicast echo) */
+ if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
+ goto drop;
+
+ skb_reset_network_header(skb);
+
+ err = IP_ECN_decapsulate(iph, skb);
+
+ if (unlikely(err)) {
+ if (err > 1) {
+ ++geneve->dev->stats.rx_frame_errors;
+ ++geneve->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
- return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
- tos, ttl, df, src_port, dst_port, xnet,
- !csum);
+#ifdef HAVE_DEV_TSTATS
+ stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)geneve->dev->tstats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+#endif
+ netdev_port_receive(skb, &tun_dst->u.tun_info);
+ return;
+drop:
+ /* Consume bad packet */
+ kfree_skb(skb);
+}
+
+#ifdef HAVE_DEV_TSTATS
+/* Setup stats when device is created */
+static int geneve_init(struct net_device *dev)
+{
+ dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(rpl_geneve_xmit_skb);
+
+static void geneve_uninit(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+}
+#endif
/* Callback from net/ipv4/udp.c to receive packets */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
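Reviewer note: in collect_md mode geneve_rx() looks the device up under a zero VNI and zero remote address, then rebuilds the tunnel key into a stack-allocated metadata_dst before handing the packet to netdev_port_receive(). The encoding used by vni_to_tunnel_id() above places the 24-bit VNI in the low-order bytes of a network-byte-order 64-bit tunnel ID; a worked example for VNI 0x123456:

    u8 vni[3] = { 0x12, 0x34, 0x56 };
    __be64 id = vni_to_tunnel_id(vni);
    /* bytes of id in memory: 00 00 00 00 00 12 34 56,
     * so be64_to_cpu(id) == 0x123456 on either endianness. */
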
@@ -131,7 +244,6 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
/* Return packets with reserved bits set */
geneveh = geneve_hdr(skb);
-
if (unlikely(geneveh->ver != GENEVE_VER))
goto error;
@@ -147,7 +259,7 @@ static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (!gs)
goto drop;
- gs->rcv(gs, skb);
+ geneve_rx(gs, skb);
return 0;
drop:
@@ -186,14 +298,135 @@ static struct socket *geneve_create_sock(struct net *net, bool ipv6,
return sock;
}
+#ifdef HAVE_UDP_OFFLOAD
+static void geneve_notify_add_rx_port(struct geneve_sock *gs)
+{
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+ int err;
+
+ if (sa_family == AF_INET) {
+ err = udp_add_offload(&gs->udp_offloads);
+ if (err)
+ pr_warn("geneve: udp_add_offload failed with status %d\n",
+ err);
+ }
+}
+
+static int geneve_hlen(struct genevehdr *gh)
+{
+ return sizeof(*gh) + gh->opt_len * 4;
+}
+
+#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
+static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+#else
+static struct sk_buff **geneve_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+#endif
+{
+ struct sk_buff *p, **pp = NULL;
+ struct genevehdr *gh, *gh2;
+ unsigned int hlen, gh_len, off_gnv;
+ const struct packet_offload *ptype;
+ __be16 type;
+ int flush = 1;
+
+ off_gnv = skb_gro_offset(skb);
+ hlen = off_gnv + sizeof(*gh);
+ gh = skb_gro_header_fast(skb, off_gnv);
+ if (skb_gro_header_hard(skb, hlen)) {
+ gh = skb_gro_header_slow(skb, hlen, off_gnv);
+ if (unlikely(!gh))
+ goto out;
+ }
+
+ if (gh->ver != GENEVE_VER || gh->oam)
+ goto out;
+ gh_len = geneve_hlen(gh);
+
+ hlen = off_gnv + gh_len;
+ if (skb_gro_header_hard(skb, hlen)) {
+ gh = skb_gro_header_slow(skb, hlen, off_gnv);
+ if (unlikely(!gh))
+ goto out;
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ gh2 = (struct genevehdr *)(p->data + off_gnv);
+ if (gh->opt_len != gh2->opt_len ||
+ memcmp(gh, gh2, gh_len)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ type = gh->proto_type;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (!ptype) {
+ flush = 1;
+ goto out_unlock;
+ }
+
+ skb_gro_pull(skb, gh_len);
+ skb_gro_postpull_rcsum(skb, gh, gh_len);
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff)
+#else
+static int geneve_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+#endif
+{
+ struct genevehdr *gh;
+ struct packet_offload *ptype;
+ __be16 type;
+ int gh_len;
+ int err = -ENOSYS;
+
+ udp_tunnel_gro_complete(skb, nhoff);
+
+ gh = (struct genevehdr *)(skb->data + nhoff);
+ gh_len = geneve_hlen(gh);
+ type = gh->proto_type;
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype)
+ err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
+
+ rcu_read_unlock();
+ return err;
+}
+#endif
+
/* Create new listen socket if needed */
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
bool ipv6)
{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
struct geneve_sock *gs;
struct socket *sock;
struct udp_tunnel_sock_cfg tunnel_cfg;
+ int h;
gs = kzalloc(sizeof(*gs), GFP_KERNEL);
if (!gs)
@@ -206,39 +439,671 @@ static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
}
gs->sock = sock;
- gs->rcv = rcv;
- gs->rcv_data = data;
+ gs->refcnt = 1;
+ for (h = 0; h < VNI_HASH_SIZE; ++h)
+ INIT_HLIST_HEAD(&gs->vni_list[h]);
+ /* Initialize the geneve udp offloads structure */
+#ifdef HAVE_UDP_OFFLOAD
+ gs->udp_offloads.port = port;
+ gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive;
+ gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete;
+ geneve_notify_add_rx_port(gs);
+#endif
/* Mark socket as an encapsulation socket */
tunnel_cfg.sk_user_data = gs;
tunnel_cfg.encap_type = 1;
tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
tunnel_cfg.encap_destroy = NULL;
setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
-
+ list_add(&gs->list, &gn->sock_list);
return gs;
}
-struct geneve_sock *rpl_geneve_sock_add(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
- bool no_share, bool ipv6)
+static void geneve_notify_del_rx_port(struct geneve_sock *gs)
{
- return geneve_socket_create(net, port, rcv, data, ipv6);
+#ifdef HAVE_UDP_OFFLOAD
+ struct sock *sk = gs->sock->sk;
+ sa_family_t sa_family = sk->sk_family;
+
+ if (sa_family == AF_INET)
+ udp_del_offload(&gs->udp_offloads);
+#endif
}
-EXPORT_SYMBOL_GPL(rpl_geneve_sock_add);
-static void rcu_free_gs(struct rcu_head *rcu)
+static void free_gs_rcu(struct rcu_head *rcu)
{
struct geneve_sock *gs = container_of(rcu, struct geneve_sock, rcu);
kfree(gs);
}
-void rpl_geneve_sock_release(struct geneve_sock *gs)
+static void geneve_sock_release(struct geneve_sock *gs)
{
+ if (--gs->refcnt)
+ return;
+
+ list_del(&gs->list);
+ geneve_notify_del_rx_port(gs);
udp_tunnel_sock_release(gs->sock);
- call_rcu(&gs->rcu, rcu_free_gs);
+ call_rcu(&gs->rcu, free_gs_rcu);
}
-EXPORT_SYMBOL_GPL(rpl_geneve_sock_release);
-#endif /* kernel < 4.0 */
+static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
+ __be16 dst_port)
+{
+ struct geneve_sock *gs;
+
+ list_for_each_entry(gs, &gn->sock_list, list) {
+ if (inet_sport(gs->sock->sk) == dst_port &&
+ inet_sk(gs->sock->sk)->sk.sk_family == AF_INET) {
+ return gs;
+ }
+ }
+ return NULL;
+}
+
+static int geneve_open(struct net_device *dev)
+{
+ struct geneve_dev *geneve = netdev_priv(dev);
+ struct net *net = geneve->net;
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_sock *gs;
+ __u32 hash;
+
+ gs = geneve_find_sock(gn, geneve->dst_port);
+ if (gs) {
+ gs->refcnt++;
+ goto out;
+ }
+
+ gs = geneve_socket_create(net, geneve->dst_port, false);
+ if (IS_ERR(gs))
+ return PTR_ERR(gs);
+
+out:
+ gs->collect_md = geneve->collect_md;
+ geneve->sock = gs;
+
+ hash = geneve_net_vni_hash(geneve->vni);
+ hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]);
+ return 0;
+}
+
+static int geneve_stop(struct net_device *dev)
+{
+ struct geneve_dev *geneve = netdev_priv(dev);
+ struct geneve_sock *gs = geneve->sock;
+
+ if (!hlist_unhashed(&geneve->hlist))
+ hlist_del_rcu(&geneve->hlist);
+ geneve_sock_release(gs);
+ return 0;
+}
+
+static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
+ __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
+ bool csum)
+{
+ struct genevehdr *gnvh;
+ int min_headroom;
+ int err;
+
+ min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+ + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
+ + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ goto free_rt;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (!skb) {
+ err = -ENOMEM;
+ goto free_rt;
+ }
+
+ skb = udp_tunnel_handle_offloads(skb, csum, 0, false);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ goto free_rt;
+ }
+ gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
+ gnvh->ver = GENEVE_VER;
+ gnvh->opt_len = opt_len / 4;
+ gnvh->oam = !!(tun_flags & TUNNEL_OAM);
+ gnvh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
+ gnvh->rsvd1 = 0;
+ memcpy(gnvh->vni, vni, 3);
+ gnvh->proto_type = htons(ETH_P_TEB);
+ gnvh->rsvd2 = 0;
+ memcpy(gnvh->options, opt, opt_len);
+
+ ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+ return 0;
+
+free_rt:
+ ip_rt_put(rt);
+ return err;
+}
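For reference, the headroom reserved above must cover the entire outer encapsulation. A standalone sketch of the arithmetic (assuming a plain IPv4 outer header with no options and no VLAN tag; GENEVE_BASE_HLEN is the 8-byte UDP header plus the 8-byte Geneve base header):

#include <assert.h>

#define IPV4_HLEN	20
#define UDP_HLEN	8
#define GENEVE_HLEN	8	/* struct genevehdr without options */

static int geneve_overhead(int opt_len)
{
	assert(opt_len % 4 == 0);	/* opt_len travels in 4-byte words */
	return IPV4_HLEN + UDP_HLEN + GENEVE_HLEN + opt_len;
}

int main(void)
{
	assert(geneve_overhead(0) == 36);	/* bare Geneve over IPv4 */
	assert(geneve_overhead(8) == 44);	/* e.g. one 8-byte option */
	return 0;
}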
+
+static struct rtable *geneve_get_rt(struct sk_buff *skb,
+ struct net_device *dev,
+ struct flowi4 *fl4,
+ struct ip_tunnel_info *info)
+{
+ struct geneve_dev *geneve = netdev_priv(dev);
+ struct rtable *rt = NULL;
+ __u8 tos;
+
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_mark = skb->mark;
+ fl4->flowi4_proto = IPPROTO_UDP;
+
+ if (info) {
+ fl4->daddr = info->key.u.ipv4.dst;
+ fl4->saddr = info->key.u.ipv4.src;
+ fl4->flowi4_tos = RT_TOS(info->key.tos);
+ } else {
+ tos = geneve->tos;
+ if (tos == 1) {
+ const struct iphdr *iip = ip_hdr(skb);
+
+ tos = ip_tunnel_get_dsfield(iip, skb);
+ }
+
+ fl4->flowi4_tos = RT_TOS(tos);
+ fl4->daddr = geneve->remote.sin_addr.s_addr;
+ }
+
+ rt = ip_route_output_key(geneve->net, fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
+ dev->stats.tx_carrier_errors++;
+ return rt;
+ }
+ if (rt_dst(rt).dev == dev) { /* is this necessary? */
+ netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
+ dev->stats.collisions++;
+ ip_rt_put(rt);
+ return ERR_PTR(-EINVAL);
+ }
+ return rt;
+}
+
+/* Convert 64 bit tunnel ID to 24 bit VNI. */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+ vni[0] = (__force __u8)(tun_id >> 16);
+ vni[1] = (__force __u8)(tun_id >> 8);
+ vni[2] = (__force __u8)tun_id;
+#else
+ vni[0] = (__force __u8)((__force u64)tun_id >> 40);
+ vni[1] = (__force __u8)((__force u64)tun_id >> 48);
+ vni[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
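A standalone sketch (not part of the patch, and assuming a little-endian host so the second branch above is the one exercised) showing that tunnel ID 10 yields vni[] = {0x00, 0x00, 0x0a}, i.e. the low 24 bits of the big-endian 64-bit ID:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htonl() */

int main(void)
{
	uint64_t tun_id_be;
	uint8_t vni[3];
	uint32_t id = 10;

	/* Build a network-byte-order 64-bit tunnel ID carrying the VNI in
	 * its low 32 bits, the same representation __be64 uses in the
	 * kernel. */
	tun_id_be = (uint64_t)htonl(id) << 32;

	/* Equivalent of the little-endian branch above. */
	vni[0] = (uint8_t)(tun_id_be >> 40);
	vni[1] = (uint8_t)(tun_id_be >> 48);
	vni[2] = (uint8_t)(tun_id_be >> 56);

	printf("%02x %02x %02x\n", vni[0], vni[1], vni[2]); /* 00 00 0a */
	return 0;
}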
+
+netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct geneve_dev *geneve = netdev_priv(dev);
+ struct geneve_sock *gs = geneve->sock;
+ struct ip_tunnel_info *info = NULL;
+ struct rtable *rt = NULL;
+ const struct iphdr *iip; /* interior IP header */
+ struct flowi4 fl4;
+ __u8 tos, ttl;
+ __be16 sport;
+ bool udp_csum;
+ __be16 df;
+ int err;
+
+ if (geneve->collect_md) {
+ info = skb_tunnel_info(skb);
+ if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) {
+ netdev_dbg(dev, "no tunnel metadata\n");
+ goto tx_error;
+ }
+ if (info && ip_tunnel_info_af(info) != AF_INET)
+ goto tx_error;
+ }
+
+ rt = geneve_get_rt(skb, dev, &fl4, info);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+
+ sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
+ skb_reset_mac_header(skb);
+
+ iip = ip_hdr(skb);
+
+ if (info) {
+ const struct ip_tunnel_key *key = &info->key;
+ u8 *opts = NULL;
+ u8 vni[3];
+
+ tunnel_id_to_vni(key->tun_id, vni);
+ if (key->tun_flags & TUNNEL_GENEVE_OPT)
+ opts = ip_tunnel_info_opts(info);
+
+ udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
+ err = geneve_build_skb(rt, skb, key->tun_flags, vni,
+ info->options_len, opts, udp_csum);
+ if (unlikely(err))
+ goto err;
+
+ tos = ip_tunnel_ecn_encap(key->tos, iip, skb);
+ ttl = key->ttl;
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ } else {
+ udp_csum = false;
+ err = geneve_build_skb(rt, skb, 0, geneve->vni,
+ 0, NULL, udp_csum);
+ if (unlikely(err))
+ goto err;
+
+ tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
+ ttl = geneve->ttl;
+ if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
+ ttl = 1;
+ ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt));
+ df = 0;
+ }
+ err = udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, fl4.saddr, fl4.daddr,
+ tos, ttl, df, sport, geneve->dst_port,
+ !net_eq(geneve->net, dev_net(geneve->dev)),
+ !udp_csum);
+
+ iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *) dev->tstats);
+ return NETDEV_TX_OK;
+
+tx_error:
+ dev_kfree_skb(skb);
+err:
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL(rpl_geneve_xmit);
+
+static netdev_tx_t geneve_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	/* Drop all packets coming from the networking stack: OVS_CB is
+	 * not initialized for these packets.
+	 */
+
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops geneve_netdev_ops = {
+#ifdef HAVE_DEV_TSTATS
+ .ndo_init = geneve_init,
+ .ndo_uninit = geneve_uninit,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+#endif
+ .ndo_open = geneve_open,
+ .ndo_stop = geneve_stop,
+ .ndo_start_xmit = geneve_dev_xmit,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = eth_mac_addr,
+};
+
+static void geneve_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version));
+ strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
+}
+
+static const struct ethtool_ops geneve_ethtool_ops = {
+ .get_drvinfo = geneve_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type geneve_type = {
+ .name = "geneve",
+};
+
+/* Initialize the device structure. */
+static void geneve_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &geneve_netdev_ops;
+ dev->ethtool_ops = &geneve_ethtool_ops;
+ dev->destructor = free_netdev;
+
+ SET_NETDEV_DEVTYPE(dev, &geneve_type);
+
+ dev->features |= NETIF_F_LLTX;
+ dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ dev->features |= NETIF_F_RXCSUM;
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+#endif
+#if 0
+ /* Not required */
+ netif_keep_dst(dev);
+#endif
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
+ eth_hw_addr_random(dev);
+}
+
+static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
+ [IFLA_GENEVE_ID] = { .type = NLA_U32 },
+ [IFLA_GENEVE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GENEVE_TTL] = { .type = NLA_U8 },
+ [IFLA_GENEVE_TOS] = { .type = NLA_U8 },
+ [IFLA_GENEVE_PORT] = { .type = NLA_U16 },
+ [IFLA_GENEVE_COLLECT_METADATA] = { .type = NLA_FLAG },
+};
+
+static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ return -EINVAL;
+
+ if (data[IFLA_GENEVE_ID]) {
+ __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]);
+
+ if (vni >= GENEVE_VID_MASK)
+ return -ERANGE;
+ }
+
+ return 0;
+}
+
+static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
+ __be16 dst_port,
+ __be32 rem_addr,
+ u8 vni[],
+ bool *tun_on_same_port,
+ bool *tun_collect_md)
+{
+ struct geneve_dev *geneve, *t;
+
+ *tun_on_same_port = false;
+ *tun_collect_md = false;
+ t = NULL;
+ list_for_each_entry(geneve, &gn->geneve_list, next) {
+ if (geneve->dst_port == dst_port) {
+ *tun_collect_md = geneve->collect_md;
+ *tun_on_same_port = true;
+ }
+ if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) &&
+ rem_addr == geneve->remote.sin_addr.s_addr &&
+ dst_port == geneve->dst_port)
+ t = geneve;
+ }
+ return t;
+}
+
+static int geneve_configure(struct net *net, struct net_device *dev,
+ __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos,
+ __be16 dst_port, bool metadata)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_dev *t, *geneve = netdev_priv(dev);
+ bool tun_collect_md, tun_on_same_port;
+ int err;
+
+ if (metadata) {
+ if (rem_addr || vni || tos || ttl)
+ return -EINVAL;
+ }
+
+ geneve->net = net;
+ geneve->dev = dev;
+
+ geneve->vni[0] = (vni & 0x00ff0000) >> 16;
+ geneve->vni[1] = (vni & 0x0000ff00) >> 8;
+ geneve->vni[2] = vni & 0x000000ff;
+
+ geneve->remote.sin_addr.s_addr = rem_addr;
+ if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr)))
+ return -EINVAL;
+
+ geneve->ttl = ttl;
+ geneve->tos = tos;
+ geneve->dst_port = dst_port;
+ geneve->collect_md = metadata;
+
+ t = geneve_find_dev(gn, dst_port, rem_addr, geneve->vni,
+ &tun_on_same_port, &tun_collect_md);
+ if (t)
+ return -EBUSY;
+
+ if (metadata) {
+ if (tun_on_same_port)
+ return -EPERM;
+ } else {
+ if (tun_collect_md)
+ return -EPERM;
+ }
+
+ err = register_netdevice(dev);
+ if (err)
+ return err;
+
+ list_add(&geneve->next, &gn->geneve_list);
+ return 0;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+static int geneve_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net *net = &init_net;
+#else
+static int geneve_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+#endif
+ __be16 dst_port = htons(GENEVE_UDP_PORT);
+ __u8 ttl = 0, tos = 0;
+ bool metadata = false;
+ __be32 rem_addr;
+ __u32 vni;
+
+ if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
+ return -EINVAL;
+
+ vni = nla_get_u32(data[IFLA_GENEVE_ID]);
+ rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+
+ if (data[IFLA_GENEVE_TTL])
+ ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
+
+ if (data[IFLA_GENEVE_TOS])
+ tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
+
+ if (data[IFLA_GENEVE_PORT])
+ dst_port = nla_get_be16(data[IFLA_GENEVE_PORT]);
+
+ if (data[IFLA_GENEVE_COLLECT_METADATA])
+ metadata = true;
+
+ return geneve_configure(net, dev, rem_addr, vni,
+ ttl, tos, dst_port, metadata);
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+static void geneve_dellink(struct net_device *dev)
+#else
+static void geneve_dellink(struct net_device *dev, struct list_head *head)
+#endif
+{
+ struct geneve_dev *geneve = netdev_priv(dev);
+
+ list_del(&geneve->next);
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t geneve_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */
+ nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */
+ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL */
+ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TOS */
+ nla_total_size(sizeof(__be16)) + /* IFLA_GENEVE_PORT */
+ nla_total_size(0) + /* IFLA_GENEVE_COLLECT_METADATA */
+ 0;
+}
+
+static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct geneve_dev *geneve = netdev_priv(dev);
+ __u32 vni;
+
+ vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2];
+ if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
+ goto nla_put_failure;
+
+ if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
+ geneve->remote.sin_addr.s_addr))
+ goto nla_put_failure;
+
+ if (nla_put_u8(skb, IFLA_GENEVE_TTL, geneve->ttl) ||
+ nla_put_u8(skb, IFLA_GENEVE_TOS, geneve->tos))
+ goto nla_put_failure;
+
+ if (nla_put_be16(skb, IFLA_GENEVE_PORT, geneve->dst_port))
+ goto nla_put_failure;
+
+ if (geneve->collect_md) {
+ if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
+ goto nla_put_failure;
+ }
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static struct rtnl_link_ops geneve_link_ops __read_mostly = {
+ .kind = "ovs_geneve",
+ .maxtype = IFLA_GENEVE_MAX,
+ .policy = geneve_policy,
+ .priv_size = sizeof(struct geneve_dev),
+ .setup = geneve_setup,
+ .validate = geneve_validate,
+ .newlink = geneve_newlink,
+ .dellink = geneve_dellink,
+ .get_size = geneve_get_size,
+ .fill_info = geneve_fill_info,
+};
+
+struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ int err;
+
+ memset(tb, 0, sizeof(tb));
+ dev = rtnl_create_link(net, (char *) name, name_assign_type,
+ &geneve_link_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ err = geneve_configure(net, dev, 0, 0, 0, 0, htons(dst_port), true);
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ return dev;
+}
+EXPORT_SYMBOL_GPL(rpl_geneve_dev_create_fb);
+
+static __net_init int geneve_init_net(struct net *net)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+
+ INIT_LIST_HEAD(&gn->geneve_list);
+ INIT_LIST_HEAD(&gn->sock_list);
+ return 0;
+}
+
+static void __net_exit geneve_exit_net(struct net *net)
+{
+ struct geneve_net *gn = net_generic(net, geneve_net_id);
+ struct geneve_dev *geneve, *next;
+ struct net_device *dev, *aux;
+ LIST_HEAD(list);
+
+ rtnl_lock();
+
+ /* gather any geneve devices that were moved into this ns */
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &geneve_link_ops)
+ unregister_netdevice_queue(dev, &list);
+
+ /* now gather any other geneve devices that were created in this ns */
+ list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) {
+ /* If geneve->dev is in the same netns, it was already added
+ * to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(geneve->dev), net))
+ unregister_netdevice_queue(geneve->dev, &list);
+ }
+
+ /* unregister the devices gathered above */
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations geneve_net_ops = {
+ .init = geneve_init_net,
+ .exit = geneve_exit_net,
+ .id = &geneve_net_id,
+ .size = sizeof(struct geneve_net),
+};
+
+DEFINE_COMPAT_PNET_REG_FUNC(device)
+int rpl_geneve_init_module(void)
+{
+ int rc;
+
+ rc = register_pernet_subsys(&geneve_net_ops);
+ if (rc)
+ goto out1;
+
+ rc = rtnl_link_register(&geneve_link_ops);
+ if (rc)
+ goto out2;
+
+ pr_info("Geneve tunneling driver\n");
+ return 0;
+out2:
+ unregister_pernet_subsys(&geneve_net_ops);
+out1:
+ return rc;
+}
+
+void rpl_geneve_cleanup_module(void)
+{
+ rtnl_link_unregister(&geneve_link_ops);
+ unregister_pernet_subsys(&geneve_net_ops);
+}
+#endif
diff --git a/datapath/linux/compat/gre.c b/datapath/linux/compat/gre.c
index fe8138014..fa8d9368f 100644
--- a/datapath/linux/compat/gre.c
+++ b/datapath/linux/compat/gre.c
@@ -38,9 +38,10 @@
#include "gso.h"
+#ifndef HAVE_METADATA_DST
#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX)
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0)
+#ifndef HAVE_GRE_HANDLE_OFFLOADS
#ifndef HAVE_GRE_CISCO_REGISTER
@@ -147,6 +148,43 @@ static __sum16 check_checksum(struct sk_buff *skb)
return csum;
}
+#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen
+static int ip_gre_calc_hlen(__be16 o_flags)
+{
+ int addend = 4;
+
+ if (o_flags & TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags & TUNNEL_KEY)
+ addend += 4;
+ if (o_flags & TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
+
+#define gre_flags_to_tnl_flags rpl_gre_flags_to_tnl_flags
+static __be16 gre_flags_to_tnl_flags(__be16 flags)
+{
+ __be16 tflags = 0;
+
+ if (flags & GRE_CSUM)
+ tflags |= TUNNEL_CSUM;
+ if (flags & GRE_ROUTING)
+ tflags |= TUNNEL_ROUTING;
+ if (flags & GRE_KEY)
+ tflags |= TUNNEL_KEY;
+ if (flags & GRE_SEQ)
+ tflags |= TUNNEL_SEQ;
+ if (flags & GRE_STRICT)
+ tflags |= TUNNEL_STRICT;
+ if (flags & GRE_REC)
+ tflags |= TUNNEL_REC;
+ if (flags & GRE_VERSION)
+ tflags |= TUNNEL_VERSION;
+
+ return tflags;
+}
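The header length computed above is the 4-byte GRE base header (flags + protocol) plus one 4-byte word for each optional field that is present. A userspace sketch of the same arithmetic (the TUNNEL_* values below are stand-ins; the kernel defines them as __be16 flag bits):

#include <assert.h>
#include <stdint.h>

#define TUNNEL_CSUM	0x01	/* stand-in values, see note above */
#define TUNNEL_KEY	0x04
#define TUNNEL_SEQ	0x08

static int gre_hlen(uint16_t o_flags)
{
	int addend = 4;			/* flags + protocol */

	if (o_flags & TUNNEL_CSUM)
		addend += 4;		/* checksum + reserved */
	if (o_flags & TUNNEL_KEY)
		addend += 4;		/* key */
	if (o_flags & TUNNEL_SEQ)
		addend += 4;		/* sequence number */
	return addend;
}

int main(void)
{
	assert(gre_hlen(0) == 4);
	assert(gre_hlen(TUNNEL_KEY) == 8);
	assert(gre_hlen(TUNNEL_CSUM | TUNNEL_KEY | TUNNEL_SEQ) == 16);
	return 0;
}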
+
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
@@ -269,86 +307,7 @@ int rpl_gre_cisco_unregister(struct gre_cisco_protocol *proto)
EXPORT_SYMBOL_GPL(rpl_gre_cisco_unregister);
#endif /* !HAVE_GRE_CISCO_REGISTER */
-
-/* GRE TX side. */
-static void gre_nop_fix(struct sk_buff *skb) { }
-
-static void gre_csum_fix(struct sk_buff *skb)
-{
- struct gre_base_hdr *greh;
- __be32 *options;
- int gre_offset = skb_transport_offset(skb);
-
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
- options = ((__be32 *)greh + 1);
-
- *options = 0;
- *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset,
- skb->len - gre_offset, 0));
-}
-
-static bool is_gre_gso(struct sk_buff *skb)
-{
- return skb_is_gso(skb);
-}
-
-void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
- int hdr_len)
-{
- struct gre_base_hdr *greh;
-
- __skb_push(skb, hdr_len);
-
- greh = (struct gre_base_hdr *)skb->data;
- greh->flags = tnl_flags_to_gre_flags(tpi->flags);
- greh->protocol = tpi->proto;
-
- if (tpi->flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
- __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
-
- if (tpi->flags & TUNNEL_SEQ) {
- *ptr = tpi->seq;
- ptr--;
- }
- if (tpi->flags & TUNNEL_KEY) {
- *ptr = tpi->key;
- ptr--;
- }
- if (tpi->flags & TUNNEL_CSUM && !is_gre_gso(skb)) {
- *ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
- }
- }
-
- ovs_skb_set_inner_protocol(skb, tpi->proto);
-}
-EXPORT_SYMBOL_GPL(rpl_gre_build_header);
-
-struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
-{
- int type = gre_csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE;
- gso_fix_segment_t fix_segment;
-
- if (gre_csum)
- fix_segment = gre_csum_fix;
- else
- fix_segment = gre_nop_fix;
-
- return ovs_iptunnel_handle_offloads(skb, gre_csum, type, fix_segment);
-}
-#else
-struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
-{
- if (skb_is_gso(skb) && skb_is_encapsulated(skb)) {
- kfree_skb(skb);
- return ERR_PTR(-ENOSYS);
- }
- skb_clear_ovs_gso_cb(skb);
-#undef gre_handle_offloads
- return gre_handle_offloads(skb, gre_csum);
-}
#endif
-EXPORT_SYMBOL_GPL(rpl_gre_handle_offloads);
#endif /* CONFIG_NET_IPGRE_DEMUX */
+#endif /* HAVE_METADATA_DST */
diff --git a/datapath/linux/compat/gso.c b/datapath/linux/compat/gso.c
index 2c19b5890..c52b2b136 100644
--- a/datapath/linux/compat/gso.c
+++ b/datapath/linux/compat/gso.c
@@ -130,7 +130,7 @@ int rpl_dev_queue_xmit(struct sk_buff *skb)
if (mpls)
features &= NETIF_F_SG;
- if (netif_needs_gso(skb->dev, skb, features)) {
+ if (netif_needs_gso(skb, features)) {
struct sk_buff *nskb;
nskb = skb_gso_segment(skb, features);
diff --git a/datapath/linux/compat/gso.h b/datapath/linux/compat/gso.h
index 6fcaff8d6..eb756ebe0 100644
--- a/datapath/linux/compat/gso.h
+++ b/datapath/linux/compat/gso.h
@@ -2,30 +2,36 @@
#define __LINUX_GSO_WRAPPER_H
#include <linux/version.h>
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
-
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <net/protocol.h>
-
#include "datapath.h"
+
typedef void (*gso_fix_segment_t)(struct sk_buff *);
struct ovs_gso_cb {
struct ovs_skb_cb dp_cb;
+#ifndef HAVE_METADATA_DST
+ struct metadata_dst *tun_dst;
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
gso_fix_segment_t fix_segment;
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
+#endif
+#ifndef HAVE_INNER_PROTOCOL
__be16 inner_protocol;
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#ifndef HAVE_INNER_MAC_HEADER
unsigned int inner_mac_header;
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
+#ifndef HAVE_INNER_NETWORK_HEADER
unsigned int inner_network_header;
#endif
};
#define OVS_GSO_CB(skb) ((struct ovs_gso_cb *)(skb)->cb)
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/protocol.h>
+
static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb)
{
OVS_GSO_CB(skb)->fix_segment = NULL;
@@ -37,7 +43,7 @@ static inline void skb_clear_ovs_gso_cb(struct sk_buff *skb)
}
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#ifndef HAVE_INNER_MAC_HEADER
static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb)
{
return skb->head + OVS_GSO_CB(skb)->inner_mac_header;
@@ -48,9 +54,9 @@ static inline void skb_set_inner_mac_header(const struct sk_buff *skb,
{
OVS_GSO_CB(skb)->inner_mac_header = (skb->data - skb->head) + offset;
}
-#endif
+#endif /* HAVE_INNER_MAC_HEADER */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
+#ifndef HAVE_INNER_NETWORK_HEADER
static inline unsigned char *skb_inner_network_header(const struct sk_buff *skb)
{
return skb->head + OVS_GSO_CB(skb)->inner_network_header;
@@ -88,15 +94,17 @@ static inline int ovs_skb_inner_transport_offset(const struct sk_buff *skb)
return skb_inner_transport_header(skb) - skb->data;
}
-#endif
+#endif /* HAVE_INNER_NETWORK_HEADER */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,11,0)
-static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) {
+#ifndef HAVE_INNER_PROTOCOL
+static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb)
+{
OVS_GSO_CB(skb)->inner_protocol = htons(0);
}
static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb,
- __be16 ethertype) {
+ __be16 ethertype)
+{
OVS_GSO_CB(skb)->inner_protocol = ethertype;
}
@@ -107,31 +115,28 @@ static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb)
#else
-static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb) {
+static inline void ovs_skb_init_inner_protocol(struct sk_buff *skb)
+{
/* Nothing to do. The inner_protocol is either zero or
* has been set to a value by another user.
* Either way it may be considered initialised.
*/
}
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
-static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb,
- __be16 ethertype)
+static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb)
{
- skb->inner_protocol = ethertype;
+ return skb->inner_protocol;
}
+
+#ifdef ENCAP_TYPE_ETHER
+#define ovs_skb_set_inner_protocol skb_set_inner_protocol
#else
static inline void ovs_skb_set_inner_protocol(struct sk_buff *skb,
__be16 ethertype)
{
- skb_set_inner_protocol(skb, ethertype);
-}
-#endif
-
-static inline __be16 ovs_skb_get_inner_protocol(struct sk_buff *skb)
-{
- return skb->inner_protocol;
+ skb->inner_protocol = ethertype;
}
+#endif /* ENCAP_TYPE_ETHER */
#endif /* 3.11 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
@@ -153,4 +158,40 @@ static inline void skb_reset_inner_headers(struct sk_buff *skb)
}
#endif /* 3.18 */
+#ifndef HAVE_METADATA_DST
+/* Without metadata_dst we have to manage two different dsts: the
+ * ordinary dst_entry and the tunnel dst.  Define separate ovs_*
+ * helpers for the tun_dst case.
+ */
+static inline void ovs_skb_dst_set(struct sk_buff *skb, void *dst)
+{
+ OVS_GSO_CB(skb)->tun_dst = (void *)dst;
+}
+
+static inline struct ip_tunnel_info *ovs_skb_tunnel_info(struct sk_buff *skb)
+{
+ return &OVS_GSO_CB(skb)->tun_dst->u.tun_info;
+}
+
+static inline void ovs_skb_dst_drop(struct sk_buff *skb)
+{
+ OVS_GSO_CB(skb)->tun_dst = NULL;
+}
+
+static inline void ovs_dst_hold(void *dst)
+{
+}
+
+static inline void ovs_dst_release(struct dst_entry *dst)
+{
+ kfree(dst);
+}
+
+#else
+#define ovs_skb_dst_set skb_dst_set
+#define ovs_skb_dst_drop skb_dst_drop
+#define ovs_dst_hold dst_hold
+#define ovs_dst_release dst_release
+#endif
+
#endif
diff --git a/datapath/linux/compat/include/linux/etherdevice.h b/datapath/linux/compat/include/linux/etherdevice.h
index c9c0a999d..850b7798d 100644
--- a/datapath/linux/compat/include/linux/etherdevice.h
+++ b/datapath/linux/compat/include/linux/etherdevice.h
@@ -64,4 +64,29 @@ static inline bool eth_proto_is_802_3(__be16 proto)
}
#endif
+#define ether_addr_equal rpl_ether_addr_equal
+static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+ u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) |
+ ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4)));
+
+ return fold == 0;
+#else
+ const u16 *a = (const u16 *)addr1;
+ const u16 *b = (const u16 *)addr2;
+
+ return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
+#endif
+}
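A userspace sketch of the 16-bit fold used above: the six MAC bytes are compared as three half-words, and the OR of the XORs is zero iff every half-word matches. The kernel variant casts the buffers directly (callers must guarantee 2-byte alignment); the sketch memcpy()s to stay portable:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static int mac_equal(const uint8_t *a1, const uint8_t *a2)
{
	uint16_t a[3], b[3];

	memcpy(a, a1, 6);	/* avoid unaligned 16-bit loads */
	memcpy(b, a2, 6);
	return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0;
}

int main(void)
{
	uint8_t x[6] = { 0x00, 0x16, 0x3e, 0x00, 0x00, 0x01 };
	uint8_t y[6] = { 0x00, 0x16, 0x3e, 0x00, 0x00, 0x02 };

	assert(mac_equal(x, x));
	assert(!mac_equal(x, y));	/* differ in the last half-word */
	return 0;
}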
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0)
+#define eth_gro_receive rpl_eth_gro_receive
+struct sk_buff **rpl_eth_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb);
+
+#define eth_gro_complete rpl_eth_gro_complete
+int rpl_eth_gro_complete(struct sk_buff *skb, int nhoff);
+#endif
+
#endif
diff --git a/datapath/linux/compat/include/linux/if_link.h b/datapath/linux/compat/include/linux/if_link.h
new file mode 100644
index 000000000..6209dcbad
--- /dev/null
+++ b/datapath/linux/compat/include/linux/if_link.h
@@ -0,0 +1,151 @@
+#ifndef _LINUX_IF_LINK_WRAPPER_H
+#define _LINUX_IF_LINK_WRAPPER_H
+
+#include_next<linux/if_link.h>
+
+/* GENEVE section */
+enum {
+#define IFLA_GENEVE_UNSPEC rpl_IFLA_GENEVE_UNSPEC
+ IFLA_GENEVE_UNSPEC,
+
+#define IFLA_GENEVE_ID rpl_IFLA_GENEVE_ID
+ IFLA_GENEVE_ID,
+
+#define IFLA_GENEVE_REMOTE rpl_IFLA_GENEVE_REMOTE
+ IFLA_GENEVE_REMOTE,
+
+#define IFLA_GENEVE_TTL rpl_IFLA_GENEVE_TTL
+ IFLA_GENEVE_TTL,
+
+#define IFLA_GENEVE_TOS rpl_IFLA_GENEVE_TOS
+ IFLA_GENEVE_TOS,
+
+#define IFLA_GENEVE_PORT rpl_IFLA_GENEVE_PORT
+ IFLA_GENEVE_PORT, /* destination port */
+
+#define IFLA_GENEVE_COLLECT_METADATA rpl_IFLA_GENEVE_COLLECT_METADATA
+ IFLA_GENEVE_COLLECT_METADATA,
+
+#define __IFLA_GENEVE_MAX rpl__IFLA_GENEVE_MAX
+ __IFLA_GENEVE_MAX
+};
+#undef IFLA_GENEVE_MAX
+#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1)
+
+/* STT section */
+enum {
+ IFLA_STT_PORT, /* destination port */
+ __IFLA_STT_MAX
+};
+#define IFLA_STT_MAX (__IFLA_STT_MAX - 1)
+
+/* LISP section */
+enum {
+ IFLA_LISP_PORT, /* destination port */
+ __IFLA_LISP_MAX
+};
+#define IFLA_LISP_MAX (__IFLA_LISP_MAX - 1)
+
+/* VXLAN section */
+enum {
+#define IFLA_VXLAN_UNSPEC rpl_IFLA_VXLAN_UNSPEC
+ IFLA_VXLAN_UNSPEC,
+#define IFLA_VXLAN_ID rpl_IFLA_VXLAN_ID
+ IFLA_VXLAN_ID,
+#define IFLA_VXLAN_GROUP rpl_IFLA_VXLAN_GROUP
+ IFLA_VXLAN_GROUP, /* group or remote address */
+#define IFLA_VXLAN_LINK rpl_IFLA_VXLAN_LINK
+ IFLA_VXLAN_LINK,
+#define IFLA_VXLAN_LOCAL rpl_IFLA_VXLAN_LOCAL
+ IFLA_VXLAN_LOCAL,
+#define IFLA_VXLAN_TTL rpl_IFLA_VXLAN_TTL
+ IFLA_VXLAN_TTL,
+#define IFLA_VXLAN_TOS rpl_IFLA_VXLAN_TOS
+ IFLA_VXLAN_TOS,
+#define IFLA_VXLAN_LEARNING rpl_IFLA_VXLAN_LEARNING
+ IFLA_VXLAN_LEARNING,
+#define IFLA_VXLAN_AGEING rpl_IFLA_VXLAN_AGEING
+ IFLA_VXLAN_AGEING,
+#define IFLA_VXLAN_LIMIT rpl_IFLA_VXLAN_LIMIT
+ IFLA_VXLAN_LIMIT,
+#define IFLA_VXLAN_PORT_RANGE rpl_IFLA_VXLAN_PORT_RANGE
+ IFLA_VXLAN_PORT_RANGE, /* source port */
+#define IFLA_VXLAN_PROXY rpl_IFLA_VXLAN_PROXY
+ IFLA_VXLAN_PROXY,
+#define IFLA_VXLAN_RSC rpl_IFLA_VXLAN_RSC
+ IFLA_VXLAN_RSC,
+#define IFLA_VXLAN_L2MISS rpl_IFLA_VXLAN_L2MISS
+ IFLA_VXLAN_L2MISS,
+#define IFLA_VXLAN_L3MISS rpl_IFLA_VXLAN_L3MISS
+ IFLA_VXLAN_L3MISS,
+#define IFLA_VXLAN_PORT rpl_IFLA_VXLAN_PORT
+ IFLA_VXLAN_PORT, /* destination port */
+#define IFLA_VXLAN_GROUP6 rpl_IFLA_VXLAN_GROUP6
+ IFLA_VXLAN_GROUP6,
+#define IFLA_VXLAN_LOCAL6 rpl_IFLA_VXLAN_LOCAL6
+ IFLA_VXLAN_LOCAL6,
+#define IFLA_VXLAN_UDP_CSUM rpl_IFLA_VXLAN_UDP_CSUM
+ IFLA_VXLAN_UDP_CSUM,
+#define IFLA_VXLAN_UDP_ZERO_CSUM6_TX rpl_IFLA_VXLAN_UDP_ZERO_CSUM6_TX
+ IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
+#define IFLA_VXLAN_UDP_ZERO_CSUM6_RX rpl_IFLA_VXLAN_UDP_ZERO_CSUM6_RX
+ IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
+#define IFLA_VXLAN_REMCSUM_TX rpl_IFLA_VXLAN_REMCSUM_TX
+ IFLA_VXLAN_REMCSUM_TX,
+#define IFLA_VXLAN_REMCSUM_RX rpl_IFLA_VXLAN_REMCSUM_RX
+ IFLA_VXLAN_REMCSUM_RX,
+#define IFLA_VXLAN_GBP rpl_IFLA_VXLAN_GBP
+ IFLA_VXLAN_GBP,
+#define IFLA_VXLAN_REMCSUM_NOPARTIAL rpl_IFLA_VXLAN_REMCSUM_NOPARTIAL
+ IFLA_VXLAN_REMCSUM_NOPARTIAL,
+#define IFLA_VXLAN_COLLECT_METADATA rpl_IFLA_VXLAN_COLLECT_METADATA
+ IFLA_VXLAN_COLLECT_METADATA,
+#define __IFLA_VXLAN_MAX rpl___IFLA_VXLAN_MAX
+ __IFLA_VXLAN_MAX
+};
+
+#undef IFLA_VXLAN_MAX
+#define IFLA_VXLAN_MAX (rpl___IFLA_VXLAN_MAX - 1)
+
+#define ifla_vxlan_port_range rpl_ifla_vxlan_port_range
+struct ifla_vxlan_port_range {
+ __be16 low;
+ __be16 high;
+};
+
+#ifndef HAVE_RTNL_LINK_STATS64
+/* The main device statistics structure */
+struct rtnl_link_stats64 {
+ __u64 rx_packets; /* total packets received */
+ __u64 tx_packets; /* total packets transmitted */
+ __u64 rx_bytes; /* total bytes received */
+ __u64 tx_bytes; /* total bytes transmitted */
+ __u64 rx_errors; /* bad packets received */
+ __u64 tx_errors; /* packet transmit problems */
+ __u64 rx_dropped; /* no space in linux buffers */
+ __u64 tx_dropped; /* no space available in linux */
+ __u64 multicast; /* multicast packets received */
+ __u64 collisions;
+
+ /* detailed rx_errors: */
+ __u64 rx_length_errors;
+ __u64 rx_over_errors; /* receiver ring buff overflow */
+ __u64 rx_crc_errors; /* recved pkt with crc error */
+ __u64 rx_frame_errors; /* recv'd frame alignment error */
+ __u64 rx_fifo_errors; /* recv'r fifo overrun */
+ __u64 rx_missed_errors; /* receiver missed packet */
+
+ /* detailed tx_errors */
+ __u64 tx_aborted_errors;
+ __u64 tx_carrier_errors;
+ __u64 tx_fifo_errors;
+ __u64 tx_heartbeat_errors;
+ __u64 tx_window_errors;
+
+ /* for cslip etc */
+ __u64 rx_compressed;
+ __u64 tx_compressed;
+};
+#endif
+
+#endif
diff --git a/datapath/linux/compat/include/linux/if_vlan.h b/datapath/linux/compat/include/linux/if_vlan.h
index 060bb62ba..a8d7bfab6 100644
--- a/datapath/linux/compat/include/linux/if_vlan.h
+++ b/datapath/linux/compat/include/linux/if_vlan.h
@@ -177,4 +177,56 @@ static inline int rpl_vlan_insert_tag(struct sk_buff *skb, u16 vlan_tci)
#define skb_vlan_tag_get(skb) vlan_tx_tag_get(skb)
#endif
+#ifndef HAVE_VLAN_GET_PROTOCOL
+
+static inline __be16 __vlan_get_protocol(struct sk_buff *skb, __be16 type,
+ int *depth)
+{
+ unsigned int vlan_depth = skb->mac_len;
+
+ /* if type is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
+ if (vlan_depth) {
+ if (WARN_ON(vlan_depth < VLAN_HLEN))
+ return 0;
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
+ }
+ do {
+ struct vlan_hdr *vh;
+
+ if (unlikely(!pskb_may_pull(skb,
+ vlan_depth + VLAN_HLEN)))
+ return 0;
+
+ vh = (struct vlan_hdr *)(skb->data + vlan_depth);
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (type == htons(ETH_P_8021Q) ||
+ type == htons(ETH_P_8021AD));
+ }
+
+ if (depth)
+ *depth = vlan_depth;
+
+ return type;
+}
+
+/**
+ * vlan_get_protocol - get protocol EtherType.
+ * @skb: skbuff to query
+ *
+ * Returns the EtherType of the packet, regardless of whether it is
+ * vlan encapsulated (normal or hardware accelerated) or not.
+ */
+static inline __be16 vlan_get_protocol(struct sk_buff *skb)
+{
+ return __vlan_get_protocol(skb, skb->protocol, NULL);
+}
+
+#endif
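A standalone sketch of the tag walk above (802.1Q only for brevity; the real helper also handles 802.1ad and pulls the skb as needed): starting at ETH_HLEN, each VLAN header is 4 bytes, and the loop follows the encapsulated EtherType until a non-VLAN one is found:

#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define ETH_HLEN	14
#define VLAN_HLEN	4
#define ETH_P_8021Q	0x8100
#define ETH_P_IP	0x0800

static uint16_t frame_get_protocol(const uint8_t *frame, uint16_t type)
{
	unsigned int depth = ETH_HLEN;
	uint16_t encap;

	while (type == ETH_P_8021Q) {
		/* bytes 0-1 of a vlan_hdr hold the TCI, bytes 2-3 the
		 * encapsulated protocol */
		memcpy(&encap, frame + depth + 2, sizeof(encap));
		type = ntohs(encap);
		depth += VLAN_HLEN;
	}
	return type;
}

int main(void)
{
	/* 12 bytes of MACs, outer EtherType 0x8100, then two stacked
	 * 802.1Q tags; the innermost EtherType is IPv4. */
	uint8_t frame[ETH_HLEN + 2 * VLAN_HLEN] = { 0 };

	frame[16] = 0x81; frame[17] = 0x00;	/* tag 1 -> another tag */
	frame[20] = 0x08; frame[21] = 0x00;	/* tag 2 -> ETH_P_IP */

	assert(frame_get_protocol(frame, ETH_P_8021Q) == ETH_P_IP);
	return 0;
}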
#endif /* linux/if_vlan.h wrapper */
diff --git a/datapath/linux/compat/include/linux/list.h b/datapath/linux/compat/include/linux/list.h
index 18cce8a37..4234c17ce 100644
--- a/datapath/linux/compat/include/linux/list.h
+++ b/datapath/linux/compat/include/linux/list.h
@@ -23,4 +23,9 @@
#endif
+#ifndef list_first_entry_or_null
+#define list_first_entry_or_null(ptr, type, member) \
+ (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL)
+#endif
+
#endif
diff --git a/datapath/linux/compat/include/linux/netdev_features.h b/datapath/linux/compat/include/linux/netdev_features.h
index 04eb77dc0..e4a310729 100644
--- a/datapath/linux/compat/include/linux/netdev_features.h
+++ b/datapath/linux/compat/include/linux/netdev_features.h
@@ -5,21 +5,23 @@
#include_next <linux/netdev_features.h>
#endif
+#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)
+/* On RHEL 6, netdev features are defined in netdevice.h header. */
+#include <linux/netdevice.h>
+#endif
+
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
#define NETIF_F_HW_VLAN_CTAG_TX NETIF_F_HW_VLAN_TX
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
-#define NETIF_F_GSO_ENCAP_ALL 0
-
-#else
-
#ifndef NETIF_F_GSO_GRE
#define NETIF_F_GSO_GRE 0
#endif
#ifndef NETIF_F_GSO_GRE_CSUM
#define NETIF_F_GSO_GRE_CSUM 0
+#else
+#define HAVE_NETIF_F_GSO_GRE_CSUM
#endif
#ifndef NETIF_F_GSO_IPIP
@@ -32,16 +34,39 @@
#ifndef NETIF_F_GSO_UDP_TUNNEL
#define NETIF_F_GSO_UDP_TUNNEL 0
+#else
+#define HAVE_NETIF_F_GSO_UDP_TUNNEL
#endif
#ifndef NETIF_F_GSO_UDP_TUNNEL_CSUM
#define NETIF_F_GSO_UDP_TUNNEL_CSUM 0
+#define SKB_GSO_UDP_TUNNEL_CSUM 0
#endif
#ifndef NETIF_F_GSO_MPLS
#define NETIF_F_GSO_MPLS 0
#endif
+#ifndef NETIF_F_HW_VLAN_STAG_TX
+#define NETIF_F_HW_VLAN_STAG_TX 0
+#endif
+
+#ifndef NETIF_F_GSO_TUNNEL_REMCSUM
+#define NETIF_F_GSO_TUNNEL_REMCSUM 0
+#define SKB_GSO_TUNNEL_REMCSUM 0
+#else
+/* Support for remote checksum offload was added in 3.19, but the APIs
+ * were not defined until 4.0, so only enable REMCSUM support on kernel
+ * 4.0 and later.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
+#define HAVE_NETIF_F_GSO_TUNNEL_REMCSUM
+#endif
+#endif
+
+#ifndef NETIF_F_RXCSUM
+#define NETIF_F_RXCSUM 0
+#endif
+
#ifndef NETIF_F_GSO_ENCAP_ALL
#define NETIF_F_GSO_ENCAP_ALL (NETIF_F_GSO_GRE | \
NETIF_F_GSO_GRE_CSUM | \
@@ -52,6 +77,16 @@
NETIF_F_GSO_MPLS)
#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
+#define SKB_GSO_GRE 0
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#define SKB_GSO_UDP_TUNNEL 0
+#endif
+
+#ifndef HAVE_NETIF_F_GSO_GRE_CSUM
+#define SKB_GSO_GRE_CSUM 0
#endif
#endif
diff --git a/datapath/linux/compat/include/linux/netdevice.h b/datapath/linux/compat/include/linux/netdevice.h
index 0fb2144c6..576989da5 100644
--- a/datapath/linux/compat/include/linux/netdevice.h
+++ b/datapath/linux/compat/include/linux/netdevice.h
@@ -22,6 +22,13 @@ struct net;
#define IFF_LIVE_ADDR_CHANGE 0
#endif
+#ifndef IFF_NO_QUEUE
+#define IFF_NO_QUEUE 0
+#endif
+#ifndef IFF_OPENVSWITCH
+#define IFF_OPENVSWITCH 0
+#endif
+
#ifndef to_net_dev
#define to_net_dev(class) container_of(class, struct net_device, NETDEV_DEV_MEMBER)
#endif
@@ -34,9 +41,8 @@ struct net;
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33)
-extern void unregister_netdevice_queue(struct net_device *dev,
- struct list_head *head);
-extern void unregister_netdevice_many(struct list_head *head);
+#define unregister_netdevice_queue(dev, head) unregister_netdevice(dev)
+#define unregister_netdevice_many(head)
#endif
#ifndef HAVE_DEV_DISABLE_LRO
@@ -112,18 +118,15 @@ struct sk_buff *rpl_skb_gso_segment(struct sk_buff *skb, netdev_features_t featu
netdev_features_t rpl_netif_skb_features(struct sk_buff *skb);
#endif
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
-static inline int rpl_netif_needs_gso(struct net_device *dev,
- struct sk_buff *skb, int features)
+#ifdef HAVE_NETIF_NEEDS_GSO_NETDEV
+#define netif_needs_gso rpl_netif_needs_gso
+static inline bool netif_needs_gso(struct sk_buff *skb,
+ netdev_features_t features)
{
-#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
- unlikely(skb->ip_summed != CHECKSUM_PARTIAL));
-#else
- return netif_needs_gso(skb, features);
-#endif
+ unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
+ (skb->ip_summed != CHECKSUM_UNNECESSARY)));
}
-#define netif_needs_gso rpl_netif_needs_gso
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
@@ -175,6 +178,11 @@ struct pcpu_sw_netstats {
};
#endif
+#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(8,0)
+/* Use the compat version for all Red Hat releases. */
+#undef netdev_alloc_pcpu_stats
+#endif
+
#ifndef netdev_alloc_pcpu_stats
#define netdev_alloc_pcpu_stats(type) \
({ \
@@ -191,4 +199,89 @@ struct pcpu_sw_netstats {
})
#endif
+#ifndef NET_NAME_USER
+#define NET_NAME_USER 3
+#endif
+
+#ifndef HAVE_GRO_REMCSUM
+struct gro_remcsum {
+};
+
+#define skb_gro_remcsum_init(grc)
+#define skb_gro_remcsum_cleanup(a1, a2)
+#else
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0)
+
+#define skb_gro_remcsum_process rpl_skb_gro_remcsum_process
+static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr,
+ unsigned int off, size_t hdrlen,
+ int start, int offset,
+ struct gro_remcsum *grc,
+ bool nopartial)
+{
+ __wsum delta;
+ size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start);
+
+ BUG_ON(!NAPI_GRO_CB(skb)->csum_valid);
+
+ if (!nopartial) {
+ NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start;
+ return ptr;
+ }
+
+ ptr = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, off + plen)) {
+ ptr = skb_gro_header_slow(skb, off + plen, off);
+ if (!ptr)
+ return NULL;
+ }
+
+ delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum,
+ start, offset);
+
+ /* Adjust skb->csum since we changed the packet */
+ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta);
+
+ grc->offset = off + hdrlen + offset;
+ grc->delta = delta;
+
+ return ptr;
+}
+#endif
+#endif
+
+#ifndef HAVE_RTNL_LINK_STATS64
+#define dev_get_stats rpl_dev_get_stats
+struct rtnl_link_stats64 *rpl_dev_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *storage);
+
+#else
+#define HAVE_DEV_TSTATS
+#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)
+#undef HAVE_DEV_TSTATS
#endif
+#endif
+
+#if RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(7,0)
+/* Only required on RHEL 6. */
+#define dev_get_stats dev_get_stats64
+#endif
+
+#ifndef netdev_dbg
+#define netdev_dbg(__dev, format, args...) \
+do { \
+ printk(KERN_DEBUG "%s ", __dev->name); \
+ printk(KERN_DEBUG format, ##args); \
+} while (0)
+#endif
+
+#ifndef netdev_info
+#define netdev_info(__dev, format, args...) \
+do { \
+ printk(KERN_INFO "%s ", __dev->name); \
+ printk(KERN_INFO format, ##args); \
+} while (0)
+
+#endif
+
+#endif /* __LINUX_NETDEVICE_WRAPPER_H */
diff --git a/datapath/linux/compat/include/linux/skbuff.h b/datapath/linux/compat/include/linux/skbuff.h
index 4d81bc80a..0edcbfdd2 100644
--- a/datapath/linux/compat/include/linux/skbuff.h
+++ b/datapath/linux/compat/include/linux/skbuff.h
@@ -15,25 +15,8 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
#endif
#include_next <linux/skbuff.h>
-
#include <linux/jhash.h>
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
-#define SKB_GSO_GRE 0
-#endif
-
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
-#define SKB_GSO_UDP_TUNNEL 0
-#endif
-
-#ifndef HAVE_SKB_GSO_GRE_CSUM
-#define SKB_GSO_GRE_CSUM 0
-#endif
-
-#ifndef HAVE_SKB_GSO_UDP_TUNNEL_CSUM
-#define SKB_GSO_UDP_TUNNEL_CSUM 0
-#endif
-
#ifndef HAVE_IGNORE_DF_RENAME
#define ignore_df local_df
#endif
@@ -403,4 +386,15 @@ static inline unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int le
}
#endif
+
+#ifndef HAVE_SKB_SCRUB_PACKET_XNET
+#define skb_scrub_packet rpl_skb_scrub_packet
+void rpl_skb_scrub_packet(struct sk_buff *skb, bool xnet);
+#endif
+
+#define skb_pop_mac_header rpl_skb_pop_mac_header
+static inline void skb_pop_mac_header(struct sk_buff *skb)
+{
+ skb->mac_header = skb->network_header;
+}
#endif
diff --git a/datapath/linux/compat/include/linux/stddef.h b/datapath/linux/compat/include/linux/stddef.h
index 9b68f710f..f2b7c319a 100644
--- a/datapath/linux/compat/include/linux/stddef.h
+++ b/datapath/linux/compat/include/linux/stddef.h
@@ -12,6 +12,11 @@ enum {
};
#endif /* !HAVE_BOOL_TYPE */
+#ifndef offsetofend
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
+#endif
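offsetofend() yields the first byte past MEMBER, which is what the ip_tunnels compat code later uses (IP_TUNNEL_KEY_SIZE) to memset trailing struct padding. A minimal userspace sketch with a hypothetical struct:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

struct key {
	uint64_t id;		/* offset 0, size 8 */
	uint16_t tp_src;	/* offset 8 */
	uint16_t tp_dst;	/* offset 10 */
	/* the compiler typically pads the struct out to 16 bytes here */
};

int main(void)
{
	assert(offsetofend(struct key, tp_dst) == 12);
	/* everything past this offset is padding that can be memset */
	assert(offsetofend(struct key, tp_dst) <= sizeof(struct key));
	return 0;
}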
+
#endif /* __KERNEL__ */
#endif
diff --git a/datapath/linux/compat/include/net/dst_metadata.h b/datapath/linux/compat/include/net/dst_metadata.h
new file mode 100644
index 000000000..f15bb0308
--- /dev/null
+++ b/datapath/linux/compat/include/net/dst_metadata.h
@@ -0,0 +1,44 @@
+#ifndef __NET_DST_METADATA_WRAPPER_H
+#define __NET_DST_METADATA_WRAPPER_H 1
+
+#ifdef HAVE_METADATA_DST
+#include_next <net/dst_metadata.h>
+#else
+#include <linux/skbuff.h>
+#include <net/ip_tunnels.h>
+#include <net/dst.h>
+
+struct metadata_dst {
+ unsigned long dst;
+ union {
+ struct ip_tunnel_info tun_info;
+ } u;
+};
+
+static inline struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
+{
+ struct metadata_dst *md_dst;
+
+ md_dst = kmalloc(sizeof(*md_dst) + optslen, flags);
+ if (!md_dst)
+ return NULL;
+
+ return md_dst;
+}
+#define skb_tunnel_info ovs_skb_tunnel_info
+#endif
+static inline void ovs_ip_tun_rx_dst(struct ip_tunnel_info *tun_info,
+ struct sk_buff *skb, __be16 flags,
+ __be64 tunnel_id, int md_size)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ ip_tunnel_key_init(&tun_info->key,
+ iph->saddr, iph->daddr, iph->tos, iph->ttl,
+ 0, 0, tunnel_id, flags);
+ tun_info->mode = 0;
+}
+
+void ovs_ip_tunnel_rcv(struct net_device *dev, struct sk_buff *skb,
+ struct metadata_dst *tun_dst);
+#endif /* __NET_DST_METADATA_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/geneve.h b/datapath/linux/compat/include/net/geneve.h
index 4f250c2f6..550f4a77e 100644
--- a/datapath/linux/compat/include/net/geneve.h
+++ b/datapath/linux/compat/include/net/geneve.h
@@ -1,17 +1,24 @@
#ifndef __NET_GENEVE_WRAPPER_H
#define __NET_GENEVE_WRAPPER_H 1
-#include <linux/version.h>
-
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
-#include_next <net/geneve.h>
-#else
-
#ifdef CONFIG_INET
#include <net/udp_tunnel.h>
#endif
+#ifdef HAVE_METADATA_DST
+#include_next <net/geneve.h>
+
+static inline int rpl_geneve_init_module(void)
+{
+ return 0;
+}
+static inline void rpl_geneve_cleanup_module(void)
+{}
+
+#define geneve_xmit dev_queue_xmit
+
+#else
/* Geneve Header:
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |Ver| Opt Len |O|C| Rsvd. | Protocol Type |
@@ -69,43 +76,19 @@ struct genevehdr {
};
#ifdef CONFIG_INET
-struct geneve_sock;
-
-typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb);
-
-struct geneve_sock {
- geneve_rcv_t *rcv;
- void *rcv_data;
- struct socket *sock;
- struct rcu_head rcu;
-};
-
-#define GENEVE_VER 0
-#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))
-
-#define geneve_sock_add rpl_geneve_sock_add
-struct geneve_sock *rpl_geneve_sock_add(struct net *net, __be16 port,
- geneve_rcv_t *rcv, void *data,
- bool no_share, bool ipv6);
-
-#define geneve_sock_release rpl_geneve_sock_release
-void rpl_geneve_sock_release(struct geneve_sock *vs);
-
-#define geneve_xmit_skb rpl_geneve_xmit_skb
-int rpl_geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt,
- struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
- bool csum, bool xnet);
+#define geneve_dev_create_fb rpl_geneve_dev_create_fb
+struct net_device *rpl_geneve_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port);
#endif /*ifdef CONFIG_INET */
-#endif /* kernel < 4.0 */
+int rpl_geneve_init_module(void);
+void rpl_geneve_cleanup_module(void);
+
+#define geneve_xmit rpl_geneve_xmit
+netdev_tx_t rpl_geneve_xmit(struct sk_buff *skb);
-#ifndef HAVE_GENEVE_HDR
-static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
-{
- return (struct genevehdr *)(udp_hdr(skb) + 1);
-}
#endif
+#define geneve_init_module rpl_geneve_init_module
+#define geneve_cleanup_module rpl_geneve_cleanup_module
-#endif /*ifdef__NET_GENEVE_WRAPPER_H */
+#endif /* __NET_GENEVE_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/gre.h b/datapath/linux/compat/include/net/gre.h
index 6e0df0fd8..09053b573 100644
--- a/datapath/linux/compat/include/net/gre.h
+++ b/datapath/linux/compat/include/net/gre.h
@@ -3,6 +3,19 @@
#include <linux/skbuff.h>
#include <net/ip_tunnels.h>
+#ifdef HAVE_METADATA_DST
+#include_next <net/gre.h>
+
+static inline int rpl_ipgre_init(void)
+{
+ return 0;
+}
+static inline void rpl_ipgre_fini(void)
+{}
+
+#define gre_fb_xmit dev_queue_xmit
+
+#else
#include <linux/version.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) || \
@@ -28,81 +41,28 @@ int rpl_gre_cisco_register(struct gre_cisco_protocol *proto);
#define gre_cisco_unregister rpl_gre_cisco_unregister
int rpl_gre_cisco_unregister(struct gre_cisco_protocol *proto);
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
+#ifndef GRE_HEADER_SECTION
struct gre_base_hdr {
__be16 flags;
__be16 protocol;
};
#define GRE_HEADER_SECTION 4
+#endif
-static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
-{
- __be16 tflags = 0;
-
- if (flags & GRE_CSUM)
- tflags |= TUNNEL_CSUM;
- if (flags & GRE_ROUTING)
- tflags |= TUNNEL_ROUTING;
- if (flags & GRE_KEY)
- tflags |= TUNNEL_KEY;
- if (flags & GRE_SEQ)
- tflags |= TUNNEL_SEQ;
- if (flags & GRE_STRICT)
- tflags |= TUNNEL_STRICT;
- if (flags & GRE_REC)
- tflags |= TUNNEL_REC;
- if (flags & GRE_VERSION)
- tflags |= TUNNEL_VERSION;
-
- return tflags;
-}
-
-static inline __be16 tnl_flags_to_gre_flags(__be16 tflags)
-{
- __be16 flags = 0;
-
- if (tflags & TUNNEL_CSUM)
- flags |= GRE_CSUM;
- if (tflags & TUNNEL_ROUTING)
- flags |= GRE_ROUTING;
- if (tflags & TUNNEL_KEY)
- flags |= GRE_KEY;
- if (tflags & TUNNEL_SEQ)
- flags |= GRE_SEQ;
- if (tflags & TUNNEL_STRICT)
- flags |= GRE_STRICT;
- if (tflags & TUNNEL_REC)
- flags |= GRE_REC;
- if (tflags & TUNNEL_VERSION)
- flags |= GRE_VERSION;
-
- return flags;
-}
-#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) */
#endif /* HAVE_GRE_CISCO_REGISTER */
-#define gre_handle_offloads rpl_gre_handle_offloads
-struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
+int rpl_ipgre_init(void);
+void rpl_ipgre_fini(void);
-#if LINUX_VERSION_CODE < KERNEL_VERSION(3,12,0)
+#define gretap_fb_dev_create rpl_gretap_fb_dev_create
+struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name,
+ u8 name_assign_type);
-#define gre_build_header rpl_gre_build_header
-void rpl_gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
- int hdr_len);
+#define gre_fb_xmit rpl_gre_fb_xmit
+netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb);
+#endif /* HAVE_METADATA_DST */
-#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen
-static inline int ip_gre_calc_hlen(__be16 o_flags)
-{
- int addend = 4;
-
- if (o_flags & TUNNEL_CSUM)
- addend += 4;
- if (o_flags & TUNNEL_KEY)
- addend += 4;
- if (o_flags & TUNNEL_SEQ)
- addend += 4;
- return addend;
-}
-#endif
+#define ipgre_init rpl_ipgre_init
+#define ipgre_fini rpl_ipgre_fini
#endif
diff --git a/datapath/linux/compat/include/net/inet_ecn.h b/datapath/linux/compat/include/net/inet_ecn.h
new file mode 100644
index 000000000..f0591b322
--- /dev/null
+++ b/datapath/linux/compat/include/net/inet_ecn.h
@@ -0,0 +1,59 @@
+#ifndef _INET_ECN_WRAPPER_H_
+#define _INET_ECN_WRAPPER_H_
+
+#include_next <net/inet_ecn.h>
+
+#define INET_ECN_decapsulate rpl_INET_ECN_decapsulate
+static inline int INET_ECN_decapsulate(struct sk_buff *skb,
+ __u8 outer, __u8 inner)
+{
+ if (INET_ECN_is_not_ect(inner)) {
+ switch (outer & INET_ECN_MASK) {
+ case INET_ECN_NOT_ECT:
+ return 0;
+ case INET_ECN_ECT_0:
+ case INET_ECN_ECT_1:
+ return 1;
+ case INET_ECN_CE:
+ return 2;
+ }
+ }
+
+ if (INET_ECN_is_ce(outer))
+ INET_ECN_set_ce(skb);
+
+ return 0;
+}
+
+#define IP_ECN_decapsulate rpl_IP_ECN_decapsulate
+static inline int IP_ECN_decapsulate(const struct iphdr *oiph,
+ struct sk_buff *skb)
+{
+ __u8 inner;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ inner = ip_hdr(skb)->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ inner = ipv6_get_dsfield(ipv6_hdr(skb));
+ else
+ return 0;
+
+ return INET_ECN_decapsulate(skb, oiph->tos, inner);
+}
+
+#define IP6_ECN_decapsulate rpl_IP6_ECN_decapsulate
+static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h,
+ struct sk_buff *skb)
+{
+ __u8 inner;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ inner = ip_hdr(skb)->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ inner = ipv6_get_dsfield(ipv6_hdr(skb));
+ else
+ return 0;
+
+ return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner);
+}
+#endif
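These helpers implement the RFC 6040 decapsulation rules: a CE mark on the outer header is either propagated to an ECT inner packet, or, when the inner packet is Not-ECT and cannot carry the mark, reported so the caller can drop the frame. A userspace sketch of the decision table (skb bookkeeping omitted):

#include <assert.h>
#include <stdint.h>

#define ECN_NOT_ECT	0x00
#define ECN_ECT_1	0x01
#define ECN_ECT_0	0x02
#define ECN_CE		0x03
#define ECN_MASK	0x03

/* Mirror of the decision logic above, minus the skb bookkeeping:
 * returns 0 (accept), 1 (outer is ECT but inner is Not-ECT) or
 * 2 (outer signals CE that a Not-ECT inner packet cannot carry,
 * i.e. the frame should be dropped). */
static int ecn_decap(uint8_t outer, uint8_t inner)
{
	if ((inner & ECN_MASK) == ECN_NOT_ECT) {
		switch (outer & ECN_MASK) {
		case ECN_NOT_ECT:
			return 0;
		case ECN_ECT_0:
		case ECN_ECT_1:
			return 1;
		case ECN_CE:
			return 2;
		}
	}
	/* Inner is ECT: a CE mark on the outer header is copied to the
	 * inner header (INET_ECN_set_ce above) and the packet is kept. */
	return 0;
}

int main(void)
{
	assert(ecn_decap(ECN_NOT_ECT, ECN_NOT_ECT) == 0);
	assert(ecn_decap(ECN_CE, ECN_NOT_ECT) == 2);	/* drop */
	assert(ecn_decap(ECN_CE, ECN_ECT_0) == 0);	/* re-mark, keep */
	return 0;
}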
diff --git a/datapath/linux/compat/include/net/ip6_route.h b/datapath/linux/compat/include/net/ip6_route.h
new file mode 100644
index 000000000..3f495e783
--- /dev/null
+++ b/datapath/linux/compat/include/net/ip6_route.h
@@ -0,0 +1,31 @@
+#ifndef __NET_IP6_ROUTE_WRAPPER
+#define __NET_IP6_ROUTE_WRAPPER
+
+#include <net/route.h>
+#include <net/ipv6.h>
+
+#include_next<net/ip6_route.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+
+static inline
+struct dst_entry *rpl_ip6_route_output(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6)
+{
+ struct flowi fl;
+
+ memset(&fl, 0, sizeof(fl));
+ fl.oif = fl6->flowi6_oif;
+ fl.fl6_dst = fl6->daddr;
+ fl.fl6_src = fl6->saddr;
+ fl.mark = fl6->flowi6_mark;
+ fl.proto = fl6->flowi6_proto;
+
+ return ip6_route_output(net, (struct sock *) sk, &fl);
+}
+#define ip6_route_output rpl_ip6_route_output
+
+#define ip6_dst_hoplimit(dst) dst_metric(dst, RTAX_HOPLIMIT)
+
+#endif /* 2.6.39 */
+#endif
diff --git a/datapath/linux/compat/include/net/ip6_tunnel.h b/datapath/linux/compat/include/net/ip6_tunnel.h
new file mode 100644
index 000000000..ce650879b
--- /dev/null
+++ b/datapath/linux/compat/include/net/ip6_tunnel.h
@@ -0,0 +1,33 @@
+#ifndef _NET_IP6_TUNNEL_WRAPPER_H
+#define _NET_IP6_TUNNEL_WRAPPER_H
+
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <linux/if_tunnel.h>
+#include <linux/ip6_tunnel.h>
+#include_next <net/ip6_tunnel.h>
+
+#include "gso.h"
+
+#define ip6tunnel_xmit rpl_ip6tunnel_xmit
+static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb,
+ struct net_device *dev)
+{
+ int pkt_len, err;
+
+ pkt_len = skb->len - skb_inner_network_offset(skb);
+ /* TODO: Fix GSO for ipv6 */
+#ifdef HAVE_IP6_LOCAL_OUT_SK
+ err = ip6_local_out_sk(sk, skb);
+#else
+ err = ip6_local_out(skb);
+#endif
+	/* Keep the precomputed byte count on success; on failure pass
+	 * the error through so iptunnel_xmit_stats() accounts for it.
+	 */
+	if (net_xmit_eval(err) != 0)
+		pkt_len = net_xmit_eval(err);
+
+ iptunnel_xmit_stats(pkt_len, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+}
+
+#endif
diff --git a/datapath/linux/compat/include/net/ip_tunnels.h b/datapath/linux/compat/include/net/ip_tunnels.h
index 3ed6f9193..47dce784d 100644
--- a/datapath/linux/compat/include/net/ip_tunnels.h
+++ b/datapath/linux/compat/include/net/ip_tunnels.h
@@ -69,11 +69,11 @@ struct tnl_ptk_info {
#endif
#ifndef TUNNEL_GENEVE_OPT
-#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800)
+#define TUNNEL_GENEVE_OPT __cpu_to_be16(0x0800)
#endif
#ifndef TUNNEL_VXLAN_OPT
-#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000)
+#define TUNNEL_VXLAN_OPT __cpu_to_be16(0x1000)
#endif
/* Older kernels defined TUNNEL_OPTIONS_PRESENT to GENEVE only */
@@ -83,4 +83,224 @@ struct tnl_ptk_info {
#define skb_is_encapsulated ovs_skb_is_encapsulated
bool ovs_skb_is_encapsulated(struct sk_buff *skb);
+#ifndef HAVE_METADATA_DST
+/* Used to memset ip_tunnel padding. */
+#define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst)
+
+/* Used to memset ipv4 address padding. */
+#define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst)
+#define IP_TUNNEL_KEY_IPV4_PAD_LEN \
+ (FIELD_SIZEOF(struct ip_tunnel_key, u) - \
+ FIELD_SIZEOF(struct ip_tunnel_key, u.ipv4))
+
+struct ip_tunnel_key {
+ __be64 tun_id;
+ union {
+ struct {
+ __be32 src;
+ __be32 dst;
+ } ipv4;
+ struct {
+ struct in6_addr src;
+ struct in6_addr dst;
+ } ipv6;
+ } u;
+ __be16 tun_flags;
+ u8 tos; /* TOS for IPv4, TC for IPv6 */
+ u8 ttl; /* TTL for IPv4, HL for IPv6 */
+ __be16 tp_src;
+ __be16 tp_dst;
+};
+
+/* Flags for ip_tunnel_info mode. */
+#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */
+#define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */
+
+struct ip_tunnel_info {
+ struct ip_tunnel_key key;
+ u8 options_len;
+ u8 mode;
+};
+
+static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info *tun_info)
+{
+ return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
+static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
+{
+ return info + 1;
+}
+
+static inline void ip_tunnel_info_opts_get(void *to,
+ const struct ip_tunnel_info *info)
+{
+ memcpy(to, info + 1, info->options_len);
+}
+
+static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info,
+ const void *from, int len)
+{
+ memcpy(ip_tunnel_info_opts(info), from, len);
+ info->options_len = len;
+}
+
+static inline void ip_tunnel_key_init(struct ip_tunnel_key *key,
+ __be32 saddr, __be32 daddr,
+ u8 tos, u8 ttl,
+ __be16 tp_src, __be16 tp_dst,
+ __be64 tun_id, __be16 tun_flags)
+{
+ key->tun_id = tun_id;
+ key->u.ipv4.src = saddr;
+ key->u.ipv4.dst = daddr;
+ memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD,
+ 0, IP_TUNNEL_KEY_IPV4_PAD_LEN);
+ key->tos = tos;
+ key->ttl = ttl;
+ key->tun_flags = tun_flags;
+
+	/* For tunnel types running on top of IPsec, the tp_src and tp_dst
+	 * of the upper tunnel are used.
+	 * E.g. for GRE over IPsec, tp_src and tp_dst are zero.
+	 */
+ key->tp_src = tp_src;
+ key->tp_dst = tp_dst;
+
+ /* Clear struct padding. */
+ if (sizeof(*key) != IP_TUNNEL_KEY_SIZE)
+ memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE,
+ 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE);
+}
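The explicit memsets above keep every byte of the key deterministic — the unused half of the address union and any trailing compiler padding are zeroed — so keys can safely be hashed and compared as flat memory. A tiny userspace sketch of the same discipline (hypothetical struct):

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct tiny_key {
	uint32_t saddr;
	uint8_t tos;
	/* 3 bytes of compiler padding typically end up here */
};

static void tiny_key_init(struct tiny_key *key, uint32_t saddr, uint8_t tos)
{
	memset(key, 0, sizeof(*key));	/* zero the padding first */
	key->saddr = saddr;
	key->tos = tos;
}

int main(void)
{
	struct tiny_key a, b;

	tiny_key_init(&a, 0x0a000001, 0);
	tiny_key_init(&b, 0x0a000001, 0);
	/* Safe only because the padding was zeroed in both keys. */
	assert(memcmp(&a, &b, sizeof(a)) == 0);
	return 0;
}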
+
+#define ip_tunnel_collect_metadata() true
+
+
+#define ip_tunnel rpl_ip_tunnel
+
+struct ip_tunnel {
+ struct net_device *dev;
+ struct net *net; /* netns for packet i/o */
+
+ int err_count; /* Number of arrived ICMP errors */
+ unsigned long err_time; /* Time when the last ICMP error
+ * arrived
+ */
+
+ /* These four fields used only by GRE */
+ u32 i_seqno; /* The last seen seqno */
+ u32 o_seqno; /* The last output seqno */
+ int tun_hlen; /* Precalculated header length */
+ int mlink;
+
+ struct ip_tunnel_parm parms;
+
+ int encap_hlen; /* Encap header length (FOU,GUE) */
+ int hlen; /* tun_hlen + encap_hlen */
+
+ int ip_tnl_net_id;
+ bool collect_md;
+};
+
+#define ip_tunnel_net rpl_ip_tunnel_net
+struct ip_tunnel_net {
+ struct ip_tunnel __rcu *collect_md_tun;
+ struct rtnl_link_ops *rtnl_ops;
+};
+
+
+#ifndef HAVE_PCPU_SW_NETSTATS
+#define ip_tunnel_get_stats64 rpl_ip_tunnel_get_stats64
+struct rtnl_link_stats64 *rpl_ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot);
+#endif
+
+#define ip_tunnel_get_dsfield rpl_ip_tunnel_get_dsfield
+static inline u8 ip_tunnel_get_dsfield(const struct iphdr *iph,
+ const struct sk_buff *skb)
+{
+ if (skb->protocol == htons(ETH_P_IP))
+ return iph->tos;
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ return ipv6_get_dsfield((const struct ipv6hdr *)iph);
+ else
+ return 0;
+}
+
+#define ip_tunnel_ecn_encap rpl_ip_tunnel_ecn_encap
+static inline u8 ip_tunnel_ecn_encap(u8 tos, const struct iphdr *iph,
+ const struct sk_buff *skb)
+{
+ u8 inner = ip_tunnel_get_dsfield(iph, skb);
+
+ return INET_ECN_encapsulate(tos, inner);
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+#define iptunnel_xmit_stats(err, stats, dummy) \
+do { \
+ if (err > 0) { \
+ (stats)->tx_bytes += err; \
+ (stats)->tx_packets++; \
+ } else if (err < 0) { \
+ (stats)->tx_errors++; \
+ (stats)->tx_aborted_errors++; \
+ } else { \
+ (stats)->tx_dropped++; \
+ } \
+} while (0)
+
+#else
+#define iptunnel_xmit_stats rpl_iptunnel_xmit_stats
+static inline void iptunnel_xmit_stats(int err,
+ struct net_device_stats *err_stats,
+ struct pcpu_sw_netstats __percpu *stats)
+{
+ if (err > 0) {
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(stats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += err;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else if (err < 0) {
+ err_stats->tx_errors++;
+ err_stats->tx_aborted_errors++;
+ } else {
+ err_stats->tx_dropped++;
+ }
+}
+#endif
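Both variants above consume the same convention: a positive value is a transmitted byte count, a negative value is an error, and zero means the packet was dropped. A userspace sketch of the accounting (per-CPU machinery omitted):

#include <assert.h>

struct stats {
	unsigned long tx_bytes, tx_packets, tx_errors, tx_dropped;
};

static void xmit_stats(int err, struct stats *s)
{
	if (err > 0) {			/* byte count: success */
		s->tx_bytes += err;
		s->tx_packets++;
	} else if (err < 0) {		/* negative errno: error */
		s->tx_errors++;
	} else {			/* zero: dropped */
		s->tx_dropped++;
	}
}

int main(void)
{
	struct stats s = { 0 };

	xmit_stats(1500, &s);	/* one 1500-byte packet out */
	xmit_stats(-22, &s);	/* -EINVAL from the stack */
	xmit_stats(0, &s);	/* dropped */
	assert(s.tx_packets == 1 && s.tx_errors == 1 && s.tx_dropped == 1);
	return 0;
}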
+
+#define ip_tunnel_init rpl_ip_tunnel_init
+int rpl_ip_tunnel_init(struct net_device *dev);
+
+#define ip_tunnel_uninit rpl_ip_tunnel_uninit
+void rpl_ip_tunnel_uninit(struct net_device *dev);
+
+#define ip_tunnel_change_mtu rpl_ip_tunnel_change_mtu
+int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
+
+#define ip_tunnel_newlink rpl_ip_tunnel_newlink
+int rpl_ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p);
+
+#define ip_tunnel_dellink rpl_ip_tunnel_dellink
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+void rpl_ip_tunnel_dellink(struct net_device *dev, struct list_head *head);
+#else
+void rpl_ip_tunnel_dellink(struct net_device *dev);
+#endif
+
+#define ip_tunnel_init_net rpl_ip_tunnel_init_net
+int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname);
+
+#define ip_tunnel_delete_net rpl_ip_tunnel_delete_net
+void rpl_ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops);
+
+#define ip_tunnel_setup rpl_ip_tunnel_setup
+void rpl_ip_tunnel_setup(struct net_device *dev, int net_id);
+
+#endif /* HAVE_METADATA_DST */
#endif /* __NET_IP_TUNNELS_H */
diff --git a/datapath/linux/compat/include/net/lisp.h b/datapath/linux/compat/include/net/lisp.h
new file mode 100644
index 000000000..b8af17dbb
--- /dev/null
+++ b/datapath/linux/compat/include/net/lisp.h
@@ -0,0 +1,24 @@
+#ifndef __NET_LISP_WRAPPER_H
+#define __NET_LISP_WRAPPER_H 1
+
+#ifdef CONFIG_INET
+#include <net/udp_tunnel.h>
+#endif
+
+
+#ifdef CONFIG_INET
+#define lisp_dev_create_fb rpl_lisp_dev_create_fb
+struct net_device *rpl_lisp_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port);
+#endif /*ifdef CONFIG_INET */
+
+#define lisp_init_module rpl_lisp_init_module
+int rpl_lisp_init_module(void);
+
+#define lisp_cleanup_module rpl_lisp_cleanup_module
+void rpl_lisp_cleanup_module(void);
+
+#define lisp_xmit rpl_lisp_xmit
+netdev_tx_t rpl_lisp_xmit(struct sk_buff *skb);
+
+#endif /* __NET_LISP_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/net_namespace.h b/datapath/linux/compat/include/net/net_namespace.h
index edfa131d9..9f5087216 100644
--- a/datapath/linux/compat/include/net/net_namespace.h
+++ b/datapath/linux/compat/include/net/net_namespace.h
@@ -17,6 +17,9 @@ struct rpl_pernet_operations {
#define register_pernet_device rpl_register_pernet_gen_device
#define unregister_pernet_device rpl_unregister_pernet_gen_device
+#define register_pernet_subsys rpl_register_pernet_gen_device
+#define unregister_pernet_subsys rpl_unregister_pernet_gen_device
+
#define compat_init_net ovs_compat_init_net
int ovs_compat_init_net(struct net *net, struct rpl_pernet_operations *pnet);
#define compat_exit_net ovs_compat_exit_net
@@ -51,7 +54,7 @@ static void rpl_unregister_pernet_gen_##TYPE(struct rpl_pernet_operations *rpl_p
#define DEFINE_COMPAT_PNET_REG_FUNC(TYPE)
#endif /* 2.6.33 */
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
+#ifndef HAVE_POSSIBLE_NET_T
typedef struct {
#ifdef CONFIG_NET_NS
struct net *net;
diff --git a/datapath/linux/compat/include/net/route.h b/datapath/linux/compat/include/net/route.h
new file mode 100644
index 000000000..bfabdc1a8
--- /dev/null
+++ b/datapath/linux/compat/include/net/route.h
@@ -0,0 +1,109 @@
+#ifndef __NET_ROUTE_H_WRAPPER
+#define __NET_ROUTE_H_WRAPPER
+
+#include_next <net/route.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+struct flowi_common {
+ int flowic_oif;
+ __u32 flowic_mark;
+ __u8 flowic_tos;
+ __u8 flowic_proto;
+};
+
+union flowi_uli {
+ struct {
+ __be16 dport;
+ __be16 sport;
+ } ports;
+
+ struct {
+ __u8 type;
+ __u8 code;
+ } icmpt;
+
+ struct {
+ __le16 dport;
+ __le16 sport;
+ } dnports;
+
+ __be32 spi;
+ __be32 gre_key;
+
+ struct {
+ __u8 type;
+ } mht;
+};
+
+struct flowi4 {
+ struct flowi_common __fl_common;
+#define flowi4_oif __fl_common.flowic_oif
+#define flowi4_iif __fl_common.flowic_iif
+#define flowi4_mark __fl_common.flowic_mark
+#define flowi4_tos __fl_common.flowic_tos
+#define flowi4_scope __fl_common.flowic_scope
+#define flowi4_proto __fl_common.flowic_proto
+#define flowi4_flags __fl_common.flowic_flags
+#define flowi4_secid __fl_common.flowic_secid
+#define flowi4_tun_key __fl_common.flowic_tun_key
+
+ union flowi_uli uli;
+#define fl4_gre_key uli.gre_key
+
+ /* (saddr,daddr) must be grouped, same order as in IP header */
+ __be32 saddr;
+ __be32 daddr;
+
+} __attribute__((__aligned__(BITS_PER_LONG/8)));
+
+struct flowi6 {
+ struct flowi_common __fl_common;
+#define flowi6_oif __fl_common.flowic_oif
+#define flowi6_iif __fl_common.flowic_iif
+#define flowi6_mark __fl_common.flowic_mark
+#define flowi6_tos __fl_common.flowic_tos
+#define flowi6_scope __fl_common.flowic_scope
+#define flowi6_proto __fl_common.flowic_proto
+#define flowi6_flags __fl_common.flowic_flags
+#define flowi6_secid __fl_common.flowic_secid
+#define flowi6_tun_key __fl_common.flowic_tun_key
+ struct in6_addr daddr;
+ struct in6_addr saddr;
+ __be32 flowlabel;
+ union flowi_uli uli;
+#define fl6_sport uli.ports.sport
+#define fl6_dport uli.ports.dport
+#define fl6_icmp_type uli.icmpt.type
+#define fl6_icmp_code uli.icmpt.code
+#define fl6_ipsec_spi uli.spi
+#define fl6_mh_type uli.mht.type
+#define fl6_gre_key uli.gre_key
+} __attribute__((__aligned__(BITS_PER_LONG/8)));
+
+static inline struct rtable *rpl_ip_route_output_key(struct net *net, struct flowi4 *flp)
+{
+ struct rtable *rt;
+	/* The tunnel configuration keeps the DSCP part of the TOS bits,
+	 * but the Linux router expects RT_TOS bits only.
+	 */
+
+ struct flowi fl = { .nl_u = { .ip4_u = {
+ .daddr = flp->daddr,
+ .saddr = flp->saddr,
+ .tos = RT_TOS(flp->flowi4_tos) } },
+ .mark = flp->flowi4_mark,
+ .proto = flp->flowi4_proto };
+
+ if (unlikely(ip_route_output_key(net, &rt, &fl)))
+ return ERR_PTR(-EADDRNOTAVAIL);
+ flp->saddr = fl.nl_u.ip4_u.saddr;
+ return rt;
+}
+#define ip_route_output_key rpl_ip_route_output_key
+
+static inline int ip4_dst_hoplimit(const struct dst_entry *dst)
+{
+ return dst_metric(dst, RTAX_HOPLIMIT);
+}
+#endif
+#endif
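[Aside, not part of the patch: a minimal sketch of how callers later in
this series use the shim above. They fill a modern struct flowi4 and
call ip_route_output_key() unconditionally; on pre-2.6.39 kernels the
rpl_ wrapper translates that into the legacy struct flowi lookup.
Variable names here are illustrative only:

	struct flowi4 fl;
	struct rtable *rt;

	memset(&fl, 0, sizeof(fl));
	fl.daddr = dst_ip;
	fl.saddr = src_ip;
	fl.flowi4_tos = RT_TOS(tos);
	fl.flowi4_proto = IPPROTO_GRE;

	rt = ip_route_output_key(net, &fl);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
]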
diff --git a/datapath/linux/compat/include/net/rtnetlink.h b/datapath/linux/compat/include/net/rtnetlink.h
new file mode 100644
index 000000000..6db4a76ab
--- /dev/null
+++ b/datapath/linux/compat/include/net/rtnetlink.h
@@ -0,0 +1,30 @@
+#ifndef __NET_RTNETLINK_WRAPPER_H
+#define __NET_RTNETLINK_WRAPPER_H
+#include_next <net/rtnetlink.h>
+
+#define rtnl_delete_link rpl_rtnl_delete_link
+int rpl_rtnl_delete_link(struct net_device *dev);
+
+#ifndef HAVE_NAME_ASSIGN_TYPE
+#ifdef HAVE_RTNL_CREATE_LINK_SRC_NET
+static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname,
+ unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops,
+ struct nlattr *tb[])
+{
+ return rtnl_create_link(net, net, (char *)ifname, ops, tb);
+}
+
+#else
+static inline struct net_device *rpl_rtnl_create_link(struct net *net, const char *ifname,
+ unsigned char name_assign_type,
+ const struct rtnl_link_ops *ops,
+ struct nlattr *tb[])
+{
+ return rtnl_create_link(net, (char *)ifname, ops, tb);
+}
+#endif
+#define rtnl_create_link rpl_rtnl_create_link
+#endif
+
+#endif
diff --git a/datapath/linux/compat/include/net/stt.h b/datapath/linux/compat/include/net/stt.h
index 13812b1f2..28d4dc53c 100644
--- a/datapath/linux/compat/include/net/stt.h
+++ b/datapath/linux/compat/include/net/stt.h
@@ -2,6 +2,7 @@
#define __NET_STT_H 1
#include <linux/kconfig.h>
+#include <linux/errno.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) && IS_ENABLED(CONFIG_NETFILTER)
#include <net/ip_tunnels.h>
#define OVS_STT
@@ -30,42 +31,37 @@ static inline struct stthdr *stt_hdr(const struct sk_buff *skb)
sizeof(struct tcphdr));
}
-struct stt_sock;
-typedef void (stt_rcv_t)(struct stt_sock *stt_sock, struct sk_buff *skb);
+struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port);
-/* @list: Per-net list of STT ports.
- * @rcv: The callback is called on STT packet recv, STT reassembly can generate
- * multiple packets, in this case first packet has tunnel outer header, rest
- * of the packets are inner packet segments with no stt header.
- * @rcv_data: user data.
- * @sock: Fake TCP socket for the STT port.
- */
-struct stt_sock {
- struct list_head list;
- stt_rcv_t *rcv;
- void *rcv_data;
- struct socket *sock;
- struct rcu_head rcu;
-};
+netdev_tx_t ovs_stt_xmit(struct sk_buff *skb);
-#define stt_sock_add rpl_stt_sock_add
-struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port,
- stt_rcv_t *rcv, void *data);
+int ovs_stt_init_module(void);
-#define stt_sock_release rpl_stt_sock_release
-void rpl_stt_sock_release(struct stt_sock *stt_sock);
+void ovs_stt_cleanup_module(void);
+#else
+static inline int ovs_stt_init_module(void)
+{
+ return 0;
+}
-#define stt_xmit_skb rpl_stt_xmit_skb
-int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
- __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- __be64 tun_id);
+static inline void ovs_stt_cleanup_module(void)
+{}
-#define stt_init_module ovs_stt_init_module
-int ovs_stt_init_module(void);
+static inline struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+static inline netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
+{
+ BUG();
+ return NETDEV_TX_OK;
+}
+#endif
+#define stt_dev_create_fb ovs_stt_dev_create_fb
+#define stt_init_module ovs_stt_init_module
#define stt_cleanup_module ovs_stt_cleanup_module
-void ovs_stt_cleanup_module(void);
-#endif
#endif /* __NET_STT_H */
diff --git a/datapath/linux/compat/include/net/udp_tunnel.h b/datapath/linux/compat/include/net/udp_tunnel.h
index d33474648..85aed9809 100644
--- a/datapath/linux/compat/include/net/udp_tunnel.h
+++ b/datapath/linux/compat/include/net/udp_tunnel.h
@@ -4,12 +4,14 @@
#include <linux/version.h>
#include <linux/kconfig.h>
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
+#include <net/dst_metadata.h>
+#include <linux/netdev_features.h>
+#ifdef HAVE_UDP_TUNNEL_IPV6
#include_next <net/udp_tunnel.h>
static inline struct sk_buff *
rpl_udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum,
- bool is_vxlan)
+ int type, bool is_vxlan)
{
if (skb_is_gso(skb) && skb_is_encapsulated(skb)) {
kfree_skb(skb);
@@ -19,18 +21,6 @@ rpl_udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum,
}
#define udp_tunnel_handle_offloads rpl_udp_tunnel_handle_offloads
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
-static inline int rpl_udp_tunnel_xmit_skb(struct rtable *rt,
- struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 tos, __u8 ttl,
- __be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck)
-{
- return udp_tunnel_xmit_skb(rt, skb, src, dst, tos, ttl, df, src_port,
- dst_port, xnet, nocheck);
-}
-#define udp_tunnel_xmit_skb rpl_udp_tunnel_xmit_skb
-#endif
#else
#include <net/ip_tunnels.h>
@@ -58,7 +48,8 @@ struct udp_port_cfg {
__be16 peer_udp_port;
unsigned int use_udp_checksums:1,
use_udp6_tx_checksums:1,
- use_udp6_rx_checksums:1;
+ use_udp6_rx_checksums:1,
+ ipv6_v6only:1;
};
#define udp_sock_create rpl_udp_sock_create
@@ -96,13 +87,20 @@ void rpl_udp_tunnel_sock_release(struct socket *sock);
void ovs_udp_gso(struct sk_buff *skb);
void ovs_udp_csum_gso(struct sk_buff *skb);
+#define udp_tunnel_encap_enable(sock) udp_encap_enable()
static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
bool udp_csum,
+ int type,
bool is_vxlan)
{
- int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
void (*fix_segment)(struct sk_buff *);
+ if (skb_is_gso(skb) && skb_is_encapsulated(skb)) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOSYS);
+ }
+
+ type |= udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
if (!udp_csum)
fix_segment = ovs_udp_gso;
else
@@ -116,7 +114,38 @@ static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb,
return ovs_iptunnel_handle_offloads(skb, udp_csum, type, fix_segment);
}
-#define udp_tunnel_encap_enable(sock) udp_encap_enable()
+#if IS_ENABLED(CONFIG_IPV6)
+#define udp_tunnel6_xmit_skb rpl_udp_tunnel6_xmit_skb
+int rpl_udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev, struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be16 src_port,
+ __be16 dst_port, bool nocheck);
+#endif
+
+static inline void udp_tunnel_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct udphdr *uh;
+
+ uh = (struct udphdr *)(skb->data + nhoff - sizeof(struct udphdr));
+ skb_shinfo(skb)->gso_type |= uh->check ?
+ SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+}
+#endif
+
+static inline void ovs_udp_tun_rx_dst(struct ip_tunnel_info *info,
+ struct sk_buff *skb,
+ unsigned short family,
+ __be16 flags, __be64 tunnel_id, int md_size)
+{
+ if (family == AF_INET)
+ ovs_ip_tun_rx_dst(info, skb, flags, tunnel_id, md_size);
+
+ info->key.tp_src = udp_hdr(skb)->source;
+ info->key.tp_dst = udp_hdr(skb)->dest;
+ if (udp_hdr(skb)->check)
+ info->key.tun_flags |= TUNNEL_CSUM;
+}
-#endif /* Linux version < 4.0 */
#endif
diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
index cafff7954..13de97ac1 100644
--- a/datapath/linux/compat/include/net/vxlan.h
+++ b/datapath/linux/compat/include/net/vxlan.h
@@ -1,32 +1,39 @@
#ifndef __NET_VXLAN_WRAPPER_H
-#define __NET_VXLAN_WRAPPER_H 1
+#define __NET_VXLAN_WRAPPER_H 1
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/udp.h>
-#include <net/gre.h>
-
-#include <linux/version.h>
+#ifdef CONFIG_INET
+#include <net/udp_tunnel.h>
+#endif
-#ifdef HAVE_VXLAN_METADATA
-#define USE_UPSTREAM_VXLAN
+#ifdef HAVE_METADATA_DST
#include_next <net/vxlan.h>
-#endif
-#ifndef VXLAN_HLEN
-/* VXLAN header flags. */
-#define VXLAN_HF_VNI 0x08000000
-#ifndef VXLAN_HF_GBP
-#define VXLAN_HF_GBP 0x80000000
-#endif
+static inline int rpl_vxlan_init_module(void)
+{
+ return 0;
+}
+static inline void rpl_vxlan_cleanup_module(void)
+{}
-#define VXLAN_N_VID (1u << 24)
-#define VXLAN_VID_MASK (VXLAN_N_VID - 1)
-#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
-#endif
+#define vxlan_xmit dev_queue_xmit
+
+#else
+
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/udp.h>
+#include <net/dst_metadata.h>
+
+#include "compat.h"
+#include "gso.h"
+
+#define VNI_HASH_BITS 10
+#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
-#ifndef VXLAN_GBP_USED_BITS
/*
* VXLAN Group Based Policy Extension:
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
@@ -64,6 +71,7 @@ struct vxlanhdr_gbp {
__be16 policy_id;
__be32 vx_vni;
};
+
#define VXLAN_GBP_USED_BITS (VXLAN_HF_GBP | 0xFFFFFF)
/* skb->mark mapping
@@ -76,75 +84,194 @@ struct vxlanhdr_gbp {
#define VXLAN_GBP_POLICY_APPLIED (BIT(3) << 16)
#define VXLAN_GBP_ID_MASK (0xFFFF)
-#define VXLAN_F_GBP 0x800
-#endif
-
-#ifndef VXLAN_F_UDP_CSUM
-#define VXLAN_F_UDP_CSUM 0x40
-#endif
-
-#ifndef VXLAN_F_RCV_FLAGS
-#define VXLAN_F_RCV_FLAGS VXLAN_F_GBP
-#endif
+/* VXLAN protocol header:
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |G|R|R|R|I|R|R|C| Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | VXLAN Network Identifier (VNI) | Reserved |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ * G = 1 Group Policy (VXLAN-GBP)
+ * I = 1 VXLAN Network Identifier (VNI) present
+ * C = 1 Remote checksum offload (RCO)
+ */
+struct vxlanhdr {
+ __be32 vx_flags;
+ __be32 vx_vni;
+};
-#ifdef USE_UPSTREAM_VXLAN
-static inline int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk,
- struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos,
- __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
- struct vxlan_metadata *md, bool xnet, u32 vxflags)
-{
- if (skb_is_gso(skb) && skb_is_encapsulated(skb)) {
- kfree_skb(skb);
- return -ENOSYS;
- }
+/* VXLAN header flags. */
+#define VXLAN_HF_RCO BIT(24)
+#define VXLAN_HF_VNI BIT(27)
+#define VXLAN_HF_GBP BIT(31)
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
- return vxlan_xmit_skb(rt, skb, src, dst, tos, ttl, df,
-#else
- return vxlan_xmit_skb(rt, sk, skb, src, dst, tos, ttl, df,
-#endif
- src_port, dst_port, md, xnet, vxflags);
-}
+/* Remote checksum offload header option */
+#define VXLAN_RCO_MASK 0x7f /* Last byte of vni field */
+#define VXLAN_RCO_UDP 0x80 /* Indicate UDP RCO (TCP when not set) */
+#define VXLAN_RCO_SHIFT 1 /* Left shift of start */
+#define VXLAN_RCO_SHIFT_MASK ((1 << VXLAN_RCO_SHIFT) - 1)
+#define VXLAN_MAX_REMCSUM_START (VXLAN_RCO_MASK << VXLAN_RCO_SHIFT)
-#define vxlan_xmit_skb rpl_vxlan_xmit_skb
-#else /* USE_UPSTREAM_VXLAN */
+#define VXLAN_N_VID (1u << 24)
+#define VXLAN_VID_MASK (VXLAN_N_VID - 1)
+#define VXLAN_VNI_MASK (VXLAN_VID_MASK << 8)
+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
struct vxlan_metadata {
- __be32 vni;
- u32 gbp;
+ __be32 vni;
+ u32 gbp;
};
-#define vxlan_sock rpl_vxlan_sock
-struct rpl_vxlan_sock;
-
-#define vxlan_rcv_t rpl_vxlan_rcv_t
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vh, struct sk_buff *skb,
- struct vxlan_metadata *md);
+#define VNI_HASH_BITS 10
+#define VNI_HASH_SIZE (1<<VNI_HASH_BITS)
+#define FDB_HASH_BITS 8
+#define FDB_HASH_SIZE (1<<FDB_HASH_BITS)
/* per UDP socket information */
struct vxlan_sock {
struct hlist_node hlist;
- vxlan_rcv_t *rcv;
- void *data;
struct work_struct del_work;
struct socket *sock;
struct rcu_head rcu;
+ struct hlist_head vni_list[VNI_HASH_SIZE];
+ atomic_t refcnt;
+#ifdef HAVE_UDP_OFFLOAD
+ struct udp_offload udp_offloads;
+#endif
u32 flags;
};
-#define vxlan_sock_add rpl_vxlan_sock_add
-struct vxlan_sock *rpl_vxlan_sock_add(struct net *net, __be16 port,
- vxlan_rcv_t *rcv, void *data,
- bool no_share, u32 flags);
+union vxlan_addr {
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ struct sockaddr sa;
+};
+
+struct vxlan_rdst {
+ union vxlan_addr remote_ip;
+ __be16 remote_port;
+ u32 remote_vni;
+ u32 remote_ifindex;
+ struct list_head list;
+ struct rcu_head rcu;
+};
+
+struct vxlan_config {
+ union vxlan_addr remote_ip;
+ union vxlan_addr saddr;
+ u32 vni;
+ int remote_ifindex;
+ int mtu;
+ __be16 dst_port;
+ __u16 port_min;
+ __u16 port_max;
+ __u8 tos;
+ __u8 ttl;
+ u32 flags;
+ unsigned long age_interval;
+ unsigned int addrmax;
+ bool no_share;
+};
+
+/* Pseudo network device */
+struct vxlan_dev {
+ struct hlist_node hlist; /* vni hash table */
+ struct list_head next; /* vxlan's per namespace list */
+ struct vxlan_sock *vn_sock; /* listening socket */
+ struct net_device *dev;
+ struct net *net; /* netns for packet i/o */
+ struct vxlan_rdst default_dst; /* default destination */
+ u32 flags; /* VXLAN_F_* in vxlan.h */
+
+ struct timer_list age_timer;
+ spinlock_t hash_lock;
+ unsigned int addrcnt;
-#define vxlan_sock_release rpl_vxlan_sock_release
-void rpl_vxlan_sock_release(struct vxlan_sock *vs);
+ struct vxlan_config cfg;
+ struct hlist_head fdb_head[FDB_HASH_SIZE];
+};
+
+#define VXLAN_F_LEARN 0x01
+#define VXLAN_F_PROXY 0x02
+#define VXLAN_F_RSC 0x04
+#define VXLAN_F_L2MISS 0x08
+#define VXLAN_F_L3MISS 0x10
+#define VXLAN_F_IPV6 0x20
+#define VXLAN_F_UDP_CSUM 0x40
+#define VXLAN_F_UDP_ZERO_CSUM6_TX 0x80
+#define VXLAN_F_UDP_ZERO_CSUM6_RX 0x100
+#define VXLAN_F_REMCSUM_TX 0x200
+#define VXLAN_F_REMCSUM_RX 0x400
+#define VXLAN_F_GBP 0x800
+#define VXLAN_F_REMCSUM_NOPARTIAL 0x1000
+#define VXLAN_F_COLLECT_METADATA 0x2000
+
+/* Flags that are used in the receive path. These flags must match in
+ * order for a socket to be shareable.
+ */
+#define VXLAN_F_RCV_FLAGS (VXLAN_F_GBP | \
+ VXLAN_F_UDP_ZERO_CSUM6_RX | \
+ VXLAN_F_REMCSUM_RX | \
+ VXLAN_F_REMCSUM_NOPARTIAL | \
+ VXLAN_F_COLLECT_METADATA)
+#define vxlan_dev_create rpl_vxlan_dev_create
+struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
+ u8 name_assign_type, struct vxlan_config *conf);
+
+static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan)
+{
+ return inet_sport(vxlan->vn_sock->sock->sk);
+}
+
+static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
+ netdev_features_t features)
+{
+ u8 l4_hdr = 0;
+
+ if (!skb_encapsulation(skb))
+ return features;
+
+ switch (vlan_get_protocol(skb)) {
+ case htons(ETH_P_IP):
+ l4_hdr = ip_hdr(skb)->protocol;
+ break;
+ case htons(ETH_P_IPV6):
+ l4_hdr = ipv6_hdr(skb)->nexthdr;
+ break;
+ default:
+ return features;
+ }
+
+ if ((l4_hdr == IPPROTO_UDP) && (
+#ifdef ENCAP_TYPE_ETHER
+ skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
+#endif
+ ovs_skb_get_inner_protocol(skb) != htons(ETH_P_TEB) ||
+ (skb_inner_mac_header(skb) - skb_transport_header(skb) !=
+ sizeof(struct udphdr) + sizeof(struct vxlanhdr))))
+ return features & ~(NETIF_F_ALL_CSUM | NETIF_F_GSO_MASK);
+
+ return features;
+}
+
+/* IP header + UDP + VXLAN + Ethernet header */
+#define VXLAN_HEADROOM (20 + 8 + 8 + 14)
+/* IPv6 header + UDP + VXLAN + Ethernet header */
+#define VXLAN6_HEADROOM (40 + 8 + 8 + 14)
+
+static inline unsigned short vxlan_get_sk_family(struct vxlan_sock *vs)
+{
+ return vs->sock->sk->sk_family;
+}
+
+int rpl_vxlan_init_module(void);
+void rpl_vxlan_cleanup_module(void);
+
+#define vxlan_xmit rpl_vxlan_xmit
+netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb);
+#endif
-#define vxlan_xmit_skb rpl_vxlan_xmit_skb
-int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port,
- struct vxlan_metadata *md, bool xnet, u32 vxflags);
+#define vxlan_init_module rpl_vxlan_init_module
+#define vxlan_cleanup_module rpl_vxlan_cleanup_module
-#endif /* !HAVE_VXLAN_METADATA */
#endif
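[Aside, not part of the patch: the mask arithmetic above encodes the
header layout. The 24-bit VNI sits in the upper three bytes of vx_vni
with the low byte reserved, hence VXLAN_VNI_MASK == VXLAN_VID_MASK << 8.
A sketch of building a header for VNI 0x123456:

	struct vxlanhdr vxh;

	vxh.vx_flags = htonl(VXLAN_HF_VNI);	/* "I" bit: VNI present */
	vxh.vx_vni = htonl(0x123456 << 8);	/* VNI in bits 8..31 */
]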
diff --git a/datapath/linux/compat/ip_gre.c b/datapath/linux/compat/ip_gre.c
new file mode 100644
index 000000000..c9197e965
--- /dev/null
+++ b/datapath/linux/compat/ip_gre.c
@@ -0,0 +1,680 @@
+/*
+ * Linux NET3: GRE over IP protocol decoder.
+ *
+ * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/netdev_features.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/gre.h>
+#include <net/dst_metadata.h>
+
+#ifndef HAVE_METADATA_DST
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+#include "gso.h"
+#include "vport-netdev.h"
+
+static int gre_tap_net_id __read_mostly;
+
+#define ip_gre_calc_hlen rpl_ip_gre_calc_hlen
+static int ip_gre_calc_hlen(__be16 o_flags)
+{
+ int addend = 4;
+
+ if (o_flags & TUNNEL_CSUM)
+ addend += 4;
+ if (o_flags & TUNNEL_KEY)
+ addend += 4;
+ if (o_flags & TUNNEL_SEQ)
+ addend += 4;
+ return addend;
+}
+
+#define tnl_flags_to_gre_flags rpl_tnl_flags_to_gre_flags
+static __be16 tnl_flags_to_gre_flags(__be16 tflags)
+{
+ __be16 flags = 0;
+
+ if (tflags & TUNNEL_CSUM)
+ flags |= GRE_CSUM;
+ if (tflags & TUNNEL_ROUTING)
+ flags |= GRE_ROUTING;
+ if (tflags & TUNNEL_KEY)
+ flags |= GRE_KEY;
+ if (tflags & TUNNEL_SEQ)
+ flags |= GRE_SEQ;
+ if (tflags & TUNNEL_STRICT)
+ flags |= GRE_STRICT;
+ if (tflags & TUNNEL_REC)
+ flags |= GRE_REC;
+ if (tflags & TUNNEL_VERSION)
+ flags |= GRE_VERSION;
+
+ return flags;
+}
+
+static __be64 key_to_tunnel_id(__be32 key)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be64)((__force u32)key);
+#else
+ return (__force __be64)((__force u64)key << 32);
+#endif
+}
+
+/* Returns the least-significant 32 bits of a __be64. */
+static __be32 tunnel_id_to_key(__be64 x)
+{
+#ifdef __BIG_ENDIAN
+ return (__force __be32)x;
+#else
+ return (__force __be32)((__force u64)x >> 32);
+#endif
+}
+
+static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+{
+ struct net *net = dev_net(skb->dev);
+ struct metadata_dst tun_dst;
+ struct ip_tunnel_net *itn;
+ const struct iphdr *iph;
+ struct ip_tunnel *tunnel;
+
+ if (tpi->proto != htons(ETH_P_TEB))
+ return PACKET_REJECT;
+
+ itn = net_generic(net, gre_tap_net_id);
+
+ iph = ip_hdr(skb);
+ tunnel = rcu_dereference(itn->collect_md_tun);
+ if (tunnel) {
+ __be16 flags;
+ __be64 tun_id;
+ int err;
+
+ skb_pop_mac_header(skb);
+ flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ tun_id = key_to_tunnel_id(tpi->key);
+ ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, flags, tun_id, 0);
+
+ skb_reset_network_header(skb);
+ err = IP_ECN_decapsulate(iph, skb);
+ if (unlikely(err)) {
+ if (err > 1) {
+ ++tunnel->dev->stats.rx_frame_errors;
+ ++tunnel->dev->stats.rx_errors;
+ return PACKET_REJECT;
+ }
+ }
+
+ ovs_ip_tunnel_rcv(tunnel->dev, skb, &tun_dst);
+ return PACKET_RCVD;
+ }
+ return PACKET_REJECT;
+}
+
+static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
+{
+ if (ipgre_rcv(skb, tpi) == PACKET_RCVD)
+ return 0;
+
+ kfree_skb(skb);
+ return 0;
+}
+
+#ifndef HAVE_GRE_HANDLE_OFFLOADS
+static void gre_nop_fix(struct sk_buff *skb) { }
+
+static void gre_csum_fix(struct sk_buff *skb)
+{
+ struct gre_base_hdr *greh;
+ __be32 *options;
+ int gre_offset = skb_transport_offset(skb);
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ options = ((__be32 *)greh + 1);
+
+ *options = 0;
+ *(__sum16 *)options = csum_fold(skb_checksum(skb, gre_offset,
+ skb->len - gre_offset, 0));
+}
+
+static bool is_gre_gso(struct sk_buff *skb)
+{
+ return skb_is_gso(skb);
+}
+
+static struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
+{
+ int type = gre_csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE;
+ gso_fix_segment_t fix_segment;
+
+ if (gre_csum)
+ fix_segment = gre_csum_fix;
+ else
+ fix_segment = gre_nop_fix;
+
+ return ovs_iptunnel_handle_offloads(skb, gre_csum, type, fix_segment);
+}
+#else
+
+static bool is_gre_gso(struct sk_buff *skb)
+{
+ return skb_shinfo(skb)->gso_type &
+ (SKB_GSO_GRE | SKB_GSO_GRE_CSUM);
+}
+
+static struct sk_buff *rpl_gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
+{
+ if (skb_is_gso(skb) && skb_is_encapsulated(skb)) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOSYS);
+ }
+#undef gre_handle_offloads
+ return gre_handle_offloads(skb, gre_csum);
+}
+#endif
+
+static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
+ __be16 proto, __be32 key, __be32 seq)
+{
+ struct gre_base_hdr *greh;
+
+ skb_push(skb, hdr_len);
+
+ skb_reset_transport_header(skb);
+ greh = (struct gre_base_hdr *)skb->data;
+ greh->flags = tnl_flags_to_gre_flags(flags);
+ greh->protocol = proto;
+
+ if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) {
+ __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
+
+ if (flags & TUNNEL_SEQ) {
+ *ptr = seq;
+ ptr--;
+ }
+ if (flags & TUNNEL_KEY) {
+ *ptr = key;
+ ptr--;
+ }
+ if (flags & TUNNEL_CSUM && !is_gre_gso(skb)) {
+ *ptr = 0;
+ *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
+ skb->len, 0));
+ }
+ }
+ ovs_skb_set_inner_protocol(skb, proto);
+}
+
+netdev_tx_t rpl_gre_fb_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_info *tun_info;
+ const struct ip_tunnel_key *key;
+ struct flowi4 fl;
+ struct rtable *rt;
+ int min_headroom;
+ int tunnel_hlen;
+ __be16 df, flags;
+ int err;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ ip_tunnel_info_af(tun_info) != AF_INET))
+ goto err_free_skb;
+
+ key = &tun_info->key;
+ memset(&fl, 0, sizeof(fl));
+ fl.daddr = key->u.ipv4.dst;
+ fl.saddr = key->u.ipv4.src;
+ fl.flowi4_tos = RT_TOS(key->tos);
+ fl.flowi4_mark = skb->mark;
+ fl.flowi4_proto = IPPROTO_GRE;
+
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt))
+ goto err_free_skb;
+
+ tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
+
+ min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+ + tunnel_hlen + sizeof(struct iphdr)
+ + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (unlikely(!skb)) {
+ err = -ENOMEM;
+ goto err_free_rt;
+ }
+
+ /* Push Tunnel header. */
+ skb = rpl_gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM));
+ if (IS_ERR(skb)) {
+ skb = NULL;
+ goto err_free_rt;
+ }
+
+ flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY);
+ build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB),
+ tunnel_id_to_key(tun_info->key.tun_id), 0);
+
+ df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
+ key->u.ipv4.dst, IPPROTO_GRE,
+ key->tos, key->ttl, df, false);
+ iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+ return NETDEV_TX_OK;
+
+err_free_rt:
+ ip_rt_put(rt);
+err_free_skb:
+ kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL(rpl_gre_fb_xmit);
+
+#define GRE_FEATURES (NETIF_F_SG | \
+ NETIF_F_FRAGLIST | \
+ NETIF_F_HIGHDMA | \
+ NETIF_F_HW_CSUM | \
+ NETIF_F_NETNS_LOCAL)
+
+static void __gre_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel;
+ int t_hlen;
+
+ tunnel = netdev_priv(dev);
+ tunnel->parms.iph.protocol = IPPROTO_GRE;
+ tunnel->tun_hlen = ip_gre_calc_hlen(tunnel->parms.o_flags);
+
+ tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+
+ t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4;
+ dev->mtu = ETH_DATA_LEN - t_hlen - 4;
+
+ dev->features |= GRE_FEATURES;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ dev->hw_features |= GRE_FEATURES;
+#endif
+
+ if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
+ /* TCP offload with GRE SEQ is not supported. */
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+#endif
+ /* Can use a lockless transmit, unless we generate
+ * output sequences
+ */
+ dev->features |= NETIF_F_LLTX;
+ }
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int gre_err(struct sk_buff *skb, u32 info,
+ const struct tnl_ptk_info *tpi)
+{
+ return PACKET_REJECT;
+}
+
+static struct gre_cisco_protocol ipgre_protocol = {
+ .handler = gre_rcv,
+ .err_handler = gre_err,
+ .priority = 1,
+};
+
+static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be16 flags;
+
+ if (!data)
+ return 0;
+
+ flags = 0;
+ if (data[IFLA_GRE_IFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
+ if (data[IFLA_GRE_OFLAGS])
+ flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
+ if (flags & (GRE_VERSION|GRE_ROUTING))
+ return -EINVAL;
+
+ return 0;
+}
+
+static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ __be32 daddr;
+
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ if (!data)
+ goto out;
+
+ if (data[IFLA_GRE_REMOTE]) {
+ memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
+ if (!daddr)
+ return -EINVAL;
+ }
+
+out:
+ return ipgre_tunnel_validate(tb, data);
+}
+
+static void ipgre_netlink_parms(struct net_device *dev,
+ struct nlattr *data[],
+ struct nlattr *tb[],
+ struct ip_tunnel_parm *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ parms->iph.protocol = IPPROTO_GRE;
+}
+
+static int gre_tap_init(struct net_device *dev)
+{
+ __gre_tunnel_init(dev);
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+
+ return ip_tunnel_init(dev);
+}
+
+static netdev_tx_t gre_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	/* Drop all packets coming from the networking stack. The OVS
+	 * CB is not initialized for these packets.
+	 */
+
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops gre_tap_netdev_ops = {
+ .ndo_init = gre_tap_init,
+ .ndo_uninit = ip_tunnel_uninit,
+ .ndo_start_xmit = gre_dev_xmit,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_change_mtu = ip_tunnel_change_mtu,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+#endif
+#ifdef HAVE_NDO_GET_IFLINK
+ .ndo_get_iflink = ip_tunnel_get_iflink,
+#endif
+};
+
+static void ipgre_tap_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ ip_tunnel_setup(dev, gre_tap_net_id);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static int ipgre_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+#else
+static int ipgre_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+#endif
+{
+ struct ip_tunnel_parm p;
+ int err;
+
+ ipgre_netlink_parms(dev, data, tb, &p);
+ err = ip_tunnel_newlink(dev, tb, &p);
+ return err;
+}
+
+static size_t ipgre_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_GRE_LINK */
+ nla_total_size(4) +
+ /* IFLA_GRE_IFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_OFLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_IKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_OKEY */
+ nla_total_size(4) +
+ /* IFLA_GRE_LOCAL */
+ nla_total_size(4) +
+ /* IFLA_GRE_REMOTE */
+ nla_total_size(4) +
+ /* IFLA_GRE_TTL */
+ nla_total_size(1) +
+ /* IFLA_GRE_TOS */
+ nla_total_size(1) +
+ /* IFLA_GRE_PMTUDISC */
+ nla_total_size(1) +
+ /* IFLA_GRE_ENCAP_TYPE */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_FLAGS */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_SPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_ENCAP_DPORT */
+ nla_total_size(2) +
+ /* IFLA_GRE_COLLECT_METADATA */
+ nla_total_size(0) +
+ 0;
+}
+
+static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ip_tunnel *t = netdev_priv(dev);
+ struct ip_tunnel_parm *p = &t->parms;
+
+ if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
+ nla_put_be16(skb, IFLA_GRE_IFLAGS, tnl_flags_to_gre_flags(p->i_flags)) ||
+ nla_put_be16(skb, IFLA_GRE_OFLAGS, tnl_flags_to_gre_flags(p->o_flags)) ||
+ nla_put_be32(skb, IFLA_GRE_IKEY, p->i_key) ||
+ nla_put_be32(skb, IFLA_GRE_OKEY, p->o_key) ||
+ nla_put_in_addr(skb, IFLA_GRE_LOCAL, p->iph.saddr) ||
+ nla_put_in_addr(skb, IFLA_GRE_REMOTE, p->iph.daddr) ||
+ nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) ||
+ nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) ||
+ nla_put_u8(skb, IFLA_GRE_PMTUDISC,
+ !!(p->iph.frag_off & htons(IP_DF))))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
+ [IFLA_GRE_LINK] = { .type = NLA_U32 },
+ [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
+ [IFLA_GRE_IKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_OKEY] = { .type = NLA_U32 },
+ [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
+ [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
+ [IFLA_GRE_TTL] = { .type = NLA_U8 },
+ [IFLA_GRE_TOS] = { .type = NLA_U8 },
+ [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
+};
+
+static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
+ .kind = "ovs_gretap",
+ .maxtype = IFLA_GRE_MAX,
+ .policy = ipgre_policy,
+ .priv_size = sizeof(struct ip_tunnel),
+ .setup = ipgre_tap_setup,
+ .validate = ipgre_tap_validate,
+ .newlink = ipgre_newlink,
+ .dellink = ip_tunnel_dellink,
+ .get_size = ipgre_get_size,
+ .fill_info = ipgre_fill_info,
+#ifdef HAVE_GET_LINK_NET
+ .get_link_net = ip_tunnel_get_link_net,
+#endif
+};
+
+struct net_device *rpl_gretap_fb_dev_create(struct net *net, const char *name,
+ u8 name_assign_type)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ struct ip_tunnel *t;
+ int err;
+
+ memset(&tb, 0, sizeof(tb));
+
+ dev = rtnl_create_link(net, (char *)name, name_assign_type,
+ &ipgre_tap_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ t = netdev_priv(dev);
+ t->collect_md = true;
+	/* Configure flow-based GRE device. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ err = ipgre_newlink(net, dev, tb, NULL);
+#else
+ err = ipgre_newlink(dev, tb, NULL);
+#endif
+ if (err < 0)
+ goto out;
+ return dev;
+out:
+ free_netdev(dev);
+ return ERR_PTR(err);
+}
+EXPORT_SYMBOL_GPL(rpl_gretap_fb_dev_create);
+
+static int __net_init ipgre_tap_init_net(struct net *net)
+{
+ return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0");
+}
+
+static void __net_exit ipgre_tap_exit_net(struct net *net)
+{
+ struct ip_tunnel_net *itn = net_generic(net, gre_tap_net_id);
+
+ ip_tunnel_delete_net(itn, &ipgre_tap_ops);
+}
+
+static struct pernet_operations ipgre_tap_net_ops = {
+ .init = ipgre_tap_init_net,
+ .exit = ipgre_tap_exit_net,
+ .id = &gre_tap_net_id,
+ .size = sizeof(struct ip_tunnel_net),
+};
+
+DEFINE_COMPAT_PNET_REG_FUNC(device);
+
+int rpl_ipgre_init(void)
+{
+ int err;
+
+ err = register_pernet_device(&ipgre_tap_net_ops);
+ if (err < 0)
+		goto pnet_tap_failed;
+
+ err = gre_cisco_register(&ipgre_protocol);
+ if (err < 0) {
+ pr_info("%s: can't add protocol\n", __func__);
+ goto add_proto_failed;
+ }
+
+ err = rtnl_link_register(&ipgre_tap_ops);
+ if (err < 0)
+ goto tap_ops_failed;
+
+ pr_info("GRE over IPv4 tunneling driver\n");
+ return 0;
+
+tap_ops_failed:
+ gre_cisco_unregister(&ipgre_protocol);
+add_proto_failed:
+ unregister_pernet_device(&ipgre_tap_net_ops);
+pnet_tap_failed:
+ return err;
+}
+
+void rpl_ipgre_fini(void)
+{
+ rtnl_link_unregister(&ipgre_tap_ops);
+ gre_cisco_unregister(&ipgre_protocol);
+ unregister_pernet_device(&ipgre_tap_net_ops);
+}
+
+#endif
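[Aside, not part of the patch: key_to_tunnel_id() and
tunnel_id_to_key() above simply keep the 32-bit GRE key in the
least-significant half of the 64-bit tunnel ID; the casts and shifts
make that hold on both endiannesses. Little-endian worked example: a
GRE key of 1 arrives as __be32 bytes 00 00 00 01; shifting the raw
value left by 32 yields the in-memory bytes 00 00 00 00 00 00 00 01,
which read as a __be64 tunnel ID of 1. tunnel_id_to_key() shifts right
by 32 and recovers the original key.]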
diff --git a/datapath/linux/compat/ip_tunnel.c b/datapath/linux/compat/ip_tunnel.c
new file mode 100644
index 000000000..f43e2d457
--- /dev/null
+++ b/datapath/linux/compat/ip_tunnel.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2013 Nicira, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/capability.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/rculist.h>
+#include <linux/err.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ip_tunnels.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/rtnetlink.h>
+#include <net/udp.h>
+
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+#include "compat.h"
+
+#ifndef HAVE_METADATA_DST
+static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, t);
+ else
+ WARN_ONCE(1, "%s: collect md not set\n", t->dev->name);
+}
+
+static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
+{
+ if (t->collect_md)
+ rcu_assign_pointer(itn->collect_md_tun, NULL);
+}
+
+static inline void init_tunnel_flow(struct flowi4 *fl4,
+ int proto,
+ __be32 daddr, __be32 saddr,
+ __be32 key, __u8 tos, int oif)
+{
+ memset(fl4, 0, sizeof(*fl4));
+ fl4->flowi4_oif = oif;
+ fl4->daddr = daddr;
+ fl4->saddr = saddr;
+ fl4->flowi4_tos = tos;
+ fl4->flowi4_proto = proto;
+ fl4->fl4_gre_key = key;
+}
+
+static int ip_tunnel_bind_dev(struct net_device *dev)
+{
+ struct net_device *tdev = NULL;
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ const struct iphdr *iph;
+ int hlen = LL_MAX_HEADER;
+ int mtu = ETH_DATA_LEN;
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ iph = &tunnel->parms.iph;
+
+ /* Guess output device to choose reasonable mtu and needed_headroom */
+ if (iph->daddr) {
+ struct flowi4 fl4;
+ struct rtable *rt;
+
+ init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
+ iph->saddr, tunnel->parms.o_key,
+ RT_TOS(iph->tos), tunnel->parms.link);
+ rt = ip_route_output_key(tunnel->net, &fl4);
+
+ if (!IS_ERR(rt)) {
+ tdev = rt_dst(rt).dev;
+ ip_rt_put(rt);
+ }
+ if (dev->type != ARPHRD_ETHER)
+ dev->flags |= IFF_POINTOPOINT;
+ }
+
+ if (!tdev && tunnel->parms.link)
+ tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
+
+ if (tdev) {
+ hlen = tdev->hard_header_len + tdev->needed_headroom;
+ mtu = tdev->mtu;
+ }
+
+ dev->needed_headroom = t_hlen + hlen;
+ mtu -= (dev->hard_header_len + t_hlen);
+
+ if (mtu < 68)
+ mtu = 68;
+
+ return mtu;
+}
+
+int rpl_ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+
+ if (new_mtu < 68 ||
+ new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ return -EINVAL;
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static void ip_tunnel_dev_free(struct net_device *dev)
+{
+#ifdef HAVE_DEV_TSTATS
+ free_percpu(dev->tstats);
+#endif
+ free_netdev(dev);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+void rpl_ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
+#else
+void rpl_ip_tunnel_dellink(struct net_device *dev)
+#endif
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
+
+ ip_tunnel_del(itn, netdev_priv(dev));
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ unregister_netdevice_queue(dev, head);
+#endif
+}
+
+int rpl_ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
+ struct rtnl_link_ops *ops, char *devname)
+{
+ struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
+
+ itn->collect_md_tun = NULL;
+ itn->rtnl_ops = ops;
+ return 0;
+}
+
+static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+ struct rtnl_link_ops *ops)
+{
+ struct ip_tunnel *t;
+
+ t = rtnl_dereference(itn->collect_md_tun);
+ if (!t)
+ return;
+ unregister_netdevice_queue(t->dev, head);
+}
+
+void rpl_ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
+{
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ ip_tunnel_destroy(itn, &list, ops);
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+int rpl_ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
+ struct ip_tunnel_parm *p)
+{
+ struct ip_tunnel *nt;
+ struct net *net = dev_net(dev);
+ struct ip_tunnel_net *itn;
+ int mtu;
+ int err;
+
+ nt = netdev_priv(dev);
+ itn = net_generic(net, nt->ip_tnl_net_id);
+
+ if (nt->collect_md) {
+ if (rtnl_dereference(itn->collect_md_tun))
+ return -EEXIST;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ nt->net = net;
+ nt->parms = *p;
+ err = register_netdevice(dev);
+ if (err)
+ goto out;
+
+ if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
+ eth_hw_addr_random(dev);
+
+ mtu = ip_tunnel_bind_dev(dev);
+ if (!tb[IFLA_MTU])
+ dev->mtu = mtu;
+
+ ip_tunnel_add(itn, nt);
+out:
+ return err;
+}
+
+int rpl_ip_tunnel_init(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct iphdr *iph = &tunnel->parms.iph;
+
+ dev->destructor = ip_tunnel_dev_free;
+#ifdef HAVE_DEV_TSTATS
+ dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+#endif
+ tunnel->dev = dev;
+ tunnel->net = dev_net(dev);
+ strcpy(tunnel->parms.name, dev->name);
+ iph->version = 4;
+ iph->ihl = 5;
+
+ if (tunnel->collect_md)
+ dev->features |= NETIF_F_NETNS_LOCAL;
+
+ return 0;
+}
+
+void rpl_ip_tunnel_uninit(struct net_device *dev)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+ struct net *net = tunnel->net;
+ struct ip_tunnel_net *itn;
+
+ itn = net_generic(net, tunnel->ip_tnl_net_id);
+ ip_tunnel_del(itn, netdev_priv(dev));
+}
+
+/* Do the least required initialization; the rest is done in the tunnel_init call. */
+void rpl_ip_tunnel_setup(struct net_device *dev, int net_id)
+{
+ struct ip_tunnel *tunnel = netdev_priv(dev);
+
+ tunnel->ip_tnl_net_id = net_id;
+}
+#endif
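[Aside, not part of the patch: this compat ip_tunnel.c deliberately
supports exactly one flow-based (collect_md) tunnel per network
namespace, tracked through itn->collect_md_tun. A sketch of the
resulting behaviour, under RTNL (names illustrative):

	dev1 = rpl_gretap_fb_dev_create(net, "gretap0", NET_NAME_USER);
	/* succeeds; becomes itn->collect_md_tun */
	dev2 = rpl_gretap_fb_dev_create(net, "gretap1", NET_NAME_USER);
	/* fails: rpl_ip_tunnel_newlink() returns -EEXIST, so dev2 is
	 * ERR_PTR(-EEXIST) */
]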
diff --git a/datapath/linux/compat/ip_tunnels_core.c b/datapath/linux/compat/ip_tunnels_core.c
index 8ff7cd79f..179fa47b2 100644
--- a/datapath/linux/compat/ip_tunnels_core.c
+++ b/datapath/linux/compat/ip_tunnels_core.c
@@ -34,6 +34,7 @@
#include "compat.h"
#include "gso.h"
+#include "vport-netdev.h"
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,18,0)
int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
@@ -44,11 +45,11 @@ int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
struct iphdr *iph;
int err;
- nf_reset(skb);
- secpath_reset(skb);
+ skb_scrub_packet(skb, xnet);
+
skb_clear_hash(skb);
- skb_dst_drop(skb);
skb_dst_set(skb, &rt_dst(rt));
+
#if 0
/* Do not clear ovs_skb_cb. It will be done in gso code. */
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
@@ -71,6 +72,9 @@ int rpl_iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
#ifdef HAVE_IP_SELECT_IDENT_USING_DST_ENTRY
__ip_select_ident(iph, &rt_dst(rt), (skb_shinfo(skb)->gso_segs ?: 1) - 1);
+#elif defined(HAVE_IP_SELECT_IDENT_USING_NET)
+ __ip_select_ident(dev_net(rt->dst.dev), iph,
+ skb_shinfo(skb)->gso_segs ?: 1);
#else
__ip_select_ident(iph, skb_shinfo(skb)->gso_segs ?: 1);
#endif
@@ -84,7 +88,7 @@ EXPORT_SYMBOL_GPL(rpl_iptunnel_xmit);
struct sk_buff *ovs_iptunnel_handle_offloads(struct sk_buff *skb,
bool csum_help, int gso_type_mask,
- void (*fix_segment)(struct sk_buff *))
+ void (*fix_segment)(struct sk_buff *))
{
int err;
@@ -180,3 +184,84 @@ bool ovs_skb_is_encapsulated(struct sk_buff *skb)
return ovs_skb_get_inner_protocol(skb) || skb_encapsulation(skb);
}
EXPORT_SYMBOL_GPL(ovs_skb_is_encapsulated);
+
+/* derived from ip_tunnel_rcv(). */
+void ovs_ip_tunnel_rcv(struct net_device *dev, struct sk_buff *skb,
+ struct metadata_dst *tun_dst)
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ struct pcpu_sw_netstats *tstats;
+
+ tstats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)dev->tstats);
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+#endif
+
+ skb_reset_mac_header(skb);
+ skb_scrub_packet(skb, false);
+ skb->protocol = eth_type_trans(skb, dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+ ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst);
+
+#ifndef HAVE_METADATA_DST
+ netdev_port_receive(skb, &tun_dst->u.tun_info);
+#else
+ netif_rx(skb);
+#endif
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+#ifndef HAVE_PCPU_SW_NETSTATS
+#define netdev_stats_to_stats64 rpl_netdev_stats_to_stats64
+static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
+ const struct net_device_stats *netdev_stats)
+{
+#if BITS_PER_LONG == 64
+ BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ memcpy(stats64, netdev_stats, sizeof(*stats64));
+#else
+ size_t i, n = sizeof(*stats64) / sizeof(u64);
+ const unsigned long *src = (const unsigned long *)netdev_stats;
+ u64 *dst = (u64 *)stats64;
+
+ BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
+ sizeof(*stats64) / sizeof(u64));
+ for (i = 0; i < n; i++)
+ dst[i] = src[i];
+#endif
+}
+
+struct rtnl_link_stats64 *rpl_ip_tunnel_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *tot)
+{
+ int i;
+
+ netdev_stats_to_stats64(tot, &dev->stats);
+
+ for_each_possible_cpu(i) {
+ const struct pcpu_sw_netstats *tstats =
+ per_cpu_ptr((struct pcpu_sw_netstats __percpu *)dev->tstats, i);
+ u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin_irq(&tstats->syncp);
+ rx_packets = tstats->rx_packets;
+ tx_packets = tstats->tx_packets;
+ rx_bytes = tstats->rx_bytes;
+ tx_bytes = tstats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&tstats->syncp, start));
+
+ tot->rx_packets += rx_packets;
+ tot->tx_packets += tx_packets;
+ tot->rx_bytes += rx_bytes;
+ tot->tx_bytes += tx_bytes;
+ }
+
+ return tot;
+}
+#endif
+#endif
diff --git a/datapath/linux/compat/lisp.c b/datapath/linux/compat/lisp.c
new file mode 100644
index 000000000..e5a6a7fe0
--- /dev/null
+++ b/datapath/linux/compat/lisp.c
@@ -0,0 +1,711 @@
+/*
+ * Copyright (c) 2015 Nicira, Inc.
+ * Copyright (c) 2013 Cisco Systems, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/version.h>
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/module.h>
+#include <linux/rculist.h>
+#include <linux/udp.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/lisp.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/route.h>
+#include <net/udp.h>
+#include <net/udp_tunnel.h>
+#include <net/xfrm.h>
+
+#include "datapath.h"
+#include "gso.h"
+#include "vport.h"
+#include "gso.h"
+#include "vport-netdev.h"
+
+#define LISP_UDP_PORT 4341
+#define LISP_NETDEV_VER "0.1"
+static int lisp_net_id;
+
+/* Pseudo network device */
+struct lisp_dev {
+ struct net *net; /* netns for packet i/o */
+ struct net_device *dev; /* netdev for lisp tunnel */
+ struct socket *sock;
+ __be16 dst_port;
+ struct list_head next;
+};
+
+/* per-network namespace private data for this module */
+struct lisp_net {
+ struct list_head lisp_list;
+};
+
+/*
+ * LISP encapsulation header:
+ *
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |N|L|E|V|I|flags| Nonce/Map-Version |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | Instance ID/Locator Status Bits |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ */
+
+/**
+ * struct lisphdr - LISP header
+ * @nonce_present: Flag indicating the presence of a 24 bit nonce value.
+ * @locator_status_bits_present: Flag indicating the presence of Locator Status
+ * Bits (LSB).
+ * @solicit_echo_nonce: Flag indicating the use of the echo noncing mechanism.
+ * @map_version_present: Flag indicating the use of mapping versioning.
+ * @instance_id_present: Flag indicating the presence of a 24 bit Instance ID.
+ * @reserved_flags: 3 bits reserved for future flags.
+ * @nonce: 24 bit nonce value.
+ * @map_version: 24 bit mapping version.
+ * @locator_status_bits: Locator Status Bits: 32 bits when instance_id_present
+ * is not set, 8 bits when it is.
+ * @instance_id: 24 bit Instance ID
+ */
+struct lisphdr {
+#ifdef __LITTLE_ENDIAN_BITFIELD
+ __u8 reserved_flags:3;
+ __u8 instance_id_present:1;
+ __u8 map_version_present:1;
+ __u8 solicit_echo_nonce:1;
+ __u8 locator_status_bits_present:1;
+ __u8 nonce_present:1;
+#else
+ __u8 nonce_present:1;
+ __u8 locator_status_bits_present:1;
+ __u8 solicit_echo_nonce:1;
+ __u8 map_version_present:1;
+ __u8 instance_id_present:1;
+ __u8 reserved_flags:3;
+#endif
+ union {
+ __u8 nonce[3];
+ __u8 map_version[3];
+ } u1;
+ union {
+ __be32 locator_status_bits;
+ struct {
+ __u8 instance_id[3];
+ __u8 locator_status_bits;
+ } word2;
+ } u2;
+};
+
+#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr))
+
+static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb)
+{
+ return (struct lisphdr *)(udp_hdr(skb) + 1);
+}
+
+/* Convert 64 bit tunnel ID to 24 bit Instance ID. */
+static void tunnel_id_to_instance_id(__be64 tun_id, __u8 *iid)
+{
+#ifdef __BIG_ENDIAN
+ iid[0] = (__force __u8)(tun_id >> 16);
+ iid[1] = (__force __u8)(tun_id >> 8);
+ iid[2] = (__force __u8)tun_id;
+#else
+ iid[0] = (__force __u8)((__force u64)tun_id >> 40);
+ iid[1] = (__force __u8)((__force u64)tun_id >> 48);
+ iid[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
+
+/* Convert 24 bit Instance ID to 64 bit tunnel ID. */
+static __be64 instance_id_to_tunnel_id(__u8 *iid)
+{
+#ifdef __BIG_ENDIAN
+ return (iid[0] << 16) | (iid[1] << 8) | iid[2];
+#else
+ return (__force __be64)(((__force u64)iid[0] << 40) |
+ ((__force u64)iid[1] << 48) |
+ ((__force u64)iid[2] << 56));
+#endif
+}
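+
+/* Worked example (illustrative only): a tunnel ID of 0xabcdef packs
+ * as iid[] = { 0xab, 0xcd, 0xef }; the low 24 bits of the 64-bit
+ * tunnel ID become the Instance ID, most-significant byte first,
+ * and instance_id_to_tunnel_id() inverts the mapping.
+ */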
+
+/* Compute the source UDP port for an outgoing packet. Currently we
+ * use the flow hash.
+ */
+static u16 get_src_port(struct net *net, struct sk_buff *skb)
+{
+ u32 hash = skb_get_hash(skb);
+ unsigned int range;
+ int high;
+ int low;
+
+ if (!hash) {
+ if (skb->protocol == htons(ETH_P_IP)) {
+ struct iphdr *iph;
+ int size = (sizeof(iph->saddr) * 2) / sizeof(u32);
+
+ iph = (struct iphdr *) skb_network_header(skb);
+ hash = jhash2((const u32 *)&iph->saddr, size, 0);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ struct ipv6hdr *ipv6hdr;
+
+ ipv6hdr = (struct ipv6hdr *) skb_network_header(skb);
+ hash = jhash2((const u32 *)&ipv6hdr->saddr,
+ (sizeof(struct in6_addr) * 2) / sizeof(u32), 0);
+ } else {
+ pr_warn_once("LISP inner protocol is not IP when "
+ "calculating hash.\n");
+ }
+ }
+
+ inet_get_local_port_range(net, &low, &high);
+ range = (high - low) + 1;
+ return (((u64) hash * range) >> 32) + low;
+}
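+
+/* Example (illustrative only): with a local port range of
+ * [32768, 61000], range == 28233, so hash 0x80000000 maps to
+ * 32768 + 28233/2 == 46884; the 32-bit hash is scaled linearly
+ * onto the configured range.
+ */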
+
+static void lisp_build_header(struct sk_buff *skb,
+ const struct ip_tunnel_key *tun_key)
+{
+ struct lisphdr *lisph;
+
+ lisph = (struct lisphdr *)__skb_push(skb, sizeof(struct lisphdr));
+ lisph->nonce_present = 0; /* We don't support echo nonce algorithm */
+ lisph->locator_status_bits_present = 1; /* Set LSB */
+ lisph->solicit_echo_nonce = 0; /* No echo noncing */
+ lisph->map_version_present = 0; /* No mapping versioning, nonce instead */
+ lisph->instance_id_present = 1; /* Store the tun_id as Instance ID */
+ lisph->reserved_flags = 0; /* Reserved flags, set to 0 */
+
+ lisph->u1.nonce[0] = 0;
+ lisph->u1.nonce[1] = 0;
+ lisph->u1.nonce[2] = 0;
+
+ tunnel_id_to_instance_id(tun_key->tun_id, &lisph->u2.word2.instance_id[0]);
+ lisph->u2.word2.locator_status_bits = 1;
+}
+
+/* Called with rcu_read_lock and BH disabled. */
+static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+ struct net_device *dev;
+ struct lisphdr *lisph;
+ struct iphdr *inner_iph;
+ struct metadata_dst *tun_dst;
+#ifndef HAVE_METADATA_DST
+ struct metadata_dst temp;
+#endif
+ __be64 key;
+ struct ethhdr *ethh;
+ __be16 protocol;
+
+ dev = rcu_dereference_sk_user_data(sk);
+ if (unlikely(!dev))
+ goto error;
+
+ if (iptunnel_pull_header(skb, LISP_HLEN, 0))
+ goto error;
+
+ lisph = lisp_hdr(skb);
+
+ if (lisph->instance_id_present != 1)
+ key = 0;
+ else
+ key = instance_id_to_tunnel_id(&lisph->u2.word2.instance_id[0]);
+
+ /* Save outer tunnel values */
+#ifndef HAVE_METADATA_DST
+ tun_dst = &temp;
+ ovs_udp_tun_rx_dst(&tun_dst->u.tun_info, skb, AF_INET, TUNNEL_KEY, key, 0);
+#else
+ tun_dst = udp_tun_rx_dst(skb, AF_INET, TUNNEL_KEY, key, 0);
+#endif
+ /* Drop non-IP inner packets */
+ inner_iph = (struct iphdr *)(lisph + 1);
+ switch (inner_iph->version) {
+ case 4:
+ protocol = htons(ETH_P_IP);
+ break;
+ case 6:
+ protocol = htons(ETH_P_IPV6);
+ break;
+ default:
+ goto error;
+ }
+ skb->protocol = protocol;
+
+ /* Add Ethernet header */
+ ethh = (struct ethhdr *)skb_push(skb, ETH_HLEN);
+ memset(ethh, 0, ETH_HLEN);
+ ethh->h_dest[0] = 0x02;
+ ethh->h_source[0] = 0x02;
+ ethh->h_proto = protocol;
+
+ ovs_ip_tunnel_rcv(dev, skb, tun_dst);
+ goto out;
+
+error:
+ kfree_skb(skb);
+out:
+ return 0;
+}
+
+netdev_tx_t rpl_lisp_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct lisp_dev *lisp_dev = netdev_priv(dev);
+ struct net *net = lisp_dev->net;
+ int network_offset = skb_network_offset(skb);
+ struct ip_tunnel_info *info;
+ struct ip_tunnel_key *tun_key;
+ struct rtable *rt;
+ int min_headroom;
+ __be16 src_port, dst_port;
+ struct flowi4 fl;
+ __be16 df;
+ int err;
+
+ info = skb_tunnel_info(skb);
+ if (unlikely(!info)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ if (skb->protocol != htons(ETH_P_IP) &&
+ skb->protocol != htons(ETH_P_IPV6)) {
+ err = 0;
+ goto error;
+ }
+
+ tun_key = &info->key;
+
+ /* Route lookup */
+ memset(&fl, 0, sizeof(fl));
+ fl.daddr = tun_key->u.ipv4.dst;
+ fl.saddr = tun_key->u.ipv4.src;
+ fl.flowi4_tos = RT_TOS(tun_key->tos);
+ fl.flowi4_mark = skb->mark;
+ fl.flowi4_proto = IPPROTO_UDP;
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto error;
+ }
+
+ min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+ + sizeof(struct iphdr) + LISP_HLEN;
+
+ if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
+ int head_delta = SKB_DATA_ALIGN(min_headroom -
+ skb_headroom(skb) +
+ 16);
+
+ err = pskb_expand_head(skb, max_t(int, head_delta, 0),
+ 0, GFP_ATOMIC);
+ if (unlikely(err))
+ goto err_free_rt;
+ }
+
+ /* Reset l2 headers. */
+ skb_pull(skb, network_offset);
+ skb_reset_mac_header(skb);
+ vlan_set_tci(skb, 0);
+
+ skb = udp_tunnel_handle_offloads(skb, false, 0, false);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
+ skb = NULL;
+ goto err_free_rt;
+ }
+
+ src_port = htons(get_src_port(net, skb));
+ dst_port = lisp_dev->dst_port;
+
+ lisp_build_header(skb, tun_key);
+
+ skb->ignore_df = 1;
+
+ ovs_skb_set_inner_protocol(skb, skb->protocol);
+
+ df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ err = udp_tunnel_xmit_skb(rt, lisp_dev->sock->sk, skb,
+ fl.saddr, tun_key->u.ipv4.dst,
+ tun_key->tos, tun_key->ttl,
+ df, src_port, dst_port, false, true);
+
+ iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+ return NETDEV_TX_OK;
+
+err_free_rt:
+ ip_rt_put(rt);
+error:
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL(rpl_lisp_xmit);
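
Back-of-envelope for the min_headroom computation in rpl_lisp_xmit(), assuming a plain Ethernet underlay and that LISP_HLEN covers the UDP plus LISP headers (8 + 8 = 16 bytes; both are assumptions, not spelled out in the lines shown):

    /* LL_RESERVED_SPACE(dev)  ~16  (14-byte Ethernet header, aligned)
     * rt_dst(rt).header_len     0
     * sizeof(struct iphdr)     20
     * LISP_HLEN                16
     * min_headroom            ~52 bytes
     * pskb_expand_head() then grows any deficit, rounded up by
     * SKB_DATA_ALIGN with 16 bytes of slack. */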
+
+#ifdef HAVE_DEV_TSTATS
+/* Setup stats when device is created */
+static int lisp_init(struct net_device *dev)
+{
+ dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void lisp_uninit(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+}
+#endif
+
+static struct socket *create_sock(struct net *net, bool ipv6,
+ __be16 port)
+{
+ struct socket *sock;
+ struct udp_port_cfg udp_conf;
+ int err;
+
+ memset(&udp_conf, 0, sizeof(udp_conf));
+
+ if (ipv6) {
+ udp_conf.family = AF_INET6;
+ } else {
+ udp_conf.family = AF_INET;
+ udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+ }
+
+ udp_conf.local_udp_port = port;
+
+ /* Open UDP socket */
+ err = udp_sock_create(net, &udp_conf, &sock);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ return sock;
+}
+
+static int lisp_open(struct net_device *dev)
+{
+ struct lisp_dev *lisp = netdev_priv(dev);
+ struct udp_tunnel_sock_cfg tunnel_cfg;
+ struct net *net = lisp->net;
+
+ lisp->sock = create_sock(net, false, lisp->dst_port);
+ if (IS_ERR(lisp->sock))
+ return PTR_ERR(lisp->sock);
+
+ /* Mark socket as an encapsulation socket */
+ tunnel_cfg.sk_user_data = dev;
+ tunnel_cfg.encap_type = 1;
+ tunnel_cfg.encap_rcv = lisp_rcv;
+ tunnel_cfg.encap_destroy = NULL;
+ setup_udp_tunnel_sock(net, lisp->sock, &tunnel_cfg);
+ return 0;
+}
+
+static int lisp_stop(struct net_device *dev)
+{
+ struct lisp_dev *lisp = netdev_priv(dev);
+
+ udp_tunnel_sock_release(lisp->sock);
+ lisp->sock = NULL;
+ return 0;
+}
+
+static netdev_tx_t lisp_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+#ifdef HAVE_METADATA_DST
+ return rpl_lisp_xmit(skb);
+#else
+ /* Drop all packets coming from the networking stack; the OVS CB
+ * is not initialized for these packets.
+ */
+
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+#endif
+}
+
+static const struct net_device_ops lisp_netdev_ops = {
+#ifdef HAVE_DEV_TSTATS
+ .ndo_init = lisp_init,
+ .ndo_uninit = lisp_uninit,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+#endif
+ .ndo_open = lisp_open,
+ .ndo_stop = lisp_stop,
+ .ndo_start_xmit = lisp_dev_xmit,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = eth_mac_addr,
+};
+
+static void lisp_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strlcpy(drvinfo->version, LISP_NETDEV_VER, sizeof(drvinfo->version));
+ strlcpy(drvinfo->driver, "lisp", sizeof(drvinfo->driver));
+}
+
+static const struct ethtool_ops lisp_ethtool_ops = {
+ .get_drvinfo = lisp_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+/* Inform udev that this is a virtual tunnel endpoint */
+static struct device_type lisp_type = {
+ .name = "lisp",
+};
+
+/* Initialize the device structure. */
+static void lisp_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->netdev_ops = &lisp_netdev_ops;
+ dev->ethtool_ops = &lisp_ethtool_ops;
+ dev->destructor = free_netdev;
+
+ SET_NETDEV_DEVTYPE(dev, &lisp_type);
+
+ dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ dev->features |= NETIF_F_RXCSUM;
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+#endif
+#ifdef HAVE_METADATA_DST
+ netif_keep_dst(dev);
+#endif
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
+ eth_hw_addr_random(dev);
+}
+
+static const struct nla_policy lisp_policy[IFLA_LISP_MAX + 1] = {
+ [IFLA_LISP_PORT] = { .type = NLA_U16 },
+};
+
+static int lisp_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ return 0;
+}
+
+static struct lisp_dev *find_dev(struct net *net, __be16 dst_port)
+{
+ struct lisp_net *ln = net_generic(net, lisp_net_id);
+ struct lisp_dev *dev;
+
+ list_for_each_entry(dev, &ln->lisp_list, next) {
+ if (dev->dst_port == dst_port)
+ return dev;
+ }
+ return NULL;
+}
+
+static int lisp_configure(struct net *net, struct net_device *dev,
+ __be16 dst_port)
+{
+ struct lisp_net *ln = net_generic(net, lisp_net_id);
+ struct lisp_dev *lisp = netdev_priv(dev);
+ int err;
+
+ lisp->net = net;
+ lisp->dev = dev;
+
+ lisp->dst_port = dst_port;
+
+ if (find_dev(net, dst_port))
+ return -EBUSY;
+
+ err = register_netdevice(dev);
+ if (err)
+ return err;
+
+ list_add(&lisp->next, &ln->lisp_list);
+ return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static int lisp_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+#else
+static int lisp_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+
+{
+ struct net *net = &init_net;
+#endif
+ __be16 dst_port = htons(LISP_UDP_PORT);
+
+ if (data[IFLA_LISP_PORT])
+ dst_port = nla_get_be16(data[IFLA_LISP_PORT]);
+
+ return lisp_configure(net, dev, dst_port);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static void lisp_dellink(struct net_device *dev, struct list_head *head)
+#else
+static void lisp_dellink(struct net_device *dev)
+#endif
+{
+ struct lisp_dev *lisp = netdev_priv(dev);
+
+ list_del(&lisp->next);
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t lisp_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(__be32)); /* IFLA_LISP_PORT */
+}
+
+static int lisp_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct lisp_dev *lisp = netdev_priv(dev);
+
+ if (nla_put_be16(skb, IFLA_LISP_PORT, lisp->dst_port))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
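
A small nit worth flagging rather than silently changing: lisp_get_size() above reserves nla_total_size(sizeof(__be32)) while lisp_fill_info() puts a __be16 (the STT copy later in this diff does the same). Over-reserving is benign with netlink; assuming the usual NLA_HDRLEN of 4 and 4-byte attribute alignment, the arithmetic happens to come out identical:

    /* nla_total_size(n) = NLA_ALIGN(NLA_HDRLEN + n):
     *   nla_total_size(sizeof(__be16)) = NLA_ALIGN(4 + 2) = 8
     *   nla_total_size(sizeof(__be32)) = NLA_ALIGN(4 + 4) = 8 */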
+
+static struct rtnl_link_ops lisp_link_ops __read_mostly = {
+ .kind = "lisp",
+ .maxtype = IFLA_LISP_MAX,
+ .policy = lisp_policy,
+ .priv_size = sizeof(struct lisp_dev),
+ .setup = lisp_setup,
+ .validate = lisp_validate,
+ .newlink = lisp_newlink,
+ .dellink = lisp_dellink,
+ .get_size = lisp_get_size,
+ .fill_info = lisp_fill_info,
+};
+
+struct net_device *rpl_lisp_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port)
+{
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ int err;
+
+ memset(tb, 0, sizeof(tb));
+ dev = rtnl_create_link(net, (char *) name, name_assign_type,
+ &lisp_link_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ err = lisp_configure(net, dev, htons(dst_port));
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+ return dev;
+}
+EXPORT_SYMBOL_GPL(rpl_lisp_dev_create_fb);
+
+static int lisp_init_net(struct net *net)
+{
+ struct lisp_net *ln = net_generic(net, lisp_net_id);
+
+ INIT_LIST_HEAD(&ln->lisp_list);
+ return 0;
+}
+
+static void lisp_exit_net(struct net *net)
+{
+ struct lisp_net *ln = net_generic(net, lisp_net_id);
+ struct lisp_dev *lisp, *next;
+ struct net_device *dev, *aux;
+ LIST_HEAD(list);
+
+ rtnl_lock();
+
+ /* gather any lisp devices that were moved into this ns */
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &lisp_link_ops)
+ unregister_netdevice_queue(dev, &list);
+
+ list_for_each_entry_safe(lisp, next, &ln->lisp_list, next) {
+ /* If lisp->dev is in the same netns, it was already added
+ * to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(lisp->dev), net))
+ unregister_netdevice_queue(lisp->dev, &list);
+ }
+
+ /* unregister the devices gathered above */
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations lisp_net_ops = {
+ .init = lisp_init_net,
+ .exit = lisp_exit_net,
+ .id = &lisp_net_id,
+ .size = sizeof(struct lisp_net),
+};
+
+DEFINE_COMPAT_PNET_REG_FUNC(device)
+int rpl_lisp_init_module(void)
+{
+ int rc;
+
+ rc = register_pernet_subsys(&lisp_net_ops);
+ if (rc)
+ goto out1;
+
+ rc = rtnl_link_register(&lisp_link_ops);
+ if (rc)
+ goto out2;
+
+ pr_info("LISP tunneling driver\n");
+ return 0;
+out2:
+ unregister_pernet_subsys(&lisp_net_ops);
+out1:
+ return rc;
+}
+
+void rpl_lisp_cleanup_module(void)
+{
+ rtnl_link_unregister(&lisp_link_ops);
+ unregister_pernet_subsys(&lisp_net_ops);
+}
diff --git a/datapath/linux/compat/netdevice.c b/datapath/linux/compat/netdevice.c
index 483d665d8..e28b878ee 100644
--- a/datapath/linux/compat/netdevice.c
+++ b/datapath/linux/compat/netdevice.c
@@ -117,3 +117,122 @@ struct sk_buff *rpl__skb_gso_segment(struct sk_buff *skb,
EXPORT_SYMBOL_GPL(rpl__skb_gso_segment);
#endif /* OVS_USE_COMPAT_GSO_SEGMENTATION */
+
+#ifdef HAVE_UDP_OFFLOAD
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0)
+struct sk_buff **rpl_eth_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ struct sk_buff *p, **pp = NULL;
+ struct ethhdr *eh, *eh2;
+ unsigned int hlen, off_eth;
+ const struct packet_offload *ptype;
+ __be16 type;
+ int flush = 1;
+
+ off_eth = skb_gro_offset(skb);
+ hlen = off_eth + sizeof(*eh);
+ eh = skb_gro_header_fast(skb, off_eth);
+ if (skb_gro_header_hard(skb, hlen)) {
+ eh = skb_gro_header_slow(skb, hlen, off_eth);
+ if (unlikely(!eh))
+ goto out;
+ }
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ eh2 = (struct ethhdr *)(p->data + off_eth);
+ if (compare_ether_header(eh, eh2)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ type = eh->h_proto;
+
+ rcu_read_lock();
+ ptype = gro_find_receive_by_type(type);
+ if (ptype == NULL) {
+ flush = 1;
+ goto out_unlock;
+ }
+
+ skb_gro_pull(skb, sizeof(*eh));
+ skb_gro_postpull_rcsum(skb, eh, sizeof(*eh));
+ pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+int rpl_eth_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff);
+ __be16 type = eh->h_proto;
+ struct packet_offload *ptype;
+ int err = -ENOSYS;
+
+ if (skb->encapsulation)
+ skb_set_inner_mac_header(skb, nhoff);
+
+ rcu_read_lock();
+ ptype = gro_find_complete_by_type(type);
+ if (ptype != NULL)
+ err = ptype->callbacks.gro_complete(skb, nhoff +
+ sizeof(struct ethhdr));
+
+ rcu_read_unlock();
+ return err;
+}
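
These two backports pair up the way the VXLAN GRO path later in this diff consumes them: gro_receive decides whether the new skb can merge with a held one (clearing same_flow on a header mismatch, raising flush on trouble), and gro_complete patches up the headers of the resulting super-packet. Sketch of the calling convention, with nhoff illustrative:

    pp  = eth_gro_receive(head, skb);   /* try to aggregate with held skbs */
    err = eth_gro_complete(skb, nhoff); /* finalize the merged packet      */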
+
+#endif
+#endif /* HAVE_UDP_OFFLOAD */
+
+#ifndef HAVE_RTNL_LINK_STATS64
+#undef dev_get_stats
+struct rtnl_link_stats64 *rpl_dev_get_stats(struct net_device *dev,
+ struct rtnl_link_stats64 *storage)
+{
+ const struct net_device_stats *stats = dev_get_stats(dev);
+
+#define copy(s) storage->s = stats->s
+
+ copy(rx_packets);
+ copy(tx_packets);
+ copy(rx_bytes);
+ copy(tx_bytes);
+ copy(rx_errors);
+ copy(tx_errors);
+ copy(rx_dropped);
+ copy(tx_dropped);
+ copy(multicast);
+ copy(collisions);
+
+ copy(rx_length_errors);
+ copy(rx_over_errors);
+ copy(rx_crc_errors);
+ copy(rx_frame_errors);
+ copy(rx_fifo_errors);
+ copy(rx_missed_errors);
+
+ copy(tx_aborted_errors);
+ copy(tx_carrier_errors);
+ copy(tx_fifo_errors);
+ copy(tx_heartbeat_errors);
+ copy(tx_window_errors);
+
+ copy(rx_compressed);
+ copy(tx_compressed);
+
+#undef copy
+ return storage;
+}
+#endif
diff --git a/datapath/linux/compat/skbuff-openvswitch.c b/datapath/linux/compat/skbuff-openvswitch.c
index fad1cc7d0..c46798df6 100644
--- a/datapath/linux/compat/skbuff-openvswitch.c
+++ b/datapath/linux/compat/skbuff-openvswitch.c
@@ -2,6 +2,7 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/if_vlan.h>
+#include <linux/kconfig.h>
#include "gso.h"
@@ -280,3 +281,34 @@ void rpl_kfree_skb_list(struct sk_buff *segs)
}
EXPORT_SYMBOL(rpl_kfree_skb_list);
#endif
+
+#ifndef HAVE_SKB_SCRUB_PACKET_XNET
+
+#define nf_reset_trace rpl_nf_reset_trace
+static void nf_reset_trace(struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE) || defined(CONFIG_NF_TABLES)
+ skb->nf_trace = 0;
+#endif
+}
+
+void rpl_skb_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->tstamp.tv64 = 0;
+ skb->pkt_type = PACKET_HOST;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ skb->skb_iif = 0;
+#endif
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ secpath_reset(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+
+ if (!xnet)
+ return;
+
+ skb_orphan(skb);
+ skb->mark = 0;
+}
+#endif
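
The xnet flag is what separates a packet merely re-entering the stack from one crossing network namespaces; only the latter is orphaned and has its mark cleared. The vxlan receive path later in this diff shows the intended call pattern:

    /* From vxlan_rcv() further down: scrub harder only when the tunnel
     * device and the underlay live in different namespaces. */
    skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));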
diff --git a/datapath/linux/compat/stt.c b/datapath/linux/compat/stt.c
index 0659c0b63..107aa2bb2 100644
--- a/datapath/linux/compat/stt.c
+++ b/datapath/linux/compat/stt.c
@@ -9,6 +9,7 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <asm/unaligned.h>
#include <linux/delay.h>
@@ -28,9 +29,11 @@
#include <linux/tcp.h>
#include <linux/workqueue.h>
+#include <net/dst_metadata.h>
#include <net/icmp.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
+#include <net/ip_tunnels.h>
#include <net/ip6_checksum.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
@@ -40,10 +43,29 @@
#include <net/udp.h>
#include "gso.h"
+#include "compat.h"
+
+#define STT_NETDEV_VER "0.1"
+#define STT_DST_PORT 7471
#ifdef OVS_STT
#define STT_VER 0
+/* Per-net STT port. On receive, STT reassembly can generate multiple
+ * packets: the first carries the outer tunnel header and the rest are
+ * inner packet segments with no STT header.
+ * @next: Linkage in the per-net list of STT ports.
+ * @sock: Fake TCP socket for the STT port.
+ * @dst_port: Destination TCP port the endpoint listens on.
+ */
+struct stt_dev {
+ struct net_device *dev;
+ struct net *net;
+ struct list_head next;
+ struct socket *sock;
+ __be16 dst_port;
+};
+
#define STT_CSUM_VERIFIED BIT(0)
#define STT_CSUM_PARTIAL BIT(1)
#define STT_PROTO_IPV4 BIT(2)
@@ -127,7 +149,8 @@ struct frag_skb_cb {
/* per-network namespace private data for this module */
struct stt_net {
- struct list_head sock_list;
+ struct list_head stt_list;
+ int n_tunnels;
};
static int stt_net_id;
@@ -144,14 +167,14 @@ static DEFINE_PER_CPU(u32, pkt_seq_counter);
static void clean_percpu(struct work_struct *work);
static DECLARE_DELAYED_WORK(clean_percpu_wq, clean_percpu);
-static struct stt_sock *stt_find_sock(struct net *net, __be16 port)
+static struct stt_dev *stt_find_sock(struct net *net, __be16 port)
{
struct stt_net *sn = net_generic(net, stt_net_id);
- struct stt_sock *stt_sock;
+ struct stt_dev *stt_dev;
- list_for_each_entry_rcu(stt_sock, &sn->sock_list, list) {
- if (inet_sk(stt_sock->sock->sk)->inet_sport == port)
- return stt_sock;
+ list_for_each_entry_rcu(stt_dev, &sn->stt_list, next) {
+ if (inet_sk(stt_dev->sock->sk)->inet_sport == port)
+ return stt_dev;
}
return NULL;
}
@@ -788,7 +811,6 @@ static int skb_list_xmit(struct rtable *rt, struct sk_buff *skb, __be32 src,
if (next)
dst_clone(&rt->dst);
- skb_clear_ovs_gso_cb(skb);
skb->next = NULL;
len += iptunnel_xmit(NULL, rt, skb, src, dst, IPPROTO_TCP,
tos, ttl, df, false);
@@ -835,7 +857,7 @@ static u8 skb_get_l4_proto(struct sk_buff *skb, __be16 l3_proto)
return 0;
}
-int rpl_stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
+static int stt_xmit_skb(struct sk_buff *skb, struct rtable *rt,
__be32 src, __be32 dst, __u8 tos,
__u8 ttl, __be16 df, __be16 src_port, __be16 dst_port,
__be64 tun_id)
@@ -906,7 +928,57 @@ err_free_rt:
kfree_skb(skb);
return ret;
}
-EXPORT_SYMBOL_GPL(rpl_stt_xmit_skb);
+
+netdev_tx_t ovs_stt_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct stt_dev *stt_dev = netdev_priv(dev);
+ struct net *net = stt_dev->net;
+ __be16 dport = inet_sk(stt_dev->sock->sk)->inet_sport;
+ struct ip_tunnel_key *tun_key;
+ struct ip_tunnel_info *tun_info;
+ struct rtable *rt;
+ struct flowi4 fl;
+ __be16 sport;
+ __be16 df;
+ int err;
+
+ tun_info = skb_tunnel_info(skb);
+ if (unlikely(!tun_info)) {
+ err = -EINVAL;
+ goto error;
+ }
+
+ tun_key = &tun_info->key;
+
+ /* Route lookup */
+ memset(&fl, 0, sizeof(fl));
+ fl.daddr = tun_key->u.ipv4.dst;
+ fl.saddr = tun_key->u.ipv4.src;
+ fl.flowi4_tos = RT_TOS(tun_key->tos);
+ fl.flowi4_mark = skb->mark;
+ fl.flowi4_proto = IPPROTO_TCP;
+ rt = ip_route_output_key(net, &fl);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto error;
+ }
+
+ df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+ sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+ skb->ignore_df = 1;
+
+ err = stt_xmit_skb(skb, rt, fl.saddr, tun_key->u.ipv4.dst,
+ tun_key->tos, tun_key->ttl,
+ df, sport, dport, tun_key->tun_id);
+ iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+ return NETDEV_TX_OK;
+error:
+ kfree_skb(skb);
+ dev->stats.tx_errors++;
+ return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL(ovs_stt_xmit);
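
On the udp_flow_src_port(net, skb, 1, USHRT_MAX, true) call above: the point is to derive the source port from a hash of the inner flow so that underlay ECMP/RSS spreads tunnelled flows across paths. A hedged simplification (the real helper also covers the no-flow-hash and non-Ethernet cases):

    static __be16 flow_src_port_sketch(struct sk_buff *skb, u32 min, u32 max)
    {
            u32 hash = skb_get_hash(skb);   /* inner-flow hash */

            /* Scale the 32-bit hash into [min, max]. */
            return htons(min + (u16)(((u64)hash * (max - min + 1)) >> 32));
    }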
static void free_frag(struct stt_percpu *stt_percpu,
struct pkt_frag *frag)
@@ -1213,7 +1285,41 @@ static bool set_offloads(struct sk_buff *skb)
return true;
}
-static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb)
+
+#ifndef HAVE_METADATA_DST
+static int __rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
+{
+ struct metadata_dst tun_dst;
+
+ ovs_ip_tun_rx_dst(&tun_dst.u.tun_info, skb, TUNNEL_KEY | TUNNEL_CSUM,
+ get_unaligned(&stt_hdr(skb)->key), 0);
+ tun_dst.u.tun_info.key.tp_src = tcp_hdr(skb)->source;
+ tun_dst.u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
+
+ ovs_ip_tunnel_rcv(stt_dev->dev, skb, &tun_dst);
+ return 0;
+}
+#else
+static int __rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
+{
+ struct metadata_dst *tun_dst;
+ __be16 flags;
+ __be64 tun_id;
+
+ flags = TUNNEL_KEY | TUNNEL_CSUM;
+ tun_id = get_unaligned(&stt_hdr(skb)->key);
+ tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0);
+ if (!tun_dst)
+ return -ENOMEM;
+ tun_dst->u.tun_info.key.tp_src = tcp_hdr(skb)->source;
+ tun_dst->u.tun_info.key.tp_dst = tcp_hdr(skb)->dest;
+
+ ovs_ip_tunnel_rcv(stt_dev->dev, skb, tun_dst);
+ return 0;
+}
+
+#endif
+static void stt_rcv(struct stt_dev *stt_dev, struct sk_buff *skb)
{
int err;
@@ -1242,11 +1348,14 @@ static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb)
if (skb_shinfo(skb)->frag_list && try_to_segment(skb))
goto drop;
- stt_sock->rcv(stt_sock, skb);
+ err = __rcv(stt_dev, skb);
+ if (err)
+ goto drop;
return;
drop:
/* Consume bad packet */
kfree_skb_list(skb);
+ stt_dev->dev->stats.rx_errors++;
}
static void tcp_sock_release(struct socket *sock)
@@ -1324,15 +1433,27 @@ static void clean_percpu(struct work_struct *work)
#define FIRST_PARAM unsigned int hooknum
#endif
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0)
+#ifdef HAVE_NF_HOOK_STATE
+#if RHEL_RELEASE_CODE > RHEL_RELEASE_VERSION(7,0)
+/* RHEL nfhook hacks. */
+#ifndef __GENKSYMS__
+#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
+ const struct nf_hook_state *state
+#else
+#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
+ int (*okfn)(struct sk_buff *)
+#endif
+#else
#define LAST_PARAM const struct nf_hook_state *state
+#endif
#else
-#define LAST_PARAM const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)
+#define LAST_PARAM const struct net_device *in, const struct net_device *out, \
+ int (*okfn)(struct sk_buff *)
#endif
static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
{
- struct stt_sock *stt_sock;
+ struct stt_dev *stt_dev;
int ip_hdr_len;
if (ip_hdr(skb)->protocol != IPPROTO_TCP)
@@ -1344,12 +1465,12 @@ static unsigned int nf_ip_hook(FIRST_PARAM, struct sk_buff *skb, LAST_PARAM)
skb_set_transport_header(skb, ip_hdr_len);
- stt_sock = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest);
- if (!stt_sock)
+ stt_dev = stt_find_sock(dev_net(skb->dev), tcp_hdr(skb)->dest);
+ if (!stt_dev)
return NF_ACCEPT;
__skb_pull(skb, ip_hdr_len + sizeof(struct tcphdr));
- stt_rcv(stt_sock, skb);
+ stt_rcv(stt_dev, skb);
return NF_STOLEN;
}
@@ -1361,8 +1482,9 @@ static struct nf_hook_ops nf_hook_ops __read_mostly = {
.priority = INT_MAX,
};
-static int stt_start(void)
+static int stt_start(struct net *net)
{
+ struct stt_net *sn = net_generic(net, stt_net_id);
int err;
int i;
@@ -1401,12 +1523,25 @@ static int stt_start(void)
if (err)
goto free_percpu;
}
+ schedule_clean_percpu();
+ n_tunnels++;
+
+ if (sn->n_tunnels) {
+ sn->n_tunnels++;
+ return 0;
+ }
+#ifdef HAVE_NF_REGISTER_NET_HOOK
+ /* On kernels that support per-net netfilter hooks,
+ * nf_register_hook() takes the rtnl lock, which would deadlock
+ * STT device creation; therefore use the newer per-net API.
+ */
+ err = nf_register_net_hook(net, &nf_hook_ops);
+#else
err = nf_register_hook(&nf_hook_ops);
+#endif
if (err)
goto free_percpu;
-
- schedule_clean_percpu();
- n_tunnels++;
+ sn->n_tunnels++;
return 0;
free_percpu:
@@ -1423,17 +1558,26 @@ error:
return err;
}
-static void stt_cleanup(void)
+static void stt_cleanup(struct net *net)
{
+ struct stt_net *sn = net_generic(net, stt_net_id);
int i;
+ sn->n_tunnels--;
+ if (sn->n_tunnels)
+ goto out;
+#ifdef HAVE_NF_REGISTER_NET_HOOK
+ nf_unregister_net_hook(net, &nf_hook_ops);
+#else
+ nf_unregister_hook(&nf_hook_ops);
+#endif
+
+out:
n_tunnels--;
if (n_tunnels)
return;
cancel_delayed_work_sync(&clean_percpu_wq);
- nf_unregister_hook(&nf_hook_ops);
-
for_each_possible_cpu(i) {
struct stt_percpu *stt_percpu = per_cpu_ptr(stt_percpu_data, i);
int j;
@@ -1451,102 +1595,306 @@ static void stt_cleanup(void)
free_percpu(stt_percpu_data);
}
-static struct stt_sock *stt_socket_create(struct net *net, __be16 port,
- stt_rcv_t *rcv, void *data)
+static netdev_tx_t stt_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
- struct stt_net *sn = net_generic(net, stt_net_id);
- struct stt_sock *stt_sock;
- struct socket *sock;
+#ifdef HAVE_METADATA_DST
+ return ovs_stt_xmit(skb);
+#else
+ /* Drop all packets coming from the networking stack; the OVS CB
+ * is not initialized for these packets.
+ */
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+#endif
+}
+
+/* Setup stats when device is created */
+static int stt_init(struct net_device *dev)
+{
+ dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void stt_uninit(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+}
+
+static int stt_open(struct net_device *dev)
+{
+ struct stt_dev *stt = netdev_priv(dev);
+ struct net *net = stt->net;
int err;
- stt_sock = kzalloc(sizeof(*stt_sock), GFP_KERNEL);
- if (!stt_sock)
- return ERR_PTR(-ENOMEM);
+ err = stt_start(net);
+ if (err)
+ return err;
- err = tcp_sock_create4(net, port, &sock);
- if (err) {
- kfree(stt_sock);
- return ERR_PTR(err);
- }
+ err = tcp_sock_create4(net, stt->dst_port, &stt->sock);
+ if (err)
+ return err;
+ return 0;
+}
- stt_sock->sock = sock;
- stt_sock->rcv = rcv;
- stt_sock->rcv_data = data;
+static int stt_stop(struct net_device *dev)
+{
+ struct stt_dev *stt_dev = netdev_priv(dev);
+ struct net *net = stt_dev->net;
- list_add_rcu(&stt_sock->list, &sn->sock_list);
+ tcp_sock_release(stt_dev->sock);
+ stt_dev->sock = NULL;
+ stt_cleanup(net);
+ return 0;
+}
- return stt_sock;
+static const struct net_device_ops stt_netdev_ops = {
+ .ndo_init = stt_init,
+ .ndo_uninit = stt_uninit,
+ .ndo_open = stt_open,
+ .ndo_stop = stt_stop,
+ .ndo_start_xmit = stt_dev_xmit,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+ .ndo_change_mtu = eth_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = eth_mac_addr,
+};
+
+static void stt_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strlcpy(drvinfo->version, STT_NETDEV_VER, sizeof(drvinfo->version));
+ strlcpy(drvinfo->driver, "stt", sizeof(drvinfo->driver));
}
-static void __stt_sock_release(struct stt_sock *stt_sock)
+static const struct ethtool_ops stt_ethtool_ops = {
+ .get_drvinfo = stt_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+/* Inform udev that this is a virtual tunnel endpoint */
+static struct device_type stt_type = {
+ .name = "stt",
+};
+
+/* Initialize the device structure. */
+static void stt_setup(struct net_device *dev)
{
- list_del_rcu(&stt_sock->list);
- tcp_sock_release(stt_sock->sock);
- kfree_rcu(stt_sock, rcu);
+ ether_setup(dev);
+
+ dev->netdev_ops = &stt_netdev_ops;
+ dev->ethtool_ops = &stt_ethtool_ops;
+ dev->destructor = free_netdev;
+
+ SET_NETDEV_DEVTYPE(dev, &stt_type);
+
+ dev->features |= NETIF_F_LLTX | NETIF_F_NETNS_LOCAL;
+ dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ dev->features |= NETIF_F_RXCSUM;
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+
+#ifdef HAVE_METADATA_DST
+ netif_keep_dst(dev);
+#endif
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
+ eth_hw_addr_random(dev);
}
-struct stt_sock *rpl_stt_sock_add(struct net *net, __be16 port,
- stt_rcv_t *rcv, void *data)
+static const struct nla_policy stt_policy[IFLA_STT_MAX + 1] = {
+ [IFLA_STT_PORT] = { .type = NLA_U16 },
+};
+
+static int stt_validate(struct nlattr *tb[], struct nlattr *data[])
{
- struct stt_sock *stt_sock;
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
+ return -EINVAL;
+
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
+ return -EADDRNOTAVAIL;
+ }
+
+ return 0;
+}
+
+static struct stt_dev *find_dev(struct net *net, __be16 dst_port)
+{
+ struct stt_net *sn = net_generic(net, stt_net_id);
+ struct stt_dev *dev;
+
+ list_for_each_entry(dev, &sn->stt_list, next) {
+ if (dev->dst_port == dst_port)
+ return dev;
+ }
+ return NULL;
+}
+
+static int stt_configure(struct net *net, struct net_device *dev,
+ __be16 dst_port)
+{
+ struct stt_net *sn = net_generic(net, stt_net_id);
+ struct stt_dev *stt = netdev_priv(dev);
int err;
- err = stt_start();
+ stt->net = net;
+ stt->dev = dev;
+
+ stt->dst_port = dst_port;
+
+ if (find_dev(net, dst_port))
+ return -EBUSY;
+
+ err = register_netdevice(dev);
if (err)
- return ERR_PTR(err);
+ return err;
- mutex_lock(&stt_mutex);
- rcu_read_lock();
- stt_sock = stt_find_sock(net, port);
- rcu_read_unlock();
- if (stt_sock)
- stt_sock = ERR_PTR(-EBUSY);
- else
- stt_sock = stt_socket_create(net, port, rcv, data);
+ list_add(&stt->next, &sn->stt_list);
+ return 0;
+}
+
+static int stt_newlink(struct net *net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ __be16 dst_port = htons(STT_DST_PORT);
+
+ if (data[IFLA_STT_PORT])
+ dst_port = nla_get_be16(data[IFLA_STT_PORT]);
+
+ return stt_configure(net, dev, dst_port);
+}
+
+static void stt_dellink(struct net_device *dev, struct list_head *head)
+{
+ struct stt_dev *stt = netdev_priv(dev);
- mutex_unlock(&stt_mutex);
+ list_del(&stt->next);
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t stt_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(__be32)); /* IFLA_STT_PORT */
+}
+
+static int stt_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct stt_dev *stt = netdev_priv(dev);
+
+ if (nla_put_be16(skb, IFLA_STT_PORT, stt->dst_port))
+ goto nla_put_failure;
- if (IS_ERR(stt_sock))
- stt_cleanup();
+ return 0;
- return stt_sock;
+nla_put_failure:
+ return -EMSGSIZE;
}
-EXPORT_SYMBOL_GPL(rpl_stt_sock_add);
-void rpl_stt_sock_release(struct stt_sock *stt_sock)
+static struct rtnl_link_ops stt_link_ops __read_mostly = {
+ .kind = "stt",
+ .maxtype = IFLA_STT_MAX,
+ .policy = stt_policy,
+ .priv_size = sizeof(struct stt_dev),
+ .setup = stt_setup,
+ .validate = stt_validate,
+ .newlink = stt_newlink,
+ .dellink = stt_dellink,
+ .get_size = stt_get_size,
+ .fill_info = stt_fill_info,
+};
+
+struct net_device *ovs_stt_dev_create_fb(struct net *net, const char *name,
+ u8 name_assign_type, u16 dst_port)
{
- mutex_lock(&stt_mutex);
- if (stt_sock) {
- __stt_sock_release(stt_sock);
- stt_cleanup();
+ struct nlattr *tb[IFLA_MAX + 1];
+ struct net_device *dev;
+ int err;
+
+ memset(tb, 0, sizeof(tb));
+ dev = rtnl_create_link(net, (char *) name, name_assign_type,
+ &stt_link_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ err = stt_configure(net, dev, htons(dst_port));
+ if (err) {
+ free_netdev(dev);
+ return ERR_PTR(err);
}
- mutex_unlock(&stt_mutex);
+ return dev;
}
-EXPORT_SYMBOL_GPL(rpl_stt_sock_release);
+EXPORT_SYMBOL_GPL(ovs_stt_dev_create_fb);
static int stt_init_net(struct net *net)
{
struct stt_net *sn = net_generic(net, stt_net_id);
- INIT_LIST_HEAD(&sn->sock_list);
+ INIT_LIST_HEAD(&sn->stt_list);
return 0;
}
+static void stt_exit_net(struct net *net)
+{
+ struct stt_net *sn = net_generic(net, stt_net_id);
+ struct stt_dev *stt, *next;
+ struct net_device *dev, *aux;
+ LIST_HEAD(list);
+
+ rtnl_lock();
+
+ /* gather any stt devices that were moved into this ns */
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &stt_link_ops)
+ unregister_netdevice_queue(dev, &list);
+
+ list_for_each_entry_safe(stt, next, &sn->stt_list, next) {
+ /* If stt->dev is in the same netns, it was already added
+ * to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(stt->dev), net))
+ unregister_netdevice_queue(stt->dev, &list);
+ }
+
+ /* unregister the devices gathered above */
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
static struct pernet_operations stt_net_ops = {
.init = stt_init_net,
+ .exit = stt_exit_net,
.id = &stt_net_id,
.size = sizeof(struct stt_net),
};
-int ovs_stt_init_module(void)
+int stt_init_module(void)
{
- return register_pernet_subsys(&stt_net_ops);
+ int rc;
+
+ rc = register_pernet_subsys(&stt_net_ops);
+ if (rc)
+ goto out1;
+
+ rc = rtnl_link_register(&stt_link_ops);
+ if (rc)
+ goto out2;
+
+ pr_info("STT tunneling driver\n");
+ return 0;
+out2:
+ unregister_pernet_subsys(&stt_net_ops);
+out1:
+ return rc;
}
-EXPORT_SYMBOL_GPL(ovs_stt_init_module);
-void ovs_stt_cleanup_module(void)
+void stt_cleanup_module(void)
{
+ rtnl_link_unregister(&stt_link_ops);
unregister_pernet_subsys(&stt_net_ops);
}
-EXPORT_SYMBOL_GPL(ovs_stt_cleanup_module);
#endif
diff --git a/datapath/linux/compat/udp_tunnel.c b/datapath/linux/compat/udp_tunnel.c
index 19a1ea562..f72e64563 100644
--- a/datapath/linux/compat/udp_tunnel.c
+++ b/datapath/linux/compat/udp_tunnel.c
@@ -1,6 +1,6 @@
#include <linux/version.h>
-#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0)
+#ifndef HAVE_METADATA_DST
#include <linux/module.h>
#include <linux/errno.h>
@@ -12,6 +12,9 @@
#include <net/udp.h>
#include <net/udp_tunnel.h>
#include <net/net_namespace.h>
+#include <net/ip6_checksum.h>
+#include <net/ip6_tunnel.h>
+
int rpl_udp_sock_create(struct net *net, struct udp_port_cfg *cfg,
struct socket **sockp)
@@ -168,4 +171,97 @@ void rpl_udp_tunnel_sock_release(struct socket *sock)
}
EXPORT_SYMBOL_GPL(rpl_udp_tunnel_sock_release);
-#endif /* Linux version < 4.0 */
+#if IS_ENABLED(CONFIG_IPV6)
+
+#define udp_v6_check rpl_udp_v6_check
+static __sum16 udp_v6_check(int len,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __wsum base)
+{
+ return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base);
+}
+
+#define udp6_set_csum rpl_udp6_set_csum
+static void udp6_set_csum(bool nocheck, struct sk_buff *skb,
+ const struct in6_addr *saddr,
+ const struct in6_addr *daddr, int len)
+{
+ struct udphdr *uh = udp_hdr(skb);
+
+ if (nocheck)
+ uh->check = 0;
+ else if (skb_is_gso(skb))
+ uh->check = ~udp_v6_check(len, saddr, daddr, 0);
+ else if (skb_dst(skb) && skb_dst(skb)->dev &&
+ (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
+ uh->check = ~udp_v6_check(len, saddr, daddr, 0);
+ } else {
+ __wsum csum;
+
+ BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
+
+ uh->check = 0;
+ csum = skb_checksum(skb, 0, len, 0);
+ uh->check = udp_v6_check(len, saddr, daddr, csum);
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ }
+}
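
Summarizing the decision ladder in udp6_set_csum() above, top to bottom:

    /* nocheck             -> check = 0 (zero-checksum UDP, RFC 6935/6936)
     * GSO skb             -> seed ~csum; segmentation fills it in later
     * device does v6 csum -> CHECKSUM_PARTIAL, hardware finishes the sum
     * otherwise           -> full software checksum computed right here */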
+
+#define ip6_flow_hdr rpl_ip6_flow_hdr
+static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass,
+ __be32 flowlabel)
+{
+ *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel;
+}
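
The magic constant in ip6_flow_hdr() packs the first 32-bit word of an IPv6 header: version (4 bits), traffic class (8 bits), flow label (20 bits). A worked example:

    /* tclass = 0x28, flowlabel = 0:
     *   htonl(0x60000000 | (0x28 << 20)) -> 0x62800000 on the wire,
     * version 6 in bits 31..28, traffic class 0x28 in bits 27..20. */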
+
+int rpl_udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev, struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ __u8 prio, __u8 ttl, __be16 src_port,
+ __be16 dst_port, bool nocheck)
+{
+ struct udphdr *uh;
+ struct ipv6hdr *ip6h;
+
+ __skb_push(skb, sizeof(*uh));
+ skb_reset_transport_header(skb);
+ uh = udp_hdr(skb);
+
+ uh->dest = dst_port;
+ uh->source = src_port;
+
+ uh->len = htons(skb->len);
+
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+ IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED
+ | IPSKB_REROUTED);
+ skb_dst_set(skb, dst);
+
+ udp6_set_csum(nocheck, skb, saddr, daddr, skb->len);
+
+ __skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, prio, htonl(0));
+ ip6h->payload_len = htons(skb->len);
+ ip6h->nexthdr = IPPROTO_UDP;
+ ip6h->hop_limit = ttl;
+ ip6h->daddr = *daddr;
+ ip6h->saddr = *saddr;
+
+ ip6tunnel_xmit(sk, skb, dev);
+ return 0;
+}
+#endif
+#endif
diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
index fd454ae29..4076a2fd3 100644
--- a/datapath/linux/compat/vxlan.c
+++ b/datapath/linux/compat/vxlan.c
@@ -1,25 +1,13 @@
/*
- * Copyright (c) 2007-2013 Nicira, Inc.
+ * VXLAN: Virtual eXtensible Local Area Network
*
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
+ * Copyright (c) 2012-2013 Vyatta Inc.
*
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
- *
- * This code is derived from kernel vxlan module.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
*/
-#include <linux/version.h>
-
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/kernel.h>
@@ -39,10 +27,10 @@
#include <linux/if_vlan.h>
#include <linux/hash.h>
#include <linux/ethtool.h>
+#include <linux/netdev_features.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/ip.h>
-#include <net/gre.h>
#include <net/ip_tunnels.h>
#include <net/icmp.h>
#include <net/udp.h>
@@ -54,27 +42,878 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/vxlan.h>
+#include <net/protocol.h>
+#include <net/udp_tunnel.h>
+#include <net/ip6_route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/ip6_tunnel.h>
+#include <net/ip6_checksum.h>
+#endif
+#include <net/dst_metadata.h>
-#include "compat.h"
-#include "datapath.h"
+#ifndef HAVE_METADATA_DST
#include "gso.h"
-#include "vlan.h"
+#include "vport-netdev.h"
+
+#define VXLAN_VERSION "0.1"
+
+#define PORT_HASH_BITS 8
+#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
+#define FDB_AGE_DEFAULT 300 /* 5 min */
+#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */
+
+#ifndef NTF_SELF
+#define NTF_SELF 0x02
+#endif
+
+/* UDP port for VXLAN traffic.
+ * The IANA assigned port is 4789, but the Linux default is 8472
+ * for compatibility with early adopters.
+ */
+static unsigned short vxlan_port __read_mostly = 8472;
+module_param_named(udp_port, vxlan_port, ushort, 0444);
+MODULE_PARM_DESC(udp_port, "Destination UDP port");
+
+static int vxlan_net_id;
+static struct rtnl_link_ops vxlan_link_ops;
-#ifndef USE_UPSTREAM_VXLAN
+static const u8 all_zeros_mac[ETH_ALEN];
-/* VXLAN protocol header */
-struct vxlanhdr {
- __be32 vx_flags;
- __be32 vx_vni;
+static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
+ bool no_share, u32 flags);
+
+/* per-network namespace private data for this module */
+struct vxlan_net {
+ struct list_head vxlan_list;
+ struct hlist_head sock_list[PORT_HASH_SIZE];
+ spinlock_t sock_lock;
+};
+
+/* Forwarding table entry */
+struct vxlan_fdb {
+ struct hlist_node hlist; /* linked list of entries */
+ struct rcu_head rcu;
+ unsigned long updated; /* jiffies */
+ unsigned long used;
+ struct list_head remotes;
+ u8 eth_addr[ETH_ALEN];
+ u16 state; /* see ndm_state */
+ u8 flags; /* see ndm_flags */
};
+/* salt for hash table */
+static u32 vxlan_salt __read_mostly;
+static struct workqueue_struct *vxlan_wq;
+
+static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
+{
+ return vs->flags & VXLAN_F_COLLECT_METADATA ||
+ ip_tunnel_collect_metadata();
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static inline
+bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
+{
+ if (a->sa.sa_family != b->sa.sa_family)
+ return false;
+ if (a->sa.sa_family == AF_INET6)
+ return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
+ else
+ return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+}
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+ if (ipa->sa.sa_family == AF_INET6)
+ return ipv6_addr_any(&ipa->sin6.sin6_addr);
+ else
+ return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+ if (ipa->sa.sa_family == AF_INET6)
+ return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
+ else
+ return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+#else /* !CONFIG_IPV6 */
+
+static inline
+bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
+{
+ return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+}
+
+static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
+{
+ return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+}
+
+static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
+{
+ return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+}
+
+#endif
+
+/* Virtual Network hash table head */
+static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
+{
+ return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
+}
+
+/* Socket hash table head */
+static inline struct hlist_head *vs_head(struct net *net, __be16 port)
+{
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+ return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
+}
+
+/* First remote destination for a forwarding entry.
+ * Guaranteed to be non-NULL because remotes are never deleted.
+ */
+static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
+{
+ return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
+}
+
+static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
+{
+ return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
+}
+
+/* Find a VXLAN socket based on network namespace, address family,
+ * UDP port, and the set of enabled unshareable flags.
+ */
+static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
+ __be16 port, u32 flags)
+{
+ struct vxlan_sock *vs;
+
+ flags &= VXLAN_F_RCV_FLAGS;
+
+ hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
+ if (inet_sport(vs->sock->sk) == port &&
+ vxlan_get_sk_family(vs) == family &&
+ vs->flags == flags)
+ return vs;
+ }
+ return NULL;
+}
+
+static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id)
+{
+ struct vxlan_dev *vxlan;
+
+ hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
+ if (vxlan->default_dst.remote_vni == id)
+ return vxlan;
+ }
+
+ return NULL;
+}
+
+/* Look up VNI in a per net namespace table */
+static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id,
+ sa_family_t family, __be16 port,
+ u32 flags)
+{
+ struct vxlan_sock *vs;
+
+ vs = vxlan_find_sock(net, family, port, flags);
+ if (!vs)
+ return NULL;
+
+ return vxlan_vs_find_vni(vs, id);
+}
+
+/* Fill in neighbour message in skbuff. */
+static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
+ const struct vxlan_fdb *fdb,
+ u32 portid, u32 seq, int type, unsigned int flags,
+ const struct vxlan_rdst *rdst)
+{
+ return -EINVAL;
+}
+
+static inline size_t vxlan_nlmsg_size(void)
+{
+ return NLMSG_ALIGN(sizeof(struct ndmsg))
+ + nla_total_size(ETH_ALEN) /* NDA_LLADDR */
+ + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
+ + nla_total_size(sizeof(__be16)) /* NDA_PORT */
+ + nla_total_size(sizeof(__be32)) /* NDA_VNI */
+ + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
+ + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
+ + nla_total_size(sizeof(struct nda_cacheinfo));
+}
+
+static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
+ struct vxlan_rdst *rd, int type)
+{
+ struct net *net = dev_net(vxlan->dev);
+ struct sk_buff *skb;
+ int err = -ENOBUFS;
+
+ skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
+ if (skb == NULL)
+ goto errout;
+
+ err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
+ if (err < 0) {
+ /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
+ WARN_ON(err == -EMSGSIZE);
+ kfree_skb(skb);
+ goto errout;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+ return;
+errout:
+ if (err < 0)
+ rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
+}
+
+/* Hash Ethernet address */
+static u32 eth_hash(const unsigned char *addr)
+{
+ u64 value = get_unaligned((u64 *)addr);
+
+ /* only want 6 bytes */
+#ifdef __BIG_ENDIAN
+ value >>= 16;
+#else
+ value <<= 16;
+#endif
+ return hash_64(value, FDB_HASH_BITS);
+}
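
Why eth_hash() shifts by 16: get_unaligned() reads a full 8 bytes while a MAC is only 6, so two bytes of neighbouring memory ride along. A little-endian example, MAC 02:00:5e:00:00:01 followed by junk bytes J6 J7:

    /* value          = 0xJ7J6_0100_005e_0002   (LE load, junk on top)
     * value <<= 16  -> 0x0100_005e_0002_0000   (junk shifted out)
     * On big endian the junk lands in the low 16 bits, hence >>= 16. */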
+
+/* Hash chain to use for a given MAC address */
+static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
+ const u8 *mac)
+{
+ return &vxlan->fdb_head[eth_hash(mac)];
+}
+
+/* Look up Ethernet address in forwarding table */
+static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
+ const u8 *mac)
+{
+ struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
+ struct vxlan_fdb *f;
+
+ hlist_for_each_entry_rcu(f, head, hlist) {
+ if (ether_addr_equal(mac, f->eth_addr))
+ return f;
+ }
+
+ return NULL;
+}
+
+static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
+ const u8 *mac)
+{
+ struct vxlan_fdb *f;
+
+ f = __vxlan_find_mac(vxlan, mac);
+ if (f)
+ f->used = jiffies;
+
+ return f;
+}
+
+/* caller should hold vxlan->hash_lock */
+static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
+ union vxlan_addr *ip, __be16 port,
+ __u32 vni, __u32 ifindex)
+{
+ struct vxlan_rdst *rd;
+
+ list_for_each_entry(rd, &f->remotes, list) {
+ if (vxlan_addr_equal(&rd->remote_ip, ip) &&
+ rd->remote_port == port &&
+ rd->remote_vni == vni &&
+ rd->remote_ifindex == ifindex)
+ return rd;
+ }
+
+ return NULL;
+}
+
+/* Replace destination of unicast mac */
+static int vxlan_fdb_replace(struct vxlan_fdb *f,
+ union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex)
+{
+ struct vxlan_rdst *rd;
+
+ rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
+ if (rd)
+ return 0;
+
+ rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
+ if (!rd)
+ return 0;
+ rd->remote_ip = *ip;
+ rd->remote_port = port;
+ rd->remote_vni = vni;
+ rd->remote_ifindex = ifindex;
+ return 1;
+}
+
+/* Add/update destinations for multicast */
+static int vxlan_fdb_append(struct vxlan_fdb *f,
+ union vxlan_addr *ip, __be16 port, __u32 vni,
+ __u32 ifindex, struct vxlan_rdst **rdp)
+{
+ struct vxlan_rdst *rd;
+
+ rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
+ if (rd)
+ return 0;
+
+ rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
+ if (rd == NULL)
+ return -ENOBUFS;
+ rd->remote_ip = *ip;
+ rd->remote_port = port;
+ rd->remote_vni = vni;
+ rd->remote_ifindex = ifindex;
+
+ list_add_tail_rcu(&rd->list, &f->remotes);
+
+ *rdp = rd;
+ return 1;
+}
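
How these two helpers divide the work, per the dispatch in vxlan_fdb_create() below:

    /* NLM_F_REPLACE + unicast MAC       -> vxlan_fdb_replace()
     * NLM_F_APPEND + multicast/zero MAC -> vxlan_fdb_append()
     * i.e. a unicast entry keeps exactly one remote, while multicast
     * and all-zeros (default) entries may fan out to several. */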
+
+#ifdef HAVE_UDP_OFFLOAD
+#ifdef HAVE_NETIF_F_GSO_TUNNEL_REMCSUM
+static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
+ unsigned int off,
+ struct vxlanhdr *vh, size_t hdrlen,
+ u32 data, struct gro_remcsum *grc,
+ bool nopartial)
+{
+ size_t start, offset;
+
+ if (skb->remcsum_offload)
+ return vh;
+
+ if (!NAPI_GRO_CB(skb)->csum_valid)
+ return NULL;
+
+ start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
+ offset = start + ((data & VXLAN_RCO_UDP) ?
+ offsetof(struct udphdr, check) :
+ offsetof(struct tcphdr, check));
+
+ vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
+ start, offset, grc, nopartial);
+
+ skb->remcsum_offload = 1;
+
+ return vh;
+}
+#else
+static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
+ unsigned int off,
+ struct vxlanhdr *vh, size_t hdrlen,
+ u32 data, struct gro_remcsum *grc,
+ bool nopartial)
+{
+ return NULL;
+}
+#endif
+
+#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
+static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+#else
+static struct sk_buff **vxlan_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb,
+ struct udp_offload *uoff)
+#endif
+{
+#ifdef HAVE_UDP_OFFLOAD_ARG_UOFF
+ struct vxlan_sock *vs = container_of(uoff, struct vxlan_sock,
+ udp_offloads);
+#else
+ struct vxlan_sock *vs = NULL;
+#endif
+ struct sk_buff *p, **pp = NULL;
+ struct vxlanhdr *vh, *vh2;
+ unsigned int hlen, off_vx;
+ int flush = 1;
+ u32 flags;
+ struct gro_remcsum grc;
+
+ skb_gro_remcsum_init(&grc);
+
+ off_vx = skb_gro_offset(skb);
+ hlen = off_vx + sizeof(*vh);
+ vh = skb_gro_header_fast(skb, off_vx);
+ if (skb_gro_header_hard(skb, hlen)) {
+ vh = skb_gro_header_slow(skb, hlen, off_vx);
+ if (unlikely(!vh))
+ goto out;
+ }
+
+ skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));
+
+ flags = ntohl(vh->vx_flags);
+
+ if ((flags & VXLAN_HF_RCO) && vs && (vs->flags & VXLAN_F_REMCSUM_RX)) {
+
+ vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
+ ntohl(vh->vx_vni), &grc,
+ !!(vs->flags &
+ VXLAN_F_REMCSUM_NOPARTIAL));
+
+ if (!vh)
+ goto out;
+ }
+
+ skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */
+
+ flush = 0;
+
+ for (p = *head; p; p = p->next) {
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ vh2 = (struct vxlanhdr *)(p->data + off_vx);
+ if (vh->vx_flags != vh2->vx_flags ||
+ vh->vx_vni != vh2->vx_vni) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+ }
+
+ pp = eth_gro_receive(head, skb);
+
+out:
+ skb_gro_remcsum_cleanup(skb, &grc);
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+#ifndef HAVE_UDP_OFFLOAD_ARG_UOFF
+static int vxlan_gro_complete(struct sk_buff *skb, int nhoff)
+#else
+static int vxlan_gro_complete(struct sk_buff *skb, int nhoff,
+ struct udp_offload *uoff)
+#endif
+{
+ udp_tunnel_gro_complete(skb, nhoff);
+
+ return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
+}
+
+/* Notify netdevs that UDP port started listening */
+static void vxlan_notify_add_rx_port(struct vxlan_sock *vs)
+{
+ struct net_device *dev;
+ struct sock *sk = vs->sock->sk;
+ struct net *net = sock_net(sk);
+ sa_family_t sa_family = vxlan_get_sk_family(vs);
+ __be16 port = inet_sk(sk)->inet_sport;
+ int err;
+
+ if (sa_family == AF_INET) {
+ err = udp_add_offload(&vs->udp_offloads);
+ if (err)
+ pr_warn("vxlan: udp_add_offload failed with status %d\n", err);
+ }
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (dev->netdev_ops->ndo_add_vxlan_port)
+ dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family,
+ port);
+ }
+ rcu_read_unlock();
+}
+
+/* Notify netdevs that the UDP port is no longer listening */
+static void vxlan_notify_del_rx_port(struct vxlan_sock *vs)
+{
+ struct net_device *dev;
+ struct sock *sk = vs->sock->sk;
+ struct net *net = sock_net(sk);
+ sa_family_t sa_family = vxlan_get_sk_family(vs);
+ __be16 port = inet_sk(sk)->inet_sport;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (dev->netdev_ops->ndo_del_vxlan_port)
+ dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family,
+ port);
+ }
+ rcu_read_unlock();
+
+ if (sa_family == AF_INET)
+ udp_del_offload(&vs->udp_offloads);
+}
+#endif
+
+/* Add new entry to forwarding table -- assumes lock held */
+static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+ const u8 *mac, union vxlan_addr *ip,
+ __u16 state, __u16 flags,
+ __be16 port, __u32 vni, __u32 ifindex,
+ __u8 ndm_flags)
+{
+ struct vxlan_rdst *rd = NULL;
+ struct vxlan_fdb *f;
+ int notify = 0;
+
+ f = __vxlan_find_mac(vxlan, mac);
+ if (f) {
+ if (flags & NLM_F_EXCL) {
+ netdev_dbg(vxlan->dev,
+ "lost race to create %pM\n", mac);
+ return -EEXIST;
+ }
+ if (f->state != state) {
+ f->state = state;
+ f->updated = jiffies;
+ notify = 1;
+ }
+ if (f->flags != ndm_flags) {
+ f->flags = ndm_flags;
+ f->updated = jiffies;
+ notify = 1;
+ }
+ if ((flags & NLM_F_REPLACE)) {
+ /* Only change unicasts */
+ if (!(is_multicast_ether_addr(f->eth_addr) ||
+ is_zero_ether_addr(f->eth_addr))) {
+ notify |= vxlan_fdb_replace(f, ip, port, vni,
+ ifindex);
+ } else
+ return -EOPNOTSUPP;
+ }
+ if ((flags & NLM_F_APPEND) &&
+ (is_multicast_ether_addr(f->eth_addr) ||
+ is_zero_ether_addr(f->eth_addr))) {
+ int rc = vxlan_fdb_append(f, ip, port, vni, ifindex,
+ &rd);
+
+ if (rc < 0)
+ return rc;
+ notify |= rc;
+ }
+ } else {
+ if (!(flags & NLM_F_CREATE))
+ return -ENOENT;
+
+ if (vxlan->cfg.addrmax &&
+ vxlan->addrcnt >= vxlan->cfg.addrmax)
+ return -ENOSPC;
+
+ /* Disallow replace to add a multicast entry */
+ if ((flags & NLM_F_REPLACE) &&
+ (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
+ return -EOPNOTSUPP;
+
+ netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
+ f = kmalloc(sizeof(*f), GFP_ATOMIC);
+ if (!f)
+ return -ENOMEM;
+
+ notify = 1;
+ f->state = state;
+ f->flags = ndm_flags;
+ f->updated = f->used = jiffies;
+ INIT_LIST_HEAD(&f->remotes);
+ memcpy(f->eth_addr, mac, ETH_ALEN);
+
+ vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
+
+ ++vxlan->addrcnt;
+ hlist_add_head_rcu(&f->hlist,
+ vxlan_fdb_head(vxlan, mac));
+ }
+
+ if (notify) {
+ if (rd == NULL)
+ rd = first_remote_rtnl(f);
+ vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH);
+ }
+
+ return 0;
+}
+
+static void vxlan_fdb_free(struct rcu_head *head)
+{
+ struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
+ struct vxlan_rdst *rd, *nd;
+
+ list_for_each_entry_safe(rd, nd, &f->remotes, list)
+ kfree(rd);
+ kfree(f);
+}
+
+static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
+{
+ netdev_dbg(vxlan->dev,
+ "delete %pM\n", f->eth_addr);
+
+ --vxlan->addrcnt;
+ vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);
+
+ hlist_del_rcu(&f->hlist);
+ call_rcu(&f->rcu, vxlan_fdb_free);
+}
+
+/* Watch incoming packets to learn the mapping between Ethernet address
+ * and tunnel endpoint.
+ * Return true if the packet is bogus and should be dropped.
+ */
+static bool vxlan_snoop(struct net_device *dev,
+ union vxlan_addr *src_ip, const u8 *src_mac)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_fdb *f;
+
+ f = vxlan_find_mac(vxlan, src_mac);
+ if (likely(f)) {
+ struct vxlan_rdst *rdst = first_remote_rcu(f);
+
+ if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip)))
+ return false;
+
+ /* Don't migrate static entries, drop packets */
+ if (f->state & NUD_NOARP)
+ return true;
+
+ if (net_ratelimit())
+ netdev_info(dev,
+ "%pM migrated from %pIS to %pIS\n",
+ src_mac, &rdst->remote_ip.sa, &src_ip->sa);
+
+ rdst->remote_ip = *src_ip;
+ f->updated = jiffies;
+ vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH);
+ } else {
+ /* learned new entry */
+ spin_lock(&vxlan->hash_lock);
+
+ /* close off race between vxlan_flush and incoming packets */
+ if (netif_running(dev))
+ vxlan_fdb_create(vxlan, src_mac, src_ip,
+ NUD_REACHABLE,
+ NLM_F_EXCL|NLM_F_CREATE,
+ vxlan->cfg.dst_port,
+ vxlan->default_dst.remote_vni,
+ 0, NTF_SELF);
+ spin_unlock(&vxlan->hash_lock);
+ }
+
+ return false;
+}
+
+/* See if multicast group is already in use by other ID */
+static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
+{
+ struct vxlan_dev *vxlan;
+
+ /* The vxlan_sock is only used by dev; leaving the group has
+ * no effect on other vxlan devices.
+ */
+ if (atomic_read(&dev->vn_sock->refcnt) == 1)
+ return false;
+
+ list_for_each_entry(vxlan, &vn->vxlan_list, next) {
+ if (!netif_running(vxlan->dev) || vxlan == dev)
+ continue;
+
+ if (vxlan->vn_sock != dev->vn_sock)
+ continue;
+
+ if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
+ &dev->default_dst.remote_ip))
+ continue;
+
+ if (vxlan->default_dst.remote_ifindex !=
+ dev->default_dst.remote_ifindex)
+ continue;
+
+ return true;
+ }
+
+ return false;
+}
+
+static void vxlan_sock_release(struct vxlan_sock *vs)
+{
+ struct sock *sk = vs->sock->sk;
+ struct net *net = sock_net(sk);
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+
+ if (!atomic_dec_and_test(&vs->refcnt))
+ return;
+
+ spin_lock(&vn->sock_lock);
+ hlist_del_rcu(&vs->hlist);
+#ifdef HAVE_UDP_OFFLOAD
+ vxlan_notify_del_rx_port(vs);
+#endif
+ spin_unlock(&vn->sock_lock);
+
+ queue_work(vxlan_wq, &vs->del_work);
+}
+
+/* Update multicast group membership when first VNI on
+ * multicast address is brought up
+ */
+static int vxlan_igmp_join(struct vxlan_dev *vxlan)
+{
+ return -EINVAL;
+}
+
+/* Inverse of vxlan_igmp_join when last VNI is brought down */
+static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
+{
+ return -EINVAL;
+}
+
+#ifdef HAVE_VXLAN_HF_RCO
+static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
+ size_t hdrlen, u32 data, bool nopartial)
+{
+ size_t start, offset, plen;
+
+ if (skb->remcsum_offload)
+ return vh;
+
+ start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT;
+ offset = start + ((data & VXLAN_RCO_UDP) ?
+ offsetof(struct udphdr, check) :
+ offsetof(struct tcphdr, check));
+
+ plen = hdrlen + offset + sizeof(u16);
+
+ if (!pskb_may_pull(skb, plen))
+ return NULL;
+
+ vh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
+
+ skb_remcsum_process(skb, (void *)vh + hdrlen, start, offset,
+ nopartial);
+
+ return vh;
+}
+#endif
+
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
+ struct vxlan_metadata *md, u32 vni,
+ struct metadata_dst *tun_dst)
+{
+ struct iphdr *oip = NULL;
+ struct ipv6hdr *oip6 = NULL;
+ struct vxlan_dev *vxlan;
+#ifdef HAVE_DEV_TSTATS
+ struct pcpu_sw_netstats *stats;
+#endif
+ union vxlan_addr saddr;
+ int err = 0;
+ union vxlan_addr *remote_ip;
+
+ /* For flow based devices, map all packets to VNI 0 */
+ if (vs->flags & VXLAN_F_COLLECT_METADATA)
+ vni = 0;
+
+ /* Is this VNI defined? */
+ vxlan = vxlan_vs_find_vni(vs, vni);
+ if (!vxlan)
+ goto drop;
+
+ remote_ip = &vxlan->default_dst.remote_ip;
+ skb_reset_mac_header(skb);
+ skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev)));
+ skb->protocol = eth_type_trans(skb, vxlan->dev);
+ skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
+
+ /* Ignore packet loops (and multicast echo) */
+ if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
+ goto drop;
+
+ /* Re-examine inner Ethernet packet */
+ if (remote_ip->sa.sa_family == AF_INET) {
+ oip = ip_hdr(skb);
+ saddr.sin.sin_addr.s_addr = oip->saddr;
+ saddr.sa.sa_family = AF_INET;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ oip6 = ipv6_hdr(skb);
+ saddr.sin6.sin6_addr = oip6->saddr;
+ saddr.sa.sa_family = AF_INET6;
+#endif
+ }
+
+ if (tun_dst) {
+ ovs_skb_dst_set(skb, (struct dst_entry *)tun_dst);
+ tun_dst = NULL;
+ } else {
+ goto drop;
+ }
+
+ if ((vxlan->flags & VXLAN_F_LEARN) &&
+ vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source))
+ goto drop;
+
+ skb_reset_network_header(skb);
+ /* In flow-based mode, GBP is carried in dst_metadata */
+ if (!(vs->flags & VXLAN_F_COLLECT_METADATA))
+ skb->mark = md->gbp;
+
+ if (oip6)
+ err = IP6_ECN_decapsulate(oip6, skb);
+ if (oip)
+ err = IP_ECN_decapsulate(oip, skb);
+
+ if (unlikely(err)) {
+ if (err > 1) {
+ ++vxlan->dev->stats.rx_frame_errors;
+ ++vxlan->dev->stats.rx_errors;
+ goto drop;
+ }
+ }
+
+#ifdef HAVE_DEV_TSTATS
+ stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)vxlan->dev->tstats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+#endif
+ netdev_port_receive(skb, skb_tunnel_info(skb));
+ return;
+drop:
+
+ /* Consume bad packet */
+ kfree_skb(skb);
+}
+
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_sock *vs;
struct vxlanhdr *vxh;
u32 flags, vni;
- struct vxlan_metadata md = {0};
+ struct vxlan_metadata _md;
+ struct vxlan_metadata *md = &_md;
+ union {
+ struct metadata_dst dst;
+ char buf[sizeof(struct metadata_dst) + sizeof(*md)];
+ } buf;
/* Need Vxlan and inner Ethernet header to be present */
if (!pskb_may_pull(skb, VXLAN_HLEN))
@@ -93,73 +932,83 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
goto drop;
+ vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
vs = rcu_dereference_sk_user_data(sk);
if (!vs)
goto drop;
+#ifdef HAVE_VXLAN_HF_RCO
+ if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
+ vxh = vxlan_remcsum(skb, vxh, sizeof(struct vxlanhdr), vni,
+ !!(vs->flags & VXLAN_F_REMCSUM_NOPARTIAL));
+ if (!vxh)
+ goto drop;
+
+ flags &= ~VXLAN_HF_RCO;
+ vni &= VXLAN_VNI_MASK;
+ }
+#endif
+
+ if (vxlan_collect_metadata(vs)) {
+ ovs_udp_tun_rx_dst(&buf.dst.u.tun_info, skb, AF_INET, TUNNEL_KEY,
+ cpu_to_be64(vni >> 8), sizeof(*md));
+
+ md = ip_tunnel_info_opts(&buf.dst.u.tun_info);
+ } else {
+ memset(md, 0, sizeof(*md));
+ }
+
/* For backwards compatibility, only allow reserved fields to be
- * used by VXLAN extensions if explicitly requested.
- */
+ * used by VXLAN extensions if explicitly requested.
+ */
if ((flags & VXLAN_HF_GBP) && (vs->flags & VXLAN_F_GBP)) {
struct vxlanhdr_gbp *gbp;
gbp = (struct vxlanhdr_gbp *)vxh;
- md.gbp = ntohs(gbp->policy_id);
+ md->gbp = ntohs(gbp->policy_id);
+
+ buf.dst.u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
if (gbp->dont_learn)
- md.gbp |= VXLAN_GBP_DONT_LEARN;
+ md->gbp |= VXLAN_GBP_DONT_LEARN;
if (gbp->policy_applied)
- md.gbp |= VXLAN_GBP_POLICY_APPLIED;
+ md->gbp |= VXLAN_GBP_POLICY_APPLIED;
flags &= ~VXLAN_GBP_USED_BITS;
}
- if (flags || (vni & 0xff)) {
+ if (flags || vni & ~VXLAN_VNI_MASK) {
/* If there are any unprocessed flags remaining treat
- * this as a malformed packet. This behavior diverges from
- * VXLAN RFC (RFC7348) which stipulates that bits in reserved
- * in reserved fields are to be ignored. The approach here
- * maintains compatbility with previous stack code, and also
- * is more robust and provides a little more security in
- * adding extensions to VXLAN.
- */
+ * this as a malformed packet. This behavior diverges from
+	 * VXLAN RFC (RFC7348) which stipulates that bits set in
+	 * reserved fields are to be ignored. The approach here
+ * maintains compatibility with previous stack code, and also
+ * is more robust and provides a little more security in
+ * adding extensions to VXLAN.
+ */
goto bad_flags;
}
- md.vni = vxh->vx_vni;
- vs->rcv(vs, skb, &md);
+ vxlan_rcv(vs, skb, md, vni >> 8, &buf.dst);
return 0;
drop:
/* Consume bad packet */
kfree_skb(skb);
return 0;
+
bad_flags:
- pr_debug("invalid vxlan flags=%#x vni=%#x\n",
- ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
+ netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
+ ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
error:
/* Return non vxlan pkt */
return 1;
}
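
The union declared at the top of vxlan_udp_encap_recv() is a stack-allocation trick: struct metadata_dst carries a variable-length ip_tunnel_info whose option bytes trail the structure, so the char buffer member forces the compiler to reserve sizeof(*md) extra bytes, and ip_tunnel_info_opts() hands back a pointer into that tail. A stripped-down userspace model, with hypothetical stand-in types:

    #include <stdio.h>

    struct tun_info { unsigned options_len; };   /* options follow the struct */

    static void *tun_info_opts(struct tun_info *info)
    {
            return info + 1;                     /* first byte past the header */
    }

    struct vxlan_md { unsigned gbp; };           /* stand-in for vxlan_metadata */

    int main(void)
    {
            union {
                    struct tun_info info;
                    char buf[sizeof(struct tun_info) + sizeof(struct vxlan_md)];
            } u;
            struct vxlan_md *md;

            u.info.options_len = sizeof(*md);
            md = tun_info_opts(&u.info);         /* points into u.buf's tail */
            md->gbp = 0x1234;
            printf("gbp=%#x\n", md->gbp);
            return 0;
    }
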
-static void vxlan_sock_put(struct sk_buff *skb)
-{
- sock_put(skb->sk);
-}
-
-/* On transmit, associate with the tunnel socket */
-static void vxlan_set_owner(struct sock *sk, struct sk_buff *skb)
-{
- skb_orphan(skb);
- sock_hold(sk);
- skb->sk = sk;
- skb->destructor = vxlan_sock_put;
-}
-
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
struct vxlan_metadata *md)
{
@@ -180,15 +1029,130 @@ static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
}
-int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
- __be16 src_port, __be16 dst_port,
- struct vxlan_metadata *md, bool xnet, u32 vxflags)
+#if IS_ENABLED(CONFIG_IPV6)
+static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
+ struct net_device *dev, struct in6_addr *saddr,
+ struct in6_addr *daddr, __u8 prio, __u8 ttl,
+ __be16 src_port, __be16 dst_port, __be32 vni,
+ struct vxlan_metadata *md, bool xnet, u32 vxflags)
+{
+ struct vxlanhdr *vxh;
+ int min_headroom;
+ int err;
+ bool udp_sum = !(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX);
+ int type = 0;
+
+ if ((vxflags & VXLAN_F_REMCSUM_TX) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ int csum_start = skb_checksum_start_offset(skb);
+
+ if (csum_start <= VXLAN_MAX_REMCSUM_START &&
+ !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
+ (skb->csum_offset == offsetof(struct udphdr, check) ||
+ skb->csum_offset == offsetof(struct tcphdr, check))) {
+ udp_sum = false;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+			/* SKB_GSO_TUNNEL_REMCSUM is 0 on kernels without remote
+			 * checksum offload support; bail out in that case. */
+ if (!SKB_GSO_TUNNEL_REMCSUM) {
+ kfree_skb(skb);
+ err = -EOPNOTSUPP;
+ goto err;
+ }
+ }
+ }
+
+ skb_scrub_packet(skb, xnet);
+
+ min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+ + VXLAN_HLEN + sizeof(struct ipv6hdr)
+ + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+
+ /* Need space for new headers (invalidates iph ptr) */
+ err = skb_cow_head(skb, min_headroom);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ goto err;
+ }
+
+ skb = vlan_hwaccel_push_inside(skb);
+ if (WARN_ON(!skb)) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true);
+ if (IS_ERR(skb)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
+ vxh->vx_flags = htonl(VXLAN_HF_VNI);
+ vxh->vx_vni = vni;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 hdrlen = sizeof(struct vxlanhdr);
+ u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
+ VXLAN_RCO_SHIFT;
+
+ if (skb->csum_offset == offsetof(struct udphdr, check))
+ data |= VXLAN_RCO_UDP;
+
+ vxh->vx_vni |= htonl(data);
+ vxh->vx_flags |= htonl(VXLAN_HF_RCO);
+
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
+ skb->encapsulation = 0;
+#endif
+ }
+ }
+
+ if (vxflags & VXLAN_F_GBP)
+ vxlan_build_gbp_hdr(vxh, vxflags, md);
+
+ ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
+
+ udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
+ ttl, src_port, dst_port,
+ !!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX));
+ return 0;
+err:
+ dst_release(dst);
+ return err;
+}
+#endif
+
+static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
+ __be16 src_port, __be16 dst_port, __be32 vni,
+ struct vxlan_metadata *md, bool xnet, u32 vxflags)
{
struct vxlanhdr *vxh;
int min_headroom;
int err;
bool udp_sum = !!(vxflags & VXLAN_F_UDP_CSUM);
+ int type = 0;
+
+ if ((vxflags & VXLAN_F_REMCSUM_TX) &&
+ skb->ip_summed == CHECKSUM_PARTIAL) {
+ int csum_start = skb_checksum_start_offset(skb);
+
+ if (csum_start <= VXLAN_MAX_REMCSUM_START &&
+ !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
+ (skb->csum_offset == offsetof(struct udphdr, check) ||
+ skb->csum_offset == offsetof(struct tcphdr, check))) {
+ udp_sum = false;
+ type |= SKB_GSO_TUNNEL_REMCSUM;
+
+ if (!SKB_GSO_TUNNEL_REMCSUM) {
+ kfree_skb(skb);
+ return -EOPNOTSUPP;
+ }
+ }
+ }
min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
+ VXLAN_HLEN + sizeof(struct iphdr)
@@ -205,28 +1169,601 @@ int rpl_vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
if (WARN_ON(!skb))
return -ENOMEM;
- skb = udp_tunnel_handle_offloads(skb, udp_sum, true);
+ skb = udp_tunnel_handle_offloads(skb, udp_sum, type, true);
if (IS_ERR(skb))
return PTR_ERR(skb);
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = htonl(VXLAN_HF_VNI);
- vxh->vx_vni = md->vni;
+ vxh->vx_vni = vni;
+
+ if (type & SKB_GSO_TUNNEL_REMCSUM) {
+ u16 hdrlen = sizeof(struct vxlanhdr);
+ u32 data = (skb_checksum_start_offset(skb) - hdrlen) >>
+ VXLAN_RCO_SHIFT;
+
+ if (skb->csum_offset == offsetof(struct udphdr, check))
+ data |= VXLAN_RCO_UDP;
+
+ vxh->vx_vni |= htonl(data);
+ vxh->vx_flags |= htonl(VXLAN_HF_RCO);
+ if (!skb_is_gso(skb)) {
+ skb->ip_summed = CHECKSUM_NONE;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
+ skb->encapsulation = 0;
+#endif
+ }
+ }
if (vxflags & VXLAN_F_GBP)
vxlan_build_gbp_hdr(vxh, vxflags, md);
- vxlan_set_owner(sk, skb);
-
ovs_skb_set_inner_protocol(skb, htons(ETH_P_TEB));
return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
ttl, df, src_port, dst_port, xnet,
- !udp_sum);
+ !(vxflags & VXLAN_F_UDP_CSUM));
+}
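
The transmit side runs the same packing in reverse: the checksum start, taken relative to the VXLAN header, is shifted down and the UDP/TCP discriminator bit is OR'd in before the word is folded into vx_vni (with VXLAN_HF_RCO set in vx_flags). A short sketch reusing the constants assumed in the decode example earlier:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define VXLAN_RCO_UDP   0x80
    #define VXLAN_RCO_SHIFT 1

    static uint32_t rco_encode(size_t csum_start, size_t hdrlen, int is_udp)
    {
            uint32_t data = (uint32_t)((csum_start - hdrlen) >> VXLAN_RCO_SHIFT);

            if (is_udp)
                    data |= VXLAN_RCO_UDP;
            return data;    /* OR'd into the low bits of vx_vni */
    }

    int main(void)
    {
            /* checksum starts 48 bytes in, past an 8-byte VXLAN header */
            printf("%#x\n", rco_encode(48, 8, 1));  /* prints 0x94 */
            return 0;
    }
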
+
+static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
+ struct vxlan_rdst *rdst, bool did_rsc)
+{
+ struct ip_tunnel_info *info;
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct sock *sk = vxlan->vn_sock->sock->sk;
+ unsigned short family = vxlan_get_sk_family(vxlan->vn_sock);
+ struct rtable *rt = NULL;
+ const struct iphdr *old_iph;
+ struct flowi4 fl4;
+ union vxlan_addr *dst;
+ union vxlan_addr remote_ip;
+ struct vxlan_metadata _md;
+ struct vxlan_metadata *md = &_md;
+ __be16 src_port = 0, dst_port;
+ u32 vni;
+ __be16 df = 0;
+ __u8 tos, ttl;
+ int err;
+ u32 flags = vxlan->flags;
+
+ info = skb_tunnel_info(skb);
+
+ if (rdst) {
+ dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
+ vni = rdst->remote_vni;
+ dst = &rdst->remote_ip;
+ } else {
+ if (!info) {
+ WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
+ dev->name);
+ goto drop;
+ }
+ if (family != ip_tunnel_info_af(info))
+ goto drop;
+
+ dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
+ vni = be64_to_cpu(info->key.tun_id);
+ remote_ip.sa.sa_family = family;
+ if (family == AF_INET)
+ remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
+ else
+ remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
+ dst = &remote_ip;
+ }
+
+ if (vxlan_addr_any(dst)) {
+ if (did_rsc) {
+ /* short-circuited back to local bridge */
+ WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
+ dev->name);
+ }
+ goto drop;
+ }
+
+ old_iph = ip_hdr(skb);
+
+ ttl = vxlan->cfg.ttl;
+ if (!ttl && vxlan_addr_multicast(dst))
+ ttl = 1;
+
+ tos = vxlan->cfg.tos;
+ if (tos == 1)
+ tos = ip_tunnel_get_dsfield(old_iph, skb);
+
+ src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
+ vxlan->cfg.port_max, true);
+
+ if (info) {
+ if (info->key.tun_flags & TUNNEL_CSUM)
+ flags |= VXLAN_F_UDP_CSUM;
+ else
+ flags &= ~VXLAN_F_UDP_CSUM;
+
+ ttl = info->key.ttl;
+ tos = info->key.tos;
+
+ if (info->options_len)
+ md = ip_tunnel_info_opts(info);
+ } else {
+ md->gbp = skb->mark;
+ }
+
+ if (dst->sa.sa_family == AF_INET) {
+ if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT))
+ df = htons(IP_DF);
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.flowi4_oif = rdst ? rdst->remote_ifindex : 0;
+ fl4.flowi4_tos = RT_TOS(tos);
+ fl4.flowi4_mark = skb->mark;
+ fl4.flowi4_proto = IPPROTO_UDP;
+ fl4.daddr = dst->sin.sin_addr.s_addr;
+ fl4.saddr = vxlan->cfg.saddr.sin.sin_addr.s_addr;
+
+ rt = ip_route_output_key(vxlan->net, &fl4);
+ if (IS_ERR(rt)) {
+ netdev_dbg(dev, "no route to %pI4\n",
+ &dst->sin.sin_addr.s_addr);
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+
+ if (rt_dst(rt).dev == dev) {
+ netdev_dbg(dev, "circular route to %pI4\n",
+ &dst->sin.sin_addr.s_addr);
+ dev->stats.collisions++;
+ goto rt_tx_error;
+ }
+
+ /* Bypass encapsulation if the destination is local */
+ if (rt->rt_flags & RTCF_LOCAL &&
+ !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
+ struct vxlan_dev *dst_vxlan;
+
+ ip_rt_put(rt);
+ dst_vxlan = vxlan_find_vni(vxlan->net, vni,
+ dst->sa.sa_family, dst_port,
+ vxlan->flags);
+ if (!dst_vxlan)
+ goto tx_error;
+ WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
+ dev->name);
+ goto tx_error;
+ }
+
+ tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
+ ttl = ttl ? : ip4_dst_hoplimit(&rt_dst(rt));
+ err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
+ dst->sin.sin_addr.s_addr, tos, ttl, df,
+ src_port, dst_port, htonl(vni << 8), md,
+ !net_eq(vxlan->net, dev_net(vxlan->dev)),
+ flags);
+ if (err < 0) {
+ /* skb is already freed. */
+ skb = NULL;
+ goto rt_tx_error;
+ }
+
+ iptunnel_xmit_stats(err, &dev->stats, (struct pcpu_sw_netstats __percpu *)dev->tstats);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else {
+ struct dst_entry *ndst;
+ struct flowi6 fl6;
+ u32 rt6i_flags;
+
+ memset(&fl6, 0, sizeof(fl6));
+ fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
+ fl6.daddr = dst->sin6.sin6_addr;
+ fl6.saddr = vxlan->cfg.saddr.sin6.sin6_addr;
+ fl6.flowi6_mark = skb->mark;
+ fl6.flowi6_proto = IPPROTO_UDP;
+
+#ifdef HAVE_IPV6_DST_LOOKUP_NET
+ if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) {
+#else
+#ifdef HAVE_IPV6_STUB
+ if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) {
+#else
+ ndst = ip6_route_output(vxlan->net, sk, &fl6);
+ if (ndst->error) {
+#endif
+#endif
+ netdev_dbg(dev, "no route to %pI6\n",
+ &dst->sin6.sin6_addr);
+ dev->stats.tx_carrier_errors++;
+ goto tx_error;
+ }
+
+ if (ndst->dev == dev) {
+ netdev_dbg(dev, "circular route to %pI6\n",
+ &dst->sin6.sin6_addr);
+ dst_release(ndst);
+ dev->stats.collisions++;
+ goto tx_error;
+ }
+
+ /* Bypass encapsulation if the destination is local */
+ rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
+ if (rt6i_flags & RTF_LOCAL &&
+ !(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
+ struct vxlan_dev *dst_vxlan;
+
+ dst_release(ndst);
+ dst_vxlan = vxlan_find_vni(vxlan->net, vni,
+ dst->sa.sa_family, dst_port,
+ vxlan->flags);
+ if (!dst_vxlan)
+ goto tx_error;
+ WARN_ONCE(1, "%s: vxlan_encap_bypass not supported\n",
+ dev->name);
+ goto tx_error;
+ }
+
+ ttl = ttl ? : ip6_dst_hoplimit(ndst);
+ err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
+ 0, ttl, src_port, dst_port, htonl(vni << 8), md,
+ !net_eq(vxlan->net, dev_net(vxlan->dev)),
+ flags);
+#endif
+ }
+
+ return;
+
+drop:
+ dev->stats.tx_dropped++;
+ goto tx_free;
+
+rt_tx_error:
+ ip_rt_put(rt);
+tx_error:
+ dev->stats.tx_errors++;
+tx_free:
+ dev_kfree_skb(skb);
+}
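
Note the htonl(vni << 8) in both transmit calls and the matching vni >> 8 on receive: the 24-bit VNI occupies the upper three bytes of the 32-bit vx_vni word, leaving the low byte reserved (or claimed by the RCO extension). A one-line round-trip check:

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint32_t vni = 0xabcdef;             /* 24-bit VNI */
            uint32_t vx_vni = htonl(vni << 8);   /* wire format */

            assert((ntohl(vx_vni) >> 8) == vni); /* receive side recovers it */
            return 0;
    }
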
+
+/* Transmit local packets over VXLAN.
+ *
+ * The outer IP header inherits ECN and DF from the inner header.
+ * The outer UDP destination is the VXLAN assigned port; the
+ * source port is based on a hash of the flow.
+ */
+netdev_tx_t rpl_vxlan_xmit(struct sk_buff *skb)
+{
+ struct net_device *dev = skb->dev;
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ const struct ip_tunnel_info *info;
+
+ info = skb_tunnel_info(skb);
+
+ skb_reset_mac_header(skb);
+
+ if ((vxlan->flags & VXLAN_F_PROXY))
+ goto out;
+
+ if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
+ info && info->mode & IP_TUNNEL_INFO_TX) {
+ vxlan_xmit_one(skb, dev, NULL, false);
+ return NETDEV_TX_OK;
+ }
+out:
+ pr_warn("vxlan: unsupported flag set %x", vxlan->flags);
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+EXPORT_SYMBOL(rpl_vxlan_xmit);
+
+/* Walk the forwarding table and purge stale entries */
+static void vxlan_cleanup(unsigned long arg)
+{
+ struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
+ unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
+ unsigned int h;
+
+ if (!netif_running(vxlan->dev))
+ return;
+
+ for (h = 0; h < FDB_HASH_SIZE; ++h) {
+ struct hlist_node *p, *n;
+
+ spin_lock_bh(&vxlan->hash_lock);
+ hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+ struct vxlan_fdb *f
+ = container_of(p, struct vxlan_fdb, hlist);
+ unsigned long timeout;
+
+ if (f->state & NUD_PERMANENT)
+ continue;
+
+ timeout = f->used + vxlan->cfg.age_interval * HZ;
+ if (time_before_eq(timeout, jiffies)) {
+ netdev_dbg(vxlan->dev,
+ "garbage collect %pM\n",
+ f->eth_addr);
+ f->state = NUD_STALE;
+ vxlan_fdb_destroy(vxlan, f);
+ } else if (time_before(timeout, next_timer))
+ next_timer = timeout;
+ }
+ spin_unlock_bh(&vxlan->hash_lock);
+ }
+
+ mod_timer(&vxlan->age_timer, next_timer);
}
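
The ageing loop compares timeout against jiffies with time_before_eq(), which stays correct across counter wraparound because the subtraction is evaluated in signed arithmetic. A userspace model with a 32-bit counter:

    #include <stdint.h>
    #include <stdio.h>

    /* Wraparound-safe "a is at or before b", as in the kernel macro */
    #define time_before_eq(a, b) ((int32_t)((a) - (b)) <= 0)

    int main(void)
    {
            uint32_t jiffies = 0xfffffff0u;      /* counter about to wrap */
            uint32_t timeout = jiffies + 0x20;   /* wraps past zero */

            /* Not expired yet, despite timeout < jiffies numerically */
            printf("expired: %d\n", time_before_eq(timeout, jiffies));
            return 0;
    }
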
-EXPORT_SYMBOL_GPL(rpl_vxlan_xmit_skb);
-static void rcu_free_vs(struct rcu_head *rcu)
+static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
+{
+ struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
+ __u32 vni = vxlan->default_dst.remote_vni;
+
+ vxlan->vn_sock = vs;
+ spin_lock(&vn->sock_lock);
+ hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+ spin_unlock(&vn->sock_lock);
+}
+
+/* Setup stats when device is created */
+#ifdef HAVE_DEV_TSTATS
+static int vxlan_init(struct net_device *dev)
+{
+ dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ return 0;
+}
+#endif
+
+static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan)
+{
+ struct vxlan_fdb *f;
+
+ spin_lock_bh(&vxlan->hash_lock);
+ f = __vxlan_find_mac(vxlan, all_zeros_mac);
+ if (f)
+ vxlan_fdb_destroy(vxlan, f);
+ spin_unlock_bh(&vxlan->hash_lock);
+}
+
+#ifdef HAVE_DEV_TSTATS
+static void vxlan_uninit(struct net_device *dev)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+
+ vxlan_fdb_delete_default(vxlan);
+
+ free_percpu(dev->tstats);
+}
+#endif
+
+/* Start ageing timer and join group when device is brought up */
+static int vxlan_open(struct net_device *dev)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_sock *vs;
+ int ret = 0;
+
+ vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port,
+ vxlan->cfg.no_share, vxlan->flags);
+ if (IS_ERR(vs))
+ return PTR_ERR(vs);
+
+ vxlan_vs_add_dev(vs, vxlan);
+
+ if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
+ ret = vxlan_igmp_join(vxlan);
+ if (ret == -EADDRINUSE)
+ ret = 0;
+ if (ret) {
+ vxlan_sock_release(vs);
+ return ret;
+ }
+ }
+
+ if (vxlan->cfg.age_interval)
+ mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
+
+ return ret;
+}
+
+/* Purge the forwarding table */
+static void vxlan_flush(struct vxlan_dev *vxlan)
+{
+ unsigned int h;
+
+ spin_lock_bh(&vxlan->hash_lock);
+ for (h = 0; h < FDB_HASH_SIZE; ++h) {
+ struct hlist_node *p, *n;
+
+ hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
+ struct vxlan_fdb *f
+ = container_of(p, struct vxlan_fdb, hlist);
+ /* the all_zeros_mac entry is deleted at vxlan_uninit */
+ if (!is_zero_ether_addr(f->eth_addr))
+ vxlan_fdb_destroy(vxlan, f);
+ }
+ }
+ spin_unlock_bh(&vxlan->hash_lock);
+}
+
+/* Cleanup timer and forwarding table on shutdown */
+static int vxlan_stop(struct net_device *dev)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
+ struct vxlan_sock *vs = vxlan->vn_sock;
+ int ret = 0;
+
+ if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
+ !vxlan_group_used(vn, vxlan))
+ ret = vxlan_igmp_leave(vxlan);
+
+ del_timer_sync(&vxlan->age_timer);
+
+ vxlan_flush(vxlan);
+ vxlan_sock_release(vs);
+
+ return ret;
+}
+
+/* Stub, nothing needs to be done. */
+static void vxlan_set_multicast_list(struct net_device *dev)
+{
+}
+
+static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_rdst *dst = &vxlan->default_dst;
+ struct net_device *lowerdev;
+ int max_mtu;
+
+ lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex);
+ if (lowerdev == NULL)
+ return eth_change_mtu(dev, new_mtu);
+
+ if (dst->remote_ip.sa.sa_family == AF_INET6)
+ max_mtu = lowerdev->mtu - VXLAN6_HEADROOM;
+ else
+ max_mtu = lowerdev->mtu - VXLAN_HEADROOM;
+
+ if (new_mtu < 68 || new_mtu > max_mtu)
+ return -EINVAL;
+
+ dev->mtu = new_mtu;
+ return 0;
+}
+
+static netdev_tx_t vxlan_dev_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	/* Drop all packets coming from the networking stack; OVS_CB is
+	 * not initialized for these packets.
+	 */
+
+ dev_kfree_skb(skb);
+ dev->stats.tx_dropped++;
+ return NETDEV_TX_OK;
+}
+
+static const struct net_device_ops vxlan_netdev_ops = {
+#ifdef HAVE_DEV_TSTATS
+ .ndo_init = vxlan_init,
+ .ndo_uninit = vxlan_uninit,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+#endif
+ .ndo_open = vxlan_open,
+ .ndo_stop = vxlan_stop,
+ .ndo_start_xmit = vxlan_dev_xmit,
+ .ndo_set_rx_mode = vxlan_set_multicast_list,
+ .ndo_change_mtu = vxlan_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_set_mac_address = eth_mac_addr,
+};
+
+/* Info for udev, that this is a virtual tunnel endpoint */
+static struct device_type vxlan_type = {
+ .name = "vxlan",
+};
+
+/* Initialize the device structure. */
+static void vxlan_setup(struct net_device *dev)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ unsigned int h;
+
+ eth_hw_addr_random(dev);
+ ether_setup(dev);
+
+ dev->netdev_ops = &vxlan_netdev_ops;
+ dev->destructor = free_netdev;
+ SET_NETDEV_DEVTYPE(dev, &vxlan_type);
+
+ dev->features |= NETIF_F_LLTX;
+ dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ dev->features |= NETIF_F_RXCSUM;
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+
+ dev->vlan_features = dev->features;
+ dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
+#endif
+
+#if 0
+ netif_keep_dst(dev);
+#endif
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
+
+ INIT_LIST_HEAD(&vxlan->next);
+ spin_lock_init(&vxlan->hash_lock);
+
+ init_timer_deferrable(&vxlan->age_timer);
+ vxlan->age_timer.function = vxlan_cleanup;
+ vxlan->age_timer.data = (unsigned long) vxlan;
+
+ vxlan->cfg.dst_port = htons(vxlan_port);
+
+ vxlan->dev = dev;
+
+ for (h = 0; h < FDB_HASH_SIZE; ++h)
+ INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
+}
+
+static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
+ [IFLA_VXLAN_PORT] = { .type = NLA_U16 },
+};
+
+static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
+{
+ if (tb[IFLA_ADDRESS]) {
+ if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
+ pr_debug("invalid link address (not ethernet)\n");
+ return -EINVAL;
+ }
+
+ if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
+ pr_debug("invalid all zero ethernet address\n");
+ return -EADDRNOTAVAIL;
+ }
+ }
+
+ if (!data)
+ return -EINVAL;
+
+ if (data[IFLA_VXLAN_ID]) {
+ __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+ if (id >= VXLAN_VID_MASK)
+ return -ERANGE;
+ }
+
+ if (data[IFLA_VXLAN_PORT_RANGE]) {
+ const struct ifla_vxlan_port_range *p
+ = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
+
+ if (ntohs(p->high) < ntohs(p->low)) {
+ pr_debug("port range %u .. %u not valid\n",
+ ntohs(p->low), ntohs(p->high));
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static void vxlan_get_drvinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
+ strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
+}
+
+static const struct ethtool_ops vxlan_ethtool_ops = {
+ .get_drvinfo = vxlan_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+static void free_vs_rcu(struct rcu_head *rcu)
{
struct vxlan_sock *vs = container_of(rcu, struct vxlan_sock, rcu);
@@ -236,9 +1773,9 @@ static void rcu_free_vs(struct rcu_head *rcu)
static void vxlan_del_work(struct work_struct *work)
{
struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);
-
udp_tunnel_sock_release(vs->sock);
- call_rcu(&vs->rcu, rcu_free_vs);
+
+ call_rcu(&vs->rcu, free_vs_rcu);
}
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
@@ -252,13 +1789,11 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
if (ipv6) {
udp_conf.family = AF_INET6;
- /* The checksum flag is silently ignored but it
- * doesn't make sense here anyways because OVS enables
- * checksums on a finer granularity than per-socket.
- */
+ udp_conf.use_udp6_rx_checksums =
+ !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
+ udp_conf.ipv6_v6only = 1;
} else {
udp_conf.family = AF_INET;
- udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
}
udp_conf.local_udp_port = port;
@@ -271,32 +1806,51 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
return sock;
}
+/* Create new listen socket if needed */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
- vxlan_rcv_t *rcv, void *data, u32 flags)
+ u32 flags)
{
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
struct vxlan_sock *vs;
struct socket *sock;
+ unsigned int h;
+ bool ipv6 = !!(flags & VXLAN_F_IPV6);
struct udp_tunnel_sock_cfg tunnel_cfg;
- vs = kmalloc(sizeof(*vs), GFP_KERNEL);
- if (!vs) {
- pr_debug("memory alocation failure\n");
+ vs = kzalloc(sizeof(*vs), GFP_KERNEL);
+ if (!vs)
return ERR_PTR(-ENOMEM);
- }
+
+ for (h = 0; h < VNI_HASH_SIZE; ++h)
+ INIT_HLIST_HEAD(&vs->vni_list[h]);
INIT_WORK(&vs->del_work, vxlan_del_work);
- sock = vxlan_create_sock(net, false, port, flags);
+ sock = vxlan_create_sock(net, ipv6, port, flags);
if (IS_ERR(sock)) {
+ pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
+ PTR_ERR(sock));
kfree(vs);
return ERR_CAST(sock);
}
vs->sock = sock;
- vs->rcv = rcv;
- vs->data = data;
+ atomic_set(&vs->refcnt, 1);
vs->flags = (flags & VXLAN_F_RCV_FLAGS);
+ /* Initialize the vxlan udp offloads structure */
+#ifdef HAVE_UDP_OFFLOAD
+ vs->udp_offloads.port = port;
+ vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive;
+ vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete;
+ vxlan_notify_add_rx_port(vs);
+#endif
+
+ spin_lock(&vn->sock_lock);
+ hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
+ spin_unlock(&vn->sock_lock);
+
+ /* Mark socket as an encapsulation socket. */
tunnel_cfg.sk_user_data = vs;
tunnel_cfg.encap_type = 1;
tunnel_cfg.encap_rcv = vxlan_udp_encap_recv;
@@ -307,20 +1861,378 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
return vs;
}
-struct vxlan_sock *rpl_vxlan_sock_add(struct net *net, __be16 port,
- vxlan_rcv_t *rcv, void *data,
- bool no_share, u32 flags)
+static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port,
+ bool no_share, u32 flags)
{
- return vxlan_socket_create(net, port, rcv, data, flags);
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+ struct vxlan_sock *vs;
+ bool ipv6 = flags & VXLAN_F_IPV6;
+
+ if (!no_share) {
+ spin_lock(&vn->sock_lock);
+ vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port,
+ flags);
+ if (vs) {
+ if (!atomic_add_unless(&vs->refcnt, 1, 0))
+ vs = ERR_PTR(-EBUSY);
+ spin_unlock(&vn->sock_lock);
+ return vs;
+ }
+ spin_unlock(&vn->sock_lock);
+ }
+
+ return vxlan_socket_create(net, port, flags);
}
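
vxlan_sock_add() reuses a shared socket only if atomic_add_unless(&vs->refcnt, 1, 0) succeeds, i.e. it takes a reference unless the count has already dropped to zero and the socket is being torn down, in which case the caller gets -EBUSY. A userspace sketch of that idiom using C11 atomics:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool get_unless_zero(atomic_int *refcnt)
    {
            int old = atomic_load(refcnt);

            while (old != 0) {
                    /* on failure, old is reloaded with the current value */
                    if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
                            return true;    /* reference taken */
            }
            return false;                   /* object is being torn down */
    }

    int main(void)
    {
            atomic_int ref = 1;

            printf("%d\n", get_unless_zero(&ref));  /* 1: socket reused */
            atomic_store(&ref, 0);
            printf("%d\n", get_unless_zero(&ref));  /* 0: create a new one */
            return 0;
    }
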
-EXPORT_SYMBOL_GPL(rpl_vxlan_sock_add);
-void rpl_vxlan_sock_release(struct vxlan_sock *vs)
+static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
+ struct vxlan_config *conf)
{
- ASSERT_OVSL();
+ struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_rdst *dst = &vxlan->default_dst;
+ int err;
+ bool use_ipv6 = false;
+ __be16 default_port = vxlan->cfg.dst_port;
+
+ vxlan->net = src_net;
+
+ dst->remote_vni = conf->vni;
+
+ memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));
+
+ /* Unless IPv6 is explicitly requested, assume IPv4 */
+ if (!dst->remote_ip.sa.sa_family)
+ dst->remote_ip.sa.sa_family = AF_INET;
+
+ if (dst->remote_ip.sa.sa_family == AF_INET6 ||
+ vxlan->cfg.saddr.sa.sa_family == AF_INET6) {
+ if (!IS_ENABLED(CONFIG_IPV6))
+ return -EPFNOSUPPORT;
+ use_ipv6 = true;
+ }
+
+ if (conf->remote_ifindex) {
+ struct net_device *lowerdev
+ = __dev_get_by_index(src_net, conf->remote_ifindex);
+
+ dst->remote_ifindex = conf->remote_ifindex;
+
+ if (!lowerdev) {
+ pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
+ return -ENODEV;
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (use_ipv6) {
+ struct inet6_dev *idev = __in6_dev_get(lowerdev);
+ if (idev && idev->cnf.disable_ipv6) {
+ pr_info("IPv6 is disabled via sysctl\n");
+ return -EPERM;
+ }
+ vxlan->flags |= VXLAN_F_IPV6;
+ }
+#endif
- queue_work(system_wq, &vs->del_work);
+ if (!conf->mtu)
+ dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
+
+ dev->needed_headroom = lowerdev->hard_header_len +
+ (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
+ } else if (use_ipv6) {
+ vxlan->flags |= VXLAN_F_IPV6;
+ dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM;
+ } else {
+ dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM;
+ }
+
+ memcpy(&vxlan->cfg, conf, sizeof(*conf));
+ if (!vxlan->cfg.dst_port)
+ vxlan->cfg.dst_port = default_port;
+ vxlan->flags |= conf->flags;
+
+ if (!vxlan->cfg.age_interval)
+ vxlan->cfg.age_interval = FDB_AGE_DEFAULT;
+
+ if (vxlan_find_vni(src_net, conf->vni, use_ipv6 ? AF_INET6 : AF_INET,
+ vxlan->cfg.dst_port, vxlan->flags))
+ return -EEXIST;
+
+ dev->ethtool_ops = &vxlan_ethtool_ops;
+
+ /* create an fdb entry for a valid default destination */
+ if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
+ err = vxlan_fdb_create(vxlan, all_zeros_mac,
+ &vxlan->default_dst.remote_ip,
+ NUD_REACHABLE|NUD_PERMANENT,
+ NLM_F_EXCL|NLM_F_CREATE,
+ vxlan->cfg.dst_port,
+ vxlan->default_dst.remote_vni,
+ vxlan->default_dst.remote_ifindex,
+ NTF_SELF);
+ if (err)
+ return err;
+ }
+
+ err = register_netdevice(dev);
+ if (err) {
+ vxlan_fdb_delete_default(vxlan);
+ return err;
+ }
+
+ list_add(&vxlan->next, &vn->vxlan_list);
+
+ return 0;
}
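
The MTU and needed_headroom computations subtract the full encapsulation overhead from the lower device's MTU. A sketch with the values the upstream VXLAN_HEADROOM and VXLAN6_HEADROOM macros are assumed to expand to:

    #include <stdio.h>

    #define VXLAN_HEADROOM  (20 + 8 + 8 + 14)  /* IPv4 + UDP + VXLAN + Ethernet */
    #define VXLAN6_HEADROOM (40 + 8 + 8 + 14)  /* IPv6 variant */

    int main(void)
    {
            int lower_mtu = 1500;

            printf("v4 payload mtu %d, v6 payload mtu %d\n",
                   lower_mtu - VXLAN_HEADROOM, lower_mtu - VXLAN6_HEADROOM);
            return 0;
    }
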
-EXPORT_SYMBOL_GPL(rpl_vxlan_sock_release);
-#endif /* !USE_UPSTREAM_VXLAN */
+struct net_device *rpl_vxlan_dev_create(struct net *net, const char *name,
+ u8 name_assign_type, struct vxlan_config *conf)
+{
+ struct nlattr *tb[IFLA_MAX+1];
+ struct net_device *dev;
+ int err;
+
+ memset(&tb, 0, sizeof(tb));
+
+ dev = rtnl_create_link(net, (char *)name, name_assign_type,
+ &vxlan_link_ops, tb);
+ if (IS_ERR(dev))
+ return dev;
+
+ err = vxlan_dev_configure(net, dev, conf);
+ if (err < 0) {
+ free_netdev(dev);
+ return ERR_PTR(err);
+ }
+
+ return dev;
+}
+EXPORT_SYMBOL_GPL(rpl_vxlan_dev_create);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static int vxlan_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+#else
+static int vxlan_newlink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+#endif
+{
+ return -EINVAL;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+static void vxlan_dellink(struct net_device *dev, struct list_head *head)
+#else
+static void vxlan_dellink(struct net_device *dev)
+#endif
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
+
+ spin_lock(&vn->sock_lock);
+ if (!hlist_unhashed(&vxlan->hlist))
+ hlist_del_rcu(&vxlan->hlist);
+ spin_unlock(&vn->sock_lock);
+
+ list_del(&vxlan->next);
+ unregister_netdevice_queue(dev, head);
+}
+
+static size_t vxlan_get_size(const struct net_device *dev)
+{
+ return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */
+ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
+ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */
+ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */
+ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */
+ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */
+ nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
+ nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
+ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
+ 0;
+}
+
+static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ const struct vxlan_dev *vxlan = netdev_priv(dev);
+
+ if (nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+#ifdef HAVE_GET_LINK_NET
+static struct net *vxlan_get_link_net(const struct net_device *dev)
+{
+ struct vxlan_dev *vxlan = netdev_priv(dev);
+
+ return vxlan->net;
+}
+#endif
+
+static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
+ .kind = "ovs_vxlan",
+ .maxtype = IFLA_VXLAN_MAX,
+ .policy = vxlan_policy,
+ .priv_size = sizeof(struct vxlan_dev),
+ .setup = vxlan_setup,
+ .validate = vxlan_validate,
+ .newlink = vxlan_newlink,
+ .dellink = vxlan_dellink,
+ .get_size = vxlan_get_size,
+ .fill_info = vxlan_fill_info,
+#ifdef HAVE_GET_LINK_NET
+ .get_link_net = vxlan_get_link_net,
+#endif
+};
+
+static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
+ struct net_device *dev)
+{
+ struct vxlan_dev *vxlan, *next;
+ LIST_HEAD(list_kill);
+
+ list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
+ struct vxlan_rdst *dst = &vxlan->default_dst;
+
+		/* If we created the vxlan device with a carrier
+		 * and we lose that carrier due to module unload,
+		 * we also need to remove the vxlan device. In other
+		 * cases it's not necessary: remote_ifindex
+		 * is 0 here, so nothing matches.
+		 */
+ if (dst->remote_ifindex == dev->ifindex)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ vxlan_dellink(vxlan->dev, &list_kill);
+#else
+ vxlan_dellink(vxlan->dev);
+#endif
+ }
+
+ unregister_netdevice_many(&list_kill);
+}
+
+static int vxlan_lowerdev_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
+
+ if (event == NETDEV_UNREGISTER)
+ vxlan_handle_lowerdev_unregister(vn, dev);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block vxlan_notifier_block __read_mostly = {
+ .notifier_call = vxlan_lowerdev_event,
+};
+
+static __net_init int vxlan_init_net(struct net *net)
+{
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+ unsigned int h;
+
+ INIT_LIST_HEAD(&vn->vxlan_list);
+ spin_lock_init(&vn->sock_lock);
+
+ for (h = 0; h < PORT_HASH_SIZE; ++h)
+ INIT_HLIST_HEAD(&vn->sock_list[h]);
+
+ return 0;
+}
+
+static void __net_exit vxlan_exit_net(struct net *net)
+{
+ struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+ struct vxlan_dev *vxlan, *next;
+ struct net_device *dev, *aux;
+ LIST_HEAD(list);
+
+ rtnl_lock();
+ for_each_netdev_safe(net, dev, aux)
+ if (dev->rtnl_link_ops == &vxlan_link_ops)
+ unregister_netdevice_queue(dev, &list);
+
+ list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
+ /* If vxlan->dev is in the same netns, it has already been added
+ * to the list by the previous loop.
+ */
+ if (!net_eq(dev_net(vxlan->dev), net))
+ unregister_netdevice_queue(vxlan->dev, &list);
+ }
+
+ unregister_netdevice_many(&list);
+ rtnl_unlock();
+}
+
+static struct pernet_operations vxlan_net_ops = {
+ .init = vxlan_init_net,
+ .exit = vxlan_exit_net,
+ .id = &vxlan_net_id,
+ .size = sizeof(struct vxlan_net),
+};
+
+DEFINE_COMPAT_PNET_REG_FUNC(device)
+int rpl_vxlan_init_module(void)
+{
+ int rc;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+ vxlan_wq = create_workqueue("vxlan");
+#else
+ vxlan_wq = alloc_workqueue("vxlan", 0, 0);
+#endif
+ if (!vxlan_wq)
+ return -ENOMEM;
+
+ get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
+
+ rc = register_pernet_subsys(&vxlan_net_ops);
+ if (rc)
+ goto out1;
+
+ rc = register_netdevice_notifier(&vxlan_notifier_block);
+ if (rc)
+ goto out2;
+
+ rc = rtnl_link_register(&vxlan_link_ops);
+ if (rc)
+ goto out3;
+
+ pr_info("VxLAN tunneling driver\n");
+ return 0;
+out3:
+ unregister_netdevice_notifier(&vxlan_notifier_block);
+out2:
+ unregister_pernet_subsys(&vxlan_net_ops);
+out1:
+ destroy_workqueue(vxlan_wq);
+ return rc;
+}
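
The init path above uses the conventional goto-unwind ladder: each failure label falls through the ones below it, undoing exactly the steps that had succeeded, in reverse order of registration. A generic, self-contained model of the pattern:

    #include <stdio.h>

    static int step(const char *name, int fail)
    {
            printf("setup %s\n", name);       /* pretend registration */
            return fail ? -1 : 0;
    }

    static int init(void)
    {
            int rc;

            if ((rc = step("pernet", 0)))
                    goto out1;
            if ((rc = step("notifier", 0)))
                    goto out2;
            if ((rc = step("rtnl_link", 1)))  /* simulate a failure */
                    goto out3;
            return 0;
    out3:
            printf("undo notifier\n");
    out2:
            printf("undo pernet\n");
    out1:
            return rc;
    }

    int main(void)
    {
            return init() ? 1 : 0;
    }
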
+
+void rpl_vxlan_cleanup_module(void)
+{
+ rtnl_link_unregister(&vxlan_link_ops);
+ unregister_netdevice_notifier(&vxlan_notifier_block);
+ destroy_workqueue(vxlan_wq);
+ unregister_pernet_subsys(&vxlan_net_ops);
+ /* rcu_barrier() is called by netns */
+}
+#endif
diff --git a/datapath/vport-geneve.c b/datapath/vport-geneve.c
index 4ab224dac..3b5c1ab32 100644
--- a/datapath/vport-geneve.c
+++ b/datapath/vport-geneve.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2014 Nicira, Inc.
+ * Copyright (c) 2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -26,96 +26,42 @@
#include "datapath.h"
#include "vport.h"
+#include "vport-netdev.h"
static struct vport_ops ovs_geneve_vport_ops;
-
/**
* struct geneve_port - Keeps track of open UDP ports
- * @gs: The socket created for this port number.
- * @name: vport name.
+ * @port_no: destination port number.
*/
struct geneve_port {
- struct geneve_sock *gs;
- char name[IFNAMSIZ];
+ u16 port_no;
};
-static LIST_HEAD(geneve_ports);
-
static inline struct geneve_port *geneve_vport(const struct vport *vport)
{
return vport_priv(vport);
}
-/* Convert 64 bit tunnel ID to 24 bit VNI. */
-static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
- vni[0] = (__force __u8)(tun_id >> 16);
- vni[1] = (__force __u8)(tun_id >> 8);
- vni[2] = (__force __u8)tun_id;
-#else
- vni[0] = (__force __u8)((__force u64)tun_id >> 40);
- vni[1] = (__force __u8)((__force u64)tun_id >> 48);
- vni[2] = (__force __u8)((__force u64)tun_id >> 56);
-#endif
-}
-
-/* Convert 24 bit VNI to 64 bit tunnel ID. */
-static __be64 vni_to_tunnel_id(const __u8 *vni)
-{
-#ifdef __BIG_ENDIAN
- return (vni[0] << 16) | (vni[1] << 8) | vni[2];
-#else
- return (__force __be64)(((__force u64)vni[0] << 40) |
- ((__force u64)vni[1] << 48) |
- ((__force u64)vni[2] << 56));
-#endif
-}
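
The helpers removed here converted between Geneve's 3-byte on-wire VNI and the 64-bit tunnel ID held as a __be64, so on a little-endian host the VNI lands in bytes 5..7 of the raw 64-bit value. A userspace model of the little-endian branch:

    #include <stdint.h>
    #include <stdio.h>

    static void tun_id_to_vni(uint64_t be_tun_id, uint8_t vni[3])
    {
            vni[0] = be_tun_id >> 40;
            vni[1] = be_tun_id >> 48;
            vni[2] = be_tun_id >> 56;
    }

    static uint64_t vni_to_tun_id(const uint8_t vni[3])
    {
            return ((uint64_t)vni[0] << 40) |
                   ((uint64_t)vni[1] << 48) |
                   ((uint64_t)vni[2] << 56);
    }

    int main(void)
    {
            uint8_t vni[3];

            tun_id_to_vni(vni_to_tun_id((uint8_t[]){0xab, 0xcd, 0xef}), vni);
            printf("%02x%02x%02x\n", vni[0], vni[1], vni[2]);  /* abcdef */
            return 0;
    }
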
-
-static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb)
-{
- struct vport *vport = gs->rcv_data;
- struct genevehdr *geneveh = geneve_hdr(skb);
- int opts_len;
- struct ovs_tunnel_info tun_info;
- __be64 key;
- __be16 flags;
-
- opts_len = geneveh->opt_len * 4;
-
- flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
- (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) |
- (geneveh->oam ? TUNNEL_OAM : 0) |
- (geneveh->critical ? TUNNEL_CRIT_OPT : 0);
-
- key = vni_to_tunnel_id(geneveh->vni);
-
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags,
- geneveh->options, opts_len);
-
- ovs_vport_receive(vport, skb, &tun_info);
-}
-
static int geneve_get_options(const struct vport *vport,
struct sk_buff *skb)
{
struct geneve_port *geneve_port = geneve_vport(vport);
- __be16 dst_port = inet_sport(geneve_port->gs->sock->sk);
- if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no))
return -EMSGSIZE;
return 0;
}
-static void geneve_tnl_destroy(struct vport *vport)
+static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct dp_upcall_info *upcall)
{
struct geneve_port *geneve_port = geneve_vport(vport);
+ struct net *net = ovs_dp_get_net(vport->dp);
+ __be16 dport = htons(geneve_port->port_no);
+ __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
- geneve_sock_release(geneve_port->gs);
-
- ovs_vport_deferred_free(vport);
+ return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
+ skb, IPPROTO_UDP, sport, dport);
}
static struct vport *geneve_tnl_create(const struct vport_parms *parms)
@@ -123,11 +69,11 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
struct geneve_port *geneve_port;
- struct geneve_sock *gs;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- int err;
u16 dst_port;
+ int err;
if (!options) {
err = -EINVAL;
@@ -149,111 +95,42 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
return vport;
geneve_port = geneve_vport(vport);
- strncpy(geneve_port->name, parms->name, IFNAMSIZ);
+ geneve_port->port_no = dst_port;
- gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0);
- if (IS_ERR(gs)) {
+ rtnl_lock();
+ dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
ovs_vport_free(vport);
- return (void *)gs;
+ return ERR_CAST(dev);
}
- geneve_port->gs = gs;
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
error:
return ERR_PTR(err);
}
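
The rewritten create paths lean on the kernel's error-pointer idiom: failures travel back as ERR_PTR(-errno) encoded in the pointer itself, callers test with IS_ERR(), and ERR_CAST() re-types an error pointer. A minimal self-contained model, assuming the kernel's MAX_ERRNO of 4095:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long err) { return (void *)err; }
    static inline long PTR_ERR(const void *p) { return (long)p; }
    static inline int IS_ERR(const void *p)
    {
            /* errors occupy the top 4095 values of the address space */
            return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
    }

    int main(void)
    {
            void *dev = ERR_PTR(-12);  /* -ENOMEM */

            if (IS_ERR(dev))
                    printf("create failed: %ld\n", PTR_ERR(dev));
            return 0;
    }
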
-static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct ovs_tunnel_info *tun_info;
- struct net *net = ovs_dp_get_net(vport->dp);
- struct geneve_port *geneve_port = geneve_vport(vport);
- __be16 dport = inet_sport(geneve_port->gs->sock->sk);
- __be16 sport;
- __be32 saddr;
- struct rtable *rt;
- u8 vni[3], opts_len, *opts;
- __be16 df;
- int err;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- if (unlikely(!tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &tun_info->tunnel;
-
- saddr = tun_key->ipv4_src;
- rt = find_route(ovs_dp_get_net(vport->dp),
- &saddr, tun_key->ipv4_dst,
- IPPROTO_UDP, tun_key->ipv4_tos,
- skb->mark);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
- tunnel_id_to_vni(tun_key->tun_id, vni);
- skb->ignore_df = 1;
-
- if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) {
- opts = (u8 *)tun_info->options;
- opts_len = tun_info->options_len;
- } else {
- opts = NULL;
- opts_len = 0;
- }
-
- err = geneve_xmit_skb(geneve_port->gs, rt, skb, saddr,
- tun_key->ipv4_dst, tun_key->ipv4_tos,
- tun_key->ipv4_ttl, df, sport, dport,
- tun_key->tun_flags, vni, opts_len, opts,
- !!(tun_key->tun_flags & TUNNEL_CSUM), false);
- if (err < 0)
- ip_rt_put(rt);
- return err;
-
-error:
- kfree_skb(skb);
- return err;
-}
-
-static const char *geneve_get_name(const struct vport *vport)
+static struct vport *geneve_create(const struct vport_parms *parms)
{
- struct geneve_port *geneve_port = geneve_vport(vport);
-
- return geneve_port->name;
-}
+ struct vport *vport;
-static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- struct geneve_port *geneve_port = geneve_vport(vport);
- struct net *net = ovs_dp_get_net(vport->dp);
- __be16 dport = inet_sport(geneve_port->gs->sock->sk);
- __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+ vport = geneve_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
- /* Get tp_src and tp_dst, refert to geneve_build_header().
- */
- return ovs_tunnel_get_egress_info(egress_tun_info,
- ovs_dp_get_net(vport->dp),
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_UDP, skb->mark, sport, dport);
+ return ovs_netdev_link(vport, parms->name);
}
static struct vport_ops ovs_geneve_vport_ops = {
- .type = OVS_VPORT_TYPE_GENEVE,
- .create = geneve_tnl_create,
- .destroy = geneve_tnl_destroy,
- .get_name = geneve_get_name,
- .get_options = geneve_get_options,
- .send = geneve_tnl_send,
+ .type = OVS_VPORT_TYPE_GENEVE,
+ .create = geneve_create,
+ .destroy = ovs_netdev_tunnel_destroy,
+ .get_options = geneve_get_options,
+ .send = geneve_xmit,
+ .owner = THIS_MODULE,
.get_egress_tun_info = geneve_get_egress_tun_info,
- .owner = THIS_MODULE,
};
static int __init ovs_geneve_tnl_init(void)
@@ -269,6 +146,6 @@ static void __exit ovs_geneve_tnl_exit(void)
module_init(ovs_geneve_tnl_init);
module_exit(ovs_geneve_tnl_exit);
-MODULE_DESCRIPTION("OVS: Geneve swiching port");
+MODULE_DESCRIPTION("OVS: Geneve switching port");
MODULE_LICENSE("GPL");
MODULE_ALIAS("vport-type-5");
diff --git a/datapath/vport-gre.c b/datapath/vport-gre.c
index 0328fe51e..a9ac0d48a 100644
--- a/datapath/vport-gre.c
+++ b/datapath/vport-gre.c
@@ -16,9 +16,6 @@
* 02110-1301, USA
*/
-#include <linux/kconfig.h>
-#if IS_ENABLED(CONFIG_NET_IPGRE_DEMUX)
-
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/if.h>
@@ -48,256 +45,58 @@
#include "datapath.h"
#include "vport.h"
+#include "vport-netdev.h"
static struct vport_ops ovs_gre_vport_ops;
-/* Returns the least-significant 32 bits of a __be64. */
-static __be32 be64_get_low32(__be64 x)
-{
-#ifdef __BIG_ENDIAN
- return (__force __be32)x;
-#else
- return (__force __be32)((__force u64)x >> 32);
-#endif
-}
-
-static __be16 filter_tnl_flags(__be16 flags)
-{
- return flags & (TUNNEL_CSUM | TUNNEL_KEY);
-}
-
-static struct sk_buff *__build_header(struct sk_buff *skb,
- int tunnel_hlen)
-{
- struct tnl_ptk_info tpi;
- const struct ovs_key_ipv4_tunnel *tun_key;
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
-
- skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
- if (IS_ERR(skb))
- return skb;
-
- tpi.flags = filter_tnl_flags(tun_key->tun_flags);
- tpi.proto = htons(ETH_P_TEB);
- tpi.key = be64_get_low32(tun_key->tun_id);
- tpi.seq = 0;
- gre_build_header(skb, &tpi, tunnel_hlen);
-
- return skb;
-}
-
-static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
+static struct vport *gre_tnl_create(const struct vport_parms *parms)
{
-#ifdef __BIG_ENDIAN
- return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
-#else
- return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
-#endif
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_rcv(struct sk_buff *skb,
- const struct tnl_ptk_info *tpi)
-{
- struct ovs_tunnel_info tun_info;
- struct ovs_net *ovs_net;
- struct vport *vport;
- __be64 key;
-
- ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
- vport = rcu_dereference(ovs_net->vport_net.gre_vport);
- if (unlikely(!vport))
- return PACKET_REJECT;
-
- key = key_to_tunnel_id(tpi->key, tpi->seq);
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key,
- filter_tnl_flags(tpi->flags), NULL, 0);
-
- ovs_vport_receive(vport, skb, &tun_info);
- return PACKET_RCVD;
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int gre_err(struct sk_buff *skb, u32 info,
- const struct tnl_ptk_info *tpi)
-{
- struct ovs_net *ovs_net;
+ struct net *net = ovs_dp_get_net(parms->dp);
+ struct net_device *dev;
struct vport *vport;
- ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
- vport = rcu_dereference(ovs_net->vport_net.gre_vport);
-
- if (unlikely(!vport))
- return PACKET_REJECT;
- else
- return PACKET_RCVD;
-}
-
-static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- const struct ovs_key_ipv4_tunnel *tun_key;
- struct rtable *rt;
- int min_headroom;
- __be16 df;
- int tunnel_hlen;
- __be32 saddr;
- int err;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto err_free_skb;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
- saddr = tun_key->ipv4_src;
- rt = find_route(ovs_dp_get_net(vport->dp),
- &saddr, tun_key->ipv4_dst,
- IPPROTO_GRE, tun_key->ipv4_tos,
- skb->mark);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto err_free_skb;
- }
-
- tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags);
-
- min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
- + tunnel_hlen + sizeof(struct iphdr)
- + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
-
- skb = vlan_hwaccel_push_inside(skb);
- if (unlikely(!skb)) {
- err = -ENOMEM;
- goto err_free_rt;
- }
-
- /* Push Tunnel header. */
- skb = __build_header(skb, tunnel_hlen);
- if (IS_ERR(skb)) {
- err = PTR_ERR(skb);
- skb = NULL;
- goto err_free_rt;
+ vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ rtnl_lock();
+ dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_CAST(dev);
}
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
- htons(IP_DF) : 0;
-
- skb->ignore_df = 1;
-
- return iptunnel_xmit(skb->sk, rt, skb, saddr,
- tun_key->ipv4_dst, IPPROTO_GRE,
- tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false);
-err_free_rt:
- ip_rt_put(rt);
-err_free_skb:
- kfree_skb(skb);
- return err;
-}
-
-static struct gre_cisco_protocol gre_protocol = {
- .handler = gre_rcv,
- .err_handler = gre_err,
- .priority = 1,
-};
-
-static int gre_ports;
-static int gre_init(void)
-{
- int err;
-
- gre_ports++;
- if (gre_ports > 1)
- return 0;
-
- err = gre_cisco_register(&gre_protocol);
- if (err)
- pr_warn("cannot register gre protocol handler\n");
-
- return err;
-}
-
-static void gre_exit(void)
-{
- gre_ports--;
- if (gre_ports > 0)
- return;
-
- gre_cisco_unregister(&gre_protocol);
-}
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
-static const char *gre_get_name(const struct vport *vport)
-{
- return vport_priv(vport);
+ return vport;
}
static struct vport *gre_create(const struct vport_parms *parms)
{
- struct net *net = ovs_dp_get_net(parms->dp);
- struct ovs_net *ovs_net;
struct vport *vport;
- int err;
-
- err = gre_init();
- if (err)
- return ERR_PTR(err);
-
- ovs_net = net_generic(net, ovs_net_id);
- if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
- vport = ERR_PTR(-EEXIST);
- goto error;
- }
- vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
+ vport = gre_tnl_create(parms);
if (IS_ERR(vport))
- goto error;
-
- strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
- rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
- return vport;
-
-error:
- gre_exit();
- return vport;
-}
-
-static void gre_tnl_destroy(struct vport *vport)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct ovs_net *ovs_net;
-
- ovs_net = net_generic(net, ovs_net_id);
+ return vport;
- RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL);
- ovs_vport_deferred_free(vport);
- gre_exit();
+ return ovs_netdev_link(vport, parms->name);
}
static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct dp_upcall_info *upcall)
{
- return ovs_tunnel_get_egress_info(egress_tun_info,
- ovs_dp_get_net(vport->dp),
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_GRE, skb->mark, 0, 0);
+ return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
+ skb, IPPROTO_GRE, 0, 0);
}
static struct vport_ops ovs_gre_vport_ops = {
.type = OVS_VPORT_TYPE_GRE,
.create = gre_create,
- .destroy = gre_tnl_destroy,
- .get_name = gre_get_name,
- .send = gre_tnl_send,
+ .send = gre_fb_xmit,
.get_egress_tun_info = gre_get_egress_tun_info,
+ .destroy = ovs_netdev_tunnel_destroy,
.owner = THIS_MODULE,
};
@@ -317,4 +116,3 @@ module_exit(ovs_gre_tnl_exit);
MODULE_DESCRIPTION("OVS: GRE switching port");
MODULE_LICENSE("GPL");
MODULE_ALIAS("vport-type-3");
-#endif
diff --git a/datapath/vport-internal_dev.c b/datapath/vport-internal_dev.c
index f38f9be07..7f216792b 100644
--- a/datapath/vport-internal_dev.c
+++ b/datapath/vport-internal_dev.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -22,15 +22,16 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
-#include <linux/netdev_features.h>
#include <linux/skbuff.h>
-#include <linux/version.h>
+#include <linux/percpu.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/netdev_features.h>
#include <net/dst.h>
#include <net/xfrm.h>
+#include <net/rtnetlink.h>
#include "datapath.h"
-#include "vlan.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
@@ -45,42 +46,30 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev)
return netdev_priv(netdev);
}
-/* This function is only called by the kernel network layer.*/
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
-static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev,
- struct rtnl_link_stats64 *stats)
-{
-#else
-static struct net_device_stats *internal_dev_sys_stats(struct net_device *netdev)
-{
- struct net_device_stats *stats = &netdev->stats;
-#endif
- struct vport *vport = ovs_internal_dev_get_vport(netdev);
- struct ovs_vport_stats vport_stats;
-
- ovs_vport_get_stats(vport, &vport_stats);
-
- /* The tx and rx stats need to be swapped because the
- * switch and host OS have opposite perspectives.
- */
- stats->rx_packets = vport_stats.tx_packets;
- stats->tx_packets = vport_stats.rx_packets;
- stats->rx_bytes = vport_stats.tx_bytes;
- stats->tx_bytes = vport_stats.rx_bytes;
- stats->rx_errors = vport_stats.tx_errors;
- stats->tx_errors = vport_stats.rx_errors;
- stats->rx_dropped = vport_stats.tx_dropped;
- stats->tx_dropped = vport_stats.rx_dropped;
-
- return stats;
-}
-
/* Called with rcu_read_lock_bh. */
static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
+ int len, err;
+
+ len = skb->len;
rcu_read_lock();
- ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
+ err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
rcu_read_unlock();
+
+ if (likely(!err)) {
+#ifdef HAVE_DEV_TSTATS
+ struct pcpu_sw_netstats *tstats;
+
+ tstats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)netdev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += len;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+#endif
+ } else {
+ netdev->stats.tx_errors++;
+ }
return 0;
}
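
internal_dev_xmit() bumps per-CPU counters inside a u64_stats_update_begin()/end() pair: the writer makes the sequence number odd for the duration of the update so a 32-bit reader can detect and retry a torn 64-bit read. A simplified userspace model (the kernel version uses seqcount primitives with the proper barriers; plain SEQ_CST atomics stand in for them here):

    #include <stdint.h>
    #include <stdio.h>

    struct sw_stats {
            unsigned int seq;
            uint64_t tx_bytes;
            uint64_t tx_packets;
    };

    static void stats_add(struct sw_stats *s, uint64_t len)
    {
            __atomic_add_fetch(&s->seq, 1, __ATOMIC_SEQ_CST);  /* odd: in progress */
            s->tx_bytes += len;
            s->tx_packets++;
            __atomic_add_fetch(&s->seq, 1, __ATOMIC_SEQ_CST);  /* even: consistent */
    }

    static uint64_t stats_read_bytes(struct sw_stats *s)
    {
            unsigned int start;
            uint64_t v;

            do {
                    do {
                            start = __atomic_load_n(&s->seq, __ATOMIC_SEQ_CST);
                    } while (start & 1);        /* writer in progress: retry */
                    v = s->tx_bytes;
            } while (__atomic_load_n(&s->seq, __ATOMIC_SEQ_CST) != start);
            return v;
    }

    int main(void)
    {
            struct sw_stats s = { 0 };

            stats_add(&s, 1500);
            printf("tx_bytes=%llu\n", (unsigned long long)stats_read_bytes(&s));
            return 0;
    }
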
@@ -132,17 +121,32 @@ static void internal_dev_destructor(struct net_device *dev)
free_netdev(dev);
}
+#ifdef HAVE_DEV_TSTATS
+static int internal_dev_init(struct net_device *dev)
+{
+ dev->tstats = (typeof(dev->tstats)) netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+ return 0;
+}
+
+static void internal_dev_uninit(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+}
+#endif
+
static const struct net_device_ops internal_dev_netdev_ops = {
+#ifdef HAVE_DEV_TSTATS
+ .ndo_init = internal_dev_init,
+ .ndo_uninit = internal_dev_uninit,
+ .ndo_get_stats64 = ip_tunnel_get_stats64,
+#endif
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
.ndo_start_xmit = internal_dev_xmit,
.ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
- .ndo_get_stats64 = internal_dev_get_stats,
-#else
- .ndo_get_stats = internal_dev_sys_stats,
-#endif
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -156,7 +160,7 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
- netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
netdev->destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -169,62 +173,56 @@ static void do_setup(struct net_device *netdev)
netdev->vlan_features = netdev->features;
netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
- netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
-#endif
-
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
netdev->hw_enc_features = netdev->features;
#endif
-
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,39)
+ netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
+#endif
eth_hw_addr_random(netdev);
}
static struct vport *internal_dev_create(const struct vport_parms *parms)
{
struct vport *vport;
- struct netdev_vport *netdev_vport;
struct internal_dev *internal_dev;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_internal_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms);
if (IS_ERR(vport)) {
err = PTR_ERR(vport);
goto error;
}
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev),
- parms->name, NET_NAME_UNKNOWN, do_setup);
- if (!netdev_vport->dev) {
+ vport->dev = alloc_netdev(sizeof(struct internal_dev),
+ parms->name, NET_NAME_UNKNOWN, do_setup);
+ if (!vport->dev) {
err = -ENOMEM;
goto error_free_vport;
}
- dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp));
- internal_dev = internal_dev_priv(netdev_vport->dev);
+ dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
+ internal_dev = internal_dev_priv(vport->dev);
internal_dev->vport = vport;
/* Restrict bridge port to current netns. */
if (vport->port_no == OVSP_LOCAL)
- netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL;
+ vport->dev->features |= NETIF_F_NETNS_LOCAL;
rtnl_lock();
- err = register_netdevice(netdev_vport->dev);
+ err = register_netdevice(vport->dev);
if (err)
goto error_free_netdev;
- dev_set_promiscuity(netdev_vport->dev, 1);
+ dev_set_promiscuity(vport->dev, 1);
rtnl_unlock();
- netif_start_queue(netdev_vport->dev);
+ netif_start_queue(vport->dev);
return vport;
error_free_netdev:
rtnl_unlock();
- free_netdev(netdev_vport->dev);
+ free_netdev(vport->dev);
error_free_vport:
ovs_vport_free(vport);
error:
@@ -233,26 +231,27 @@ error:
static void internal_dev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
- netif_stop_queue(netdev_vport->dev);
+ netif_stop_queue(vport->dev);
rtnl_lock();
- dev_set_promiscuity(netdev_vport->dev, -1);
+ dev_set_promiscuity(vport->dev, -1);
/* unregister_netdevice() waits for an RCU grace period. */
- unregister_netdevice(netdev_vport->dev);
+ unregister_netdevice(vport->dev);
rtnl_unlock();
}
-static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
+static netdev_tx_t internal_dev_recv(struct sk_buff *skb)
{
- struct net_device *netdev = netdev_vport_priv(vport)->dev;
- int len;
+ struct net_device *netdev = skb->dev;
+#ifdef HAVE_DEV_TSTATS
+ struct pcpu_sw_netstats *stats;
+#endif
if (unlikely(!(netdev->flags & IFF_UP))) {
kfree_skb(skb);
- return 0;
+ netdev->stats.rx_dropped++;
+ return NETDEV_TX_OK;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
@@ -260,7 +259,7 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
if (unlikely(!vlan_insert_tag_set_proto(skb,
skb->vlan_proto,
skb_vlan_tag_get(skb))))
- return 0;
+ return NETDEV_TX_OK;
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_add(skb->csum,
@@ -271,27 +270,30 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb)
}
#endif
- len = skb->len;
-
skb_dst_drop(skb);
nf_reset(skb);
secpath_reset(skb);
- skb->dev = netdev;
skb->pkt_type = PACKET_HOST;
skb->protocol = eth_type_trans(skb, netdev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
- netif_rx(skb);
+#ifdef HAVE_DEV_TSTATS
+ stats = this_cpu_ptr((struct pcpu_sw_netstats __percpu *)netdev->tstats);
+ u64_stats_update_begin(&stats->syncp);
+ stats->rx_packets++;
+ stats->rx_bytes += skb->len;
+ u64_stats_update_end(&stats->syncp);
+#endif
- return len;
+ netif_rx(skb);
+ return NETDEV_TX_OK;
}
static struct vport_ops ovs_internal_vport_ops = {
.type = OVS_VPORT_TYPE_INTERNAL,
.create = internal_dev_create,
.destroy = internal_dev_destroy,
- .get_name = ovs_netdev_get_name,
.send = internal_dev_recv,
};
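
The HAVE_DEV_TSTATS path above leans on the kernel's per-CPU software
stats helpers. A minimal sketch of the idiom, outside OVS (the driver
functions are hypothetical; the stats API calls are the real kernel ones):

#include <linux/netdevice.h>
#include <linux/u64_stats_sync.h>

/* Allocate one pcpu_sw_netstats per CPU at ndo_init time. */
static int example_dev_init(struct net_device *dev)
{
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;
	return 0;
}

/* Count one received packet; syncp lets readers fetch consistent
 * 64-bit counters without locks, even on 32-bit hosts. */
static void example_count_rx(struct net_device *dev, unsigned int len)
{
	struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats);

	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += len;
	u64_stats_update_end(&stats->syncp);
}

ip_tunnel_get_stats64, wired in as .ndo_get_stats64 above, folds these
per-CPU counters back together when userspace reads the stats.
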
diff --git a/datapath/vport-lisp.c b/datapath/vport-lisp.c
index 104a21d66..e6c00facd 100644
--- a/datapath/vport-lisp.c
+++ b/datapath/vport-lisp.c
@@ -1,332 +1,67 @@
/*
- * Copyright (c) 2011 Nicira, Inc.
- * Copyright (c) 2013 Cisco Systems, Inc.
+ * Copyright (c) 2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
- * 02110-1301, USA
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/version.h>
-
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/net.h>
-#include <linux/module.h>
#include <linux/rculist.h>
#include <linux/udp.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <net/lisp.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/udp.h>
-#include <net/udp_tunnel.h>
#include <net/xfrm.h>
#include "datapath.h"
-#include "gso.h"
#include "vport.h"
+#include "vport-netdev.h"
-/*
- * LISP encapsulation header:
- *
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * |N|L|E|V|I|flags| Nonce/Map-Version |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- * | Instance ID/Locator Status Bits |
- * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
- *
- */
-
-/**
- * struct lisphdr - LISP header
- * @nonce_present: Flag indicating the presence of a 24 bit nonce value.
- * @locator_status_bits_present: Flag indicating the presence of Locator Status
- * Bits (LSB).
- * @solicit_echo_nonce: Flag indicating the use of the echo noncing mechanism.
- * @map_version_present: Flag indicating the use of mapping versioning.
- * @instance_id_present: Flag indicating the presence of a 24 bit Instance ID.
- * @reserved_flags: 3 bits reserved for future flags.
- * @nonce: 24 bit nonce value.
- * @map_version: 24 bit mapping version.
- * @locator_status_bits: Locator Status Bits: 32 bits when instance_id_present
- * is not set, 8 bits when it is.
- * @instance_id: 24 bit Instance ID
- */
-struct lisphdr {
-#ifdef __LITTLE_ENDIAN_BITFIELD
- __u8 reserved_flags:3;
- __u8 instance_id_present:1;
- __u8 map_version_present:1;
- __u8 solicit_echo_nonce:1;
- __u8 locator_status_bits_present:1;
- __u8 nonce_present:1;
-#else
- __u8 nonce_present:1;
- __u8 locator_status_bits_present:1;
- __u8 solicit_echo_nonce:1;
- __u8 map_version_present:1;
- __u8 instance_id_present:1;
- __u8 reserved_flags:3;
-#endif
- union {
- __u8 nonce[3];
- __u8 map_version[3];
- } u1;
- union {
- __be32 locator_status_bits;
- struct {
- __u8 instance_id[3];
- __u8 locator_status_bits;
- } word2;
- } u2;
-};
-
-#define LISP_HLEN (sizeof(struct udphdr) + sizeof(struct lisphdr))
-
+static struct vport_ops ovs_lisp_vport_ops;
/**
* struct lisp_port - Keeps track of open UDP ports
- * @dst_port: lisp UDP port no.
- * @list: list element in @lisp_ports.
- * @lisp_rcv_socket: The socket created for this port number.
- * @name: vport name.
+ * @dst_port: destination port.
*/
struct lisp_port {
- __be16 dst_port;
- struct list_head list;
- struct socket *lisp_rcv_socket;
- char name[IFNAMSIZ];
+ u16 port_no;
};
-static LIST_HEAD(lisp_ports);
-static struct vport_ops ovs_lisp_vport_ops;
-
static inline struct lisp_port *lisp_vport(const struct vport *vport)
{
return vport_priv(vport);
}
-static struct lisp_port *lisp_find_port(struct net *net, __be16 port)
-{
- struct lisp_port *lisp_port;
-
- list_for_each_entry_rcu(lisp_port, &lisp_ports, list) {
- if (lisp_port->dst_port == port &&
- net_eq(sock_net(lisp_port->lisp_rcv_socket->sk), net))
- return lisp_port;
- }
-
- return NULL;
-}
-
-static inline struct lisphdr *lisp_hdr(const struct sk_buff *skb)
-{
- return (struct lisphdr *)(udp_hdr(skb) + 1);
-}
-
-/* Convert 64 bit tunnel ID to 24 bit Instance ID. */
-static void tunnel_id_to_instance_id(__be64 tun_id, __u8 *iid)
-{
-
-#ifdef __BIG_ENDIAN
- iid[0] = (__force __u8)(tun_id >> 16);
- iid[1] = (__force __u8)(tun_id >> 8);
- iid[2] = (__force __u8)tun_id;
-#else
- iid[0] = (__force __u8)((__force u64)tun_id >> 40);
- iid[1] = (__force __u8)((__force u64)tun_id >> 48);
- iid[2] = (__force __u8)((__force u64)tun_id >> 56);
-#endif
-}
-
-/* Convert 24 bit Instance ID to 64 bit tunnel ID. */
-static __be64 instance_id_to_tunnel_id(__u8 *iid)
-{
-#ifdef __BIG_ENDIAN
- return (iid[0] << 16) | (iid[1] << 8) | iid[2];
-#else
- return (__force __be64)(((__force u64)iid[0] << 40) |
- ((__force u64)iid[1] << 48) |
- ((__force u64)iid[2] << 56));
-#endif
-}
-
-/* Compute source UDP port for outgoing packet.
- * Currently we use the flow hash.
- */
-static u16 get_src_port(struct net *net, struct sk_buff *skb)
-{
- u32 hash = skb_get_hash(skb);
- unsigned int range;
- int high;
- int low;
-
- if (!hash) {
- if (skb->protocol == htons(ETH_P_IP)) {
- struct iphdr *iph;
- int size = (sizeof(iph->saddr) * 2) / sizeof(u32);
-
- iph = (struct iphdr *) skb_network_header(skb);
- hash = jhash2((const u32 *)&iph->saddr, size, 0);
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
- struct ipv6hdr *ipv6hdr;
-
- ipv6hdr = (struct ipv6hdr *) skb_network_header(skb);
- hash = jhash2((const u32 *)&ipv6hdr->saddr,
- (sizeof(struct in6_addr) * 2) / sizeof(u32), 0);
- } else {
- pr_warn_once("LISP inner protocol is not IP when "
- "calculating hash.\n");
- }
- }
-
- inet_get_local_port_range(net, &low, &high);
- range = (high - low) + 1;
- return (((u64) hash * range) >> 32) + low;
-}
-
-static void lisp_build_header(struct sk_buff *skb)
-{
- struct lisphdr *lisph;
- const struct ovs_key_ipv4_tunnel *tun_key;
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
-
- lisph = (struct lisphdr *)__skb_push(skb, sizeof(struct lisphdr));
- lisph->nonce_present = 0; /* We don't support echo nonce algorithm */
- lisph->locator_status_bits_present = 1; /* Set LSB */
- lisph->solicit_echo_nonce = 0; /* No echo noncing */
- lisph->map_version_present = 0; /* No mapping versioning, nonce instead */
- lisph->instance_id_present = 1; /* Store the tun_id as Instance ID */
- lisph->reserved_flags = 0; /* Reserved flags, set to 0 */
-
- lisph->u1.nonce[0] = 0;
- lisph->u1.nonce[1] = 0;
- lisph->u1.nonce[2] = 0;
-
- tunnel_id_to_instance_id(tun_key->tun_id, &lisph->u2.word2.instance_id[0]);
- lisph->u2.word2.locator_status_bits = 1;
-}
-
-/* Called with rcu_read_lock and BH disabled. */
-static int lisp_rcv(struct sock *sk, struct sk_buff *skb)
-{
- struct lisp_port *lisp_port;
- struct lisphdr *lisph;
- struct iphdr *iph, *inner_iph;
- struct ovs_tunnel_info tun_info;
- __be64 key;
- struct ethhdr *ethh;
- __be16 protocol;
-
- lisp_port = rcu_dereference_sk_user_data(sk);
- if (unlikely(!lisp_port))
- goto error;
-
- if (iptunnel_pull_header(skb, LISP_HLEN, 0))
- goto error;
-
- lisph = lisp_hdr(skb);
-
- if (lisph->instance_id_present != 1)
- key = 0;
- else
- key = instance_id_to_tunnel_id(&lisph->u2.word2.instance_id[0]);
-
- /* Save outer tunnel values */
- iph = ip_hdr(skb);
- ovs_flow_tun_info_init(&tun_info, iph,
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, TUNNEL_KEY, NULL, 0);
-
- /* Drop non-IP inner packets */
- inner_iph = (struct iphdr *)(lisph + 1);
- switch (inner_iph->version) {
- case 4:
- protocol = htons(ETH_P_IP);
- break;
- case 6:
- protocol = htons(ETH_P_IPV6);
- break;
- default:
- goto error;
- }
- skb->protocol = protocol;
-
- /* Add Ethernet header */
- ethh = (struct ethhdr *)skb_push(skb, ETH_HLEN);
- memset(ethh, 0, ETH_HLEN);
- ethh->h_dest[0] = 0x02;
- ethh->h_source[0] = 0x02;
- ethh->h_proto = protocol;
-
- ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
-
- ovs_vport_receive(vport_from_priv(lisp_port), skb, &tun_info);
- goto out;
-
-error:
- kfree_skb(skb);
-out:
- return 0;
-}
-
-static int lisp_socket_init(struct lisp_port *lisp_port, struct net *net)
-{
- struct udp_port_cfg udp_conf;
- struct udp_tunnel_sock_cfg tunnel_cfg;
- int err;
-
- memset(&udp_conf, 0, sizeof(udp_conf));
-
- udp_conf.family = AF_INET;
- udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
- udp_conf.local_udp_port = lisp_port->dst_port;
-
- err = udp_sock_create(net, &udp_conf, &lisp_port->lisp_rcv_socket);
- if (err < 0) {
- pr_warn("cannot register lisp protocol handler: %d\n", err);
- return err;
- }
-
- tunnel_cfg.sk_user_data = lisp_port;
- tunnel_cfg.encap_type = 1;
- tunnel_cfg.encap_rcv = lisp_rcv;
- tunnel_cfg.encap_destroy = NULL;
-
- setup_udp_tunnel_sock(net, lisp_port->lisp_rcv_socket, &tunnel_cfg);
-
- return 0;
-}
-
-static int lisp_get_options(const struct vport *vport, struct sk_buff *skb)
+static int lisp_get_options(const struct vport *vport,
+ struct sk_buff *skb)
{
struct lisp_port *lisp_port = lisp_vport(vport);
- if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(lisp_port->dst_port)))
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, lisp_port->port_no))
return -EMSGSIZE;
return 0;
}
-static void lisp_tnl_destroy(struct vport *vport)
+static int lisp_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct dp_upcall_info *upcall)
{
struct lisp_port *lisp_port = lisp_vport(vport);
+ struct net *net = ovs_dp_get_net(vport->dp);
+ __be16 dport = htons(lisp_port->port_no);
+ __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
- list_del_rcu(&lisp_port->list);
- udp_tunnel_sock_release(lisp_port->lisp_rcv_socket);
- ovs_vport_deferred_free(vport);
+ return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
+ skb, IPPROTO_UDP, sport, dport);
}
static struct vport *lisp_tnl_create(const struct vport_parms *parms)
@@ -334,10 +69,11 @@ static struct vport *lisp_tnl_create(const struct vport_parms *parms)
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
struct lisp_port *lisp_port;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- int err;
u16 dst_port;
+ int err;
if (!options) {
err = -EINVAL;
@@ -353,158 +89,48 @@ static struct vport *lisp_tnl_create(const struct vport_parms *parms)
goto error;
}
- /* Verify if we already have a socket created for this port */
- if (lisp_find_port(net, htons(dst_port))) {
- err = -EEXIST;
- goto error;
- }
-
vport = ovs_vport_alloc(sizeof(struct lisp_port),
&ovs_lisp_vport_ops, parms);
if (IS_ERR(vport))
return vport;
lisp_port = lisp_vport(vport);
- lisp_port->dst_port = htons(dst_port);
- strncpy(lisp_port->name, parms->name, IFNAMSIZ);
-
- err = lisp_socket_init(lisp_port, net);
- if (err)
- goto error_free;
+ lisp_port->port_no = dst_port;
+
+ rtnl_lock();
+ dev = lisp_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
+ ovs_vport_free(vport);
+ return ERR_CAST(dev);
+ }
- list_add_tail_rcu(&lisp_port->list, &lisp_ports);
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
-
-error_free:
- ovs_vport_free(vport);
error:
return ERR_PTR(err);
}
-static int lisp_send(struct vport *vport, struct sk_buff *skb)
-{
- struct ovs_key_ipv4_tunnel *tun_key;
- struct lisp_port *lisp_port = lisp_vport(vport);
- struct net *net = ovs_dp_get_net(vport->dp);
- int network_offset = skb_network_offset(skb);
- struct rtable *rt;
- int min_headroom;
- __be32 saddr;
- __be16 src_port, dst_port;
- __be16 df;
- int sent_len;
- int err;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
-
- if (skb->protocol != htons(ETH_P_IP) &&
- skb->protocol != htons(ETH_P_IPV6)) {
- err = 0;
- goto error;
- }
-
- /* Route lookup */
- saddr = tun_key->ipv4_src;
- rt = find_route(ovs_dp_get_net(vport->dp),
- &saddr, tun_key->ipv4_dst,
- IPPROTO_UDP, tun_key->ipv4_tos,
- skb->mark);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- min_headroom = LL_RESERVED_SPACE(rt_dst(rt).dev) + rt_dst(rt).header_len
- + sizeof(struct iphdr) + LISP_HLEN;
-
- if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
- int head_delta = SKB_DATA_ALIGN(min_headroom -
- skb_headroom(skb) +
- 16);
-
- err = pskb_expand_head(skb, max_t(int, head_delta, 0),
- 0, GFP_ATOMIC);
- if (unlikely(err))
- goto err_free_rt;
- }
-
- /* Reset l2 headers. */
- skb_pull(skb, network_offset);
- skb_reset_mac_header(skb);
- vlan_set_tci(skb, 0);
-
- skb = udp_tunnel_handle_offloads(skb, false, false);
- if (IS_ERR(skb)) {
- err = PTR_ERR(skb);
- skb = NULL;
- goto err_free_rt;
- }
-
- src_port = htons(get_src_port(net, skb));
- dst_port = lisp_port->dst_port;
-
- lisp_build_header(skb);
-
- skb->ignore_df = 1;
-
- ovs_skb_set_inner_protocol(skb, skb->protocol);
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- sent_len = udp_tunnel_xmit_skb(rt, lisp_port->lisp_rcv_socket->sk, skb,
- saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos, tun_key->ipv4_ttl,
- df, src_port, dst_port, false, true);
-
- return sent_len > 0 ? sent_len + network_offset : sent_len;
-
-err_free_rt:
- ip_rt_put(rt);
-error:
- kfree_skb(skb);
- return err;
-}
-
-static const char *lisp_get_name(const struct vport *vport)
+static struct vport *lisp_create(const struct vport_parms *parms)
{
- struct lisp_port *lisp_port = lisp_vport(vport);
- return lisp_port->name;
-}
-
-static int lisp_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct lisp_port *lisp_port = lisp_vport(vport);
+ struct vport *vport;
- if (skb->protocol != htons(ETH_P_IP) &&
- skb->protocol != htons(ETH_P_IPV6)) {
- return -EINVAL;
- }
+ vport = lisp_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
- /*
- * Get tp_src and tp_dst, refert to lisp_build_header().
- */
- return ovs_tunnel_get_egress_info(egress_tun_info, net,
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_UDP, skb->mark,
- htons(get_src_port(net, skb)),
- lisp_port->dst_port);
+ return ovs_netdev_link(vport, parms->name);
}
static struct vport_ops ovs_lisp_vport_ops = {
- .type = OVS_VPORT_TYPE_LISP,
- .create = lisp_tnl_create,
- .destroy = lisp_tnl_destroy,
- .get_name = lisp_get_name,
- .get_options = lisp_get_options,
- .send = lisp_send,
+ .type = OVS_VPORT_TYPE_LISP,
+ .create = lisp_create,
+ .destroy = ovs_netdev_tunnel_destroy,
+ .get_options = lisp_get_options,
+ .send = lisp_xmit,
+ .owner = THIS_MODULE,
.get_egress_tun_info = lisp_get_egress_tun_info,
- .owner = THIS_MODULE,
};
static int __init ovs_lisp_tnl_init(void)
@@ -520,6 +146,6 @@ static void __exit ovs_lisp_tnl_exit(void)
module_init(ovs_lisp_tnl_init);
module_exit(ovs_lisp_tnl_exit);
-MODULE_DESCRIPTION("OVS: LISP switching port");
+MODULE_DESCRIPTION("OVS: Lisp switching port");
MODULE_LICENSE("GPL");
MODULE_ALIAS("vport-type-105");
diff --git a/datapath/vport-netdev.c b/datapath/vport-netdev.c
index 6c8373740..21431d3c3 100644
--- a/datapath/vport-netdev.c
+++ b/datapath/vport-netdev.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -26,32 +26,61 @@
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/openvswitch.h>
-#include <linux/netdevice.h>
-#include <net/llc.h>
+#include <net/ip_tunnels.h>
+#include <net/rtnetlink.h>
#include "datapath.h"
-#include "vlan.h"
+#include "gso.h"
+#include "vport.h"
#include "vport-internal_dev.h"
#include "vport-netdev.h"
static struct vport_ops ovs_netdev_vport_ops;
-static void netdev_port_receive(struct vport *vport, struct sk_buff *skb);
+
+/* Must be called with rcu_read_lock. */
+void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info)
+{
+ struct vport *vport;
+
+ vport = ovs_netdev_get_vport(skb->dev);
+ if (unlikely(!vport))
+ goto error;
+
+ if (unlikely(skb_warn_if_lro(skb)))
+ goto error;
+
+ /* Make our own copy of the packet. Otherwise we will mangle the
+ * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
+ */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return;
+
+ skb_push(skb, ETH_HLEN);
+ ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ ovs_vport_receive(vport, skb, tun_info);
+ return;
+error:
+ kfree_skb(skb);
+}
+
+#ifndef HAVE_METADATA_DST
+#define port_receive(skb) netdev_port_receive(skb, NULL)
+#else
+#define port_receive(skb) netdev_port_receive(skb, skb_tunnel_info(skb))
+#endif
#if defined HAVE_RX_HANDLER_PSKB /* 2.6.39 and above or backports */
/* Called with rcu_read_lock and bottom-halves disabled. */
static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
{
struct sk_buff *skb = *pskb;
- struct vport *vport;
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
- vport = ovs_netdev_get_vport(skb->dev);
-
- netdev_port_receive(vport, skb);
-
+ port_receive(skb);
return RX_HANDLER_CONSUMED;
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) || \
@@ -59,15 +88,10 @@ static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb)
/* Called with rcu_read_lock and bottom-halves disabled. */
static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
{
- struct vport *vport;
-
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return skb;
- vport = ovs_netdev_get_vport(skb->dev);
-
- netdev_port_receive(vport, skb);
-
+ port_receive(skb);
return NULL;
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32)
@@ -79,7 +103,7 @@ static struct sk_buff *netdev_frame_hook(struct sk_buff *skb)
static struct sk_buff *netdev_frame_hook(struct net_bridge_port *p,
struct sk_buff *skb)
{
- netdev_port_receive((struct vport *)p, skb);
+ port_receive(skb);
return NULL;
}
#else
@@ -92,167 +116,112 @@ static struct net_device *get_dpdev(const struct datapath *dp)
local = ovs_vport_ovsl(dp, OVSP_LOCAL);
BUG_ON(!local);
- return netdev_vport_priv(local)->dev;
+ return local->dev;
}
-static struct vport *netdev_create(const struct vport_parms *parms)
+struct vport *ovs_netdev_link(struct vport *vport, const char *name)
{
- struct vport *vport;
- struct netdev_vport *netdev_vport;
int err;
- vport = ovs_vport_alloc(sizeof(struct netdev_vport),
- &ovs_netdev_vport_ops, parms);
- if (IS_ERR(vport)) {
- err = PTR_ERR(vport);
- goto error;
- }
-
- netdev_vport = netdev_vport_priv(vport);
-
- netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name);
- if (!netdev_vport->dev) {
+ vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name);
+ if (!vport->dev) {
err = -ENODEV;
goto error_free_vport;
}
- if (netdev_vport->dev->flags & IFF_LOOPBACK ||
- netdev_vport->dev->type != ARPHRD_ETHER ||
- ovs_is_internal_dev(netdev_vport->dev)) {
+ if (vport->dev->flags & IFF_LOOPBACK ||
+ vport->dev->type != ARPHRD_ETHER ||
+ ovs_is_internal_dev(vport->dev)) {
err = -EINVAL;
goto error_put;
}
rtnl_lock();
- err = netdev_master_upper_dev_link(netdev_vport->dev,
+ err = netdev_master_upper_dev_link(vport->dev,
get_dpdev(vport->dp));
if (err)
goto error_unlock;
- err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook,
+ err = netdev_rx_handler_register(vport->dev, netdev_frame_hook,
vport);
if (err)
goto error_master_upper_dev_unlink;
- dev_disable_lro(netdev_vport->dev);
- dev_set_promiscuity(netdev_vport->dev, 1);
- netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH;
+ dev_disable_lro(vport->dev);
+ dev_set_promiscuity(vport->dev, 1);
+ vport->dev->priv_flags |= IFF_OVS_DATAPATH;
rtnl_unlock();
return vport;
error_master_upper_dev_unlink:
- netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp));
+ netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp));
error_unlock:
rtnl_unlock();
error_put:
- dev_put(netdev_vport->dev);
+ dev_put(vport->dev);
error_free_vport:
ovs_vport_free(vport);
-error:
return ERR_PTR(err);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_link);
+
+static struct vport *netdev_create(const struct vport_parms *parms)
+{
+ struct vport *vport;
+
+ vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms);
+ if (IS_ERR(vport))
+ return vport;
+
+ return ovs_netdev_link(vport, parms->name);
+}
-static void free_port_rcu(struct rcu_head *rcu)
+static void vport_netdev_free(struct rcu_head *rcu)
{
- struct netdev_vport *netdev_vport = container_of(rcu,
- struct netdev_vport, rcu);
+ struct vport *vport = container_of(rcu, struct vport, rcu);
- dev_put(netdev_vport->dev);
- ovs_vport_free(vport_from_priv(netdev_vport));
+ if (vport->dev)
+ dev_put(vport->dev);
+ ovs_vport_free(vport);
}
void ovs_netdev_detach_dev(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
ASSERT_RTNL();
- netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
- netdev_rx_handler_unregister(netdev_vport->dev);
- netdev_upper_dev_unlink(netdev_vport->dev,
- netdev_master_upper_dev_get(netdev_vport->dev));
- dev_set_promiscuity(netdev_vport->dev, -1);
+ vport->dev->priv_flags &= ~IFF_OVS_DATAPATH;
+ netdev_rx_handler_unregister(vport->dev);
+ netdev_upper_dev_unlink(vport->dev,
+ netdev_master_upper_dev_get(vport->dev));
+ dev_set_promiscuity(vport->dev, -1);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev);
static void netdev_destroy(struct vport *vport)
{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
-
rtnl_lock();
- if (ovs_netdev_get_vport(netdev_vport->dev))
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
rtnl_unlock();
- call_rcu(&netdev_vport->rcu, free_port_rcu);
-}
-
-const char *ovs_netdev_get_name(const struct vport *vport)
-{
- const struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- return netdev_vport->dev->name;
-}
-
-/* Must be called with rcu_read_lock. */
-static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
-{
- if (unlikely(!vport))
- goto error;
-
- if (unlikely(skb_warn_if_lro(skb)))
- goto error;
-
- /* Make our own copy of the packet. Otherwise we will mangle the
- * packet for anyone who came before us (e.g. tcpdump via AF_PACKET).
- * (No one comes after us, since we tell handle_bridge() that we took
- * the packet.)
- */
- skb = skb_share_check(skb, GFP_ATOMIC);
- if (unlikely(!skb))
- return;
-
- skb_push(skb, ETH_HLEN);
- ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
-
- ovs_vport_receive(vport, skb, NULL);
- return;
-
-error:
- kfree_skb(skb);
+ call_rcu(&vport->rcu, vport_netdev_free);
}
-static unsigned int packet_length(const struct sk_buff *skb)
+void ovs_netdev_tunnel_destroy(struct vport *vport)
{
- unsigned int length = skb->len - ETH_HLEN;
-
- if (skb->protocol == htons(ETH_P_8021Q))
- length -= VLAN_HLEN;
-
- return length;
-}
-
-static int netdev_send(struct vport *vport, struct sk_buff *skb)
-{
- struct netdev_vport *netdev_vport = netdev_vport_priv(vport);
- int mtu = netdev_vport->dev->mtu;
- int len;
-
- if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
- net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
- netdev_vport->dev->name,
- packet_length(skb), mtu);
- goto drop;
- }
-
- skb->dev = netdev_vport->dev;
- len = skb->len;
- dev_queue_xmit(skb);
+ rtnl_lock();
+ if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
+ ovs_netdev_detach_dev(vport);
- return len;
+ /* Early release so we can unregister the device */
+ dev_put(vport->dev);
+ rtnl_delete_link(vport->dev);
+ vport->dev = NULL;
+ rtnl_unlock();
-drop:
- kfree_skb(skb);
- return 0;
+ call_rcu(&vport->rcu, vport_netdev_free);
}
+EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy);
/* Returns null if this device is not attached to a datapath. */
struct vport *ovs_netdev_get_vport(struct net_device *dev)
@@ -285,8 +254,7 @@ static struct vport_ops ovs_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_NETDEV,
.create = netdev_create,
.destroy = netdev_destroy,
- .get_name = ovs_netdev_get_name,
- .send = netdev_send,
+ .send = dev_queue_xmit,
};
int __init ovs_netdev_init(void)
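
netdev_frame_hook() above is an rx_handler, the same mechanism bridging
and bonding use to steal frames off a device. A stripped-down sketch of
the contract (the handler and attach function are illustrative; the API
calls are the kernel's):

#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/skbuff.h>

static rx_handler_result_t example_frame_hook(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
		return RX_HANDLER_PASS;		/* let the stack keep it */

	/* ...process the packet; we now own the skb... */
	consume_skb(skb);
	return RX_HANDLER_CONSUMED;		/* stack sees nothing */
}

static int example_attach(struct net_device *dev, void *priv)
{
	int err;

	rtnl_lock();				/* registration needs RTNL */
	err = netdev_rx_handler_register(dev, example_frame_hook, priv);
	rtnl_unlock();
	return err;
}

The stack holds rcu_read_lock across the hook, which is why
netdev_port_receive() can look up the vport without taking extra locks.
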
diff --git a/datapath/vport-netdev.h b/datapath/vport-netdev.h
index 6f7038e79..f8fbb8689 100644
--- a/datapath/vport-netdev.h
+++ b/datapath/vport-netdev.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2011 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -26,22 +26,15 @@
struct vport *ovs_netdev_get_vport(struct net_device *dev);
-struct netdev_vport {
- struct rcu_head rcu;
-
- struct net_device *dev;
-};
-
-static inline struct netdev_vport *
-netdev_vport_priv(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-const char *ovs_netdev_get_name(const struct vport *);
+struct vport *ovs_netdev_link(struct vport *vport, const char *name);
+void ovs_netdev_send(struct vport *vport, struct sk_buff *skb);
void ovs_netdev_detach_dev(struct vport *);
int __init ovs_netdev_init(void);
void ovs_netdev_exit(void);
+void ovs_netdev_tunnel_destroy(struct vport *vport);
+
+void netdev_port_receive(struct sk_buff *skb, struct ip_tunnel_info *tun_info);
+
#endif /* vport_netdev.h */
diff --git a/datapath/vport-stt.c b/datapath/vport-stt.c
index 4eb0282fa..9e2079a16 100644
--- a/datapath/vport-stt.c
+++ b/datapath/vport-stt.c
@@ -9,34 +9,34 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/ip.h>
-#include <linux/module.h>
#include <linux/net.h>
#include <linux/rculist.h>
#include <linux/udp.h>
+#include <linux/if_vlan.h>
+#include <linux/module.h>
+#include <net/stt.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
-#include <net/stt.h>
#include <net/udp.h>
+#include <net/xfrm.h>
+#include <net/stt.h>
#include "datapath.h"
#include "vport.h"
+#include "vport-netdev.h"
#ifdef OVS_STT
static struct vport_ops ovs_stt_vport_ops;
-
/**
- * struct stt_port
- * @stt_sock: The socket created for this port number.
- * @name: vport name.
+ * struct stt_port - Keeps track of open UDP ports
+ * @dst_port: destination port.
*/
struct stt_port {
- struct stt_sock *stt_sock;
- char name[IFNAMSIZ];
+ u16 port_no;
};
static inline struct stt_port *stt_vport(const struct vport *vport)
@@ -44,42 +44,26 @@ static inline struct stt_port *stt_vport(const struct vport *vport)
return vport_priv(vport);
}
-static void stt_rcv(struct stt_sock *stt_sock, struct sk_buff *skb)
-{
- struct vport *vport = stt_sock->rcv_data;
- struct stthdr *stth = stt_hdr(skb);
- struct ovs_tunnel_info tun_info;
- struct sk_buff *next;
-
- ovs_flow_tun_info_init(&tun_info, ip_hdr(skb),
- tcp_hdr(skb)->source, tcp_hdr(skb)->dest,
- get_unaligned(&stth->key),
- TUNNEL_KEY | TUNNEL_CSUM,
- NULL, 0);
- do {
- next = skb->next;
- skb->next = NULL;
- ovs_vport_receive(vport, skb, &tun_info);
- } while ((skb = next));
-}
-
-static int stt_tnl_get_options(const struct vport *vport,
- struct sk_buff *skb)
+static int stt_get_options(const struct vport *vport,
+ struct sk_buff *skb)
{
struct stt_port *stt_port = stt_vport(vport);
- struct inet_sock *sk = inet_sk(stt_port->stt_sock->sock->sk);
- if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport)))
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, stt_port->port_no))
return -EMSGSIZE;
return 0;
}
-static void stt_tnl_destroy(struct vport *vport)
+static int stt_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
+ struct dp_upcall_info *upcall)
{
struct stt_port *stt_port = stt_vport(vport);
+ struct net *net = ovs_dp_get_net(vport->dp);
+ __be16 dport = htons(stt_port->port_no);
+ __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
- stt_sock_release(stt_port->stt_sock);
- ovs_vport_deferred_free(vport);
+ return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp),
+ skb, IPPROTO_UDP, sport, dport);
}
static struct vport *stt_tnl_create(const struct vport_parms *parms)
@@ -87,11 +71,11 @@ static struct vport *stt_tnl_create(const struct vport_parms *parms)
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
struct stt_port *stt_port;
- struct stt_sock *stt_sock;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- int err;
u16 dst_port;
+ int err;
if (!options) {
err = -EINVAL;
@@ -113,113 +97,52 @@ static struct vport *stt_tnl_create(const struct vport_parms *parms)
return vport;
stt_port = stt_vport(vport);
- strncpy(stt_port->name, parms->name, IFNAMSIZ);
+ stt_port->port_no = dst_port;
- stt_sock = stt_sock_add(net, htons(dst_port), stt_rcv, vport);
- if (IS_ERR(stt_sock)) {
+ rtnl_lock();
+ dev = stt_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
ovs_vport_free(vport);
- return ERR_CAST(stt_sock);
+ return ERR_CAST(dev);
}
- stt_port->stt_sock = stt_sock;
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
error:
return ERR_PTR(err);
}
-static int stt_tnl_send(struct vport *vport, struct sk_buff *skb)
-{
- struct net *net = ovs_dp_get_net(vport->dp);
- struct stt_port *stt_port = stt_vport(vport);
- __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport;
- const struct ovs_key_ipv4_tunnel *tun_key;
- const struct ovs_tunnel_info *tun_info;
- struct rtable *rt;
- __be16 sport;
- __be32 saddr;
- __be16 df;
- int err;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- if (unlikely(!tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &tun_info->tunnel;
- /* Route lookup */
- saddr = tun_key->ipv4_src;
- rt = find_route(ovs_dp_get_net(vport->dp),
- &saddr, tun_key->ipv4_dst,
- IPPROTO_TCP, tun_key->ipv4_tos,
- skb->mark);
-
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
-
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
- skb->ignore_df = 1;
-
- return stt_xmit_skb(skb, rt, saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos, tun_key->ipv4_ttl,
- df, sport, dport, tun_key->tun_id);
-error:
- kfree_skb(skb);
- return err;
-}
-
-static const char *stt_tnl_get_name(const struct vport *vport)
+static struct vport *stt_create(const struct vport_parms *parms)
{
- return stt_vport(vport)->name;
-}
+ struct vport *vport;
-static int stt_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
-{
- struct stt_port *stt_port = stt_vport(vport);
- struct net *net = ovs_dp_get_net(vport->dp);
- __be16 dport = inet_sk(stt_port->stt_sock->sock->sk)->inet_sport;
- __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true);
+ vport = stt_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
- /* Get tp_src and tp_dst, refert to stt_build_header().
- */
- return ovs_tunnel_get_egress_info(egress_tun_info,
- ovs_dp_get_net(vport->dp),
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_TCP, skb->mark, sport, dport);
+ return ovs_netdev_link(vport, parms->name);
}
static struct vport_ops ovs_stt_vport_ops = {
- .type = OVS_VPORT_TYPE_STT,
- .create = stt_tnl_create,
- .destroy = stt_tnl_destroy,
- .get_name = stt_tnl_get_name,
- .get_options = stt_tnl_get_options,
- .send = stt_tnl_send,
+ .type = OVS_VPORT_TYPE_STT,
+ .create = stt_create,
+ .destroy = ovs_netdev_tunnel_destroy,
+ .get_options = stt_get_options,
+ .send = ovs_stt_xmit,
+ .owner = THIS_MODULE,
.get_egress_tun_info = stt_get_egress_tun_info,
- .owner = THIS_MODULE,
};
static int __init ovs_stt_tnl_init(void)
{
- int err;
-
- err = stt_init_module();
- if (err)
- return err;
- err = ovs_vport_ops_register(&ovs_stt_vport_ops);
- if (err)
- stt_cleanup_module();
- return err;
+ return ovs_vport_ops_register(&ovs_stt_vport_ops);
}
static void __exit ovs_stt_tnl_exit(void)
{
ovs_vport_ops_unregister(&ovs_stt_vport_ops);
- stt_cleanup_module();
}
module_init(ovs_stt_tnl_init);
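
lisp_tnl_create() and stt_tnl_create() above share the same
option-parsing prologue, and vxlan_tnl_create() below repeats it. A
condensed sketch of just that step (the helper name is made up):

#include <net/netlink.h>
#include <linux/openvswitch.h>

static int example_parse_dst_port(struct nlattr *options, u16 *dst_port)
{
	struct nlattr *a;

	if (!options)
		return -EINVAL;

	a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
	if (!a || nla_len(a) != sizeof(u16))
		return -EINVAL;	/* userspace must supply a destination port */

	*dst_port = nla_get_u16(a);
	return 0;
}
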
diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
index fc9f350d2..66b79f4db 100644
--- a/datapath/vport-vxlan.c
+++ b/datapath/vport-vxlan.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2013 Nicira, Inc.
+ * Copyright (c) 2015 Nicira, Inc.
* Copyright (c) 2013 Cisco Systems, Inc.
*
* This program is free software; you can redistribute it and/or
@@ -17,95 +17,37 @@
* 02110-1301, USA
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/version.h>
-
-#include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/net.h>
-#include <linux/rculist.h>
-#include <linux/udp.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/openvswitch.h>
#include <linux/module.h>
-
-#include <net/icmp.h>
-#include <net/ip.h>
#include <net/udp.h>
#include <net/ip_tunnels.h>
#include <net/rtnetlink.h>
-#include <net/route.h>
-#include <net/dsfield.h>
-#include <net/inet_ecn.h>
-#include <net/net_namespace.h>
-#include <net/netns/generic.h>
#include <net/vxlan.h>
#include "datapath.h"
#include "vport.h"
-#include "vport-vxlan.h"
-
-/**
- * struct vxlan_port - Keeps track of open UDP ports
- * @vs: vxlan_sock created for the port.
- * @name: vport name.
- */
-struct vxlan_port {
- struct vxlan_sock *vs;
- char name[IFNAMSIZ];
- u32 exts; /* VXLAN_F_* in <net/vxlan.h> */
-};
+#include "vport-netdev.h"
-static struct vport_ops ovs_vxlan_vport_ops;
-
-static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
-{
- return vport_priv(vport);
-}
-
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb,
- struct vxlan_metadata *md)
-{
- struct ovs_tunnel_info tun_info;
- struct vxlan_port *vxlan_port;
- struct vport *vport = vs->data;
- struct iphdr *iph;
- struct ovs_vxlan_opts opts = {
- .gbp = md->gbp,
- };
- __be64 key;
- __be16 flags;
-
- flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0);
- vxlan_port = vxlan_vport(vport);
- if (vxlan_port->exts & VXLAN_F_GBP && md->gbp)
- flags |= TUNNEL_VXLAN_OPT;
-
- /* Save outer tunnel values */
- iph = ip_hdr(skb);
- key = cpu_to_be64(ntohl(md->vni) >> 8);
- ovs_flow_tun_info_init(&tun_info, iph,
- udp_hdr(skb)->source, udp_hdr(skb)->dest,
- key, flags, &opts, sizeof(opts));
-
- ovs_vport_receive(vport, skb, &tun_info);
-}
+static struct vport_ops ovs_vxlan_netdev_vport_ops;
static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sport(vxlan_port->vs->sock->sk);
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
+ __be16 dst_port = vxlan->cfg.dst_port;
if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port)))
return -EMSGSIZE;
- if (vxlan_port->exts) {
+ if (vxlan->flags & VXLAN_F_GBP) {
struct nlattr *exts;
exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
if (!exts)
return -EMSGSIZE;
- if (vxlan_port->exts & VXLAN_F_GBP &&
+ if (vxlan->flags & VXLAN_F_GBP &&
nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
return -EMSGSIZE;
@@ -115,23 +57,14 @@ static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb)
return 0;
}
-static void vxlan_tnl_destroy(struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
-
- vxlan_sock_release(vxlan_port->vs);
-
- ovs_vport_deferred_free(vport);
-}
-
-static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = {
+static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = {
[OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, },
};
-static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
+static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr,
+ struct vxlan_config *conf)
{
- struct nlattr *exts[OVS_VXLAN_EXT_MAX+1];
- struct vxlan_port *vxlan_port;
+ struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1];
int err;
if (nla_len(attr) < sizeof(struct nlattr))
@@ -141,10 +74,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr)
if (err < 0)
return err;
- vxlan_port = vxlan_vport(vport);
-
if (exts[OVS_VXLAN_EXT_GBP])
- vxlan_port->exts |= VXLAN_F_GBP;
+ conf->flags |= VXLAN_F_GBP;
return 0;
}
@@ -153,168 +84,103 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
struct nlattr *options = parms->options;
- struct vxlan_port *vxlan_port;
- struct vxlan_sock *vs;
+ struct net_device *dev;
struct vport *vport;
struct nlattr *a;
- u16 dst_port;
int err;
+ struct vxlan_config conf = {
+ .no_share = true,
+ .flags = VXLAN_F_COLLECT_METADATA,
+ };
if (!options) {
err = -EINVAL;
goto error;
}
+
a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT);
if (a && nla_len(a) == sizeof(u16)) {
- dst_port = nla_get_u16(a);
+ conf.dst_port = htons(nla_get_u16(a));
} else {
/* Require destination port from userspace. */
err = -EINVAL;
goto error;
}
- vport = ovs_vport_alloc(sizeof(struct vxlan_port),
- &ovs_vxlan_vport_ops, parms);
+ vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms);
if (IS_ERR(vport))
return vport;
- vxlan_port = vxlan_vport(vport);
- strncpy(vxlan_port->name, parms->name, IFNAMSIZ);
-
a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION);
if (a) {
- err = vxlan_configure_exts(vport, a);
+ err = vxlan_configure_exts(vport, a, &conf);
if (err) {
ovs_vport_free(vport);
goto error;
}
}
- vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true,
- vxlan_port->exts);
- if (IS_ERR(vs)) {
+ rtnl_lock();
+ dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf);
+ if (IS_ERR(dev)) {
+ rtnl_unlock();
ovs_vport_free(vport);
- return (void *)vs;
+ return ERR_CAST(dev);
}
- vxlan_port->vs = vs;
+ dev_change_flags(dev, dev->flags | IFF_UP);
+ rtnl_unlock();
return vport;
-
error:
return ERR_PTR(err);
}
-static int vxlan_ext_gbp(struct sk_buff *skb)
-{
- const struct ovs_tunnel_info *tun_info;
- const struct ovs_vxlan_opts *opts;
-
- tun_info = OVS_CB(skb)->egress_tun_info;
- opts = tun_info->options;
-
- if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT &&
- tun_info->options_len >= sizeof(*opts))
- return opts->gbp;
- else
- return 0;
-}
-
-static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
+static struct vport *vxlan_create(const struct vport_parms *parms)
{
- struct ovs_key_ipv4_tunnel *tun_key;
- struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sport(vxlan_port->vs->sock->sk);
- struct vxlan_metadata md = {0};
- struct rtable *rt;
- __be16 src_port;
- __be32 saddr;
- __be16 df;
- int err;
- u32 vxflags;
-
- if (unlikely(!OVS_CB(skb)->egress_tun_info)) {
- err = -EINVAL;
- goto error;
- }
-
- tun_key = &OVS_CB(skb)->egress_tun_info->tunnel;
-
- /* Route lookup */
- saddr = tun_key->ipv4_src;
- rt = find_route(ovs_dp_get_net(vport->dp),
- &saddr, tun_key->ipv4_dst,
- IPPROTO_UDP, tun_key->ipv4_tos,
- skb->mark);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- goto error;
- }
+ struct vport *vport;
- df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- skb->ignore_df = 1;
+ vport = vxlan_tnl_create(parms);
+ if (IS_ERR(vport))
+ return vport;
- src_port = udp_flow_src_port(net, skb, 0, 0, true);
- md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8);
- md.gbp = vxlan_ext_gbp(skb);
- vxflags = vxlan_port->exts |
- (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0);
-
- err = vxlan_xmit_skb(rt, vxlan_port->vs->sock->sk, skb,
- saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos,
- tun_key->ipv4_ttl, df,
- src_port, dst_port,
- &md, false, vxflags);
- if (err < 0)
- ip_rt_put(rt);
- return err;
-error:
- kfree_skb(skb);
- return err;
+ return ovs_netdev_link(vport, parms->name);
}
static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *egress_tun_info)
+ struct dp_upcall_info *upcall)
{
+ struct vxlan_dev *vxlan = netdev_priv(vport->dev);
struct net *net = ovs_dp_get_net(vport->dp);
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- __be16 dst_port = inet_sport(vxlan_port->vs->sock->sk);
+ __be16 dst_port = vxlan_dev_dst_port(vxlan);
__be16 src_port;
+ int port_min;
+ int port_max;
+ inet_get_local_port_range(net, &port_min, &port_max);
src_port = udp_flow_src_port(net, skb, 0, 0, true);
- return ovs_tunnel_get_egress_info(egress_tun_info, net,
- OVS_CB(skb)->egress_tun_info,
- IPPROTO_UDP, skb->mark,
+ return ovs_tunnel_get_egress_info(upcall, net,
+ skb, IPPROTO_UDP,
src_port, dst_port);
}
-static const char *vxlan_get_name(const struct vport *vport)
-{
- struct vxlan_port *vxlan_port = vxlan_vport(vport);
- return vxlan_port->name;
-}
-
-static struct vport_ops ovs_vxlan_vport_ops = {
+static struct vport_ops ovs_vxlan_netdev_vport_ops = {
.type = OVS_VPORT_TYPE_VXLAN,
- .create = vxlan_tnl_create,
- .destroy = vxlan_tnl_destroy,
- .get_name = vxlan_get_name,
+ .create = vxlan_create,
+ .destroy = ovs_netdev_tunnel_destroy,
.get_options = vxlan_get_options,
- .send = vxlan_tnl_send,
+ .send = vxlan_xmit,
.get_egress_tun_info = vxlan_get_egress_tun_info,
- .owner = THIS_MODULE,
};
static int __init ovs_vxlan_tnl_init(void)
{
- return ovs_vport_ops_register(&ovs_vxlan_vport_ops);
+ return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops);
}
static void __exit ovs_vxlan_tnl_exit(void)
{
- ovs_vport_ops_unregister(&ovs_vxlan_vport_ops);
+ ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops);
}
module_init(ovs_vxlan_tnl_init);
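
The nesting that vxlan_get_options() starts above must, per the netlink
API, be committed with nla_nest_end() (the closing call falls in elided
context). A sketch of the whole emit, with a made-up helper name:

static int example_put_gbp_ext(struct sk_buff *skb, u32 vxlan_flags)
{
	struct nlattr *exts;

	exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION);
	if (!exts)
		return -EMSGSIZE;

	if (vxlan_flags & VXLAN_F_GBP &&
	    nla_put_flag(skb, OVS_VXLAN_EXT_GBP))
		return -EMSGSIZE;

	nla_nest_end(skb, exts);	/* commit the nested block */
	return 0;
}
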
diff --git a/datapath/vport-vxlan.h b/datapath/vport-vxlan.h
deleted file mode 100644
index 4b08233e7..000000000
--- a/datapath/vport-vxlan.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef VPORT_VXLAN_H
-#define VPORT_VXLAN_H 1
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-
-struct ovs_vxlan_opts {
- __u32 gbp;
-};
-
-#endif
diff --git a/datapath/vport.c b/datapath/vport.c
index 024491f0f..1e22c6d48 100644
--- a/datapath/vport.c
+++ b/datapath/vport.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -20,26 +20,27 @@
#include <linux/if.h>
#include <linux/if_vlan.h>
#include <linux/jhash.h>
-#include <linux/kconfig.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/compat.h>
-#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/if_link.h>
#include <net/net_namespace.h>
+#include <net/lisp.h>
+#include <net/gre.h>
+#include <net/geneve.h>
+#include <net/vxlan.h>
+#include <net/stt.h>
#include "datapath.h"
#include "gso.h"
#include "vport.h"
#include "vport-internal_dev.h"
-static void ovs_vport_record_error(struct vport *,
- enum vport_err_type err_type);
-
static LIST_HEAD(vport_ops_list);
/* Protected by RCU read lock for reading, ovs_mutex for writing. */
@@ -53,12 +54,42 @@ static struct hlist_head *dev_table;
*/
int ovs_vport_init(void)
{
+ int err;
+
dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head),
GFP_KERNEL);
if (!dev_table)
return -ENOMEM;
+ err = lisp_init_module();
+ if (err)
+ goto err_lisp;
+ err = ipgre_init();
+ if (err)
+ goto err_gre;
+ err = geneve_init_module();
+ if (err)
+ goto err_geneve;
+
+ err = vxlan_init_module();
+ if (err)
+ goto err_vxlan;
+ err = ovs_stt_init_module();
+ if (err)
+ goto err_stt;
return 0;
+
+err_stt:
+ vxlan_cleanup_module();
+err_vxlan:
+ geneve_cleanup_module();
+err_geneve:
+ ipgre_fini();
+err_gre:
+ lisp_cleanup_module();
+err_lisp:
+ kfree(dev_table);
+ return err;
}
/**
@@ -68,6 +99,11 @@ int ovs_vport_init(void)
*/
void ovs_vport_exit(void)
{
+ ovs_stt_cleanup_module();
+ vxlan_cleanup_module();
+ geneve_cleanup_module();
+ ipgre_fini();
+ lisp_cleanup_module();
kfree(dev_table);
}
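
The error-unwind ladder ovs_vport_init() grows above is the standard
kernel pattern: each failure label undoes only the steps that already
succeeded, in reverse order. The generic shape, with placeholder
subsystems (foo/bar are hypothetical):

static int example_init(void)
{
	int err;

	err = foo_init();		/* first subsystem */
	if (err)
		return err;		/* nothing to unwind yet */

	err = bar_init();		/* second subsystem */
	if (err)
		goto err_bar;

	return 0;

err_bar:
	foo_cleanup();			/* undo foo, the only success so far */
	return err;
}

ovs_vport_exit() then mirrors the same teardown unconditionally, in the
same reverse order.
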
@@ -84,8 +120,8 @@ int ovs_vport_ops_register(struct vport_ops *ops)
ovs_lock();
list_for_each_entry(o, &vport_ops_list, list)
- if (ops->type == o->type)
- goto errout;
+ if (ops->type == o->type)
+ goto errout;
list_add_tail(&ops->list, &vport_ops_list);
err = 0;
@@ -116,7 +152,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
struct vport *vport;
hlist_for_each_entry_rcu(vport, bucket, hash_node)
- if (!strcmp(name, vport->ops->get_name(vport)) &&
+ if (!strcmp(name, ovs_vport_name(vport)) &&
net_eq(ovs_dp_get_net(vport->dp), net))
return vport;
@@ -132,10 +168,10 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name)
* Allocate and initialize a new vport defined by @ops. The vport will contain
* a private data area of size @priv_size that can be accessed using
* vport_priv(). vports that are no longer needed should be released with
- * ovs_vport_free().
+ * vport_free().
*/
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
- const struct vport_parms *parms)
+ const struct vport_parms *parms)
{
struct vport *vport;
size_t alloc_size;
@@ -160,45 +196,41 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops,
return ERR_PTR(-EINVAL);
}
- vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
- if (!vport->percpu_stats) {
- kfree(vport);
- return ERR_PTR(-ENOMEM);
- }
-
return vport;
}
EXPORT_SYMBOL_GPL(ovs_vport_alloc);
-static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms)
-{
- struct vport_ops *ops;
-
- list_for_each_entry(ops, &vport_ops_list, list)
- if (ops->type == parms->type)
- return ops;
-
- return NULL;
-}
-
/**
* ovs_vport_free - uninitialize and free vport
*
* @vport: vport to free
*
- * Frees a vport allocated with ovs_vport_alloc() when it is no longer needed.
+ * Frees a vport allocated with vport_alloc() when it is no longer needed.
*
* The caller must ensure that an RCU grace period has passed since the last
* time @vport was in a datapath.
*/
void ovs_vport_free(struct vport *vport)
{
+ /* vport is freed from an RCU callback or an error path; therefore
+ * it is safe to use a raw dereference.
+ */
kfree(rcu_dereference_raw(vport->upcall_portids));
- free_percpu(vport->percpu_stats);
kfree(vport);
}
EXPORT_SYMBOL_GPL(ovs_vport_free);
+static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms)
+{
+ struct vport_ops *ops;
+
+ list_for_each_entry(ops, &vport_ops_list, list)
+ if (ops->type == parms->type)
+ return ops;
+
+ return NULL;
+}
+
/**
* ovs_vport_add - add vport device (for kernel callers)
*
@@ -226,7 +258,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms)
}
bucket = hash_bucket(ovs_dp_get_net(vport->dp),
- vport->ops->get_name(vport));
+ ovs_vport_name(vport));
hlist_add_head_rcu(&vport->hash_node, bucket);
return vport;
}
@@ -290,45 +322,19 @@ void ovs_vport_del(struct vport *vport)
*/
void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats)
{
- int i;
-
- /* We potentially have two surces of stats that need to be
- * combined: those we have collected (split into err_stats and
- * percpu_stats), and device error stats from netdev->get_stats()
- * (for errors that happen downstream and therefore aren't
- * reported through our vport_record_error() function).
- * Stats from first source are reported by ovs over
- * OVS_VPORT_ATTR_STATS.
- * netdev-stats can be directly read over netlink-ioctl.
- */
-
- stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors);
- stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors);
- stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped);
- stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped);
-
- stats->rx_bytes = 0;
- stats->rx_packets = 0;
- stats->tx_bytes = 0;
- stats->tx_packets = 0;
-
- for_each_possible_cpu(i) {
- const struct pcpu_sw_netstats *percpu_stats;
- struct pcpu_sw_netstats local_stats;
- unsigned int start;
-
- percpu_stats = per_cpu_ptr(vport->percpu_stats, i);
-
- do {
- start = u64_stats_fetch_begin_irq(&percpu_stats->syncp);
- local_stats = *percpu_stats;
- } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start));
-
- stats->rx_bytes += local_stats.rx_bytes;
- stats->rx_packets += local_stats.rx_packets;
- stats->tx_bytes += local_stats.tx_bytes;
- stats->tx_packets += local_stats.tx_packets;
- }
+ const struct rtnl_link_stats64 *dev_stats;
+ struct rtnl_link_stats64 temp;
+
+ dev_stats = dev_get_stats(vport->dev, &temp);
+ stats->rx_errors = dev_stats->rx_errors;
+ stats->tx_errors = dev_stats->tx_errors;
+ stats->tx_dropped = dev_stats->tx_dropped;
+ stats->rx_dropped = dev_stats->rx_dropped;
+
+ stats->rx_bytes = dev_stats->rx_bytes;
+ stats->rx_packets = dev_stats->rx_packets;
+ stats->tx_bytes = dev_stats->tx_bytes;
+ stats->tx_packets = dev_stats->tx_packets;
}
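
dev_get_stats() fills and returns the caller-supplied buffer, which is
why ovs_vport_get_stats() now keeps a struct rtnl_link_stats64 on the
stack. A minimal usage sketch:

#include <linux/netdevice.h>

static u64 example_rx_bytes(struct net_device *dev)
{
	struct rtnl_link_stats64 temp;
	const struct rtnl_link_stats64 *s;

	/* dev_get_stats() returns the buffer it filled, here &temp,
	 * so 's' is only valid within this stack frame. */
	s = dev_get_stats(dev, &temp);
	return s->rx_bytes;
}
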
/**
@@ -399,7 +405,7 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids)
old = ovsl_dereference(vport->upcall_portids);
- vport_portids = kmalloc(sizeof *vport_portids + nla_len(ids),
+ vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids),
GFP_KERNEL);
if (!vport_portids)
return -ENOMEM;
@@ -412,7 +418,6 @@ int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids)
if (old)
call_rcu(&old->rcu, vport_portids_destroy_rcu_cb);
-
return 0;
}
@@ -439,7 +444,7 @@ int ovs_vport_get_upcall_portids(const struct vport *vport,
if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS)
return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID,
- ids->n_ids * sizeof(u32), (void *) ids->ids);
+ ids->n_ids * sizeof(u32), (void *)ids->ids);
else
return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]);
}
@@ -458,6 +463,7 @@ int ovs_vport_get_upcall_portids(const struct vport *vport,
u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
{
struct vport_portids *ids;
+ u32 ids_index;
u32 hash;
ids = rcu_dereference(vport->upcall_portids);
@@ -466,7 +472,8 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
return 0;
hash = skb_get_hash(skb);
- return ids->ids[hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids)];
+ ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids);
+ return ids->ids[ids_index];
}
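
The rewritten index computation is just hash % n_ids, carried out with a
precomputed reciprocal so the fast path avoids a hardware divide. A
sketch, assuming rn was initialized once with reciprocal_value(n):

#include <linux/reciprocal_div.h>

static u32 example_pick_index(u32 hash, u32 n, struct reciprocal_value rn)
{
	/* reciprocal_divide(hash, rn) == hash / n, so this is hash % n. */
	return hash - n * reciprocal_divide(hash, rn);
}
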
/**
@@ -474,99 +481,31 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb)
*
* @vport: vport that received the packet
* @skb: skb that was received
- * @tun_info: tunnel (if any) that carried packet
+ * @tun_key: tunnel (if any) that carried packet
*
* Must be called with rcu_read_lock. The packet cannot be shared and
- * skb->data should point to the Ethernet header. The caller must have already
- * called compute_ip_summed() to initialize the checksumming fields.
+ * skb->data should point to the Ethernet header.
*/
-void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
- const struct ovs_tunnel_info *tun_info)
+int ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
+ const struct ip_tunnel_info *tun_info)
{
- struct pcpu_sw_netstats *stats;
struct sw_flow_key key;
int error;
- stats = this_cpu_ptr(vport->percpu_stats);
- u64_stats_update_begin(&stats->syncp);
- stats->rx_packets++;
- stats->rx_bytes += skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
- u64_stats_update_end(&stats->syncp);
-
- ovs_skb_init_inner_protocol(skb);
OVS_CB(skb)->input_vport = vport;
- OVS_CB(skb)->egress_tun_info = NULL;
+ ovs_skb_init_inner_protocol(skb);
+ skb_clear_ovs_gso_cb(skb);
+ /* Extract flow from 'skb' into 'key'. */
error = ovs_flow_key_extract(tun_info, skb, &key);
if (unlikely(error)) {
kfree_skb(skb);
- return;
+ return error;
}
ovs_dp_process_packet(skb, &key);
+ return 0;
}
EXPORT_SYMBOL_GPL(ovs_vport_receive);
-/**
- * ovs_vport_send - send a packet on a device
- *
- * @vport: vport on which to send the packet
- * @skb: skb to send
- *
- * Sends the given packet and returns the length of data sent. Either ovs
- * lock or rcu_read_lock must be held.
- */
-int ovs_vport_send(struct vport *vport, struct sk_buff *skb)
-{
- int sent = vport->ops->send(vport, skb);
-
- if (likely(sent > 0)) {
- struct pcpu_sw_netstats *stats;
-
- stats = this_cpu_ptr(vport->percpu_stats);
-
- u64_stats_update_begin(&stats->syncp);
- stats->tx_packets++;
- stats->tx_bytes += sent;
- u64_stats_update_end(&stats->syncp);
- } else if (sent < 0) {
- ovs_vport_record_error(vport, VPORT_E_TX_ERROR);
- } else {
- ovs_vport_record_error(vport, VPORT_E_TX_DROPPED);
- }
- return sent;
-}
-
-/**
- * ovs_vport_record_error - indicate device error to generic stats layer
- *
- * @vport: vport that encountered the error
- * @err_type: one of enum vport_err_type types to indicate the error type
- *
- * If using the vport generic stats layer indicate that an error of the given
- * type has occurred.
- */
-static void ovs_vport_record_error(struct vport *vport,
- enum vport_err_type err_type)
-{
- switch (err_type) {
- case VPORT_E_RX_DROPPED:
- atomic_long_inc(&vport->err_stats.rx_dropped);
- break;
-
- case VPORT_E_RX_ERROR:
- atomic_long_inc(&vport->err_stats.rx_errors);
- break;
-
- case VPORT_E_TX_DROPPED:
- atomic_long_inc(&vport->err_stats.tx_dropped);
- break;
-
- case VPORT_E_TX_ERROR:
- atomic_long_inc(&vport->err_stats.tx_errors);
- break;
- }
-
-}
-
static void free_vport_rcu(struct rcu_head *rcu)
{
struct vport *vport = container_of(rcu, struct vport, rcu);
@@ -583,33 +522,32 @@ void ovs_vport_deferred_free(struct vport *vport)
}
EXPORT_SYMBOL_GPL(ovs_vport_deferred_free);
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
struct net *net,
- const struct ovs_tunnel_info *tun_info,
+ struct sk_buff *skb,
u8 ipproto,
- u32 skb_mark,
__be16 tp_src,
__be16 tp_dst)
{
- const struct ovs_key_ipv4_tunnel *tun_key;
+ struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info;
+ struct ip_tunnel_info *tun_info = skb_tunnel_info(skb);
+ const struct ip_tunnel_key *tun_key;
+ u32 skb_mark = skb->mark;
struct rtable *rt;
- __be32 saddr;
+ struct flowi4 fl;
if (unlikely(!tun_info))
return -EINVAL;
+ if (ip_tunnel_info_af(tun_info) != AF_INET)
+ return -EINVAL;
+
+ tun_key = &tun_info->key;
- tun_key = &tun_info->tunnel;
- saddr = tun_key->ipv4_src;
- /* Route lookup to get srouce IP address: saddr.
+ /* Route lookup to get source IP address.
* The process may need to be changed if the corresponding process
* in vports ops changed.
*/
- rt = find_route(net,
- &saddr,
- tun_key->ipv4_dst,
- ipproto,
- tun_key->ipv4_tos,
- skb_mark);
+ rt = ovs_tunnel_route_lookup(net, tun_key, skb_mark, &fl, ipproto);
if (IS_ERR(rt))
return PTR_ERR(rt);
@@ -618,26 +556,56 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
/* Generate egress_tun_info based on tun_info,
* saddr, tp_src and tp_dst
*/
- __ovs_flow_tun_info_init(egress_tun_info,
- saddr, tun_key->ipv4_dst,
- tun_key->ipv4_tos,
- tun_key->ipv4_ttl,
- tp_src, tp_dst,
- tun_key->tun_id,
- tun_key->tun_flags,
- tun_info->options,
- tun_info->options_len);
-
+ ip_tunnel_key_init(&egress_tun_info->key,
+ fl.saddr, tun_key->u.ipv4.dst,
+ tun_key->tos,
+ tun_key->ttl,
+ tp_src, tp_dst,
+ tun_key->tun_id,
+ tun_key->tun_flags);
+ egress_tun_info->options_len = tun_info->options_len;
+ egress_tun_info->mode = tun_info->mode;
+ upcall->egress_tun_opts = ip_tunnel_info_opts(tun_info);
return 0;
}
EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info);
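/* Editor's note: an illustrative sketch, not part of this patch. A
 * UDP-based tunnel vport might implement its get_egress_tun_info op as
 * a thin wrapper around the helper above; the fixed 4789 destination
 * port and the udp_flow_src_port() source-port choice are assumptions:
 */
static int my_udp_tun_get_egress_tun_info(struct vport *vport,
					  struct sk_buff *skb,
					  struct dp_upcall_info *upcall)
{
	struct net *net = ovs_dp_get_net(vport->dp);
	__be16 src_port = udp_flow_src_port(net, skb, 0, 0, true);

	return ovs_tunnel_get_egress_info(upcall, net, skb, IPPROTO_UDP,
					  src_port, htons(4789));
}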
int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info)
+ struct dp_upcall_info *upcall)
{
/* get_egress_tun_info() is only implemented on tunnel ports. */
if (unlikely(!vport->ops->get_egress_tun_info))
return -EINVAL;
- return vport->ops->get_egress_tun_info(vport, skb, info);
+ return vport->ops->get_egress_tun_info(vport, skb, upcall);
+}
+
+static unsigned int packet_length(const struct sk_buff *skb)
+{
+ unsigned int length = skb->len - ETH_HLEN;
+
+ if (skb->protocol == htons(ETH_P_8021Q))
+ length -= VLAN_HLEN;
+
+ return length;
+}
+
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb)
+{
+ int mtu = vport->dev->mtu;
+
+ if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) {
+ net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n",
+ vport->dev->name,
+ packet_length(skb), mtu);
+ vport->dev->stats.tx_errors++;
+ goto drop;
+ }
+
+ skb->dev = vport->dev;
+ vport->ops->send(skb);
+ return;
+
+drop:
+ kfree_skb(skb);
}
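/* Editor's note: an illustrative sketch, not part of this patch. Since
 * ops->send now follows the standard netdev xmit signature and
 * ovs_vport_send() sets skb->dev before calling it, a netdev-backed
 * vport can hand packets straight to the stack; my_netdev_send is
 * hypothetical and the remaining ops fields are elided:
 */
static netdev_tx_t my_netdev_send(struct sk_buff *skb)
{
	/* skb->dev was set by ovs_vport_send() before we are called. */
	return dev_queue_xmit(skb);
}

static struct vport_ops my_netdev_vport_ops = {
	.type = OVS_VPORT_TYPE_NETDEV,
	.send = my_netdev_send,
};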
diff --git a/datapath/vport.h b/datapath/vport.h
index b217b8862..d82071970 100644
--- a/datapath/vport.h
+++ b/datapath/vport.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007-2012 Nicira, Inc.
+ * Copyright (c) 2007-2015 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -27,14 +27,14 @@
#include <linux/skbuff.h>
#include <linux/spinlock.h>
#include <linux/u64_stats_sync.h>
+#include <net/route.h>
+
+#include "datapath.h"
struct vport;
struct vport_parms;
-/* The following definitions are for users of the vport subsytem: */
+/* The following definitions are for users of the vport subsystem: */
-struct vport_net {
- struct vport __rcu *gre_vport;
-};
int ovs_vport_init(void);
void ovs_vport_exit(void);
@@ -53,25 +53,15 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids);
int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *);
u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *);
-int ovs_vport_send(struct vport *, struct sk_buff *);
-
-int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info,
+int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall,
struct net *net,
- const struct ovs_tunnel_info *tun_info,
+ struct sk_buff *,
u8 ipproto,
- u32 skb_mark,
__be16 tp_src,
__be16 tp_dst);
+
int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb,
- struct ovs_tunnel_info *info);
-
-/* The following definitions are for implementers of vport devices: */
-struct vport_err_stats {
- atomic_long_t rx_dropped;
- atomic_long_t rx_errors;
- atomic_long_t tx_dropped;
- atomic_long_t tx_errors;
-};
+ struct dp_upcall_info *upcall);
/**
* struct vport_portids - array of netlink portids of a vport.
@@ -98,12 +88,10 @@ struct vport_portids {
* @hash_node: Element in @dev_table hash table in vport.c.
* @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure.
- * @percpu_stats: Points to per-CPU statistics used and maintained by vport
- * @err_stats: Points to error statistics used and maintained by vport
* @detach_list: list used for detaching vport in net-exit call.
*/
struct vport {
- struct rcu_head rcu;
+ struct net_device *dev;
struct datapath *dp;
struct vport_portids __rcu *upcall_portids;
u16 port_no;
@@ -112,10 +100,8 @@ struct vport {
struct hlist_node dp_hash_node;
const struct vport_ops *ops;
- struct pcpu_sw_netstats __percpu *percpu_stats;
-
- struct vport_err_stats err_stats;
struct list_head detach_list;
+ struct rcu_head rcu;
};
/**
@@ -152,8 +138,7 @@ struct vport_parms {
* @get_options: Appends vport-specific attributes for the configuration of an
* existing vport to a &struct sk_buff. May be %NULL for a vport that does not
* have any configuration.
- * @get_name: Get the device's name.
- * @send: Send a packet on the device. Returns the length of the packet sent,
- * zero for dropped packets or negative for error.
+ * @send: Send a packet on the device.
* @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for
* a packet.
@@ -168,24 +153,14 @@ struct vport_ops {
int (*set_options)(struct vport *, struct nlattr *);
int (*get_options)(const struct vport *, struct sk_buff *);
- /* Called with rcu_read_lock or ovs_mutex. */
- const char *(*get_name)(const struct vport *);
-
- int (*send)(struct vport *, struct sk_buff *);
int (*get_egress_tun_info)(struct vport *, struct sk_buff *,
- struct ovs_tunnel_info *);
+ struct dp_upcall_info *upcall);
+ netdev_tx_t (*send)(struct sk_buff *skb);
struct module *owner;
struct list_head list;
};
-enum vport_err_type {
- VPORT_E_RX_DROPPED,
- VPORT_E_RX_ERROR,
- VPORT_E_TX_DROPPED,
- VPORT_E_TX_ERROR,
-};
-
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
@@ -222,8 +197,8 @@ static inline struct vport *vport_from_priv(void *priv)
return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}
-void ovs_vport_receive(struct vport *, struct sk_buff *,
- const struct ovs_tunnel_info *);
+int ovs_vport_receive(struct vport *, struct sk_buff *,
+ const struct ip_tunnel_info *);
static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
@@ -232,6 +207,32 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
}
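/* Editor's note: an illustrative sketch, not part of this patch. Any
 * helper that pushes bytes onto a CHECKSUM_COMPLETE skb must fold those
 * bytes back into skb->csum, which is what ovs_skb_postpush_rcsum()
 * does; my_push_eth_header is hypothetical:
 */
static void my_push_eth_header(struct sk_buff *skb)
{
	skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
}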
+static inline const char *ovs_vport_name(struct vport *vport)
+{
+ return vport->dev->name;
+}
+
int ovs_vport_ops_register(struct vport_ops *ops);
void ovs_vport_ops_unregister(struct vport_ops *ops);
+
+static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
+ const struct ip_tunnel_key *key,
+ u32 mark,
+ struct flowi4 *fl,
+ u8 protocol)
+{
+ struct rtable *rt;
+
+ memset(fl, 0, sizeof(*fl));
+ fl->daddr = key->u.ipv4.dst;
+ fl->saddr = key->u.ipv4.src;
+ fl->flowi4_tos = RT_TOS(key->tos);
+ fl->flowi4_mark = mark;
+ fl->flowi4_proto = protocol;
+
+ rt = ip_route_output_key(net, fl);
+ return rt;
+}
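/* Editor's note: an illustrative sketch, not part of this patch. A
 * typical caller resolves the route, reads the chosen source address
 * out of the flowi4, and releases the route when done; IPPROTO_GRE here
 * is just an example protocol:
 */
static int my_check_egress_route(struct net *net,
				 const struct ip_tunnel_key *key, u32 mark)
{
	struct flowi4 fl;
	struct rtable *rt;

	rt = ovs_tunnel_route_lookup(net, key, mark, &fl, IPPROTO_GRE);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* fl.saddr now holds the source address chosen by the lookup. */
	ip_rt_put(rt);
	return 0;
}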
+
+void ovs_vport_send(struct vport *vport, struct sk_buff *skb);
#endif /* vport.h */