summaryrefslogtreecommitdiff
path: root/datapath-windows
diff options
context:
space:
mode:
authorSairam Venugopal <vsairam@vmware.com>2016-04-13 11:54:03 -0700
committerBen Pfaff <blp@ovn.org>2016-04-14 10:11:46 -0700
commit792d377d8330606ad122dae5f941e5088c10cf14 (patch)
tree233132b3b7fc3b4fb7f0c35a71d51b53d4a364b5 /datapath-windows
parentce05810425872709ddda9d7755b3348f584849ff (diff)
downloadopenvswitch-792d377d8330606ad122dae5f941e5088c10cf14.tar.gz
datapath-windows: Add Connection Tracking Support
Enable support for Stateful Firewall in Hyper-V by adding a Connection Tracking module. The module has been ported over from the userspace implementation patch of a similar name. The current version of the module supports ct - zone, mark and label for TCP packets. Support for other packet formats will be added in subsequent patches. The conntrack-tcp module is adapted from FreeBSD's pf subsystem and hence the BSD license. It has been ported over to match OVS Hyper-V coding style. Signed-off-by: Sairam Venugopal <vsairam@vmware.com> Signed-off-by: Daniele Di Proietto <diproiettod@vmware.com> Co-Authored-by: Daniele Di Proietto <diproiettod@vmware.com> Acked-by: Nithin Raju <nithin@vmware.com> Signed-off-by: Ben Pfaff <blp@ovn.org>
Diffstat (limited to 'datapath-windows')
-rw-r--r--datapath-windows/automake.mk3
-rw-r--r--datapath-windows/ovsext/Actions.c23
-rw-r--r--datapath-windows/ovsext/Conntrack-tcp.c532
-rw-r--r--datapath-windows/ovsext/Conntrack.c530
-rw-r--r--datapath-windows/ovsext/Conntrack.h102
-rw-r--r--datapath-windows/ovsext/Debug.h1
-rw-r--r--datapath-windows/ovsext/DpInternal.h7
-rw-r--r--datapath-windows/ovsext/Flow.c128
-rw-r--r--datapath-windows/ovsext/Switch.c10
-rw-r--r--datapath-windows/ovsext/Types.h6
-rw-r--r--datapath-windows/ovsext/Util.h3
-rw-r--r--datapath-windows/ovsext/ovsext.vcxproj3
12 files changed, 1344 insertions, 4 deletions
diff --git a/datapath-windows/automake.mk b/datapath-windows/automake.mk
index 04fc97f07..c9af8064d 100644
--- a/datapath-windows/automake.mk
+++ b/datapath-windows/automake.mk
@@ -13,6 +13,9 @@ EXTRA_DIST += \
datapath-windows/ovsext/Atomic.h \
datapath-windows/ovsext/BufferMgmt.c \
datapath-windows/ovsext/BufferMgmt.h \
+ datapath-windows/ovsext/Conntrack-tcp.c \
+ datapath-windows/ovsext/Conntrack.c \
+ datapath-windows/ovsext/Conntrack.h \
datapath-windows/ovsext/Datapath.c \
datapath-windows/ovsext/Datapath.h \
datapath-windows/ovsext/Debug.c \
diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c
index 3e5dac902..cf54ae20a 100644
--- a/datapath-windows/ovsext/Actions.c
+++ b/datapath-windows/ovsext/Actions.c
@@ -17,6 +17,7 @@
#include "precomp.h"
#include "Actions.h"
+#include "Conntrack.h"
#include "Debug.h"
#include "Event.h"
#include "Flow.h"
@@ -1786,6 +1787,28 @@ OvsDoExecuteActions(POVS_SWITCH_CONTEXT switchContext,
break;
}
+ case OVS_ACTION_ATTR_CT:
+ {
+ if (ovsFwdCtx.destPortsSizeOut > 0
+ || ovsFwdCtx.tunnelTxNic != NULL
+ || ovsFwdCtx.tunnelRxNic != NULL) {
+ status = OvsOutputBeforeSetAction(&ovsFwdCtx);
+ if (status != NDIS_STATUS_SUCCESS) {
+ dropReason = L"OVS-adding destination failed";
+ goto dropit;
+ }
+ }
+
+ status = OvsExecuteConntrackAction(ovsFwdCtx.curNbl, layers,
+ key, (const PNL_ATTR)a);
+ if (status != NDIS_STATUS_SUCCESS) {
+ OVS_LOG_ERROR("CT Action failed");
+ dropReason = L"OVS-conntrack action failed";
+ goto dropit;
+ }
+ break;
+ }
+
case OVS_ACTION_ATTR_RECIRC:
{
if (ovsFwdCtx.destPortsSizeOut > 0 || ovsFwdCtx.tunnelTxNic != NULL
diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c
new file mode 100644
index 000000000..3e25ba567
--- /dev/null
+++ b/datapath-windows/ovsext/Conntrack-tcp.c
@@ -0,0 +1,532 @@
+/*-
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2008 Henning Brauer
+ * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2015, 2016 VMware, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
+ */
+
+#include "Conntrack.h"
+#include <stddef.h>
+
+struct tcp_peer {
+ enum ct_dpif_tcp_state state;
+ uint32_t seqlo; /* Max sequence number sent */
+ uint32_t seqhi; /* Max the other end ACKd + win */
+ uint16_t max_win;/* largest window (pre scaling) */
+ uint8_t wscale; /* window scaling factor */
+};
+
+struct conn_tcp {
+ struct OVS_CT_ENTRY up;
+ struct tcp_peer peer[2];
+};
+
+enum {
+ TCPOPT_EOL,
+ TCPOPT_NOP,
+ TCPOPT_WINDOW = 3,
+};
+
+/* Given POINTER, the address of the given MEMBER in a STRUCT object, returns
+ the STRUCT object. */
+#define CONTAINER_OF(POINTER, STRUCT, MEMBER) \
+ ((STRUCT *) (void *) ((char *) (POINTER) - \
+ offsetof (STRUCT, MEMBER)))
+
+
+/* TCP sequence numbers are 32 bit integers operated
+ * on with modular arithmetic. These macros can be
+ * used to compare such integers. */
+#define SEQ_LT(a,b) ((int)((a)-(b)) < 0)
+#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0)
+#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
+#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
+
+#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
+#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
+
+#define TCP_FIN 0x001
+#define TCP_SYN 0x002
+#define TCP_RST 0x004
+#define TCP_PSH 0x008
+#define TCP_ACK 0x010
+#define TCP_URG 0x020
+#define TCP_ECE 0x040
+#define TCP_CWR 0x080
+#define TCP_NS 0x100
+
+#define CT_DPIF_TCP_FLAGS \
+ CT_DPIF_TCP_FLAG(WINDOW_SCALE) \
+ CT_DPIF_TCP_FLAG(SACK_PERM) \
+ CT_DPIF_TCP_FLAG(CLOSE_INIT) \
+ CT_DPIF_TCP_FLAG(BE_LIBERAL) \
+ CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED) \
+ CT_DPIF_TCP_FLAG(MAXACK_SET) \
+
+enum ct_dpif_tcp_flags_count_ {
+#define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_,
+ CT_DPIF_TCP_FLAGS
+#undef CT_DPIF_TCP_FLAG
+};
+
+enum ct_dpif_tcp_flags {
+#define CT_DPIF_TCP_FLAG(FLAG) CT_DPIF_TCPF_##FLAG = (1 << \
+ FLAG##_COUNT_),
+ CT_DPIF_TCP_FLAGS
+#undef CT_DPIF_TCP_FLAG
+};
+
+
+#define CT_DPIF_TCP_STATES \
+ CT_DPIF_TCP_STATE(CLOSED) \
+ CT_DPIF_TCP_STATE(LISTEN) \
+ CT_DPIF_TCP_STATE(SYN_SENT) \
+ CT_DPIF_TCP_STATE(SYN_RECV) \
+ CT_DPIF_TCP_STATE(ESTABLISHED) \
+ CT_DPIF_TCP_STATE(CLOSE_WAIT) \
+ CT_DPIF_TCP_STATE(FIN_WAIT_1) \
+ CT_DPIF_TCP_STATE(CLOSING) \
+ CT_DPIF_TCP_STATE(LAST_ACK) \
+ CT_DPIF_TCP_STATE(FIN_WAIT_2) \
+ CT_DPIF_TCP_STATE(TIME_WAIT)
+
+enum ct_dpif_tcp_state {
+#define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE,
+ CT_DPIF_TCP_STATES
+#undef CT_DPIF_TCP_STATE
+};
+
+#define TCP_MAX_WSCALE 14
+#define CT_WSCALE_FLAG 0x80
+#define CT_WSCALE_UNKNOWN 0x40
+#define CT_WSCALE_MASK 0xf
+
+/* pf does this in in pf_normalize_tcp(), and it is called only if scrub
+ * is enabled. We're not scrubbing, but this check seems reasonable. */
+static __inline BOOLEAN
+OvsConntrackValidateTcpFlags(const TCPHdr *tcp)
+{
+ if (tcp->syn) {
+ if (tcp->rst) {
+ return TRUE;
+ }
+ if (tcp->fin) {
+ /* Here pf removes the fin flag. We simply mark the packet as
+ * invalid */
+ return TRUE;
+ }
+ } else {
+ /* Illegal packet */
+ if (!(tcp->ack || tcp->rst)) {
+ return TRUE;
+ }
+ }
+
+ if (!(tcp->ack)) {
+ /* These flags are only valid if ACK is set */
+ if ((tcp->fin) || (tcp->psh) || (tcp->urg)) {
+ return TRUE;
+ }
+ }
+
+ return FALSE;
+}
+
+static __inline uint8_t
+OvsTcpGetWscale(const TCPHdr *tcp)
+{
+ unsigned len = tcp->doff * 4 - sizeof *tcp;
+ const uint8_t *opt = (const uint8_t *)(tcp + 1);
+ uint8_t wscale = 0;
+ uint8_t optlen;
+
+ while (len >= 3) {
+ if (*opt == TCPOPT_EOL) {
+ break;
+ }
+ switch (*opt) {
+ case TCPOPT_NOP:
+ opt++;
+ len--;
+ break;
+ case TCPOPT_WINDOW:
+ wscale = MIN(opt[2], TCP_MAX_WSCALE);
+ wscale |= CT_WSCALE_FLAG;
+ /* fall through */
+ default:
+ optlen = opt[2];
+ if (optlen < 2) {
+ optlen = 2;
+ }
+ len -= optlen;
+ opt += optlen;
+ }
+ }
+
+ return wscale;
+}
+
+static __inline uint32_t
+OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl)
+{
+ IPHdr *ipHdr;
+ char *ipBuf[sizeof(IPHdr)];
+ PNET_BUFFER curNb;
+ curNb = NET_BUFFER_LIST_FIRST_NB(nbl);
+ ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf,
+ 1 /*no align*/, 0);
+ TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4);
+ return (UINT16)ntohs(ipHdr->tot_len)
+ - (ipHdr->ihl * 4)
+ - (sizeof * tcp);
+}
+
+static __inline void
+OvsConntrackUpdateExpiration(struct conn_tcp *conn,
+ long long now,
+ long long interval)
+{
+ conn->up.expiration = now + interval;
+}
+
+static __inline struct conn_tcp*
+OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn)
+{
+ return CONTAINER_OF(conn, struct conn_tcp, up);
+}
+
+enum CT_UPDATE_RES
+OvsConntrackUpdateTcpEntry(struct OVS_CT_ENTRY* conn_,
+ const TCPHdr *tcp,
+ PNET_BUFFER_LIST nbl,
+ BOOLEAN reply,
+ UINT64 now)
+{
+ struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_);
+ /* The peer that sent 'pkt' */
+ struct tcp_peer *src = &conn->peer[reply ? 1 : 0];
+ /* The peer that should receive 'pkt' */
+ struct tcp_peer *dst = &conn->peer[reply ? 0 : 1];
+ uint8_t sws = 0, dws = 0;
+ uint16_t win = ntohs(tcp->window);
+ uint32_t ack, end, seq, orig_seq;
+ uint32_t p_len = OvsGetTcpPayloadLength(nbl);
+ int ackskew;
+
+ if (OvsConntrackValidateTcpFlags(tcp)) {
+ return CT_UPDATE_INVALID;
+ }
+
+ if ((tcp->syn) && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 &&
+ src->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+ src->state = dst->state = CT_DPIF_TCPS_CLOSED;
+ return CT_UPDATE_NEW;
+ }
+
+ if (src->wscale & CT_WSCALE_FLAG
+ && dst->wscale & CT_WSCALE_FLAG
+ && !(tcp->syn)) {
+
+ sws = src->wscale & CT_WSCALE_MASK;
+ dws = dst->wscale & CT_WSCALE_MASK;
+
+ } else if (src->wscale & CT_WSCALE_UNKNOWN
+ && dst->wscale & CT_WSCALE_UNKNOWN
+ && !(tcp->syn)) {
+
+ sws = TCP_MAX_WSCALE;
+ dws = TCP_MAX_WSCALE;
+ }
+
+ /*
+ * Sequence tracking algorithm from Guido van Rooij's paper:
+ * http://www.madison-gurkha.com/publications/tcp_filtering/
+ * tcp_filtering.ps
+ */
+
+ orig_seq = seq = ntohl(tcp->seq);
+ if (src->state < CT_DPIF_TCPS_SYN_SENT) {
+ /* First packet from this end. Set its state */
+
+ ack = ntohl(tcp->ack);
+
+ end = seq + p_len;
+ if (tcp->syn) {
+ end++;
+ if (dst->wscale & CT_WSCALE_FLAG) {
+ src->wscale = OvsTcpGetWscale(tcp);
+ if (src->wscale & CT_WSCALE_FLAG) {
+ /* Remove scale factor from initial window */
+ sws = src->wscale & CT_WSCALE_MASK;
+ win = DIV_ROUND_UP((uint32_t) win, 1 << sws);
+ dws = dst->wscale & CT_WSCALE_MASK;
+ } else {
+ /* fixup other window */
+ dst->max_win <<= dst->wscale &
+ CT_WSCALE_MASK;
+ /* in case of a retrans SYN|ACK */
+ dst->wscale = 0;
+ }
+ }
+ }
+ if (tcp->fin) {
+ end++;
+ }
+
+ src->seqlo = seq;
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ /*
+ * May need to slide the window (seqhi may have been set by
+ * the crappy stack check or if we picked up the connection
+ * after establishment)
+ */
+ if (src->seqhi == 1 ||
+ SEQ_GEQ(end + MAX(1, dst->max_win << dws),
+ src->seqhi)) {
+ src->seqhi = end + MAX(1, dst->max_win << dws);
+ }
+ if (win > src->max_win) {
+ src->max_win = win;
+ }
+
+ } else {
+ ack = ntohl(tcp->ack);
+ end = seq + p_len;
+ if (tcp->syn) {
+ end++;
+ }
+ if (tcp->fin) {
+ end++;
+ }
+ }
+
+ if ((tcp->ack) == 0) {
+ /* Let it pass through the ack skew check */
+ ack = dst->seqlo;
+ } else if ((ack == 0
+ && (tcp->ack && tcp->rst) == (TCP_ACK|TCP_RST))
+ /* broken tcp stacks do not set ack */) {
+ /* Many stacks (ours included) will set the ACK number in an
+ * FIN|ACK if the SYN times out -- no sequence to ACK. */
+ ack = dst->seqlo;
+ }
+
+ if (seq == end) {
+ /* Ease sequencing restrictions on no data packets */
+ seq = src->seqlo;
+ end = seq;
+ }
+
+ ackskew = dst->seqlo - ack;
+#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */
+ if (SEQ_GEQ(src->seqhi, end)
+ /* Last octet inside other's window space */
+ && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws))
+ /* Retrans: not more than one window back */
+ && (ackskew >= -MAXACKWINDOW)
+ /* Acking not more than one reassembled fragment backwards */
+ && (ackskew <= (MAXACKWINDOW << sws))
+ /* Acking not more than one window forward */
+ && ((tcp->rst) == 0 || orig_seq == src->seqlo
+ || (orig_seq == src->seqlo + 1)
+ || (orig_seq + 1 == src->seqlo))) {
+ /* Require an exact/+1 sequence match on resets when possible */
+
+ /* update max window */
+ if (src->max_win < win) {
+ src->max_win = win;
+ }
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo)) {
+ src->seqlo = end;
+ }
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+ dst->seqhi = ack + MAX((win << sws), 1);
+ }
+
+ /* update states */
+ if (tcp->syn && src->state < CT_DPIF_TCPS_SYN_SENT) {
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ }
+ if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) {
+ src->state = CT_DPIF_TCPS_CLOSING;
+ }
+ if (tcp->ack) {
+ if (dst->state == CT_DPIF_TCPS_SYN_SENT) {
+ dst->state = CT_DPIF_TCPS_ESTABLISHED;
+ } else if (dst->state == CT_DPIF_TCPS_CLOSING) {
+ dst->state = CT_DPIF_TCPS_FIN_WAIT_2;
+ }
+ }
+ if (tcp->rst) {
+ src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+ }
+
+ if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) {
+ OvsConntrackUpdateExpiration(conn, now, 30 * 10000000LL);
+ } else if (src->state >= CT_DPIF_TCPS_CLOSING
+ && dst->state >= CT_DPIF_TCPS_CLOSING) {
+ OvsConntrackUpdateExpiration(conn, now, 45 * 10000000LL);
+ } else if (src->state < CT_DPIF_TCPS_ESTABLISHED
+ || dst->state < CT_DPIF_TCPS_ESTABLISHED) {
+ OvsConntrackUpdateExpiration(conn, now, 30 * 10000000LL);
+ } else if (src->state >= CT_DPIF_TCPS_CLOSING
+ || dst->state >= CT_DPIF_TCPS_CLOSING) {
+ OvsConntrackUpdateExpiration(conn, now, 15 * 60 * 10000000LL);
+ } else {
+ OvsConntrackUpdateExpiration(conn, now, 24 * 60 * 60 * 10000000LL);
+ }
+ } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT
+ || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2
+ || src->state >= CT_DPIF_TCPS_FIN_WAIT_2)
+ && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end)
+ /* Within a window forward of the originating packet */
+ && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
+ /* Within a window backward of the originating packet */
+
+ /*
+ * This currently handles three situations:
+ * 1) Stupid stacks will shotgun SYNs before their peer
+ * replies.
+ * 2) When PF catches an already established stream (the
+ * firewall rebooted, the state table was flushed, routes
+ * changed...)
+ * 3) Packets get funky immediately after the connection
+ * closes (this should catch Solaris spurious ACK|FINs
+ * that web servers like to spew after a close)
+ *
+ * This must be a little more careful than the above code
+ * since packet floods will also be caught here. We don't
+ * update the TTL here to mitigate the damage of a packet
+ * flood and so the same code can handle awkward establishment
+ * and a loosened connection close.
+ * In the establishment case, a correct peer response will
+ * validate the connection, go through the normal state code
+ * and keep updating the state TTL.
+ */
+
+ /* update max window */
+ if (src->max_win < win) {
+ src->max_win = win;
+ }
+ /* synchronize sequencing */
+ if (SEQ_GT(end, src->seqlo)) {
+ src->seqlo = end;
+ }
+ /* slide the window of what the other end can send */
+ if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) {
+ dst->seqhi = ack + MAX((win << sws), 1);
+ }
+
+ /*
+ * Cannot set dst->seqhi here since this could be a shotgunned
+ * SYN and not an already established connection.
+ */
+
+ if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) {
+ src->state = CT_DPIF_TCPS_CLOSING;
+ }
+
+ if (tcp->rst) {
+ src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT;
+ }
+ } else {
+ return CT_UPDATE_INVALID;
+ }
+
+ return CT_UPDATE_VALID;
+}
+
+BOOLEAN
+OvsConntrackValidateTcpPacket(const TCPHdr *tcp)
+{
+ if (tcp == NULL || OvsConntrackValidateTcpFlags(tcp)) {
+ return FALSE;
+ }
+
+ /* A syn+ack is not allowed to create a connection. We want to allow
+ * totally new connections (syn) or already established, not partially
+ * open (syn+ack). */
+ if ((tcp->syn) && (tcp->ack)) {
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+OVS_CT_ENTRY *
+OvsNewTcpConntrack(const TCPHdr *tcp,
+ PNET_BUFFER_LIST nbl,
+ UINT64 now)
+{
+ struct conn_tcp* newconn = NULL;
+ struct tcp_peer *src, *dst;
+
+ newconn = OvsAllocateMemoryWithTag(sizeof(struct conn_tcp),
+ OVS_CT_POOL_TAG);
+ newconn->up = (OVS_CT_ENTRY) {0};
+ src = &newconn->peer[0];
+ dst = &newconn->peer[1];
+
+ src->seqlo = ntohl(tcp->seq);
+ src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1;
+
+ if (tcp->syn) {
+ src->seqhi++;
+ src->wscale = OvsTcpGetWscale(tcp);
+ } else {
+ src->wscale = CT_WSCALE_UNKNOWN;
+ dst->wscale = CT_WSCALE_UNKNOWN;
+ }
+ src->max_win = MAX(ntohs(tcp->window), 1);
+ if (src->wscale & CT_WSCALE_MASK) {
+ /* Remove scale factor from initial window */
+ uint8_t sws = src->wscale & CT_WSCALE_MASK;
+ src->max_win = DIV_ROUND_UP((uint32_t) src->max_win,
+ 1 << sws);
+ }
+ if (tcp->fin) {
+ src->seqhi++;
+ }
+ dst->seqhi = 1;
+ dst->max_win = 1;
+ src->state = CT_DPIF_TCPS_SYN_SENT;
+ dst->state = CT_DPIF_TCPS_CLOSED;
+
+ OvsConntrackUpdateExpiration(newconn, now, CT_ENTRY_TIMEOUT);
+
+ return &newconn->up;
+} \ No newline at end of file
diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c
new file mode 100644
index 000000000..fbeb70c33
--- /dev/null
+++ b/datapath-windows/ovsext/Conntrack.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2015, 2016 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef OVS_DBG_MOD
+#undef OVS_DBG_MOD
+#endif
+#define OVS_DBG_MOD OVS_DBG_CONTRK
+
+#include "Conntrack.h"
+#include "Jhash.h"
+#include "PacketParser.h"
+#include "Debug.h"
+
+typedef struct _OVS_CT_THREAD_CTX {
+ KEVENT event;
+ PVOID threadObject;
+ UINT32 exit;
+} OVS_CT_THREAD_CTX, *POVS_CT_THREAD_CTX;
+
+KSTART_ROUTINE ovsConntrackEntryCleaner;
+static PLIST_ENTRY ovsConntrackTable;
+static OVS_CT_THREAD_CTX ctThreadCtx;
+static PNDIS_RW_LOCK_EX ovsConntrackLockObj;
+
+/*
+ *----------------------------------------------------------------------------
+ * OvsInitConntrack
+ * Initialize the components used by Connection Tracking
+ *----------------------------------------------------------------------------
+ */
+NTSTATUS
+OvsInitConntrack(POVS_SWITCH_CONTEXT context)
+{
+ NTSTATUS status;
+ HANDLE threadHandle = NULL;
+
+ /* Init the sync-lock */
+ ovsConntrackLockObj = NdisAllocateRWLock(context->NdisFilterHandle);
+ if (ovsConntrackLockObj == NULL) {
+ return STATUS_INSUFFICIENT_RESOURCES;
+ }
+
+ /* Init the Hash Buffer */
+ ovsConntrackTable = OvsAllocateMemoryWithTag(sizeof(LIST_ENTRY)
+ * CT_HASH_TABLE_SIZE,
+ OVS_CT_POOL_TAG);
+ if (ovsConntrackTable == NULL) {
+ NdisFreeRWLock(ovsConntrackLockObj);
+ ovsConntrackLockObj = NULL;
+ return STATUS_INSUFFICIENT_RESOURCES;
+ }
+
+ for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) {
+ InitializeListHead(&ovsConntrackTable[i]);
+ }
+
+ /* Init CT Cleaner Thread */
+ KeInitializeEvent(&ctThreadCtx.event, NotificationEvent, FALSE);
+ status = PsCreateSystemThread(&threadHandle, SYNCHRONIZE, NULL, NULL,
+ NULL, ovsConntrackEntryCleaner,
+ &ctThreadCtx);
+
+ if (status != STATUS_SUCCESS) {
+ NdisFreeRWLock(ovsConntrackLockObj);
+ ovsConntrackLockObj = NULL;
+
+ OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG);
+ ovsConntrackTable = NULL;
+
+ return status;
+ }
+
+ ObReferenceObjectByHandle(threadHandle, SYNCHRONIZE, NULL, KernelMode,
+ &ctThreadCtx.threadObject, NULL);
+ ZwClose(threadHandle);
+ threadHandle = NULL;
+ return STATUS_SUCCESS;
+}
+
+/*
+ *----------------------------------------------------------------------------
+ * OvsCleanupConntrack
+ * Cleanup memory and thread that were spawned for Connection tracking
+ *----------------------------------------------------------------------------
+ */
+VOID
+OvsCleanupConntrack(VOID)
+{
+ LOCK_STATE_EX lockState;
+ NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0);
+ ctThreadCtx.exit = 1;
+ KeSetEvent(&ctThreadCtx.event, 0, FALSE);
+ NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
+
+ KeWaitForSingleObject(ctThreadCtx.threadObject, Executive,
+ KernelMode, FALSE, NULL);
+ ObDereferenceObject(ctThreadCtx.threadObject);
+
+ if (ovsConntrackTable) {
+ OvsFreeMemoryWithTag(ovsConntrackTable, OVS_CT_POOL_TAG);
+ ovsConntrackTable = NULL;
+ }
+
+ NdisFreeRWLock(ovsConntrackLockObj);
+ ovsConntrackLockObj = NULL;
+}
+
+static __inline VOID
+OvsCtKeyReverse(OVS_CT_KEY *key)
+{
+ struct ct_endpoint tmp;
+ tmp = key->src;
+ key->src = key->dst;
+ key->dst = tmp;
+}
+
+static __inline VOID
+OvsCtUpdateFlowKey(struct OvsFlowKey *key,
+ UINT32 state,
+ UINT16 zone,
+ UINT32 mark,
+ struct ovs_key_ct_labels *labels)
+{
+ key->ct.state = state | OVS_CS_F_TRACKED;
+ key->ct.zone = zone;
+ key->ct.mark = mark;
+ if (labels) {
+ NdisMoveMemory(&key->ct.labels, labels,
+ sizeof(struct ovs_key_ct_labels));
+ } else {
+ memset(&key->ct.labels, 0,
+ sizeof(struct ovs_key_ct_labels));
+ }
+}
+
+static __inline POVS_CT_ENTRY
+OvsCtEntryCreate(const TCPHdr *tcp,
+ PNET_BUFFER_LIST curNbl,
+ OvsConntrackKeyLookupCtx *ctx,
+ OvsFlowKey *key,
+ BOOLEAN commit,
+ UINT64 currentTime)
+{
+ POVS_CT_ENTRY entry = NULL;
+ UINT32 state = 0;
+ if (!OvsConntrackValidateTcpPacket(tcp)) {
+ state |= OVS_CS_F_INVALID;
+ OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL);
+ return entry;
+ }
+
+ state |= OVS_CS_F_NEW;
+ if (commit) {
+ entry = OvsNewTcpConntrack(tcp, curNbl, currentTime);
+ NdisMoveMemory(&entry->key, &ctx->key, sizeof (OVS_CT_KEY));
+ NdisMoveMemory(&entry->rev_key, &ctx->key, sizeof (OVS_CT_KEY));
+ OvsCtKeyReverse(&entry->rev_key);
+ InsertHeadList(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK],
+ &entry->link);
+ }
+
+ OvsCtUpdateFlowKey(key, state, ctx->key.zone, 0, NULL);
+ return entry;
+}
+
+static __inline VOID
+OvsCtEntryDelete(POVS_CT_ENTRY entry)
+{
+ RemoveEntryList(&entry->link);
+ OvsFreeMemoryWithTag(entry, OVS_CT_POOL_TAG);
+}
+
+static __inline BOOLEAN
+OvsCtEntryExpired(POVS_CT_ENTRY entry)
+{
+ if (entry == NULL)
+ return TRUE;
+
+ UINT64 currentTime;
+ NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
+ return entry->expiration < currentTime;
+}
+
+static __inline NDIS_STATUS
+OvsDetectCtPacket(OvsFlowKey *key)
+{
+ /* Currently we support only Unfragmented TCP packets */
+ switch (ntohs(key->l2.dlType)) {
+ case ETH_TYPE_IPV4:
+ if (key->ipKey.nwFrag != OVS_FRAG_TYPE_NONE) {
+ return NDIS_STATUS_NOT_SUPPORTED;
+ }
+ if (key->ipKey.nwProto != IPPROTO_TCP) {
+ return NDIS_STATUS_NOT_SUPPORTED;
+ }
+ return NDIS_STATUS_SUCCESS;
+ case ETH_TYPE_IPV6:
+ return NDIS_STATUS_NOT_SUPPORTED;
+ }
+
+ return NDIS_STATUS_NOT_SUPPORTED;
+}
+
+static __inline BOOLEAN
+OvsCtKeyAreSame(OVS_CT_KEY ctxKey, OVS_CT_KEY entryKey)
+{
+ return ((ctxKey.src.addr.ipv4 == entryKey.src.addr.ipv4) &&
+ (ctxKey.src.addr.ipv4_aligned == entryKey.src.addr.ipv4_aligned) &&
+ (ctxKey.src.port == entryKey.src.port) &&
+ (ctxKey.dst.addr.ipv4 == entryKey.dst.addr.ipv4) &&
+ (ctxKey.dst.addr.ipv4_aligned == entryKey.dst.addr.ipv4_aligned) &&
+ (ctxKey.dst.port == entryKey.dst.port) &&
+ (ctxKey.dl_type == entryKey.dl_type) &&
+ (ctxKey.nw_proto == entryKey.nw_proto) &&
+ (ctxKey.zone == entryKey.zone));
+}
+
+static __inline POVS_CT_ENTRY
+OvsCtLookup(OvsConntrackKeyLookupCtx *ctx)
+{
+ PLIST_ENTRY link;
+ POVS_CT_ENTRY entry;
+ BOOLEAN reply = FALSE;
+ POVS_CT_ENTRY found = NULL;
+
+ LIST_FORALL(&ovsConntrackTable[ctx->hash & CT_HASH_TABLE_MASK], link) {
+ entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link);
+
+ if (OvsCtKeyAreSame(ctx->key,entry->key)) {
+ found = entry;
+ reply = FALSE;
+ break;
+ }
+
+ if (OvsCtKeyAreSame(ctx->key,entry->rev_key)) {
+ found = entry;
+ reply = TRUE;
+ break;
+ }
+ }
+
+ if (found) {
+ if (OvsCtEntryExpired(found)) {
+ found = NULL;
+ } else {
+ ctx->reply = reply;
+ }
+ }
+
+ ctx->entry = found;
+ return found;
+}
+
+static __inline VOID
+OvsCtSetupLookupCtx(OvsFlowKey *flowKey,
+ UINT16 zone,
+ OvsConntrackKeyLookupCtx *ctx)
+{
+ UINT32 hsrc, hdst,hash;
+
+ ctx->key.zone = zone;
+ ctx->key.dl_type = flowKey->l2.dlType;
+
+ if (flowKey->l2.dlType == htons(ETH_TYPE_IPV4)) {
+ ctx->key.src.addr.ipv4 = flowKey->ipKey.nwSrc;
+ ctx->key.dst.addr.ipv4 = flowKey->ipKey.nwDst;
+ ctx->key.nw_proto = flowKey->ipKey.nwProto;
+
+ ctx->key.src.port = flowKey->ipKey.l4.tpSrc;
+ ctx->key.dst.port = flowKey->ipKey.l4.tpDst;
+ } else if (flowKey->l2.dlType == htons(ETH_TYPE_IPV6)) {
+ ctx->key.src.addr.ipv6 = flowKey->ipv6Key.ipv6Src;
+ ctx->key.dst.addr.ipv6 = flowKey->ipv6Key.ipv6Dst;
+ ctx->key.nw_proto = flowKey->ipv6Key.nwProto;
+
+ ctx->key.src.port = flowKey->ipv6Key.l4.tpSrc;
+ ctx->key.dst.port = flowKey->ipv6Key.l4.tpDst;
+ }
+
+ /* Related bit is set for ICMP and FTP (Not supported)*/
+ ctx->related = FALSE;
+
+ hsrc = OvsJhashBytes((UINT32*) &ctx->key.src, sizeof(ctx->key.src), 0);
+ hdst = OvsJhashBytes((UINT32*) &ctx->key.dst, sizeof(ctx->key.dst), 0);
+ hash = hsrc ^ hdst; /* TO identify reverse traffic */
+ ctx->hash = OvsJhashBytes((uint32_t *) &ctx->key.dst + 1,
+ ((uint32_t *) (&ctx->key + 1) -
+ (uint32_t *) (&ctx->key.dst + 1)),
+ hash);
+}
+
+/*
+ *----------------------------------------------------------------------------
+ * OvsProcessConntrackEntry
+ * Check the TCP flags and set the ct_state of the entry
+ *----------------------------------------------------------------------------
+ */
+static __inline POVS_CT_ENTRY
+OvsProcessConntrackEntry(PNET_BUFFER_LIST curNbl,
+ const TCPHdr *tcp,
+ OvsConntrackKeyLookupCtx *ctx,
+ OvsFlowKey *key,
+ UINT16 zone,
+ BOOLEAN commit,
+ UINT64 currentTime)
+{
+ POVS_CT_ENTRY entry = ctx->entry;
+ UINT32 state = 0;
+
+ /* If an entry was found, update the state based on TCP flags */
+ if (ctx->related) {
+ state |= OVS_CS_F_RELATED;
+ if (ctx->reply) {
+ state = OVS_CS_F_REPLY_DIR;
+ }
+ } else {
+ CT_UPDATE_RES result;
+ result = OvsConntrackUpdateTcpEntry(entry, tcp, curNbl,
+ ctx->reply, currentTime);
+ switch (result) {
+ case CT_UPDATE_VALID:
+ state |= OVS_CS_F_ESTABLISHED;
+ if (ctx->reply) {
+ state |= OVS_CS_F_REPLY_DIR;
+ }
+ break;
+ case CT_UPDATE_INVALID:
+ state |= OVS_CS_F_INVALID;
+ break;
+ case CT_UPDATE_NEW:
+ //Delete and update the Conntrack
+ OvsCtEntryDelete(ctx->entry);
+ ctx->entry = NULL;
+ entry = OvsCtEntryCreate(tcp, curNbl, ctx, key,
+ commit, currentTime);
+ break;
+ }
+ }
+ /* Copy mark and label from entry into flowKey. If actions specify
+ different mark and label, update the flowKey. */
+ OvsCtUpdateFlowKey(key, state, zone, entry->mark, &entry->labels);
+ return entry;
+}
+
+static __inline VOID
+OvsConntrackSetMark(OvsFlowKey *key,
+ POVS_CT_ENTRY entry,
+ UINT32 value,
+ UINT32 mask)
+{
+ UINT32 newMark;
+ newMark = value | (entry->mark & ~(mask));
+ if (entry->mark != newMark) {
+ entry->mark = newMark;
+ key->ct.mark = newMark;
+ }
+}
+
+static __inline void
+OvsConntrackSetLabels(OvsFlowKey *key,
+ POVS_CT_ENTRY entry,
+ struct ovs_key_ct_labels *val,
+ struct ovs_key_ct_labels *mask)
+{
+ ovs_u128 v, m, pktMdLabel;
+ memcpy(&v, val, sizeof v);
+ memcpy(&m, mask, sizeof m);
+
+ pktMdLabel.u64.lo = v.u64.lo | (pktMdLabel.u64.lo & ~(m.u64.lo));
+ pktMdLabel.u64.hi = v.u64.hi | (pktMdLabel.u64.hi & ~(m.u64.hi));
+
+ NdisMoveMemory(&entry->labels, &pktMdLabel,
+ sizeof(struct ovs_key_ct_labels));
+ NdisMoveMemory(&key->ct.labels, &pktMdLabel,
+ sizeof(struct ovs_key_ct_labels));
+}
+
+static __inline NDIS_STATUS
+OvsCtExecute_(PNET_BUFFER_LIST curNbl,
+ OvsFlowKey *key,
+ OVS_PACKET_HDR_INFO *layers,
+ BOOLEAN commit,
+ UINT16 zone,
+ MD_MARK *mark,
+ MD_LABELS *labels)
+{
+ NDIS_STATUS status = NDIS_STATUS_SUCCESS;
+ POVS_CT_ENTRY entry = NULL;
+ OvsConntrackKeyLookupCtx ctx = { 0 };
+ TCPHdr tcpStorage;
+ UINT64 currentTime;
+ LOCK_STATE_EX lockState;
+ const TCPHdr *tcp;
+ tcp = OvsGetTcp(curNbl, layers->l4Offset, &tcpStorage);
+ NdisGetCurrentSystemTime((LARGE_INTEGER *) &currentTime);
+
+ /* Retrieve the Conntrack Key related fields from packet */
+ OvsCtSetupLookupCtx(key, zone, &ctx);
+
+ NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0);
+
+ /* Lookup Conntrack entries for a matching entry */
+ entry = OvsCtLookup(&ctx);
+
+ if (!entry) {
+ /* If no matching entry was found, create one and add New state */
+ entry = OvsCtEntryCreate(tcp, curNbl, &ctx,
+ key, commit, currentTime);
+ } else {
+ /* Process the entry and update CT flags */
+ entry = OvsProcessConntrackEntry(curNbl, tcp, &ctx, key,
+ zone, commit, currentTime);
+ }
+
+ if (entry && mark) {
+ OvsConntrackSetMark(key, entry, mark->value, mark->mask);
+ }
+
+ if (entry && labels) {
+ OvsConntrackSetLabels(key, entry, &labels->value, &labels->mask);
+ }
+
+ NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
+
+ return status;
+}
+
+/*
+ *---------------------------------------------------------------------------
+ * OvsExecuteConntrackAction
+ * Executes Conntrack actions XXX - Add more
+ *---------------------------------------------------------------------------
+ */
+NDIS_STATUS
+OvsExecuteConntrackAction(PNET_BUFFER_LIST curNbl,
+ OVS_PACKET_HDR_INFO *layers,
+ OvsFlowKey *key,
+ const PNL_ATTR a)
+{
+ PNL_ATTR ctAttr;
+ BOOLEAN commit = FALSE;
+ UINT16 zone = 0;
+ MD_MARK *mark = NULL;
+ MD_LABELS *labels = NULL;
+ NDIS_STATUS status;
+
+ status = OvsDetectCtPacket(key);
+ if (status != NDIS_STATUS_SUCCESS) {
+ return status;
+ }
+
+ ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_ZONE);
+ if (ctAttr) {
+ zone = NlAttrGetU16(ctAttr);
+ }
+ ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_COMMIT);
+ if (ctAttr) {
+ commit = TRUE;
+ }
+ ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_MARK);
+ if (ctAttr) {
+ mark = NlAttrGet(ctAttr);
+ }
+ ctAttr = NlAttrFindNested(a, OVS_CT_ATTR_LABELS);
+ if (ctAttr) {
+ labels = NlAttrGet(ctAttr);
+ }
+
+ status = OvsCtExecute_(curNbl, key, layers,
+ commit, zone, mark, labels);
+ return status;
+}
+
+/*
+ *----------------------------------------------------------------------------
+ * OvsConntrackEnrtyCleaner
+ * Runs periodically and cleans up the connection tracker
+ *----------------------------------------------------------------------------
+ */
+VOID
+ovsConntrackEntryCleaner(PVOID data)
+{
+
+ POVS_CT_THREAD_CTX context = (POVS_CT_THREAD_CTX)data;
+ PLIST_ENTRY link, next;
+ POVS_CT_ENTRY entry;
+ BOOLEAN success = TRUE;
+
+ while (success) {
+ LOCK_STATE_EX lockState;
+ NdisAcquireRWLockWrite(ovsConntrackLockObj, &lockState, 0);
+ if (context->exit) {
+ NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
+ break;
+ }
+
+ /* Set the timeout for the thread and cleanup */
+ UINT64 currentTime, threadSleepTimeout;
+ NdisGetCurrentSystemTime((LARGE_INTEGER *)&currentTime);
+ threadSleepTimeout = currentTime + CT_CLEANUP_INTERVAL;
+
+ for (int i = 0; i < CT_HASH_TABLE_SIZE; i++) {
+ LIST_FORALL_SAFE(&ovsConntrackTable[i], link, next) {
+ entry = CONTAINING_RECORD(link, OVS_CT_ENTRY, link);
+ if (entry->expiration < currentTime) {
+ OvsCtEntryDelete(entry);
+ }
+ }
+ }
+
+ NdisReleaseRWLock(ovsConntrackLockObj, &lockState);
+ KeWaitForSingleObject(&context->event, Executive, KernelMode,
+ FALSE, (LARGE_INTEGER *)&threadSleepTimeout);
+ }
+
+ PsTerminateSystemThread(STATUS_SUCCESS);
+}
diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h
new file mode 100644
index 000000000..3a73f2174
--- /dev/null
+++ b/datapath-windows/ovsext/Conntrack.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2015, 2016 VMware, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OVS_CONNTRACK_H_
+#define __OVS_CONNTRACK_H_ 1
+
+#include "precomp.h"
+#include "Flow.h"
+
+struct ct_addr {
+ union {
+ ovs_be32 ipv4;
+ struct in6_addr ipv6;
+ uint32_t ipv4_aligned;
+ struct in6_addr ipv6_aligned;
+ };
+};
+
+struct ct_endpoint {
+ struct ct_addr addr;
+ ovs_be16 port;
+ UINT16 pad;
+};
+
+typedef enum CT_UPDATE_RES {
+ CT_UPDATE_INVALID,
+ CT_UPDATE_VALID,
+ CT_UPDATE_NEW,
+} CT_UPDATE_RES;
+
+/* Metadata mark for masked write to conntrack mark */
+typedef struct MD_MARK {
+ UINT32 value;
+ UINT32 mask;
+} MD_MARK;
+
+/* Metadata label for masked write to conntrack label. */
+typedef struct MD_LABELS {
+ struct ovs_key_ct_labels value;
+ struct ovs_key_ct_labels mask;
+} MD_LABELS;
+
+typedef struct _OVS_CT_KEY {
+ struct ct_endpoint src;
+ struct ct_endpoint dst;
+ UINT16 dl_type;
+ UINT8 nw_proto;
+ UINT16 zone;
+} OVS_CT_KEY, *POVS_CT_KEY;
+
+typedef struct OVS_CT_ENTRY {
+ OVS_CT_KEY key;
+ OVS_CT_KEY rev_key;
+ UINT64 expiration;
+ LIST_ENTRY link;
+ UINT32 mark;
+ struct ovs_key_ct_labels labels;
+} OVS_CT_ENTRY, *POVS_CT_ENTRY;
+
+typedef struct OvsConntrackKeyLookupCtx {
+ OVS_CT_KEY key;
+ POVS_CT_ENTRY entry;
+ UINT32 hash;
+ BOOLEAN reply;
+ BOOLEAN related;
+} OvsConntrackKeyLookupCtx;
+
+#define CT_HASH_TABLE_SIZE ((UINT32)1 << 10)
+#define CT_HASH_TABLE_MASK (CT_HASH_TABLE_SIZE - 1)
+#define CT_ENTRY_TIMEOUT (2 * 600000000) // 2m
+#define CT_CLEANUP_INTERVAL (2 * 600000000) // 2m
+
+VOID OvsCleanupConntrack(VOID);
+NTSTATUS OvsInitConntrack(POVS_SWITCH_CONTEXT context);
+
+NDIS_STATUS OvsExecuteConntrackAction(PNET_BUFFER_LIST curNbl,
+ OVS_PACKET_HDR_INFO *layers,
+ OvsFlowKey *key,
+ const PNL_ATTR a);
+BOOLEAN OvsConntrackValidateTcpPacket(const TCPHdr *tcp);
+OVS_CT_ENTRY * OvsNewTcpConntrack(const TCPHdr *tcp,
+ PNET_BUFFER_LIST nbl,
+ UINT64 now);
+enum CT_UPDATE_RES OvsConntrackUpdateTcpEntry(struct OVS_CT_ENTRY* conn_,
+ const TCPHdr *tcp,
+ PNET_BUFFER_LIST nbl,
+ BOOLEAN reply,
+ UINT64 now);
+#endif /* __OVS_CONNTRACK_H_ */ \ No newline at end of file
diff --git a/datapath-windows/ovsext/Debug.h b/datapath-windows/ovsext/Debug.h
index 45f3c4910..e5ed963c5 100644
--- a/datapath-windows/ovsext/Debug.h
+++ b/datapath-windows/ovsext/Debug.h
@@ -40,6 +40,7 @@
#define OVS_DBG_NETLINK BIT32(20)
#define OVS_DBG_TUNFLT BIT32(21)
#define OVS_DBG_STT BIT32(22)
+#define OVS_DBG_CONTRK BIT32(23)
#define OVS_DBG_RESERVED BIT32(31)
//Please add above OVS_DBG_RESERVED.
diff --git a/datapath-windows/ovsext/DpInternal.h b/datapath-windows/ovsext/DpInternal.h
index d26833f50..a3ce311cc 100644
--- a/datapath-windows/ovsext/DpInternal.h
+++ b/datapath-windows/ovsext/DpInternal.h
@@ -167,6 +167,13 @@ typedef __declspec(align(8)) struct OvsFlowKey {
};
UINT32 recircId; /* Recirculation ID. */
UINT32 dpHash; /* Datapath calculated hash value. */
+ struct {
+ /* Connection tracking fields. */
+ UINT16 zone;
+ UINT32 mark;
+ UINT32 state;
+ struct ovs_key_ct_labels labels;
+ } ct; /* Connection Tracking Flags */
} OvsFlowKey;
#define OVS_WIN_TUNNEL_KEY_SIZE (sizeof (OvsIPv4TunnelKey))
diff --git a/datapath-windows/ovsext/Flow.c b/datapath-windows/ovsext/Flow.c
index f74ce12e8..a7e9bd2ea 100644
--- a/datapath-windows/ovsext/Flow.c
+++ b/datapath-windows/ovsext/Flow.c
@@ -172,7 +172,17 @@ const NL_POLICY nlFlowKeyPolicy[] = {
.maxLen = 4, .optional = TRUE},
[OVS_KEY_ATTR_RECIRC_ID] = {.type = NL_A_UNSPEC, .minLen = 4,
.maxLen = 4, .optional = TRUE},
- [OVS_KEY_ATTR_MPLS] = {.type = NL_A_VAR_LEN, .optional = TRUE}
+ [OVS_KEY_ATTR_MPLS] = {.type = NL_A_VAR_LEN, .optional = TRUE},
+ [OVS_KEY_ATTR_CT_STATE] = {.type = NL_A_UNSPEC, .minLen = 4,
+ .maxLen = 4, .optional = TRUE},
+ [OVS_KEY_ATTR_CT_ZONE] = {.type = NL_A_UNSPEC, .minLen = 2,
+ .maxLen = 2, .optional = TRUE},
+ [OVS_KEY_ATTR_CT_MARK] = {.type = NL_A_UNSPEC, .minLen = 4,
+ .maxLen = 4, .optional = TRUE},
+ [OVS_KEY_ATTR_CT_LABELS] = {.type = NL_A_UNSPEC,
+ .minLen = sizeof(struct ovs_key_ct_labels),
+ .maxLen = sizeof(struct ovs_key_ct_labels),
+ .optional = TRUE}
};
const UINT32 nlFlowKeyPolicyLen = ARRAY_SIZE(nlFlowKeyPolicy);
@@ -229,7 +239,8 @@ const NL_POLICY nlFlowActionPolicy[] = {
.maxLen = sizeof(struct ovs_action_hash),
.optional = TRUE},
[OVS_ACTION_ATTR_SET] = {.type = NL_A_VAR_LEN, .optional = TRUE},
- [OVS_ACTION_ATTR_SAMPLE] = {.type = NL_A_VAR_LEN, .optional = TRUE}
+ [OVS_ACTION_ATTR_SAMPLE] = {.type = NL_A_VAR_LEN, .optional = TRUE},
+ [OVS_ACTION_ATTR_CT] = {.type = NL_A_VAR_LEN, .optional = TRUE}
};
/*
@@ -850,6 +861,28 @@ MapFlowKeyToNlKey(PNL_BUFFER nlBuf,
goto done;
}
+ if (!NlMsgPutTailU32(nlBuf, OVS_KEY_ATTR_CT_STATE,
+ flowKey->ct.state)) {
+ rc = STATUS_UNSUCCESSFUL;
+ goto done;
+ }
+ if (!NlMsgPutTailU16(nlBuf, OVS_KEY_ATTR_CT_ZONE,
+ flowKey->ct.zone)) {
+ rc = STATUS_UNSUCCESSFUL;
+ goto done;
+ }
+ if (!NlMsgPutTailU32(nlBuf, OVS_KEY_ATTR_CT_MARK,
+ flowKey->ct.mark)) {
+ rc = STATUS_UNSUCCESSFUL;
+ goto done;
+ }
+ if (!NlMsgPutTailUnspec(nlBuf, OVS_KEY_ATTR_CT_LABELS,
+ (PCHAR)(&flowKey->ct.labels),
+ sizeof(struct ovs_key_ct_labels))) {
+ rc = STATUS_UNSUCCESSFUL;
+ goto done;
+ }
+
if (flowKey->dpHash) {
if (!NlMsgPutTailU32(nlBuf, OVS_KEY_ATTR_DP_HASH,
flowKey->dpHash)) {
@@ -1386,6 +1419,24 @@ _MapKeyAttrToFlowPut(PNL_ATTR *keyAttrs,
destKey->dpHash = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_DP_HASH]);
}
+ if (keyAttrs[OVS_KEY_ATTR_CT_STATE]) {
+ destKey->ct.state = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_STATE]));
+ }
+
+ if (keyAttrs[OVS_KEY_ATTR_CT_ZONE]) {
+ destKey->ct.zone = (NlAttrGetU16(keyAttrs[OVS_KEY_ATTR_CT_ZONE]));
+ }
+
+ if (keyAttrs[OVS_KEY_ATTR_CT_MARK]) {
+ destKey->ct.mark = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_MARK]));
+ }
+
+ if (keyAttrs[OVS_KEY_ATTR_CT_LABELS]) {
+ const struct ovs_key_ct_labels *ct_labels;
+ ct_labels = NlAttrGet(keyAttrs[OVS_KEY_ATTR_CT_LABELS]);
+ RtlCopyMemory(&destKey->ct.labels, ct_labels, sizeof(struct ovs_key_ct_labels));
+ }
+
/* ===== L2 headers ===== */
destKey->l2.inPort = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_IN_PORT]);
@@ -1774,6 +1825,24 @@ OvsGetFlowMetadata(OvsFlowKey *key,
key->dpHash = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_DP_HASH]);
}
+ if (keyAttrs[OVS_KEY_ATTR_CT_STATE]) {
+ key->ct.state = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_STATE]));
+ }
+
+ if (keyAttrs[OVS_KEY_ATTR_CT_ZONE]) {
+ key->ct.zone = (NlAttrGetU16(keyAttrs[OVS_KEY_ATTR_CT_ZONE]));
+ }
+
+ if (keyAttrs[OVS_KEY_ATTR_CT_MARK]) {
+ key->ct.mark = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_MARK]));
+ }
+
+ if (keyAttrs[OVS_KEY_ATTR_CT_LABELS]) {
+ const struct ovs_key_ct_labels *ct_labels;
+ ct_labels = NlAttrGet(keyAttrs[OVS_KEY_ATTR_CT_LABELS]);
+ RtlCopyMemory(&key->ct.labels, ct_labels, sizeof(struct ovs_key_ct_labels));
+ }
+
return status;
}
@@ -2059,6 +2128,11 @@ FlowEqual(OvsFlow *srcFlow,
srcFlow->key.l2.val == dstKey->l2.val &&
srcFlow->key.recircId == dstKey->recircId &&
srcFlow->key.dpHash == dstKey->dpHash &&
+ srcFlow->key.ct.state == dstKey->ct.state &&
+ srcFlow->key.ct.zone == dstKey->ct.zone &&
+ srcFlow->key.ct.mark == dstKey->ct.mark &&
+ !memcmp(&srcFlow->key.ct.labels, &dstKey->ct.labels,
+ sizeof(struct ovs_key_ct_labels)) &&
FlowMemoryEqual((UINT64 *)((UINT8 *)&srcFlow->key + offset),
(UINT64 *) dstStart,
size));
@@ -2156,6 +2230,21 @@ OvsLookupFlow(OVS_DATAPATH *datapath,
if (key->dpHash) {
*hash = OvsJhashWords((UINT32*)hash, 1, key->dpHash);
}
+ if (key->ct.state) {
+ *hash = OvsJhashWords((UINT32*)hash, 1, key->ct.state);
+ }
+ if (key->ct.zone) {
+ *hash = OvsJhashWords((UINT32*)hash, 1, key->ct.zone);
+ }
+ if (key->ct.mark) {
+ *hash = OvsJhashWords((UINT32*)hash, 1, key->ct.zone);
+ }
+ if (key->ct.labels.ct_labels) {
+ UINT32 lblHash = OvsJhashBytes(&key->ct.labels,
+ sizeof(struct ovs_key_ct_labels),
+ 0);
+ *hash = OvsJhashWords((UINT32*)hash, 1, lblHash);
+ }
}
head = &datapath->flowTable[HASH_BUCKET(*hash)];
@@ -2322,6 +2411,12 @@ ReportFlowInfo(OvsFlow *flow,
info->key.recircId = flow->key.recircId;
info->key.dpHash = flow->key.dpHash;
+ info->key.ct.state = flow->key.ct.state;
+ info->key.ct.zone = flow->key.ct.zone;
+ info->key.ct.mark = flow->key.ct.mark;
+ NdisMoveMemory(&info->key.ct.labels,
+ &flow->key.ct.labels,
+ sizeof(struct ovs_key_ct_labels));
return status;
}
@@ -2578,6 +2673,10 @@ OvsFlowKeyAttrSize(void)
+ NlAttrTotalSize(4) /* OVS_KEY_ATTR_SKB_MARK */
+ NlAttrTotalSize(4) /* OVS_KEY_ATTR_DP_HASH */
+ NlAttrTotalSize(4) /* OVS_KEY_ATTR_RECIRC_ID */
+ + NlAttrTotalSize(4) /* OVS_KEY_ATTR_CT_STATE */
+ + NlAttrTotalSize(2) /* OVS_KEY_ATTR_CT_ZONE */
+ + NlAttrTotalSize(4) /* OVS_KEY_ATTR_CT_MARK */
+ + NlAttrTotalSize(16) /* OVS_KEY_ATTR_CT_LABELS */
+ NlAttrTotalSize(12) /* OVS_KEY_ATTR_ETHERNET */
+ NlAttrTotalSize(2) /* OVS_KEY_ATTR_ETHERTYPE */
+ NlAttrTotalSize(4) /* OVS_KEY_ATTR_VLAN */
@@ -2657,6 +2756,31 @@ OvsProbeSupportedFeature(POVS_MESSAGE msgIn,
OVS_LOG_ERROR("Invalid recirculation ID.");
status = STATUS_INVALID_PARAMETER;
}
+ } else if (keyAttrs[OVS_KEY_ATTR_CT_STATE]) {
+ UINT32 state = NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_STATE]);
+ if (state & OVS_CS_F_DST_NAT || state & OVS_CS_F_SRC_NAT) {
+ status = STATUS_INVALID_PARAMETER;
+ OVS_LOG_ERROR("Contrack NAT is not supported:%d", state);
+ }
+ } else if (keyAttrs[OVS_KEY_ATTR_CT_ZONE]) {
+ UINT16 zone = (NlAttrGetU16(keyAttrs[OVS_KEY_ATTR_CT_ZONE]));
+ if (!zone) {
+ OVS_LOG_ERROR("Invalid zone specified.");
+ status = STATUS_INVALID_PARAMETER;
+ }
+ } else if (keyAttrs[OVS_KEY_ATTR_CT_MARK]) {
+ UINT32 mark = (NlAttrGetU32(keyAttrs[OVS_KEY_ATTR_CT_MARK]));
+ if (!mark) {
+ OVS_LOG_ERROR("Invalid ct mark specified.");
+ status = STATUS_INVALID_PARAMETER;
+ }
+ } else if (keyAttrs[OVS_KEY_ATTR_CT_LABELS]) {
+ const struct ovs_key_ct_labels *ct_labels;
+ ct_labels = NlAttrGet(keyAttrs[OVS_KEY_ATTR_CT_LABELS]);
+ if (!ct_labels->ct_labels) {
+ OVS_LOG_ERROR("Invalid ct label specified.");
+ status = STATUS_INVALID_PARAMETER;
+ }
} else {
OVS_LOG_ERROR("Feature not supported.");
status = STATUS_INVALID_PARAMETER;
diff --git a/datapath-windows/ovsext/Switch.c b/datapath-windows/ovsext/Switch.c
index 77bafb466..7ad2e9846 100644
--- a/datapath-windows/ovsext/Switch.c
+++ b/datapath-windows/ovsext/Switch.c
@@ -20,7 +20,7 @@
*/
#include "precomp.h"
-
+#include "Conntrack.h"
#include "Switch.h"
#include "Vport.h"
#include "Event.h"
@@ -218,6 +218,13 @@ OvsCreateSwitch(NDIS_HANDLE ndisFilterHandle,
goto create_switch_done;
}
+ status = OvsInitConntrack(switchContext);
+ if (status != STATUS_SUCCESS) {
+ OvsUninitSwitchContext(switchContext);
+ OVS_LOG_ERROR("Exit: Failed to initialize Connection tracking");
+ goto create_switch_done;
+ }
+
*switchContextOut = switchContext;
create_switch_done:
@@ -249,6 +256,7 @@ OvsExtDetach(NDIS_HANDLE filterModuleContext)
OvsDeleteSwitch(switchContext);
OvsCleanupIpHelper();
OvsCleanupSttDefragmentation();
+ OvsCleanupConntrack();
/* This completes the cleanup, and a new attach can be handled now. */
OVS_LOG_TRACE("Exit: OvsDetach Successfully");
diff --git a/datapath-windows/ovsext/Types.h b/datapath-windows/ovsext/Types.h
index 5d2744bda..022c65bd9 100644
--- a/datapath-windows/ovsext/Types.h
+++ b/datapath-windows/ovsext/Types.h
@@ -28,6 +28,12 @@ typedef uint64 __u64, __be64;
typedef uint32 __u32, __be32;
typedef uint16 __u16, __be16;
typedef uint8 __u8;
+typedef union ovs_u128 {
+ uint32_t u32[4];
+ struct {
+ uint64_t lo, hi;
+ } u64;
+} ovs_u128;
/* Defines the userspace specific data types for file
* included within kernel only. */
diff --git a/datapath-windows/ovsext/Util.h b/datapath-windows/ovsext/Util.h
index 78b926d3b..4bcde3b23 100644
--- a/datapath-windows/ovsext/Util.h
+++ b/datapath-windows/ovsext/Util.h
@@ -37,6 +37,7 @@
#define OVS_GRE_POOL_TAG 'GSVO'
#define OVS_TUNFLT_POOL_TAG 'WSVO'
#define OVS_RECIRC_POOL_TAG 'CSVO'
+#define OVS_CT_POOL_TAG 'CTVO'
VOID *OvsAllocateMemory(size_t size);
VOID *OvsAllocateMemoryWithTag(size_t size, ULONG tag);
@@ -68,7 +69,7 @@ VOID OvsFreeAlignedMemory(VOID *ptr);
VOID OvsAppendList(PLIST_ENTRY dst, PLIST_ENTRY src);
-
+#define MAX(_a, _b) ((_a) > (_b) ? (_a) : (_b))
#define MIN(_a, _b) ((_a) > (_b) ? (_b) : (_a))
#define ARRAY_SIZE(_x) ((sizeof(_x))/sizeof (_x)[0])
#define OVS_SWITCH_PORT_ID_INVALID (NDIS_SWITCH_PORT_ID)(-1)
diff --git a/datapath-windows/ovsext/ovsext.vcxproj b/datapath-windows/ovsext/ovsext.vcxproj
index af718f775..0356ddf58 100644
--- a/datapath-windows/ovsext/ovsext.vcxproj
+++ b/datapath-windows/ovsext/ovsext.vcxproj
@@ -74,6 +74,7 @@
<ClInclude Include="Actions.h" />
<ClInclude Include="Atomic.h" />
<ClInclude Include="BufferMgmt.h" />
+ <ClInclude Include="Conntrack.h" />
<ClInclude Include="Datapath.h" />
<ClInclude Include="Debug.h" />
<ClInclude Include="DpInternal.h" />
@@ -175,6 +176,8 @@
<ItemGroup>
<ClCompile Include="Actions.c" />
<ClCompile Include="BufferMgmt.c" />
+ <ClCompile Include="Conntrack-tcp.c" />
+ <ClCompile Include="Conntrack.c" />
<ClCompile Include="Debug.c" />
<ClCompile Include="Driver.c" />
<ClCompile Include="Event.c" />