From 792d377d8330606ad122dae5f941e5088c10cf14 Mon Sep 17 00:00:00 2001 From: Sairam Venugopal Date: Wed, 13 Apr 2016 11:54:03 -0700 Subject: datapath-windows: Add Connection Tracking Support Enable support for Stateful Firewall in Hyper-V by adding a Connection Tracking module. The module has been ported over from the userspace implementation patch of a similar name. The current version of the module supports ct - zone, mark and label for TCP packets. Support for other packet formats will be added in subsequent patches. The conntrack-tcp module is adapted from FreeBSD's pf subsystem and hence the BSD license. It has been ported over to match OVS Hyper-V coding style. Signed-off-by: Sairam Venugopal Signed-off-by: Daniele Di Proietto Co-Authored-by: Daniele Di Proietto Acked-by: Nithin Raju Signed-off-by: Ben Pfaff --- datapath-windows/ovsext/Conntrack-tcp.c | 532 ++++++++++++++++++++++++++++++++ 1 file changed, 532 insertions(+) create mode 100644 datapath-windows/ovsext/Conntrack-tcp.c (limited to 'datapath-windows/ovsext/Conntrack-tcp.c') diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c new file mode 100644 index 000000000..3e25ba567 --- /dev/null +++ b/datapath-windows/ovsext/Conntrack-tcp.c @@ -0,0 +1,532 @@ +/*- + * Copyright (c) 2001 Daniel Hartmeier + * Copyright (c) 2002 - 2008 Henning Brauer + * Copyright (c) 2012 Gleb Smirnoff + * Copyright (c) 2015, 2016 VMware, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Effort sponsored in part by the Defense Advanced Research Projects + * Agency (DARPA) and Air Force Research Laboratory, Air Force + * Materiel Command, USAF, under agreement number F30602-01-2-0537. + * + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ + */ + +#include "Conntrack.h" +#include + +struct tcp_peer { + enum ct_dpif_tcp_state state; + uint32_t seqlo; /* Max sequence number sent */ + uint32_t seqhi; /* Max the other end ACKd + win */ + uint16_t max_win;/* largest window (pre scaling) */ + uint8_t wscale; /* window scaling factor */ +}; + +struct conn_tcp { + struct OVS_CT_ENTRY up; + struct tcp_peer peer[2]; +}; + +enum { + TCPOPT_EOL, + TCPOPT_NOP, + TCPOPT_WINDOW = 3, +}; + +/* Given POINTER, the address of the given MEMBER in a STRUCT object, returns + the STRUCT object. */ +#define CONTAINER_OF(POINTER, STRUCT, MEMBER) \ + ((STRUCT *) (void *) ((char *) (POINTER) - \ + offsetof (STRUCT, MEMBER))) + + +/* TCP sequence numbers are 32 bit integers operated + * on with modular arithmetic. These macros can be + * used to compare such integers. */ +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) + +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) + +#define TCP_FIN 0x001 +#define TCP_SYN 0x002 +#define TCP_RST 0x004 +#define TCP_PSH 0x008 +#define TCP_ACK 0x010 +#define TCP_URG 0x020 +#define TCP_ECE 0x040 +#define TCP_CWR 0x080 +#define TCP_NS 0x100 + +#define CT_DPIF_TCP_FLAGS \ + CT_DPIF_TCP_FLAG(WINDOW_SCALE) \ + CT_DPIF_TCP_FLAG(SACK_PERM) \ + CT_DPIF_TCP_FLAG(CLOSE_INIT) \ + CT_DPIF_TCP_FLAG(BE_LIBERAL) \ + CT_DPIF_TCP_FLAG(DATA_UNACKNOWLEDGED) \ + CT_DPIF_TCP_FLAG(MAXACK_SET) \ + +enum ct_dpif_tcp_flags_count_ { +#define CT_DPIF_TCP_FLAG(FLAG) FLAG##_COUNT_, + CT_DPIF_TCP_FLAGS +#undef CT_DPIF_TCP_FLAG +}; + +enum ct_dpif_tcp_flags { +#define CT_DPIF_TCP_FLAG(FLAG) CT_DPIF_TCPF_##FLAG = (1 << \ + FLAG##_COUNT_), + CT_DPIF_TCP_FLAGS +#undef CT_DPIF_TCP_FLAG +}; + + +#define CT_DPIF_TCP_STATES \ + CT_DPIF_TCP_STATE(CLOSED) \ + CT_DPIF_TCP_STATE(LISTEN) \ + CT_DPIF_TCP_STATE(SYN_SENT) \ + CT_DPIF_TCP_STATE(SYN_RECV) \ + CT_DPIF_TCP_STATE(ESTABLISHED) \ + CT_DPIF_TCP_STATE(CLOSE_WAIT) \ + CT_DPIF_TCP_STATE(FIN_WAIT_1) \ + CT_DPIF_TCP_STATE(CLOSING) \ + CT_DPIF_TCP_STATE(LAST_ACK) \ + CT_DPIF_TCP_STATE(FIN_WAIT_2) \ + CT_DPIF_TCP_STATE(TIME_WAIT) + +enum ct_dpif_tcp_state { +#define CT_DPIF_TCP_STATE(STATE) CT_DPIF_TCPS_##STATE, + CT_DPIF_TCP_STATES +#undef CT_DPIF_TCP_STATE +}; + +#define TCP_MAX_WSCALE 14 +#define CT_WSCALE_FLAG 0x80 +#define CT_WSCALE_UNKNOWN 0x40 +#define CT_WSCALE_MASK 0xf + +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub + * is enabled. We're not scrubbing, but this check seems reasonable. */ +static __inline BOOLEAN +OvsConntrackValidateTcpFlags(const TCPHdr *tcp) +{ + if (tcp->syn) { + if (tcp->rst) { + return TRUE; + } + if (tcp->fin) { + /* Here pf removes the fin flag. We simply mark the packet as + * invalid */ + return TRUE; + } + } else { + /* Illegal packet */ + if (!(tcp->ack || tcp->rst)) { + return TRUE; + } + } + + if (!(tcp->ack)) { + /* These flags are only valid if ACK is set */ + if ((tcp->fin) || (tcp->psh) || (tcp->urg)) { + return TRUE; + } + } + + return FALSE; +} + +static __inline uint8_t +OvsTcpGetWscale(const TCPHdr *tcp) +{ + unsigned len = tcp->doff * 4 - sizeof *tcp; + const uint8_t *opt = (const uint8_t *)(tcp + 1); + uint8_t wscale = 0; + uint8_t optlen; + + while (len >= 3) { + if (*opt == TCPOPT_EOL) { + break; + } + switch (*opt) { + case TCPOPT_NOP: + opt++; + len--; + break; + case TCPOPT_WINDOW: + wscale = MIN(opt[2], TCP_MAX_WSCALE); + wscale |= CT_WSCALE_FLAG; + /* fall through */ + default: + optlen = opt[2]; + if (optlen < 2) { + optlen = 2; + } + len -= optlen; + opt += optlen; + } + } + + return wscale; +} + +static __inline uint32_t +OvsGetTcpPayloadLength(PNET_BUFFER_LIST nbl) +{ + IPHdr *ipHdr; + char *ipBuf[sizeof(IPHdr)]; + PNET_BUFFER curNb; + curNb = NET_BUFFER_LIST_FIRST_NB(nbl); + ipHdr = NdisGetDataBuffer(curNb, sizeof *ipHdr, (PVOID) &ipBuf, + 1 /*no align*/, 0); + TCPHdr *tcp = (TCPHdr *)((PCHAR)ipHdr + ipHdr->ihl * 4); + return (UINT16)ntohs(ipHdr->tot_len) + - (ipHdr->ihl * 4) + - (sizeof * tcp); +} + +static __inline void +OvsConntrackUpdateExpiration(struct conn_tcp *conn, + long long now, + long long interval) +{ + conn->up.expiration = now + interval; +} + +static __inline struct conn_tcp* +OvsCastConntrackEntryToTcpEntry(OVS_CT_ENTRY* conn) +{ + return CONTAINER_OF(conn, struct conn_tcp, up); +} + +enum CT_UPDATE_RES +OvsConntrackUpdateTcpEntry(struct OVS_CT_ENTRY* conn_, + const TCPHdr *tcp, + PNET_BUFFER_LIST nbl, + BOOLEAN reply, + UINT64 now) +{ + struct conn_tcp *conn = OvsCastConntrackEntryToTcpEntry(conn_); + /* The peer that sent 'pkt' */ + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; + /* The peer that should receive 'pkt' */ + struct tcp_peer *dst = &conn->peer[reply ? 0 : 1]; + uint8_t sws = 0, dws = 0; + uint16_t win = ntohs(tcp->window); + uint32_t ack, end, seq, orig_seq; + uint32_t p_len = OvsGetTcpPayloadLength(nbl); + int ackskew; + + if (OvsConntrackValidateTcpFlags(tcp)) { + return CT_UPDATE_INVALID; + } + + if ((tcp->syn) && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { + src->state = dst->state = CT_DPIF_TCPS_CLOSED; + return CT_UPDATE_NEW; + } + + if (src->wscale & CT_WSCALE_FLAG + && dst->wscale & CT_WSCALE_FLAG + && !(tcp->syn)) { + + sws = src->wscale & CT_WSCALE_MASK; + dws = dst->wscale & CT_WSCALE_MASK; + + } else if (src->wscale & CT_WSCALE_UNKNOWN + && dst->wscale & CT_WSCALE_UNKNOWN + && !(tcp->syn)) { + + sws = TCP_MAX_WSCALE; + dws = TCP_MAX_WSCALE; + } + + /* + * Sequence tracking algorithm from Guido van Rooij's paper: + * http://www.madison-gurkha.com/publications/tcp_filtering/ + * tcp_filtering.ps + */ + + orig_seq = seq = ntohl(tcp->seq); + if (src->state < CT_DPIF_TCPS_SYN_SENT) { + /* First packet from this end. Set its state */ + + ack = ntohl(tcp->ack); + + end = seq + p_len; + if (tcp->syn) { + end++; + if (dst->wscale & CT_WSCALE_FLAG) { + src->wscale = OvsTcpGetWscale(tcp); + if (src->wscale & CT_WSCALE_FLAG) { + /* Remove scale factor from initial window */ + sws = src->wscale & CT_WSCALE_MASK; + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); + dws = dst->wscale & CT_WSCALE_MASK; + } else { + /* fixup other window */ + dst->max_win <<= dst->wscale & + CT_WSCALE_MASK; + /* in case of a retrans SYN|ACK */ + dst->wscale = 0; + } + } + } + if (tcp->fin) { + end++; + } + + src->seqlo = seq; + src->state = CT_DPIF_TCPS_SYN_SENT; + /* + * May need to slide the window (seqhi may have been set by + * the crappy stack check or if we picked up the connection + * after establishment) + */ + if (src->seqhi == 1 || + SEQ_GEQ(end + MAX(1, dst->max_win << dws), + src->seqhi)) { + src->seqhi = end + MAX(1, dst->max_win << dws); + } + if (win > src->max_win) { + src->max_win = win; + } + + } else { + ack = ntohl(tcp->ack); + end = seq + p_len; + if (tcp->syn) { + end++; + } + if (tcp->fin) { + end++; + } + } + + if ((tcp->ack) == 0) { + /* Let it pass through the ack skew check */ + ack = dst->seqlo; + } else if ((ack == 0 + && (tcp->ack && tcp->rst) == (TCP_ACK|TCP_RST)) + /* broken tcp stacks do not set ack */) { + /* Many stacks (ours included) will set the ACK number in an + * FIN|ACK if the SYN times out -- no sequence to ACK. */ + ack = dst->seqlo; + } + + if (seq == end) { + /* Ease sequencing restrictions on no data packets */ + seq = src->seqlo; + end = seq; + } + + ackskew = dst->seqlo - ack; +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge factor */ + if (SEQ_GEQ(src->seqhi, end) + /* Last octet inside other's window space */ + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) + /* Retrans: not more than one window back */ + && (ackskew >= -MAXACKWINDOW) + /* Acking not more than one reassembled fragment backwards */ + && (ackskew <= (MAXACKWINDOW << sws)) + /* Acking not more than one window forward */ + && ((tcp->rst) == 0 || orig_seq == src->seqlo + || (orig_seq == src->seqlo + 1) + || (orig_seq + 1 == src->seqlo))) { + /* Require an exact/+1 sequence match on resets when possible */ + + /* update max window */ + if (src->max_win < win) { + src->max_win = win; + } + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) { + src->seqlo = end; + } + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { + dst->seqhi = ack + MAX((win << sws), 1); + } + + /* update states */ + if (tcp->syn && src->state < CT_DPIF_TCPS_SYN_SENT) { + src->state = CT_DPIF_TCPS_SYN_SENT; + } + if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) { + src->state = CT_DPIF_TCPS_CLOSING; + } + if (tcp->ack) { + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { + dst->state = CT_DPIF_TCPS_ESTABLISHED; + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; + } + } + if (tcp->rst) { + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; + } + + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { + OvsConntrackUpdateExpiration(conn, now, 30 * 10000000LL); + } else if (src->state >= CT_DPIF_TCPS_CLOSING + && dst->state >= CT_DPIF_TCPS_CLOSING) { + OvsConntrackUpdateExpiration(conn, now, 45 * 10000000LL); + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { + OvsConntrackUpdateExpiration(conn, now, 30 * 10000000LL); + } else if (src->state >= CT_DPIF_TCPS_CLOSING + || dst->state >= CT_DPIF_TCPS_CLOSING) { + OvsConntrackUpdateExpiration(conn, now, 15 * 60 * 10000000LL); + } else { + OvsConntrackUpdateExpiration(conn, now, 24 * 60 * 60 * 10000000LL); + } + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) + /* Within a window forward of the originating packet */ + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { + /* Within a window backward of the originating packet */ + + /* + * This currently handles three situations: + * 1) Stupid stacks will shotgun SYNs before their peer + * replies. + * 2) When PF catches an already established stream (the + * firewall rebooted, the state table was flushed, routes + * changed...) + * 3) Packets get funky immediately after the connection + * closes (this should catch Solaris spurious ACK|FINs + * that web servers like to spew after a close) + * + * This must be a little more careful than the above code + * since packet floods will also be caught here. We don't + * update the TTL here to mitigate the damage of a packet + * flood and so the same code can handle awkward establishment + * and a loosened connection close. + * In the establishment case, a correct peer response will + * validate the connection, go through the normal state code + * and keep updating the state TTL. + */ + + /* update max window */ + if (src->max_win < win) { + src->max_win = win; + } + /* synchronize sequencing */ + if (SEQ_GT(end, src->seqlo)) { + src->seqlo = end; + } + /* slide the window of what the other end can send */ + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { + dst->seqhi = ack + MAX((win << sws), 1); + } + + /* + * Cannot set dst->seqhi here since this could be a shotgunned + * SYN and not an already established connection. + */ + + if (tcp->fin && src->state < CT_DPIF_TCPS_CLOSING) { + src->state = CT_DPIF_TCPS_CLOSING; + } + + if (tcp->rst) { + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; + } + } else { + return CT_UPDATE_INVALID; + } + + return CT_UPDATE_VALID; +} + +BOOLEAN +OvsConntrackValidateTcpPacket(const TCPHdr *tcp) +{ + if (tcp == NULL || OvsConntrackValidateTcpFlags(tcp)) { + return FALSE; + } + + /* A syn+ack is not allowed to create a connection. We want to allow + * totally new connections (syn) or already established, not partially + * open (syn+ack). */ + if ((tcp->syn) && (tcp->ack)) { + return FALSE; + } + + return TRUE; +} + +OVS_CT_ENTRY * +OvsNewTcpConntrack(const TCPHdr *tcp, + PNET_BUFFER_LIST nbl, + UINT64 now) +{ + struct conn_tcp* newconn = NULL; + struct tcp_peer *src, *dst; + + newconn = OvsAllocateMemoryWithTag(sizeof(struct conn_tcp), + OVS_CT_POOL_TAG); + newconn->up = (OVS_CT_ENTRY) {0}; + src = &newconn->peer[0]; + dst = &newconn->peer[1]; + + src->seqlo = ntohl(tcp->seq); + src->seqhi = src->seqlo + OvsGetTcpPayloadLength(nbl) + 1; + + if (tcp->syn) { + src->seqhi++; + src->wscale = OvsTcpGetWscale(tcp); + } else { + src->wscale = CT_WSCALE_UNKNOWN; + dst->wscale = CT_WSCALE_UNKNOWN; + } + src->max_win = MAX(ntohs(tcp->window), 1); + if (src->wscale & CT_WSCALE_MASK) { + /* Remove scale factor from initial window */ + uint8_t sws = src->wscale & CT_WSCALE_MASK; + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, + 1 << sws); + } + if (tcp->fin) { + src->seqhi++; + } + dst->seqhi = 1; + dst->max_win = 1; + src->state = CT_DPIF_TCPS_SYN_SENT; + dst->state = CT_DPIF_TCPS_CLOSED; + + OvsConntrackUpdateExpiration(newconn, now, CT_ENTRY_TIMEOUT); + + return &newconn->up; +} \ No newline at end of file -- cgit v1.2.1