/*
 * Copyright (c) 2016 VMware, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "precomp.h"

#include "Atomic.h"
#include "Debug.h"
#include "Flow.h"
#include "IpHelper.h"
#include "Jhash.h"
#include "NetProto.h"
#include "Offload.h"
#include "PacketIO.h"
#include "PacketParser.h"
#include "Geneve.h"
#include "Switch.h"
#include "User.h"
#include "Util.h"
#include "Vport.h"

#ifdef OVS_DBG_MOD
#undef OVS_DBG_MOD
#endif
#define OVS_DBG_MOD OVS_DBG_GENEVE

/*
 * Allocates and zero-fills the Geneve private state for 'vport', stores
 * 'udpDestPort' in it and attaches it to vport->priv.
 *
 * Returns STATUS_SUCCESS, or STATUS_INSUFFICIENT_RESOURCES when the
 * allocation fails (vport->priv is not touched in that case).
 */
NTSTATUS
OvsInitGeneveTunnel(POVS_VPORT_ENTRY vport,
                    UINT16 udpDestPort)
{
    POVS_GENEVE_VPORT genevePort;

    genevePort = (POVS_GENEVE_VPORT)
        OvsAllocateMemoryWithTag(sizeof(*genevePort), OVS_GENEVE_POOL_TAG);
    if (!genevePort) {
        OVS_LOG_ERROR("Insufficient memory, can't allocate GENEVE_VPORT");
        return STATUS_INSUFFICIENT_RESOURCES;
    }

    RtlZeroMemory(genevePort, sizeof(*genevePort));
    genevePort->dstPort = udpDestPort;
    vport->priv = (PVOID) genevePort;

    return STATUS_SUCCESS;
}

/*
 * Releases the Geneve private state attached to 'vport' and clears
 * vport->priv. Does nothing when 'vport' is not a Geneve port or has no
 * private state.
 */
VOID
OvsCleanupGeneveTunnel(POVS_VPORT_ENTRY vport)
{
    if (vport->ovsType != OVS_VPORT_TYPE_GENEVE || vport->priv == NULL) {
        return;
    }

    OvsFreeMemoryWithTag(vport->priv, OVS_GENEVE_POOL_TAG);
    vport->priv = NULL;
}

/*
 * Encapsulates the packet(s) of 'curNbl' in Ethernet + IPv4/IPv6 + UDP +
 * Geneve headers described by 'tunKey'.
 *
 * Parameters:
 *   vport         - Geneve vport; its private data supplies the default UDP
 *                   destination port and the IPv4 identification counter.
 *   curNbl        - NBL to encapsulate.
 *   tunKey        - tunnel key: addresses, tos/ttl, flags, tunnel id and
 *                   Geneve options.
 *   switchContext - switch context passed to the NBL helpers.
 *   layers        - parsed header offsets of the inner packet.
 *   newNbl        - out: the encapsulated NBL. This function tests *newNbl
 *                   for NULL before ever assigning it, so the caller must
 *                   pass it in as NULL.
 *   switchFwdInfo - receives a copy of the forwarding info found for the
 *                   tunnel endpoints.
 *
 * Returns STATUS_SUCCESS on success. On failure a failure status is
 * returned; if an NBL had already been created it is completed and
 * *newNbl is reset to NULL.
 */
NDIS_STATUS
OvsEncapGeneve(POVS_VPORT_ENTRY vport,
               PNET_BUFFER_LIST curNbl,
               OvsIPTunnelKey* tunKey,
               POVS_SWITCH_CONTEXT switchContext,
               POVS_PACKET_HDR_INFO layers,
               PNET_BUFFER_LIST *newNbl,
               POVS_FWD_INFO switchFwdInfo)
{
    NTSTATUS status;
    OVS_FWD_INFO fwdInfo;
    PNET_BUFFER curNb;
    PMDL curMdl;
    PUINT8 bufferStart;
    EthHdr* ethHdr;
    IPHdr* ipHdr = NULL;
    IPv6Hdr* ipv6Hdr = NULL;
    UDPHdr* udpHdr = NULL;
    GeneveHdr* geneveHdr;
    GeneveOptionHdr* optHdr;
    POVS_GENEVE_VPORT vportGeneve;
    UINT32 headRoom = 0;
    UINT32 packetLength;
    ULONG mss = 0;
    NDIS_TCP_IP_CHECKSUM_NET_BUFFER_LIST_INFO csumInfo;

    /* Room needed in front of the inner frame for all outer headers. */
    if (tunKey->dst.si_family == AF_INET) {
        headRoom = OvsGetGeneveTunHdrMinSize() + tunKey->tunOptLen;
    } else if (tunKey->dst.si_family == AF_INET6) {
        headRoom = OvsGetGeneveIPv6TunHdrMinSize() + tunKey->tunOptLen;
    } else {
        /*
         * Without this guard no head room is reserved and none of the
         * outer header pointers below get set, so the UDP header would be
         * written through a NULL pointer in builds where ASSERT is
         * compiled out.
         */
        OVS_LOG_ERROR("Unsupported tunnel address family");
        return NDIS_STATUS_FAILURE;
    }

    status = OvsLookupIPhFwdInfo(tunKey->src, tunKey->dst, &fwdInfo);
    if (status != STATUS_SUCCESS) {
        OvsFwdIPHelperRequest(NULL, 0, tunKey, NULL, NULL, NULL);
        /*
         * return NDIS_STATUS_PENDING;
         *
         * XXX: Don't know if the completionList will make any sense when
         * accessed in the callback. Make sure the caveats are known.
         *
         * XXX: This code will work once we are able to grab locks in the
         * callback.
         */
        return NDIS_STATUS_FAILURE;
    }

    RtlCopyMemory(switchFwdInfo->value, fwdInfo.value, sizeof fwdInfo.value);

    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
    packetLength = NET_BUFFER_DATA_LENGTH(curNb);

    if (layers->isTcp) {
        mss = OVSGetTcpMSS(curNbl);

        OVS_LOG_TRACE("MSS %u packet len %u", mss, packetLength);
        if (mss) {
            OVS_LOG_TRACE("l4Offset %d", layers->l4Offset);
            *newNbl = OvsTcpSegmentNBL(switchContext, curNbl, layers,
                                       mss, headRoom, FALSE);
            if (*newNbl == NULL) {
                OVS_LOG_ERROR("Unable to segment NBL");
                return NDIS_STATUS_FAILURE;
            }
            /* Clear out LSO flags after this point */
            NET_BUFFER_LIST_INFO(*newNbl, TcpLargeSendNetBufferListInfo) = 0;
        }
    }

    vportGeneve = (POVS_GENEVE_VPORT) GetOvsVportPriv(vport);
    ASSERT(vportGeneve != NULL);

    /* If we didn't split the packet above, make a copy now */
    if (*newNbl == NULL) {
        *newNbl = OvsPartialCopyNBL(switchContext, curNbl, 0, headRoom,
                                    FALSE /*NBL info*/);
        if (*newNbl == NULL) {
            OVS_LOG_ERROR("Unable to copy NBL");
            return NDIS_STATUS_FAILURE;
        }

        csumInfo.Value = NET_BUFFER_LIST_INFO(curNbl,
                                              TcpIpChecksumNetBufferListInfo);
        status = OvsApplySWChecksumOnNB(layers, *newNbl, &csumInfo);
        if (status != NDIS_STATUS_SUCCESS) {
            goto ret_error;
        }
    }

    /* From here on work on the new NBL; prepend headers to every NB. */
    curNbl = *newNbl;
    for (curNb = NET_BUFFER_LIST_FIRST_NB(curNbl); curNb != NULL;
         curNb = curNb->Next) {
        status = NdisRetreatNetBufferDataStart(curNb, headRoom, 0, NULL);
        if (status != NDIS_STATUS_SUCCESS) {
            goto ret_error;
        }

        curMdl = NET_BUFFER_CURRENT_MDL(curNb);
        bufferStart = (PUINT8)OvsGetMdlWithLowPriority(curMdl);
        if (!bufferStart) {
            status = NDIS_STATUS_RESOURCES;
            goto ret_error;
        }

        bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);
        if (NET_BUFFER_NEXT_NB(curNb)) {
            OVS_LOG_TRACE("nb length %u next %u",
                          NET_BUFFER_DATA_LENGTH(curNb),
                          NET_BUFFER_DATA_LENGTH(curNb->Next));
        }

        /* L2 header */
        ethHdr = (EthHdr *)bufferStart;
        NdisMoveMemory(ethHdr->Destination, fwdInfo.dstMacAddr,
                       sizeof ethHdr->Destination);
        NdisMoveMemory(ethHdr->Source, fwdInfo.srcMacAddr,
                       sizeof ethHdr->Source);
        if (tunKey->dst.si_family == AF_INET) {
            ethHdr->Type = htons(ETH_TYPE_IPV4);
        } else if (tunKey->dst.si_family == AF_INET6) {
            ethHdr->Type = htons(ETH_TYPE_IPV6);
        }

        if (tunKey->dst.si_family == AF_INET) {
            /* IP header */
            ipHdr = (IPHdr *)((PCHAR)ethHdr + sizeof *ethHdr);

            ipHdr->ihl = sizeof *ipHdr / 4;
            ipHdr->version = IPPROTO_IPV4;
            ipHdr->tos = tunKey->tos;
            ipHdr->tot_len = htons(NET_BUFFER_DATA_LENGTH(curNb) -
                                   sizeof *ethHdr);
            ipHdr->id = (uint16)atomic_add64(&vportGeneve->ipId,
                                             NET_BUFFER_DATA_LENGTH(curNb));
            ipHdr->frag_off = (tunKey->flags & OVS_TNL_F_DONT_FRAGMENT) ?
                              IP_DF_NBO : 0;
            ipHdr->ttl = tunKey->ttl ? tunKey->ttl : GENEVE_DEFAULT_TTL;
            ipHdr->protocol = IPPROTO_UDP;
            ASSERT(OvsIphAddrEquals(&tunKey->dst, &fwdInfo.dstIphAddr));
            ASSERT(OvsIphAddrEquals(&tunKey->src, &fwdInfo.srcIphAddr) ||
                   OvsIphIsZero(&tunKey->src));
            ipHdr->saddr = fwdInfo.srcIphAddr.Ipv4.sin_addr.s_addr;
            ipHdr->daddr = fwdInfo.dstIphAddr.Ipv4.sin_addr.s_addr;
            /* Left zero; IpHeaderChecksum is requested in csumInfo below. */
            ipHdr->check = 0;
        } else if (tunKey->dst.si_family == AF_INET6) {
            /* IPv6 header */
            ipv6Hdr = (IPv6Hdr *)((PCHAR)ethHdr + sizeof *ethHdr);

            ipv6Hdr->version = IPV6;
            ipv6Hdr->priority = 0;
            ipv6Hdr->flow_lbl[0] = 0;
            ipv6Hdr->flow_lbl[1] = 0;
            ipv6Hdr->flow_lbl[2] = 0;
            ipv6Hdr->payload_len = htons(NET_BUFFER_DATA_LENGTH(curNb) -
                                         sizeof *ethHdr - sizeof *ipv6Hdr);
            ipv6Hdr->hop_limit = tunKey->ttl ? tunKey->ttl
                                             : GENEVE_DEFAULT_TTL;
            ipv6Hdr->nexthdr = IPPROTO_UDP;
            ASSERT(OvsIphAddrEquals(&(tunKey->dst), &(fwdInfo.dstIphAddr)));
            ASSERT(OvsIphAddrEquals(&(tunKey->src), &(fwdInfo.srcIphAddr)) ||
                   OvsIphIsZero(&(tunKey->src)));
            RtlCopyMemory(&ipv6Hdr->saddr,
                          &fwdInfo.srcIphAddr.Ipv6.sin6_addr,
                          sizeof(ipv6Hdr->saddr));
            RtlCopyMemory(&ipv6Hdr->daddr,
                          &fwdInfo.dstIphAddr.Ipv6.sin6_addr,
                          sizeof(ipv6Hdr->daddr));
        }

        /* UDP header */
        if (tunKey->dst.si_family == AF_INET) {
            udpHdr = (UDPHdr *)((PCHAR)ipHdr + sizeof *ipHdr);
        } else if (tunKey->dst.si_family == AF_INET6) {
            udpHdr = (UDPHdr *)((PCHAR)ipv6Hdr + sizeof *ipv6Hdr);
        }
        ASSERT(udpHdr);

        udpHdr->source = htons(tunKey->flow_hash | MAXINT16);
        udpHdr->dest = tunKey->dst_port ? tunKey->dst_port :
                                          htons(vportGeneve->dstPort);
        /* Inner frame length plus UDP header, Geneve header and options. */
        udpHdr->len = htons(NET_BUFFER_DATA_LENGTH(curNb) - headRoom +
                            sizeof *udpHdr + sizeof *geneveHdr +
                            tunKey->tunOptLen);
        if (tunKey->flags & OVS_TNL_F_CSUM) {
            /*
             * Only the pseudo-header checksum is stored here; UdpChecksum
             * is requested in csumInfo below.
             */
            UINT16 udpChksumLen = 0;

            if (tunKey->dst.si_family == AF_INET) {
                udpChksumLen = (UINT16) NET_BUFFER_DATA_LENGTH(curNb) -
                               sizeof *ipHdr - sizeof *ethHdr;
                udpHdr->check = IPPseudoChecksum(&ipHdr->saddr,
                                                 &ipHdr->daddr,
                                                 IPPROTO_UDP, udpChksumLen);
            } else if (tunKey->dst.si_family == AF_INET6) {
                udpChksumLen = (UINT16) NET_BUFFER_DATA_LENGTH(curNb) -
                               sizeof *ipv6Hdr - sizeof *ethHdr;
                udpHdr->check =
                    IPv6PseudoChecksum((UINT32*)&ipv6Hdr->saddr,
                                       (UINT32*)&ipv6Hdr->daddr,
                                       IPPROTO_UDP, udpChksumLen);
            }
        } else {
            udpHdr->check = 0;
        }

        /* Geneve header */
        geneveHdr = (GeneveHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
        geneveHdr->version = GENEVE_VER;
        /* optLen is expressed in 4-byte units. */
        geneveHdr->optLen = tunKey->tunOptLen / 4;
        geneveHdr->oam = !!(tunKey->flags & OVS_TNL_F_OAM);
        geneveHdr->critical = !!(tunKey->flags & OVS_TNL_F_CRT_OPT);
        geneveHdr->reserved1 = 0;
        geneveHdr->protocol = ETH_P_TEB_NBO;
        geneveHdr->vni = GENEVE_TUNNELID_TO_VNI(tunKey->tunnelId);
        geneveHdr->reserved2 = 0;

        /* Geneve header options */
        optHdr = (GeneveOptionHdr *)(geneveHdr + 1);
        memcpy(optHdr, IPTunnelKeyGetOptions(tunKey), tunKey->tunOptLen);

        /* Checksum offload request for the outer headers. */
        csumInfo.Value = 0;
        csumInfo.Transmit.IpHeaderChecksum = 1;
        if (tunKey->dst.si_family == AF_INET) {
            csumInfo.Transmit.IsIPv4 = 1;
        } else if (tunKey->dst.si_family == AF_INET6) {
            csumInfo.Transmit.IsIPv4 = 0;
            csumInfo.Transmit.IsIPv6 = 1;
            csumInfo.Transmit.IpHeaderChecksum = 0;
        }

        if (tunKey->flags & OVS_TNL_F_CSUM) {
            csumInfo.Transmit.UdpChecksum = 1;
        }
        NET_BUFFER_LIST_INFO(curNbl,
                             TcpIpChecksumNetBufferListInfo) = csumInfo.Value;
    }
    return STATUS_SUCCESS;

ret_error:
    OvsCompleteNBL(switchContext, *newNbl, TRUE);
    *newNbl = NULL;
    return status;
}

/*
 * Strips the outer Ethernet/IP/UDP/Geneve headers (and Geneve options)
 * from 'curNbl' and fills 'tunKey' from them.
 *
 * Parameters:
 *   switchContext - switch context passed to the NBL helpers.
 *   curNbl        - received, encapsulated NBL.
 *   tunKey        - out: outer addresses, tos/ttl, flags, tunnel id and
 *                   Geneve options.
 *   newNbl        - out: partial copy of 'curNbl' whose data start has been
 *                   advanced past the tunnel headers and options.
 *
 * Returns NDIS_STATUS_SUCCESS on success. On failure a failure status is
 * returned; if the copy had already been created it is completed and
 * *newNbl is reset to NULL.
 */
NDIS_STATUS
OvsDecapGeneve(POVS_SWITCH_CONTEXT switchContext,
               PNET_BUFFER_LIST curNbl,
               OvsIPTunnelKey *tunKey,
               PNET_BUFFER_LIST *newNbl)
{
    PNET_BUFFER curNb;
    PMDL curMdl;
    EthHdr *ethHdr;
    IPHdr *ipHdr;
    IPv6Hdr *ipv6Hdr;
    UDPHdr *udpHdr;
    GeneveHdr *geneveHdr;
    UINT32 tunnelSize;
    UINT32 packetLength;
    PUINT8 bufferStart;
    PVOID optStart;
    NDIS_STATUS status;
    OVS_PACKET_HDR_INFO layers = { 0 };

    status = OvsExtractLayers(curNbl, &layers);
    if (status != NDIS_STATUS_SUCCESS) {
        return status;
    }

    /* Check the length of the UDP payload */
    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
    tunnelSize = OvsGetGeneveTunHdrSizeFromLayers(&layers);
    packetLength = NET_BUFFER_DATA_LENGTH(curNb);
    if (packetLength <= tunnelSize) {
        return NDIS_STATUS_INVALID_LENGTH;
    }

    /*
     * Create a copy of the NBL so that we have all the headers in one MDL.
     */
    *newNbl = OvsPartialCopyNBL(switchContext, curNbl,
                                tunnelSize, 0,
                                TRUE /*copy NBL info */);
    if (*newNbl == NULL) {
        return NDIS_STATUS_RESOURCES;
    }

    /* XXX: Handle VLAN header. */
    curNbl = *newNbl;
    curNb = NET_BUFFER_LIST_FIRST_NB(curNbl);
    curMdl = NET_BUFFER_CURRENT_MDL(curNb);

    /*
     * Check the mapping before adding the MDL offset: once a non-zero
     * offset has been added, a failed (NULL) mapping no longer compares
     * equal to NULL.
     */
    bufferStart = (PUINT8)OvsGetMdlWithLowPriority(curMdl);
    if (!bufferStart) {
        status = NDIS_STATUS_RESOURCES;
        goto dropNbl;
    }
    bufferStart += NET_BUFFER_CURRENT_MDL_OFFSET(curNb);

    ethHdr = (EthHdr *)bufferStart;
    /* XXX: Handle IP options. */
    if (layers.isIPv4) {
        ipHdr = (IPHdr *)(bufferStart + layers.l3Offset);
        tunKey->src.si_family = AF_INET;
        tunKey->src.Ipv4.sin_addr.s_addr = ipHdr->saddr;
        tunKey->dst.si_family = AF_INET;
        tunKey->dst.Ipv4.sin_addr.s_addr = ipHdr->daddr;
        tunKey->tos = ipHdr->tos;
        tunKey->ttl = ipHdr->ttl;
        tunKey->pad = 0;
    } else if (layers.isIPv6) {
        ipv6Hdr = (IPv6Hdr *)(bufferStart + layers.l3Offset);
        tunKey->src.si_family = AF_INET6;
        RtlCopyMemory(&(tunKey->src.Ipv6.sin6_addr), &ipv6Hdr->saddr,
                      sizeof(tunKey->src.Ipv6.sin6_addr));
        tunKey->dst.si_family = AF_INET6;
        RtlCopyMemory(&(tunKey->dst.Ipv6.sin6_addr), &ipv6Hdr->daddr,
                      sizeof(tunKey->dst.Ipv6.sin6_addr));
        /* XXX: nothing is extracted from the IPv6 header for tos. */
        tunKey->tos = 0;
        tunKey->ttl = ipv6Hdr->hop_limit;
        tunKey->pad = 0;
    }
    udpHdr = (UDPHdr *)(bufferStart + layers.l4Offset);

    /* Validate if NIC has indicated checksum failure. */
    status = OvsValidateUDPChecksum(curNbl, udpHdr->check == 0);
    if (status != NDIS_STATUS_SUCCESS) {
        goto dropNbl;
    }

    /* Calculate and verify UDP checksum if NIC didn't do it. */
    if (udpHdr->check != 0) {
        status = OvsCalculateUDPChecksum(curNbl, curNb, ethHdr, udpHdr,
                                         packetLength, &layers);
        tunKey->flags |= OVS_TNL_F_CSUM;
        if (status != NDIS_STATUS_SUCCESS) {
            goto dropNbl;
        }
    }

    geneveHdr = (GeneveHdr *)((PCHAR)udpHdr + sizeof *udpHdr);
    if (geneveHdr->protocol != ETH_P_TEB_NBO) {
        status = STATUS_NDIS_INVALID_PACKET;
        goto dropNbl;
    }

    /*
     * Update tunnelKey flags. The flags are rebuilt from scratch here, so
     * the checksum flag has to be included explicitly; otherwise the
     * OVS_TNL_F_CSUM set above would be overwritten.
     */
    tunKey->flags = OVS_TNL_F_KEY |
                    (udpHdr->check != 0 ? OVS_TNL_F_CSUM : 0) |
                    (geneveHdr->oam ? OVS_TNL_F_OAM : 0) |
                    (geneveHdr->critical ? OVS_TNL_F_CRT_OPT : 0);
    tunKey->tunnelId = GENEVE_VNI_TO_TUNNELID(geneveHdr->vni);
    /* optLen is expressed in 4-byte units. */
    tunKey->tunOptLen = (uint8)geneveHdr->optLen * 4;
    if (tunKey->tunOptLen > TUN_OPT_MAX_LEN ||
        packetLength < tunnelSize + tunKey->tunOptLen) {
        status = NDIS_STATUS_INVALID_LENGTH;
        goto dropNbl;
    }

    /* Clear out the receive flag for the inner packet. */
    NET_BUFFER_LIST_INFO(curNbl, TcpIpChecksumNetBufferListInfo) = 0;

    NdisAdvanceNetBufferDataStart(curNb, tunnelSize, FALSE, NULL);
    if (tunKey->tunOptLen > 0) {
        /*
         * NOTE(review): the result of NdisGetDataBuffer is not checked for
         * NULL before it is used as a memcpy source -- confirm it cannot
         * fail here.
         */
        optStart = NdisGetDataBuffer(curNb, tunKey->tunOptLen,
                                     IPTunnelKeyGetOptions(tunKey), 1, 0);

        /*
         * If data is contiguous in the buffer, NdisGetDataBuffer will not
         * copy data to the storage. Manual copy is needed.
         */
        if (optStart != IPTunnelKeyGetOptions(tunKey)) {
            memcpy(IPTunnelKeyGetOptions(tunKey), optStart,
                   tunKey->tunOptLen);
        }
        NdisAdvanceNetBufferDataStart(curNb, tunKey->tunOptLen, FALSE, NULL);
        tunKey->flags |= OVS_TNL_F_GENEVE_OPT;
    }

    return NDIS_STATUS_SUCCESS;

dropNbl:
    OvsCompleteNBL(switchContext, *newNbl, TRUE);
    *newNbl = NULL;
    return status;
}