diff options
author | David S. Miller <davem@davemloft.net> | 2010-08-02 15:07:58 -0700 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2010-08-02 15:07:58 -0700 |
commit | 83bf2e4089bebc2c7fd14a79de5954b26fe8d4af (patch) | |
tree | ab2cb1f229ba4c2d7236406c997e41a223daf74d /net | |
parent | de38483010bae523f533bb6bf9f7b7353772f6eb (diff) | |
parent | 6661481d5a8975657742c7ed40ae16bdaa7d0a6e (diff) | |
download | linux-rt-83bf2e4089bebc2c7fd14a79de5954b26fe8d4af.tar.gz |
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6
Diffstat (limited to 'net')
35 files changed, 717 insertions, 352 deletions
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 16c0ba0a2728..6bccba31d132 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -283,16 +283,13 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { const struct arpt_entry_target *t; - int hdr_len; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); continue; } - hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + - (2 * skb->dev->addr_len); - ADD_COUNTER(e->counters, hdr_len, 1); + ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1); t = arpt_get_target_c(e); @@ -713,7 +710,7 @@ static void get_counters(const struct xt_table_info *t, struct arpt_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = get_cpu(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -723,14 +720,16 @@ static void get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); - curcpu = smp_processor_id(); - i = 0; xt_entry_foreach(iter, t->entries[curcpu], t->size) { SET_COUNTER(counters[i], iter->counters.bcnt, iter->counters.pcnt); ++i; } + local_bh_enable(); + /* Processing counters from other cpus, we can let bottom half enabled, + * (preemption is disabled) + */ for_each_possible_cpu(cpu) { if (cpu == curcpu) @@ -744,7 +743,7 @@ static void get_counters(const struct xt_table_info *t, } xt_info_wrunlock(cpu); } - local_bh_enable(); + put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index b38c11810c65..c439721b165a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -364,7 +364,7 @@ ipt_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); + ADD_COUNTER(e->counters, skb->len, 1); t = ipt_get_target(e); IP_NF_ASSERT(t->u.kernel.target); @@ -884,7 +884,7 @@ get_counters(const struct xt_table_info *t, struct ipt_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = get_cpu(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -894,14 +894,16 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); - curcpu = smp_processor_id(); - i = 0; xt_entry_foreach(iter, t->entries[curcpu], t->size) { SET_COUNTER(counters[i], iter->counters.bcnt, iter->counters.pcnt); ++i; } + local_bh_enable(); + /* Processing counters from other cpus, we can let bottom half enabled, + * (preemption is disabled) + */ for_each_possible_cpu(cpu) { if (cpu == curcpu) @@ -915,7 +917,7 @@ get_counters(const struct xt_table_info *t, } xt_info_wrunlock(cpu); } - local_bh_enable(); + put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index bbbd2736c549..b254dafaf429 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -95,10 +95,11 @@ static void send_reset(struct sk_buff *oldskb, int hook) } tcph->rst = 1; - tcph->check = tcp_v4_check(sizeof(struct tcphdr), - niph->saddr, niph->daddr, - csum_partial(tcph, - sizeof(struct tcphdr), 0)); + tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), niph->saddr, + niph->daddr, 0); + nskb->ip_summed = CHECKSUM_PARTIAL; + nskb->csum_start = (unsigned char *)tcph - nskb->head; + nskb->csum_offset = offsetof(struct tcphdr, check); addr_type = RTN_UNSPEC; if (hook != NF_INET_FORWARD @@ -115,7 +116,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) goto free_nskb; niph->ttl = dst_metric(skb_dst(nskb), RTAX_HOPLIMIT); - nskb->ip_summed = CHECKSUM_NONE; /* "Never happens" */ if (nskb->len > dst_mtu(skb_dst(nskb))) diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index c7719b283ada..8c8632d9b93c 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -261,14 +261,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, rcu_read_lock(); proto = __nf_nat_proto_find(orig_tuple->dst.protonum); - /* Change protocol info to have some randomization */ - if (range->flags & IP_NAT_RANGE_PROTO_RANDOM) { - proto->unique_tuple(tuple, range, maniptype, ct); - goto out; - } - /* Only bother mapping if it's not already in range and unique */ - if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || + if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && + (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || proto->in_range(tuple, maniptype, &range->min, &range->max)) && !nf_nat_used_tuple(tuple, ct)) goto out; @@ -440,7 +435,7 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, if (!skb_make_writable(skb, hdrlen + sizeof(*inside))) return 0; - inside = (void *)skb->data + ip_hdrlen(skb); + inside = (void *)skb->data + hdrlen; /* We're actually going to mangle it beyond trivial checksum adjustment, so make sure the current checksum is correct. */ @@ -470,12 +465,10 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* rcu_read_lock()ed by nf_hook_slow */ l4proto = __nf_ct_l4proto_find(PF_INET, inside->ip.protocol); - if (!nf_ct_get_tuple(skb, - ip_hdrlen(skb) + sizeof(struct icmphdr), - (ip_hdrlen(skb) + + if (!nf_ct_get_tuple(skb, hdrlen + sizeof(struct icmphdr), + (hdrlen + sizeof(struct icmphdr) + inside->ip.ihl * 4), - (u_int16_t)AF_INET, - inside->ip.protocol, + (u_int16_t)AF_INET, inside->ip.protocol, &inner, l3proto, l4proto)) return 0; @@ -484,15 +477,13 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, pass all hooks (locally-generated ICMP). Consider incoming packet: PREROUTING (DST manip), routing produces ICMP, goes through POSTROUTING (which must correct the DST manip). */ - if (!manip_pkt(inside->ip.protocol, skb, - ip_hdrlen(skb) + sizeof(inside->icmp), - &ct->tuplehash[!dir].tuple, - !manip)) + if (!manip_pkt(inside->ip.protocol, skb, hdrlen + sizeof(inside->icmp), + &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt inner. */ - inside = (void *)skb->data + ip_hdrlen(skb); + inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, diff --git a/net/ipv4/netfilter/nf_nat_proto_common.c b/net/ipv4/netfilter/nf_nat_proto_common.c index 6c4f11f51446..3e61faf23a9a 100644 --- a/net/ipv4/netfilter/nf_nat_proto_common.c +++ b/net/ipv4/netfilter/nf_nat_proto_common.c @@ -34,7 +34,7 @@ bool nf_nat_proto_in_range(const struct nf_conntrack_tuple *tuple, } EXPORT_SYMBOL_GPL(nf_nat_proto_in_range); -bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, +void nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct, @@ -53,7 +53,7 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == IP_NAT_MANIP_DST) - return false; + return; if (ntohs(*portptr) < 1024) { /* Loose convention: >> 512 is credential passing */ @@ -81,15 +81,15 @@ bool nf_nat_proto_unique_tuple(struct nf_conntrack_tuple *tuple, else off = *rover; - for (i = 0; i < range_size; i++, off++) { + for (i = 0; ; ++off) { *portptr = htons(min + off % range_size); - if (nf_nat_used_tuple(tuple, ct)) + if (++i != range_size && nf_nat_used_tuple(tuple, ct)) continue; if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) *rover = off; - return true; + return; } - return false; + return; } EXPORT_SYMBOL_GPL(nf_nat_proto_unique_tuple); diff --git a/net/ipv4/netfilter/nf_nat_proto_dccp.c b/net/ipv4/netfilter/nf_nat_proto_dccp.c index 22485ce306d4..570faf2667b2 100644 --- a/net/ipv4/netfilter/nf_nat_proto_dccp.c +++ b/net/ipv4/netfilter/nf_nat_proto_dccp.c @@ -22,14 +22,14 @@ static u_int16_t dccp_port_rover; -static bool +static void dccp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &dccp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &dccp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c b/net/ipv4/netfilter/nf_nat_proto_gre.c index d7e89201351e..bc8d83a31c73 100644 --- a/net/ipv4/netfilter/nf_nat_proto_gre.c +++ b/net/ipv4/netfilter/nf_nat_proto_gre.c @@ -37,7 +37,7 @@ MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); MODULE_DESCRIPTION("Netfilter NAT protocol helper module for GRE"); /* generate unique tuple ... */ -static bool +static void gre_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, @@ -50,7 +50,7 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) - return false; + return; if (maniptype == IP_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; @@ -68,14 +68,14 @@ gre_unique_tuple(struct nf_conntrack_tuple *tuple, pr_debug("min = %u, range_size = %u\n", min, range_size); - for (i = 0; i < range_size; i++, key++) { + for (i = 0; ; ++key) { *keyptr = htons(min + key % range_size); - if (!nf_nat_used_tuple(tuple, ct)) - return true; + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; } pr_debug("%p: no NAT mapping\n", ct); - return false; + return; } /* manipulate a GRE packet according to maniptype */ diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 19a8b0b07d8e..5744c3ec847c 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -27,7 +27,7 @@ icmp_in_range(const struct nf_conntrack_tuple *tuple, ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); } -static bool +static void icmp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, @@ -42,13 +42,13 @@ icmp_unique_tuple(struct nf_conntrack_tuple *tuple, if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) range_size = 0xFFFF; - for (i = 0; i < range_size; i++, id++) { + for (i = 0; ; ++id) { tuple->src.u.icmp.id = htons(ntohs(range->min.icmp.id) + (id % range_size)); - if (!nf_nat_used_tuple(tuple, ct)) - return true; + if (++i == range_size || !nf_nat_used_tuple(tuple, ct)) + return; } - return false; + return; } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_sctp.c b/net/ipv4/netfilter/nf_nat_proto_sctp.c index 3fc598eeeb1a..756331d42661 100644 --- a/net/ipv4/netfilter/nf_nat_proto_sctp.c +++ b/net/ipv4/netfilter/nf_nat_proto_sctp.c @@ -16,14 +16,14 @@ static u_int16_t nf_sctp_port_rover; -static bool +static void sctp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &nf_sctp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &nf_sctp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_tcp.c b/net/ipv4/netfilter/nf_nat_proto_tcp.c index 399e2cfa263b..aa460a595d5d 100644 --- a/net/ipv4/netfilter/nf_nat_proto_tcp.c +++ b/net/ipv4/netfilter/nf_nat_proto_tcp.c @@ -20,14 +20,13 @@ static u_int16_t tcp_port_rover; -static bool +static void tcp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &tcp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &tcp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_udp.c b/net/ipv4/netfilter/nf_nat_proto_udp.c index 9e61c79492e4..dfe65c7e2925 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udp.c +++ b/net/ipv4/netfilter/nf_nat_proto_udp.c @@ -19,14 +19,13 @@ static u_int16_t udp_port_rover; -static bool +static void udp_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &udp_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, &udp_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_udplite.c b/net/ipv4/netfilter/nf_nat_proto_udplite.c index 440a229bbd87..3cc8c8af39ef 100644 --- a/net/ipv4/netfilter/nf_nat_proto_udplite.c +++ b/net/ipv4/netfilter/nf_nat_proto_udplite.c @@ -18,14 +18,14 @@ static u_int16_t udplite_port_rover; -static bool +static void udplite_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { - return nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, - &udplite_port_rover); + nf_nat_proto_unique_tuple(tuple, range, maniptype, ct, + &udplite_port_rover); } static bool diff --git a/net/ipv4/netfilter/nf_nat_proto_unknown.c b/net/ipv4/netfilter/nf_nat_proto_unknown.c index 14381c62acea..a50f2bc1c732 100644 --- a/net/ipv4/netfilter/nf_nat_proto_unknown.c +++ b/net/ipv4/netfilter/nf_nat_proto_unknown.c @@ -26,14 +26,14 @@ static bool unknown_in_range(const struct nf_conntrack_tuple *tuple, return true; } -static bool unknown_unique_tuple(struct nf_conntrack_tuple *tuple, +static void unknown_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { /* Sorry: we can't help you; if it's not unique, we can't frob anything. */ - return false; + return; } static bool diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dc41d6d3c6c6..5359ef4daac5 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -387,9 +387,7 @@ ip6t_do_table(struct sk_buff *skb, goto no_match; } - ADD_COUNTER(e->counters, - ntohs(ipv6_hdr(skb)->payload_len) + - sizeof(struct ipv6hdr), 1); + ADD_COUNTER(e->counters, skb->len, 1); t = ip6t_get_target_c(e); IP_NF_ASSERT(t->u.kernel.target); @@ -899,7 +897,7 @@ get_counters(const struct xt_table_info *t, struct ip6t_entry *iter; unsigned int cpu; unsigned int i; - unsigned int curcpu; + unsigned int curcpu = get_cpu(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters @@ -909,14 +907,16 @@ get_counters(const struct xt_table_info *t, * if new softirq were to run and call ipt_do_table */ local_bh_disable(); - curcpu = smp_processor_id(); - i = 0; xt_entry_foreach(iter, t->entries[curcpu], t->size) { SET_COUNTER(counters[i], iter->counters.bcnt, iter->counters.pcnt); ++i; } + local_bh_enable(); + /* Processing counters from other cpus, we can let bottom half enabled, + * (preemption is disabled) + */ for_each_possible_cpu(cpu) { if (cpu == curcpu) @@ -930,7 +930,7 @@ get_counters(const struct xt_table_info *t, } xt_info_wrunlock(cpu); } - local_bh_enable(); + put_cpu(); } static struct xt_counters *alloc_counters(const struct xt_table *table) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 9254008602d4..098a050a20b0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -269,6 +269,11 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, * in the chain of fragments so far. We must know where to put * this fragment, right? */ + prev = fq->q.fragments_tail; + if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { + next = NULL; + goto found; + } prev = NULL; for (next = fq->q.fragments; next != NULL; next = next->next) { if (NFCT_FRAG6_CB(next)->offset >= offset) @@ -276,6 +281,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, prev = next; } +found: /* We found where to put this one. Check for overlap with * preceding fragment, and, if needed, align things so that * any overlaps are eliminated. @@ -341,6 +347,8 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, /* Insert this fragment in the chain of fragments. */ skb->next = next; + if (!next) + fq->q.fragments_tail = skb; if (prev) prev->next = skb; else @@ -464,6 +472,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) head->csum); fq->q.fragments = NULL; + fq->q.fragments_tail = NULL; /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */ fp = skb_shinfo(head)->frag_list; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index aa2f106347e4..43288259f4a1 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -326,6 +326,22 @@ config NETFILTER_XT_CONNMARK comment "Xtables targets" +config NETFILTER_XT_TARGET_CHECKSUM + tristate "CHECKSUM target support" + depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on NETFILTER_ADVANCED + ---help--- + This option adds a `CHECKSUM' target, which can be used in the iptables mangle + table. + + You can use this target to compute and fill in the checksum in + a packet that lacks a checksum. This is particularly useful, + if you need to work around old applications such as dhcp clients, + that do not work well with checksum offloads, but don't want to disable + checksum offload in your device. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_CLASSIFY tristate '"CLASSIFY" target support' depends on NETFILTER_ADVANCED @@ -647,6 +663,15 @@ config NETFILTER_XT_MATCH_CONNTRACK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_CPU + tristate '"cpu" match support' + depends on NETFILTER_ADVANCED + help + CPU matching allows you to match packets based on the CPU + currently handling the packet. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_DCCP tristate '"dccp" protocol match support' depends on NETFILTER_ADVANCED @@ -726,6 +751,16 @@ config NETFILTER_XT_MATCH_IPRANGE If unsure, say M. +config NETFILTER_XT_MATCH_IPVS + tristate '"ipvs" match support' + depends on IP_VS + depends on NETFILTER_ADVANCED + depends on NF_CONNTRACK + help + This option allows you to match against IPVS properties of a packet. + + If unsure, say N. + config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' depends on NETFILTER_ADVANCED diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e28420aac5ef..441050f31111 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o # targets +obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_CT) += xt_CT.o @@ -69,6 +70,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNBYTES) += xt_connbytes.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNLIMIT) += xt_connlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_CONNTRACK) += xt_conntrack.o +obj-$(CONFIG_NETFILTER_XT_MATCH_CPU) += xt_cpu.o obj-$(CONFIG_NETFILTER_XT_MATCH_DCCP) += xt_dccp.o obj-$(CONFIG_NETFILTER_XT_MATCH_DSCP) += xt_dscp.o obj-$(CONFIG_NETFILTER_XT_MATCH_ESP) += xt_esp.o @@ -76,6 +78,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_HASHLIMIT) += xt_hashlimit.o obj-$(CONFIG_NETFILTER_XT_MATCH_HELPER) += xt_helper.o obj-$(CONFIG_NETFILTER_XT_MATCH_HL) += xt_hl.o obj-$(CONFIG_NETFILTER_XT_MATCH_IPRANGE) += xt_iprange.o +obj-$(CONFIG_NETFILTER_XT_MATCH_IPVS) += xt_ipvs.o obj-$(CONFIG_NETFILTER_XT_MATCH_LENGTH) += xt_length.o obj-$(CONFIG_NETFILTER_XT_MATCH_LIMIT) += xt_limit.o obj-$(CONFIG_NETFILTER_XT_MATCH_MAC) += xt_mac.o diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 712ccad13344..46a77d5c3887 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -3,7 +3,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER + depends on NET && INET && NETFILTER && NF_CONNTRACK ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This @@ -26,7 +26,7 @@ if IP_VS config IP_VS_IPV6 bool "IPv6 support for IPVS" - depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + depends on IPV6 = y || IP_VS = IPV6 ---help--- Add IPv6 support to IPVS. This is incomplete and might be dangerous. @@ -87,19 +87,16 @@ config IP_VS_PROTO_UDP protocol. Say Y if unsure. config IP_VS_PROTO_AH_ESP - bool - depends on UNDEFINED + def_bool IP_VS_PROTO_ESP || IP_VS_PROTO_AH config IP_VS_PROTO_ESP bool "ESP load balancing support" - select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing ESP (Encapsulation Security Payload) transport protocol. Say Y if unsure. config IP_VS_PROTO_AH bool "AH load balancing support" - select IP_VS_PROTO_AH_ESP ---help--- This option enables support for load balancing AH (Authentication Header) transport protocol. Say Y if unsure. @@ -238,7 +235,7 @@ comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP + depends on IP_VS_PROTO_TCP && NF_NAT ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 1cb0e834f8ff..e76f87f4aca8 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -569,49 +569,6 @@ static const struct file_operations ip_vs_app_fops = { }; #endif - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - int __init ip_vs_app_init(void) { /* we will replace it with proc_net_ipvs_create() soon */ diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 654544e72264..b71c69a2db13 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -271,6 +271,29 @@ struct ip_vs_conn *ip_vs_conn_in_get return cp; } +struct ip_vs_conn * +ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + else + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); + /* Get reference to connection template */ struct ip_vs_conn *ip_vs_ct_in_get (int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, @@ -356,6 +379,28 @@ struct ip_vs_conn *ip_vs_conn_out_get return ret; } +struct ip_vs_conn * +ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + else + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); +} +EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); /* * Put back the conn and restart its timer with its timeout diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 50907d8472a3..4f8ddba48011 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -54,7 +54,6 @@ EXPORT_SYMBOL(register_ip_vs_scheduler); EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); EXPORT_SYMBOL(ip_vs_proto_name); EXPORT_SYMBOL(ip_vs_conn_new); EXPORT_SYMBOL(ip_vs_conn_in_get); @@ -536,26 +535,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -1499,14 +1478,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1535,14 +1506,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 2ae747a376a5..f228a17ec649 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -20,6 +20,17 @@ * * Author: Wouter Gadeyne * + * + * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from + * http://www.ssi.bg/~ja/nfct/: + * + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2008 + * Julian Anastasov */ #define KMSG_COMPONENT "IPVS" @@ -32,6 +43,9 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_nat_helper.h> #include <linux/gfp.h> #include <net/protocol.h> #include <net/tcp.h> @@ -43,6 +57,16 @@ #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -123,6 +147,119 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, return 1; } +/* + * Called from init_conntrack() as expectfn handler. + */ +static void +ip_vs_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + */ +static void +ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 *port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + BUG_ON(!ct || ct == &nf_conntrack_untracked); + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + if (from_rs) + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), &cp->daddr, &cp->caddr, + proto, port, &cp->cport); + else + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, + nf_ct_l3num(ct), &cp->caddr, &cp->vaddr, + proto, port, &cp->vport); + + exp->expectfn = ip_vs_expect_callback; + + IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} /* * Look at outgoing ftp packets to catch the response to a PASV command @@ -149,7 +286,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, struct ip_vs_conn *n_cp; char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ unsigned buf_len; - int ret; + int ret = 0; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -219,19 +358,26 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, buf_len = strlen(buf); + ct = nf_ct_get(skb, &ctinfo); + if (ct && !nf_ct_is_untracked(ct)) { + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. + */ + ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + start-data, end-start, + buf, buf_len); + if (ret) + ip_vs_expect_related(skb, ct, n_cp, + IPPROTO_TCP, NULL, 0); + } + /* - * Calculate required delta-offset to keep TCP happy + * Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } cp->app_data = NULL; ip_vs_tcp_conn_listen(n_cp); @@ -263,6 +409,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, union nf_inet_addr to; __be16 port; struct ip_vs_conn *n_cp; + struct nf_conn *ct; #ifdef CONFIG_IP_VS_IPV6 /* This application helper doesn't work with IPv6 yet, @@ -349,6 +496,11 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_control_add(n_cp, cp); } + ct = (struct nf_conn *)skb->nfct; + if (ct && ct != &nf_conntrack_untracked) + ip_vs_expect_related(skb, ct, n_cp, + IPPROTO_TCP, &n_cp->dport, 1); + /* * Move tunnel to listen state */ diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 2d3d5e4b35f8..027f654799fe 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -98,6 +98,7 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) return NULL; } +EXPORT_SYMBOL(ip_vs_proto_get); /* diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index c9a3f7a21d53..4c0855cb006e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -8,55 +8,6 @@ #include <net/sctp/checksum.h> #include <net/ip_vs.h> - -static struct ip_vs_conn * -sctp_conn_in_get(int af, - const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); -} - -static struct ip_vs_conn * -sctp_conn_out_get(int af, - const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, - unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); -} - static int sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) @@ -173,7 +124,7 @@ sctp_dnat_handler(struct sk_buff *skb, return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!ip_vs_app_pkt_in(cp, skb)) return 0; } @@ -1169,8 +1120,8 @@ struct ip_vs_protocol ip_vs_protocol_sctp = { .register_app = sctp_register_app, .unregister_app = sctp_unregister_app, .conn_schedule = sctp_conn_schedule, - .conn_in_get = sctp_conn_in_get, - .conn_out_get = sctp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = sctp_snat_handler, .dnat_handler = sctp_dnat_handler, .csum_check = sctp_csum_check, diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 91d28e073742..282d24de8592 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -27,52 +27,6 @@ #include <net/ip_vs.h> - -static struct ip_vs_conn * -tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } -} - - static int tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) @@ -721,8 +675,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .csum_check = tcp_csum_check, diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index e7a6885e0167..8553231b5d41 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -27,58 +27,6 @@ #include <net/ip.h> #include <net/ip6_checksum.h> -static struct ip_vs_conn * -udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct ip_vs_iphdr *iph, unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); - } - - return cp; -} - - static int udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, int *verdict, struct ip_vs_conn **cpp) @@ -520,8 +468,8 @@ struct ip_vs_protocol ip_vs_protocol_udp = { .init = udp_init, .exit = udp_exit, .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, + .conn_in_get = ip_vs_conn_in_get_proto, + .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = udp_snat_handler, .dnat_handler = udp_dnat_handler, .csum_check = udp_csum_check, diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 02b078e11cf3..21e1a5e9b9d3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -28,6 +28,7 @@ #include <net/ip6_route.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> +#include <net/netfilter/nf_conntrack.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> @@ -348,6 +349,30 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif +static void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + struct nf_conn *ct = (struct nf_conn *)skb->nfct; + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct)) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + new_tuple.src.u3 = cp->daddr; + /* + * This will also take care of UDP and other protocols. + */ + new_tuple.src.u.tcp.port = cp->dport; + nf_conntrack_alter_reply(ct, &new_tuple); +} + /* * NAT transmitter (only for outside-to-inside nat forwarding) * Not used for related ICMP @@ -403,6 +428,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + ip_vs_update_conntrack(skb, cp); + /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -479,6 +506,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + ip_vs_update_conntrack(skb, cp); + /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 16b41b4e2a3c..df3eedb142ff 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -966,8 +966,7 @@ acct: if (acct) { spin_lock_bh(&ct->lock); acct[CTINFO2DIR(ctinfo)].packets++; - acct[CTINFO2DIR(ctinfo)].bytes += - skb->len - skb_network_offset(skb); + acct[CTINFO2DIR(ctinfo)].bytes += skb->len; spin_unlock_bh(&ct->lock); } } diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index fdc8fb4ae10f..7dcf7a404190 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -23,9 +23,10 @@ void __nf_ct_ext_destroy(struct nf_conn *ct) { unsigned int i; struct nf_ct_ext_type *t; + struct nf_ct_ext *ext = ct->ext; for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!nf_ct_ext_exist(ct, i)) + if (!__nf_ct_ext_exist(ext, i)) continue; rcu_read_lock(); @@ -73,44 +74,45 @@ static void __nf_ct_ext_free_rcu(struct rcu_head *head) void *__nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { - struct nf_ct_ext *new; + struct nf_ct_ext *old, *new; int i, newlen, newoff; struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. */ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); - if (!ct->ext) + old = ct->ext; + if (!old) return nf_ct_ext_create(&ct->ext, id, gfp); - if (nf_ct_ext_exist(ct, id)) + if (__nf_ct_ext_exist(old, id)) return NULL; rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[id]); BUG_ON(t == NULL); - newoff = ALIGN(ct->ext->len, t->align); + newoff = ALIGN(old->len, t->align); newlen = newoff + t->len; rcu_read_unlock(); - new = __krealloc(ct->ext, newlen, gfp); + new = __krealloc(old, newlen, gfp); if (!new) return NULL; - if (new != ct->ext) { + if (new != old) { for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!nf_ct_ext_exist(ct, i)) + if (!__nf_ct_ext_exist(old, i)) continue; rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[i]); if (t && t->move) t->move((void *)new + new->offset[i], - (void *)ct->ext + ct->ext->offset[i]); + (void *)old + old->offset[i]); rcu_read_unlock(); } - call_rcu(&ct->ext->rcu, __nf_ct_ext_free_rcu); + call_rcu(&old->rcu, __nf_ct_ext_free_rcu); ct->ext = new; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 802dbffae8b4..c4c885dca3bd 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -585,8 +585,16 @@ static bool tcp_in_window(const struct nf_conn *ct, * Let's try to use the data from the packet. */ sender->td_end = end; + win <<= sender->td_scale; sender->td_maxwin = (win == 0 ? 1 : win); sender->td_maxend = end + sender->td_maxwin; + /* + * We haven't seen traffic in the other direction yet + * but we have to tweak window tracking to pass III + * and IV until that happens. + */ + if (receiver->td_maxwin == 0) + receiver->td_end = receiver->td_maxend = sack; } } else if (((state->state == TCP_CONNTRACK_SYN_SENT && dir == IP_CT_DIR_ORIGINAL) @@ -680,7 +688,7 @@ static bool tcp_in_window(const struct nf_conn *ct, /* * Update receiver data. */ - if (after(end, sender->td_maxend)) + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) receiver->td_maxwin += end - sender->td_maxend; if (after(sack + win, receiver->td_maxend - 1)) { receiver->td_maxend = sack + win; diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c new file mode 100644 index 000000000000..0f642ef8cd26 --- /dev/null +++ b/net/netfilter/xt_CHECKSUM.c @@ -0,0 +1,70 @@ +/* iptables module for the packet checksum mangling + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * (C) 2010 Red Hat, Inc. + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. +*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_CHECKSUM.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michael S. Tsirkin <mst@redhat.com>"); +MODULE_DESCRIPTION("Xtables: checksum modification"); +MODULE_ALIAS("ipt_CHECKSUM"); +MODULE_ALIAS("ip6t_CHECKSUM"); + +static unsigned int +checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) +{ + if (skb->ip_summed == CHECKSUM_PARTIAL) + skb_checksum_help(skb); + + return XT_CONTINUE; +} + +static int checksum_tg_check(const struct xt_tgchk_param *par) +{ + const struct xt_CHECKSUM_info *einfo = par->targinfo; + + if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { + pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); + return -EINVAL; + } + if (!einfo->operation) { + pr_info("no CHECKSUM operation enabled\n"); + return -EINVAL; + } + return 0; +} + +static struct xt_target checksum_tg_reg __read_mostly = { + .name = "CHECKSUM", + .family = NFPROTO_UNSPEC, + .target = checksum_tg, + .targetsize = sizeof(struct xt_CHECKSUM_info), + .table = "mangle", + .checkentry = checksum_tg_check, + .me = THIS_MODULE, +}; + +static int __init checksum_tg_init(void) +{ + return xt_register_target(&checksum_tg_reg); +} + +static void __exit checksum_tg_exit(void) +{ + xt_unregister_target(&checksum_tg_reg); +} + +module_init(checksum_tg_init); +module_exit(checksum_tg_exit); diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index e1a0dedac258..c61294d85fda 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -37,8 +37,10 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, - hp->source, tgi->lport ? tgi->lport : hp->dest, + iph->saddr, + tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, + tgi->lport ? tgi->lport : hp->dest, par->in, true); /* NOTE: assign_sock consumes our sk reference */ diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c new file mode 100644 index 000000000000..b39db8a5cbae --- /dev/null +++ b/net/netfilter/xt_cpu.c @@ -0,0 +1,63 @@ +/* Kernel module to match running CPU */ + +/* + * Might be used to distribute connections on several daemons, if + * RPS (Remote Packet Steering) is enabled or NIC is multiqueue capable, + * each RX queue IRQ affined to one CPU (1:1 mapping) + * + */ + +/* (C) 2010 Eric Dumazet + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter/xt_cpu.h> +#include <linux/netfilter/x_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); +MODULE_DESCRIPTION("Xtables: CPU match"); + +static int cpu_mt_check(const struct xt_mtchk_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + if (info->invert & ~1) + return -EINVAL; + return 0; +} + +static bool cpu_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cpu_info *info = par->matchinfo; + + return (info->cpu == smp_processor_id()) ^ info->invert; +} + +static struct xt_match cpu_mt_reg __read_mostly = { + .name = "cpu", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = cpu_mt_check, + .match = cpu_mt, + .matchsize = sizeof(struct xt_cpu_info), + .me = THIS_MODULE, +}; + +static int __init cpu_mt_init(void) +{ + return xt_register_match(&cpu_mt_reg); +} + +static void __exit cpu_mt_exit(void) +{ + xt_unregister_match(&cpu_mt_reg); +} + +module_init(cpu_mt_init); +module_exit(cpu_mt_exit); diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c new file mode 100644 index 000000000000..7a4d66db95ae --- /dev/null +++ b/net/netfilter/xt_ipvs.c @@ -0,0 +1,189 @@ +/* + * xt_ipvs - kernel module to match IPVS connection properties + * + * Author: Hannes Eder <heder@google.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#ifdef CONFIG_IP_VS_IPV6 +#include <net/ipv6.h> +#endif +#include <linux/ip_vs.h> +#include <linux/types.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/x_tables.h> +#include <linux/netfilter/xt_ipvs.h> +#include <net/netfilter/nf_conntrack.h> + +#include <net/ip_vs.h> + +MODULE_AUTHOR("Hannes Eder <heder@google.com>"); +MODULE_DESCRIPTION("Xtables: match IPVS connection properties"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_ipvs"); +MODULE_ALIAS("ip6t_ipvs"); + +/* borrowed from xt_conntrack */ +static bool ipvs_mt_addrcmp(const union nf_inet_addr *kaddr, + const union nf_inet_addr *uaddr, + const union nf_inet_addr *umask, + unsigned int l3proto) +{ + if (l3proto == NFPROTO_IPV4) + return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; +#ifdef CONFIG_IP_VS_IPV6 + else if (l3proto == NFPROTO_IPV6) + return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, + &uaddr->in6) == 0; +#endif + else + return false; +} + +static bool +ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_ipvs_mtinfo *data = par->matchinfo; + /* ipvs_mt_check ensures that family is only NFPROTO_IPV[46]. */ + const u_int8_t family = par->family; + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + bool match = true; + + if (data->bitmask == XT_IPVS_IPVS_PROPERTY) { + match = skb->ipvs_property ^ + !!(data->invert & XT_IPVS_IPVS_PROPERTY); + goto out; + } + + /* other flags than XT_IPVS_IPVS_PROPERTY are set */ + if (!skb->ipvs_property) { + match = false; + goto out; + } + + ip_vs_fill_iphdr(family, skb_network_header(skb), &iph); + + if (data->bitmask & XT_IPVS_PROTO) + if ((iph.protocol == data->l4proto) ^ + !(data->invert & XT_IPVS_PROTO)) { + match = false; + goto out; + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) { + match = false; + goto out; + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */); + if (unlikely(cp == NULL)) { + match = false; + goto out; + } + + /* + * We found a connection, i.e. ct != 0, make sure to call + * __ip_vs_conn_put before returning. In our case jump to out_put_con. + */ + + if (data->bitmask & XT_IPVS_VPORT) + if ((cp->vport == data->vport) ^ + !(data->invert & XT_IPVS_VPORT)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VPORTCTL) + if ((cp->control != NULL && + cp->control->vport == data->vportctl) ^ + !(data->invert & XT_IPVS_VPORTCTL)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_DIR) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + if (ct == NULL || nf_ct_is_untracked(ct)) { + match = false; + goto out_put_cp; + } + + if ((ctinfo >= IP_CT_IS_REPLY) ^ + !!(data->invert & XT_IPVS_DIR)) { + match = false; + goto out_put_cp; + } + } + + if (data->bitmask & XT_IPVS_METHOD) + if (((cp->flags & IP_VS_CONN_F_FWD_MASK) == data->fwd_method) ^ + !(data->invert & XT_IPVS_METHOD)) { + match = false; + goto out_put_cp; + } + + if (data->bitmask & XT_IPVS_VADDR) { + if (ipvs_mt_addrcmp(&cp->vaddr, &data->vaddr, + &data->vmask, family) ^ + !(data->invert & XT_IPVS_VADDR)) { + match = false; + goto out_put_cp; + } + } + +out_put_cp: + __ip_vs_conn_put(cp); +out: + pr_debug("match=%d\n", match); + return match; +} + +static int ipvs_mt_check(const struct xt_mtchk_param *par) +{ + if (par->family != NFPROTO_IPV4 +#ifdef CONFIG_IP_VS_IPV6 + && par->family != NFPROTO_IPV6 +#endif + ) { + pr_info("protocol family %u not supported\n", par->family); + return -EINVAL; + } + + return 0; +} + +static struct xt_match xt_ipvs_mt_reg __read_mostly = { + .name = "ipvs", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = ipvs_mt, + .checkentry = ipvs_mt_check, + .matchsize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)), + .me = THIS_MODULE, +}; + +static int __init ipvs_mt_init(void) +{ + return xt_register_match(&xt_ipvs_mt_reg); +} + +static void __exit ipvs_mt_exit(void) +{ + xt_unregister_match(&xt_ipvs_mt_reg); +} + +module_init(ipvs_mt_init); +module_exit(ipvs_mt_exit); diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index b4f7dfea5980..70eb2b4984dd 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -11,7 +11,8 @@ #include <linux/netfilter/xt_quota.h> struct xt_quota_priv { - uint64_t quota; + spinlock_t lock; + uint64_t quota; }; MODULE_LICENSE("GPL"); @@ -20,8 +21,6 @@ MODULE_DESCRIPTION("Xtables: countdown quota match"); MODULE_ALIAS("ipt_quota"); MODULE_ALIAS("ip6t_quota"); -static DEFINE_SPINLOCK(quota_lock); - static bool quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { @@ -29,7 +28,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par) struct xt_quota_priv *priv = q->master; bool ret = q->flags & XT_QUOTA_INVERT; - spin_lock_bh("a_lock); + spin_lock_bh(&priv->lock); if (priv->quota >= skb->len) { priv->quota -= skb->len; ret = !ret; @@ -37,9 +36,7 @@ quota_mt(const struct sk_buff *skb, struct xt_action_param *par) /* we do not allow even small packets from now on */ priv->quota = 0; } - /* Copy quota back to matchinfo so that iptables can display it */ - q->quota = priv->quota; - spin_unlock_bh("a_lock); + spin_unlock_bh(&priv->lock); return ret; } @@ -55,6 +52,7 @@ static int quota_mt_check(const struct xt_mtchk_param *par) if (q->master == NULL) return -ENOMEM; + spin_lock_init(&q->master->lock); q->master->quota = q->quota; return 0; } |