From 95dd8653de658143770cb0e55a58d2aab97c79d2 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 9 Jul 2015 22:56:00 +0200 Subject: netfilter: ctnetlink: put back references to master ct and expect objects We have to put back the references to the master conntrack and the expectation that we just created, otherwise we'll leak them. Fixes: 0ef71ee1a5b9 ("netfilter: ctnetlink: refactor ctnetlink_create_expect") Reported-by: Tim Wiess Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_netlink.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index d1c23940a86a..6b8b0abbfab4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2995,11 +2995,6 @@ ctnetlink_create_expect(struct net *net, u16 zone, } err = nf_ct_expect_related_report(exp, portid, report); - if (err < 0) - goto err_exp; - - return 0; -err_exp: nf_ct_expect_put(exp); err_ct: nf_ct_put(ct); -- cgit v1.2.3 From 484836ec2de24d9a7c6471f022b746d947698725 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Thu, 9 Jul 2015 17:15:01 -0700 Subject: netfilter: IDLETIMER: fix lockdep warning Dynamically allocated sysfs attributes should be initialized with sysfs_attr_init() otherwise lockdep will be angry with us: [ 45.468653] BUG: key ffffffc030fad4e0 not in .data! [ 45.468655] ------------[ cut here ]------------ [ 45.468666] WARNING: CPU: 0 PID: 1176 at /mnt/host/source/src/third_party/kernel/v3.18/kernel/locking/lockdep.c:2991 lockdep_init_map+0x12c/0x490() [ 45.468672] DEBUG_LOCKS_WARN_ON(1) [ 45.468672] CPU: 0 PID: 1176 Comm: iptables Tainted: G U W 3.18.0 #43 [ 45.468674] Hardware name: XXX [ 45.468675] Call trace: [ 45.468680] [] dump_backtrace+0x0/0x10c [ 45.468683] [] show_stack+0x10/0x1c [ 45.468688] [] dump_stack+0x74/0x94 [ 45.468692] [] warn_slowpath_common+0x84/0xb0 [ 45.468694] [] warn_slowpath_fmt+0x4c/0x58 [ 45.468697] [] lockdep_init_map+0x128/0x490 [ 45.468701] [] __kernfs_create_file+0x80/0xe4 [ 45.468704] [] sysfs_add_file_mode_ns+0x104/0x170 [ 45.468706] [] sysfs_create_file_ns+0x58/0x64 [ 45.468711] [] idletimer_tg_checkentry+0x14c/0x324 [ 45.468714] [] xt_check_target+0x170/0x198 [ 45.468717] [] check_target+0x58/0x6c [ 45.468720] [] translate_table+0x30c/0x424 [ 45.468723] [] do_ipt_set_ctl+0x144/0x1d0 [ 45.468728] [] nf_setsockopt+0x50/0x60 [ 45.468732] [] ip_setsockopt+0x8c/0xb4 [ 45.468735] [] raw_setsockopt+0x10/0x50 [ 45.468739] [] sock_common_setsockopt+0x14/0x20 [ 45.468742] [] SyS_setsockopt+0x88/0xb8 [ 45.468744] ---[ end trace 41d156354d18c039 ]--- Signed-off-by: Dmitry Torokhov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_IDLETIMER.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c index f407ebc13481..29d2c31f406c 100644 --- a/net/netfilter/xt_IDLETIMER.c +++ b/net/netfilter/xt_IDLETIMER.c @@ -126,6 +126,7 @@ static int idletimer_tg_create(struct idletimer_tg_info *info) goto out; } + sysfs_attr_init(&info->timer->attr.attr); info->timer->attr.attr.name = kstrdup(info->label, GFP_KERNEL); if (!info->timer->attr.attr.name) { ret = -ENOMEM; -- cgit v1.2.3 From 326bf17ea5d4f8f17b54cbf167b8cb504c606ee9 Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Fri, 26 Jun 2015 03:18:45 -0700 Subject: ipvs: fix ipv6 route unreach panic Previously there was a trivial panic unshare -n /bin/bash <dev and use that for the purpose of the invocation. Signed-off-by: Alex Gartrell Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index bf66a8657a5f..b99d80695b1f 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -505,6 +505,13 @@ err_put: return -1; err_unreach: + /* The ip6_link_failure function requires the dev field to be set + * in order to get the net (further for the sake of fwmark + * reflection). + */ + if (!skb->dev) + skb->dev = skb_dst(skb)->dev; + dst_link_failure(skb); return -1; } -- cgit v1.2.3 From 4754957f04f5f368792a0eb7dab0ae89fb93dcfd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 27 Jun 2015 14:39:30 +0300 Subject: ipvs: do not use random local source address for tunnels Michael Vallaly reports about wrong source address used in rare cases for tunneled traffic. Looks like __ip_vs_get_out_rt in 3.10+ is providing uninitialized dest_dst->dst_saddr.ip because ip_vs_dest_dst_alloc uses kmalloc. While we retry after seeing EINVAL from routing for data that does not look like valid local address, it still succeeded when this memory was previously used from other dests and with different local addresses. As result, we can use valid local address that is not suitable for our real server. Fix it by providing 0.0.0.0 every time our cache is refreshed. By this way we will get preferred source address from routing. Reported-by: Michael Vallaly Fixes: 026ace060dfe ("ipvs: optimize dst usage for real server") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b99d80695b1f..ec30d68ccc0b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -130,7 +130,6 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; - fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? FLOWI_FLAG_KNOWN_NH : 0; -- cgit v1.2.3 From 05f00505a89acd21f5d0d20f5797dfbc4cf85243 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 29 Jun 2015 21:51:40 +0300 Subject: ipvs: fix crash if scheduler is changed I overlooked the svc->sched_data usage from schedulers when the services were converted to RCU in 3.10. Now the rare ipvsadm -E command can change the scheduler but due to the reverse order of ip_vs_bind_scheduler and ip_vs_unbind_scheduler we provide new sched_data to the old scheduler resulting in a crash. To fix it without changing the scheduler methods we have to use synchronize_rcu() only for the editing case. It means all svc->scheduler readers should expect a NULL value. To avoid breakage for the service listing and ipvsadm -R we can use the "none" name to indicate that scheduler is not assigned, a state when we drop new connections. Reported-by: Alexander Vasiliev Fixes: ceec4c381681 ("ipvs: convert services to rcu") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 16 +++++++-- net/netfilter/ipvs/ip_vs_ctl.c | 78 +++++++++++++++++++++++++--------------- net/netfilter/ipvs/ip_vs_sched.c | 12 +++---- 3 files changed, 69 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5d2b806a862e..38fbc194b9cb 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -319,7 +319,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * return *ignored=0 i.e. ICMP and NF_DROP */ sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); kfree(param.pe_data); @@ -467,7 +473,13 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, } sched = rcu_dereference(svc->scheduler); - dest = sched->schedule(svc, skb, iph); + if (sched) { + /* read svc->sched_data after svc->scheduler */ + smp_rmb(); + dest = sched->schedule(svc, skb, iph); + } else { + dest = NULL; + } if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); return NULL; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 285eae3a1454..24c554201a76 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -842,15 +842,16 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, __ip_vs_dst_cache_reset(dest); spin_unlock_bh(&dest->dst_lock); - sched = rcu_dereference_protected(svc->scheduler, 1); if (add) { ip_vs_start_estimator(svc->net, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; - if (sched->add_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->add_dest) sched->add_dest(svc, dest); } else { - if (sched->upd_dest) + sched = rcu_dereference_protected(svc->scheduler, 1); + if (sched && sched->upd_dest) sched->upd_dest(svc, dest); } } @@ -1084,7 +1085,7 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc, struct ip_vs_scheduler *sched; sched = rcu_dereference_protected(svc->scheduler, 1); - if (sched->del_dest) + if (sched && sched->del_dest) sched->del_dest(svc, dest); } } @@ -1175,11 +1176,14 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, ip_vs_use_count_inc(); /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - ret = -ENOENT; - goto out_err; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_err; + } } if (u->pe_name && *u->pe_name) { @@ -1240,10 +1244,12 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + } /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); @@ -1291,17 +1297,20 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { - struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* * Lookup the scheduler, by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); - return -ENOENT; + if (strcmp(u->sched_name, "none")) { + sched = ip_vs_scheduler_get(u->sched_name); + if (!sched) { + pr_info("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } } old_sched = sched; @@ -1329,14 +1338,20 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) old_sched = rcu_dereference_protected(svc->scheduler, 1); if (sched != old_sched) { + if (old_sched) { + ip_vs_unbind_scheduler(svc, old_sched); + RCU_INIT_POINTER(svc->scheduler, NULL); + /* Wait all svc->sched_data users */ + synchronize_rcu(); + } /* Bind the new scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) { - old_sched = sched; - goto out; + if (sched) { + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) { + ip_vs_scheduler_put(sched); + goto out; + } } - /* Unbind the old scheduler on success */ - ip_vs_unbind_scheduler(svc, old_sched); } /* @@ -1982,6 +1997,7 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) const struct ip_vs_iter *iter = seq->private; const struct ip_vs_dest *dest; struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; if (iter->table == ip_vs_svc_table) { #ifdef CONFIG_IP_VS_IPV6 @@ -1990,18 +2006,18 @@ static int ip_vs_info_seq_show(struct seq_file *seq, void *v) ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), - sched->name); + sched_name); else #endif seq_printf(seq, "%s %08X:%04X %s %s ", ip_vs_proto_name(svc->protocol), ntohl(svc->addr.ip), ntohs(svc->port), - sched->name, + sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } else { seq_printf(seq, "FWM %08X %s %s", - svc->fwmark, sched->name, + svc->fwmark, sched_name, (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":""); } @@ -2427,13 +2443,15 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) { struct ip_vs_scheduler *sched; struct ip_vs_kstats kstats; + char *sched_name; sched = rcu_dereference_protected(src->scheduler, 1); + sched_name = sched ? sched->name : "none"; dst->protocol = src->protocol; dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched->name, sizeof(dst->sched_name)); + strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2892,6 +2910,7 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, struct ip_vs_flags flags = { .flags = svc->flags, .mask = ~0 }; struct ip_vs_kstats kstats; + char *sched_name; nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); if (!nl_service) @@ -2910,8 +2929,9 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } sched = rcu_dereference_protected(svc->scheduler, 1); + sched_name = sched ? sched->name : "none"; pe = rcu_dereference_protected(svc->pe, 1); - if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched->name) || + if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) || (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) || nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) || nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) || diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 199760c71f39..7e8141647943 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -74,7 +74,7 @@ void ip_vs_unbind_scheduler(struct ip_vs_service *svc, if (sched->done_service) sched->done_service(svc); - /* svc->scheduler can not be set to NULL */ + /* svc->scheduler can be set to NULL only by caller */ } @@ -147,21 +147,21 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { - struct ip_vs_scheduler *sched; + struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); + char *sched_name = sched ? sched->name : "none"; - sched = rcu_dereference(svc->scheduler); if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", - sched->name, svc->fwmark, svc->fwmark, msg); + sched_name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", - sched->name, ip_vs_proto_name(svc->protocol), + sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } -- cgit v1.2.3 From 71563f3414e917c62acd8e0fb0edf8ed6af63e4b Mon Sep 17 00:00:00 2001 From: Alex Gartrell Date: Sun, 5 Jul 2015 14:28:26 -0700 Subject: ipvs: skb_orphan in case of forwarding It is possible that we bind against a local socket in early_demux when we are actually going to want to forward it. In this case, the socket serves no purpose and only serves to confuse things (particularly functions which implicitly expect sk_fullsock to be true, like ip_local_out). Additionally, skb_set_owner_w is totally broken for non full-socks. Signed-off-by: Alex Gartrell Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.") Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index ec30d68ccc0b..34dc1429ebdb 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -533,6 +533,21 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, return ret; } +/* In the event of a remote destination, it's possible that we would have + * matches against an old socket (particularly a TIME-WAIT socket). This + * causes havoc down the line (ip_local_out et. al. expect regular sockets + * and invalid memory accesses will happen) so simply drop the association + * in this case. +*/ +static inline void ip_vs_drop_early_demux_sk(struct sk_buff *skb) +{ + /* If dev is set, the packet came from the LOCAL_IN callback and + * not from a local TCP socket. + */ + if (skb->dev) + skb_orphan(skb); +} + /* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, struct ip_vs_conn *cp, int local) @@ -544,12 +559,21 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 1); + + /* Remove the early_demux association unless it's bound for the + * exact same port and address on this host after translation. + */ + if (!local || cp->vport != cp->dport || + !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr)) + ip_vs_drop_early_demux_sk(skb); + if (!local) { skb_forward_csum(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, NULL, skb_dst(skb)->dev, dst_output_sk); } else ret = NF_ACCEPT; + return ret; } @@ -563,6 +587,7 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (likely(!(cp->flags & IP_VS_CONN_F_NFCT))) ip_vs_notrack(skb); if (!local) { + ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, NULL, skb_dst(skb)->dev, dst_output_sk); @@ -851,6 +876,8 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, struct ipv6hdr *old_ipv6h = NULL; #endif + ip_vs_drop_early_demux_sk(skb); + if (skb_headroom(skb) < max_headroom || skb_cloned(skb)) { new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) -- cgit v1.2.3 From 56184858d1fc95c46723436b455cb7261cd8be6f Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 8 Jul 2015 08:31:33 +0300 Subject: ipvs: fix crash with sync protocol v0 and FTP Fix crash in 3.5+ if FTP is used after switching sync_version to 0. Fixes: 749c42b620a9 ("ipvs: reduce sync rate with time thresholds") Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b08ba9538d12..d99ad93eb855 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -612,7 +612,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, pkts = atomic_add_return(1, &cp->in_pkts); else pkts = sysctl_sync_threshold(ipvs); - ip_vs_sync_conn(net, cp->control, pkts); + ip_vs_sync_conn(net, cp, pkts); } } -- cgit v1.2.3 From e3895c0334d0ef46e80f22eaf2a52401ff6d5a67 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 9 Jul 2015 11:15:27 +0300 Subject: ipvs: call skb_sender_cpu_clear Reset XPS's sender_cpu on forwarding. Signed-off-by: Julian Anastasov Fixes: 2bd82484bb4c ("xps: fix xps for stacked devices") Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 34dc1429ebdb..258a0b0e82a2 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -529,6 +529,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); } return ret; } @@ -569,6 +571,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, NULL, skb_dst(skb)->dev, dst_output_sk); } else @@ -589,6 +593,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (!skb->sk) + skb_sender_cpu_clear(skb); NF_HOOK(pf, NF_INET_LOCAL_OUT, NULL, skb, NULL, skb_dst(skb)->dev, dst_output_sk); } else -- cgit v1.2.3 From 0838aa7fcfcd875caa7bcc5dab0c3fd40444553d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 13 Jul 2015 15:11:48 +0200 Subject: netfilter: fix netns dependencies with conntrack templates Quoting Daniel Borkmann: "When adding connection tracking template rules to a netns, f.e. to configure netfilter zones, the kernel will endlessly busy-loop as soon as we try to delete the given netns in case there's at least one template present, which is problematic i.e. if there is such bravery that the priviledged user inside the netns is assumed untrusted. Minimal example: ip netns add foo ip netns exec foo iptables -t raw -A PREROUTING -d 1.2.3.4 -j CT --zone 1 ip netns del foo What happens is that when nf_ct_iterate_cleanup() is being called from nf_conntrack_cleanup_net_list() for a provided netns, we always end up with a net->ct.count > 0 and thus jump back to i_see_dead_people. We don't get a soft-lockup as we still have a schedule() point, but the serving CPU spins on 100% from that point onwards. Since templates are normally allocated with nf_conntrack_alloc(), we also bump net->ct.count. The issue why they are not yet nf_ct_put() is because the per netns .exit() handler from x_tables (which would eventually invoke xt_CT's xt_ct_tg_destroy() that drops reference on info->ct) is called in the dependency chain at a *later* point in time than the per netns .exit() handler for the connection tracker. This is clearly a chicken'n'egg problem: after the connection tracker .exit() handler, we've teared down all the connection tracking infrastructure already, so rightfully, xt_ct_tg_destroy() cannot be invoked at a later point in time during the netns cleanup, as that would lead to a use-after-free. At the same time, we cannot make x_tables depend on the connection tracker module, so that the xt_ct_tg_destroy() would be invoked earlier in the cleanup chain." Daniel confirms this has to do with the order in which modules are loaded or having compiled nf_conntrack as modules while x_tables built-in. So we have no guarantees regarding the order in which netns callbacks are executed. Fix this by allocating the templates through kmalloc() from the respective SYNPROXY and CT targets, so they don't depend on the conntrack kmem cache. Then, release then via nf_ct_tmpl_free() from destroy_conntrack(). This branch is marked as unlikely since conntrack templates are rarely allocated and only from the configuration plane path. Note that templates are not kept in any list to avoid further dependencies with nf_conntrack anymore, thus, the tmpl larval list is removed. Reported-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso Tested-by: Daniel Borkmann --- net/netfilter/nf_conntrack_core.c | 67 +++++++++++++++++++++++++-------------- net/netfilter/nf_synproxy_core.c | 7 ++-- net/netfilter/xt_CT.c | 8 ++--- 3 files changed, 50 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 13fad8668f83..651039ad1681 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -287,6 +287,46 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) spin_unlock(&pcpu->lock); } +/* Released via destroy_conntrack() */ +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags) +{ + struct nf_conn *tmpl; + + tmpl = kzalloc(sizeof(struct nf_conn), GFP_KERNEL); + if (tmpl == NULL) + return NULL; + + tmpl->status = IPS_TEMPLATE; + write_pnet(&tmpl->ct_net, net); + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (zone) { + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC); + if (!nf_ct_zone) + goto out_free; + nf_ct_zone->id = zone; + } +#endif + atomic_set(&tmpl->ct_general.use, 0); + + return tmpl; +#ifdef CONFIG_NF_CONNTRACK_ZONES +out_free: + kfree(tmpl); + return NULL; +#endif +} +EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); + +static void nf_ct_tmpl_free(struct nf_conn *tmpl) +{ + nf_ct_ext_destroy(tmpl); + nf_ct_ext_free(tmpl); + kfree(tmpl); +} + static void destroy_conntrack(struct nf_conntrack *nfct) { @@ -298,6 +338,10 @@ destroy_conntrack(struct nf_conntrack *nfct) NF_CT_ASSERT(atomic_read(&nfct->use) == 0); NF_CT_ASSERT(!timer_pending(&ct->timeout)); + if (unlikely(nf_ct_is_template(ct))) { + nf_ct_tmpl_free(ct); + return; + } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto && l4proto->destroy) @@ -540,28 +584,6 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); -/* deletion from this larval template list happens via nf_ct_put() */ -void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl) -{ - struct ct_pcpu *pcpu; - - __set_bit(IPS_TEMPLATE_BIT, &tmpl->status); - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); - nf_conntrack_get(&tmpl->ct_general); - - /* add this conntrack to the (per cpu) tmpl list */ - local_bh_disable(); - tmpl->cpu = smp_processor_id(); - pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu); - - spin_lock(&pcpu->lock); - /* Overload tuple linked list to put us in template list. */ - hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &pcpu->tmpl); - spin_unlock_bh(&pcpu->lock); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert); - /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -1751,7 +1773,6 @@ int nf_conntrack_init_net(struct net *net) spin_lock_init(&pcpu->lock); INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL); INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL); - INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL); } net->ct.stat = alloc_percpu(struct ip_conntrack_stat); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 789feeae6c44..71f1e9fdfa18 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -349,12 +349,10 @@ static void __net_exit synproxy_proc_exit(struct net *net) static int __net_init synproxy_net_init(struct net *net) { struct synproxy_net *snet = synproxy_pernet(net); - struct nf_conntrack_tuple t; struct nf_conn *ct; int err = -ENOMEM; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(net, 0, &t, &t, GFP_KERNEL); + ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL); if (IS_ERR(ct)) { err = PTR_ERR(ct); goto err1; @@ -365,7 +363,8 @@ static int __net_init synproxy_net_init(struct net *net) if (!nfct_synproxy_ext_add(ct)) goto err2; - nf_conntrack_tmpl_insert(net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); snet->tmpl = ct; snet->stats = alloc_percpu(struct synproxy_stats); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 75747aecdebe..c6630030c912 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -184,7 +184,6 @@ out: static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { - struct nf_conntrack_tuple t; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -202,8 +201,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err1; - memset(&t, 0, sizeof(t)); - ct = nf_conntrack_alloc(par->net, info->zone, &t, &t, GFP_KERNEL); + ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL); ret = PTR_ERR(ct); if (IS_ERR(ct)) goto err2; @@ -227,8 +225,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err3; } - - nf_conntrack_tmpl_insert(par->net, ct); + __set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_get(&ct->ct_general); out: info->ct = ct; return 0; -- cgit v1.2.3 From 4b31814d20cbe5cd4ccf18089751e77a04afe4f2 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 21 Jul 2015 21:37:31 -0700 Subject: netfilter: nf_conntrack: Support expectations in different zones When zones were originally introduced, the expectation functions were all extended to perform lookup using the zone. However, insertion was not modified to check the zone. This means that two expectations which are intended to apply for different connections that have the same tuple but exist in different zones cannot both be tracked. Fixes: 5d0aa2ccd4 (netfilter: nf_conntrack: add support for "conntrack zones") Signed-off-by: Joe Stringer Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_expect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 7a17070c5dab..b45a4223cb05 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -219,7 +219,8 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } - return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); + return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + nf_ct_zone(a->master) == nf_ct_zone(b->master); } static inline int expect_matches(const struct nf_conntrack_expect *a, -- cgit v1.2.3