From 634a4b2038a6eba4c211fb906fa2f6ec9a4bbfc7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 21 Mar 2010 18:01:05 -0700 Subject: net: suppress lockdep-RCU false positive in FIB trie. Allow fib_find_node() to be called either under rcu_read_lock() protection or with RTNL held. Signed-off-by: Paul E. McKenney Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index af5d89792860..01ef8ba9025c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -961,7 +961,9 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = rcu_dereference(t->trie); + n = rcu_dereference_check(t->trie, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; -- cgit v1.2.3 From 5e016cbf6cffd4a53b7922e0c91b775399d7fe47 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 21 Mar 2010 20:55:13 -0700 Subject: ipv4: Don't drop redirected route cache entry unless PTMU actually expired TCP sessions over IPv4 can get stuck if routers between endpoints do not fragment packets but implement PMTU instead, and we are using those routers because of an ICMP redirect. Setup is as follows MTU1 MTU2 MTU1 A--------B------C------D with MTU1 > MTU2. A and D are endpoints, B and C are routers. B and C implement PMTU and drop packets larger than MTU2 (for example because DF is set on all packets). TCP sessions are initiated between A and D. There is packet loss between A and D, causing frequent TCP retransmits. After the number of retransmits on a TCP session reaches tcp_retries1, tcp calls dst_negative_advice() prior to each retransmit. This results in route cache entries for the peer to be deleted in ipv4_negative_advice() if the Path MTU is set. If the outstanding data on an affected TCP session is larger than MTU2, packets sent from the endpoints will be dropped by B or C, and ICMP NEEDFRAG will be returned. A and D receive NEEDFRAG messages and update PMTU. Before the next retransmit, tcp will again call dst_negative_advice(), causing the route cache entry (with correct PMTU) to be deleted. The retransmitted packet will be larger than MTU2, causing it to be dropped again. This sequence repeats until the TCP session aborts or is terminated. Problem is fixed by removing redirected route cache entries in ipv4_negative_advice() only if the PMTU is expired. Signed-off-by: Guenter Roeck Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 32d396196df8..54fd68c14c87 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1510,7 +1510,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->u.dst.expires) { + (rt->u.dst.expires && + time_after_eq(jiffies, rt->u.dst.expires))) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); -- cgit v1.2.3 From 243aad830e8a4cdda261626fbaeddde16b08d04a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Sat, 20 Mar 2010 02:27:58 +0000 Subject: ip_gre: include route header_len in max_headroom calculation Taking route's header_len into account, and updating gre device needed_headroom will give better hints on upper bound of required headroom. This is useful if the gre traffic is xfrm'ed. Signed-off-by: Timo Teras Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f47c9f76754b..f78402d097b3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -810,11 +810,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tunnel->err_count = 0; } - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); txq->tx_dropped++; -- cgit v1.2.3 From 4b97efdf392563bf03b4917a0b5add2df65de39a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 26 Mar 2010 20:27:49 -0700 Subject: net: fix netlink address dumping in IPv4/IPv6 When a dump is interrupted at the last device in a hash chain and then continued, "idx" won't get incremented past s_idx, so s_ip_idx is not reset when moving on to the next device. This means of all following devices only the last n - s_ip_idx addresses are dumped. Tested-by: Pawel Staszewski Signed-off-by: Patrick McHardy --- net/ipv4/devinet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 51ca946e3392..3feb2b390308 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1194,7 +1194,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(dev, node, head, index_hlist) { if (idx < s_idx) goto cont; - if (idx > s_idx) + if (h > s_h || idx > s_idx) s_ip_idx = 0; in_dev = __in_dev_get_rcu(dev); if (!in_dev) -- cgit v1.2.3 From b35ecb5d404c00f2420221ccbb1bbba1139353a4 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 24 Mar 2010 07:43:17 +0000 Subject: ipv4: Cleanup struct net dereference in rt_intern_hash There's no need in getting it 3 times and gcc isn't smart enough to understand this himself. This is just a cleanup before the fix (next patch). Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv4/route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 54fd68c14c87..124af1605d10 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1212,11 +1212,11 @@ restart: slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { struct net *net = dev_net(rt->u.dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(dev_net(rt->u.dst.dev))) { + if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", rt->u.dst.dev->name, num); } - rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); + rt_emergency_hash_rebuild(net); } } -- cgit v1.2.3 From 6a2bad70d546cf30a46bc6d9ec0cb9a0891a38eb Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Wed, 24 Mar 2010 21:51:22 +0000 Subject: ipv4: Restart rt_intern_hash after emergency rebuild (v2) The the rebuild changes the genid which in turn is used at the hash calculation. Thus if we don't restart and go on with inserting the rt will happen in wrong chain. (Fixed Neil's comment about the index passed into the rt_intern_hash) Signed-off-by: Pavel Emelyanov Reviewed-by: Neil Horman Signed-off-by: David S. Miller --- net/ipv4/route.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 124af1605d10..d413b57be9b3 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1097,7 +1097,7 @@ static int slow_chain_length(const struct rtable *head) } static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb) + struct rtable **rp, struct sk_buff *skb, int ifindex) { struct rtable *rth, **rthp; unsigned long now; @@ -1217,6 +1217,11 @@ restart: rt->u.dst.dev->name, num); } rt_emergency_hash_rebuild(net); + spin_unlock_bh(rt_hash_lock_addr(hash)); + + hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, + ifindex, rt_genid(net)); + goto restart; } } @@ -1477,7 +1482,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, &netevent); rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL)) + if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) ip_rt_put(rt); goto do_next; } @@ -1931,7 +1936,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); e_nobufs: in_dev_put(in_dev); @@ -2098,7 +2103,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return rt_intern_hash(hash, rth, NULL, skb, fl->iif); } /* @@ -2255,7 +2260,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb); + err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); goto done; no_route: @@ -2502,7 +2507,7 @@ static int ip_mkroute_output(struct rtable **rp, if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL); + err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); } return err; -- cgit v1.2.3 From 7438189baa0a2fe30084bdc97e3d540ebc5444f3 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 25 Mar 2010 23:45:35 +0000 Subject: net: ipmr/ip6mr: prevent out-of-bounds vif_table access When cache is unresolved, c->mf[6]c_parent is set to 65535 and minvif, maxvif are not initialized, hence we must avoid to parse IIF and OIF. A second problem can happen when the user dumps a cache entry where a VIF, that was referenced at creation time, has been removed. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 0b9d03c54dc3..d0a6092a67be 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1616,17 +1616,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) int ct; struct rtnexthop *nhp; struct net *net = mfc_net(c); - struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev; u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; - if (dev) - RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent > MAXVIFS) + return -ENOENT; + + if (VIF_EXISTS(net, c->mfc_parent)) + RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex); mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (c->mfc_un.res.ttls[ct] < 255) { + if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) { if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) goto rtattr_failure; nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); -- cgit v1.2.3 From baff42ab1494528907bf4d5870359e31711746ae Mon Sep 17 00:00:00 2001 From: "Steven J. Magnani" Date: Tue, 30 Mar 2010 13:56:01 -0700 Subject: net: Fix oops from tcp_collapse() when using splice() tcp_read_sock() can have a eat skbs without immediately advancing copied_seq. This can cause a panic in tcp_collapse() if it is called as a result of the recv_actor dropping the socket lock. A userspace program that splices data from a socket to either another socket or to a file can trigger this bug. Signed-off-by: Steven J. Magnani Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6afb6d8662b2..2c75f891914e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1368,6 +1368,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb, 0); if (!desc->count) break; + tp->copied_seq = seq; } tp->copied_seq = seq; -- cgit v1.2.3 From 6503d96168f891ffa3b70ae6c9698a1a722025a0 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 31 Mar 2010 22:58:26 +0000 Subject: net: check the length of the socket address passed to connect(2) check the length of the socket address passed to connect(2). Check the length of the socket address passed to connect(2). If the length is invalid, -EINVAL will be returned. Signed-off-by: Changli Gao ---- net/bluetooth/l2cap.c | 3 ++- net/bluetooth/rfcomm/sock.c | 3 ++- net/bluetooth/sco.c | 3 ++- net/can/bcm.c | 3 +++ net/ieee802154/af_ieee802154.c | 3 +++ net/ipv4/af_inet.c | 5 +++++ net/netlink/af_netlink.c | 3 +++ 7 files changed, 20 insertions(+), 3 deletions(-) Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 33b7dffa7732..a366861bf4cd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -530,6 +530,8 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, { struct sock *sk = sock->sk; + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); @@ -573,6 +575,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; long timeo; + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { -- cgit v1.2.3