From 634a4b2038a6eba4c211fb906fa2f6ec9a4bbfc7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 21 Mar 2010 18:01:05 -0700
Subject: net: suppress lockdep-RCU false positive in FIB trie.

Allow fib_find_node() to be called either under rcu_read_lock()
protection or with RTNL held.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index af5d89792860..01ef8ba9025c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -961,7 +961,9 @@ fib_find_node(struct trie *t, u32 key)
 	struct node *n;
 
 	pos = 0;
-	n = rcu_dereference(t->trie);
+	n = rcu_dereference_check(t->trie,
+				  rcu_read_lock_held() ||
+				  lockdep_rtnl_is_held());
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
-- 
cgit v1.2.3


From 5e016cbf6cffd4a53b7922e0c91b775399d7fe47 Mon Sep 17 00:00:00 2001
From: Guenter Roeck <guenter.roeck@ericsson.com>
Date: Sun, 21 Mar 2010 20:55:13 -0700
Subject: ipv4: Don't drop redirected route cache entry unless PTMU actually
 expired

TCP sessions over IPv4 can get stuck if routers between endpoints
do not fragment packets but implement PMTU instead, and we are using
those routers because of an ICMP redirect.

Setup is as follows

       MTU1    MTU2   MTU1
    A--------B------C------D

with MTU1 > MTU2. A and D are endpoints, B and C are routers. B and C
implement PMTU and drop packets larger than MTU2 (for example because
DF is set on all packets). TCP sessions are initiated between A and D.
There is packet loss between A and D, causing frequent TCP
retransmits.

After the number of retransmits on a TCP session reaches tcp_retries1,
tcp calls dst_negative_advice() prior to each retransmit. This results
in route cache entries for the peer to be deleted in
ipv4_negative_advice() if the Path MTU is set.

If the outstanding data on an affected TCP session is larger than
MTU2, packets sent from the endpoints will be dropped by B or C, and
ICMP NEEDFRAG will be returned. A and D receive NEEDFRAG messages and
update PMTU.

Before the next retransmit, tcp will again call dst_negative_advice(),
causing the route cache entry (with correct PMTU) to be deleted. The
retransmitted packet will be larger than MTU2, causing it to be
dropped again.

This sequence repeats until the TCP session aborts or is terminated.

Problem is fixed by removing redirected route cache entries in
ipv4_negative_advice() only if the PMTU is expired.

Signed-off-by: Guenter Roeck <guenter.roeck@ericsson.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 32d396196df8..54fd68c14c87 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1510,7 +1510,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 			ip_rt_put(rt);
 			ret = NULL;
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   rt->u.dst.expires) {
+			   (rt->u.dst.expires &&
+			    time_after_eq(jiffies, rt->u.dst.expires))) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
 						rt->fl.oif,
 						rt_genid(dev_net(dst->dev)));
-- 
cgit v1.2.3


From 243aad830e8a4cdda261626fbaeddde16b08d04a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timo=20Ter=C3=A4s?= <timo.teras@iki.fi>
Date: Sat, 20 Mar 2010 02:27:58 +0000
Subject: ip_gre: include route header_len in max_headroom calculation

Taking route's header_len into account, and updating gre device
needed_headroom will give better hints on upper bound of required
headroom. This is useful if the gre traffic is xfrm'ed.

Signed-off-by: Timo Teras <timo.teras@iki.fi>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f47c9f76754b..f78402d097b3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -810,11 +810,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			tunnel->err_count = 0;
 	}
 
-	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
 
 	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
 	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (max_headroom > dev->needed_headroom)
+			dev->needed_headroom = max_headroom;
 		if (!new_skb) {
 			ip_rt_put(rt);
 			txq->tx_dropped++;
-- 
cgit v1.2.3


From 4b97efdf392563bf03b4917a0b5add2df65de39a Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 26 Mar 2010 20:27:49 -0700
Subject: net: fix netlink address dumping in IPv4/IPv6

When a dump is interrupted at the last device in a hash chain and
then continued, "idx" won't get incremented past s_idx, so s_ip_idx
is not reset when moving on to the next device. This means of all
following devices only the last n - s_ip_idx addresses are dumped.

Tested-by: Pawel Staszewski <pstaszewski@itcare.pl>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/devinet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 51ca946e3392..3feb2b390308 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1194,7 +1194,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 		hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
 			if (idx < s_idx)
 				goto cont;
-			if (idx > s_idx)
+			if (h > s_h || idx > s_idx)
 				s_ip_idx = 0;
 			in_dev = __in_dev_get_rcu(dev);
 			if (!in_dev)
-- 
cgit v1.2.3


From b35ecb5d404c00f2420221ccbb1bbba1139353a4 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 24 Mar 2010 07:43:17 +0000
Subject: ipv4: Cleanup struct net dereference in rt_intern_hash

There's no need in getting it 3 times and gcc isn't smart enough
to understand this himself.

This is just a cleanup before the fix (next patch).

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 54fd68c14c87..124af1605d10 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1212,11 +1212,11 @@ restart:
 		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
 			struct net *net = dev_net(rt->u.dst.dev);
 			int num = ++net->ipv4.current_rt_cache_rebuild_count;
-			if (!rt_caching(dev_net(rt->u.dst.dev))) {
+			if (!rt_caching(net)) {
 				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
 					rt->u.dst.dev->name, num);
 			}
-			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+			rt_emergency_hash_rebuild(net);
 		}
 	}
 
-- 
cgit v1.2.3


From 6a2bad70d546cf30a46bc6d9ec0cb9a0891a38eb Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 24 Mar 2010 21:51:22 +0000
Subject: ipv4: Restart rt_intern_hash after emergency rebuild (v2)

The the rebuild changes the genid which in turn is used at
the hash calculation. Thus if we don't restart and go on with
inserting the rt will happen in wrong chain.

(Fixed Neil's comment about the index passed into the rt_intern_hash)

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Reviewed-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 124af1605d10..d413b57be9b3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1097,7 +1097,7 @@ static int slow_chain_length(const struct rtable *head)
 }
 
 static int rt_intern_hash(unsigned hash, struct rtable *rt,
-			  struct rtable **rp, struct sk_buff *skb)
+			  struct rtable **rp, struct sk_buff *skb, int ifindex)
 {
 	struct rtable	*rth, **rthp;
 	unsigned long	now;
@@ -1217,6 +1217,11 @@ restart:
 					rt->u.dst.dev->name, num);
 			}
 			rt_emergency_hash_rebuild(net);
+			spin_unlock_bh(rt_hash_lock_addr(hash));
+
+			hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+					ifindex, rt_genid(net));
+			goto restart;
 		}
 	}
 
@@ -1477,7 +1482,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 							&netevent);
 
 				rt_del(hash, rth);
-				if (!rt_intern_hash(hash, rt, &rt, NULL))
+				if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
 					ip_rt_put(rt);
 				goto do_next;
 			}
@@ -1931,7 +1936,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	in_dev_put(in_dev);
 	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-	return rt_intern_hash(hash, rth, NULL, skb);
+	return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
 
 e_nobufs:
 	in_dev_put(in_dev);
@@ -2098,7 +2103,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
 	/* put it into the cache */
 	hash = rt_hash(daddr, saddr, fl->iif,
 		       rt_genid(dev_net(rth->u.dst.dev)));
-	return rt_intern_hash(hash, rth, NULL, skb);
+	return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
 }
 
 /*
@@ -2255,7 +2260,7 @@ local_input:
 	}
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-	err = rt_intern_hash(hash, rth, NULL, skb);
+	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
 	goto done;
 
 no_route:
@@ -2502,7 +2507,7 @@ static int ip_mkroute_output(struct rtable **rp,
 	if (err == 0) {
 		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
 			       rt_genid(dev_net(dev_out)));
-		err = rt_intern_hash(hash, rth, rp, NULL);
+		err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
 	}
 
 	return err;
-- 
cgit v1.2.3


From 7438189baa0a2fe30084bdc97e3d540ebc5444f3 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@dev.6wind.com>
Date: Thu, 25 Mar 2010 23:45:35 +0000
Subject: net: ipmr/ip6mr: prevent out-of-bounds vif_table access

When cache is unresolved, c->mf[6]c_parent is set to 65535 and
minvif, maxvif are not initialized, hence we must avoid to
parse IIF and OIF.
A second problem can happen when the user dumps a cache entry
where a VIF, that was referenced at creation time, has been
removed.

Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 0b9d03c54dc3..d0a6092a67be 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1616,17 +1616,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
 	int ct;
 	struct rtnexthop *nhp;
 	struct net *net = mfc_net(c);
-	struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
 	u8 *b = skb_tail_pointer(skb);
 	struct rtattr *mp_head;
 
-	if (dev)
-		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+	/* If cache is unresolved, don't try to parse IIF and OIF */
+	if (c->mfc_parent > MAXVIFS)
+		return -ENOENT;
+
+	if (VIF_EXISTS(net, c->mfc_parent))
+		RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex);
 
 	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
 
 	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
-		if (c->mfc_un.res.ttls[ct] < 255) {
+		if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) {
 			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
 				goto rtattr_failure;
 			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
-- 
cgit v1.2.3


From baff42ab1494528907bf4d5870359e31711746ae Mon Sep 17 00:00:00 2001
From: "Steven J. Magnani" <steve@digidescorp.com>
Date: Tue, 30 Mar 2010 13:56:01 -0700
Subject: net: Fix oops from tcp_collapse() when using splice()

tcp_read_sock() can have a eat skbs without immediately advancing copied_seq.
This can cause a panic in tcp_collapse() if it is called as a result
of the recv_actor dropping the socket lock.

A userspace program that splices data from a socket to either another
socket or to a file can trigger this bug.

Signed-off-by: Steven J. Magnani <steve@digidescorp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6afb6d8662b2..2c75f891914e 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1368,6 +1368,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
 		sk_eat_skb(sk, skb, 0);
 		if (!desc->count)
 			break;
+		tp->copied_seq = seq;
 	}
 	tp->copied_seq = seq;
 
-- 
cgit v1.2.3


From 6503d96168f891ffa3b70ae6c9698a1a722025a0 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 31 Mar 2010 22:58:26 +0000
Subject: net: check the length of the socket address passed to connect(2)

check the length of the socket address passed to connect(2).

Check the length of the socket address passed to connect(2). If the
length is invalid, -EINVAL will be returned.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
----
net/bluetooth/l2cap.c | 3 ++-
net/bluetooth/rfcomm/sock.c | 3 ++-
net/bluetooth/sco.c | 3 ++-
net/can/bcm.c | 3 +++
net/ieee802154/af_ieee802154.c | 3 +++
net/ipv4/af_inet.c | 5 +++++
net/netlink/af_netlink.c | 3 +++
7 files changed, 20 insertions(+), 3 deletions(-)
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 33b7dffa7732..a366861bf4cd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -530,6 +530,8 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
 {
 	struct sock *sk = sock->sk;
 
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
 	if (uaddr->sa_family == AF_UNSPEC)
 		return sk->sk_prot->disconnect(sk, flags);
 
@@ -573,6 +575,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;
 
+	if (addr_len < sizeof(uaddr->sa_family))
+		return -EINVAL;
+
 	lock_sock(sk);
 
 	if (uaddr->sa_family == AF_UNSPEC) {
-- 
cgit v1.2.3