From d543103a0c75edc0a7a08dfd796de67466a15dfb Mon Sep 17 00:00:00 2001
From: Alexander Duyck
Date: Wed, 8 Apr 2009 13:15:22 +0000
Subject: net: netif_device_attach/detach should start/stop all queues

Currently netif_device_attach/detach only stop/start one queue. They
should be stopping and starting all the queues on a given device.

Signed-off-by: Alexander Duyck
Signed-off-by: Jeff Kirsher
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 91d792d17e09..ea8eb2214b09 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1430,7 +1430,7 @@ void netif_device_detach(struct net_device *dev)
 {
 	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
 	    netif_running(dev)) {
-		netif_stop_queue(dev);
+		netif_tx_stop_all_queues(dev);
 	}
 }
 EXPORT_SYMBOL(netif_device_detach);
@@ -1445,7 +1445,7 @@ void netif_device_attach(struct net_device *dev)
 {
 	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
 	    netif_running(dev)) {
-		netif_wake_queue(dev);
+		netif_tx_wake_all_queues(dev);
 		__netdev_watchdog_up(dev);
 	}
 }
-- cgit v1.2.3

From fc59f9a3bf8096a1f68a8b78ada7a0e0ab9236b2 Mon Sep 17 00:00:00 2001
From: Herbert Xu
Date: Tue, 14 Apr 2009 15:11:06 -0700
Subject: gro: Restore correct value to gso_size

Since everybody has been focusing on bare-metal GRO performance, no one
noticed when I added a bug that zapped gso_size for all GRO packets.
This only gets picked up when you forward the skb out of an interface.

Thanks to Mark Wagner for noticing this bug when testing kvm.

Reported-by: Mark Wagner
Signed-off-by: Herbert Xu
Signed-off-by: David S. Miller
---
 net/core/dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index ea8eb2214b09..343883f65ea7 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2328,8 +2328,10 @@ static int napi_gro_complete(struct sk_buff *skb)
 	struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
 	int err = -ENOENT;
 
-	if (NAPI_GRO_CB(skb)->count == 1)
+	if (NAPI_GRO_CB(skb)->count == 1) {
+		skb_shinfo(skb)->gso_size = 0;
 		goto out;
+	}
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, head, list) {
@@ -2348,7 +2350,6 @@
 	}
 
 out:
-	skb_shinfo(skb)->gso_size = 0;
 	return netif_receive_skb(skb);
 }
 
-- cgit v1.2.3
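For context on the first patch above: netif_stop_queue() and netif_wake_queue() only operate on TX queue 0, which is why a multiqueue device could be "detached" with its other queues still live. A minimal sketch of what the all-queues helper does, assuming its 2.6.30-era definition in include/linux/netdevice.h (an approximation, not a quote of the source):

/* Walk every TX queue of the device instead of touching only queue 0. */
static inline void netif_tx_stop_all_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		netif_tx_stop_queue(txq);
	}
}

netif_tx_wake_all_queues() is the analogous loop around netif_tx_wake_queue(), which also reschedules any pending transmit work on queues it wakes.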
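On the gso_size fix above: a packet merged from several frames must keep a non-zero gso_size so the output path can re-segment it on forwarding, while an unmerged packet (NAPI_GRO_CB(skb)->count == 1) must leave GRO with gso_size cleared; the bug cleared it unconditionally. The stack's test for "is this a GSO packet" is simply whether gso_size is non-zero, mirroring the 2.6.30-era helper (shown as a sketch):

/* An skb is treated as GSO whenever gso_size is non-zero.  A zapped
 * gso_size means a large merged GRO packet is presumably never
 * re-segmented on its way out of the forwarding interface, which is
 * the breakage Mark Wagner saw with kvm.
 */
static inline int skb_is_gso(const struct sk_buff *skb)
{
	return skb_shinfo(skb)->gso_size;
}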
From 8caf153974f2274301e583fda732cc8e5b80331f Mon Sep 17 00:00:00 2001
From: Jarek Poplawski
Date: Fri, 17 Apr 2009 10:08:49 +0000
Subject: net: sch_netem: Fix an inconsistency in ingress netem timestamps.

Alex Sidorenko reported:

"while experimenting with 'netem' we have found some strange behaviour.
It seemed that ingress delay as measured by 'ping' command shows up on
some hosts but not on others.

After some investigation I have found that the problem is that the
skbuff->tstamp field value depends on whether there are any packet
sniffers enabled. That is:

- if any ptype_all handler is registered, the tstamp field is as expected
- if there are no ptype_all handlers, the tstamp field does not show the delay"

This patch adds a check to dev_queue_xmit_nit() that prevents the
unnecessary update of tstamp on the ingress path (with act_mirred), so
the overhead on the fast path is minimal and only incurred when
sniffers etc. are active. Since netem at ingress seems to logically
emulate a network in front of a host, tstamp is zeroed to trigger the
update, pretending the delays come from the outside.

Reported-by: Alex Sidorenko
Tested-by: Alex Sidorenko
Signed-off-by: Jarek Poplawski
Signed-off-by: David S. Miller
---
 net/core/dev.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 343883f65ea7..dcc357e4f91e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1336,7 +1336,12 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct packet_type *ptype;
 
+#ifdef CONFIG_NET_CLS_ACT
+	if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
+		net_timestamp(skb);
+#else
 	net_timestamp(skb);
+#endif
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
-- cgit v1.2.3

From eb39c57ff7782bc015da517af1d9c3b2592e721e Mon Sep 17 00:00:00 2001
From: Marcin Slusarz
Date: Sun, 19 Apr 2009 07:24:24 +0000
Subject: net: fix "compatibility" typos

Signed-off-by: Marcin Slusarz
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index dcc357e4f91e..001a4c551d44 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4405,7 +4405,7 @@ int register_netdevice(struct net_device *dev)
 	dev->iflink = -1;
 
 #ifdef CONFIG_COMPAT_NET_DEV_OPS
-	/* Netdevice_ops API compatiability support.
+	/* Netdevice_ops API compatibility support.
 	 * This is temporary until all network devices are converted.
 	 */
 	if (dev->netdev_ops) {
@@ -4416,7 +4416,7 @@ int register_netdevice(struct net_device *dev)
 			dev->name, netdev_drivername(dev, drivername, 64));
 
 		/* This works only because net_device_ops and the
-		   compatiablity structure are the same. */
+		   compatibility structure are the same. */
 		dev->netdev_ops = (void *) &(dev->init);
 	}
 #endif
-- cgit v1.2.3

From 5db8765a86a4cbaf45adaf8c231cf9a6ca2dcfaf Mon Sep 17 00:00:00 2001
From: Ben Hutchings
Date: Thu, 16 Apr 2009 08:04:20 +0000
Subject: net: Fix GRO for multiple page fragments

This loop over fragments in napi_fraginfo_skb() was "interesting".

Signed-off-by: Ben Hutchings
Signed-off-by: David S. Miller
---
 net/core/dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 001a4c551d44..308a7d0c277f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2545,9 +2545,9 @@ struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
 	}
 
 	BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
-	frag = &info->frags[info->nr_frags - 1];
+	frag = info->frags;
 
-	for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
+	for (i = 0; i < info->nr_frags; i++) {
 		skb_fill_page_desc(skb, i, frag->page,
 				   frag->page_offset, frag->size);
 		frag++;
-- cgit v1.2.3
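On the netem timestamp patch above: the new condition in dev_queue_xmit_nit() is easier to read after De Morgan's expansion. A sketch, illustrative only (the companion sch_netem change that zeroes tstamp is filtered out of this 'net/core' view):

/* Equivalent form of the new check: stamp the packet now, as before,
 * unless it arrived from ingress already carrying a timestamp -- for
 * example one left in place by netem to model external delay.
 */
if (!skb->tstamp.tv64 || !(G_TC_FROM(skb->tc_verd) & AT_INGRESS))
	net_timestamp(skb);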
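As for the fraginfo loop fix directly above, my reading of the hunk (this is an inference, not text from the patch): the skb handed in here carries no page fragments yet, so skb_shinfo(skb)->nr_frags was 0 and the index already started at 0 -- but frag pointed at the *last* source fragment rather than the first. In effect the old code did:

/* Old behaviour, inlined: i counts 0..N-1, yet frag begins at
 * info->frags[N - 1], so after the first iteration every
 * skb_fill_page_desc() call reads past the end of the array.
 * Single-fragment packets (N == 1) worked, which is presumably why
 * the bug survived testing.
 */
frag = &info->frags[info->nr_frags - 1];
for (i = 0; i < info->nr_frags; i++) {
	skb_fill_page_desc(skb, i, frag->page,
			   frag->page_offset, frag->size);
	frag++;
}

Starting frag at info->frags[0], as the fix does, keeps the pointer and the index in step.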
From bf368e4e70cd4e0f880923c44e95a4273d725ab4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 28 Apr 2009 02:24:21 -0700
Subject: net: Avoid extra wakeups of threads blocked in wait_for_packet()

In 2.6.25 we added UDP memory accounting. This unfortunately added a
penalty when a frame is transmitted, since at TX completion time we
have to call sock_wfree() to perform the necessary memory accounting.
This calls sock_def_write_space() and ultimately the scheduler if any
thread is waiting on the socket. Threads waiting for an incoming frame
were scheduled, then had to sleep again because the event was
meaningless to them. (All threads waiting on a socket use the same
sk_sleep anchor.)

This adds a lot of extra wakeups and increases latencies, as noted by
Christoph Lameter, and slows down the softirq handler.

Reference: http://marc.info/?l=linux-netdev&m=124060437012283&w=2

Fortunately, Davide Libenzi recently added the concept of keyed wakeups
to the kernel, in particular for sockets (see commit
37e5540b3c9d838eb20f2ca8ea2eb8072271e403, "epoll keyed wakeups: make
sockets use keyed wakeups"). Davide's goal was to optimize epoll, but
this new wakeup infrastructure can help non-epoll users as well, if
they care to set up an appropriate handler.

This patch introduces a new DEFINE_WAIT_FUNC() helper and uses it in
wait_for_packet(), so that only a relevant event can wake up a thread
blocked in this function.

The trace of function calls from bnx2 TX completion (bnx2_poll_work())
is:

__kfree_skb()
 skb_release_head_state()
  sock_wfree()
   sock_def_write_space()
    __wake_up_sync_key()
     __wake_up_common()
      receiver_wake_function() : stops here since the thread is waiting for an INPUT

Reported-by: Christoph Lameter
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/datagram.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'net/core')

diff --git a/net/core/datagram.c b/net/core/datagram.c
index d0de644b378d..b01a76abe1d2 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -64,13 +64,25 @@ static inline int connection_based(struct sock *sk)
 	return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM;
 }
 
+static int receiver_wake_function(wait_queue_t *wait, unsigned mode, int sync,
+				  void *key)
+{
+	unsigned long bits = (unsigned long)key;
+
+	/*
+	 * Avoid a wakeup if event not interesting for us
+	 */
+	if (bits && !(bits & (POLLIN | POLLERR)))
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, key);
+}
 /*
  * Wait for a packet..
  */
 static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
 {
 	int error;
-	DEFINE_WAIT(wait);
+	DEFINE_WAIT_FUNC(wait, receiver_wake_function);
 
 	prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
 
-- cgit v1.2.3
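A sketch of the waker side, for context (hedged: this approximates what sock_def_write_space() does after the keyed-wakeups commit referenced above; the function name here is invented, and the body is a reconstruction, not a quote). The poll bits travel as the opaque key and arrive in receiver_wake_function() as its key argument:

/* TX completion path: announce write space with POLLOUT-family bits
 * as the wakeup key.  receiver_wake_function() sees these bits, finds
 * no POLLIN | POLLERR among them, and returns 0 before the blocked
 * reader is ever scheduled.
 */
static void example_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_sync_poll(sk->sk_sleep,
				POLLOUT | POLLWRNORM | POLLWRBAND);
	read_unlock(&sk->sk_callback_lock);
}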
From 7a67e56fd362d3edfde1f19170893508c3940d3a Mon Sep 17 00:00:00 2001
From: Jarek Poplawski
Date: Thu, 30 Apr 2009 05:41:19 -0700
Subject: net: Fix oops when splicing skbs from a frag_list.

Lennert Buytenhek wrote:
> Since 4fb669948116d928ae44262ab7743732c574630d ("net: Optimize memory
> usage when splicing from sockets.") I'm seeing this oops (e.g. in
> 2.6.30-rc3) when splicing from a TCP socket to /dev/null on a driver
> (mv643xx_eth) that uses LRO in the skb mode (lro_receive_skb) rather
> than the frag mode:

My patch incorrectly assumed skb->sk was always valid, but for
"frag_listed" skbs we can only use skb->sk of their parent.

Reported-by: Lennert Buytenhek
Debugged-by: Lennert Buytenhek
Tested-by: Lennert Buytenhek
Signed-off-by: Jarek Poplawski
Signed-off-by: David S. Miller
---
 net/core/skbuff.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'net/core')

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce6356cd9f71..f091a5a845c1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1365,9 +1365,8 @@ static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
 
 static inline struct page *linear_to_page(struct page *page, unsigned int *len,
 					  unsigned int *offset,
-					  struct sk_buff *skb)
+					  struct sk_buff *skb, struct sock *sk)
 {
-	struct sock *sk = skb->sk;
 	struct page *p = sk->sk_sndmsg_page;
 	unsigned int off;
 
@@ -1405,13 +1404,14 @@ new_page:
  */
 static inline int spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
 				unsigned int *len, unsigned int offset,
-				struct sk_buff *skb, int linear)
+				struct sk_buff *skb, int linear,
+				struct sock *sk)
 {
 	if (unlikely(spd->nr_pages == PIPE_BUFFERS))
 		return 1;
 
 	if (linear) {
-		page = linear_to_page(page, len, &offset, skb);
+		page = linear_to_page(page, len, &offset, skb, sk);
 		if (!page)
 			return 1;
 	} else
@@ -1442,7 +1442,8 @@ static inline void __segment_seek(struct page **page, unsigned int *poff,
 static inline int __splice_segment(struct page *page, unsigned int poff,
 				   unsigned int plen, unsigned int *off,
 				   unsigned int *len, struct sk_buff *skb,
-				   struct splice_pipe_desc *spd, int linear)
+				   struct splice_pipe_desc *spd, int linear,
+				   struct sock *sk)
 {
 	if (!*len)
 		return 1;
@@ -1465,7 +1466,7 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
 		/* the linear region may spread across several pages */
 		flen = min_t(unsigned int, flen, PAGE_SIZE - poff);
 
-		if (spd_fill_page(spd, page, &flen, poff, skb, linear))
+		if (spd_fill_page(spd, page, &flen, poff, skb, linear, sk))
 			return 1;
 
 		__segment_seek(&page, &poff, &plen, flen);
@@ -1481,8 +1482,8 @@ static inline int __splice_segment(struct page *page, unsigned int poff,
  * pipe is full or if we already spliced the requested length.
 */
 static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
-			     unsigned int *len,
-			     struct splice_pipe_desc *spd)
+			     unsigned int *len, struct splice_pipe_desc *spd,
+			     struct sock *sk)
 {
 	int seg;
 
@@ -1492,7 +1493,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 	if (__splice_segment(virt_to_page(skb->data),
 			     (unsigned long) skb->data & (PAGE_SIZE - 1),
 			     skb_headlen(skb),
-			     offset, len, skb, spd, 1))
+			     offset, len, skb, spd, 1, sk))
 		return 1;
 
 	/*
@@ -1502,7 +1503,7 @@ static int __skb_splice_bits(struct sk_buff *skb, unsigned int *offset,
 		const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
 
 		if (__splice_segment(f->page, f->page_offset, f->size,
-				     offset, len, skb, spd, 0))
+				     offset, len, skb, spd, 0, sk))
 			return 1;
 	}
 
@@ -1528,12 +1529,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		.ops = &sock_pipe_buf_ops,
 		.spd_release = sock_spd_release,
 	};
+	struct sock *sk = skb->sk;
 
 	/*
 	 * __skb_splice_bits() only fails if the output has no room left,
 	 * so no point in going over the frag_list for the error case.
 	 */
-	if (__skb_splice_bits(skb, &offset, &tlen, &spd))
+	if (__skb_splice_bits(skb, &offset, &tlen, &spd, sk))
 		goto done;
 	else if (!tlen)
 		goto done;
@@ -1545,14 +1547,13 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		struct sk_buff *list = skb_shinfo(skb)->frag_list;
 
 		for (; list && tlen; list = list->next) {
-			if (__skb_splice_bits(list, &offset, &tlen, &spd))
+			if (__skb_splice_bits(list, &offset, &tlen, &spd, sk))
 				break;
 		}
 	}
 
 done:
 	if (spd.nr_pages) {
-		struct sock *sk = skb->sk;
 		int ret;
 
 		/*
-- cgit v1.2.3
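The invariant behind this fix, as a hedged illustration (the helper below is invented for this note, not part of the patch): an skb chained on another skb's frag_list is not separately owned by a socket, so its sk field cannot be relied on; only the head skb's sk is meaningful.

/* Hypothetical helper illustrating the ownership rule: when walking
 * skb_shinfo(head)->frag_list, carry the head's socket along instead
 * of dereferencing list->sk, which is typically NULL for chains built
 * by skb-mode LRO and is what caused the oops above.
 */
static void for_each_spliceable_skb(struct sk_buff *head,
				    void (*fn)(struct sk_buff *skb,
					       struct sock *sk))
{
	struct sk_buff *list;

	fn(head, head->sk);
	for (list = skb_shinfo(head)->frag_list; list; list = list->next)
		fn(list, head->sk);	/* not list->sk */
}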
From ec581f6a42bbbea5271c66da9769a41b46c74e10 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Fri, 1 May 2009 09:05:06 -0700
Subject: net: Fix skb_tx_hash() for forwarding workloads.

When skb_rx_queue_recorded() is true, we don't want to use the jhash
distribution, as the device driver told us exactly which queue was
selected at RX time. jhash makes a statistical shuffle, but this won't
work with 8 static inputs.

A later improvement would be to compute the reciprocal value of
real_num_tx_queues to avoid a divide here. But this computation should
be done once, when real_num_tx_queues is set. That needs a separate
patch, and a new field in struct net_device.

Reported-by: Andrew Dickinson
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'net/core')

diff --git a/net/core/dev.c b/net/core/dev.c
index 308a7d0c277f..e2e9e4af3ace 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1735,11 +1735,12 @@ u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
 {
 	u32 hash;
 
-	if (skb_rx_queue_recorded(skb)) {
-		hash = skb_get_rx_queue(skb);
-	} else if (skb->sk && skb->sk->sk_hash) {
+	if (skb_rx_queue_recorded(skb))
+		return skb_get_rx_queue(skb) % dev->real_num_tx_queues;
+
+	if (skb->sk && skb->sk->sk_hash)
 		hash = skb->sk->sk_hash;
-	} else
+	else
 		hash = skb->protocol;
 
 	hash = jhash_1word(hash, skb_tx_hashrnd);
-- cgit v1.2.3
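A sketch of the "later improvement" mentioned in the message, using the kernel's existing linux/reciprocal_div.h helpers. The field name reciprocal_tx_queues is invented for illustration only; the message just notes that a new struct net_device field would be needed:

#include <linux/reciprocal_div.h>

/* Done once, wherever real_num_tx_queues is set (hypothetical field): */
dev->reciprocal_tx_queues = reciprocal_value(dev->real_num_tx_queues);

/* Then in skb_tx_hash(), q % n becomes q - n * (q / n), with the
 * divide performed by reciprocal_divide()'s multiply-and-shift:
 */
static u16 recorded_queue_to_tx_queue(const struct net_device *dev,
				      const struct sk_buff *skb)
{
	u32 q = skb_get_rx_queue(skb);
	u32 quot = reciprocal_divide(q, dev->reciprocal_tx_queues);

	return q - dev->real_num_tx_queues * quot;
}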