summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2015-05-17 22:45:49 -0400
committerDavid S. Miller <davem@davemloft.net>2015-05-17 22:46:26 -0400
commit5d48ef3e954cae2f237ff8e006322d2f3b672375 (patch)
tree13db6cdf2a7ff4f841f3ec44b5700a8e9af1b361 /net
parent4633c9e07b3b7d7fc262a5f59ff635c1f702af6f (diff)
parentb66e91ccbc34ebd5a2f90f9e1bc1597e2924a500 (diff)
Merge branch 'tcp_mem_pressure'
Eric Dumazet says: ==================== tcp: better handling of memory pressure When testing commit 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure") using edge triggered epoll applications, I found various issues under memory pressure and thousands of active sockets. This patch series is a first round to solve these issues, in send and receive paths. There are probably other fixes needed, but with this series, my tests now all succeed. v2: fix typo in "allow one skb to be received per socket under memory pressure", as spotted by Jason Baron. ==================== Acked-by: Jason Baron <jbaron@akamai.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/core/sock.c9
-rw-r--r--net/ipv4/tcp.c24
-rw-r--r--net/ipv4/tcp_input.c18
-rw-r--r--net/ipv4/tcp_output.c10
-rw-r--r--net/ipv4/tcp_timer.c2
5 files changed, 40 insertions, 23 deletions
diff --git a/net/core/sock.c b/net/core/sock.c
index c18738a795b0..29124fcdc42a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
/**
* __sk_reclaim - reclaim memory_allocated
* @sk: socket
+ * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
*/
-void __sk_mem_reclaim(struct sock *sk)
+void __sk_mem_reclaim(struct sock *sk, int amount)
{
- sk_memory_allocated_sub(sk,
- sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
- sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ sk_memory_allocated_sub(sk, amount);
+ sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ecccfdc50d76..c724195e5862 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -815,9 +815,20 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
/* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4);
+ if (unlikely(tcp_under_memory_pressure(sk)))
+ sk_mem_reclaim_partial(sk);
+
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
- if (skb) {
- if (sk_wmem_schedule(sk, skb->truesize)) {
+ if (likely(skb)) {
+ bool mem_schedule;
+
+ if (skb_queue_len(&sk->sk_write_queue) == 0) {
+ mem_schedule = true;
+ sk_forced_mem_schedule(sk, skb->truesize);
+ } else {
+ mem_schedule = sk_wmem_schedule(sk, skb->truesize);
+ }
+ if (likely(mem_schedule)) {
skb_reserve(skb, sk->sk_prot->max_header);
/*
* Make sure that we have exactly size bytes
@@ -3057,11 +3068,12 @@ __setup("thash_entries=", set_thash_entries);
static void __init tcp_init_mem(void)
{
- unsigned long limit = nr_free_buffer_pages() / 8;
+ unsigned long limit = nr_free_buffer_pages() / 16;
+
limit = max(limit, 128UL);
- sysctl_tcp_mem[0] = limit / 4 * 3;
- sysctl_tcp_mem[1] = limit;
- sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
+ sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
+ sysctl_tcp_mem[1] = limit; /* 6.25 % */
+ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
}
void __init tcp_init(void)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index cf8b20ff6658..40c435997e54 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) &&
- !sk_under_memory_pressure(sk)) {
+ !tcp_under_memory_pressure(sk)) {
int incr;
/* Check #2. Increase window, if skb with such overhead
@@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
- !sk_under_memory_pressure(sk) &&
+ !tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]);
@@ -4507,10 +4507,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) {
queue_and_out:
- if (eaten < 0 &&
- tcp_try_rmem_schedule(sk, skb, skb->truesize))
- goto drop;
-
+ if (eaten < 0) {
+ if (skb_queue_len(&sk->sk_receive_queue) == 0)
+ sk_forced_mem_schedule(sk, skb->truesize);
+ else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
+ goto drop;
+ }
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
}
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
@@ -4781,7 +4783,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk);
- else if (sk_under_memory_pressure(sk))
+ else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk);
@@ -4825,7 +4827,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false;
/* If we are under global TCP memory pressure, do not expand. */
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
return false;
/* If we are under soft global TCP memory pressure, do not expand. */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 7386d32cd670..08c2cc40b26d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0;
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss);
@@ -2816,8 +2816,10 @@ begin_fwd:
* connection tear down and (memory) recovery.
* Otherwise tcp_send_fin() could be tempted to either delay FIN
* or even be forced to close flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge trigger epoll()
*/
-static void sk_forced_wmem_schedule(struct sock *sk, int size)
+void sk_forced_mem_schedule(struct sock *sk, int size)
{
int amt, status;
@@ -2841,7 +2843,7 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted.
*/
- if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+ if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++;
@@ -2864,7 +2866,7 @@ coalesce:
return;
}
skb_reserve(skb, MAX_TCP_HEADER);
- sk_forced_wmem_schedule(sk, skb->truesize);
+ sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 65bf670e8714..5b752f58a900 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk)
}
out:
- if (sk_under_memory_pressure(sk))
+ if (tcp_under_memory_pressure(sk))
sk_mem_reclaim(sk);
}