summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.txt13
-rw-r--r--include/linux/tcp.h1
-rw-r--r--include/net/sock.h19
-rw-r--r--include/net/tcp.h14
-rw-r--r--include/uapi/linux/tcp.h1
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp.c7
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_output.c3
-rw-r--r--net/ipv6/tcp_ipv6.c1
10 files changed, 61 insertions, 6 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 10742902146f..53cea9bcb14c 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -516,6 +516,19 @@ tcp_wmem - vector of 3 INTEGERs: min, default, max
this value is ignored.
Default: between 64K and 4MB, depending on RAM size.
+tcp_notsent_lowat - UNSIGNED INTEGER
+ A TCP socket can control the amount of unsent bytes in its write queue,
+ thanks to the TCP_NOTSENT_LOWAT socket option. poll()/select()/epoll()
+ report POLLOUT events if the amount of unsent bytes is below a
+ per-socket value, and if the write queue is not full. sendmsg() will
+ also not add new buffers if the limit is hit.
+
+ This global variable controls the amount of unsent data for
+ sockets not using TCP_NOTSENT_LOWAT. For these sockets, a change
+ to the global variable has immediate effect.
+
+ Default: UINT_MAX (0xFFFFFFFF)
+
tcp_workaround_signed_windows - BOOLEAN
If set, assume no receipt of a window scaling option means the
remote TCP is broken and treats the window as a signed quantity.
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 472120b4fac5..9640803a17a7 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -238,6 +238,7 @@ struct tcp_sock {
u32 rcv_wnd; /* Current receiver window */
u32 write_seq; /* Tail(+1) of data held in tcp send buffer */
+ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */
u32 pushed_seq; /* Last pushed seq, required to talk to windows */
u32 lost_out; /* Lost packets */
u32 sacked_out; /* SACK'd packets */
diff --git a/include/net/sock.h b/include/net/sock.h
index d0b5fdee50a2..b9f2b095b1ab 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -746,11 +746,6 @@ static inline int sk_stream_wspace(const struct sock *sk)
extern void sk_stream_write_space(struct sock *sk);
-static inline bool sk_stream_memory_free(const struct sock *sk)
-{
- return sk->sk_wmem_queued < sk->sk_sndbuf;
-}
-
/* OOB backlog add */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
@@ -950,6 +945,7 @@ struct proto {
unsigned int inuse_idx;
#endif
+ bool (*stream_memory_free)(const struct sock *sk);
/* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk);
atomic_long_t *memory_allocated; /* Current allocated memory. */
@@ -1088,11 +1084,22 @@ static inline struct cg_proto *parent_cg_proto(struct proto *proto,
}
#endif
+/* Report whether a stream socket may queue more data for sending.
+ * The queued write memory must be below sk_sndbuf; when the protocol
+ * supplies a stream_memory_free() hook, that hook must agree as well.
+ * Protocols without the hook are limited by sk_sndbuf only.
+ */
+static inline bool sk_stream_memory_free(const struct sock *sk)
+{
+	const struct proto *prot;
+
+	if (sk->sk_wmem_queued >= sk->sk_sndbuf)
+		return false;
+
+	prot = sk->sk_prot;
+	if (!prot->stream_memory_free)
+		return true;
+
+	return prot->stream_memory_free(sk);
+}
+
static inline bool sk_stream_is_writeable(const struct sock *sk)
{
- return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk);
+ return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) &&
+ sk_stream_memory_free(sk);
}
+
static inline bool sk_has_memory_pressure(const struct sock *sk)
{
return sk->sk_prot->memory_pressure != NULL;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c5868471abae..18fc999dae3c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -284,6 +284,7 @@ extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
+extern unsigned int sysctl_tcp_notsent_lowat;
extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated;
@@ -1539,6 +1540,19 @@ extern int tcp_gro_complete(struct sk_buff *skb);
extern void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr,
__be32 daddr);
+/* Effective limit on unsent bytes for @tp: the per-socket notsent_lowat
+ * when it is non-zero, otherwise the global sysctl_tcp_notsent_lowat.
+ */
+static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
+{
+	u32 lowat = tp->notsent_lowat;
+
+	if (lowat)
+		return lowat;
+
+	return sysctl_tcp_notsent_lowat;
+}
+
+/* TCP implementation of the proto stream_memory_free hook: true while
+ * the bytes queued but not yet sent (write_seq - snd_nxt) remain below
+ * the limit returned by tcp_notsent_lowat().
+ */
+static inline bool tcp_stream_memory_free(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	u32 unsent = tp->write_seq - tp->snd_nxt;
+	u32 limit = tcp_notsent_lowat(tp);
+
+	return unsent < limit;
+}
+
#ifdef CONFIG_PROC_FS
extern int tcp4_proc_init(void);
extern void tcp4_proc_exit(void);
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 8d776ebc4829..377f1e59411d 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -111,6 +111,7 @@ enum {
#define TCP_REPAIR_OPTIONS 22
#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */
#define TCP_TIMESTAMP 24
+#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
struct tcp_repair_opt {
__u32 opt_code;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b2c123c44d69..69ed203802da 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -555,6 +555,13 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &one,
},
{
+ .procname = "tcp_notsent_lowat",
+ .data = &sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(sysctl_tcp_notsent_lowat),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5eca9060bb8e..c27e81392398 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2631,6 +2631,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->tsoffset = val - tcp_time_stamp;
break;
+ case TCP_NOTSENT_LOWAT:
+ tp->notsent_lowat = val;
+ sk->sk_write_space(sk);
+ break;
default:
err = -ENOPROTOOPT;
break;
@@ -2847,6 +2851,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
+ case TCP_NOTSENT_LOWAT:
+ val = tp->notsent_lowat;
+ break;
default:
return -ENOPROTOOPT;
}
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 2e3f129df0eb..2a5d5c469d17 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2800,6 +2800,7 @@ struct proto tcp_prot = {
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 92fde8d1aa82..884efff5b531 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,9 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
+EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
+
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 80fe69ef2188..b792e870686b 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1924,6 +1924,7 @@ struct proto tcpv6_prot = {
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
+ .stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,