summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
authorBjörn Töpel <bjorn.topel@intel.com>2020-11-30 19:51:56 +0100
committerDaniel Borkmann <daniel@iogearbox.net>2020-12-01 00:09:25 +0100
commit7fd3253a7de6a317a0683f83739479fb880bffc8 (patch)
tree8490a9756c6a4a7f5078fd856f23b568703e3680 /net/core
parent854055c0cf30d732b3514ce7956976f60496b1a1 (diff)
net: Introduce preferred busy-polling
The existing busy-polling mode, enabled by the SO_BUSY_POLL socket option or system-wide using the /proc/sys/net/core/busy_read knob, is an opportunistic. That means that if the NAPI context is not scheduled, it will poll it. If, after busy-polling, the budget is exceeded the busy-polling logic will schedule the NAPI onto the regular softirq handling. One implication of the behavior above is that a busy/heavy loaded NAPI context will never enter/allow for busy-polling. Some applications prefer that most NAPI processing would be done by busy-polling. This series adds a new socket option, SO_PREFER_BUSY_POLL, that works in concert with the napi_defer_hard_irqs and gro_flush_timeout knobs. The napi_defer_hard_irqs and gro_flush_timeout knobs were introduced in commit 6f8b12d661d0 ("net: napi: add hard irqs deferral feature"), and allows for a user to defer interrupts to be enabled and instead schedule the NAPI context from a watchdog timer. When a user enables the SO_PREFER_BUSY_POLL, again with the other knobs enabled, and the NAPI context is being processed by a softirq, the softirq NAPI processing will exit early to allow the busy-polling to be performed. If the application stops performing busy-polling via a system call, the watchdog timer defined by gro_flush_timeout will timeout, and regular softirq handling will resume. In summary; Heavy traffic applications that prefer busy-polling over softirq processing should use this option. Example usage: $ echo 2 | sudo tee /sys/class/net/ens785f1/napi_defer_hard_irqs $ echo 200000 | sudo tee /sys/class/net/ens785f1/gro_flush_timeout Note that the timeout should be larger than the userspace processing window, otherwise the watchdog will timeout and fall back to regular softirq processing. Enable the SO_BUSY_POLL/SO_PREFER_BUSY_POLL options on your socket. Signed-off-by: Björn Töpel <bjorn.topel@intel.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Jakub Kicinski <kuba@kernel.org> Link: https://lore.kernel.org/bpf/20201130185205.196029-2-bjorn.topel@gmail.com
Diffstat (limited to 'net/core')
-rw-r--r--net/core/dev.c78
-rw-r--r--net/core/sock.c9
2 files changed, 72 insertions, 15 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 60d325bda0d7..6f8d2cffb7c5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6458,7 +6458,8 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
- new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED);
+ new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+ NAPIF_STATE_PREFER_BUSY_POLL);
/* If STATE_MISSED was set, leave STATE_SCHED set,
* because we will call napi->poll() one more time.
@@ -6497,8 +6498,29 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#define BUSY_POLL_BUDGET 8
-static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
+static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{
+ if (!skip_schedule) {
+ gro_normal_list(napi);
+ __napi_schedule(napi);
+ return;
+ }
+
+ if (napi->gro_bitmask) {
+ /* flush too old packets
+ * If HZ < 1000, flush all packets.
+ */
+ napi_gro_flush(napi, HZ >= 1000);
+ }
+
+ gro_normal_list(napi);
+ clear_bit(NAPI_STATE_SCHED, &napi->state);
+}
+
+static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll)
+{
+ bool skip_schedule = false;
+ unsigned long timeout;
int rc;
/* Busy polling means there is a high chance device driver hard irq
@@ -6515,6 +6537,15 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
local_bh_disable();
+ if (prefer_busy_poll) {
+ napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
+ timeout = READ_ONCE(napi->dev->gro_flush_timeout);
+ if (napi->defer_hard_irqs_count && timeout) {
+ hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
+ skip_schedule = true;
+ }
+ }
+
/* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/
@@ -6525,19 +6556,14 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock)
*/
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
netpoll_poll_unlock(have_poll_lock);
- if (rc == BUSY_POLL_BUDGET) {
- /* As the whole budget was spent, we still own the napi so can
- * safely handle the rx_list.
- */
- gro_normal_list(napi);
- __napi_schedule(napi);
- }
+ if (rc == BUSY_POLL_BUDGET)
+ __busy_poll_stop(napi, skip_schedule);
local_bh_enable();
}
void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long),
- void *loop_end_arg)
+ void *loop_end_arg, bool prefer_busy_poll)
{
unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget);
@@ -6565,12 +6591,18 @@ restart:
* we avoid dirtying napi->state as much as we can.
*/
if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
- NAPIF_STATE_IN_BUSY_POLL))
+ NAPIF_STATE_IN_BUSY_POLL)) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
if (cmpxchg(&napi->state, val,
val | NAPIF_STATE_IN_BUSY_POLL |
- NAPIF_STATE_SCHED) != val)
+ NAPIF_STATE_SCHED) != val) {
+ if (prefer_busy_poll)
+ set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
goto count;
+ }
have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll;
}
@@ -6588,7 +6620,7 @@ count:
if (unlikely(need_resched())) {
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll);
preempt_enable();
rcu_read_unlock();
cond_resched();
@@ -6599,7 +6631,7 @@ count:
cpu_relax();
}
if (napi_poll)
- busy_poll_stop(napi, have_poll_lock);
+ busy_poll_stop(napi, have_poll_lock, prefer_busy_poll);
preempt_enable();
out:
rcu_read_unlock();
@@ -6650,8 +6682,10 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
if (!napi_disable_pending(napi) &&
- !test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
+ !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
__napi_schedule_irqoff(napi);
+ }
return HRTIMER_NORESTART;
}
@@ -6709,6 +6743,7 @@ void napi_disable(struct napi_struct *n)
hrtimer_cancel(&n->timer);
+ clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);
@@ -6781,6 +6816,19 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
+ /* The NAPI context has more processing work, but busy-polling
+ * is preferred. Exit early.
+ */
+ if (napi_prefer_busy_poll(n)) {
+ if (napi_complete_done(n, work)) {
+ /* If timeout is not set, we need to make sure
+ * that the NAPI is re-scheduled.
+ */
+ napi_schedule(n);
+ }
+ goto out_unlock;
+ }
+
if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
diff --git a/net/core/sock.c b/net/core/sock.c
index 727ea1cc633c..e05f2e52b5a8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1159,6 +1159,12 @@ set_sndbuf:
sk->sk_ll_usec = val;
}
break;
+ case SO_PREFER_BUSY_POLL:
+ if (valbool && !capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
+ break;
#endif
case SO_MAX_PACING_RATE:
@@ -1523,6 +1529,9 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
case SO_BUSY_POLL:
v.val = sk->sk_ll_usec;
break;
+ case SO_PREFER_BUSY_POLL:
+ v.val = READ_ONCE(sk->sk_prefer_busy_poll);
+ break;
#endif
case SO_MAX_PACING_RATE: