summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2020-04-23 12:43:20 -0700
committerDavid S. Miller <davem@davemloft.net>2020-04-23 12:43:20 -0700
commit4c532b144fc28af43d11f0c12221db2a8c3ee4e3 (patch)
treee225c4e8ede36140d00cdb3dc1aac52a3788935a
parente6acd2b6e84bf61fef42f99fe3d117fe75701629 (diff)
parentcf4058dbaa18bf8e55b7cb8c04e1c313298cd5b1 (diff)
Merge branch 'net-napi-addition-of-napi_defer_hard_irqs'
Eric Dumazet says: ==================== net: napi: addition of napi_defer_hard_irqs This patch series augments gro_glush_timeout feature with napi_defer_hard_irqs As extensively described in first patch changelog, this can suppresss the chit-chat traffic between NIC and host to signal interrupts and re-arming them, since this can be an issue on high speed NIC with many queues. The last patch in this series converts mlx4 TX completion to napi_complete_done(), to enable this new mechanism. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_tx.c20
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/mlx4_en.h4
-rw-r--r--include/linux/netdevice.h2
-rw-r--r--net/core/dev.c29
-rw-r--r--net/core/net-sysfs.c20
6 files changed, 52 insertions, 25 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index db3552f2d087..787139219813 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -946,7 +946,7 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring];
if (xdp_tx_cq->xdp_busy) {
clean_complete = mlx4_en_process_tx_cq(dev, xdp_tx_cq,
- budget);
+ budget) < budget;
xdp_tx_cq->xdp_busy = !clean_complete;
}
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index 4d5ca302c067..a99d3ed49ed6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -382,8 +382,8 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
return cnt;
}
-bool mlx4_en_process_tx_cq(struct net_device *dev,
- struct mlx4_en_cq *cq, int napi_budget)
+int mlx4_en_process_tx_cq(struct net_device *dev,
+ struct mlx4_en_cq *cq, int napi_budget)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
struct mlx4_cq *mcq = &cq->mcq;
@@ -405,7 +405,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
u32 ring_cons;
if (unlikely(!priv->port_up))
- return true;
+ return 0;
netdev_txq_bql_complete_prefetchw(ring->tx_queue);
@@ -480,7 +480,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
WRITE_ONCE(ring->cons, ring_cons + txbbs_skipped);
if (cq->type == TX_XDP)
- return done < budget;
+ return done;
netdev_tx_completed_queue(ring->tx_queue, packets, bytes);
@@ -492,7 +492,7 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
ring->wake_queue++;
}
- return done < budget;
+ return done;
}
void mlx4_en_tx_irq(struct mlx4_cq *mcq)
@@ -512,14 +512,14 @@ int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget)
struct mlx4_en_cq *cq = container_of(napi, struct mlx4_en_cq, napi);
struct net_device *dev = cq->dev;
struct mlx4_en_priv *priv = netdev_priv(dev);
- bool clean_complete;
+ int work_done;
- clean_complete = mlx4_en_process_tx_cq(dev, cq, budget);
- if (!clean_complete)
+ work_done = mlx4_en_process_tx_cq(dev, cq, budget);
+ if (work_done >= budget)
return budget;
- napi_complete(napi);
- mlx4_en_arm_cq(priv, cq);
+ if (napi_complete_done(napi, work_done))
+ mlx4_en_arm_cq(priv, cq);
return 0;
}
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index 630f15977f09..9f5603612960 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -737,8 +737,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev,
int budget);
int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget);
int mlx4_en_poll_tx_cq(struct napi_struct *napi, int budget);
-bool mlx4_en_process_tx_cq(struct net_device *dev,
- struct mlx4_en_cq *cq, int napi_budget);
+int mlx4_en_process_tx_cq(struct net_device *dev,
+ struct mlx4_en_cq *cq, int napi_budget);
u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
struct mlx4_en_tx_ring *ring,
int index, u64 timestamp,
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 0750b54b3765..5a8d40f1ffe2 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -329,6 +329,7 @@ struct napi_struct {
unsigned long state;
int weight;
+ int defer_hard_irqs_count;
unsigned long gro_bitmask;
int (*poll)(struct napi_struct *, int);
#ifdef CONFIG_NETPOLL
@@ -1995,6 +1996,7 @@ struct net_device {
struct bpf_prog __rcu *xdp_prog;
unsigned long gro_flush_timeout;
+ int napi_defer_hard_irqs;
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
diff --git a/net/core/dev.c b/net/core/dev.c
index fb61522b1ce1..afff16849c26 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff);
bool napi_complete_done(struct napi_struct *n, int work_done)
{
- unsigned long flags, val, new;
+ unsigned long flags, val, new, timeout = 0;
+ bool ret = true;
/*
* 1) Don't let napi dequeue from the cpu poll list
@@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
+ if (work_done) {
+ if (n->gro_bitmask)
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
+ }
+ if (n->defer_hard_irqs_count > 0) {
+ n->defer_hard_irqs_count--;
+ timeout = READ_ONCE(n->dev->gro_flush_timeout);
+ if (timeout)
+ ret = false;
+ }
if (n->gro_bitmask) {
- unsigned long timeout = 0;
-
- if (work_done)
- timeout = n->dev->gro_flush_timeout;
-
/* When the NAPI instance uses a timeout and keeps postponing
* it, we need to bound somehow the time packets are kept in
* the GRO layer
*/
napi_gro_flush(n, !!timeout);
- if (timeout)
- hrtimer_start(&n->timer, ns_to_ktime(timeout),
- HRTIMER_MODE_REL_PINNED);
}
gro_normal_list(n);
@@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
return false;
}
- return true;
+ if (timeout)
+ hrtimer_start(&n->timer, ns_to_ktime(timeout),
+ HRTIMER_MODE_REL_PINNED);
+ return ret;
}
EXPORT_SYMBOL(napi_complete_done);
@@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_bitmask && !napi_disable_pending(napi) &&
+ if (!napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 0d9e46de205e..880e89c894f6 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec);
static int change_gro_flush_timeout(struct net_device *dev, unsigned long val)
{
- dev->gro_flush_timeout = val;
+ WRITE_ONCE(dev->gro_flush_timeout, val);
return 0;
}
@@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
+static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val)
+{
+ WRITE_ONCE(dev->napi_defer_hard_irqs, val);
+ return 0;
+}
+
+static ssize_t napi_defer_hard_irqs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t len)
+{
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+}
+NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec);
+
static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t len)
{
@@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = {
&dev_attr_flags.attr,
&dev_attr_tx_queue_len.attr,
&dev_attr_gro_flush_timeout.attr,
+ &dev_attr_napi_defer_hard_irqs.attr,
&dev_attr_phys_port_id.attr,
&dev_attr_phys_port_name.attr,
&dev_attr_phys_switch_id.attr,