summary | refs | log | tree | commit | diff
path: root/drivers
diff options
context:
space:
mode:
author: David S. Miller <davem@davemloft.net> 2016-12-03 19:29:37 -0500
committer: David S. Miller <davem@davemloft.net> 2016-12-03 19:29:37 -0500
commit: 69248719d0c27ba7cb734d4e716896369daca98d (patch)
tree: 65da87ee804eacc4e5f6535e1d9bb8927cba244d /drivers
parent: 548ed722465b763841252bf74b7eda52cfc9131c (diff)
parent: c3852ef7f2f8f75a9f85a864bec1f6f5a3068eea (diff)
Merge branch 'fib-notifier-event-replay'
Jiri Pirko says: ==================== ipv4: fib: Replay events when registering FIB notifier Ido says: In kernel 4.9 the switchdev-specific FIB offload mechanism was replaced by a new FIB notification chain to which modules could register in order to be notified about the addition and deletion of FIB entries. The motivation for this change was that switchdev drivers need to be able to reflect the entire FIB table and not only FIBs configured on top of the port netdevs themselves. This is useful in case of in-band management. The fundamental problem with this approach is that upon registration listeners lose all the information previously sent in the chain and thus have an incomplete view of the FIB tables, which can result in packet loss. This patchset fixes that by dumping the FIB tables and replaying notifications previously sent in the chain for the registered notification block. The entire dump process is done under RCU and thus the FIB notification chain is converted to be atomic. The listeners are modified accordingly. This is done in the first eight patches. The ninth patch adds a change sequence counter to ensure the integrity of the FIB dump. The last patch adds the dump itself to the FIB chain registration function and modifies existing listeners to pass a callback to be executed in case the dump was inconsistent. --- v3->v4: - Register the notification block after the dump and protect it using the change sequence counter (Hannes Frederic Sowa). - Since we now integrate the dump into the registration function, drop the sysctl to set maximum number of retries and instead set it to a fixed number. Let's see if it's really a problem before adding something we can never remove. - For the same reason, dump FIB tables for all net namespaces. - Add a comment regarding guarantees provided by mutex semantics. v2->v3: - Add sysctl to set the number of FIB dump retries (Hannes Frederic Sowa).
- Read the sequence counter under RTNL to ensure synchronization between the dump process and other processes changing the routing tables (Hannes Frederic Sowa). - Pass a callback to the dump function to be executed prior to a retry. - Limit the dump to a single net namespace. v1->v2: - Add a sequence counter to ensure the integrity of the FIB dump (David S. Miller, Hannes Frederic Sowa). - Protect notifications from re-ordering in listeners by using an ordered workqueue (Hannes Frederic Sowa). - Introduce fib_info_hold() (Jiri Pirko). - Relieve rocker from the need to invoke the FIB dump by registering to the FIB notification chain prior to ports creation. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/core.c 22
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/core.h 2
-rw-r--r-- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c 92
-rw-r--r-- drivers/net/ethernet/rocker/rocker.h 1
-rw-r--r-- drivers/net/ethernet/rocker/rocker_main.c 84
-rw-r--r-- drivers/net/ethernet/rocker/rocker_ofdpa.c 1
6 files changed, 179 insertions, 23 deletions
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 4dc028bb4a33..57a98849551b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
static struct dentry *mlxsw_core_dbg_root;
static struct workqueue_struct *mlxsw_wq;
+static struct workqueue_struct *mlxsw_owq;
struct mlxsw_core_pcpu_stats {
u64 trap_rx_packets[MLXSW_TRAP_ID_MAX];
@@ -1900,6 +1901,18 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
}
EXPORT_SYMBOL(mlxsw_core_schedule_dw);
+int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay)
+{
+ return queue_delayed_work(mlxsw_owq, dwork, delay);
+}
+EXPORT_SYMBOL(mlxsw_core_schedule_odw);
+
+void mlxsw_core_flush_owq(void)
+{
+ flush_workqueue(mlxsw_owq);
+}
+EXPORT_SYMBOL(mlxsw_core_flush_owq);
+
static int __init mlxsw_core_module_init(void)
{
int err;
@@ -1907,6 +1920,12 @@ static int __init mlxsw_core_module_init(void)
mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0);
if (!mlxsw_wq)
return -ENOMEM;
+ mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM,
+ mlxsw_core_driver_name);
+ if (!mlxsw_owq) {
+ err = -ENOMEM;
+ goto err_alloc_ordered_workqueue;
+ }
mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
if (!mlxsw_core_dbg_root) {
err = -ENOMEM;
@@ -1915,6 +1934,8 @@ static int __init mlxsw_core_module_init(void)
return 0;
err_debugfs_create_dir:
+ destroy_workqueue(mlxsw_owq);
+err_alloc_ordered_workqueue:
destroy_workqueue(mlxsw_wq);
return err;
}
@@ -1922,6 +1943,7 @@ err_debugfs_create_dir:
static void __exit mlxsw_core_module_exit(void)
{
debugfs_remove_recursive(mlxsw_core_dbg_root);
+ destroy_workqueue(mlxsw_owq);
destroy_workqueue(mlxsw_wq);
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e856b49b83de..a7f94fbc898b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
u8 local_port);
int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
+int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay);
+void mlxsw_core_flush_owq(void);
#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 683f0454170c..53126bf68ea9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);
static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
{
+ /* At this stage we're guaranteed not to have new incoming
+ * FIB notifications and the work queue is free from FIBs
+ * sitting on top of mlxsw netdevs. However, we can still
+ * have other FIBs queued. Flush the queue before flushing
+ * the device's tables. No need for locks, as we're the only
+ * writer.
+ */
+ mlxsw_core_flush_owq();
mlxsw_sp_router_fib_flush(mlxsw_sp);
kfree(mlxsw_sp->router.vrs);
}
@@ -1948,33 +1956,89 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
kfree(mlxsw_sp->rifs);
}
-static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
+struct mlxsw_sp_fib_event_work {
+ struct delayed_work dw;
+ struct fib_entry_notifier_info fen_info;
+ struct mlxsw_sp *mlxsw_sp;
+ unsigned long event;
+};
+
+static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
{
- struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
- struct fib_entry_notifier_info *fen_info = ptr;
+ struct mlxsw_sp_fib_event_work *fib_work =
+ container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
+ struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
int err;
- if (!net_eq(fen_info->info.net, &init_net))
- return NOTIFY_DONE;
-
- switch (event) {
+ /* Protect internal structures from changes */
+ rtnl_lock();
+ switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
- err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
+ err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
if (err)
mlxsw_sp_router_fib4_abort(mlxsw_sp);
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_ENTRY_DEL:
- mlxsw_sp_router_fib4_del(mlxsw_sp, fen_info);
+ mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_RULE_ADD: /* fall through */
case FIB_EVENT_RULE_DEL:
mlxsw_sp_router_fib4_abort(mlxsw_sp);
break;
}
+ rtnl_unlock();
+ kfree(fib_work);
+}
+
+/* Called with rcu_read_lock() */
+static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
+ struct mlxsw_sp_fib_event_work *fib_work;
+ struct fib_notifier_info *info = ptr;
+
+ if (!net_eq(info->net, &init_net))
+ return NOTIFY_DONE;
+
+ fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+ if (WARN_ON(!fib_work))
+ return NOTIFY_BAD;
+
+ INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);
+ fib_work->mlxsw_sp = mlxsw_sp;
+ fib_work->event = event;
+
+ switch (event) {
+ case FIB_EVENT_ENTRY_ADD: /* fall through */
+ case FIB_EVENT_ENTRY_DEL:
+ memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
+ /* Take reference on fib_info to prevent it from being
+ * freed while work is queued. Release it afterwards.
+ */
+ fib_info_hold(fib_work->fen_info.fi);
+ break;
+ }
+
+ mlxsw_core_schedule_odw(&fib_work->dw, 0);
+
return NOTIFY_DONE;
}
+static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
+{
+ struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
+
+ /* Flush pending FIB notifications and then flush the device's
+ * table before requesting another dump. The FIB notification
+ * block is unregistered, so no need to take RTNL.
+ */
+ mlxsw_core_flush_owq();
+ mlxsw_sp_router_fib_flush(mlxsw_sp);
+}
+
int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
int err;
@@ -1995,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
goto err_neigh_init;
mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
- register_fib_notifier(&mlxsw_sp->fib_nb);
+ err = register_fib_notifier(&mlxsw_sp->fib_nb,
+ mlxsw_sp_router_fib_dump_flush);
+ if (err)
+ goto err_register_fib_notifier;
+
return 0;
+err_register_fib_notifier:
+ mlxsw_sp_neigh_fini(mlxsw_sp);
err_neigh_init:
mlxsw_sp_vrs_fini(mlxsw_sp);
err_vrs_init:
diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h
index 2eb9b49569d5..ee9675db5bf9 100644
--- a/drivers/net/ethernet/rocker/rocker.h
+++ b/drivers/net/ethernet/rocker/rocker.h
@@ -72,6 +72,7 @@ struct rocker {
struct rocker_dma_ring_info event_ring;
struct notifier_block fib_nb;
struct rocker_world_ops *wops;
+ struct workqueue_struct *rocker_owq;
void *wpriv;
};
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 67df4cf93362..7c450b5a1138 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -28,6 +28,7 @@
#include <linux/if_bridge.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
+#include <linux/workqueue.h>
#include <net/switchdev.h>
#include <net/rtnetlink.h>
#include <net/netevent.h>
@@ -2165,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
.switchdev_port_obj_dump = rocker_port_obj_dump,
};
-static int rocker_router_fib_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
+struct rocker_fib_event_work {
+ struct work_struct work;
+ struct fib_entry_notifier_info fen_info;
+ struct rocker *rocker;
+ unsigned long event;
+};
+
+static void rocker_router_fib_event_work(struct work_struct *work)
{
- struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
- struct fib_entry_notifier_info *fen_info = ptr;
+ struct rocker_fib_event_work *fib_work =
+ container_of(work, struct rocker_fib_event_work, work);
+ struct rocker *rocker = fib_work->rocker;
int err;
- switch (event) {
+ /* Protect internal structures from changes */
+ rtnl_lock();
+ switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
- err = rocker_world_fib4_add(rocker, fen_info);
+ err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
if (err)
rocker_world_fib4_abort(rocker);
- else
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_ENTRY_DEL:
- rocker_world_fib4_del(rocker, fen_info);
+ rocker_world_fib4_del(rocker, &fib_work->fen_info);
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_RULE_ADD: /* fall through */
case FIB_EVENT_RULE_DEL:
rocker_world_fib4_abort(rocker);
break;
}
+ rtnl_unlock();
+ kfree(fib_work);
+}
+
+/* Called with rcu_read_lock() */
+static int rocker_router_fib_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
+ struct rocker_fib_event_work *fib_work;
+
+ fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+ if (WARN_ON(!fib_work))
+ return NOTIFY_BAD;
+
+ INIT_WORK(&fib_work->work, rocker_router_fib_event_work);
+ fib_work->rocker = rocker;
+ fib_work->event = event;
+
+ switch (event) {
+ case FIB_EVENT_ENTRY_ADD: /* fall through */
+ case FIB_EVENT_ENTRY_DEL:
+ memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
+ /* Take reference on fib_info to prevent it from being
+ * freed while work is queued. Release it afterwards.
+ */
+ fib_info_hold(fib_work->fen_info.fi);
+ break;
+ }
+
+ queue_work(rocker->rocker_owq, &fib_work->work);
+
return NOTIFY_DONE;
}
@@ -2754,6 +2797,21 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_request_event_irq;
}
+ rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name,
+ WQ_MEM_RECLAIM);
+ if (!rocker->rocker_owq) {
+ err = -ENOMEM;
+ goto err_alloc_ordered_workqueue;
+ }
+
+ /* Only FIBs pointing to our own netdevs are programmed into
+ * the device, so no need to pass a callback.
+ */
+ rocker->fib_nb.notifier_call = rocker_router_fib_event;
+ err = register_fib_notifier(&rocker->fib_nb, NULL);
+ if (err)
+ goto err_register_fib_notifier;
+
rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
err = rocker_probe_ports(rocker);
@@ -2762,15 +2820,16 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_probe_ports;
}
- rocker->fib_nb.notifier_call = rocker_router_fib_event;
- register_fib_notifier(&rocker->fib_nb);
-
dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
(int)sizeof(rocker->hw.id), &rocker->hw.id);
return 0;
err_probe_ports:
+ unregister_fib_notifier(&rocker->fib_nb);
+err_register_fib_notifier:
+ destroy_workqueue(rocker->rocker_owq);
+err_alloc_ordered_workqueue:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
err_request_event_irq:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
@@ -2796,9 +2855,10 @@ static void rocker_remove(struct pci_dev *pdev)
{
struct rocker *rocker = pci_get_drvdata(pdev);
+ rocker_remove_ports(rocker);
unregister_fib_notifier(&rocker->fib_nb);
rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
- rocker_remove_ports(rocker);
+ destroy_workqueue(rocker->rocker_owq);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
rocker_dma_rings_fini(rocker);
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 4ca461322d60..7cd76b6b5cb9 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2516,6 +2516,7 @@ static void ofdpa_fini(struct rocker *rocker)
int bkt;
del_timer_sync(&ofdpa->fdb_cleanup_timer);
+ flush_workqueue(rocker->rocker_owq);
spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry)