From b423cb10807b3a24b909c40f4372df88118a9875 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:44:58 +0100 Subject: ipv4: fib: Export free_fib_info() The FIB notification chain is going to be converted to an atomic chain, which means switchdev drivers will have to offload FIB entries in deferred work, as hardware operations entail sleeping. However, while the work is queued fib info might be freed, so a reference must be taken. To release the reference (and potentially free the fib info) fib_info_put() will be called, which in turn calls free_fib_info(). Export free_fib_info() so that modules will be able to invoke fib_info_put(). Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 388d3e21629b..c1bc1e92de0e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi) #endif call_rcu(&fi->rcu, free_fib_info_rcu); } +EXPORT_SYMBOL_GPL(free_fib_info); void fib_release_info(struct fib_info *fi) { -- cgit v1.2.3 From 1c677b3d2828a84e0360e1a07d1204531540dc03 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:44:59 +0100 Subject: ipv4: fib: Add fib_info_hold() helper As explained in the previous commit, modules are going to need to take a reference on fib info and then drop it using fib_info_put(). Add the fib_info_hold() helper to make the code more readable and also symmetric with fib_info_put(). Signed-off-by: Ido Schimmel Suggested-by: Jiri Pirko Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index f390c3bb05c5..6c67b9391fc0 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -397,6 +397,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) void free_fib_info(struct fib_info *fi); +static inline void fib_info_hold(struct fib_info *fi) +{ + atomic_inc(&fi->fib_clntref); +} + static inline void fib_info_put(struct fib_info *fi) { if (atomic_dec_and_test(&fi->fib_clntref)) -- cgit v1.2.3 From a3832b31898f6936816257c020b1d4f1a07622f7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:00 +0100 Subject: mlxsw: core: Create an ordered workqueue for FIB offload We're going to start processing FIB entries addition / deletion events in deferred work. These work items must be processed in the order they were submitted or otherwise we can have differences between the kernel's FIB table and the device's. Solve this by creating an ordered workqueue to which these work items will be submitted to. Note that we can't simply convert the current workqueue to be ordered, as EMADs re-transmissions are also processed in deferred work. Later on, we can migrate other work items to this workqueue, such as FDB notification processing and nexthop resolution, since they all take the same lock anyway. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/core.c | 22 ++++++++++++++++++++++ drivers/net/ethernet/mellanox/mlxsw/core.h | 2 ++ 2 files changed, 24 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c index 4dc028bb4a33..57a98849551b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core"; static struct dentry *mlxsw_core_dbg_root; static struct workqueue_struct *mlxsw_wq; +static struct workqueue_struct *mlxsw_owq; struct mlxsw_core_pcpu_stats { u64 trap_rx_packets[MLXSW_TRAP_ID_MAX]; @@ -1900,6 +1901,18 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay) } EXPORT_SYMBOL(mlxsw_core_schedule_dw); +int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay) +{ + return queue_delayed_work(mlxsw_owq, dwork, delay); +} +EXPORT_SYMBOL(mlxsw_core_schedule_odw); + +void mlxsw_core_flush_owq(void) +{ + flush_workqueue(mlxsw_owq); +} +EXPORT_SYMBOL(mlxsw_core_flush_owq); + static int __init mlxsw_core_module_init(void) { int err; @@ -1907,6 +1920,12 @@ static int __init mlxsw_core_module_init(void) mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0); if (!mlxsw_wq) return -ENOMEM; + mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM, + mlxsw_core_driver_name); + if (!mlxsw_owq) { + err = -ENOMEM; + goto err_alloc_ordered_workqueue; + } mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL); if (!mlxsw_core_dbg_root) { err = -ENOMEM; @@ -1915,6 +1934,8 @@ static int __init mlxsw_core_module_init(void) return 0; err_debugfs_create_dir: + destroy_workqueue(mlxsw_owq); +err_alloc_ordered_workqueue: destroy_workqueue(mlxsw_wq); return err; } @@ -1922,6 +1943,7 @@ err_debugfs_create_dir: static void __exit mlxsw_core_module_exit(void) { debugfs_remove_recursive(mlxsw_core_dbg_root); + destroy_workqueue(mlxsw_owq); destroy_workqueue(mlxsw_wq); } diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h index e856b49b83de..a7f94fbc898b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.h +++ b/drivers/net/ethernet/mellanox/mlxsw/core.h @@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core, u8 local_port); int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay); +int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay); +void mlxsw_core_flush_owq(void); #define MLXSW_CONFIG_PROFILE_SWID_COUNT 8 -- cgit v1.2.3 From 3057224e014c14921de5ab534d5147eb109fd12b Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:01 +0100 Subject: mlxsw: spectrum_router: Implement FIB offload in deferred work FIB offload is currently done in process context with RTNL held, but we're about to dump the FIB tables in RCU critical section, so we can no longer sleep. Instead, defer the operation to process context using deferred work. Make sure fib info isn't freed while the work is queued by taking a reference on it and releasing it after the operation is done. Deferring the operation is valid because the upper layers always assume the operation was successful. If it's not, then the driver-specific abort mechanism is called and all routed traffic is directed to slow path. The work items are submitted to an ordered workqueue to prevent a mismatch between the kernel's FIB table and the device's. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 72 +++++++++++++++++++--- 1 file changed, 62 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 683f0454170c..14bed1d10b72 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp); static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp) { + /* At this stage we're guaranteed not to have new incoming + * FIB notifications and the work queue is free from FIBs + * sitting on top of mlxsw netdevs. However, we can still + * have other FIBs queued. Flush the queue before flushing + * the device's tables. No need for locks, as we're the only + * writer. + */ + mlxsw_core_flush_owq(); mlxsw_sp_router_fib_flush(mlxsw_sp); kfree(mlxsw_sp->router.vrs); } @@ -1948,30 +1956,74 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp) kfree(mlxsw_sp->rifs); } -static int mlxsw_sp_router_fib_event(struct notifier_block *nb, - unsigned long event, void *ptr) +struct mlxsw_sp_fib_event_work { + struct delayed_work dw; + struct fib_entry_notifier_info fen_info; + struct mlxsw_sp *mlxsw_sp; + unsigned long event; +}; + +static void mlxsw_sp_router_fib_event_work(struct work_struct *work) { - struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb); - struct fib_entry_notifier_info *fen_info = ptr; + struct mlxsw_sp_fib_event_work *fib_work = + container_of(work, struct mlxsw_sp_fib_event_work, dw.work); + struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp; int err; - if (!net_eq(fen_info->info.net, &init_net)) - return NOTIFY_DONE; - - switch (event) { + /* Protect internal structures from changes */ + rtnl_lock(); + switch (fib_work->event) { case FIB_EVENT_ENTRY_ADD: - err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info); + err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info); if (err) mlxsw_sp_router_fib4_abort(mlxsw_sp); + fib_info_put(fib_work->fen_info.fi); break; case FIB_EVENT_ENTRY_DEL: - mlxsw_sp_router_fib4_del(mlxsw_sp, fen_info); + mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info); + fib_info_put(fib_work->fen_info.fi); break; case FIB_EVENT_RULE_ADD: /* fall through */ case FIB_EVENT_RULE_DEL: mlxsw_sp_router_fib4_abort(mlxsw_sp); break; } + rtnl_unlock(); + kfree(fib_work); +} + +/* Called with rcu_read_lock() */ +static int mlxsw_sp_router_fib_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb); + struct mlxsw_sp_fib_event_work *fib_work; + struct fib_notifier_info *info = ptr; + + if (!net_eq(info->net, &init_net)) + return NOTIFY_DONE; + + fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); + if (WARN_ON(!fib_work)) + return NOTIFY_BAD; + + INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work); + fib_work->mlxsw_sp = mlxsw_sp; + fib_work->event = event; + + switch (event) { + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info)); + /* Take referece on fib_info to prevent it from being + * freed while work is queued. Release it afterwards. + */ + fib_info_hold(fib_work->fen_info.fi); + break; + } + + mlxsw_core_schedule_odw(&fib_work->dw, 0); + return NOTIFY_DONE; } -- cgit v1.2.3 From c1bb279cfa9d079bdee35402f5d13210f25f5c80 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:02 +0100 Subject: rocker: Create an ordered workqueue for FIB offload As explained in the previous commits, we need to process FIB entries addition / deletion events in FIFO order or otherwise we can have a mismatch between the kernel's FIB table and the device's. Create an ordered workqueue for rocker to which these work items will be submitted to. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.h | 1 + drivers/net/ethernet/rocker/rocker_main.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h index 2eb9b49569d5..ee9675db5bf9 100644 --- a/drivers/net/ethernet/rocker/rocker.h +++ b/drivers/net/ethernet/rocker/rocker.h @@ -72,6 +72,7 @@ struct rocker { struct rocker_dma_ring_info event_ring; struct notifier_block fib_nb; struct rocker_world_ops *wops; + struct workqueue_struct *rocker_owq; void *wpriv; }; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 67df4cf93362..424be969da3f 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -2754,6 +2755,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_request_event_irq; } + rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name, + WQ_MEM_RECLAIM); + if (!rocker->rocker_owq) { + err = -ENOMEM; + goto err_alloc_ordered_workqueue; + } + rocker->hw.id = rocker_read64(rocker, SWITCH_ID); err = rocker_probe_ports(rocker); @@ -2771,6 +2779,8 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) return 0; err_probe_ports: + destroy_workqueue(rocker->rocker_owq); +err_alloc_ordered_workqueue: free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker); err_request_event_irq: free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker); @@ -2799,6 +2809,7 @@ static void rocker_remove(struct pci_dev *pdev) unregister_fib_notifier(&rocker->fib_nb); rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET); rocker_remove_ports(rocker); + destroy_workqueue(rocker->rocker_owq); free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker); free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker); rocker_dma_rings_fini(rocker); -- cgit v1.2.3 From db7019557cb48508f3ff9d6b40c2e967702897a6 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:03 +0100 Subject: rocker: Implement FIB offload in deferred work Convert rocker to offload FIBs in deferred work in a similar fashion to mlxsw, which was converted in the previous commits. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_main.c | 58 +++++++++++++++++++++++++----- drivers/net/ethernet/rocker/rocker_ofdpa.c | 1 + 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 424be969da3f..914e9e1b01ad 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2166,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = { .switchdev_port_obj_dump = rocker_port_obj_dump, }; -static int rocker_router_fib_event(struct notifier_block *nb, - unsigned long event, void *ptr) +struct rocker_fib_event_work { + struct work_struct work; + struct fib_entry_notifier_info fen_info; + struct rocker *rocker; + unsigned long event; +}; + +static void rocker_router_fib_event_work(struct work_struct *work) { - struct rocker *rocker = container_of(nb, struct rocker, fib_nb); - struct fib_entry_notifier_info *fen_info = ptr; + struct rocker_fib_event_work *fib_work = + container_of(work, struct rocker_fib_event_work, work); + struct rocker *rocker = fib_work->rocker; int err; - switch (event) { + /* Protect internal structures from changes */ + rtnl_lock(); + switch (fib_work->event) { case FIB_EVENT_ENTRY_ADD: - err = rocker_world_fib4_add(rocker, fen_info); + err = rocker_world_fib4_add(rocker, &fib_work->fen_info); if (err) rocker_world_fib4_abort(rocker); - else + fib_info_put(fib_work->fen_info.fi); break; case FIB_EVENT_ENTRY_DEL: - rocker_world_fib4_del(rocker, fen_info); + rocker_world_fib4_del(rocker, &fib_work->fen_info); + fib_info_put(fib_work->fen_info.fi); break; case FIB_EVENT_RULE_ADD: /* fall through */ case FIB_EVENT_RULE_DEL: rocker_world_fib4_abort(rocker); break; } + rtnl_unlock(); + kfree(fib_work); +} + +/* Called with rcu_read_lock() */ +static int rocker_router_fib_event(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct rocker *rocker = container_of(nb, struct rocker, fib_nb); + struct rocker_fib_event_work *fib_work; + + fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC); + if (WARN_ON(!fib_work)) + return NOTIFY_BAD; + + INIT_WORK(&fib_work->work, rocker_router_fib_event_work); + fib_work->rocker = rocker; + fib_work->event = event; + + switch (event) { + case FIB_EVENT_ENTRY_ADD: /* fall through */ + case FIB_EVENT_ENTRY_DEL: + memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info)); + /* Take referece on fib_info to prevent it from being + * freed while work is queued. Release it afterwards. + */ + fib_info_hold(fib_work->fen_info.fi); + break; + } + + queue_work(rocker->rocker_owq, &fib_work->work); + return NOTIFY_DONE; } diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 4ca461322d60..7cd76b6b5cb9 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -2516,6 +2516,7 @@ static void ofdpa_fini(struct rocker *rocker) int bkt; del_timer_sync(&ofdpa->fdb_cleanup_timer); + flush_workqueue(rocker->rocker_owq); spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags); hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry) -- cgit v1.2.3 From 17f8be7dafce4e4b92fbe7bfec605944df290546 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:04 +0100 Subject: rocker: Register FIB notifier before creating ports We can miss FIB notifications sent between the time the ports were created and the FIB notification block registered. Instead of receiving these notifications only when they are replayed for the FIB notification block during registration, just register the notification block before the ports are created. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker_main.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 914e9e1b01ad..8c9c90ae8962 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2804,6 +2804,9 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_alloc_ordered_workqueue; } + rocker->fib_nb.notifier_call = rocker_router_fib_event; + register_fib_notifier(&rocker->fib_nb); + rocker->hw.id = rocker_read64(rocker, SWITCH_ID); err = rocker_probe_ports(rocker); @@ -2812,15 +2815,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_probe_ports; } - rocker->fib_nb.notifier_call = rocker_router_fib_event; - register_fib_notifier(&rocker->fib_nb); - dev_info(&pdev->dev, "Rocker switch with id %*phN\n", (int)sizeof(rocker->hw.id), &rocker->hw.id); return 0; err_probe_ports: + unregister_fib_notifier(&rocker->fib_nb); destroy_workqueue(rocker->rocker_owq); err_alloc_ordered_workqueue: free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker); @@ -2848,9 +2849,9 @@ static void rocker_remove(struct pci_dev *pdev) { struct rocker *rocker = pci_get_drvdata(pdev); + rocker_remove_ports(rocker); unregister_fib_notifier(&rocker->fib_nb); rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET); - rocker_remove_ports(rocker); destroy_workqueue(rocker->rocker_owq); free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker); free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker); -- cgit v1.2.3 From d3f706f68e2fd93f1172fe7fd6f16ba70cc52b31 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:05 +0100 Subject: ipv4: fib: Convert FIB notification chain to be atomic In order not to hold RTNL for long periods of time we're going to dump the FIB tables using RCU. Convert the FIB notification chain to be atomic, as we can't block in RCU critical sections. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 026f309c51e9..9bfce0df20dc 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -84,17 +84,17 @@ #include #include "fib_lookup.h" -static BLOCKING_NOTIFIER_HEAD(fib_chain); +static ATOMIC_NOTIFIER_HEAD(fib_chain); int register_fib_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_register(&fib_chain, nb); + return atomic_notifier_chain_register(&fib_chain, nb); } EXPORT_SYMBOL(register_fib_notifier); int unregister_fib_notifier(struct notifier_block *nb) { - return blocking_notifier_chain_unregister(&fib_chain, nb); + return atomic_notifier_chain_unregister(&fib_chain, nb); } EXPORT_SYMBOL(unregister_fib_notifier); @@ -102,7 +102,7 @@ int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { info->net = net; - return blocking_notifier_call_chain(&fib_chain, event_type, info); + return atomic_notifier_call_chain(&fib_chain, event_type, info); } static int call_fib_entry_notifiers(struct net *net, -- cgit v1.2.3 From cacaad11f43aefbbe5fca00af3b9c16e6aee1ba4 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:06 +0100 Subject: ipv4: fib: Allow for consistent FIB dumping The next patch will enable listeners of the FIB notification chain to request a dump of the FIB tables. However, since RTNL isn't taken during the dump, it's possible for the FIB tables to change mid-dump, which will result in inconsistency between the listener's table and the kernel's. Allow listeners to know about changes that occurred mid-dump, by adding a change sequence counter to each net namespace. The counter is incremented just before a notification is sent in the FIB chain. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 3 +++ net/ipv4/fib_frontend.c | 2 ++ net/ipv4/fib_trie.c | 1 + 3 files changed, 6 insertions(+) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 7adf4386ac8f..f0cf5a1b777e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -135,6 +135,9 @@ struct netns_ipv4 { #ifdef CONFIG_IP_ROUTE_MULTIPATH int sysctl_fib_multipath_use_neigh; #endif + + unsigned int fib_seq; /* protected by rtnl_mutex */ + atomic_t rt_genid; }; #endif diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 121384bbb40b..dbad5a1c161a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1219,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net) int err; size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; + net->ipv4.fib_seq = 0; + /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9bfce0df20dc..28913563e7cd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -101,6 +101,7 @@ EXPORT_SYMBOL(unregister_fib_notifier); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info) { + net->ipv4.fib_seq++; info->net = net; return atomic_notifier_call_chain(&fib_chain, event_type, info); } -- cgit v1.2.3 From c3852ef7f2f8f75a9f85a864bec1f6f5a3068eea Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 3 Dec 2016 16:45:07 +0100 Subject: ipv4: fib: Replay events when registering FIB notifier Commit b90eb7549499 ("fib: introduce FIB notification infrastructure") introduced a new notification chain to notify listeners (f.e., switchdev drivers) about addition and deletion of routes. However, upon registration to the chain the FIB tables can already be populated, which means potential listeners will have an incomplete view of the tables. Solve that by dumping the FIB tables and replaying the events to the passed notification block. The dump itself is done using RCU in order not to starve consumers that need RTNL to make progress. The integrity of the dump is ensured by reading the FIB change sequence counter before and after the dump under RTNL. This allows us to avoid the problematic situation in which the dumping process sends a ENTRY_ADD notification following ENTRY_DEL generated by another process holding RTNL. Callers of the registration function may pass a callback that is executed in case the dump was inconsistent with current FIB tables. The number of retries until a consistent dump is achieved is set to a fixed number to prevent callers from looping for long periods of time. In case current limit proves to be problematic in the future, it can be easily converted to be configurable using a sysctl. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum_router.c | 20 ++- drivers/net/ethernet/rocker/rocker_main.c | 8 +- include/net/ip_fib.h | 3 +- net/ipv4/fib_trie.c | 148 ++++++++++++++++++++- 4 files changed, 174 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 14bed1d10b72..53126bf68ea9 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2027,6 +2027,18 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, return NOTIFY_DONE; } +static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb) +{ + struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb); + + /* Flush pending FIB notifications and then flush the device's + * table before requesting another dump. The FIB notification + * block is unregistered, so no need to take RTNL. + */ + mlxsw_core_flush_owq(); + mlxsw_sp_router_fib_flush(mlxsw_sp); +} + int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) { int err; @@ -2047,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp) goto err_neigh_init; mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event; - register_fib_notifier(&mlxsw_sp->fib_nb); + err = register_fib_notifier(&mlxsw_sp->fib_nb, + mlxsw_sp_router_fib_dump_flush); + if (err) + goto err_register_fib_notifier; + return 0; +err_register_fib_notifier: + mlxsw_sp_neigh_fini(mlxsw_sp); err_neigh_init: mlxsw_sp_vrs_fini(mlxsw_sp); err_vrs_init: diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index 8c9c90ae8962..7c450b5a1138 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2804,8 +2804,13 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto err_alloc_ordered_workqueue; } + /* Only FIBs pointing to our own netdevs are programmed into + * the device, so no need to pass a callback. + */ rocker->fib_nb.notifier_call = rocker_router_fib_event; - register_fib_notifier(&rocker->fib_nb); + err = register_fib_notifier(&rocker->fib_nb, NULL); + if (err) + goto err_register_fib_notifier; rocker->hw.id = rocker_read64(rocker, SWITCH_ID); @@ -2822,6 +2827,7 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id) err_probe_ports: unregister_fib_notifier(&rocker->fib_nb); +err_register_fib_notifier: destroy_workqueue(rocker->rocker_owq); err_alloc_ordered_workqueue: free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker); diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6c67b9391fc0..5f376af377c7 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -221,7 +221,8 @@ enum fib_event_type { FIB_EVENT_RULE_DEL, }; -int register_fib_notifier(struct notifier_block *nb); +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)); int unregister_fib_notifier(struct notifier_block *nb); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 28913563e7cd..73a62700b00a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -84,11 +84,99 @@ #include #include "fib_lookup.h" +static unsigned int fib_seq_sum(void) +{ + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) + fib_seq += net->ipv4.fib_seq; + rtnl_unlock(); + + return fib_seq; +} + static ATOMIC_NOTIFIER_HEAD(fib_chain); -int register_fib_notifier(struct notifier_block *nb) +static int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) { - return atomic_notifier_chain_register(&fib_chain, nb); + info->net = net; + return nb->notifier_call(nb, event_type, info); +} + +static void fib_rules_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type) +{ +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_notifier_info info; + + if (net->ipv4.fib_has_custom_rules) + call_fib_notifier(nb, net, event_type, &info); +#endif +} + +static void fib_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type); + +static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, u32 dst, + int dst_len, struct fib_info *fi, + u8 tos, u8 type, u32 tb_id, u32 nlflags) +{ + struct fib_entry_notifier_info info = { + .dst = dst, + .dst_len = dst_len, + .fi = fi, + .tos = tos, + .type = type, + .tb_id = tb_id, + .nlflags = nlflags, + }; + return call_fib_notifier(nb, net, event_type, &info.info); +} + +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + /* Mutex semantics guarantee that every change done to + * FIB tries before we read the change sequence counter + * is now visible to us. + */ + rcu_read_lock(); + for_each_net_rcu(net) { + fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD); + fib_notify(net, nb, FIB_EVENT_ENTRY_ADD); + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; } EXPORT_SYMBOL(register_fib_notifier); @@ -1902,6 +1990,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb) return found; } +static void fib_leaf_notify(struct net *net, struct key_vector *l, + struct fib_table *tb, struct notifier_block *nb, + enum fib_event_type event_type) +{ + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (!fi) + continue; + + /* local and main table can share the same trie, + * so don't notify twice for the same entry. + */ + if (tb->tb_id != fa->tb_id) + continue; + + call_fib_entry_notifier(nb, net, event_type, l->key, + KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, + fa->fa_type, fa->tb_id, 0); + } +} + +static void fib_table_notify(struct net *net, struct fib_table *tb, + struct notifier_block *nb, + enum fib_event_type event_type) +{ + struct trie *t = (struct trie *)tb->tb_data; + struct key_vector *l, *tp = t->kv; + t_key key = 0; + + while ((l = leaf_walk_rcu(&tp, key)) != NULL) { + fib_leaf_notify(net, l, tb, nb, event_type); + + key = l->key + 1; + /* stop in case of wrap around */ + if (key < l->key) + break; + } +} + +static void fib_notify(struct net *net, struct notifier_block *nb, + enum fib_event_type event_type) +{ + unsigned int h; + + for (h = 0; h < FIB_TABLE_HASHSZ; h++) { + struct hlist_head *head = &net->ipv4.fib_table_hash[h]; + struct fib_table *tb; + + hlist_for_each_entry_rcu(tb, head, tb_hlist) + fib_table_notify(net, tb, nb, event_type); + } +} + static void __trie_free_rcu(struct rcu_head *head) { struct fib_table *tb = container_of(head, struct fib_table, rcu); -- cgit v1.2.3