summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2016-12-03 19:29:37 -0500
committerDavid S. Miller <davem@davemloft.net>2016-12-03 19:29:37 -0500
commit69248719d0c27ba7cb734d4e716896369daca98d (patch)
tree65da87ee804eacc4e5f6535e1d9bb8927cba244d
parent548ed722465b763841252bf74b7eda52cfc9131c (diff)
parentc3852ef7f2f8f75a9f85a864bec1f6f5a3068eea (diff)
Merge branch 'fib-notifier-event-replay'
Jiri Pirko says: ==================== ipv4: fib: Replay events when registering FIB notifier Ido says: In kernel 4.9 the switchdev-specific FIB offload mechanism was replaced by a new FIB notification chain to which modules could register in order to be notified about the addition and deletion of FIB entries. The motivation for this change was that switchdev drivers need to be able to reflect the entire FIB table and not only FIBs configured on top of the port netdevs themselves. This is useful in case of in-band management. The fundamental problem with this approach is that upon registration listeners lose all the information previously sent in the chain and thus have an incomplete view of the FIB tables, which can result in packet loss. This patchset fixes that by dumping the FIB tables and replaying notifications previously sent in the chain for the registered notification block. The entire dump process is done under RCU and thus the FIB notification chain is converted to be atomic. The listeners are modified accordingly. This is done in the first eight patches. The ninth patch adds a change sequence counter to ensure the integrity of the FIB dump. The last patch adds the dump itself to the FIB chain registration function and modifies existing listeners to pass a callback to be executed in case dump was inconsistent. --- v3->v4: - Register the notification block after the dump and protect it using the change sequence counter (Hannes Frederic Sowa). - Since we now integrate the dump into the registration function, drop the sysctl to set maximum number of retries and instead set it to a fixed number. Lets see if it's really a problem before adding something we can never remove. - For the same reason, dump FIB tables for all net namespaces. - Add a comment regarding guarantees provided by mutex semantics. v2->v3: - Add sysctl to set the number of FIB dump retries (Hannes Frederic Sowa). - Read the sequence counter under RTNL to ensure synchronization between the dump process and other processes changing the routing tables (Hannes Frederic Sowa). - Pass a callback to the dump function to be executed prior to a retry. - Limit the dump to a single net namespace. v1->v2: - Add a sequence counter to ensure the integrity of the FIB dump (David S. Miller, Hannes Frederic Sowa). - Protect notifications from re-ordering in listeners by using an ordered workqueue (Hannes Frederic Sowa). - Introduce fib_info_hold() (Jiri Pirko). - Relieve rocker from the need to invoke the FIB dump by registering to the FIB notification chain prior to ports creation. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/core.c22
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/core.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c92
-rw-r--r--drivers/net/ethernet/rocker/rocker.h1
-rw-r--r--drivers/net/ethernet/rocker/rocker_main.c84
-rw-r--r--drivers/net/ethernet/rocker/rocker_ofdpa.c1
-rw-r--r--include/net/ip_fib.h8
-rw-r--r--include/net/netns/ipv4.h3
-rw-r--r--net/ipv4/fib_frontend.c2
-rw-r--r--net/ipv4/fib_semantics.c1
-rw-r--r--net/ipv4/fib_trie.c155
11 files changed, 342 insertions, 29 deletions
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c b/drivers/net/ethernet/mellanox/mlxsw/core.c
index 4dc028bb4a33..57a98849551b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.c
@@ -77,6 +77,7 @@ static const char mlxsw_core_driver_name[] = "mlxsw_core";
static struct dentry *mlxsw_core_dbg_root;
static struct workqueue_struct *mlxsw_wq;
+static struct workqueue_struct *mlxsw_owq;
struct mlxsw_core_pcpu_stats {
u64 trap_rx_packets[MLXSW_TRAP_ID_MAX];
@@ -1900,6 +1901,18 @@ int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay)
}
EXPORT_SYMBOL(mlxsw_core_schedule_dw);
+int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay)
+{
+ return queue_delayed_work(mlxsw_owq, dwork, delay);
+}
+EXPORT_SYMBOL(mlxsw_core_schedule_odw);
+
+void mlxsw_core_flush_owq(void)
+{
+ flush_workqueue(mlxsw_owq);
+}
+EXPORT_SYMBOL(mlxsw_core_flush_owq);
+
static int __init mlxsw_core_module_init(void)
{
int err;
@@ -1907,6 +1920,12 @@ static int __init mlxsw_core_module_init(void)
mlxsw_wq = alloc_workqueue(mlxsw_core_driver_name, WQ_MEM_RECLAIM, 0);
if (!mlxsw_wq)
return -ENOMEM;
+ mlxsw_owq = alloc_ordered_workqueue("%s_ordered", WQ_MEM_RECLAIM,
+ mlxsw_core_driver_name);
+ if (!mlxsw_owq) {
+ err = -ENOMEM;
+ goto err_alloc_ordered_workqueue;
+ }
mlxsw_core_dbg_root = debugfs_create_dir(mlxsw_core_driver_name, NULL);
if (!mlxsw_core_dbg_root) {
err = -ENOMEM;
@@ -1915,6 +1934,8 @@ static int __init mlxsw_core_module_init(void)
return 0;
err_debugfs_create_dir:
+ destroy_workqueue(mlxsw_owq);
+err_alloc_ordered_workqueue:
destroy_workqueue(mlxsw_wq);
return err;
}
@@ -1922,6 +1943,7 @@ err_debugfs_create_dir:
static void __exit mlxsw_core_module_exit(void)
{
debugfs_remove_recursive(mlxsw_core_dbg_root);
+ destroy_workqueue(mlxsw_owq);
destroy_workqueue(mlxsw_wq);
}
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index e856b49b83de..a7f94fbc898b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -207,6 +207,8 @@ enum devlink_port_type mlxsw_core_port_type_get(struct mlxsw_core *mlxsw_core,
u8 local_port);
int mlxsw_core_schedule_dw(struct delayed_work *dwork, unsigned long delay);
+int mlxsw_core_schedule_odw(struct delayed_work *dwork, unsigned long delay);
+void mlxsw_core_flush_owq(void);
#define MLXSW_CONFIG_PROFILE_SWID_COUNT 8
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 683f0454170c..53126bf68ea9 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -593,6 +593,14 @@ static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);
static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
{
+ /* At this stage we're guaranteed not to have new incoming
+ * FIB notifications and the work queue is free from FIBs
+ * sitting on top of mlxsw netdevs. However, we can still
+ * have other FIBs queued. Flush the queue before flushing
+ * the device's tables. No need for locks, as we're the only
+ * writer.
+ */
+ mlxsw_core_flush_owq();
mlxsw_sp_router_fib_flush(mlxsw_sp);
kfree(mlxsw_sp->router.vrs);
}
@@ -1948,33 +1956,89 @@ static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
kfree(mlxsw_sp->rifs);
}
-static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
+struct mlxsw_sp_fib_event_work {
+ struct delayed_work dw;
+ struct fib_entry_notifier_info fen_info;
+ struct mlxsw_sp *mlxsw_sp;
+ unsigned long event;
+};
+
+static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
{
- struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
- struct fib_entry_notifier_info *fen_info = ptr;
+ struct mlxsw_sp_fib_event_work *fib_work =
+ container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
+ struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
int err;
- if (!net_eq(fen_info->info.net, &init_net))
- return NOTIFY_DONE;
-
- switch (event) {
+ /* Protect internal structures from changes */
+ rtnl_lock();
+ switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
- err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info);
+ err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
if (err)
mlxsw_sp_router_fib4_abort(mlxsw_sp);
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_ENTRY_DEL:
- mlxsw_sp_router_fib4_del(mlxsw_sp, fen_info);
+ mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_RULE_ADD: /* fall through */
case FIB_EVENT_RULE_DEL:
mlxsw_sp_router_fib4_abort(mlxsw_sp);
break;
}
+ rtnl_unlock();
+ kfree(fib_work);
+}
+
+/* Called with rcu_read_lock() */
+static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
+ struct mlxsw_sp_fib_event_work *fib_work;
+ struct fib_notifier_info *info = ptr;
+
+ if (!net_eq(info->net, &init_net))
+ return NOTIFY_DONE;
+
+ fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+ if (WARN_ON(!fib_work))
+ return NOTIFY_BAD;
+
+ INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);
+ fib_work->mlxsw_sp = mlxsw_sp;
+ fib_work->event = event;
+
+ switch (event) {
+ case FIB_EVENT_ENTRY_ADD: /* fall through */
+ case FIB_EVENT_ENTRY_DEL:
+ memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
+ /* Take referece on fib_info to prevent it from being
+ * freed while work is queued. Release it afterwards.
+ */
+ fib_info_hold(fib_work->fen_info.fi);
+ break;
+ }
+
+ mlxsw_core_schedule_odw(&fib_work->dw, 0);
+
return NOTIFY_DONE;
}
+static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
+{
+ struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
+
+ /* Flush pending FIB notifications and then flush the device's
+ * table before requesting another dump. The FIB notification
+ * block is unregistered, so no need to take RTNL.
+ */
+ mlxsw_core_flush_owq();
+ mlxsw_sp_router_fib_flush(mlxsw_sp);
+}
+
int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
{
int err;
@@ -1995,9 +2059,15 @@ int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
goto err_neigh_init;
mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
- register_fib_notifier(&mlxsw_sp->fib_nb);
+ err = register_fib_notifier(&mlxsw_sp->fib_nb,
+ mlxsw_sp_router_fib_dump_flush);
+ if (err)
+ goto err_register_fib_notifier;
+
return 0;
+err_register_fib_notifier:
+ mlxsw_sp_neigh_fini(mlxsw_sp);
err_neigh_init:
mlxsw_sp_vrs_fini(mlxsw_sp);
err_vrs_init:
diff --git a/drivers/net/ethernet/rocker/rocker.h b/drivers/net/ethernet/rocker/rocker.h
index 2eb9b49569d5..ee9675db5bf9 100644
--- a/drivers/net/ethernet/rocker/rocker.h
+++ b/drivers/net/ethernet/rocker/rocker.h
@@ -72,6 +72,7 @@ struct rocker {
struct rocker_dma_ring_info event_ring;
struct notifier_block fib_nb;
struct rocker_world_ops *wops;
+ struct workqueue_struct *rocker_owq;
void *wpriv;
};
diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c
index 67df4cf93362..7c450b5a1138 100644
--- a/drivers/net/ethernet/rocker/rocker_main.c
+++ b/drivers/net/ethernet/rocker/rocker_main.c
@@ -28,6 +28,7 @@
#include <linux/if_bridge.h>
#include <linux/bitops.h>
#include <linux/ctype.h>
+#include <linux/workqueue.h>
#include <net/switchdev.h>
#include <net/rtnetlink.h>
#include <net/netevent.h>
@@ -2165,28 +2166,70 @@ static const struct switchdev_ops rocker_port_switchdev_ops = {
.switchdev_port_obj_dump = rocker_port_obj_dump,
};
-static int rocker_router_fib_event(struct notifier_block *nb,
- unsigned long event, void *ptr)
+struct rocker_fib_event_work {
+ struct work_struct work;
+ struct fib_entry_notifier_info fen_info;
+ struct rocker *rocker;
+ unsigned long event;
+};
+
+static void rocker_router_fib_event_work(struct work_struct *work)
{
- struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
- struct fib_entry_notifier_info *fen_info = ptr;
+ struct rocker_fib_event_work *fib_work =
+ container_of(work, struct rocker_fib_event_work, work);
+ struct rocker *rocker = fib_work->rocker;
int err;
- switch (event) {
+ /* Protect internal structures from changes */
+ rtnl_lock();
+ switch (fib_work->event) {
case FIB_EVENT_ENTRY_ADD:
- err = rocker_world_fib4_add(rocker, fen_info);
+ err = rocker_world_fib4_add(rocker, &fib_work->fen_info);
if (err)
rocker_world_fib4_abort(rocker);
- else
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_ENTRY_DEL:
- rocker_world_fib4_del(rocker, fen_info);
+ rocker_world_fib4_del(rocker, &fib_work->fen_info);
+ fib_info_put(fib_work->fen_info.fi);
break;
case FIB_EVENT_RULE_ADD: /* fall through */
case FIB_EVENT_RULE_DEL:
rocker_world_fib4_abort(rocker);
break;
}
+ rtnl_unlock();
+ kfree(fib_work);
+}
+
+/* Called with rcu_read_lock() */
+static int rocker_router_fib_event(struct notifier_block *nb,
+ unsigned long event, void *ptr)
+{
+ struct rocker *rocker = container_of(nb, struct rocker, fib_nb);
+ struct rocker_fib_event_work *fib_work;
+
+ fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
+ if (WARN_ON(!fib_work))
+ return NOTIFY_BAD;
+
+ INIT_WORK(&fib_work->work, rocker_router_fib_event_work);
+ fib_work->rocker = rocker;
+ fib_work->event = event;
+
+ switch (event) {
+ case FIB_EVENT_ENTRY_ADD: /* fall through */
+ case FIB_EVENT_ENTRY_DEL:
+ memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
+ /* Take referece on fib_info to prevent it from being
+ * freed while work is queued. Release it afterwards.
+ */
+ fib_info_hold(fib_work->fen_info.fi);
+ break;
+ }
+
+ queue_work(rocker->rocker_owq, &fib_work->work);
+
return NOTIFY_DONE;
}
@@ -2754,6 +2797,21 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_request_event_irq;
}
+ rocker->rocker_owq = alloc_ordered_workqueue(rocker_driver_name,
+ WQ_MEM_RECLAIM);
+ if (!rocker->rocker_owq) {
+ err = -ENOMEM;
+ goto err_alloc_ordered_workqueue;
+ }
+
+ /* Only FIBs pointing to our own netdevs are programmed into
+ * the device, so no need to pass a callback.
+ */
+ rocker->fib_nb.notifier_call = rocker_router_fib_event;
+ err = register_fib_notifier(&rocker->fib_nb, NULL);
+ if (err)
+ goto err_register_fib_notifier;
+
rocker->hw.id = rocker_read64(rocker, SWITCH_ID);
err = rocker_probe_ports(rocker);
@@ -2762,15 +2820,16 @@ static int rocker_probe(struct pci_dev *pdev, const struct pci_device_id *id)
goto err_probe_ports;
}
- rocker->fib_nb.notifier_call = rocker_router_fib_event;
- register_fib_notifier(&rocker->fib_nb);
-
dev_info(&pdev->dev, "Rocker switch with id %*phN\n",
(int)sizeof(rocker->hw.id), &rocker->hw.id);
return 0;
err_probe_ports:
+ unregister_fib_notifier(&rocker->fib_nb);
+err_register_fib_notifier:
+ destroy_workqueue(rocker->rocker_owq);
+err_alloc_ordered_workqueue:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
err_request_event_irq:
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
@@ -2796,9 +2855,10 @@ static void rocker_remove(struct pci_dev *pdev)
{
struct rocker *rocker = pci_get_drvdata(pdev);
+ rocker_remove_ports(rocker);
unregister_fib_notifier(&rocker->fib_nb);
rocker_write32(rocker, CONTROL, ROCKER_CONTROL_RESET);
- rocker_remove_ports(rocker);
+ destroy_workqueue(rocker->rocker_owq);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_EVENT), rocker);
free_irq(rocker_msix_vector(rocker, ROCKER_MSIX_VEC_CMD), rocker);
rocker_dma_rings_fini(rocker);
diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c
index 4ca461322d60..7cd76b6b5cb9 100644
--- a/drivers/net/ethernet/rocker/rocker_ofdpa.c
+++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c
@@ -2516,6 +2516,7 @@ static void ofdpa_fini(struct rocker *rocker)
int bkt;
del_timer_sync(&ofdpa->fdb_cleanup_timer);
+ flush_workqueue(rocker->rocker_owq);
spin_lock_irqsave(&ofdpa->flow_tbl_lock, flags);
hash_for_each_safe(ofdpa->flow_tbl, bkt, tmp, flow_entry, entry)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f390c3bb05c5..5f376af377c7 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -221,7 +221,8 @@ enum fib_event_type {
FIB_EVENT_RULE_DEL,
};
-int register_fib_notifier(struct notifier_block *nb);
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb));
int unregister_fib_notifier(struct notifier_block *nb);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info);
@@ -397,6 +398,11 @@ static inline void fib_combine_itag(u32 *itag, const struct fib_result *res)
void free_fib_info(struct fib_info *fi);
+static inline void fib_info_hold(struct fib_info *fi)
+{
+ atomic_inc(&fi->fib_clntref);
+}
+
static inline void fib_info_put(struct fib_info *fi)
{
if (atomic_dec_and_test(&fi->fib_clntref))
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 7adf4386ac8f..f0cf5a1b777e 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -135,6 +135,9 @@ struct netns_ipv4 {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int sysctl_fib_multipath_use_neigh;
#endif
+
+ unsigned int fib_seq; /* protected by rtnl_mutex */
+
atomic_t rt_genid;
};
#endif
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 121384bbb40b..dbad5a1c161a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1219,6 +1219,8 @@ static int __net_init ip_fib_net_init(struct net *net)
int err;
size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+ net->ipv4.fib_seq = 0;
+
/* Avoid false sharing : Use at least a full cache line */
size = max_t(size_t, size, L1_CACHE_BYTES);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 388d3e21629b..c1bc1e92de0e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -234,6 +234,7 @@ void free_fib_info(struct fib_info *fi)
#endif
call_rcu(&fi->rcu, free_fib_info_rcu);
}
+EXPORT_SYMBOL_GPL(free_fib_info);
void fib_release_info(struct fib_info *fi)
{
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 026f309c51e9..73a62700b00a 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -84,25 +84,114 @@
#include <trace/events/fib.h>
#include "fib_lookup.h"
-static BLOCKING_NOTIFIER_HEAD(fib_chain);
+static unsigned int fib_seq_sum(void)
+{
+ unsigned int fib_seq = 0;
+ struct net *net;
+
+ rtnl_lock();
+ for_each_net(net)
+ fib_seq += net->ipv4.fib_seq;
+ rtnl_unlock();
+
+ return fib_seq;
+}
+
+static ATOMIC_NOTIFIER_HEAD(fib_chain);
-int register_fib_notifier(struct notifier_block *nb)
+static int call_fib_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type,
+ struct fib_notifier_info *info)
{
- return blocking_notifier_chain_register(&fib_chain, nb);
+ info->net = net;
+ return nb->notifier_call(nb, event_type, info);
+}
+
+static void fib_rules_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+ struct fib_notifier_info info;
+
+ if (net->ipv4.fib_has_custom_rules)
+ call_fib_notifier(nb, net, event_type, &info);
+#endif
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type);
+
+static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net,
+ enum fib_event_type event_type, u32 dst,
+ int dst_len, struct fib_info *fi,
+ u8 tos, u8 type, u32 tb_id, u32 nlflags)
+{
+ struct fib_entry_notifier_info info = {
+ .dst = dst,
+ .dst_len = dst_len,
+ .fi = fi,
+ .tos = tos,
+ .type = type,
+ .tb_id = tb_id,
+ .nlflags = nlflags,
+ };
+ return call_fib_notifier(nb, net, event_type, &info.info);
+}
+
+static bool fib_dump_is_consistent(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb),
+ unsigned int fib_seq)
+{
+ atomic_notifier_chain_register(&fib_chain, nb);
+ if (fib_seq == fib_seq_sum())
+ return true;
+ atomic_notifier_chain_unregister(&fib_chain, nb);
+ if (cb)
+ cb(nb);
+ return false;
+}
+
+#define FIB_DUMP_MAX_RETRIES 5
+int register_fib_notifier(struct notifier_block *nb,
+ void (*cb)(struct notifier_block *nb))
+{
+ int retries = 0;
+
+ do {
+ unsigned int fib_seq = fib_seq_sum();
+ struct net *net;
+
+ /* Mutex semantics guarantee that every change done to
+ * FIB tries before we read the change sequence counter
+ * is now visible to us.
+ */
+ rcu_read_lock();
+ for_each_net_rcu(net) {
+ fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD);
+ fib_notify(net, nb, FIB_EVENT_ENTRY_ADD);
+ }
+ rcu_read_unlock();
+
+ if (fib_dump_is_consistent(nb, cb, fib_seq))
+ return 0;
+ } while (++retries < FIB_DUMP_MAX_RETRIES);
+
+ return -EBUSY;
}
EXPORT_SYMBOL(register_fib_notifier);
int unregister_fib_notifier(struct notifier_block *nb)
{
- return blocking_notifier_chain_unregister(&fib_chain, nb);
+ return atomic_notifier_chain_unregister(&fib_chain, nb);
}
EXPORT_SYMBOL(unregister_fib_notifier);
int call_fib_notifiers(struct net *net, enum fib_event_type event_type,
struct fib_notifier_info *info)
{
+ net->ipv4.fib_seq++;
info->net = net;
- return blocking_notifier_call_chain(&fib_chain, event_type, info);
+ return atomic_notifier_call_chain(&fib_chain, event_type, info);
}
static int call_fib_entry_notifiers(struct net *net,
@@ -1901,6 +1990,62 @@ int fib_table_flush(struct net *net, struct fib_table *tb)
return found;
}
+static void fib_leaf_notify(struct net *net, struct key_vector *l,
+ struct fib_table *tb, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct fib_alias *fa;
+
+ hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) {
+ struct fib_info *fi = fa->fa_info;
+
+ if (!fi)
+ continue;
+
+ /* local and main table can share the same trie,
+ * so don't notify twice for the same entry.
+ */
+ if (tb->tb_id != fa->tb_id)
+ continue;
+
+ call_fib_entry_notifier(nb, net, event_type, l->key,
+ KEYLENGTH - fa->fa_slen, fi, fa->fa_tos,
+ fa->fa_type, fa->tb_id, 0);
+ }
+}
+
+static void fib_table_notify(struct net *net, struct fib_table *tb,
+ struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ struct trie *t = (struct trie *)tb->tb_data;
+ struct key_vector *l, *tp = t->kv;
+ t_key key = 0;
+
+ while ((l = leaf_walk_rcu(&tp, key)) != NULL) {
+ fib_leaf_notify(net, l, tb, nb, event_type);
+
+ key = l->key + 1;
+ /* stop in case of wrap around */
+ if (key < l->key)
+ break;
+ }
+}
+
+static void fib_notify(struct net *net, struct notifier_block *nb,
+ enum fib_event_type event_type)
+{
+ unsigned int h;
+
+ for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
+ struct hlist_head *head = &net->ipv4.fib_table_hash[h];
+ struct fib_table *tb;
+
+ hlist_for_each_entry_rcu(tb, head, tb_hlist)
+ fib_table_notify(net, tb, nb, event_type);
+ }
+}
+
static void __trie_free_rcu(struct rcu_head *head)
{
struct fib_table *tb = container_of(head, struct fib_table, rcu);