summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2021-03-14 15:00:44 -0700
committerDavid S. Miller <davem@davemloft.net>2021-03-14 15:00:44 -0700
commit2117fce81f6b862aac0673abe8df0c60dca64bfa (patch)
tree145db1bc35b99623997e7a7a861500b5c19a6c25
parentc6baf7eeb0cf82f6a90a703f6548250fc85cfdcc (diff)
parentbb24d592e66eb059e086595510e0f0b064e4114d (diff)
Merge branch 'psample-Add-additional-metadata-attributes'
Ido Schimmel says: ==================== psample: Add additional metadata attributes This series extends the psample module to expose additional metadata to user space for packets sampled via act_sample. The new metadata (e.g., transit delay) can then be consumed by applications such as hsflowd [1] for better network observability. netdevsim is extended with a dummy psample implementation that periodically reports "sampled" packets to the psample module. In addition to testing of the psample module, it enables the development and demonstration of user space applications (e.g., hsflowd) that are interested in the new metadata even without access to specialized hardware (e.g., Spectrum ASIC) that can provide it. mlxsw is also extended to provide the new metadata to psample. A Wireshark dissector for psample netlink packets [2] will be submitted upstream after the kernel patches are accepted. In addition, a libpcap capture module for psample is currently in the works. Eventually, users should be able to run: # tshark -i psample In order to consume sampled packets along with their metadata. Series overview: Patch #1 makes it easier to extend the metadata provided to psample Patch #2 adds the new metadata attributes to psample Patch #3 extends netdevsim to periodically report "sampled" packets to psample. Various debugfs knobs are added to control the reporting Patch #4 adds a selftest over netdevsim Patches #5-#10 gradually add support for the new metadata in mlxsw Patch #11 adds a selftest over mlxsw [1] https://sflow.org/draft4_sflow_transit.txt [2] https://gitlab.com/amitcohen1/wireshark/-/commit/3d711143024e032aef1b056dd23f0266c54fab56 ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--drivers/net/Kconfig1
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/core.h21
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/pci.c55
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/pci_hw.h71
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum.c26
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum.h2
-rw-r--r--drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c71
-rw-r--r--drivers/net/netdevsim/Makefile4
-rw-r--r--drivers/net/netdevsim/dev.c17
-rw-r--r--drivers/net/netdevsim/netdevsim.h15
-rw-r--r--drivers/net/netdevsim/psample.c264
-rw-r--r--include/net/psample.h21
-rw-r--r--include/uapi/linux/psample.h7
-rw-r--r--net/psample/psample.c45
-rw-r--r--net/sched/act_sample.c16
-rwxr-xr-xtools/testing/selftests/drivers/net/mlxsw/tc_sample.sh492
-rwxr-xr-xtools/testing/selftests/drivers/net/netdevsim/psample.sh181
17 files changed, 1256 insertions, 53 deletions
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index bcd31f458d1a..5895905b6aa1 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -579,6 +579,7 @@ config NETDEVSIM
depends on DEBUG_FS
depends on INET
depends on IPV6 || IPV6=n
+ depends on PSAMPLE || PSAMPLE=n
select NET_DEVLINK
help
This driver is a developer testing tool and software model that can
diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.h b/drivers/net/ethernet/mellanox/mlxsw/core.h
index 8af7d9d03475..80712dc803d0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/core.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/core.h
@@ -58,6 +58,25 @@ struct mlxsw_tx_info {
bool is_emad;
};
+struct mlxsw_rx_md_info {
+ u32 cookie_index;
+ u32 latency;
+ u32 tx_congestion;
+ union {
+ /* Valid when 'tx_port_valid' is set. */
+ u16 tx_sys_port;
+ u16 tx_lag_id;
+ };
+ u8 tx_lag_port_index; /* Valid when 'tx_port_is_lag' is set. */
+ u8 tx_tc;
+ u8 latency_valid:1,
+ tx_congestion_valid:1,
+ tx_tc_valid:1,
+ tx_port_valid:1,
+ tx_port_is_lag:1,
+ unused:3;
+};
+
bool mlxsw_core_skb_transmit_busy(struct mlxsw_core *mlxsw_core,
const struct mlxsw_tx_info *tx_info);
int mlxsw_core_skb_transmit(struct mlxsw_core *mlxsw_core, struct sk_buff *skb,
@@ -515,7 +534,7 @@ enum mlxsw_devlink_param_id {
struct mlxsw_skb_cb {
union {
struct mlxsw_tx_info tx_info;
- u32 cookie_index; /* Only used during receive */
+ struct mlxsw_rx_md_info rx_md_info;
};
};
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci.c b/drivers/net/ethernet/mellanox/mlxsw/pci.c
index d0052537e627..8e8456811384 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -540,6 +540,55 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
spin_unlock(&q->lock);
}
+static void mlxsw_pci_cqe_rdq_md_tx_port_init(struct sk_buff *skb,
+ const char *cqe)
+{
+ struct mlxsw_skb_cb *cb = mlxsw_skb_cb(skb);
+
+ if (mlxsw_pci_cqe2_tx_lag_get(cqe)) {
+ cb->rx_md_info.tx_port_is_lag = true;
+ cb->rx_md_info.tx_lag_id = mlxsw_pci_cqe2_tx_lag_id_get(cqe);
+ cb->rx_md_info.tx_lag_port_index =
+ mlxsw_pci_cqe2_tx_lag_subport_get(cqe);
+ } else {
+ cb->rx_md_info.tx_port_is_lag = false;
+ cb->rx_md_info.tx_sys_port =
+ mlxsw_pci_cqe2_tx_system_port_get(cqe);
+ }
+
+ if (cb->rx_md_info.tx_sys_port != MLXSW_PCI_CQE2_TX_PORT_MULTI_PORT &&
+ cb->rx_md_info.tx_sys_port != MLXSW_PCI_CQE2_TX_PORT_INVALID)
+ cb->rx_md_info.tx_port_valid = 1;
+ else
+ cb->rx_md_info.tx_port_valid = 0;
+}
+
+static void mlxsw_pci_cqe_rdq_md_init(struct sk_buff *skb, const char *cqe)
+{
+ struct mlxsw_skb_cb *cb = mlxsw_skb_cb(skb);
+
+ cb->rx_md_info.tx_congestion = mlxsw_pci_cqe2_mirror_cong_get(cqe);
+ if (cb->rx_md_info.tx_congestion != MLXSW_PCI_CQE2_MIRROR_CONG_INVALID)
+ cb->rx_md_info.tx_congestion_valid = 1;
+ else
+ cb->rx_md_info.tx_congestion_valid = 0;
+ cb->rx_md_info.tx_congestion <<= MLXSW_PCI_CQE2_MIRROR_CONG_SHIFT;
+
+ cb->rx_md_info.latency = mlxsw_pci_cqe2_mirror_latency_get(cqe);
+ if (cb->rx_md_info.latency != MLXSW_PCI_CQE2_MIRROR_LATENCY_INVALID)
+ cb->rx_md_info.latency_valid = 1;
+ else
+ cb->rx_md_info.latency_valid = 0;
+
+ cb->rx_md_info.tx_tc = mlxsw_pci_cqe2_mirror_tclass_get(cqe);
+ if (cb->rx_md_info.tx_tc != MLXSW_PCI_CQE2_MIRROR_TCLASS_INVALID)
+ cb->rx_md_info.tx_tc_valid = 1;
+ else
+ cb->rx_md_info.tx_tc_valid = 0;
+
+ mlxsw_pci_cqe_rdq_md_tx_port_init(skb, cqe);
+}
+
static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
struct mlxsw_pci_queue *q,
u16 consumer_counter_limit,
@@ -581,11 +630,15 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
if (mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2)
cookie_index = mlxsw_pci_cqe2_user_def_val_orig_pkt_len_get(cqe);
- mlxsw_skb_cb(skb)->cookie_index = cookie_index;
+ mlxsw_skb_cb(skb)->rx_md_info.cookie_index = cookie_index;
} else if (rx_info.trap_id >= MLXSW_TRAP_ID_MIRROR_SESSION0 &&
rx_info.trap_id <= MLXSW_TRAP_ID_MIRROR_SESSION7 &&
mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) {
rx_info.mirror_reason = mlxsw_pci_cqe2_mirror_reason_get(cqe);
+ mlxsw_pci_cqe_rdq_md_init(skb, cqe);
+ } else if (rx_info.trap_id == MLXSW_TRAP_ID_PKT_SAMPLE &&
+ mlxsw_pci->max_cqe_ver >= MLXSW_PCI_CQE_V2) {
+ mlxsw_pci_cqe_rdq_md_tx_port_init(skb, cqe);
}
byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
diff --git a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
index a2c1fbd3e0d1..7b531228d6c0 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci_hw.h
@@ -173,6 +173,15 @@ MLXSW_ITEM32(pci, cqe, wqe_counter, 0x04, 16, 16);
*/
MLXSW_ITEM32(pci, cqe, byte_count, 0x04, 0, 14);
+#define MLXSW_PCI_CQE2_MIRROR_CONG_INVALID 0xFFFF
+
+/* pci_cqe_mirror_cong_high
+ * Congestion level in units of 8KB of the egress traffic class of the original
+ * packet that does mirroring to the CPU. Value of 0xFFFF means that the
+ * congestion level is invalid.
+ */
+MLXSW_ITEM32(pci, cqe2, mirror_cong_high, 0x08, 16, 4);
+
/* pci_cqe_trap_id
* Trap ID that captured the packet.
*/
@@ -208,6 +217,59 @@ MLXSW_ITEM32(pci, cqe0, dqn, 0x0C, 1, 5);
MLXSW_ITEM32(pci, cqe12, dqn, 0x0C, 1, 6);
mlxsw_pci_cqe_item_helpers(dqn, 0, 12, 12);
+#define MLXSW_PCI_CQE2_MIRROR_TCLASS_INVALID 0x1F
+
+/* pci_cqe_mirror_tclass
+ * The egress traffic class of the original packet that does mirroring to the
+ * CPU. Value of 0x1F means that the traffic class is invalid.
+ */
+MLXSW_ITEM32(pci, cqe2, mirror_tclass, 0x10, 27, 5);
+
+/* pci_cqe_tx_lag
+ * The Tx port of a packet that is mirrored / sampled to the CPU is a LAG.
+ */
+MLXSW_ITEM32(pci, cqe2, tx_lag, 0x10, 24, 1);
+
+/* pci_cqe_tx_lag_subport
+ * The port index within the LAG of a packet that is mirrored / sampled to the
+ * CPU. Reserved when tx_lag is 0.
+ */
+MLXSW_ITEM32(pci, cqe2, tx_lag_subport, 0x10, 16, 8);
+
+#define MLXSW_PCI_CQE2_TX_PORT_MULTI_PORT 0xFFFE
+#define MLXSW_PCI_CQE2_TX_PORT_INVALID 0xFFFF
+
+/* pci_cqe_tx_lag_id
+ * The Tx LAG ID of the original packet that is mirrored / sampled to the CPU.
+ * Value of 0xFFFE means multi-port. Value fo 0xFFFF means that the Tx LAG ID
+ * is invalid. Reserved when tx_lag is 0.
+ */
+MLXSW_ITEM32(pci, cqe2, tx_lag_id, 0x10, 0, 16);
+
+/* pci_cqe_tx_system_port
+ * The Tx port of the original packet that is mirrored / sampled to the CPU.
+ * Value of 0xFFFE means multi-port. Value fo 0xFFFF means that the Tx port is
+ * invalid. Reserved when tx_lag is 1.
+ */
+MLXSW_ITEM32(pci, cqe2, tx_system_port, 0x10, 0, 16);
+
+/* pci_cqe_mirror_cong_low
+ * Congestion level in units of 8KB of the egress traffic class of the original
+ * packet that does mirroring to the CPU. Value of 0xFFFF means that the
+ * congestion level is invalid.
+ */
+MLXSW_ITEM32(pci, cqe2, mirror_cong_low, 0x14, 20, 12);
+
+#define MLXSW_PCI_CQE2_MIRROR_CONG_SHIFT 13 /* Units of 8KB. */
+
+static inline u16 mlxsw_pci_cqe2_mirror_cong_get(const char *cqe)
+{
+ u16 cong_high = mlxsw_pci_cqe2_mirror_cong_high_get(cqe);
+ u16 cong_low = mlxsw_pci_cqe2_mirror_cong_low_get(cqe);
+
+ return cong_high << 12 | cong_low;
+}
+
/* pci_cqe_user_def_val_orig_pkt_len
* When trap_id is an ACL: User defined value from policy engine action.
*/
@@ -218,6 +280,15 @@ MLXSW_ITEM32(pci, cqe2, user_def_val_orig_pkt_len, 0x14, 0, 20);
*/
MLXSW_ITEM32(pci, cqe2, mirror_reason, 0x18, 24, 8);
+#define MLXSW_PCI_CQE2_MIRROR_LATENCY_INVALID 0xFFFFFF
+
+/* pci_cqe_mirror_latency
+ * End-to-end latency of the original packet that does mirroring to the CPU.
+ * Value of 0xFFFFFF means that the latency is invalid. Units are according to
+ * MOGCR.mirror_latency_units.
+ */
+MLXSW_ITEM32(pci, cqe2, mirror_latency, 0x1C, 8, 24);
+
/* pci_cqe_owner
* Ownership bit.
*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
index 93b15b8c007e..6054147fd51c 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
@@ -2212,32 +2212,6 @@ void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb,
mlxsw_sp->ptp_ops->receive(mlxsw_sp, skb, local_port);
}
-void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb,
- u8 local_port)
-{
- struct mlxsw_sp_port *mlxsw_sp_port = mlxsw_sp->ports[local_port];
- struct mlxsw_sp_port_sample *sample;
- u32 size;
-
- if (unlikely(!mlxsw_sp_port)) {
- dev_warn_ratelimited(mlxsw_sp->bus_info->dev, "Port %d: sample skb received for non-existent port\n",
- local_port);
- goto out;
- }
-
- rcu_read_lock();
- sample = rcu_dereference(mlxsw_sp_port->sample);
- if (!sample)
- goto out_unlock;
- size = sample->truncate ? sample->trunc_size : skb->len;
- psample_sample_packet(sample->psample_group, skb, size,
- mlxsw_sp_port->dev->ifindex, 0, sample->rate);
-out_unlock:
- rcu_read_unlock();
-out:
- consume_skb(skb);
-}
-
#define MLXSW_SP_RXL_NO_MARK(_trap_id, _action, _trap_group, _is_ctrl) \
MLXSW_RXL(mlxsw_sp_rx_listener_no_mark_func, _trap_id, _action, \
_is_ctrl, SP_##_trap_group, DISCARD)
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
index bc7006de7873..0082f70daff3 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h
@@ -570,8 +570,6 @@ void mlxsw_sp_rx_listener_no_mark_func(struct sk_buff *skb,
u8 local_port, void *priv);
void mlxsw_sp_ptp_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb,
u8 local_port);
-void mlxsw_sp_sample_receive(struct mlxsw_sp *mlxsw_sp, struct sk_buff *skb,
- u8 local_port);
int mlxsw_sp_port_speed_get(struct mlxsw_sp_port *mlxsw_sp_port, u32 *speed);
int mlxsw_sp_port_ets_set(struct mlxsw_sp_port *mlxsw_sp_port,
enum mlxsw_reg_qeec_hr hr, u8 index, u8 next_index,
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
index e0e6ee58d31a..056201029ce5 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
@@ -108,7 +108,7 @@ static void mlxsw_sp_rx_drop_listener(struct sk_buff *skb, u8 local_port,
static void mlxsw_sp_rx_acl_drop_listener(struct sk_buff *skb, u8 local_port,
void *trap_ctx)
{
- u32 cookie_index = mlxsw_skb_cb(skb)->cookie_index;
+ u32 cookie_index = mlxsw_skb_cb(skb)->rx_md_info.cookie_index;
const struct flow_action_cookie *fa_cookie;
struct devlink_port *in_devlink_port;
struct mlxsw_sp_port *mlxsw_sp_port;
@@ -204,21 +204,86 @@ static void mlxsw_sp_rx_ptp_listener(struct sk_buff *skb, u8 local_port,
mlxsw_sp_ptp_receive(mlxsw_sp, skb, local_port);
}
+static struct mlxsw_sp_port *
+mlxsw_sp_sample_tx_port_get(struct mlxsw_sp *mlxsw_sp,
+ const struct mlxsw_rx_md_info *rx_md_info)
+{
+ u8 local_port;
+
+ if (!rx_md_info->tx_port_valid)
+ return NULL;
+
+ if (rx_md_info->tx_port_is_lag)
+ local_port = mlxsw_core_lag_mapping_get(mlxsw_sp->core,
+ rx_md_info->tx_lag_id,
+ rx_md_info->tx_lag_port_index);
+ else
+ local_port = rx_md_info->tx_sys_port;
+
+ if (local_port >= mlxsw_core_max_ports(mlxsw_sp->core))
+ return NULL;
+
+ return mlxsw_sp->ports[local_port];
+}
+
+/* The latency units are determined according to MOGCR.mirror_latency_units. It
+ * defaults to 64 nanoseconds.
+ */
+#define MLXSW_SP_MIRROR_LATENCY_SHIFT 6
+
+static void mlxsw_sp_psample_md_init(struct mlxsw_sp *mlxsw_sp,
+ struct psample_metadata *md,
+ struct sk_buff *skb, int in_ifindex,
+ bool truncate, u32 trunc_size)
+{
+ struct mlxsw_rx_md_info *rx_md_info = &mlxsw_skb_cb(skb)->rx_md_info;
+ struct mlxsw_sp_port *mlxsw_sp_port;
+
+ md->trunc_size = truncate ? trunc_size : skb->len;
+ md->in_ifindex = in_ifindex;
+ mlxsw_sp_port = mlxsw_sp_sample_tx_port_get(mlxsw_sp, rx_md_info);
+ md->out_ifindex = mlxsw_sp_port && mlxsw_sp_port->dev ?
+ mlxsw_sp_port->dev->ifindex : 0;
+ md->out_tc_valid = rx_md_info->tx_tc_valid;
+ md->out_tc = rx_md_info->tx_tc;
+ md->out_tc_occ_valid = rx_md_info->tx_congestion_valid;
+ md->out_tc_occ = rx_md_info->tx_congestion;
+ md->latency_valid = rx_md_info->latency_valid;
+ md->latency = rx_md_info->latency;
+ md->latency <<= MLXSW_SP_MIRROR_LATENCY_SHIFT;
+}
+
static void mlxsw_sp_rx_sample_listener(struct sk_buff *skb, u8 local_port,
void *trap_ctx)
{
struct mlxsw_sp *mlxsw_sp = devlink_trap_ctx_priv(trap_ctx);
+ struct mlxsw_sp_port *mlxsw_sp_port;
+ struct mlxsw_sp_port_sample *sample;
+ struct psample_metadata md = {};
int err;
err = __mlxsw_sp_rx_no_mark_listener(skb, local_port, trap_ctx);
if (err)
return;
- /* The sample handler expects skb->data to point to the start of the
+ mlxsw_sp_port = mlxsw_sp->ports[local_port];
+ if (!mlxsw_sp_port)
+ goto out;
+
+ sample = rcu_dereference(mlxsw_sp_port->sample);
+ if (!sample)
+ goto out;
+
+ /* The psample module expects skb->data to point to the start of the
* Ethernet header.
*/
skb_push(skb, ETH_HLEN);
- mlxsw_sp_sample_receive(mlxsw_sp, skb, local_port);
+ mlxsw_sp_psample_md_init(mlxsw_sp, &md, skb,
+ mlxsw_sp_port->dev->ifindex, sample->truncate,
+ sample->trunc_size);
+ psample_sample_packet(sample->psample_group, skb, sample->rate, &md);
+out:
+ consume_skb(skb);
}
#define MLXSW_SP_TRAP_DROP(_id, _group_id) \
diff --git a/drivers/net/netdevsim/Makefile b/drivers/net/netdevsim/Makefile
index ade086eed955..a1cbfa44a1e1 100644
--- a/drivers/net/netdevsim/Makefile
+++ b/drivers/net/netdevsim/Makefile
@@ -13,3 +13,7 @@ endif
ifneq ($(CONFIG_XFRM_OFFLOAD),)
netdevsim-objs += ipsec.o
endif
+
+ifneq ($(CONFIG_PSAMPLE),)
+netdevsim-objs += psample.o
+endif
diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c
index dbeb29fa16e8..6189a4c0d39e 100644
--- a/drivers/net/netdevsim/dev.c
+++ b/drivers/net/netdevsim/dev.c
@@ -1032,10 +1032,14 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev,
if (err)
goto err_fib_destroy;
- err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count);
+ err = nsim_dev_psample_init(nsim_dev);
if (err)
goto err_health_exit;
+ err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count);
+ if (err)
+ goto err_psample_exit;
+
nsim_dev->take_snapshot = debugfs_create_file("take_snapshot",
0200,
nsim_dev->ddir,
@@ -1043,6 +1047,8 @@ static int nsim_dev_reload_create(struct nsim_dev *nsim_dev,
&nsim_dev_take_snapshot_fops);
return 0;
+err_psample_exit:
+ nsim_dev_psample_exit(nsim_dev);
err_health_exit:
nsim_dev_health_exit(nsim_dev);
err_fib_destroy:
@@ -1118,14 +1124,20 @@ int nsim_dev_probe(struct nsim_bus_dev *nsim_bus_dev)
if (err)
goto err_health_exit;
- err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count);
+ err = nsim_dev_psample_init(nsim_dev);
if (err)
goto err_bpf_dev_exit;
+ err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count);
+ if (err)
+ goto err_psample_exit;
+
devlink_params_publish(devlink);
devlink_reload_enable(devlink);
return 0;
+err_psample_exit:
+ nsim_dev_psample_exit(nsim_dev);
err_bpf_dev_exit:
nsim_bpf_dev_exit(nsim_dev);
err_health_exit:
@@ -1158,6 +1170,7 @@ static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev)
return;
debugfs_remove(nsim_dev->take_snapshot);
nsim_dev_port_del_all(nsim_dev);
+ nsim_dev_psample_exit(nsim_dev);
nsim_dev_health_exit(nsim_dev);
nsim_fib_destroy(devlink, nsim_dev->fib_data);
nsim_dev_traps_exit(devlink);
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index 48163c5f2ec9..d735c21def4b 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -180,6 +180,20 @@ struct nsim_dev_health {
int nsim_dev_health_init(struct nsim_dev *nsim_dev, struct devlink *devlink);
void nsim_dev_health_exit(struct nsim_dev *nsim_dev);
+#if IS_ENABLED(CONFIG_PSAMPLE)
+int nsim_dev_psample_init(struct nsim_dev *nsim_dev);
+void nsim_dev_psample_exit(struct nsim_dev *nsim_dev);
+#else
+static inline int nsim_dev_psample_init(struct nsim_dev *nsim_dev)
+{
+ return 0;
+}
+
+static inline void nsim_dev_psample_exit(struct nsim_dev *nsim_dev)
+{
+}
+#endif
+
struct nsim_dev_port {
struct list_head list;
struct devlink_port devlink_port;
@@ -229,6 +243,7 @@ struct nsim_dev {
bool static_iana_vxlan;
u32 sleep;
} udp_ports;
+ struct nsim_dev_psample *psample;
};
static inline struct net *nsim_dev_net(struct nsim_dev *nsim_dev)
diff --git a/drivers/net/netdevsim/psample.c b/drivers/net/netdevsim/psample.c
new file mode 100644
index 000000000000..5ec3bd7f891b
--- /dev/null
+++ b/drivers/net/netdevsim/psample.c
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2021 Mellanox Technologies. All rights reserved */
+
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/etherdevice.h>
+#include <linux/inet.h>
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/devlink.h>
+#include <net/ip.h>
+#include <net/psample.h>
+#include <uapi/linux/ip.h>
+#include <uapi/linux/udp.h>
+
+#include "netdevsim.h"
+
+#define NSIM_PSAMPLE_REPORT_INTERVAL_MS 100
+#define NSIM_PSAMPLE_INVALID_TC 0xFFFF
+#define NSIM_PSAMPLE_L4_DATA_LEN 100
+
+struct nsim_dev_psample {
+ struct delayed_work psample_dw;
+ struct dentry *ddir;
+ struct psample_group *group;
+ u32 rate;
+ u32 group_num;
+ u32 trunc_size;
+ int in_ifindex;
+ int out_ifindex;
+ u16 out_tc;
+ u64 out_tc_occ_max;
+ u64 latency_max;
+ bool is_active;
+};
+
+static struct sk_buff *nsim_dev_psample_skb_build(void)
+{
+ int tot_len, data_len = NSIM_PSAMPLE_L4_DATA_LEN;
+ struct sk_buff *skb;
+ struct udphdr *udph;
+ struct ethhdr *eth;
+ struct iphdr *iph;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+ tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + data_len;
+
+ skb_reset_mac_header(skb);
+ eth = skb_put(skb, sizeof(struct ethhdr));
+ eth_random_addr(eth->h_dest);
+ eth_random_addr(eth->h_source);
+ eth->h_proto = htons(ETH_P_IP);
+ skb->protocol = htons(ETH_P_IP);
+
+ skb_set_network_header(skb, skb->len);
+ iph = skb_put(skb, sizeof(struct iphdr));
+ iph->protocol = IPPROTO_UDP;
+ iph->saddr = in_aton("192.0.2.1");
+ iph->daddr = in_aton("198.51.100.1");
+ iph->version = 0x4;
+ iph->frag_off = 0;
+ iph->ihl = 0x5;
+ iph->tot_len = htons(tot_len);
+ iph->id = 0;
+ iph->ttl = 100;
+ iph->check = 0;
+ iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+ skb_set_transport_header(skb, skb->len);
+ udph = skb_put_zero(skb, sizeof(struct udphdr) + data_len);
+ get_random_bytes(&udph->source, sizeof(u16));
+ get_random_bytes(&udph->dest, sizeof(u16));
+ udph->len = htons(sizeof(struct udphdr) + data_len);
+
+ return skb;
+}
+
+static void nsim_dev_psample_md_prepare(const struct nsim_dev_psample *psample,
+ struct psample_metadata *md)
+{
+ md->trunc_size = psample->trunc_size;
+ md->in_ifindex = psample->in_ifindex;
+ md->out_ifindex = psample->out_ifindex;
+
+ if (psample->out_tc != NSIM_PSAMPLE_INVALID_TC) {
+ md->out_tc = psample->out_tc;
+ md->out_tc_valid = 1;
+ }
+
+ if (psample->out_tc_occ_max) {
+ u64 out_tc_occ;
+
+ get_random_bytes(&out_tc_occ, sizeof(u64));
+ md->out_tc_occ = out_tc_occ & (psample->out_tc_occ_max - 1);
+ md->out_tc_occ_valid = 1;
+ }
+
+ if (psample->latency_max) {
+ u64 latency;
+
+ get_random_bytes(&latency, sizeof(u64));
+ md->latency = latency & (psample->latency_max - 1);
+ md->latency_valid = 1;
+ }
+}
+
+static void nsim_dev_psample_report_work(struct work_struct *work)
+{
+ struct nsim_dev_psample *psample;
+ struct psample_metadata md = {};
+ struct sk_buff *skb;
+ unsigned long delay;
+
+ psample = container_of(work, struct nsim_dev_psample, psample_dw.work);
+
+ skb = nsim_dev_psample_skb_build();
+ if (!skb)
+ goto out;
+
+ nsim_dev_psample_md_prepare(psample, &md);
+ psample_sample_packet(psample->group, skb, psample->rate, &md);
+ consume_skb(skb);
+
+out:
+ delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS);
+ schedule_delayed_work(&psample->psample_dw, delay);
+}
+
+static int nsim_dev_psample_enable(struct nsim_dev *nsim_dev)
+{
+ struct nsim_dev_psample *psample = nsim_dev->psample;
+ struct devlink *devlink;
+ unsigned long delay;
+
+ if (psample->is_active)
+ return -EBUSY;
+
+ devlink = priv_to_devlink(nsim_dev);
+ psample->group = psample_group_get(devlink_net(devlink),
+ psample->group_num);
+ if (!psample->group)
+ return -EINVAL;
+
+ delay = msecs_to_jiffies(NSIM_PSAMPLE_REPORT_INTERVAL_MS);
+ schedule_delayed_work(&psample->psample_dw, delay);
+
+ psample->is_active = true;
+
+ return 0;
+}
+
+static int nsim_dev_psample_disable(struct nsim_dev *nsim_dev)
+{
+ struct nsim_dev_psample *psample = nsim_dev->psample;
+
+ if (!psample->is_active)
+ return -EINVAL;
+
+ psample->is_active = false;
+
+ cancel_delayed_work_sync(&psample->psample_dw);
+ psample_group_put(psample->group);
+
+ return 0;
+}
+
+static ssize_t nsim_dev_psample_enable_write(struct file *file,
+ const char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct nsim_dev *nsim_dev = file->private_data;
+ bool enable;
+ int err;
+
+ err = kstrtobool_from_user(data, count, &enable);
+ if (err)
+ return err;
+
+ if (enable)
+ err = nsim_dev_psample_enable(nsim_dev);
+ else
+ err = nsim_dev_psample_disable(nsim_dev);
+
+ return err ? err : count;
+}
+
+static const struct file_operations nsim_psample_enable_fops = {
+ .open = simple_open,
+ .write = nsim_dev_psample_enable_write,
+ .llseek = generic_file_llseek,
+ .owner = THIS_MODULE,
+};
+
+int nsim_dev_psample_init(struct nsim_dev *nsim_dev)
+{
+ struct nsim_dev_psample *psample;
+ int err;
+
+ psample = kzalloc(sizeof(*psample), GFP_KERNEL);
+ if (!psample)
+ return -ENOMEM;
+ nsim_dev->psample = psample;
+
+ INIT_DELAYED_WORK(&psample->psample_dw, nsim_dev_psample_report_work);
+
+ psample->ddir = debugfs_create_dir("psample", nsim_dev->ddir);
+ if (IS_ERR(psample->ddir)) {
+ err = PTR_ERR(psample->ddir);
+ goto err_psample_free;
+ }
+
+ /* Populate sampling parameters with sane defaults. */
+ psample->rate = 100;
+ debugfs_create_u32("rate", 0600, psample->ddir, &psample->rate);
+
+ psample->group_num = 10;
+ debugfs_create_u32("group_num", 0600, psample->ddir,
+ &psample->group_num);
+
+ psample->trunc_size = 0;
+ debugfs_create_u32("trunc_size", 0600, psample->ddir,
+ &psample->trunc_size);
+
+ psample->in_ifindex = 1;
+ debugfs_create_u32("in_ifindex", 0600, psample->ddir,
+ &psample->in_ifindex);
+
+ psample->out_ifindex = 2;
+ debugfs_create_u32("out_ifindex", 0600, psample->ddir,
+ &psample->out_ifindex);
+
+ psample->out_tc = 0;
+ debugfs_create_u16("out_tc", 0600, psample->ddir, &psample->out_tc);
+
+ psample->out_tc_occ_max = 10000;
+ debugfs_create_u64("out_tc_occ_max", 0600, psample->ddir,
+ &psample->out_tc_occ_max);
+
+ psample->latency_max = 50;
+ debugfs_create_u64("latency_max", 0600, psample->ddir,
+ &psample->latency_max);
+
+ debugfs_create_file("enable", 0200, psample->ddir, nsim_dev,
+ &nsim_psample_enable_fops);
+
+ return 0;
+
+err_psample_free:
+ kfree(nsim_dev->psample);
+ return err;
+}
+
+void nsim_dev_psample_exit(struct nsim_dev *nsim_dev)
+{
+ debugfs_remove_recursive(nsim_dev->psample->ddir);
+ if (nsim_dev->psample->is_active) {
+ cancel_delayed_work_sync(&nsim_dev->psample->psample_dw);
+ psample_group_put(nsim_dev->psample->group);
+ }
+ kfree(nsim_dev->psample);
+}
diff --git a/include/net/psample.h b/include/net/psample.h
index 68ae16bb0a4a..e328c5127757 100644
--- a/include/net/psample.h
+++ b/include/net/psample.h
@@ -14,6 +14,19 @@ struct psample_group {
struct rcu_head rcu;
};
+struct psample_metadata {
+ u32 trunc_size;
+ int in_ifindex;
+ int out_ifindex;
+ u16 out_tc;
+ u64 out_tc_occ; /* bytes */
+ u64 latency; /* nanoseconds */
+ u8 out_tc_valid:1,
+ out_tc_occ_valid:1,
+ latency_valid:1,
+ unused:5;
+};
+
struct psample_group *psample_group_get(struct net *net, u32 group_num);
void psample_group_take(struct psample_group *group);
void psample_group_put(struct psample_group *group);
@@ -21,15 +34,13 @@ void psample_group_put(struct psample_group *group);
#if IS_ENABLED(CONFIG_PSAMPLE)
void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
- u32 trunc_size, int in_ifindex, int out_ifindex,
- u32 sample_rate);
+ u32 sample_rate, const struct psample_metadata *md);
#else
static inline void psample_sample_packet(struct psample_group *group,
- struct sk_buff *skb, u32 trunc_size,
- int in_ifindex, int out_ifindex,
- u32 sample_rate)
+ struct sk_buff *skb, u32 sample_rate,
+ const struct psample_metadata *md)
{
}
diff --git a/include/uapi/linux/psample.h b/include/uapi/linux/psample.h
index aea26ab1431c..c6329f71b939 100644
--- a/include/uapi/linux/psample.h
+++ b/include/uapi/linux/psample.h
@@ -16,6 +16,13 @@ enum {
/* commands attributes */
PSAMPLE_ATTR_GROUP_REFCOUNT,
+ PSAMPLE_ATTR_PAD,
+ PSAMPLE_ATTR_OUT_TC, /* u16 */
+ PSAMPLE_ATTR_OUT_TC_OCC, /* u64, bytes */
+ PSAMPLE_ATTR_LATENCY, /* u64, nanoseconds */
+ PSAMPLE_ATTR_TIMESTAMP, /* u64, nanoseconds */
+ PSAMPLE_ATTR_PROTO, /* u16 */
+
__PSAMPLE_ATTR_MAX
};
diff --git a/net/psample/psample.c b/net/psample/psample.c
index 482c07f2766b..118d5d2a81a0 100644
--- a/net/psample/psample.c
+++ b/net/psample/psample.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/module.h>
+#include <linux/timekeeping.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -356,9 +357,12 @@ static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info)
#endif
void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
- u32 trunc_size, int in_ifindex, int out_ifindex,
- u32 sample_rate)
+ u32 sample_rate, const struct psample_metadata *md)
{
+ ktime_t tstamp = ktime_get_real();
+ int out_ifindex = md->out_ifindex;
+ int in_ifindex = md->in_ifindex;
+ u32 trunc_size = md->trunc_size;
#ifdef CONFIG_INET
struct ip_tunnel_info *tun_info;
#endif
@@ -370,10 +374,15 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
meta_len = (in_ifindex ? nla_total_size(sizeof(u16)) : 0) +
(out_ifindex ? nla_total_size(sizeof(u16)) : 0) +
+ (md->out_tc_valid ? nla_total_size(sizeof(u16)) : 0) +
+ (md->out_tc_occ_valid ? nla_total_size_64bit(sizeof(u64)) : 0) +
+ (md->latency_valid ? nla_total_size_64bit(sizeof(u64)) : 0) +
nla_total_size(sizeof(u32)) + /* sample_rate */
nla_total_size(sizeof(u32)) + /* orig_size */
nla_total_size(sizeof(u32)) + /* group_num */
- nla_total_size(sizeof(u32)); /* seq */
+ nla_total_size(sizeof(u32)) + /* seq */
+ nla_total_size_64bit(sizeof(u64)) + /* timestamp */
+ nla_total_size(sizeof(u16)); /* protocol */
#ifdef CONFIG_INET
tun_info = skb_tunnel_info(skb);
@@ -423,6 +432,36 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb,
if (unlikely(ret < 0))
goto error;
+ if (md->out_tc_valid) {
+ ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_OUT_TC, md->out_tc);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ if (md->out_tc_occ_valid) {
+ ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_OUT_TC_OCC,
+ md->out_tc_occ, PSAMPLE_ATTR_PAD);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ if (md->latency_valid) {
+ ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_LATENCY,
+ md->latency, PSAMPLE_ATTR_PAD);
+ if (unlikely(ret < 0))
+ goto error;
+ }
+
+ ret = nla_put_u64_64bit(nl_skb, PSAMPLE_ATTR_TIMESTAMP,
+ ktime_to_ns(tstamp), PSAMPLE_ATTR_PAD);
+ if (unlikely(ret < 0))
+ goto error;
+
+ ret = nla_put_u16(nl_skb, PSAMPLE_ATTR_PROTO,
+ be16_to_cpu(skb->protocol));
+ if (unlikely(ret < 0))
+ goto error;
+
if (data_len) {
int nla_len = nla_total_size(data_len);
struct nlattr *nla;
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index db8ee9e5c8c2..6a0c16e4351d 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -158,10 +158,8 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
{
struct tcf_sample *s = to_sample(a);
struct psample_group *psample_group;
+ struct psample_metadata md = {};
int retval;
- int size;
- int iif;
- int oif;
tcf_lastuse_update(&s->tcf_tm);
bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
@@ -172,20 +170,18 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
/* randomly sample packets according to rate */
if (psample_group && (prandom_u32() % s->rate == 0)) {
if (!skb_at_tc_ingress(skb)) {
- iif = skb->skb_iif;
- oif = skb->dev->ifindex;
+ md.in_ifindex = skb->skb_iif;
+ md.out_ifindex = skb->dev->ifindex;
} else {
- iif = skb->dev->ifindex;
- oif = 0;
+ md.in_ifindex = skb->dev->ifindex;
}
/* on ingress, the mac header gets popped, so push it back */
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_push(skb, skb->mac_len);
- size = s->truncate ? s->trunc_size : skb->len;
- psample_sample_packet(psample_group, skb, size, iif, oif,
- s->rate);
+ md.trunc_size = s->truncate ? s->trunc_size : skb->len;
+ psample_sample_packet(psample_group, skb, s->rate, &md);
if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev))
skb_pull(skb, skb->mac_len);
diff --git a/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh
new file mode 100755
index 000000000000..75d00104f291
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/mlxsw/tc_sample.sh
@@ -0,0 +1,492 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Test that packets are sampled when tc-sample is used and that reported
+# metadata is correct. Two sets of hosts (with and without LAG) are used, since
+# metadata extraction in mlxsw is a bit different when LAG is involved.
+#
+# +---------------------------------+ +---------------------------------+
+# | H1 (vrf) | | H3 (vrf) |
+# | + $h1 | | + $h3_lag |
+# | | 192.0.2.1/28 | | | 192.0.2.17/28 |
+# | | | | | |
+# | | default via 192.0.2.2 | | | default via 192.0.2.18 |
+# +----|----------------------------+ +----|----------------------------+
+# | |
+# +----|-----------------------------------------|----------------------------+
+# | | 192.0.2.2/28 | 192.0.2.18/28 |
+# | + $rp1 + $rp3_lag |
+# | |
+# | + $rp2 + $rp4_lag |
+# | | 198.51.100.2/28 | 198.51.100.18/28 |
+# +----|-----------------------------------------|----------------------------+
+# | |
+# +----|----------------------------+ +----|----------------------------+
+# | | default via 198.51.100.2 | | | default via 198.51.100.18 |
+# | | | | | |
+# | | 198.51.100.1/28 | | | 198.51.100.17/28 |
+# | + $h2 | | + $h4_lag |
+# | H2 (vrf) | | H4 (vrf) |
+# +---------------------------------+ +---------------------------------+
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ tc_sample_rate_test
+ tc_sample_max_rate_test
+ tc_sample_group_conflict_test
+ tc_sample_md_iif_test
+ tc_sample_md_lag_iif_test
+ tc_sample_md_oif_test
+ tc_sample_md_lag_oif_test
+ tc_sample_md_out_tc_test
+ tc_sample_md_out_tc_occ_test
+"
+NUM_NETIFS=8
+CAPTURE_FILE=$(mktemp)
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Available at https://github.com/Mellanox/libpsample
+require_command psample
+
+h1_create()
+{
+ simple_if_init $h1 192.0.2.1/28
+
+ ip -4 route add default vrf v$h1 nexthop via 192.0.2.2
+}
+
+h1_destroy()
+{
+ ip -4 route del default vrf v$h1 nexthop via 192.0.2.2
+
+ simple_if_fini $h1 192.0.2.1/28
+}
+
+h2_create()
+{
+ simple_if_init $h2 198.51.100.1/28
+
+ ip -4 route add default vrf v$h2 nexthop via 198.51.100.2
+}
+
+h2_destroy()
+{
+ ip -4 route del default vrf v$h2 nexthop via 198.51.100.2
+
+ simple_if_fini $h2 198.51.100.1/28
+}
+
+h3_create()
+{
+ ip link set dev $h3 down
+ ip link add name ${h3}_bond type bond mode 802.3ad
+ ip link set dev $h3 master ${h3}_bond
+
+ simple_if_init ${h3}_bond 192.0.2.17/28
+
+ ip -4 route add default vrf v${h3}_bond nexthop via 192.0.2.18
+}
+
+h3_destroy()
+{
+ ip -4 route del default vrf v${h3}_bond nexthop via 192.0.2.18
+
+ simple_if_fini ${h3}_bond 192.0.2.17/28
+
+ ip link set dev $h3 nomaster
+ ip link del dev ${h3}_bond
+}
+
+h4_create()
+{
+ ip link set dev $h4 down
+ ip link add name ${h4}_bond type bond mode 802.3ad
+ ip link set dev $h4 master ${h4}_bond
+
+ simple_if_init ${h4}_bond 198.51.100.17/28
+
+ ip -4 route add default vrf v${h4}_bond nexthop via 198.51.100.18
+}
+
+h4_destroy()
+{
+ ip -4 route del default vrf v${h4}_bond nexthop via 198.51.100.18
+
+ simple_if_fini ${h4}_bond 198.51.100.17/28
+
+ ip link set dev $h4 nomaster
+ ip link del dev ${h4}_bond
+}
+
+router_create()
+{
+ ip link set dev $rp1 up
+ __addr_add_del $rp1 add 192.0.2.2/28
+ tc qdisc add dev $rp1 clsact
+
+ ip link set dev $rp2 up
+ __addr_add_del $rp2 add 198.51.100.2/28
+ tc qdisc add dev $rp2 clsact
+
+ ip link add name ${rp3}_bond type bond mode 802.3ad
+ ip link set dev $rp3 master ${rp3}_bond
+ __addr_add_del ${rp3}_bond add 192.0.2.18/28
+ tc qdisc add dev $rp3 clsact
+ ip link set dev ${rp3}_bond up
+
+ ip link add name ${rp4}_bond type bond mode 802.3ad
+ ip link set dev $rp4 master ${rp4}_bond
+ __addr_add_del ${rp4}_bond add 198.51.100.18/28
+ tc qdisc add dev $rp4 clsact
+ ip link set dev ${rp4}_bond up
+}
+
+router_destroy()
+{
+ ip link set dev ${rp4}_bond down
+ tc qdisc del dev $rp4 clsact
+ __addr_add_del ${rp4}_bond del 198.51.100.18/28
+ ip link set dev $rp4 nomaster
+ ip link del dev ${rp4}_bond
+
+ ip link set dev ${rp3}_bond down
+ tc qdisc del dev $rp3 clsact
+ __addr_add_del ${rp3}_bond del 192.0.2.18/28
+ ip link set dev $rp3 nomaster
+ ip link del dev ${rp3}_bond
+
+ tc qdisc del dev $rp2 clsact
+ __addr_add_del $rp2 del 198.51.100.2/28
+ ip link set dev $rp2 down
+
+ tc qdisc del dev $rp1 clsact
+ __addr_add_del $rp1 del 192.0.2.2/28
+ ip link set dev $rp1 down
+}
+
+setup_prepare()
+{
+ h1=${NETIFS[p1]}
+ rp1=${NETIFS[p2]}
+ rp2=${NETIFS[p3]}
+ h2=${NETIFS[p4]}
+ h3=${NETIFS[p5]}
+ rp3=${NETIFS[p6]}
+ h4=${NETIFS[p7]}
+ rp4=${NETIFS[p8]}
+
+ vrf_prepare
+
+ h1_create
+ h2_create
+ h3_create
+ h4_create
+ router_create
+}
+
+cleanup()
+{
+ pre_cleanup
+
+ rm -f $CAPTURE_FILE
+
+ router_destroy
+ h4_destroy
+ h3_destroy
+ h2_destroy
+ h1_destroy
+
+ vrf_cleanup
+}
+
+psample_capture_start()
+{
+ rm -f $CAPTURE_FILE
+
+ psample &> $CAPTURE_FILE &
+
+ sleep 1
+}
+
+psample_capture_stop()
+{
+ { kill %% && wait %%; } 2>/dev/null
+}
+
+__tc_sample_rate_test()
+{
+ local desc=$1; shift
+ local dip=$1; shift
+ local pkts pct
+
+ RET=0
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 32 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ psample_capture_start
+
+ ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
+ -B $dip -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ pkts=$(grep -e "group 1 " $CAPTURE_FILE | wc -l)
+ pct=$((100 * (pkts - 100) / 100))
+ (( -25 <= pct && pct <= 25))
+ check_err $? "Expected 100 packets, got $pkts packets, which is $pct% off. Required accuracy is +-25%"
+
+ log_test "tc sample rate ($desc)"
+
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_rate_test()
+{
+ __tc_sample_rate_test "forward" 198.51.100.1
+ __tc_sample_rate_test "local receive" 192.0.2.2
+}
+
+tc_sample_max_rate_test()
+{
+ RET=0
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate $((35 * 10 ** 8)) group 1
+ check_err $? "Failed to configure sampling rule with max rate"
+
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate $((35 * 10 ** 8 + 1)) \
+ group 1 &> /dev/null
+ check_fail $? "Managed to configure sampling rate above maximum"
+
+ log_test "tc sample maximum rate"
+}
+
+tc_sample_group_conflict_test()
+{
+ RET=0
+
+ # Test that two sampling rules cannot be configured on the same port
+ # with different groups.
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 1024 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ tc filter add dev $rp1 ingress protocol all pref 2 handle 102 matchall \
+ skip_sw action sample rate 1024 group 2 &> /dev/null
+ check_fail $? "Managed to configure sampling rule with conflicting group"
+
+ log_test "tc sample group conflict test"
+
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_md_iif_test()
+{
+ local rp1_ifindex
+
+ RET=0
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 5 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ psample_capture_start
+
+ ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
+ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ rp1_ifindex=$(ip -j -p link show dev $rp1 | jq '.[]["ifindex"]')
+ grep -q -e "in-ifindex $rp1_ifindex " $CAPTURE_FILE
+ check_err $? "Sampled packets do not have expected in-ifindex"
+
+ log_test "tc sample iif"
+
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_md_lag_iif_test()
+{
+ local rp3_ifindex
+
+ RET=0
+
+ tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 5 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ psample_capture_start
+
+ ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \
+ -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ rp3_ifindex=$(ip -j -p link show dev $rp3 | jq '.[]["ifindex"]')
+ grep -q -e "in-ifindex $rp3_ifindex " $CAPTURE_FILE
+ check_err $? "Sampled packets do not have expected in-ifindex"
+
+ log_test "tc sample lag iif"
+
+ tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_md_oif_test()
+{
+ local rp2_ifindex
+
+ RET=0
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 5 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ psample_capture_start
+
+ ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
+ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ rp2_ifindex=$(ip -j -p link show dev $rp2 | jq '.[]["ifindex"]')
+ grep -q -e "out-ifindex $rp2_ifindex " $CAPTURE_FILE
+ check_err $? "Sampled packets do not have expected out-ifindex"
+
+ log_test "tc sample oif"
+
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_md_lag_oif_test()
+{
+ local rp4_ifindex
+
+ RET=0
+
+ tc filter add dev $rp3 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 5 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ psample_capture_start
+
+ ip vrf exec v${h3}_bond $MZ ${h3}_bond -c 3200 -d 1msec -p 64 \
+ -A 192.0.2.17 -B 198.51.100.17 -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ rp4_ifindex=$(ip -j -p link show dev $rp4 | jq '.[]["ifindex"]')
+ grep -q -e "out-ifindex $rp4_ifindex " $CAPTURE_FILE
+ check_err $? "Sampled packets do not have expected out-ifindex"
+
+ log_test "tc sample lag oif"
+
+ tc filter del dev $rp3 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_md_out_tc_test()
+{
+ RET=0
+
+ # Output traffic class is not supported on Spectrum-1.
+ [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 5 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ # By default, all the packets should go to the same traffic class (0).
+
+ psample_capture_start
+
+ ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
+ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ grep -q -e "out-tc 0 " $CAPTURE_FILE
+ check_err $? "Sampled packets do not have expected out-tc (0)"
+
+ # Map all priorities to highest traffic class (7) and check reported
+ # out-tc.
+ tc qdisc replace dev $rp2 root handle 1: \
+ prio bands 3 priomap 0 0 0 0 0 0 0 0
+
+ psample_capture_start
+
+ ip vrf exec v$h1 $MZ $h1 -c 3200 -d 1msec -p 64 -A 192.0.2.1 \
+ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q
+
+ psample_capture_stop
+
+ grep -q -e "out-tc 7 " $CAPTURE_FILE
+ check_err $? "Sampled packets do not have expected out-tc (7)"
+
+ log_test "tc sample out-tc"
+
+ tc qdisc del dev $rp2 root handle 1:
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+}
+
+tc_sample_md_out_tc_occ_test()
+{
+ local backlog pct occ
+
+ RET=0
+
+ # Output traffic class occupancy is not supported on Spectrum-1.
+ [[ "$DEVLINK_VIDDID" == "15b3:cb84" ]] && return
+
+ tc filter add dev $rp1 ingress protocol all pref 1 handle 101 matchall \
+ skip_sw action sample rate 1024 group 1
+ check_err $? "Failed to configure sampling rule"
+
+ # Configure a shaper on egress to create congestion.
+ tc qdisc replace dev $rp2 root handle 1: \
+ tbf rate 1Mbit burst 256k limit 1M
+
+ psample_capture_start
+
+ ip vrf exec v$h1 $MZ $h1 -c 0 -d 1usec -p 1400 -A 192.0.2.1 \
+ -B 198.51.100.1 -t udp dp=52768,sp=42768 -q &
+
+ # Allow congestion to reach steady state.
+ sleep 10
+
+ backlog=$(tc -j -p -s qdisc show dev $rp2 | jq '.[0]["backlog"]')
+
+ # Kill mausezahn.
+ { kill %% && wait %%; } 2>/dev/null
+
+ psample_capture_stop
+
+ # Record last congestion sample.
+ occ=$(grep -e "out-tc-occ " $CAPTURE_FILE | tail -n 1 | \
+ cut -d ' ' -f 16)
+
+ pct=$((100 * (occ - backlog) / backlog))
+ (( -1 <= pct && pct <= 1))
+ check_err $? "Recorded a congestion of $backlog bytes, but sampled congestion is $occ bytes, which is $pct% off. Required accuracy is +-5%"
+
+ log_test "tc sample out-tc-occ"
+
+ tc qdisc del dev $rp2 root handle 1:
+ tc filter del dev $rp1 ingress protocol all pref 1 handle 101 matchall
+}
+
+trap cleanup EXIT
+
+setup_prepare
+setup_wait
+
+tests_run
+
+exit $EXIT_STATUS
diff --git a/tools/testing/selftests/drivers/net/netdevsim/psample.sh b/tools/testing/selftests/drivers/net/netdevsim/psample.sh
new file mode 100755
index 000000000000..ee10b1a8933c
--- /dev/null
+++ b/tools/testing/selftests/drivers/net/netdevsim/psample.sh
@@ -0,0 +1,181 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# This test is for checking the psample module. It makes use of netdevsim
+# which periodically generates "sampled" packets.
+
+lib_dir=$(dirname $0)/../../../net/forwarding
+
+ALL_TESTS="
+ psample_enable_test
+ psample_group_num_test
+ psample_md_test
+"
+NETDEVSIM_PATH=/sys/bus/netdevsim/
+DEV_ADDR=1337
+DEV=netdevsim${DEV_ADDR}
+DEVLINK_DEV=netdevsim/${DEV}
+SYSFS_NET_DIR=/sys/bus/netdevsim/devices/$DEV/net/
+PSAMPLE_DIR=/sys/kernel/debug/netdevsim/$DEV/psample/
+CAPTURE_FILE=$(mktemp)
+NUM_NETIFS=0
+source $lib_dir/lib.sh
+source $lib_dir/devlink_lib.sh
+
+# Available at https://github.com/Mellanox/libpsample
+require_command psample
+
+psample_capture()
+{
+ rm -f $CAPTURE_FILE
+
+ timeout 2 ip netns exec testns1 psample &> $CAPTURE_FILE
+}
+
+psample_enable_test()
+{
+ RET=0
+
+ echo 1 > $PSAMPLE_DIR/enable
+ check_err $? "Failed to enable sampling when should not"
+
+ echo 1 > $PSAMPLE_DIR/enable 2>/dev/null
+ check_fail $? "Sampling enablement succeeded when should fail"
+
+ psample_capture
+ if [ $(cat $CAPTURE_FILE | wc -l) -eq 0 ]; then
+ check_err 1 "Failed to capture sampled packets"
+ fi
+
+ echo 0 > $PSAMPLE_DIR/enable
+ check_err $? "Failed to disable sampling when should not"
+
+ echo 0 > $PSAMPLE_DIR/enable 2>/dev/null
+ check_fail $? "Sampling disablement succeeded when should fail"
+
+ psample_capture
+ if [ $(cat $CAPTURE_FILE | wc -l) -ne 0 ]; then
+ check_err 1 "Captured sampled packets when should not"
+ fi
+
+ log_test "psample enable / disable"
+}
+
+psample_group_num_test()
+{
+ RET=0
+
+ echo 1234 > $PSAMPLE_DIR/group_num
+ echo 1 > $PSAMPLE_DIR/enable
+
+ psample_capture
+ grep -q -e "group 1234" $CAPTURE_FILE
+ check_err $? "Sampled packets reported with wrong group number"
+
+ # New group number should only be used after disable / enable.
+ echo 4321 > $PSAMPLE_DIR/group_num
+
+ psample_capture
+ grep -q -e "group 4321" $CAPTURE_FILE
+ check_fail $? "Group number changed while sampling is active"
+
+ echo 0 > $PSAMPLE_DIR/enable && echo 1 > $PSAMPLE_DIR/enable
+
+ psample_capture
+ grep -q -e "group 4321" $CAPTURE_FILE
+ check_err $? "Group number did not change after restarting sampling"
+
+ log_test "psample group number"
+
+ echo 0 > $PSAMPLE_DIR/enable
+}
+
+psample_md_test()
+{
+ RET=0
+
+ echo 1 > $PSAMPLE_DIR/enable
+
+ echo 1234 > $PSAMPLE_DIR/in_ifindex
+ echo 4321 > $PSAMPLE_DIR/out_ifindex
+ psample_capture
+
+ grep -q -e "in-ifindex 1234" $CAPTURE_FILE
+ check_err $? "Sampled packets reported with wrong in-ifindex"
+
+ grep -q -e "out-ifindex 4321" $CAPTURE_FILE
+ check_err $? "Sampled packets reported with wrong out-ifindex"
+
+ echo 5 > $PSAMPLE_DIR/out_tc
+ psample_capture
+
+ grep -q -e "out-tc 5" $CAPTURE_FILE
+ check_err $? "Sampled packets reported with wrong out-tc"
+
+ echo $((2**16 - 1)) > $PSAMPLE_DIR/out_tc
+ psample_capture
+
+ grep -q -e "out-tc " $CAPTURE_FILE
+ check_fail $? "Sampled packets reported with out-tc when should not"
+
+ echo 1 > $PSAMPLE_DIR/out_tc
+ echo 10000 > $PSAMPLE_DIR/out_tc_occ_max
+ psample_capture
+
+ grep -q -e "out-tc-occ " $CAPTURE_FILE
+ check_err $? "Sampled packets not reported with out-tc-occ when should"
+
+ echo 0 > $PSAMPLE_DIR/out_tc_occ_max
+ psample_capture
+
+ grep -q -e "out-tc-occ " $CAPTURE_FILE
+ check_fail $? "Sampled packets reported with out-tc-occ when should not"
+
+ echo 10000 > $PSAMPLE_DIR/latency_max
+ psample_capture
+
+ grep -q -e "latency " $CAPTURE_FILE
+ check_err $? "Sampled packets not reported with latency when should"
+
+ echo 0 > $PSAMPLE_DIR/latency_max
+ psample_capture
+
+ grep -q -e "latency " $CAPTURE_FILE
+ check_fail $? "Sampled packets reported with latency when should not"
+
+ log_test "psample metadata"
+
+ echo 0 > $PSAMPLE_DIR/enable
+}
+
+setup_prepare()
+{
+ modprobe netdevsim &> /dev/null
+
+ echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device
+ while [ ! -d $SYSFS_NET_DIR ] ; do :; done
+
+ set -e
+
+ ip netns add testns1
+ devlink dev reload $DEVLINK_DEV netns testns1
+
+ set +e
+}
+
+cleanup()
+{
+ pre_cleanup
+ rm -f $CAPTURE_FILE
+ ip netns del testns1
+ echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device
+ modprobe -r netdevsim &> /dev/null
+}
+
+trap cleanup EXIT
+
+setup_prepare
+
+tests_run
+
+exit $EXIT_STATUS