diff options
463 files changed, 15298 insertions, 5583 deletions
diff --git a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt index b878a1e305af..ed1456f5c94d 100644 --- a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt +++ b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt @@ -16,11 +16,13 @@ Required Properties: - clocks: Array of clocks required for SDHC. - Require at least input clock for Xenon IP core. + Require at least input clock for Xenon IP core. For Armada AP806 and + CP110, the AXI clock is also mandatory. - clock-names: Array of names corresponding to clocks property. The input clock for Xenon IP core should be named as "core". + The input clock for the AXI bus must be named as "axi". - reg: * For "marvell,armada-3700-sdhci", two register areas. @@ -106,8 +108,8 @@ Example: compatible = "marvell,armada-ap806-sdhci"; reg = <0xaa0000 0x1000>; interrupts = <GIC_SPI 13 IRQ_TYPE_LEVEL_HIGH> - clocks = <&emmc_clk>; - clock-names = "core"; + clocks = <&emmc_clk>,<&axi_clk>; + clock-names = "core", "axi"; bus-width = <4>; marvell,xenon-phy-slow-mode; marvell,xenon-tun-count = <11>; @@ -126,8 +128,8 @@ Example: interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH> vqmmc-supply = <&sd_vqmmc_regulator>; vmmc-supply = <&sd_vmmc_regulator>; - clocks = <&sdclk>; - clock-names = "core"; + clocks = <&sdclk>, <&axi_clk>; + clock-names = "core", "axi"; bus-width = <4>; marvell,xenon-tun-count = <9>; }; diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 36f528a7fdd6..8caa60734647 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -210,8 +210,11 @@ path as another overlay mount and it may use a lower layer path that is beneath or above the path of another overlay lower layer path. Using an upper layer path and/or a workdir path that are already used by -another overlay mount is not allowed and will fail with EBUSY. Using +another overlay mount is not allowed and may fail with EBUSY. Using partially overlapping paths is not allowed but will not fail with EBUSY. +If files are accessed from two overlayfs mounts which share or overlap the +upper layer and/or workdir path the behavior of the overlay is undefined, +though it will not result in a crash or deadlock. Mounting an overlay using an upper layer path, where the upper layer path was previously used by another mounted overlay in combination with a diff --git a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801 index 0500193434cb..d47702456926 100644 --- a/Documentation/i2c/busses/i2c-i801 +++ b/Documentation/i2c/busses/i2c-i801 @@ -36,6 +36,7 @@ Supported adapters: * Intel Gemini Lake (SOC) * Intel Cannon Lake-H (PCH) * Intel Cannon Lake-LP (PCH) + * Intel Cedar Fork (PCH) Datasheets: Publicly available at the Intel website On Intel Patsburg and later chipsets, both the normal host SMBus controller diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt index 57f52cdce32e..9ba04c0bab8d 100644 --- a/Documentation/networking/bonding.txt +++ b/Documentation/networking/bonding.txt @@ -2387,7 +2387,7 @@ broadcast: Like active-backup, there is not much advantage to this and packet type ID), so in a "gatewayed" configuration, all outgoing traffic will generally use the same device. Incoming traffic may also end up on a single device, but that is - dependent upon the balancing policy of the peer's 8023.ad + dependent upon the balancing policy of the peer's 802.3ad implementation. In a "local" configuration, traffic will be distributed across the devices in the bond. diff --git a/Documentation/networking/netvsc.txt b/Documentation/networking/netvsc.txt index 93560fb1170a..92f5b31392fa 100644 --- a/Documentation/networking/netvsc.txt +++ b/Documentation/networking/netvsc.txt @@ -19,12 +19,12 @@ Features Receive Side Scaling -------------------- - Hyper-V supports receive side scaling. For TCP, packets are - distributed among available queues based on IP address and port + Hyper-V supports receive side scaling. For TCP & UDP, packets can + be distributed among available queues based on IP address and port number. - For UDP, we can switch UDP hash level between L3 and L4 by ethtool - command. UDP over IPv4 and v6 can be set differently. The default + For TCP & UDP, we can switch hash level between L3 and L4 by ethtool + command. TCP/UDP over IPv4 and v6 can be set differently. The default hash level is L4. We currently only allow switching TX hash level from within the guests. diff --git a/MAINTAINERS b/MAINTAINERS index e90cdecd7b5d..3944f1626911 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5264,7 +5264,8 @@ S: Maintained F: drivers/iommu/exynos-iommu.c EZchip NPS platform support -M: Noam Camus <noamc@ezchip.com> +M: Elad Kanfi <eladkan@mellanox.com> +M: Vineet Gupta <vgupta@synopsys.com> S: Supported F: arch/arc/plat-eznps F: arch/arc/boot/dts/eznps.dts @@ -9366,7 +9367,7 @@ NETWORK BLOCK DEVICE (NBD) M: Josef Bacik <jbacik@fb.com> S: Maintained L: linux-block@vger.kernel.org -L: nbd-general@lists.sourceforge.net +L: nbd@other.debian.org F: Documentation/blockdev/nbd.txt F: drivers/block/nbd.c F: include/uapi/linux/nbd.h @@ -1,7 +1,7 @@ VERSION = 4 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Fearless Coyote # *DOCUMENTATION* diff --git a/arch/Kconfig b/arch/Kconfig index 1aafb4efbb51..d789a89cb32c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -937,9 +937,6 @@ config STRICT_MODULE_RWX and non-text memory will be made non-executable. This provides protection against certain security exploits (e.g. writing to text) -config ARCH_WANT_RELAX_ORDER - bool - config ARCH_HAS_REFCOUNT bool help diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index a598641eed98..c84e67fdea09 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -24,7 +24,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select HAVE_FUTEX_CMPXCHG + select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 3a4b52b7e09d..d37f49d6a27f 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -6,8 +6,6 @@ # published by the Free Software Foundation. # -UTS_MACHINE := arc - ifeq ($(CROSS_COMPILE),) ifndef CONFIG_CPU_BIG_ENDIAN CROSS_COMPILE := arc-linux- diff --git a/arch/arc/boot/dts/axs10x_mb.dtsi b/arch/arc/boot/dts/axs10x_mb.dtsi index 2367a67c5f10..e114000a84f5 100644 --- a/arch/arc/boot/dts/axs10x_mb.dtsi +++ b/arch/arc/boot/dts/axs10x_mb.dtsi @@ -44,7 +44,14 @@ mmcclk: mmcclk { compatible = "fixed-clock"; - clock-frequency = <50000000>; + /* + * DW sdio controller has external ciu clock divider + * controlled via register in SDIO IP. It divides + * sdio_ref_clk (which comes from CGU) by 16 for + * default. So default mmcclk clock (which comes + * to sdk_in) is 25000000 Hz. + */ + clock-frequency = <25000000>; #clock-cells = <0>; }; diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 229d13adbce4..8adde1b492f1 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -12,6 +12,7 @@ /dts-v1/; #include <dt-bindings/net/ti-dp83867.h> +#include <dt-bindings/reset/snps,hsdk-reset.h> / { model = "snps,hsdk"; @@ -57,10 +58,10 @@ }; }; - core_clk: core-clk { + input_clk: input-clk { #clock-cells = <0>; compatible = "fixed-clock"; - clock-frequency = <500000000>; + clock-frequency = <33333333>; }; cpu_intc: cpu-interrupt-controller { @@ -102,6 +103,19 @@ ranges = <0x00000000 0xf0000000 0x10000000>; + cgu_rst: reset-controller@8a0 { + compatible = "snps,hsdk-reset"; + #reset-cells = <1>; + reg = <0x8A0 0x4>, <0xFF0 0x4>; + }; + + core_clk: core-clk@0 { + compatible = "snps,hsdk-core-pll-clock"; + reg = <0x00 0x10>, <0x14B8 0x4>; + #clock-cells = <0>; + clocks = <&input_clk>; + }; + serial: serial@5000 { compatible = "snps,dw-apb-uart"; reg = <0x5000 0x100>; @@ -120,7 +134,17 @@ mmcclk_ciu: mmcclk-ciu { compatible = "fixed-clock"; - clock-frequency = <100000000>; + /* + * DW sdio controller has external ciu clock divider + * controlled via register in SDIO IP. Due to its + * unexpected default value (it should devide by 1 + * but it devides by 8) SDIO IP uses wrong clock and + * works unstable (see STAR 9001204800) + * So add temporary fix and change clock frequency + * from 100000000 to 12500000 Hz until we fix dw sdio + * driver itself. + */ + clock-frequency = <12500000>; #clock-cells = <0>; }; @@ -141,6 +165,8 @@ clocks = <&gmacclk>; clock-names = "stmmaceth"; phy-handle = <&phy0>; + resets = <&cgu_rst HSDK_ETH_RESET>; + reset-names = "stmmaceth"; mdio { #address-cells = <1>; diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig index 6980b966a364..ec7c849a5c8e 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -105,7 +105,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig index 2233f5777a71..63d3cf69e0b0 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -104,7 +104,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig index 30a3d4cf53d2..f613ecac14a7 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -107,7 +107,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig index 821a2e562f3f..3507be2af6fe 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -84,5 +84,5 @@ CONFIG_TMPFS=y CONFIG_NFS_FS=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig index 9a3fcf446388..15f0f6b5fec1 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -63,6 +63,7 @@ CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_DW=y # CONFIG_IOMMU_SUPPORT is not set +CONFIG_RESET_HSDK=y CONFIG_EXT3_FS=y CONFIG_VFAT_FS=y CONFIG_TMPFS=y @@ -72,7 +73,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_WARN_DEPRECATED is not set # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig index c0d6a010751a..4fcf4f2503f6 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -94,7 +94,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_SHIRQ=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig index 5c0971787acf..7b71464f6c2f 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -98,7 +98,7 @@ CONFIG_NLS_ISO8859_1=y # CONFIG_ENABLE_MUST_CHECK is not set CONFIG_STRIP_ASM_SYMS=y CONFIG_DEBUG_SHIRQ=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=10 # CONFIG_SCHED_DEBUG is not set # CONFIG_DEBUG_PREEMPT is not set diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index ba8e802dba80..b1c56d35f2a9 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -98,6 +98,7 @@ /* Auxiliary registers */ #define AUX_IDENTITY 4 +#define AUX_EXEC_CTRL 8 #define AUX_INTR_VEC_BASE 0x25 #define AUX_VOL 0x5e @@ -135,12 +136,12 @@ struct bcr_identity { #endif }; -struct bcr_isa { +struct bcr_isa_arcv2 { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int div_rem:4, pad2:4, ldd:1, unalign:1, atomic:1, be:1, - pad1:11, atomic1:1, ver:8; + pad1:12, ver:8; #else - unsigned int ver:8, atomic1:1, pad1:11, be:1, atomic:1, unalign:1, + unsigned int ver:8, pad1:12, be:1, atomic:1, unalign:1, ldd:1, pad2:4, div_rem:4; #endif }; @@ -263,13 +264,13 @@ struct cpuinfo_arc { struct cpuinfo_arc_mmu mmu; struct cpuinfo_arc_bpu bpu; struct bcr_identity core; - struct bcr_isa isa; + struct bcr_isa_arcv2 isa; const char *details, *name; unsigned int vec_base; struct cpuinfo_arc_ccm iccm, dccm; struct { unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, swape:1, pad1:2, - fpu_sp:1, fpu_dp:1, pad2:6, + fpu_sp:1, fpu_dp:1, dual_iss_enb:1, dual_iss_exist:1, pad2:4, debug:1, ap:1, smart:1, rtt:1, pad3:4, timer0:1, timer1:1, rtc:1, gfrc:1, pad4:4; } extn; diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 877cec8f5ea2..fb83844daeea 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -51,6 +51,7 @@ static const struct id_to_str arc_cpu_rel[] = { { 0x51, "R2.0" }, { 0x52, "R2.1" }, { 0x53, "R3.0" }, + { 0x54, "R4.0" }, #endif { 0x00, NULL } }; @@ -62,6 +63,7 @@ static const struct id_to_str arc_cpu_nm[] = { #else { 0x40, "ARC EM" }, { 0x50, "ARC HS38" }, + { 0x54, "ARC HS48" }, #endif { 0x00, "Unknown" } }; @@ -119,11 +121,11 @@ static void read_arc_build_cfg_regs(void) struct bcr_generic bcr; struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; const struct id_to_str *tbl; + struct bcr_isa_arcv2 isa; FIX_PTR(cpu); READ_BCR(AUX_IDENTITY, cpu->core); - READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa); for (tbl = &arc_cpu_rel[0]; tbl->id != 0; tbl++) { if (cpu->core.family == tbl->id) { @@ -133,7 +135,7 @@ static void read_arc_build_cfg_regs(void) } for (tbl = &arc_cpu_nm[0]; tbl->id != 0; tbl++) { - if ((cpu->core.family & 0xF0) == tbl->id) + if ((cpu->core.family & 0xF4) == tbl->id) break; } cpu->name = tbl->str; @@ -192,6 +194,14 @@ static void read_arc_build_cfg_regs(void) cpu->bpu.full = bpu.ft; cpu->bpu.num_cache = 256 << bpu.bce; cpu->bpu.num_pred = 2048 << bpu.pte; + + if (cpu->core.family >= 0x54) { + unsigned int exec_ctrl; + + READ_BCR(AUX_EXEC_CTRL, exec_ctrl); + cpu->extn.dual_iss_exist = 1; + cpu->extn.dual_iss_enb = exec_ctrl & 1; + } } READ_BCR(ARC_REG_AP_BCR, bcr); @@ -205,18 +215,25 @@ static void read_arc_build_cfg_regs(void) cpu->extn.debug = cpu->extn.ap | cpu->extn.smart | cpu->extn.rtt; + READ_BCR(ARC_REG_ISA_CFG_BCR, isa); + /* some hacks for lack of feature BCR info in old ARC700 cores */ if (is_isa_arcompact()) { - if (!cpu->isa.ver) /* ISA BCR absent, use Kconfig info */ + if (!isa.ver) /* ISA BCR absent, use Kconfig info */ cpu->isa.atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC); - else - cpu->isa.atomic = cpu->isa.atomic1; + else { + /* ARC700_BUILD only has 2 bits of isa info */ + struct bcr_generic bcr = *(struct bcr_generic *)&isa; + cpu->isa.atomic = bcr.info & 1; + } cpu->isa.be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); /* there's no direct way to distinguish 750 vs. 770 */ if (unlikely(cpu->core.family < 0x34 || cpu->mmu.ver < 3)) cpu->name = "ARC750"; + } else { + cpu->isa = isa; } } @@ -232,10 +249,11 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n", core->family, core->cpu_id, core->chip_id); - n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s\n", + n += scnprintf(buf + n, len - n, "processor [%d]\t: %s %s (%s ISA) %s%s%s\n", cpu_id, cpu->name, cpu->details, is_isa_arcompact() ? "ARCompact" : "ARCv2", - IS_AVAIL1(cpu->isa.be, "[Big-Endian]")); + IS_AVAIL1(cpu->isa.be, "[Big-Endian]"), + IS_AVAIL3(cpu->extn.dual_iss_exist, cpu->extn.dual_iss_enb, " Dual-Issue")); n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s%s%s\nISA Extn\t: ", IS_AVAIL1(cpu->extn.timer0, "Timer0 "), diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index f1ac6790da5f..cf14ebc36916 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -111,6 +111,13 @@ static void __init axs10x_early_init(void) axs10x_enable_gpio_intc_wire(); + /* + * Reset ethernet IP core. + * TODO: get rid of this quirk after axs10x reset driver (or simple + * reset driver) will be available in upstream. + */ + iowrite32((1 << 5), (void __iomem *) CREG_MB_SW_RESET); + scnprintf(mb, 32, "MainBoard v%d", mb_rev); axs10x_print_board_ver(CREG_MB_VER, mb); } diff --git a/arch/arc/plat-hsdk/Kconfig b/arch/arc/plat-hsdk/Kconfig index 5a6ed5afb009..bd08de4be75e 100644 --- a/arch/arc/plat-hsdk/Kconfig +++ b/arch/arc/plat-hsdk/Kconfig @@ -6,4 +6,5 @@ # menuconfig ARC_SOC_HSDK - bool "ARC HS Development Kit SOC" + bool "ARC HS Development Kit SOC" + select CLK_HSDK diff --git a/arch/arc/plat-hsdk/platform.c b/arch/arc/plat-hsdk/platform.c index a2e7fd17e36d..744e62e58788 100644 --- a/arch/arc/plat-hsdk/platform.c +++ b/arch/arc/plat-hsdk/platform.c @@ -38,6 +38,42 @@ static void __init hsdk_init_per_cpu(unsigned int cpu) #define CREG_PAE (CREG_BASE + 0x180) #define CREG_PAE_UPDATE (CREG_BASE + 0x194) +#define CREG_CORE_IF_CLK_DIV (CREG_BASE + 0x4B8) +#define CREG_CORE_IF_CLK_DIV_2 0x1 +#define CGU_BASE ARC_PERIPHERAL_BASE +#define CGU_PLL_STATUS (ARC_PERIPHERAL_BASE + 0x4) +#define CGU_PLL_CTRL (ARC_PERIPHERAL_BASE + 0x0) +#define CGU_PLL_STATUS_LOCK BIT(0) +#define CGU_PLL_STATUS_ERR BIT(1) +#define CGU_PLL_CTRL_1GHZ 0x3A10 +#define HSDK_PLL_LOCK_TIMEOUT 500 + +#define HSDK_PLL_LOCKED() \ + !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_LOCK) + +#define HSDK_PLL_ERR() \ + !!(ioread32((void __iomem *) CGU_PLL_STATUS) & CGU_PLL_STATUS_ERR) + +static void __init hsdk_set_cpu_freq_1ghz(void) +{ + u32 timeout = HSDK_PLL_LOCK_TIMEOUT; + + /* + * As we set cpu clock which exceeds 500MHz, the divider for the interface + * clock must be programmed to div-by-2. + */ + iowrite32(CREG_CORE_IF_CLK_DIV_2, (void __iomem *) CREG_CORE_IF_CLK_DIV); + + /* Set cpu clock to 1GHz */ + iowrite32(CGU_PLL_CTRL_1GHZ, (void __iomem *) CGU_PLL_CTRL); + + while (!HSDK_PLL_LOCKED() && timeout--) + cpu_relax(); + + if (!HSDK_PLL_LOCKED() || HSDK_PLL_ERR()) + pr_err("Failed to setup CPU frequency to 1GHz!"); +} + static void __init hsdk_init_early(void) { /* @@ -52,6 +88,12 @@ static void __init hsdk_init_early(void) /* Really apply settings made above */ writel(1, (void __iomem *) CREG_PAE_UPDATE); + + /* + * Setup CPU frequency to 1GHz. + * TODO: remove it after smart hsdk pll driver will be introduced. + */ + hsdk_set_cpu_freq_1ghz(); } static const char *hsdk_compat[] __initconst = { diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 3585a5e26151..f7c4d2146aed 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -95,16 +95,19 @@ #define KERNEL_END _end /* - * The size of the KASAN shadow region. This should be 1/8th of the - * size of the entire kernel virtual address space. + * KASAN requires 1/8th of the kernel virtual address space for the shadow + * region. KASAN can bloat the stack significantly, so double the (minimum) + * stack size when KASAN is in use. */ #ifdef CONFIG_KASAN #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#define KASAN_THREAD_SHIFT 1 #else #define KASAN_SHADOW_SIZE (0) +#define KASAN_THREAD_SHIFT 0 #endif -#define MIN_THREAD_SHIFT 14 +#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) /* * VMAP'd stacks are allocated at page granularity, so we must ensure that such diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index f0e6d717885b..d06fbe4cd38d 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -649,4 +649,4 @@ static int __init armv8_deprecated_init(void) return 0; } -late_initcall(armv8_deprecated_init); +core_initcall(armv8_deprecated_init); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index cd52d365d1f0..21e2c95d24e7 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1307,4 +1307,4 @@ static int __init enable_mrs_emulation(void) return 0; } -late_initcall(enable_mrs_emulation); +core_initcall(enable_mrs_emulation); diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index f444f374bd7b..5d547deb6996 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -444,4 +444,4 @@ static int __init fpsimd_init(void) return 0; } -late_initcall(fpsimd_init); +core_initcall(fpsimd_init); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 2069e9bc0fca..b64958b23a7f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -97,7 +97,7 @@ static void data_abort_decode(unsigned int esr) (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT, (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT); } else { - pr_alert(" ISV = 0, ISS = 0x%08lu\n", esr & ESR_ELx_ISS_MASK); + pr_alert(" ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK); } pr_alert(" CM = %lu, WnR = %lu\n", diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index a45a67d526f8..30f92391a93e 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -146,7 +146,7 @@ void machine_power_off(void) /* prevent soft lockup/stalled CPU messages for endless loop. */ rcu_sysrq_start(); - lockup_detector_suspend(); + lockup_detector_soft_poweroff(); for (;;); } diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1df770e8cbe0..7275fed271af 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -102,10 +102,10 @@ static void cpufeatures_flush_tlb(void) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: - __flush_tlb_power8(POWER8_TLB_SETS); + __flush_tlb_power8(TLB_INVAL_SCOPE_GLOBAL); break; case PVR_POWER9: - __flush_tlb_power9(POWER9_TLB_SETS_HASH); + __flush_tlb_power9(TLB_INVAL_SCOPE_GLOBAL); break; default: pr_err("unknown CPU version for boot TLB flush\n"); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 48da0f5d2f7f..b82586c53560 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -734,7 +734,29 @@ EXC_REAL(program_check, 0x700, 0x100) EXC_VIRT(program_check, 0x4700, 0x100, 0x700) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + /* + * It's possible to receive a TM Bad Thing type program check with + * userspace register values (in particular r1), but with SRR1 reporting + * that we came from the kernel. Normally that would confuse the bad + * stack logic, and we would report a bad kernel stack pointer. Instead + * we switch to the emergency stack if we're taking a TM Bad Thing from + * the kernel. + */ + li r10,MSR_PR /* Build a mask of MSR_PR .. */ + oris r10,r10,0x200000@h /* .. and SRR1_PROGTM */ + and r10,r10,r12 /* Mask SRR1 with that. */ + srdi r10,r10,8 /* Shift it so we can compare */ + cmpldi r10,(0x200000 >> 8) /* .. with an immediate. */ + bne 1f /* If != go to normal path. */ + + /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack */ + andi. r10,r12,MSR_PR; /* Set CR0 correctly for label */ + /* 3 in EXCEPTION_PROLOG_COMMON */ + mr r10,r1 /* Save r1 */ + ld r1,PACAEMERGSP(r13) /* Use emergency stack */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + b 3f /* Jump into the macro !! */ +1: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index b76ca198e09c..72f153c6f3fa 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -624,5 +624,18 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs) long __machine_check_early_realmode_p9(struct pt_regs *regs) { + /* + * On POWER9 DD2.1 and below, it's possible to get a machine check + * caused by a paste instruction where only DSISR bit 25 is set. This + * will result in the MCE handler seeing an unknown event and the kernel + * crashing. An MCE that occurs like this is spurious, so we don't need + * to do anything in terms of servicing it. If there is something that + * needs to be serviced, the CPU will raise the MCE again with the + * correct DSISR so that it can be serviced properly. So detect this + * case and mark it as handled. + */ + if (SRR1_MC_LOADSTORE(regs->msr) && regs->dsisr == 0x02000000) + return 1; + return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 0ac741fae90e..2e3bc16d02b2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -904,9 +904,6 @@ void __init setup_arch(char **cmdline_p) #endif #endif -#ifdef CONFIG_PPC_64K_PAGES - init_mm.context.pte_frag = NULL; -#endif #ifdef CONFIG_SPAPR_TCE_IOMMU mm_iommu_init(&init_mm); #endif diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c index c83c115858c1..b2c002993d78 100644 --- a/arch/powerpc/kernel/signal_64.c +++ b/arch/powerpc/kernel/signal_64.c @@ -452,9 +452,20 @@ static long restore_tm_sigcontexts(struct task_struct *tsk, if (MSR_TM_RESV(msr)) return -EINVAL; - /* pull in MSR TM from user context */ + /* pull in MSR TS bits from user context */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + /* + * Ensure that TM is enabled in regs->msr before we leave the signal + * handler. It could be the case that (a) user disabled the TM bit + * through the manipulation of the MSR bits in uc_mcontext or (b) the + * TM bit was disabled because a sufficient number of context switches + * happened whilst in the signal handler and load_tm overflowed, + * disabling the TM bit. In either case we can end up with an illegal + * TM state leading to a TM Bad Thing when we return to userspace. + */ + regs->msr |= MSR_TM; + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index 2f6eadd9408d..c702a8981452 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -310,9 +310,6 @@ static int start_wd_on_cpu(unsigned int cpu) if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) return 0; - if (watchdog_suspended) - return 0; - if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) return 0; @@ -358,36 +355,39 @@ static void watchdog_calc_timeouts(void) wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; } -void watchdog_nmi_reconfigure(void) +void watchdog_nmi_stop(void) { int cpu; - watchdog_calc_timeouts(); - for_each_cpu(cpu, &wd_cpus_enabled) stop_wd_on_cpu(cpu); +} +void watchdog_nmi_start(void) +{ + int cpu; + + watchdog_calc_timeouts(); for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) start_wd_on_cpu(cpu); } /* - * This runs after lockup_detector_init() which sets up watchdog_cpumask. + * Invoked from core watchdog init. */ -static int __init powerpc_watchdog_init(void) +int __init watchdog_nmi_probe(void) { int err; - watchdog_calc_timeouts(); - - err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", - start_wd_on_cpu, stop_wd_on_cpu); - if (err < 0) + err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) { pr_warn("Watchdog could not be initialized"); - + return err; + } return 0; } -arch_initcall(powerpc_watchdog_init); static void handle_backtrace_ipi(struct pt_regs *regs) { diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 13304622ab1c..bf457843e032 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -622,7 +622,7 @@ int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server, return -EINVAL; state = &sb->irq_state[idx]; arch_spin_lock(&sb->lock); - *server = state->guest_server; + *server = state->act_server; *priority = state->guest_priority; arch_spin_unlock(&sb->lock); @@ -1331,7 +1331,7 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr) xive->saved_src_count++; /* Convert saved state into something compatible with xics */ - val = state->guest_server; + val = state->act_server; prio = state->saved_scan_prio; if (prio == MASKED) { @@ -1507,7 +1507,6 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr) /* First convert prio and mark interrupt as untargetted */ act_prio = xive_prio_from_guest(guest_prio); state->act_priority = MASKED; - state->guest_server = server; /* * We need to drop the lock due to the mutex below. Hopefully diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 5938f7644dc1..6ba63f8e8a61 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -35,7 +35,6 @@ struct kvmppc_xive_irq_state { struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */ /* Targetting as set by guest */ - u32 guest_server; /* Current guest selected target */ u8 guest_priority; /* Guest set priority */ u8 saved_priority; /* Saved priority when masking */ diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 65eda1997c3f..f6c7f54c0515 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -361,9 +361,9 @@ static int change_page_attr(struct page *page, int numpages, pgprot_t prot) break; } wmb(); + local_irq_restore(flags); flush_tlb_kernel_range((unsigned long)page_address(start), (unsigned long)page_address(page)); - local_irq_restore(flags); return err; } diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 897aa1400eb8..bbb73aa0eb8f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -272,7 +272,15 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + /* + * We map the kernel linear region with 1GB large pages on radix. For + * memory hot unplug to work our memory block size must be at least + * this size. + */ + if (radix_enabled()) + return 1UL * 1024 * 1024 * 1024; + else + return 256UL * 1024 * 1024; } #endif diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index f387318678b9..a3b8d7d1316e 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1402,6 +1402,14 @@ void xive_teardown_cpu(void) if (xive_ops->teardown_cpu) xive_ops->teardown_cpu(cpu, xc); + +#ifdef CONFIG_SMP + /* Get rid of IPI */ + xive_cleanup_cpu_ipi(cpu, xc); +#endif + + /* Disable and free the queues */ + xive_cleanup_cpu_queues(cpu, xc); } void xive_kexec_teardown_cpu(int secondary) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index f24a70bc6855..d9c4c9366049 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -431,7 +431,11 @@ static int xive_spapr_get_ipi(unsigned int cpu, struct xive_cpu *xc) static void xive_spapr_put_ipi(unsigned int cpu, struct xive_cpu *xc) { + if (!xc->hw_ipi) + return; + xive_irq_bitmap_free(xc->hw_ipi); + xc->hw_ipi = 0; } #endif /* CONFIG_SMP */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 0be3828752e5..4e83f950713e 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -44,7 +44,6 @@ config SPARC select ARCH_HAS_SG_CHAIN select CPU_NO_EFFICIENT_FFS select LOCKDEP_SMALL if LOCKDEP - select ARCH_WANT_RELAX_ORDER config SPARC32 def_bool !64BIT diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 829e89cfcee2..9fb9a1f1e47b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4409,10 +4409,9 @@ static __init int fixup_ht_bug(void) return 0; } - if (lockup_detector_suspend() != 0) { - pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); - return 0; - } + cpus_read_lock(); + + hardlockup_detector_perf_stop(); x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -4420,9 +4419,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - lockup_detector_resume(); - - cpus_read_lock(); + hardlockup_detector_perf_restart(); for_each_online_cpu(c) free_excl_cntrs(c); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index bc62e7cbf1b1..59ad3d132353 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); @@ -103,7 +103,7 @@ static inline void kvm_spinlock_init(void) #else /* CONFIG_KVM_GUEST */ #define kvm_guest_init() do {} while (0) -#define kvm_async_pf_task_wait(T) do {} while(0) +#define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e675704fa6f7..8bb9594d0761 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -117,7 +117,11 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -void kvm_async_pf_task_wait(u32 token) +/* + * @interrupt_kernel: Is this called from a routine which interrupts the kernel + * (other than user space)? + */ +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -140,8 +144,10 @@ void kvm_async_pf_task_wait(u32 token) n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1 || - rcu_preempt_depth(); + n.halted = is_idle_task(current) || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) + ? preempt_count() > 1 || rcu_preempt_depth() + : interrupt_kernel); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -269,7 +275,7 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2()); + kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3ea624452f93..3c48bc8bf08c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: depends on NET && MULTIUSER + depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a36254cbf776..d90cdc77e077 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -425,8 +425,10 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)); #op " %al \n\t" \ FOP_RET -asm(".global kvm_fastop_exception \n" - "kvm_fastop_exception: xor %esi, %esi; ret"); +asm(".pushsection .fixup, \"ax\"\n" + ".global kvm_fastop_exception \n" + "kvm_fastop_exception: xor %esi, %esi; ret\n" + ".popsection"); FOP_START(setcc) FOP_SETCC(seto) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eca30c1eb1d9..106d4a029a8a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3837,7 +3837,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, case KVM_PV_REASON_PAGE_NOT_PRESENT: vcpu->arch.apf.host_apf_reason = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address); + kvm_async_pf_task_wait(fault_address, 0); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 980e73095643..de294d775acf 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -815,10 +815,14 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; /* - * blk_mq_init_hctx() attempted to do this already, but q->debugfs_dir + * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir * didn't exist yet (because we don't know what to name the directory * until the queue is registered to a gendisk). */ + if (q->elevator && !q->sched_debugfs_dir) + blk_mq_debugfs_register_sched(q); + + /* Similarly, blk_mq_init_hctx() couldn't do this previously. */ queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->debugfs_dir && blk_mq_debugfs_register_hctx(q, hctx)) goto err; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 0fea76aa0f3f..17816a028dcb 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1911,11 +1911,11 @@ static void throtl_upgrade_state(struct throtl_data *td) tg->disptime = jiffies - 1; throtl_select_dispatch(sq); - throtl_schedule_next_dispatch(sq, false); + throtl_schedule_next_dispatch(sq, true); } rcu_read_unlock(); throtl_select_dispatch(&td->service_queue); - throtl_schedule_next_dispatch(&td->service_queue, false); + throtl_schedule_next_dispatch(&td->service_queue, true); queue_work(kthrotld_workqueue, &td->dispatch_work); } diff --git a/block/bsg-lib.c b/block/bsg-lib.c index dbddff8174e5..15d25ccd51a5 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -207,20 +207,34 @@ static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) struct bsg_job *job = blk_mq_rq_to_pdu(req); struct scsi_request *sreq = &job->sreq; + /* called right after the request is allocated for the request_queue */ + + sreq->sense = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); + if (!sreq->sense) + return -ENOMEM; + + return 0; +} + +static void bsg_initialize_rq(struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + struct scsi_request *sreq = &job->sreq; + void *sense = sreq->sense; + + /* called right before the request is given to the request_queue user */ + memset(job, 0, sizeof(*job)); scsi_req_init(sreq); + + sreq->sense = sense; sreq->sense_len = SCSI_SENSE_BUFFERSIZE; - sreq->sense = kzalloc(sreq->sense_len, gfp); - if (!sreq->sense) - return -ENOMEM; job->req = req; - job->reply = sreq->sense; + job->reply = sense; job->reply_len = sreq->sense_len; job->dd_data = job + 1; - - return 0; } static void bsg_exit_rq(struct request_queue *q, struct request *req) @@ -251,6 +265,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, q->cmd_size = sizeof(struct bsg_job) + dd_job_size; q->init_rq_fn = bsg_init_rq; q->exit_rq_fn = bsg_exit_rq; + q->initialize_rq_fn = bsg_initialize_rq; q->request_fn = bsg_request_fn; ret = blk_init_allocated_queue(q); diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 9565d572f8dd..de56394dd161 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1178,12 +1178,44 @@ dev_put: return ret; } +static bool __init iort_enable_acs(struct acpi_iort_node *iort_node) +{ + if (iort_node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX) { + struct acpi_iort_node *parent; + struct acpi_iort_id_mapping *map; + int i; + + map = ACPI_ADD_PTR(struct acpi_iort_id_mapping, iort_node, + iort_node->mapping_offset); + + for (i = 0; i < iort_node->mapping_count; i++, map++) { + if (!map->output_reference) + continue; + + parent = ACPI_ADD_PTR(struct acpi_iort_node, + iort_table, map->output_reference); + /* + * If we detect a RC->SMMU mapping, make sure + * we enable ACS on the system. + */ + if ((parent->type == ACPI_IORT_NODE_SMMU) || + (parent->type == ACPI_IORT_NODE_SMMU_V3)) { + pci_request_acs(); + return true; + } + } + } + + return false; +} + static void __init iort_init_platform_devices(void) { struct acpi_iort_node *iort_node, *iort_end; struct acpi_table_iort *iort; struct fwnode_handle *fwnode; int i, ret; + bool acs_enabled = false; /* * iort_table and iort both point to the start of IORT table, but @@ -1203,6 +1235,9 @@ static void __init iort_init_platform_devices(void) return; } + if (!acs_enabled) + acs_enabled = iort_enable_acs(iort_node); + if ((iort_node->type == ACPI_IORT_NODE_SMMU) || (iort_node->type == ACPI_IORT_NODE_SMMU_V3)) { diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c index f8b7e86907cc..126855e6cb7d 100644 --- a/drivers/atm/fore200e.c +++ b/drivers/atm/fore200e.c @@ -358,26 +358,33 @@ fore200e_shutdown(struct fore200e* fore200e) case FORE200E_STATE_COMPLETE: kfree(fore200e->stats); + /* fall through */ case FORE200E_STATE_IRQ: free_irq(fore200e->irq, fore200e->atm_dev); + /* fall through */ case FORE200E_STATE_ALLOC_BUF: fore200e_free_rx_buf(fore200e); + /* fall through */ case FORE200E_STATE_INIT_BSQ: fore200e_uninit_bs_queue(fore200e); + /* fall through */ case FORE200E_STATE_INIT_RXQ: fore200e->bus->dma_chunk_free(fore200e, &fore200e->host_rxq.status); fore200e->bus->dma_chunk_free(fore200e, &fore200e->host_rxq.rpd); + /* fall through */ case FORE200E_STATE_INIT_TXQ: fore200e->bus->dma_chunk_free(fore200e, &fore200e->host_txq.status); fore200e->bus->dma_chunk_free(fore200e, &fore200e->host_txq.tpd); + /* fall through */ case FORE200E_STATE_INIT_CMDQ: fore200e->bus->dma_chunk_free(fore200e, &fore200e->host_cmdq.status); + /* fall through */ case FORE200E_STATE_INITIALIZE: /* nothing to do for that state */ @@ -390,6 +397,7 @@ fore200e_shutdown(struct fore200e* fore200e) case FORE200E_STATE_MAP: fore200e->bus->unmap(fore200e); + /* fall through */ case FORE200E_STATE_CONFIGURE: /* nothing to do for that state */ diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index 082aa02abc57..d781b3f87693 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -306,11 +306,9 @@ static int idt77105_start(struct atm_dev *dev) if (start_timer) { start_timer = 0; - setup_timer(&stats_timer, idt77105_stats_timer_func, 0UL); stats_timer.expires = jiffies+IDT77105_STATS_TIMER_PERIOD; add_timer(&stats_timer); - setup_timer(&restart_timer, idt77105_restart_timer_func, 0UL); restart_timer.expires = jiffies+IDT77105_RESTART_TIMER_PERIOD; add_timer(&restart_timer); } diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 4a438b8abe27..2dfe99b328f8 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -17,7 +17,7 @@ if BLK_DEV config BLK_DEV_NULL_BLK tristate "Null test block driver" - depends on CONFIGFS_FS + select CONFIGFS_FS config BLK_DEV_FD tristate "Normal floppy disk support" diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3684e21d543f..883dfebd3014 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -820,9 +820,13 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * appropriate. */ ret = nbd_handle_cmd(cmd, hctx->queue_num); + if (ret < 0) + ret = BLK_STS_IOERR; + else if (!ret) + ret = BLK_STS_OK; complete(&cmd->send_complete); - return ret < 0 ? BLK_STS_IOERR : BLK_STS_OK; + return ret; } static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg, diff --git a/drivers/clk/clk-bulk.c b/drivers/clk/clk-bulk.c index c834f5abfc49..4c10456f8a32 100644 --- a/drivers/clk/clk-bulk.c +++ b/drivers/clk/clk-bulk.c @@ -105,6 +105,7 @@ err: return ret; } +EXPORT_SYMBOL_GPL(clk_bulk_prepare); #endif /* CONFIG_HAVE_CLK_PREPARE */ diff --git a/drivers/clk/rockchip/clk-rk3128.c b/drivers/clk/rockchip/clk-rk3128.c index 62d7854e4b87..5970a50671b9 100644 --- a/drivers/clk/rockchip/clk-rk3128.c +++ b/drivers/clk/rockchip/clk-rk3128.c @@ -315,13 +315,13 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = { RK2928_CLKGATE_CON(10), 8, GFLAGS), GATE(SCLK_PVTM_CORE, "clk_pvtm_core", "xin24m", 0, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(10), 0, GFLAGS), GATE(SCLK_PVTM_GPU, "clk_pvtm_gpu", "xin24m", 0, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(10), 1, GFLAGS), GATE(SCLK_PVTM_FUNC, "clk_pvtm_func", "xin24m", 0, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(10), 2, GFLAGS), GATE(SCLK_MIPI_24M, "clk_mipi_24m", "xin24m", CLK_IGNORE_UNUSED, - RK2928_CLKGATE_CON(10), 8, GFLAGS), + RK2928_CLKGATE_CON(2), 15, GFLAGS), COMPOSITE(SCLK_SDMMC, "sclk_sdmmc0", mux_mmc_src_p, 0, RK2928_CLKSEL_CON(11), 6, 2, MFLAGS, 0, 6, DFLAGS, @@ -541,7 +541,7 @@ static struct rockchip_clk_branch common_clk_branches[] __initdata = { GATE(0, "pclk_grf", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 4, GFLAGS), GATE(0, "pclk_mipiphy", "pclk_cpu", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(5), 0, GFLAGS), - GATE(0, "pclk_pmu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 2, GFLAGS), + GATE(0, "pclk_pmu", "pclk_pmu_pre", 0, RK2928_CLKGATE_CON(9), 2, GFLAGS), GATE(0, "pclk_pmu_niu", "pclk_pmu_pre", CLK_IGNORE_UNUSED, RK2928_CLKGATE_CON(9), 3, GFLAGS), /* PD_MMC */ @@ -577,6 +577,8 @@ static const char *const rk3128_critical_clocks[] __initconst = { "aclk_peri", "hclk_peri", "pclk_peri", + "pclk_pmu", + "sclk_timer5", }; static struct rockchip_clk_provider *__init rk3128_common_clk_init(struct device_node *np) diff --git a/drivers/clk/samsung/clk-exynos4.c b/drivers/clk/samsung/clk-exynos4.c index e40b77583c47..d8d3cb67b402 100644 --- a/drivers/clk/samsung/clk-exynos4.c +++ b/drivers/clk/samsung/clk-exynos4.c @@ -294,6 +294,18 @@ static const struct samsung_clk_reg_dump src_mask_suspend_e4210[] = { #define PLL_ENABLED (1 << 31) #define PLL_LOCKED (1 << 29) +static void exynos4_clk_enable_pll(u32 reg) +{ + u32 pll_con = readl(reg_base + reg); + pll_con |= PLL_ENABLED; + writel(pll_con, reg_base + reg); + + while (!(pll_con & PLL_LOCKED)) { + cpu_relax(); + pll_con = readl(reg_base + reg); + } +} + static void exynos4_clk_wait_for_pll(u32 reg) { u32 pll_con; @@ -315,6 +327,9 @@ static int exynos4_clk_suspend(void) samsung_clk_save(reg_base, exynos4_save_pll, ARRAY_SIZE(exynos4_clk_pll_regs)); + exynos4_clk_enable_pll(EPLL_CON0); + exynos4_clk_enable_pll(VPLL_CON0); + if (exynos4_soc == EXYNOS4210) { samsung_clk_save(reg_base, exynos4_save_soc, ARRAY_SIZE(exynos4210_clk_save)); diff --git a/drivers/gpu/drm/i915/intel_audio.c b/drivers/gpu/drm/i915/intel_audio.c index d805b6e6fe71..27743be5b768 100644 --- a/drivers/gpu/drm/i915/intel_audio.c +++ b/drivers/gpu/drm/i915/intel_audio.c @@ -606,11 +606,6 @@ void intel_audio_codec_enable(struct intel_encoder *intel_encoder, connector->encoder->base.id, connector->encoder->name); - /* ELD Conn_Type */ - connector->eld[5] &= ~(3 << 2); - if (intel_crtc_has_dp_encoder(crtc_state)) - connector->eld[5] |= (1 << 2); - connector->eld[6] = drm_av_sync_delay(connector, adjusted_mode) / 2; if (dev_priv->display.audio_codec_enable) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 183e87e8ea31..00c6aee0a9a1 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1163,6 +1163,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, is_hdmi = is_dvi && (child->common.device_type & DEVICE_TYPE_NOT_HDMI_OUTPUT) == 0; is_edp = is_dp && (child->common.device_type & DEVICE_TYPE_INTERNAL_CONNECTOR); + if (port == PORT_A && is_dvi) { + DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n", + is_hdmi ? "/HDMI" : ""); + is_dvi = false; + is_hdmi = false; + } + info->supports_dvi = is_dvi; info->supports_hdmi = is_hdmi; info->supports_dp = is_dp; diff --git a/drivers/gpu/drm/i915/intel_csr.c b/drivers/gpu/drm/i915/intel_csr.c index 965988f79a55..92c1f8e166dc 100644 --- a/drivers/gpu/drm/i915/intel_csr.c +++ b/drivers/gpu/drm/i915/intel_csr.c @@ -216,7 +216,7 @@ static void gen9_set_dc_state_debugmask(struct drm_i915_private *dev_priv) mask = DC_STATE_DEBUG_MASK_MEMORY_UP; - if (IS_BROXTON(dev_priv)) + if (IS_GEN9_LP(dev_priv)) mask |= DC_STATE_DEBUG_MASK_CORES; /* The below bit doesn't need to be cleared ever afterwards */ diff --git a/drivers/gpu/drm/i915/intel_ddi.c b/drivers/gpu/drm/i915/intel_ddi.c index 4b4fd1f8110b..476681d5940c 100644 --- a/drivers/gpu/drm/i915/intel_ddi.c +++ b/drivers/gpu/drm/i915/intel_ddi.c @@ -1655,7 +1655,8 @@ bool intel_ddi_get_hw_state(struct intel_encoder *encoder, out: if (ret && IS_GEN9_LP(dev_priv)) { tmp = I915_READ(BXT_PHY_CTL(port)); - if ((tmp & (BXT_PHY_LANE_POWERDOWN_ACK | + if ((tmp & (BXT_PHY_CMNLANE_POWERDOWN_ACK | + BXT_PHY_LANE_POWERDOWN_ACK | BXT_PHY_LANE_ENABLED)) != BXT_PHY_LANE_ENABLED) DRM_ERROR("Port %c enabled but PHY powered down? " "(PHY_CTL %08x)\n", port_name(port), tmp); diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 00cd17c76fdc..64f7b51ed97c 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -12359,7 +12359,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) struct drm_crtc_state *old_crtc_state, *new_crtc_state; struct drm_crtc *crtc; struct intel_crtc_state *intel_cstate; - bool hw_check = intel_state->modeset; u64 put_domains[I915_MAX_PIPES] = {}; unsigned crtc_vblank_mask = 0; int i; @@ -12376,7 +12375,6 @@ static void intel_atomic_commit_tail(struct drm_atomic_state *state) if (needs_modeset(new_crtc_state) || to_intel_crtc_state(new_crtc_state)->update_pipe) { - hw_check = true; put_domains[to_intel_crtc(crtc)->pipe] = modeset_get_crtc_power_domains(crtc, diff --git a/drivers/gpu/drm/i915/intel_dpio_phy.c b/drivers/gpu/drm/i915/intel_dpio_phy.c index 09b670929786..de38d014ed39 100644 --- a/drivers/gpu/drm/i915/intel_dpio_phy.c +++ b/drivers/gpu/drm/i915/intel_dpio_phy.c @@ -208,12 +208,6 @@ static const struct bxt_ddi_phy_info glk_ddi_phy_info[] = { }, }; -static u32 bxt_phy_port_mask(const struct bxt_ddi_phy_info *phy_info) -{ - return (phy_info->dual_channel * BIT(phy_info->channel[DPIO_CH1].port)) | - BIT(phy_info->channel[DPIO_CH0].port); -} - static const struct bxt_ddi_phy_info * bxt_get_phy_list(struct drm_i915_private *dev_priv, int *count) { @@ -313,7 +307,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv, enum dpio_phy phy) { const struct bxt_ddi_phy_info *phy_info; - enum port port; phy_info = bxt_get_phy_info(dev_priv, phy); @@ -335,19 +328,6 @@ bool bxt_ddi_phy_is_enabled(struct drm_i915_private *dev_priv, return false; } - for_each_port_masked(port, bxt_phy_port_mask(phy_info)) { - u32 tmp = I915_READ(BXT_PHY_CTL(port)); - - if (tmp & BXT_PHY_CMNLANE_POWERDOWN_ACK) { - DRM_DEBUG_DRIVER("DDI PHY %d powered, but common lane " - "for port %c powered down " - "(PHY_CTL %08x)\n", - phy, port_name(port), tmp); - - return false; - } - } - return true; } diff --git a/drivers/gpu/drm/i915/intel_modes.c b/drivers/gpu/drm/i915/intel_modes.c index 951e834dd274..28a778b785ac 100644 --- a/drivers/gpu/drm/i915/intel_modes.c +++ b/drivers/gpu/drm/i915/intel_modes.c @@ -30,6 +30,21 @@ #include "intel_drv.h" #include "i915_drv.h" +static void intel_connector_update_eld_conn_type(struct drm_connector *connector) +{ + u8 conn_type; + + if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort || + connector->connector_type == DRM_MODE_CONNECTOR_eDP) { + conn_type = DRM_ELD_CONN_TYPE_DP; + } else { + conn_type = DRM_ELD_CONN_TYPE_HDMI; + } + + connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] &= ~DRM_ELD_CONN_TYPE_MASK; + connector->eld[DRM_ELD_SAD_COUNT_CONN_TYPE] |= conn_type; +} + /** * intel_connector_update_modes - update connector from edid * @connector: DRM connector device to use @@ -44,6 +59,8 @@ int intel_connector_update_modes(struct drm_connector *connector, ret = drm_add_edid_modes(connector, edid); drm_edid_to_eld(connector, edid); + intel_connector_update_eld_conn_type(connector); + return ret; } diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c index b66d8e136aa3..b3a087cb0860 100644 --- a/drivers/gpu/drm/i915/intel_runtime_pm.c +++ b/drivers/gpu/drm/i915/intel_runtime_pm.c @@ -2782,6 +2782,9 @@ static void cnl_display_core_init(struct drm_i915_private *dev_priv, bool resume /* 6. Enable DBUF */ gen9_dbuf_enable(dev_priv); + + if (resume && dev_priv->csr.dmc_payload) + intel_csr_load_program(dev_priv); } #undef CNL_PROCMON_IDX diff --git a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c index 9ea6cd5a1370..3cf1a6932fac 100644 --- a/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c +++ b/drivers/gpu/drm/sun4i/sun4i_hdmi_enc.c @@ -302,26 +302,29 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master, hdmi->mod_clk = devm_clk_get(dev, "mod"); if (IS_ERR(hdmi->mod_clk)) { dev_err(dev, "Couldn't get the HDMI mod clock\n"); - return PTR_ERR(hdmi->mod_clk); + ret = PTR_ERR(hdmi->mod_clk); + goto err_disable_bus_clk; } clk_prepare_enable(hdmi->mod_clk); hdmi->pll0_clk = devm_clk_get(dev, "pll-0"); if (IS_ERR(hdmi->pll0_clk)) { dev_err(dev, "Couldn't get the HDMI PLL 0 clock\n"); - return PTR_ERR(hdmi->pll0_clk); + ret = PTR_ERR(hdmi->pll0_clk); + goto err_disable_mod_clk; } hdmi->pll1_clk = devm_clk_get(dev, "pll-1"); if (IS_ERR(hdmi->pll1_clk)) { dev_err(dev, "Couldn't get the HDMI PLL 1 clock\n"); - return PTR_ERR(hdmi->pll1_clk); + ret = PTR_ERR(hdmi->pll1_clk); + goto err_disable_mod_clk; } ret = sun4i_tmds_create(hdmi); if (ret) { dev_err(dev, "Couldn't create the TMDS clock\n"); - return ret; + goto err_disable_mod_clk; } writel(SUN4I_HDMI_CTRL_ENABLE, hdmi->base + SUN4I_HDMI_CTRL_REG); @@ -362,7 +365,7 @@ static int sun4i_hdmi_bind(struct device *dev, struct device *master, ret = sun4i_hdmi_i2c_create(dev, hdmi); if (ret) { dev_err(dev, "Couldn't create the HDMI I2C adapter\n"); - return ret; + goto err_disable_mod_clk; } drm_encoder_helper_add(&hdmi->encoder, @@ -422,6 +425,10 @@ err_cleanup_connector: drm_encoder_cleanup(&hdmi->encoder); err_del_i2c_adapter: i2c_del_adapter(hdmi->i2c); +err_disable_mod_clk: + clk_disable_unprepare(hdmi->mod_clk); +err_disable_bus_clk: + clk_disable_unprepare(hdmi->bus_clk); return ret; } @@ -434,6 +441,8 @@ static void sun4i_hdmi_unbind(struct device *dev, struct device *master, drm_connector_cleanup(&hdmi->connector); drm_encoder_cleanup(&hdmi->encoder); i2c_del_adapter(hdmi->i2c); + clk_disable_unprepare(hdmi->mod_clk); + clk_disable_unprepare(hdmi->bus_clk); } static const struct component_ops sun4i_hdmi_ops = { diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index 9c0dbb8191ad..e1be61095532 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -630,7 +630,7 @@ static int xgene_hwmon_probe(struct platform_device *pdev) sizeof(struct slimpro_resp_msg) * ASYNC_MSG_FIFO_SIZE, GFP_KERNEL); if (rc) - goto out_mbox_free; + return -ENOMEM; INIT_WORK(&ctx->workq, xgene_hwmon_evt_work); @@ -646,7 +646,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (IS_ERR(ctx->mbox_chan)) { dev_err(&pdev->dev, "SLIMpro mailbox channel request failed\n"); - return -ENODEV; + rc = -ENODEV; + goto out_mbox_free; } } else { struct acpi_pcct_hw_reduced *cppc_ss; @@ -654,7 +655,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (device_property_read_u32(&pdev->dev, "pcc-channel", &ctx->mbox_idx)) { dev_err(&pdev->dev, "no pcc-channel property\n"); - return -ENODEV; + rc = -ENODEV; + goto out_mbox_free; } cl->rx_callback = xgene_hwmon_pcc_rx_cb; @@ -662,7 +664,8 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (IS_ERR(ctx->mbox_chan)) { dev_err(&pdev->dev, "PPC channel request failed\n"); - return -ENODEV; + rc = -ENODEV; + goto out_mbox_free; } /* @@ -675,13 +678,13 @@ static int xgene_hwmon_probe(struct platform_device *pdev) if (!cppc_ss) { dev_err(&pdev->dev, "PPC subspace not found\n"); rc = -ENODEV; - goto out_mbox_free; + goto out; } if (!ctx->mbox_chan->mbox->txdone_irq) { dev_err(&pdev->dev, "PCC IRQ not supported\n"); rc = -ENODEV; - goto out_mbox_free; + goto out; } /* @@ -696,14 +699,14 @@ static int xgene_hwmon_probe(struct platform_device *pdev) } else { dev_err(&pdev->dev, "Failed to get PCC comm region\n"); rc = -ENODEV; - goto out_mbox_free; + goto out; } if (!ctx->pcc_comm_addr) { dev_err(&pdev->dev, "Failed to ioremap PCC comm region\n"); rc = -ENOMEM; - goto out_mbox_free; + goto out; } /* diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index c06dce2c1da7..45a3f3ca29b3 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -131,6 +131,7 @@ config I2C_I801 Gemini Lake (SOC) Cannon Lake-H (PCH) Cannon Lake-LP (PCH) + Cedar Fork (PCH) This driver can also be built as a module. If so, the module will be called i2c-i801. diff --git a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c index e114e4e00d29..9e12a53ef7b8 100644 --- a/drivers/i2c/busses/i2c-i801.c +++ b/drivers/i2c/busses/i2c-i801.c @@ -68,6 +68,7 @@ * Gemini Lake (SOC) 0x31d4 32 hard yes yes yes * Cannon Lake-H (PCH) 0xa323 32 hard yes yes yes * Cannon Lake-LP (PCH) 0x9da3 32 hard yes yes yes + * Cedar Fork (PCH) 0x18df 32 hard yes yes yes * * Features supported by this driver: * Software PEC no @@ -204,6 +205,7 @@ /* Older devices have their ID defined in <linux/pci_ids.h> */ #define PCI_DEVICE_ID_INTEL_BAYTRAIL_SMBUS 0x0f12 +#define PCI_DEVICE_ID_INTEL_CDF_SMBUS 0x18df #define PCI_DEVICE_ID_INTEL_DNV_SMBUS 0x19df #define PCI_DEVICE_ID_INTEL_COUGARPOINT_SMBUS 0x1c22 #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS 0x1d22 @@ -1025,6 +1027,7 @@ static const struct pci_device_id i801_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BRASWELL_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_H_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_SUNRISEPOINT_LP_SMBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CDF_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_DNV_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BROXTON_SMBUS) }, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS) }, @@ -1513,6 +1516,7 @@ static int i801_probe(struct pci_dev *dev, const struct pci_device_id *id) case PCI_DEVICE_ID_INTEL_CANNONLAKE_LP_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SMBUS: case PCI_DEVICE_ID_INTEL_LEWISBURG_SSKU_SMBUS: + case PCI_DEVICE_ID_INTEL_CDF_SMBUS: case PCI_DEVICE_ID_INTEL_DNV_SMBUS: case PCI_DEVICE_ID_INTEL_KABYLAKE_PCH_H_SMBUS: priv->features |= FEATURE_I2C_BLOCK_READ; diff --git a/drivers/i2c/busses/i2c-sprd.c b/drivers/i2c/busses/i2c-sprd.c index 22e08ae1704f..25fcc3c1e32b 100644 --- a/drivers/i2c/busses/i2c-sprd.c +++ b/drivers/i2c/busses/i2c-sprd.c @@ -627,6 +627,7 @@ static const struct dev_pm_ops sprd_i2c_pm_ops = { static const struct of_device_id sprd_i2c_of_match[] = { { .compatible = "sprd,sc9860-i2c", }, + {}, }; static struct platform_driver sprd_i2c_driver = { diff --git a/drivers/i2c/busses/i2c-stm32f7.c b/drivers/i2c/busses/i2c-stm32f7.c index 47c67b0ca896..d4a6e9c2e9aa 100644 --- a/drivers/i2c/busses/i2c-stm32f7.c +++ b/drivers/i2c/busses/i2c-stm32f7.c @@ -215,7 +215,7 @@ struct stm32f7_i2c_dev { unsigned int msg_num; unsigned int msg_id; struct stm32f7_i2c_msg f7_msg; - struct stm32f7_i2c_setup *setup; + struct stm32f7_i2c_setup setup; struct stm32f7_i2c_timings timing; }; @@ -265,7 +265,7 @@ static struct stm32f7_i2c_spec i2c_specs[] = { }, }; -struct stm32f7_i2c_setup stm32f7_setup = { +static const struct stm32f7_i2c_setup stm32f7_setup = { .rise_time = STM32F7_I2C_RISE_TIME_DEFAULT, .fall_time = STM32F7_I2C_FALL_TIME_DEFAULT, .dnf = STM32F7_I2C_DNF_DEFAULT, @@ -537,7 +537,7 @@ static void stm32f7_i2c_hw_config(struct stm32f7_i2c_dev *i2c_dev) writel_relaxed(timing, i2c_dev->base + STM32F7_I2C_TIMINGR); /* Enable I2C */ - if (i2c_dev->setup->analog_filter) + if (i2c_dev->setup.analog_filter) stm32f7_i2c_clr_bits(i2c_dev->base + STM32F7_I2C_CR1, STM32F7_I2C_CR1_ANFOFF); else @@ -887,22 +887,19 @@ static int stm32f7_i2c_probe(struct platform_device *pdev) } setup = of_device_get_match_data(&pdev->dev); - i2c_dev->setup->rise_time = setup->rise_time; - i2c_dev->setup->fall_time = setup->fall_time; - i2c_dev->setup->dnf = setup->dnf; - i2c_dev->setup->analog_filter = setup->analog_filter; + i2c_dev->setup = *setup; ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-rising-time-ns", &rise_time); if (!ret) - i2c_dev->setup->rise_time = rise_time; + i2c_dev->setup.rise_time = rise_time; ret = device_property_read_u32(i2c_dev->dev, "i2c-scl-falling-time-ns", &fall_time); if (!ret) - i2c_dev->setup->fall_time = fall_time; + i2c_dev->setup.fall_time = fall_time; - ret = stm32f7_i2c_setup_timing(i2c_dev, i2c_dev->setup); + ret = stm32f7_i2c_setup_timing(i2c_dev, &i2c_dev->setup); if (ret) goto clk_free; diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 01b2adfd8226..eaf39e5db08b 100644 --- a/drivers/ide/ide-probe.c +++ b/drivers/ide/ide-probe.c @@ -1451,6 +1451,7 @@ int ide_host_register(struct ide_host *host, const struct ide_port_info *d, if (hwif_init(hwif) == 0) { printk(KERN_INFO "%s: failed to initialize IDE " "interface\n", hwif->name); + device_unregister(hwif->portdev); device_unregister(&hwif->gendev); ide_disable_port(hwif); continue; diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c index 86aa88aeb3a6..acf874800ca4 100644 --- a/drivers/ide/ide-scan-pci.c +++ b/drivers/ide/ide-scan-pci.c @@ -56,6 +56,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev) { struct list_head *l; struct pci_driver *d; + int ret; list_for_each(l, &ide_pci_drivers) { d = list_entry(l, struct pci_driver, node); @@ -63,10 +64,14 @@ static int __init ide_scan_pcidev(struct pci_dev *dev) const struct pci_device_id *id = pci_match_id(d->id_table, dev); - if (id != NULL && d->probe(dev, id) >= 0) { - dev->driver = d; - pci_dev_get(dev); - return 1; + if (id != NULL) { + pci_assign_irq(dev); + ret = d->probe(dev, id); + if (ret >= 0) { + dev->driver = d; + pci_dev_get(dev); + return 1; + } } } } diff --git a/drivers/ide/setup-pci.c b/drivers/ide/setup-pci.c index 112d2fe1bcdb..fdc8e813170c 100644 --- a/drivers/ide/setup-pci.c +++ b/drivers/ide/setup-pci.c @@ -179,6 +179,7 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise); /** * ide_pci_enable - do PCI enables * @dev: PCI device + * @bars: PCI BARs mask * @d: IDE port info * * Enable the IDE PCI device. We attempt to enable the device in full @@ -189,9 +190,10 @@ EXPORT_SYMBOL_GPL(ide_setup_pci_noise); * Returns zero on success or an error code */ -static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d) +static int ide_pci_enable(struct pci_dev *dev, int bars, + const struct ide_port_info *d) { - int ret, bars; + int ret; if (pci_enable_device(dev)) { ret = pci_enable_device_io(dev); @@ -216,18 +218,6 @@ static int ide_pci_enable(struct pci_dev *dev, const struct ide_port_info *d) goto out; } - if (d->host_flags & IDE_HFLAG_SINGLE) - bars = (1 << 2) - 1; - else - bars = (1 << 4) - 1; - - if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) { - if (d->host_flags & IDE_HFLAG_CS5520) - bars |= (1 << 2); - else - bars |= (1 << 4); - } - ret = pci_request_selected_regions(dev, bars, d->name); if (ret < 0) printk(KERN_ERR "%s %s: can't reserve resources\n", @@ -403,6 +393,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d) /** * ide_setup_pci_controller - set up IDE PCI * @dev: PCI device + * @bars: PCI BARs mask * @d: IDE port info * @noisy: verbose flag * @@ -411,7 +402,7 @@ int ide_hwif_setup_dma(ide_hwif_t *hwif, const struct ide_port_info *d) * and enables it if need be */ -static int ide_setup_pci_controller(struct pci_dev *dev, +static int ide_setup_pci_controller(struct pci_dev *dev, int bars, const struct ide_port_info *d, int noisy) { int ret; @@ -420,7 +411,7 @@ static int ide_setup_pci_controller(struct pci_dev *dev, if (noisy) ide_setup_pci_noise(dev, d); - ret = ide_pci_enable(dev, d); + ret = ide_pci_enable(dev, bars, d); if (ret < 0) goto out; @@ -428,16 +419,20 @@ static int ide_setup_pci_controller(struct pci_dev *dev, if (ret < 0) { printk(KERN_ERR "%s %s: error accessing PCI regs\n", d->name, pci_name(dev)); - goto out; + goto out_free_bars; } if (!(pcicmd & PCI_COMMAND_IO)) { /* is device disabled? */ ret = ide_pci_configure(dev, d); if (ret < 0) - goto out; + goto out_free_bars; printk(KERN_INFO "%s %s: device enabled (Linux)\n", d->name, pci_name(dev)); } + goto out; + +out_free_bars: + pci_release_selected_regions(dev, bars); out: return ret; } @@ -540,13 +535,28 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, { struct pci_dev *pdev[] = { dev1, dev2 }; struct ide_host *host; - int ret, i, n_ports = dev2 ? 4 : 2; + int ret, i, n_ports = dev2 ? 4 : 2, bars; struct ide_hw hw[4], *hws[] = { NULL, NULL, NULL, NULL }; + if (d->host_flags & IDE_HFLAG_SINGLE) + bars = (1 << 2) - 1; + else + bars = (1 << 4) - 1; + + if ((d->host_flags & IDE_HFLAG_NO_DMA) == 0) { + if (d->host_flags & IDE_HFLAG_CS5520) + bars |= (1 << 2); + else + bars |= (1 << 4); + } + for (i = 0; i < n_ports / 2; i++) { - ret = ide_setup_pci_controller(pdev[i], d, !i); - if (ret < 0) + ret = ide_setup_pci_controller(pdev[i], bars, d, !i); + if (ret < 0) { + if (i == 1) + pci_release_selected_regions(pdev[0], bars); goto out; + } ide_pci_setup_ports(pdev[i], d, &hw[i*2], &hws[i*2]); } @@ -554,7 +564,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, host = ide_host_alloc(d, hws, n_ports); if (host == NULL) { ret = -ENOMEM; - goto out; + goto out_free_bars; } host->dev[0] = &dev1->dev; @@ -576,7 +586,7 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, * do_ide_setup_pci_device() on the first device! */ if (ret < 0) - goto out; + goto out_free_bars; /* fixup IRQ */ if (ide_pci_is_in_compatibility_mode(pdev[i])) { @@ -589,6 +599,13 @@ int ide_pci_init_two(struct pci_dev *dev1, struct pci_dev *dev2, ret = ide_host_register(host, d, hws); if (ret) ide_host_free(host); + else + goto out; + +out_free_bars: + i = n_ports / 2; + while (i--) + pci_release_selected_regions(pdev[i], bars); out: return ret; } diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c index 30825bb9b8e9..8861c052155a 100644 --- a/drivers/infiniband/core/iwpm_msg.c +++ b/drivers/infiniband/core/iwpm_msg.c @@ -100,6 +100,8 @@ int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client) if (ret) goto pid_query_error; + nlmsg_end(skb, nlh); + pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n", __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name); @@ -170,6 +172,8 @@ int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR); if (ret) goto add_mapping_error; + + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); @@ -246,6 +250,8 @@ int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client) &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR); if (ret) goto query_mapping_error; + + nlmsg_end(skb, nlh); nlmsg_request->req_buffer = pm_msg; ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); @@ -308,6 +314,8 @@ int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client) if (ret) goto remove_mapping_error; + nlmsg_end(skb, nlh); + ret = rdma_nl_unicast_wait(skb, iwpm_user_pid); if (ret) { skb = NULL; /* skb is freed in the netlink send-op handling */ diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c index c81c55942626..3c4faadb8cdd 100644 --- a/drivers/infiniband/core/iwpm_util.c +++ b/drivers/infiniband/core/iwpm_util.c @@ -597,6 +597,9 @@ static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid) &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM); if (ret) goto mapinfo_num_error; + + nlmsg_end(skb, nlh); + ret = rdma_nl_unicast(skb, iwpm_pid); if (ret) { skb = NULL; @@ -678,6 +681,8 @@ int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid) if (ret) goto send_mapping_info_unlock; + nlmsg_end(skb, nlh); + iwpm_print_sockaddr(&map_info->local_sockaddr, "send_mapping_info: Local sockaddr:"); iwpm_print_sockaddr(&map_info->mapped_sockaddr, diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c index d1f5345f04f0..42ca5346777d 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c +++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c @@ -48,7 +48,7 @@ * @wqe: cqp wqe for header * @header: header for the cqp wqe */ -static inline void i40iw_insert_wqe_hdr(u64 *wqe, u64 header) +void i40iw_insert_wqe_hdr(u64 *wqe, u64 header) { wmb(); /* make sure WQE is populated before polarity is set */ set_64bit_val(wqe, 24, header); diff --git a/drivers/infiniband/hw/i40iw/i40iw_p.h b/drivers/infiniband/hw/i40iw/i40iw_p.h index e217a1259f57..5498ad01c280 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_p.h +++ b/drivers/infiniband/hw/i40iw/i40iw_p.h @@ -59,6 +59,8 @@ enum i40iw_status_code i40iw_sc_mr_fast_register(struct i40iw_sc_qp *qp, struct i40iw_fast_reg_stag_info *info, bool post_sq); +void i40iw_insert_wqe_hdr(u64 *wqe, u64 header); + /* HMC/FPM functions */ enum i40iw_status_code i40iw_sc_init_iw_hmc(struct i40iw_sc_dev *dev, u8 hmc_fn_id); diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c b/drivers/infiniband/hw/i40iw/i40iw_puda.c index c2cab20c4bc5..59f70676f0e0 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_puda.c +++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c @@ -123,12 +123,11 @@ static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx, get_64bit_val(wqe, 24, &offset24); offset24 = (offset24) ? 0 : LS_64(1, I40IWQPSQ_VALID); - set_64bit_val(wqe, 24, offset24); set_64bit_val(wqe, 0, buf->mem.pa); set_64bit_val(wqe, 8, LS_64(buf->mem.size, I40IWQPSQ_FRAG_LEN)); - set_64bit_val(wqe, 24, offset24); + i40iw_insert_wqe_hdr(wqe, offset24); } /** @@ -409,9 +408,7 @@ enum i40iw_status_code i40iw_puda_send(struct i40iw_sc_qp *qp, set_64bit_val(wqe, 8, LS_64(info->len, I40IWQPSQ_FRAG_LEN)); set_64bit_val(wqe, 16, header[0]); - /* Ensure all data is written before writing valid bit */ - wmb(); - set_64bit_val(wqe, 24, header[1]); + i40iw_insert_wqe_hdr(wqe, header[1]); i40iw_debug_buf(qp->dev, I40IW_DEBUG_PUDA, "PUDA SEND WQE", wqe, 32); i40iw_qp_post_wr(&qp->qp_uk); @@ -539,7 +536,7 @@ static enum i40iw_status_code i40iw_puda_qp_wqe(struct i40iw_sc_dev *dev, struct LS_64(2, I40IW_CQPSQ_QP_NEXTIWSTATE) | LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); - set_64bit_val(wqe, 24, header); + i40iw_insert_wqe_hdr(wqe, header); i40iw_debug_buf(cqp->dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, 32); i40iw_sc_cqp_post_sq(cqp); @@ -655,7 +652,7 @@ static enum i40iw_status_code i40iw_puda_cq_wqe(struct i40iw_sc_dev *dev, struct LS_64(1, I40IW_CQPSQ_CQ_ENCEQEMASK) | LS_64(1, I40IW_CQPSQ_CQ_CEQIDVALID) | LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID); - set_64bit_val(wqe, 24, header); + i40iw_insert_wqe_hdr(wqe, header); i40iw_debug_buf(dev, I40IW_DEBUG_PUDA, "PUDA CQE", wqe, I40IW_CQP_WQE_SIZE * 8); diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index 28b3d02d511b..62be0a41ad0b 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -826,12 +826,14 @@ static int i40iw_query_qp(struct ib_qp *ibqp, attr->cap.max_inline_data = I40IW_MAX_INLINE_DATA_SIZE; attr->cap.max_send_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; attr->cap.max_recv_sge = I40IW_MAX_WQ_FRAGMENT_COUNT; + attr->port_num = 1; init_attr->event_handler = iwqp->ibqp.event_handler; init_attr->qp_context = iwqp->ibqp.qp_context; init_attr->send_cq = iwqp->ibqp.send_cq; init_attr->recv_cq = iwqp->ibqp.recv_cq; init_attr->srq = iwqp->ibqp.srq; init_attr->cap = attr->cap; + init_attr->port_num = 1; return 0; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index d6fbad8f34aa..552f7bd4ecc3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4174,9 +4174,9 @@ err_bfreg: err_uar_page: mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar); -err_cnt: - mlx5_ib_cleanup_cong_debugfs(dev); err_cong: + mlx5_ib_cleanup_cong_debugfs(dev); +err_cnt: if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) mlx5_ib_dealloc_counters(dev); diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index b2bb42e2805d..254083b524bd 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -387,7 +387,7 @@ struct qedr_qp { u8 wqe_size; u8 smac[ETH_ALEN]; - u16 vlan_id; + u16 vlan; int rc; } *rqe_wr_id; diff --git a/drivers/infiniband/hw/qedr/qedr_cm.c b/drivers/infiniband/hw/qedr/qedr_cm.c index 4689e802b332..ad8965397cf7 100644 --- a/drivers/infiniband/hw/qedr/qedr_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_cm.c @@ -105,7 +105,7 @@ void qedr_ll2_complete_rx_packet(void *cxt, qp->rqe_wr_id[qp->rq.gsi_cons].rc = data->u.data_length_error ? -EINVAL : 0; - qp->rqe_wr_id[qp->rq.gsi_cons].vlan_id = data->vlan; + qp->rqe_wr_id[qp->rq.gsi_cons].vlan = data->vlan; /* note: length stands for data length i.e. GRH is excluded */ qp->rqe_wr_id[qp->rq.gsi_cons].sg_list[0].length = data->length.data_length; @@ -694,6 +694,7 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) struct qedr_cq *cq = get_qedr_cq(ibcq); struct qedr_qp *qp = dev->gsi_qp; unsigned long flags; + u16 vlan_id; int i = 0; spin_lock_irqsave(&cq->cq_lock, flags); @@ -712,9 +713,14 @@ int qedr_gsi_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) wc[i].wc_flags |= IB_WC_GRH | IB_WC_IP_CSUM_OK; ether_addr_copy(wc[i].smac, qp->rqe_wr_id[qp->rq.cons].smac); wc[i].wc_flags |= IB_WC_WITH_SMAC; - if (qp->rqe_wr_id[qp->rq.cons].vlan_id) { + + vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan & + VLAN_VID_MASK; + if (vlan_id) { wc[i].wc_flags |= IB_WC_WITH_VLAN; - wc[i].vlan_id = qp->rqe_wr_id[qp->rq.cons].vlan_id; + wc[i].vlan_id = vlan_id; + wc[i].sl = (qp->rqe_wr_id[qp->rq.cons].vlan & + VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; } qedr_inc_sw_cons(&qp->rq); diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c index 7d5286b05036..1841d0359bac 100644 --- a/drivers/md/bcache/closure.c +++ b/drivers/md/bcache/closure.c @@ -64,7 +64,7 @@ EXPORT_SYMBOL(closure_put); void __closure_wake_up(struct closure_waitlist *wait_list) { struct llist_node *list; - struct closure *cl; + struct closure *cl, *t; struct llist_node *reverse = NULL; list = llist_del_all(&wait_list->list); @@ -73,7 +73,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) reverse = llist_reverse_order(list); /* Then do the wakeups */ - llist_for_each_entry(cl, reverse, list) { + llist_for_each_entry_safe(cl, t, reverse, list) { closure_set_waiting(cl, 0); closure_sub(cl, CLOSURE_WAITING + 1); } diff --git a/drivers/misc/cxl/cxllib.c b/drivers/misc/cxl/cxllib.c index 5dba23ca2e5f..dc9bc1807fdf 100644 --- a/drivers/misc/cxl/cxllib.c +++ b/drivers/misc/cxl/cxllib.c @@ -219,8 +219,17 @@ int cxllib_handle_fault(struct mm_struct *mm, u64 addr, u64 size, u64 flags) down_read(&mm->mmap_sem); - for (dar = addr; dar < addr + size; dar += page_size) { - if (!vma || dar < vma->vm_start || dar > vma->vm_end) { + vma = find_vma(mm, addr); + if (!vma) { + pr_err("Can't find vma for addr %016llx\n", addr); + rc = -EFAULT; + goto out; + } + /* get the size of the pages allocated */ + page_size = vma_kernel_pagesize(vma); + + for (dar = (addr & ~(page_size - 1)); dar < (addr + size); dar += page_size) { + if (dar < vma->vm_start || dar >= vma->vm_end) { vma = find_vma(mm, addr); if (!vma) { pr_err("Can't find vma for addr %016llx\n", addr); diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index 29fc1e662891..2ad7b5c69156 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -1634,8 +1634,6 @@ static void mmc_blk_data_prep(struct mmc_queue *mq, struct mmc_queue_req *mqrq, } mqrq->areq.mrq = &brq->mrq; - - mmc_queue_bounce_pre(mqrq); } static void mmc_blk_rw_rq_prep(struct mmc_queue_req *mqrq, @@ -1829,7 +1827,6 @@ static void mmc_blk_issue_rw_rq(struct mmc_queue *mq, struct request *new_req) brq = &mq_rq->brq; old_req = mmc_queue_req_to_req(mq_rq); type = rq_data_dir(old_req) == READ ? MMC_BLK_READ : MMC_BLK_WRITE; - mmc_queue_bounce_post(mq_rq); switch (status) { case MMC_BLK_SUCCESS: diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index a7eb623f8daa..36217ad5e9b1 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1286,6 +1286,23 @@ out_err: return err; } +static void mmc_select_driver_type(struct mmc_card *card) +{ + int card_drv_type, drive_strength, drv_type; + + card_drv_type = card->ext_csd.raw_driver_strength | + mmc_driver_type_mask(0); + + drive_strength = mmc_select_drive_strength(card, + card->ext_csd.hs200_max_dtr, + card_drv_type, &drv_type); + + card->drive_strength = drive_strength; + + if (drv_type) + mmc_set_driver_type(card->host, drv_type); +} + static int mmc_select_hs400es(struct mmc_card *card) { struct mmc_host *host = card->host; @@ -1341,6 +1358,8 @@ static int mmc_select_hs400es(struct mmc_card *card) goto out_err; } + mmc_select_driver_type(card); + /* Switch card to HS400 */ val = EXT_CSD_TIMING_HS400 | card->drive_strength << EXT_CSD_DRV_STR_SHIFT; @@ -1374,23 +1393,6 @@ out_err: return err; } -static void mmc_select_driver_type(struct mmc_card *card) -{ - int card_drv_type, drive_strength, drv_type; - - card_drv_type = card->ext_csd.raw_driver_strength | - mmc_driver_type_mask(0); - - drive_strength = mmc_select_drive_strength(card, - card->ext_csd.hs200_max_dtr, - card_drv_type, &drv_type); - - card->drive_strength = drive_strength; - - if (drv_type) - mmc_set_driver_type(card->host, drv_type); -} - /* * For device supporting HS200 mode, the following sequence * should be done before executing the tuning process. diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index 74c663b1c0a7..0a4e77a5ba33 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -23,8 +23,6 @@ #include "core.h" #include "card.h" -#define MMC_QUEUE_BOUNCESZ 65536 - /* * Prepare a MMC request. This just filters out odd stuff. */ @@ -150,26 +148,6 @@ static void mmc_queue_setup_discard(struct request_queue *q, queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, q); } -static unsigned int mmc_queue_calc_bouncesz(struct mmc_host *host) -{ - unsigned int bouncesz = MMC_QUEUE_BOUNCESZ; - - if (host->max_segs != 1 || (host->caps & MMC_CAP_NO_BOUNCE_BUFF)) - return 0; - - if (bouncesz > host->max_req_size) - bouncesz = host->max_req_size; - if (bouncesz > host->max_seg_size) - bouncesz = host->max_seg_size; - if (bouncesz > host->max_blk_count * 512) - bouncesz = host->max_blk_count * 512; - - if (bouncesz <= 512) - return 0; - - return bouncesz; -} - /** * mmc_init_request() - initialize the MMC-specific per-request data * @q: the request queue @@ -184,26 +162,9 @@ static int mmc_init_request(struct request_queue *q, struct request *req, struct mmc_card *card = mq->card; struct mmc_host *host = card->host; - if (card->bouncesz) { - mq_rq->bounce_buf = kmalloc(card->bouncesz, gfp); - if (!mq_rq->bounce_buf) - return -ENOMEM; - if (card->bouncesz > 512) { - mq_rq->sg = mmc_alloc_sg(1, gfp); - if (!mq_rq->sg) - return -ENOMEM; - mq_rq->bounce_sg = mmc_alloc_sg(card->bouncesz / 512, - gfp); - if (!mq_rq->bounce_sg) - return -ENOMEM; - } - } else { - mq_rq->bounce_buf = NULL; - mq_rq->bounce_sg = NULL; - mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp); - if (!mq_rq->sg) - return -ENOMEM; - } + mq_rq->sg = mmc_alloc_sg(host->max_segs, gfp); + if (!mq_rq->sg) + return -ENOMEM; return 0; } @@ -212,13 +173,6 @@ static void mmc_exit_request(struct request_queue *q, struct request *req) { struct mmc_queue_req *mq_rq = req_to_mmc_queue_req(req); - /* It is OK to kfree(NULL) so this will be smooth */ - kfree(mq_rq->bounce_sg); - mq_rq->bounce_sg = NULL; - - kfree(mq_rq->bounce_buf); - mq_rq->bounce_buf = NULL; - kfree(mq_rq->sg); mq_rq->sg = NULL; } @@ -242,12 +196,6 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) limit = (u64)dma_max_pfn(mmc_dev(host)) << PAGE_SHIFT; - /* - * mmc_init_request() depends on card->bouncesz so it must be calculated - * before blk_init_allocated_queue() starts allocating requests. - */ - card->bouncesz = mmc_queue_calc_bouncesz(host); - mq->card = card; mq->queue = blk_alloc_queue(GFP_KERNEL); if (!mq->queue) @@ -271,17 +219,11 @@ int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, if (mmc_can_erase(card)) mmc_queue_setup_discard(mq->queue, card); - if (card->bouncesz) { - blk_queue_max_hw_sectors(mq->queue, card->bouncesz / 512); - blk_queue_max_segments(mq->queue, card->bouncesz / 512); - blk_queue_max_segment_size(mq->queue, card->bouncesz); - } else { - blk_queue_bounce_limit(mq->queue, limit); - blk_queue_max_hw_sectors(mq->queue, - min(host->max_blk_count, host->max_req_size / 512)); - blk_queue_max_segments(mq->queue, host->max_segs); - blk_queue_max_segment_size(mq->queue, host->max_seg_size); - } + blk_queue_bounce_limit(mq->queue, limit); + blk_queue_max_hw_sectors(mq->queue, + min(host->max_blk_count, host->max_req_size / 512)); + blk_queue_max_segments(mq->queue, host->max_segs); + blk_queue_max_segment_size(mq->queue, host->max_seg_size); sema_init(&mq->thread_sem, 1); @@ -370,56 +312,7 @@ void mmc_queue_resume(struct mmc_queue *mq) */ unsigned int mmc_queue_map_sg(struct mmc_queue *mq, struct mmc_queue_req *mqrq) { - unsigned int sg_len; - size_t buflen; - struct scatterlist *sg; struct request *req = mmc_queue_req_to_req(mqrq); - int i; - - if (!mqrq->bounce_buf) - return blk_rq_map_sg(mq->queue, req, mqrq->sg); - - sg_len = blk_rq_map_sg(mq->queue, req, mqrq->bounce_sg); - - mqrq->bounce_sg_len = sg_len; - - buflen = 0; - for_each_sg(mqrq->bounce_sg, sg, sg_len, i) - buflen += sg->length; - - sg_init_one(mqrq->sg, mqrq->bounce_buf, buflen); - - return 1; -} - -/* - * If writing, bounce the data to the buffer before the request - * is sent to the host driver - */ -void mmc_queue_bounce_pre(struct mmc_queue_req *mqrq) -{ - if (!mqrq->bounce_buf) - return; - - if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != WRITE) - return; - - sg_copy_to_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len, - mqrq->bounce_buf, mqrq->sg[0].length); -} - -/* - * If reading, bounce the data from the buffer after the request - * has been handled by the host driver - */ -void mmc_queue_bounce_post(struct mmc_queue_req *mqrq) -{ - if (!mqrq->bounce_buf) - return; - - if (rq_data_dir(mmc_queue_req_to_req(mqrq)) != READ) - return; - sg_copy_from_buffer(mqrq->bounce_sg, mqrq->bounce_sg_len, - mqrq->bounce_buf, mqrq->sg[0].length); + return blk_rq_map_sg(mq->queue, req, mqrq->sg); } diff --git a/drivers/mmc/core/queue.h b/drivers/mmc/core/queue.h index 04fc89360a7a..f18d3f656baa 100644 --- a/drivers/mmc/core/queue.h +++ b/drivers/mmc/core/queue.h @@ -49,9 +49,6 @@ enum mmc_drv_op { struct mmc_queue_req { struct mmc_blk_request brq; struct scatterlist *sg; - char *bounce_buf; - struct scatterlist *bounce_sg; - unsigned int bounce_sg_len; struct mmc_async_req areq; enum mmc_drv_op drv_op; int drv_op_result; @@ -81,11 +78,8 @@ extern int mmc_init_queue(struct mmc_queue *, struct mmc_card *, spinlock_t *, extern void mmc_cleanup_queue(struct mmc_queue *); extern void mmc_queue_suspend(struct mmc_queue *); extern void mmc_queue_resume(struct mmc_queue *); - extern unsigned int mmc_queue_map_sg(struct mmc_queue *, struct mmc_queue_req *); -extern void mmc_queue_bounce_pre(struct mmc_queue_req *); -extern void mmc_queue_bounce_post(struct mmc_queue_req *); extern int mmc_access_rpmb(struct mmc_queue *); diff --git a/drivers/mmc/host/cavium.c b/drivers/mmc/host/cavium.c index 27fb625cbcf3..fbd29f00fca0 100644 --- a/drivers/mmc/host/cavium.c +++ b/drivers/mmc/host/cavium.c @@ -1038,7 +1038,7 @@ int cvm_mmc_of_slot_probe(struct device *dev, struct cvm_mmc_host *host) */ mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | MMC_CAP_ERASE | MMC_CAP_CMD23 | MMC_CAP_POWER_OFF_CARD | - MMC_CAP_3_3V_DDR | MMC_CAP_NO_BOUNCE_BUFF; + MMC_CAP_3_3V_DDR; if (host->use_sg) mmc->max_segs = 16; diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index c885c2d4b904..85745ef179e2 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -531,8 +531,7 @@ static int meson_mmc_clk_init(struct meson_host *host) div->shift = __ffs(CLK_DIV_MASK); div->width = __builtin_popcountl(CLK_DIV_MASK); div->hw.init = &init; - div->flags = (CLK_DIVIDER_ONE_BASED | - CLK_DIVIDER_ROUND_CLOSEST); + div->flags = CLK_DIVIDER_ONE_BASED; clk = devm_clk_register(host->dev, &div->hw); if (WARN_ON(IS_ERR(clk))) @@ -717,6 +716,22 @@ static int meson_mmc_clk_phase_tuning(struct mmc_host *mmc, u32 opcode, static int meson_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct meson_host *host = mmc_priv(mmc); + int ret; + + /* + * If this is the initial tuning, try to get a sane Rx starting + * phase before doing the actual tuning. + */ + if (!mmc->doing_retune) { + ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk); + + if (ret) + return ret; + } + + ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->tx_clk); + if (ret) + return ret; return meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk); } @@ -746,6 +761,11 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) case MMC_POWER_UP: if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd); + + /* Reset phases */ + clk_set_phase(host->rx_clk, 0); + clk_set_phase(host->tx_clk, 270); + break; case MMC_POWER_ON: @@ -759,8 +779,6 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) host->vqmmc_enabled = true; } - /* Reset rx phase */ - clk_set_phase(host->rx_clk, 0); break; } diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 59ab194cb009..c763b404510f 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -702,11 +702,7 @@ static int pxamci_probe(struct platform_device *pdev) pxamci_init_ocr(host); - /* - * This architecture used to disable bounce buffers through its - * defconfig, now it is done at runtime as a host property. - */ - mmc->caps = MMC_CAP_NO_BOUNCE_BUFF; + mmc->caps = 0; host->cmdat = 0; if (!cpu_is_pxa25x()) { mmc->caps |= MMC_CAP_4_BIT_DATA | MMC_CAP_SDIO_IRQ; diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 2eec2e652c53..0842bbc2d7ad 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -466,6 +466,7 @@ static int xenon_probe(struct platform_device *pdev) { struct sdhci_pltfm_host *pltfm_host; struct sdhci_host *host; + struct xenon_priv *priv; int err; host = sdhci_pltfm_init(pdev, &sdhci_xenon_pdata, @@ -474,6 +475,7 @@ static int xenon_probe(struct platform_device *pdev) return PTR_ERR(host); pltfm_host = sdhci_priv(host); + priv = sdhci_pltfm_priv(pltfm_host); /* * Link Xenon specific mmc_host_ops function, @@ -491,9 +493,20 @@ static int xenon_probe(struct platform_device *pdev) if (err) goto free_pltfm; + priv->axi_clk = devm_clk_get(&pdev->dev, "axi"); + if (IS_ERR(priv->axi_clk)) { + err = PTR_ERR(priv->axi_clk); + if (err == -EPROBE_DEFER) + goto err_clk; + } else { + err = clk_prepare_enable(priv->axi_clk); + if (err) + goto err_clk; + } + err = mmc_of_parse(host->mmc); if (err) - goto err_clk; + goto err_clk_axi; sdhci_get_of_property(pdev); @@ -502,11 +515,11 @@ static int xenon_probe(struct platform_device *pdev) /* Xenon specific dt parse */ err = xenon_probe_dt(pdev); if (err) - goto err_clk; + goto err_clk_axi; err = xenon_sdhc_prepare(host); if (err) - goto err_clk; + goto err_clk_axi; pm_runtime_get_noresume(&pdev->dev); pm_runtime_set_active(&pdev->dev); @@ -527,6 +540,8 @@ remove_sdhc: pm_runtime_disable(&pdev->dev); pm_runtime_put_noidle(&pdev->dev); xenon_sdhc_unprepare(host); +err_clk_axi: + clk_disable_unprepare(priv->axi_clk); err_clk: clk_disable_unprepare(pltfm_host->clk); free_pltfm: @@ -538,6 +553,7 @@ static int xenon_remove(struct platform_device *pdev) { struct sdhci_host *host = platform_get_drvdata(pdev); struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct xenon_priv *priv = sdhci_pltfm_priv(pltfm_host); pm_runtime_get_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -546,7 +562,7 @@ static int xenon_remove(struct platform_device *pdev) sdhci_remove_host(host, 0); xenon_sdhc_unprepare(host); - + clk_disable_unprepare(priv->axi_clk); clk_disable_unprepare(pltfm_host->clk); sdhci_pltfm_free(pdev); diff --git a/drivers/mmc/host/sdhci-xenon.h b/drivers/mmc/host/sdhci-xenon.h index 2bc0510c0769..9994995c7c56 100644 --- a/drivers/mmc/host/sdhci-xenon.h +++ b/drivers/mmc/host/sdhci-xenon.h @@ -83,6 +83,7 @@ struct xenon_priv { unsigned char bus_width; unsigned char timing; unsigned int clock; + struct clk *axi_clk; int phy_type; /* diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c index 7aecc98d0a18..32025b990437 100644 --- a/drivers/net/dsa/bcm_sf2.c +++ b/drivers/net/dsa/bcm_sf2.c @@ -205,6 +205,19 @@ static int bcm_sf2_port_setup(struct dsa_switch *ds, int port, if (port == priv->moca_port) bcm_sf2_port_intr_enable(priv, port); + /* Set per-queue pause threshold to 32 */ + core_writel(priv, 32, CORE_TXQ_THD_PAUSE_QN_PORT(port)); + + /* Set ACB threshold to 24 */ + for (i = 0; i < SF2_NUM_EGRESS_QUEUES; i++) { + reg = acb_readl(priv, ACB_QUEUE_CFG(port * + SF2_NUM_EGRESS_QUEUES + i)); + reg &= ~XOFF_THRESHOLD_MASK; + reg |= 24; + acb_writel(priv, reg, ACB_QUEUE_CFG(port * + SF2_NUM_EGRESS_QUEUES + i)); + } + return b53_enable_port(ds, port, phy); } @@ -613,6 +626,20 @@ static void bcm_sf2_sw_fixed_link_update(struct dsa_switch *ds, int port, status->pause = 1; } +static void bcm_sf2_enable_acb(struct dsa_switch *ds) +{ + struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); + u32 reg; + + /* Enable ACB globally */ + reg = acb_readl(priv, ACB_CONTROL); + reg |= (ACB_FLUSH_MASK << ACB_FLUSH_SHIFT); + acb_writel(priv, reg, ACB_CONTROL); + reg &= ~(ACB_FLUSH_MASK << ACB_FLUSH_SHIFT); + reg |= ACB_EN | ACB_ALGORITHM; + acb_writel(priv, reg, ACB_CONTROL); +} + static int bcm_sf2_sw_suspend(struct dsa_switch *ds) { struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds); @@ -655,6 +682,8 @@ static int bcm_sf2_sw_resume(struct dsa_switch *ds) bcm_sf2_imp_setup(ds, port); } + bcm_sf2_enable_acb(ds); + return 0; } @@ -766,6 +795,7 @@ static int bcm_sf2_sw_setup(struct dsa_switch *ds) } bcm_sf2_sw_configure_vlan(ds); + bcm_sf2_enable_acb(ds); return 0; } diff --git a/drivers/net/dsa/bcm_sf2_regs.h b/drivers/net/dsa/bcm_sf2_regs.h index d8b8074a47b9..d1596dfca323 100644 --- a/drivers/net/dsa/bcm_sf2_regs.h +++ b/drivers/net/dsa/bcm_sf2_regs.h @@ -115,6 +115,24 @@ enum bcm_sf2_reg_offs { #define P7_IRQ_OFF 0 #define P_IRQ_OFF(x) ((6 - (x)) * P_NUM_IRQ) +/* Register set relative to 'ACB' */ +#define ACB_CONTROL 0x00 +#define ACB_EN (1 << 0) +#define ACB_ALGORITHM (1 << 1) +#define ACB_FLUSH_SHIFT 2 +#define ACB_FLUSH_MASK 0x3 + +#define ACB_QUEUE_0_CFG 0x08 +#define XOFF_THRESHOLD_MASK 0x7ff +#define XON_EN (1 << 11) +#define TOTAL_XOFF_THRESHOLD_SHIFT 12 +#define TOTAL_XOFF_THRESHOLD_MASK 0x7ff +#define TOTAL_XOFF_EN (1 << 23) +#define TOTAL_XON_EN (1 << 24) +#define PKTLEN_SHIFT 25 +#define PKTLEN_MASK 0x3f +#define ACB_QUEUE_CFG(x) (ACB_QUEUE_0_CFG + ((x) * 0x4)) + /* Register set relative to 'CORE' */ #define CORE_G_PCTL_PORT0 0x00000 #define CORE_G_PCTL_PORT(x) (CORE_G_PCTL_PORT0 + (x * 0x4)) @@ -237,6 +255,11 @@ enum bcm_sf2_reg_offs { #define CORE_PORT_VLAN_CTL_PORT(x) (0xc400 + ((x) * 0x8)) #define PORT_VLAN_CTRL_MASK 0x1ff +#define CORE_TXQ_THD_PAUSE_QN_PORT_0 0x2c80 +#define TXQ_PAUSE_THD_MASK 0x7ff +#define CORE_TXQ_THD_PAUSE_QN_PORT(x) (CORE_TXQ_THD_PAUSE_QN_PORT_0 + \ + (x) * 0x8) + #define CORE_DEFAULT_1Q_TAG_P(x) (0xd040 + ((x) * 8)) #define CFI_SHIFT 12 #define PRI_SHIFT 13 diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index d55051abf4ed..3a3f4f7ba364 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -110,13 +110,6 @@ static void dsa_loop_get_ethtool_stats(struct dsa_switch *ds, int port, data[i] = ps->ports[port].mib[i].val; } -static int dsa_loop_set_addr(struct dsa_switch *ds, u8 *addr) -{ - dev_dbg(ds->dev, "%s\n", __func__); - - return 0; -} - static int dsa_loop_phy_read(struct dsa_switch *ds, int port, int regnum) { struct dsa_loop_priv *ps = ds->priv; @@ -263,7 +256,6 @@ static const struct dsa_switch_ops dsa_loop_driver = { .get_strings = dsa_loop_get_strings, .get_ethtool_stats = dsa_loop_get_ethtool_stats, .get_sset_count = dsa_loop_get_sset_count, - .set_addr = dsa_loop_set_addr, .phy_read = dsa_loop_phy_read, .phy_write = dsa_loop_phy_write, .port_bridge_join = dsa_loop_port_bridge_join, diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 07355db2ad81..fecfe1fe67ea 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -18,6 +18,7 @@ #include <linux/mutex.h> #include <linux/mii.h> #include <linux/phy.h> +#include <linux/if_bridge.h> #include "lan9303.h" @@ -146,6 +147,7 @@ # define LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 (0) # define LAN9303_SWE_PORT_STATE_LEARNING_PORT0 BIT(1) # define LAN9303_SWE_PORT_STATE_BLOCKING_PORT0 BIT(0) +# define LAN9303_SWE_PORT_STATE_DISABLED_PORT0 (3) #define LAN9303_SWE_PORT_MIRROR 0x1846 # define LAN9303_SWE_PORT_MIRROR_SNIFF_ALL BIT(8) # define LAN9303_SWE_PORT_MIRROR_SNIFFER_PORT2 BIT(7) @@ -156,7 +158,9 @@ # define LAN9303_SWE_PORT_MIRROR_MIRRORED_PORT0 BIT(2) # define LAN9303_SWE_PORT_MIRROR_ENABLE_RX_MIRRORING BIT(1) # define LAN9303_SWE_PORT_MIRROR_ENABLE_TX_MIRRORING BIT(0) +# define LAN9303_SWE_PORT_MIRROR_DISABLED 0 #define LAN9303_SWE_INGRESS_PORT_TYPE 0x1847 +#define LAN9303_SWE_INGRESS_PORT_TYPE_VLAN 3 #define LAN9303_BM_CFG 0x1c00 #define LAN9303_BM_EGRSS_PORT_TYPE 0x1c0c # define LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT2 (BIT(17) | BIT(16)) @@ -510,11 +514,30 @@ static int lan9303_enable_processing_port(struct lan9303 *chip, LAN9303_MAC_TX_CFG_X_TX_ENABLE); } +/* forward special tagged packets from port 0 to port 1 *or* port 2 */ +static int lan9303_setup_tagging(struct lan9303 *chip) +{ + int ret; + u32 val; + /* enable defining the destination port via special VLAN tagging + * for port 0 + */ + ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE, + LAN9303_SWE_INGRESS_PORT_TYPE_VLAN); + if (ret) + return ret; + + /* tag incoming packets at port 1 and 2 on their way to port 0 to be + * able to discover their source port + */ + val = LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0; + return lan9303_write_switch_reg(chip, LAN9303_BM_EGRSS_PORT_TYPE, val); +} + /* We want a special working switch: * - do not forward packets between port 1 and 2 * - forward everything from port 1 to port 0 * - forward everything from port 2 to port 0 - * - forward special tagged packets from port 0 to port 1 *or* port 2 */ static int lan9303_separate_ports(struct lan9303 *chip) { @@ -529,22 +552,6 @@ static int lan9303_separate_ports(struct lan9303 *chip) if (ret) return ret; - /* enable defining the destination port via special VLAN tagging - * for port 0 - */ - ret = lan9303_write_switch_reg(chip, LAN9303_SWE_INGRESS_PORT_TYPE, - 0x03); - if (ret) - return ret; - - /* tag incoming packets at port 1 and 2 on their way to port 0 to be - * able to discover their source port - */ - ret = lan9303_write_switch_reg(chip, LAN9303_BM_EGRSS_PORT_TYPE, - LAN9303_BM_EGRSS_PORT_TYPE_SPECIAL_TAG_PORT0); - if (ret) - return ret; - /* prevent port 1 and 2 from forwarding packets by their own */ return lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE, LAN9303_SWE_PORT_STATE_FORWARDING_PORT0 | @@ -552,6 +559,16 @@ static int lan9303_separate_ports(struct lan9303 *chip) LAN9303_SWE_PORT_STATE_BLOCKING_PORT2); } +static void lan9303_bridge_ports(struct lan9303 *chip) +{ + /* ports bridged: remove mirroring */ + lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_MIRROR, + LAN9303_SWE_PORT_MIRROR_DISABLED); + + lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE, + chip->swe_port_state); +} + static int lan9303_handle_reset(struct lan9303 *chip) { if (!chip->reset_gpio) @@ -644,6 +661,10 @@ static int lan9303_setup(struct dsa_switch *ds) return -EINVAL; } + ret = lan9303_setup_tagging(chip); + if (ret) + dev_err(chip->dev, "failed to setup port tagging %d\n", ret); + ret = lan9303_separate_ports(chip); if (ret) dev_err(chip->dev, "failed to separate ports %d\n", ret); @@ -836,6 +857,72 @@ static void lan9303_port_disable(struct dsa_switch *ds, int port, } } +static int lan9303_port_bridge_join(struct dsa_switch *ds, int port, + struct net_device *br) +{ + struct lan9303 *chip = ds->priv; + + dev_dbg(chip->dev, "%s(port %d)\n", __func__, port); + if (ds->ports[1].bridge_dev == ds->ports[2].bridge_dev) { + lan9303_bridge_ports(chip); + chip->is_bridged = true; /* unleash stp_state_set() */ + } + + return 0; +} + +static void lan9303_port_bridge_leave(struct dsa_switch *ds, int port, + struct net_device *br) +{ + struct lan9303 *chip = ds->priv; + + dev_dbg(chip->dev, "%s(port %d)\n", __func__, port); + if (chip->is_bridged) { + lan9303_separate_ports(chip); + chip->is_bridged = false; + } +} + +static void lan9303_port_stp_state_set(struct dsa_switch *ds, int port, + u8 state) +{ + int portmask, portstate; + struct lan9303 *chip = ds->priv; + + dev_dbg(chip->dev, "%s(port %d, state %d)\n", + __func__, port, state); + + switch (state) { + case BR_STATE_DISABLED: + portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0; + break; + case BR_STATE_BLOCKING: + case BR_STATE_LISTENING: + portstate = LAN9303_SWE_PORT_STATE_BLOCKING_PORT0; + break; + case BR_STATE_LEARNING: + portstate = LAN9303_SWE_PORT_STATE_LEARNING_PORT0; + break; + case BR_STATE_FORWARDING: + portstate = LAN9303_SWE_PORT_STATE_FORWARDING_PORT0; + break; + default: + portstate = LAN9303_SWE_PORT_STATE_DISABLED_PORT0; + dev_err(chip->dev, "unknown stp state: port %d, state %d\n", + port, state); + } + + portmask = 0x3 << (port * 2); + portstate <<= (port * 2); + + chip->swe_port_state = (chip->swe_port_state & ~portmask) | portstate; + + if (chip->is_bridged) + lan9303_write_switch_reg(chip, LAN9303_SWE_PORT_STATE, + chip->swe_port_state); + /* else: touching SWE_PORT_STATE would break port separation */ +} + static const struct dsa_switch_ops lan9303_switch_ops = { .get_tag_protocol = lan9303_get_tag_protocol, .setup = lan9303_setup, @@ -847,6 +934,9 @@ static const struct dsa_switch_ops lan9303_switch_ops = { .get_sset_count = lan9303_get_sset_count, .port_enable = lan9303_port_enable, .port_disable = lan9303_port_disable, + .port_bridge_join = lan9303_port_bridge_join, + .port_bridge_leave = lan9303_port_bridge_leave, + .port_stp_state_set = lan9303_port_stp_state_set, }; static int lan9303_register_switch(struct lan9303 *chip) diff --git a/drivers/net/dsa/lan9303.h b/drivers/net/dsa/lan9303.h index 4d8be555ff4d..68ecd544b658 100644 --- a/drivers/net/dsa/lan9303.h +++ b/drivers/net/dsa/lan9303.h @@ -21,6 +21,8 @@ struct lan9303 { struct dsa_switch *ds; struct mutex indirect_mutex; /* protect indexed register access */ const struct lan9303_phy_ops *ops; + bool is_bridged; /* true if port 1 and 2 are bridged */ + u32 swe_port_state; /* remember SWE_PORT_STATE while not bridged */ }; extern const struct regmap_access_table lan9303_register_set; diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c index 621cdc46ad81..6173be889d95 100644 --- a/drivers/net/dsa/mv88e6060.c +++ b/drivers/net/dsa/mv88e6060.c @@ -9,6 +9,7 @@ */ #include <linux/delay.h> +#include <linux/etherdevice.h> #include <linux/jiffies.h> #include <linux/list.h> #include <linux/module.h> @@ -188,6 +189,27 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p) return 0; } +static int mv88e6060_setup_addr(struct dsa_switch *ds) +{ + u8 addr[ETH_ALEN]; + u16 val; + + eth_random_addr(addr); + + val = addr[0] << 8 | addr[1]; + + /* The multicast bit is always transmitted as a zero, so the switch uses + * bit 8 for "DiffAddr", where 0 means all ports transmit the same SA. + */ + val &= 0xfeff; + + REG_WRITE(REG_GLOBAL, GLOBAL_MAC_01, val); + REG_WRITE(REG_GLOBAL, GLOBAL_MAC_23, (addr[2] << 8) | addr[3]); + REG_WRITE(REG_GLOBAL, GLOBAL_MAC_45, (addr[4] << 8) | addr[5]); + + return 0; +} + static int mv88e6060_setup(struct dsa_switch *ds) { int ret; @@ -203,6 +225,10 @@ static int mv88e6060_setup(struct dsa_switch *ds) if (ret < 0) return ret; + ret = mv88e6060_setup_addr(ds); + if (ret < 0) + return ret; + for (i = 0; i < MV88E6060_PORTS; i++) { ret = mv88e6060_setup_port(ds, i); if (ret < 0) @@ -212,16 +238,6 @@ static int mv88e6060_setup(struct dsa_switch *ds) return 0; } -static int mv88e6060_set_addr(struct dsa_switch *ds, u8 *addr) -{ - /* Use the same MAC Address as FD Pause frames for all ports */ - REG_WRITE(REG_GLOBAL, GLOBAL_MAC_01, (addr[0] << 9) | addr[1]); - REG_WRITE(REG_GLOBAL, GLOBAL_MAC_23, (addr[2] << 8) | addr[3]); - REG_WRITE(REG_GLOBAL, GLOBAL_MAC_45, (addr[4] << 8) | addr[5]); - - return 0; -} - static int mv88e6060_port_to_phy_addr(int port) { if (port >= 0 && port < MV88E6060_PORTS) @@ -256,7 +272,6 @@ static const struct dsa_switch_ops mv88e6060_switch_ops = { .get_tag_protocol = mv88e6060_get_tag_protocol, .probe = mv88e6060_drv_probe, .setup = mv88e6060_setup, - .set_addr = mv88e6060_set_addr, .phy_read = mv88e6060_phy_read, .phy_write = mv88e6060_phy_write, }; diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index d74c7335c512..76cf383dcf90 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -932,6 +932,19 @@ static int mv88e6xxx_irl_setup(struct mv88e6xxx_chip *chip) return 0; } +static int mv88e6xxx_mac_setup(struct mv88e6xxx_chip *chip) +{ + if (chip->info->ops->set_switch_mac) { + u8 addr[ETH_ALEN]; + + eth_random_addr(addr); + + return chip->info->ops->set_switch_mac(chip, addr); + } + + return 0; +} + static int mv88e6xxx_pvt_map(struct mv88e6xxx_chip *chip, int dev, int port) { u16 pvlan = 0; @@ -2013,6 +2026,10 @@ static int mv88e6xxx_setup(struct dsa_switch *ds) if (err) goto unlock; + err = mv88e6xxx_mac_setup(chip); + if (err) + goto unlock; + err = mv88e6xxx_phy_setup(chip); if (err) goto unlock; @@ -2043,21 +2060,6 @@ unlock: return err; } -static int mv88e6xxx_set_addr(struct dsa_switch *ds, u8 *addr) -{ - struct mv88e6xxx_chip *chip = ds->priv; - int err; - - if (!chip->info->ops->set_switch_mac) - return -EOPNOTSUPP; - - mutex_lock(&chip->reg_lock); - err = chip->info->ops->set_switch_mac(chip, addr); - mutex_unlock(&chip->reg_lock); - - return err; -} - static int mv88e6xxx_mdio_read(struct mii_bus *bus, int phy, int reg) { struct mv88e6xxx_mdio_bus *mdio_bus = bus->priv; @@ -3785,7 +3787,6 @@ static const struct dsa_switch_ops mv88e6xxx_switch_ops = { .probe = mv88e6xxx_drv_probe, .get_tag_protocol = mv88e6xxx_get_tag_protocol, .setup = mv88e6xxx_setup, - .set_addr = mv88e6xxx_set_addr, .adjust_link = mv88e6xxx_adjust_link, .get_strings = mv88e6xxx_get_strings, .get_ethtool_stats = mv88e6xxx_get_ethtool_stats, diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index 83eec9a8c275..dafc26690555 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1416,9 +1416,20 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv, tdma_writel(priv, 0, TDMA_DESC_RING_COUNT(index)); tdma_writel(priv, 1, TDMA_DESC_RING_INTR_CONTROL(index)); tdma_writel(priv, 0, TDMA_DESC_RING_PROD_CONS_INDEX(index)); - tdma_writel(priv, RING_IGNORE_STATUS, TDMA_DESC_RING_MAPPING(index)); + + /* Configure QID and port mapping */ + reg = tdma_readl(priv, TDMA_DESC_RING_MAPPING(index)); + reg &= ~(RING_QID_MASK | RING_PORT_ID_MASK << RING_PORT_ID_SHIFT); + reg |= ring->switch_queue & RING_QID_MASK; + reg |= ring->switch_port << RING_PORT_ID_SHIFT; + tdma_writel(priv, reg, TDMA_DESC_RING_MAPPING(index)); tdma_writel(priv, 0, TDMA_DESC_RING_PCP_DEI_VID(index)); + /* Enable ACB algorithm 2 */ + reg = tdma_readl(priv, TDMA_CONTROL); + reg |= tdma_control_bit(priv, ACB_ALGO); + tdma_writel(priv, reg, TDMA_CONTROL); + /* Do not use tdma_control_bit() here because TSB_SWAP1 collides * with the original definition of ACB_ALGO */ @@ -1447,8 +1458,9 @@ static int bcm_sysport_init_tx_ring(struct bcm_sysport_priv *priv, napi_enable(&ring->napi); netif_dbg(priv, hw, priv->netdev, - "TDMA cfg, size=%d, desc_cpu=%p\n", - ring->size, ring->desc_cpu); + "TDMA cfg, size=%d, desc_cpu=%p switch q=%d,port=%d\n", + ring->size, ring->desc_cpu, ring->switch_queue, + ring->switch_port); return 0; } @@ -2011,6 +2023,92 @@ static const struct ethtool_ops bcm_sysport_ethtool_ops = { .set_link_ksettings = phy_ethtool_set_link_ksettings, }; +static u16 bcm_sysport_select_queue(struct net_device *dev, struct sk_buff *skb, + void *accel_priv, + select_queue_fallback_t fallback) +{ + struct bcm_sysport_priv *priv = netdev_priv(dev); + u16 queue = skb_get_queue_mapping(skb); + struct bcm_sysport_tx_ring *tx_ring; + unsigned int q, port; + + if (!netdev_uses_dsa(dev)) + return fallback(dev, skb); + + /* DSA tagging layer will have configured the correct queue */ + q = BRCM_TAG_GET_QUEUE(queue); + port = BRCM_TAG_GET_PORT(queue); + tx_ring = priv->ring_map[q + port * priv->per_port_num_tx_queues]; + + return tx_ring->index; +} + +static int bcm_sysport_map_queues(struct net_device *dev, + struct dsa_notifier_register_info *info) +{ + struct bcm_sysport_priv *priv = netdev_priv(dev); + struct bcm_sysport_tx_ring *ring; + struct net_device *slave_dev; + unsigned int num_tx_queues; + unsigned int q, start, port; + + /* We can't be setting up queue inspection for non directly attached + * switches + */ + if (info->switch_number) + return 0; + + port = info->port_number; + slave_dev = info->info.dev; + + /* On SYSTEMPORT Lite we have twice as less queues, so we cannot do a + * 1:1 mapping, we can only do a 2:1 mapping. By reducing the number of + * per-port (slave_dev) network devices queue, we achieve just that. + * This need to happen now before any slave network device is used such + * it accurately reflects the number of real TX queues. + */ + if (priv->is_lite) + netif_set_real_num_tx_queues(slave_dev, + slave_dev->num_tx_queues / 2); + num_tx_queues = slave_dev->real_num_tx_queues; + + if (priv->per_port_num_tx_queues && + priv->per_port_num_tx_queues != num_tx_queues) + netdev_warn(slave_dev, "asymetric number of per-port queues\n"); + + priv->per_port_num_tx_queues = num_tx_queues; + + start = find_first_zero_bit(&priv->queue_bitmap, dev->num_tx_queues); + for (q = 0; q < num_tx_queues; q++) { + ring = &priv->tx_rings[q + start]; + + /* Just remember the mapping actual programming done + * during bcm_sysport_init_tx_ring + */ + ring->switch_queue = q; + ring->switch_port = port; + priv->ring_map[q + port * num_tx_queues] = ring; + + /* Set all queues as being used now */ + set_bit(q + start, &priv->queue_bitmap); + } + + return 0; +} + +static int bcm_sysport_dsa_notifier(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct dsa_notifier_register_info *info; + + if (event != DSA_PORT_REGISTER) + return NOTIFY_DONE; + + info = ptr; + + return notifier_from_errno(bcm_sysport_map_queues(info->master, info)); +} + static const struct net_device_ops bcm_sysport_netdev_ops = { .ndo_start_xmit = bcm_sysport_xmit, .ndo_tx_timeout = bcm_sysport_tx_timeout, @@ -2023,6 +2121,7 @@ static const struct net_device_ops bcm_sysport_netdev_ops = { .ndo_poll_controller = bcm_sysport_poll_controller, #endif .ndo_get_stats64 = bcm_sysport_get_stats64, + .ndo_select_queue = bcm_sysport_select_queue, }; #define REV_FMT "v%2x.%02x" @@ -2172,10 +2271,18 @@ static int bcm_sysport_probe(struct platform_device *pdev) u64_stats_init(&priv->syncp); + priv->dsa_notifier.notifier_call = bcm_sysport_dsa_notifier; + + ret = register_dsa_notifier(&priv->dsa_notifier); + if (ret) { + dev_err(&pdev->dev, "failed to register DSA notifier\n"); + goto err_deregister_fixed_link; + } + ret = register_netdev(dev); if (ret) { dev_err(&pdev->dev, "failed to register net_device\n"); - goto err_deregister_fixed_link; + goto err_deregister_notifier; } priv->rev = topctrl_readl(priv, REV_CNTL) & REV_MASK; @@ -2188,6 +2295,8 @@ static int bcm_sysport_probe(struct platform_device *pdev) return 0; +err_deregister_notifier: + unregister_dsa_notifier(&priv->dsa_notifier); err_deregister_fixed_link: if (of_phy_is_fixed_link(dn)) of_phy_deregister_fixed_link(dn); @@ -2199,11 +2308,13 @@ err_free_netdev: static int bcm_sysport_remove(struct platform_device *pdev) { struct net_device *dev = dev_get_drvdata(&pdev->dev); + struct bcm_sysport_priv *priv = netdev_priv(dev); struct device_node *dn = pdev->dev.of_node; /* Not much to do, ndo_close has been called * and we use managed allocations */ + unregister_dsa_notifier(&priv->dsa_notifier); unregister_netdev(dev); if (of_phy_is_fixed_link(dn)) of_phy_deregister_fixed_link(dn); diff --git a/drivers/net/ethernet/broadcom/bcmsysport.h b/drivers/net/ethernet/broadcom/bcmsysport.h index 82e401df199e..82f70a6783cb 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.h +++ b/drivers/net/ethernet/broadcom/bcmsysport.h @@ -404,7 +404,7 @@ struct bcm_rsb { #define RING_CONS_INDEX_MASK 0xffff #define RING_MAPPING 0x14 -#define RING_QID_MASK 0x3 +#define RING_QID_MASK 0x7 #define RING_PORT_ID_SHIFT 3 #define RING_PORT_ID_MASK 0x7 #define RING_IGNORE_STATUS (1 << 6) @@ -712,6 +712,8 @@ struct bcm_sysport_tx_ring { struct bcm_sysport_priv *priv; /* private context backpointer */ unsigned long packets; /* packets statistics */ unsigned long bytes; /* bytes statistics */ + unsigned int switch_queue; /* switch port queue number */ + unsigned int switch_port; /* switch port queue number */ }; /* Driver private structure */ @@ -765,5 +767,12 @@ struct bcm_sysport_priv { /* For atomic update generic 64bit value on 32bit Machine */ struct u64_stats_sync syncp; + + /* map information between switch port queues and local queues */ + struct notifier_block dsa_notifier; + unsigned int per_port_num_tx_queues; + unsigned long queue_bitmap; + struct bcm_sysport_tx_ring *ring_map[DSA_MAX_PORTS * 8]; + }; #endif /* __BCM_SYSPORT_H */ diff --git a/drivers/net/ethernet/broadcom/bgmac-bcma.c b/drivers/net/ethernet/broadcom/bgmac-bcma.c index 6322594ab260..6fe074c1588b 100644 --- a/drivers/net/ethernet/broadcom/bgmac-bcma.c +++ b/drivers/net/ethernet/broadcom/bgmac-bcma.c @@ -184,13 +184,19 @@ static int bgmac_probe(struct bcma_device *core) if (!bgmac_is_bcm4707_family(core) && !(ci->id == BCMA_CHIP_ID_BCM53573 && core->core_unit == 1)) { + struct phy_device *phydev; + mii_bus = bcma_mdio_mii_register(bgmac); if (IS_ERR(mii_bus)) { err = PTR_ERR(mii_bus); goto err; } - bgmac->mii_bus = mii_bus; + + phydev = mdiobus_get_phy(bgmac->mii_bus, bgmac->phyaddr); + if (ci->id == BCMA_CHIP_ID_BCM53573 && phydev && + (phydev->drv->phy_id & phydev->drv->phy_id_mask) == PHY_ID_BCM54210E) + phydev->dev_flags |= PHY_BRCM_EN_MASTER_MODE; } if (core->bus->hosttype == BCMA_HOSTTYPE_PCI) { diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c index 9ca994d0bab6..3591077a5f6b 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c @@ -1074,11 +1074,6 @@ static void bnx2x_vf_set_bars(struct bnx2x *bp, struct bnx2x_virtf *vf) } } -static int bnx2x_ari_enabled(struct pci_dev *dev) -{ - return dev->bus->self && dev->bus->self->ari_enabled; -} - static int bnx2x_get_vf_igu_cam_info(struct bnx2x *bp) { @@ -1212,7 +1207,7 @@ int bnx2x_iov_init_one(struct bnx2x *bp, int int_mode_param, err = -EIO; /* verify ari is enabled */ - if (!bnx2x_ari_enabled(bp->pdev)) { + if (!pci_ari_enabled(bp->pdev->bus)) { BNX2X_ERR("ARI not supported (check pci bridge ARI forwarding), SRIOV can not be enabled\n"); return 0; } diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile index 4f0cb8e1ffc0..457201f409a7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/Makefile +++ b/drivers/net/ethernet/broadcom/bnxt/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_BNXT) += bnxt_en.o -bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o bnxt_tc.o +bnxt_en-y := bnxt.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_vfr.o +bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c index 7dd3d131043a..4730c048ed9b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_tc.c @@ -23,8 +23,6 @@ #include "bnxt_tc.h" #include "bnxt_vfr.h" -#ifdef CONFIG_BNXT_FLOWER_OFFLOAD - #define BNXT_FID_INVALID 0xffff #define VLAN_TCI(vid, prio) ((vid) | ((prio) << VLAN_PRIO_SHIFT)) @@ -833,6 +831,3 @@ void bnxt_shutdown_tc(struct bnxt *bp) rhashtable_destroy(&tc_info->flow_table); rhashtable_destroy(&tc_info->l2_table); } - -#else -#endif diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index d68478afccbf..71989e180289 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -566,8 +566,10 @@ static inline bool nicvf_xdp_rx(struct nicvf *nic, struct bpf_prog *prog, return true; default: bpf_warn_invalid_xdp_action(action); + /* fall through */ case XDP_ABORTED: trace_xdp_exception(nic->netdev, prog, action); + /* fall through */ case XDP_DROP: /* Check if it's a recycled page, if not * unmap the DMA mapping. diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile b/drivers/net/ethernet/chelsio/cxgb4/Makefile index fecd7aab673b..70d454379996 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/Makefile +++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_CHELSIO_T4) += cxgb4.o cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o \ cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o \ - cxgb4_ptp.o cxgb4_tc_flower.o + cxgb4_ptp.o cxgb4_tc_flower.o cxgb4_cudbg.o \ + cudbg_common.o cudbg_lib.o cxgb4-$(CONFIG_CHELSIO_T4_DCB) += cxgb4_dcb.o cxgb4-$(CONFIG_CHELSIO_T4_FCOE) += cxgb4_fcoe.o cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c new file mode 100644 index 000000000000..f78ba1743b5a --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_common.c @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#include "cxgb4.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" + +int cudbg_get_buff(struct cudbg_buffer *pdbg_buff, u32 size, + struct cudbg_buffer *pin_buff) +{ + u32 offset; + + offset = pdbg_buff->offset; + if (offset + size > pdbg_buff->size) + return CUDBG_STATUS_NO_MEM; + + pin_buff->data = (char *)pdbg_buff->data + offset; + pin_buff->offset = offset; + pin_buff->size = size; + pdbg_buff->size -= size; + return 0; +} + +void cudbg_put_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pdbg_buff) +{ + pdbg_buff->size += pin_buff->size; + pin_buff->data = NULL; + pin_buff->offset = 0; + pin_buff->size = 0; +} + +void cudbg_update_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pout_buff) +{ + /* We already write to buffer provided by ethool, so just + * increment offset to next free space. + */ + pout_buff->offset += pin_buff->size; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h new file mode 100644 index 000000000000..d7f3392f618f --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_entity.h @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CUDBG_ENTITY_H__ +#define __CUDBG_ENTITY_H__ + +#define EDC0_FLAG 3 +#define EDC1_FLAG 4 + +struct card_mem { + u16 size_edc0; + u16 size_edc1; + u16 mem_flag; +}; + +struct cudbg_mbox_log { + struct mbox_cmd entry; + u32 hi[MBOX_LEN / 8]; + u32 lo[MBOX_LEN / 8]; +}; + +struct ireg_field { + u32 ireg_addr; + u32 ireg_data; + u32 ireg_local_offset; + u32 ireg_offset_range; +}; + +struct ireg_buf { + struct ireg_field tp_pio; + u32 outbuf[32]; +}; + +#define IREG_NUM_ELEM 4 + +static const u32 t6_tp_pio_array[][IREG_NUM_ELEM] = { + {0x7e40, 0x7e44, 0x020, 28}, /* t6_tp_pio_regs_20_to_3b */ + {0x7e40, 0x7e44, 0x040, 10}, /* t6_tp_pio_regs_40_to_49 */ + {0x7e40, 0x7e44, 0x050, 10}, /* t6_tp_pio_regs_50_to_59 */ + {0x7e40, 0x7e44, 0x060, 14}, /* t6_tp_pio_regs_60_to_6d */ + {0x7e40, 0x7e44, 0x06F, 1}, /* t6_tp_pio_regs_6f */ + {0x7e40, 0x7e44, 0x070, 6}, /* t6_tp_pio_regs_70_to_75 */ + {0x7e40, 0x7e44, 0x130, 18}, /* t6_tp_pio_regs_130_to_141 */ + {0x7e40, 0x7e44, 0x145, 19}, /* t6_tp_pio_regs_145_to_157 */ + {0x7e40, 0x7e44, 0x160, 1}, /* t6_tp_pio_regs_160 */ + {0x7e40, 0x7e44, 0x230, 25}, /* t6_tp_pio_regs_230_to_248 */ + {0x7e40, 0x7e44, 0x24a, 3}, /* t6_tp_pio_regs_24c */ + {0x7e40, 0x7e44, 0x8C0, 1} /* t6_tp_pio_regs_8c0 */ +}; + +static const u32 t5_tp_pio_array[][IREG_NUM_ELEM] = { + {0x7e40, 0x7e44, 0x020, 28}, /* t5_tp_pio_regs_20_to_3b */ + {0x7e40, 0x7e44, 0x040, 19}, /* t5_tp_pio_regs_40_to_52 */ + {0x7e40, 0x7e44, 0x054, 2}, /* t5_tp_pio_regs_54_to_55 */ + {0x7e40, 0x7e44, 0x060, 13}, /* t5_tp_pio_regs_60_to_6c */ + {0x7e40, 0x7e44, 0x06F, 1}, /* t5_tp_pio_regs_6f */ + {0x7e40, 0x7e44, 0x120, 4}, /* t5_tp_pio_regs_120_to_123 */ + {0x7e40, 0x7e44, 0x12b, 2}, /* t5_tp_pio_regs_12b_to_12c */ + {0x7e40, 0x7e44, 0x12f, 21}, /* t5_tp_pio_regs_12f_to_143 */ + {0x7e40, 0x7e44, 0x145, 19}, /* t5_tp_pio_regs_145_to_157 */ + {0x7e40, 0x7e44, 0x230, 25}, /* t5_tp_pio_regs_230_to_248 */ + {0x7e40, 0x7e44, 0x8C0, 1} /* t5_tp_pio_regs_8c0 */ +}; + +static const u32 t6_tp_tm_pio_array[][IREG_NUM_ELEM] = { + {0x7e18, 0x7e1c, 0x0, 12} +}; + +static const u32 t5_tp_tm_pio_array[][IREG_NUM_ELEM] = { + {0x7e18, 0x7e1c, 0x0, 12} +}; + +static const u32 t6_tp_mib_index_array[6][IREG_NUM_ELEM] = { + {0x7e50, 0x7e54, 0x0, 13}, + {0x7e50, 0x7e54, 0x10, 6}, + {0x7e50, 0x7e54, 0x18, 21}, + {0x7e50, 0x7e54, 0x30, 32}, + {0x7e50, 0x7e54, 0x50, 22}, + {0x7e50, 0x7e54, 0x68, 12} +}; + +static const u32 t5_tp_mib_index_array[9][IREG_NUM_ELEM] = { + {0x7e50, 0x7e54, 0x0, 13}, + {0x7e50, 0x7e54, 0x10, 6}, + {0x7e50, 0x7e54, 0x18, 8}, + {0x7e50, 0x7e54, 0x20, 13}, + {0x7e50, 0x7e54, 0x30, 16}, + {0x7e50, 0x7e54, 0x40, 16}, + {0x7e50, 0x7e54, 0x50, 16}, + {0x7e50, 0x7e54, 0x60, 6}, + {0x7e50, 0x7e54, 0x68, 4} +}; + +static const u32 t5_sge_dbg_index_array[2][IREG_NUM_ELEM] = { + {0x10cc, 0x10d0, 0x0, 16}, + {0x10cc, 0x10d4, 0x0, 16}, +}; + +static const u32 t5_pcie_pdbg_array[][IREG_NUM_ELEM] = { + {0x5a04, 0x5a0c, 0x00, 0x20}, /* t5_pcie_pdbg_regs_00_to_20 */ + {0x5a04, 0x5a0c, 0x21, 0x20}, /* t5_pcie_pdbg_regs_21_to_40 */ + {0x5a04, 0x5a0c, 0x41, 0x10}, /* t5_pcie_pdbg_regs_41_to_50 */ +}; + +static const u32 t5_pcie_cdbg_array[][IREG_NUM_ELEM] = { + {0x5a10, 0x5a18, 0x00, 0x20}, /* t5_pcie_cdbg_regs_00_to_20 */ + {0x5a10, 0x5a18, 0x21, 0x18}, /* t5_pcie_cdbg_regs_21_to_37 */ +}; + +static const u32 t5_pm_rx_array[][IREG_NUM_ELEM] = { + {0x8FD0, 0x8FD4, 0x10000, 0x20}, /* t5_pm_rx_regs_10000_to_10020 */ + {0x8FD0, 0x8FD4, 0x10021, 0x0D}, /* t5_pm_rx_regs_10021_to_1002c */ +}; + +static const u32 t5_pm_tx_array[][IREG_NUM_ELEM] = { + {0x8FF0, 0x8FF4, 0x10000, 0x20}, /* t5_pm_tx_regs_10000_to_10020 */ + {0x8FF0, 0x8FF4, 0x10021, 0x1D}, /* t5_pm_tx_regs_10021_to_1003c */ +}; + +static const u32 t6_ma_ireg_array[][IREG_NUM_ELEM] = { + {0x78f8, 0x78fc, 0xa000, 23}, /* t6_ma_regs_a000_to_a016 */ + {0x78f8, 0x78fc, 0xa400, 30}, /* t6_ma_regs_a400_to_a41e */ + {0x78f8, 0x78fc, 0xa800, 20} /* t6_ma_regs_a800_to_a813 */ +}; + +static const u32 t6_ma_ireg_array2[][IREG_NUM_ELEM] = { + {0x78f8, 0x78fc, 0xe400, 17}, /* t6_ma_regs_e400_to_e600 */ + {0x78f8, 0x78fc, 0xe640, 13} /* t6_ma_regs_e640_to_e7c0 */ +}; + +static const u32 t6_up_cim_reg_array[][IREG_NUM_ELEM] = { + {0x7b50, 0x7b54, 0x2000, 0x20}, /* up_cim_2000_to_207c */ + {0x7b50, 0x7b54, 0x2080, 0x1d}, /* up_cim_2080_to_20fc */ + {0x7b50, 0x7b54, 0x00, 0x20}, /* up_cim_00_to_7c */ + {0x7b50, 0x7b54, 0x80, 0x20}, /* up_cim_80_to_fc */ + {0x7b50, 0x7b54, 0x100, 0x11}, /* up_cim_100_to_14c */ + {0x7b50, 0x7b54, 0x200, 0x10}, /* up_cim_200_to_23c */ + {0x7b50, 0x7b54, 0x240, 0x2}, /* up_cim_240_to_244 */ + {0x7b50, 0x7b54, 0x250, 0x2}, /* up_cim_250_to_254 */ + {0x7b50, 0x7b54, 0x260, 0x2}, /* up_cim_260_to_264 */ + {0x7b50, 0x7b54, 0x270, 0x2}, /* up_cim_270_to_274 */ + {0x7b50, 0x7b54, 0x280, 0x20}, /* up_cim_280_to_2fc */ + {0x7b50, 0x7b54, 0x300, 0x20}, /* up_cim_300_to_37c */ + {0x7b50, 0x7b54, 0x380, 0x14}, /* up_cim_380_to_3cc */ + +}; + +static const u32 t5_up_cim_reg_array[][IREG_NUM_ELEM] = { + {0x7b50, 0x7b54, 0x2000, 0x20}, /* up_cim_2000_to_207c */ + {0x7b50, 0x7b54, 0x2080, 0x19}, /* up_cim_2080_to_20ec */ + {0x7b50, 0x7b54, 0x00, 0x20}, /* up_cim_00_to_7c */ + {0x7b50, 0x7b54, 0x80, 0x20}, /* up_cim_80_to_fc */ + {0x7b50, 0x7b54, 0x100, 0x11}, /* up_cim_100_to_14c */ + {0x7b50, 0x7b54, 0x200, 0x10}, /* up_cim_200_to_23c */ + {0x7b50, 0x7b54, 0x240, 0x2}, /* up_cim_240_to_244 */ + {0x7b50, 0x7b54, 0x250, 0x2}, /* up_cim_250_to_254 */ + {0x7b50, 0x7b54, 0x260, 0x2}, /* up_cim_260_to_264 */ + {0x7b50, 0x7b54, 0x270, 0x2}, /* up_cim_270_to_274 */ + {0x7b50, 0x7b54, 0x280, 0x20}, /* up_cim_280_to_2fc */ + {0x7b50, 0x7b54, 0x300, 0x20}, /* up_cim_300_to_37c */ + {0x7b50, 0x7b54, 0x380, 0x14}, /* up_cim_380_to_3cc */ +}; + +static const u32 t6_hma_ireg_array[][IREG_NUM_ELEM] = { + {0x51320, 0x51324, 0xa000, 32} /* t6_hma_regs_a000_to_a01f */ +}; +#endif /* __CUDBG_ENTITY_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h new file mode 100644 index 000000000000..9b8005e67811 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_if.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CUDBG_IF_H__ +#define __CUDBG_IF_H__ + +/* Error codes */ +#define CUDBG_STATUS_NO_MEM -19 +#define CUDBG_STATUS_ENTITY_NOT_FOUND -24 +#define CUDBG_SYSTEM_ERROR -29 + +#define CUDBG_MAJOR_VERSION 1 +#define CUDBG_MINOR_VERSION 14 + +enum cudbg_dbg_entity_type { + CUDBG_REG_DUMP = 1, + CUDBG_DEV_LOG = 2, + CUDBG_CIM_IBQ_TP0 = 6, + CUDBG_CIM_IBQ_TP1 = 7, + CUDBG_CIM_IBQ_ULP = 8, + CUDBG_CIM_IBQ_SGE0 = 9, + CUDBG_CIM_IBQ_SGE1 = 10, + CUDBG_CIM_IBQ_NCSI = 11, + CUDBG_CIM_OBQ_ULP0 = 12, + CUDBG_CIM_OBQ_ULP1 = 13, + CUDBG_CIM_OBQ_ULP2 = 14, + CUDBG_CIM_OBQ_ULP3 = 15, + CUDBG_CIM_OBQ_SGE = 16, + CUDBG_CIM_OBQ_NCSI = 17, + CUDBG_EDC0 = 18, + CUDBG_EDC1 = 19, + CUDBG_TP_INDIRECT = 36, + CUDBG_SGE_INDIRECT = 37, + CUDBG_CIM_OBQ_RXQ0 = 47, + CUDBG_CIM_OBQ_RXQ1 = 48, + CUDBG_PCIE_INDIRECT = 50, + CUDBG_PM_INDIRECT = 51, + CUDBG_MA_INDIRECT = 61, + CUDBG_UP_CIM_INDIRECT = 64, + CUDBG_MBOX_LOG = 66, + CUDBG_HMA_INDIRECT = 67, + CUDBG_MAX_ENTITY = 70, +}; + +struct cudbg_init { + struct adapter *adap; /* Pointer to adapter structure */ + void *outbuf; /* Output buffer */ + u32 outbuf_size; /* Output buffer size */ +}; + +static inline unsigned int cudbg_mbytes_to_bytes(unsigned int size) +{ + return size * 1024 * 1024; +} +#endif /* __CUDBG_IF_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c new file mode 100644 index 000000000000..c451b2e42a6c --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c @@ -0,0 +1,867 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#include "t4_regs.h" +#include "cxgb4.h" +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_lib.h" +#include "cudbg_entity.h" + +static void cudbg_write_and_release_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *dbg_buff) +{ + cudbg_update_buff(pin_buff, dbg_buff); + cudbg_put_buff(pin_buff, dbg_buff); +} + +static int is_fw_attached(struct cudbg_init *pdbg_init) +{ + struct adapter *padap = pdbg_init->adap; + + if (!(padap->flags & FW_OK) || padap->use_bd) + return 0; + + return 1; +} + +/* This function will add additional padding bytes into debug_buffer to make it + * 4 byte aligned. + */ +void cudbg_align_debug_buffer(struct cudbg_buffer *dbg_buff, + struct cudbg_entity_hdr *entity_hdr) +{ + u8 zero_buf[4] = {0}; + u8 padding, remain; + + remain = (dbg_buff->offset - entity_hdr->start_offset) % 4; + padding = 4 - remain; + if (remain) { + memcpy(((u8 *)dbg_buff->data) + dbg_buff->offset, &zero_buf, + padding); + dbg_buff->offset += padding; + entity_hdr->num_pad = padding; + } + entity_hdr->size = dbg_buff->offset - entity_hdr->start_offset; +} + +struct cudbg_entity_hdr *cudbg_get_entity_hdr(void *outbuf, int i) +{ + struct cudbg_hdr *cudbg_hdr = (struct cudbg_hdr *)outbuf; + + return (struct cudbg_entity_hdr *) + ((char *)outbuf + cudbg_hdr->hdr_len + + (sizeof(struct cudbg_entity_hdr) * (i - 1))); +} + +int cudbg_collect_reg_dump(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + u32 buf_size = 0; + int rc = 0; + + if (is_t4(padap->params.chip)) + buf_size = T4_REGMAP_SIZE; + else if (is_t5(padap->params.chip) || is_t6(padap->params.chip)) + buf_size = T5_REGMAP_SIZE; + + rc = cudbg_get_buff(dbg_buff, buf_size, &temp_buff); + if (rc) + return rc; + t4_get_regs(padap, (void *)temp_buff.data, temp_buff.size); + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_fw_devlog(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct devlog_params *dparams; + int rc = 0; + + rc = t4_init_devlog_params(padap); + if (rc < 0) { + cudbg_err->sys_err = rc; + return rc; + } + + dparams = &padap->params.devlog; + rc = cudbg_get_buff(dbg_buff, dparams->size, &temp_buff); + if (rc) + return rc; + + /* Collect FW devlog */ + if (dparams->start != 0) { + spin_lock(&padap->win0_lock); + rc = t4_memory_rw(padap, padap->params.drv_memwin, + dparams->memtype, dparams->start, + dparams->size, + (__be32 *)(char *)temp_buff.data, + 1); + spin_unlock(&padap->win0_lock); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(&temp_buff, dbg_buff); + return rc; + } + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +static int cudbg_read_cim_ibq(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err, int qid) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int no_of_read_words, rc = 0; + u32 qsize; + + /* collect CIM IBQ */ + qsize = CIM_IBQ_SIZE * 4 * sizeof(u32); + rc = cudbg_get_buff(dbg_buff, qsize, &temp_buff); + if (rc) + return rc; + + /* t4_read_cim_ibq will return no. of read words or error */ + no_of_read_words = t4_read_cim_ibq(padap, qid, + (u32 *)((u32 *)temp_buff.data + + temp_buff.offset), qsize); + /* no_of_read_words is less than or equal to 0 means error */ + if (no_of_read_words <= 0) { + if (!no_of_read_words) + rc = CUDBG_SYSTEM_ERROR; + else + rc = no_of_read_words; + cudbg_err->sys_err = rc; + cudbg_put_buff(&temp_buff, dbg_buff); + return rc; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_cim_ibq_tp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 0); +} + +int cudbg_collect_cim_ibq_tp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 1); +} + +int cudbg_collect_cim_ibq_ulp(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 2); +} + +int cudbg_collect_cim_ibq_sge0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 3); +} + +int cudbg_collect_cim_ibq_sge1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 4); +} + +int cudbg_collect_cim_ibq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_ibq(pdbg_init, dbg_buff, cudbg_err, 5); +} + +static int cudbg_read_cim_obq(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err, int qid) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int no_of_read_words, rc = 0; + u32 qsize; + + /* collect CIM OBQ */ + qsize = 6 * CIM_OBQ_SIZE * 4 * sizeof(u32); + rc = cudbg_get_buff(dbg_buff, qsize, &temp_buff); + if (rc) + return rc; + + /* t4_read_cim_obq will return no. of read words or error */ + no_of_read_words = t4_read_cim_obq(padap, qid, + (u32 *)((u32 *)temp_buff.data + + temp_buff.offset), qsize); + /* no_of_read_words is less than or equal to 0 means error */ + if (no_of_read_words <= 0) { + if (!no_of_read_words) + rc = CUDBG_SYSTEM_ERROR; + else + rc = no_of_read_words; + cudbg_err->sys_err = rc; + cudbg_put_buff(&temp_buff, dbg_buff); + return rc; + } + temp_buff.size = no_of_read_words * 4; + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_cim_obq_ulp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 0); +} + +int cudbg_collect_cim_obq_ulp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 1); +} + +int cudbg_collect_cim_obq_ulp2(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 2); +} + +int cudbg_collect_cim_obq_ulp3(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 3); +} + +int cudbg_collect_cim_obq_sge(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 4); +} + +int cudbg_collect_cim_obq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 5); +} + +int cudbg_collect_obq_sge_rx_q0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 6); +} + +int cudbg_collect_obq_sge_rx_q1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_read_cim_obq(pdbg_init, dbg_buff, cudbg_err, 7); +} + +static int cudbg_read_fw_mem(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, u8 mem_type, + unsigned long tot_len, + struct cudbg_error *cudbg_err) +{ + unsigned long bytes, bytes_left, bytes_read = 0; + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + int rc = 0; + + bytes_left = tot_len; + while (bytes_left > 0) { + bytes = min_t(unsigned long, bytes_left, + (unsigned long)CUDBG_CHUNK_SIZE); + rc = cudbg_get_buff(dbg_buff, bytes, &temp_buff); + if (rc) + return rc; + spin_lock(&padap->win0_lock); + rc = t4_memory_rw(padap, MEMWIN_NIC, mem_type, + bytes_read, bytes, + (__be32 *)temp_buff.data, + 1); + spin_unlock(&padap->win0_lock); + if (rc) { + cudbg_err->sys_err = rc; + cudbg_put_buff(&temp_buff, dbg_buff); + return rc; + } + bytes_left -= bytes; + bytes_read += bytes; + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + } + return rc; +} + +static void cudbg_collect_mem_info(struct cudbg_init *pdbg_init, + struct card_mem *mem_info) +{ + struct adapter *padap = pdbg_init->adap; + u32 value; + + value = t4_read_reg(padap, MA_EDRAM0_BAR_A); + value = EDRAM0_SIZE_G(value); + mem_info->size_edc0 = (u16)value; + + value = t4_read_reg(padap, MA_EDRAM1_BAR_A); + value = EDRAM1_SIZE_G(value); + mem_info->size_edc1 = (u16)value; + + value = t4_read_reg(padap, MA_TARGET_MEM_ENABLE_A); + if (value & EDRAM0_ENABLE_F) + mem_info->mem_flag |= (1 << EDC0_FLAG); + if (value & EDRAM1_ENABLE_F) + mem_info->mem_flag |= (1 << EDC1_FLAG); +} + +static void cudbg_t4_fwcache(struct cudbg_init *pdbg_init, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + int rc; + + if (is_fw_attached(pdbg_init)) { + /* Flush uP dcache before reading edcX/mcX */ + rc = t4_fwcache(padap, FW_PARAM_DEV_FWCACHE_FLUSH); + if (rc) + cudbg_err->sys_warn = rc; + } +} + +static int cudbg_collect_mem_region(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err, + u8 mem_type) +{ + struct card_mem mem_info = {0}; + unsigned long flag, size; + int rc; + + cudbg_t4_fwcache(pdbg_init, cudbg_err); + cudbg_collect_mem_info(pdbg_init, &mem_info); + switch (mem_type) { + case MEM_EDC0: + flag = (1 << EDC0_FLAG); + size = cudbg_mbytes_to_bytes(mem_info.size_edc0); + break; + case MEM_EDC1: + flag = (1 << EDC1_FLAG); + size = cudbg_mbytes_to_bytes(mem_info.size_edc1); + break; + default: + rc = CUDBG_STATUS_ENTITY_NOT_FOUND; + goto err; + } + + if (mem_info.mem_flag & flag) { + rc = cudbg_read_fw_mem(pdbg_init, dbg_buff, mem_type, + size, cudbg_err); + if (rc) + goto err; + } else { + rc = CUDBG_STATUS_ENTITY_NOT_FOUND; + goto err; + } +err: + return rc; +} + +int cudbg_collect_edc0_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_EDC0); +} + +int cudbg_collect_edc1_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + return cudbg_collect_mem_region(pdbg_init, dbg_buff, cudbg_err, + MEM_EDC1); +} + +int cudbg_collect_tp_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_tp_pio; + int i, rc, n = 0; + u32 size; + + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_pio_array) + + sizeof(t5_tp_tm_pio_array) + + sizeof(t5_tp_mib_index_array); + else + n = sizeof(t6_tp_pio_array) + + sizeof(t6_tp_tm_pio_array) + + sizeof(t6_tp_mib_index_array); + + n = n / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ch_tp_pio = (struct ireg_buf *)temp_buff.data; + + /* TP_PIO */ + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = sizeof(t6_tp_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + + for (i = 0; i < n; i++) { + struct ireg_field *tp_pio = &ch_tp_pio->tp_pio; + u32 *buff = ch_tp_pio->outbuf; + + if (is_t5(padap->params.chip)) { + tp_pio->ireg_addr = t5_tp_pio_array[i][0]; + tp_pio->ireg_data = t5_tp_pio_array[i][1]; + tp_pio->ireg_local_offset = t5_tp_pio_array[i][2]; + tp_pio->ireg_offset_range = t5_tp_pio_array[i][3]; + } else if (is_t6(padap->params.chip)) { + tp_pio->ireg_addr = t6_tp_pio_array[i][0]; + tp_pio->ireg_data = t6_tp_pio_array[i][1]; + tp_pio->ireg_local_offset = t6_tp_pio_array[i][2]; + tp_pio->ireg_offset_range = t6_tp_pio_array[i][3]; + } + t4_tp_pio_read(padap, buff, tp_pio->ireg_offset_range, + tp_pio->ireg_local_offset, true); + ch_tp_pio++; + } + + /* TP_TM_PIO */ + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_tm_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = sizeof(t6_tp_tm_pio_array) / (IREG_NUM_ELEM * sizeof(u32)); + + for (i = 0; i < n; i++) { + struct ireg_field *tp_pio = &ch_tp_pio->tp_pio; + u32 *buff = ch_tp_pio->outbuf; + + if (is_t5(padap->params.chip)) { + tp_pio->ireg_addr = t5_tp_tm_pio_array[i][0]; + tp_pio->ireg_data = t5_tp_tm_pio_array[i][1]; + tp_pio->ireg_local_offset = t5_tp_tm_pio_array[i][2]; + tp_pio->ireg_offset_range = t5_tp_tm_pio_array[i][3]; + } else if (is_t6(padap->params.chip)) { + tp_pio->ireg_addr = t6_tp_tm_pio_array[i][0]; + tp_pio->ireg_data = t6_tp_tm_pio_array[i][1]; + tp_pio->ireg_local_offset = t6_tp_tm_pio_array[i][2]; + tp_pio->ireg_offset_range = t6_tp_tm_pio_array[i][3]; + } + t4_tp_tm_pio_read(padap, buff, tp_pio->ireg_offset_range, + tp_pio->ireg_local_offset, true); + ch_tp_pio++; + } + + /* TP_MIB_INDEX */ + if (is_t5(padap->params.chip)) + n = sizeof(t5_tp_mib_index_array) / + (IREG_NUM_ELEM * sizeof(u32)); + else if (is_t6(padap->params.chip)) + n = sizeof(t6_tp_mib_index_array) / + (IREG_NUM_ELEM * sizeof(u32)); + + for (i = 0; i < n ; i++) { + struct ireg_field *tp_pio = &ch_tp_pio->tp_pio; + u32 *buff = ch_tp_pio->outbuf; + + if (is_t5(padap->params.chip)) { + tp_pio->ireg_addr = t5_tp_mib_index_array[i][0]; + tp_pio->ireg_data = t5_tp_mib_index_array[i][1]; + tp_pio->ireg_local_offset = + t5_tp_mib_index_array[i][2]; + tp_pio->ireg_offset_range = + t5_tp_mib_index_array[i][3]; + } else if (is_t6(padap->params.chip)) { + tp_pio->ireg_addr = t6_tp_mib_index_array[i][0]; + tp_pio->ireg_data = t6_tp_mib_index_array[i][1]; + tp_pio->ireg_local_offset = + t6_tp_mib_index_array[i][2]; + tp_pio->ireg_offset_range = + t6_tp_mib_index_array[i][3]; + } + t4_tp_mib_read(padap, buff, tp_pio->ireg_offset_range, + tp_pio->ireg_local_offset, true); + ch_tp_pio++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_sge_dbg; + int i, rc; + + rc = cudbg_get_buff(dbg_buff, sizeof(*ch_sge_dbg) * 2, &temp_buff); + if (rc) + return rc; + + ch_sge_dbg = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < 2; i++) { + struct ireg_field *sge_pio = &ch_sge_dbg->tp_pio; + u32 *buff = ch_sge_dbg->outbuf; + + sge_pio->ireg_addr = t5_sge_dbg_index_array[i][0]; + sge_pio->ireg_data = t5_sge_dbg_index_array[i][1]; + sge_pio->ireg_local_offset = t5_sge_dbg_index_array[i][2]; + sge_pio->ireg_offset_range = t5_sge_dbg_index_array[i][3]; + t4_read_indirect(padap, + sge_pio->ireg_addr, + sge_pio->ireg_data, + buff, + sge_pio->ireg_offset_range, + sge_pio->ireg_local_offset); + ch_sge_dbg++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_pcie_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_pcie; + int i, rc, n; + u32 size; + + n = sizeof(t5_pcie_pdbg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n * 2; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ch_pcie = (struct ireg_buf *)temp_buff.data; + /* PCIE_PDBG */ + for (i = 0; i < n; i++) { + struct ireg_field *pcie_pio = &ch_pcie->tp_pio; + u32 *buff = ch_pcie->outbuf; + + pcie_pio->ireg_addr = t5_pcie_pdbg_array[i][0]; + pcie_pio->ireg_data = t5_pcie_pdbg_array[i][1]; + pcie_pio->ireg_local_offset = t5_pcie_pdbg_array[i][2]; + pcie_pio->ireg_offset_range = t5_pcie_pdbg_array[i][3]; + t4_read_indirect(padap, + pcie_pio->ireg_addr, + pcie_pio->ireg_data, + buff, + pcie_pio->ireg_offset_range, + pcie_pio->ireg_local_offset); + ch_pcie++; + } + + /* PCIE_CDBG */ + n = sizeof(t5_pcie_cdbg_array) / (IREG_NUM_ELEM * sizeof(u32)); + for (i = 0; i < n; i++) { + struct ireg_field *pcie_pio = &ch_pcie->tp_pio; + u32 *buff = ch_pcie->outbuf; + + pcie_pio->ireg_addr = t5_pcie_cdbg_array[i][0]; + pcie_pio->ireg_data = t5_pcie_cdbg_array[i][1]; + pcie_pio->ireg_local_offset = t5_pcie_cdbg_array[i][2]; + pcie_pio->ireg_offset_range = t5_pcie_cdbg_array[i][3]; + t4_read_indirect(padap, + pcie_pio->ireg_addr, + pcie_pio->ireg_data, + buff, + pcie_pio->ireg_offset_range, + pcie_pio->ireg_local_offset); + ch_pcie++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_pm_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ch_pm; + int i, rc, n; + u32 size; + + n = sizeof(t5_pm_rx_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n * 2; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ch_pm = (struct ireg_buf *)temp_buff.data; + /* PM_RX */ + for (i = 0; i < n; i++) { + struct ireg_field *pm_pio = &ch_pm->tp_pio; + u32 *buff = ch_pm->outbuf; + + pm_pio->ireg_addr = t5_pm_rx_array[i][0]; + pm_pio->ireg_data = t5_pm_rx_array[i][1]; + pm_pio->ireg_local_offset = t5_pm_rx_array[i][2]; + pm_pio->ireg_offset_range = t5_pm_rx_array[i][3]; + t4_read_indirect(padap, + pm_pio->ireg_addr, + pm_pio->ireg_data, + buff, + pm_pio->ireg_offset_range, + pm_pio->ireg_local_offset); + ch_pm++; + } + + /* PM_TX */ + n = sizeof(t5_pm_tx_array) / (IREG_NUM_ELEM * sizeof(u32)); + for (i = 0; i < n; i++) { + struct ireg_field *pm_pio = &ch_pm->tp_pio; + u32 *buff = ch_pm->outbuf; + + pm_pio->ireg_addr = t5_pm_tx_array[i][0]; + pm_pio->ireg_data = t5_pm_tx_array[i][1]; + pm_pio->ireg_local_offset = t5_pm_tx_array[i][2]; + pm_pio->ireg_offset_range = t5_pm_tx_array[i][3]; + t4_read_indirect(padap, + pm_pio->ireg_addr, + pm_pio->ireg_data, + buff, + pm_pio->ireg_offset_range, + pm_pio->ireg_local_offset); + ch_pm++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_ma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *ma_indr; + int i, rc, n; + u32 size, j; + + if (CHELSIO_CHIP_VERSION(padap->params.chip) < CHELSIO_T6) + return CUDBG_STATUS_ENTITY_NOT_FOUND; + + n = sizeof(t6_ma_ireg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n * 2; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + ma_indr = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < n; i++) { + struct ireg_field *ma_fli = &ma_indr->tp_pio; + u32 *buff = ma_indr->outbuf; + + ma_fli->ireg_addr = t6_ma_ireg_array[i][0]; + ma_fli->ireg_data = t6_ma_ireg_array[i][1]; + ma_fli->ireg_local_offset = t6_ma_ireg_array[i][2]; + ma_fli->ireg_offset_range = t6_ma_ireg_array[i][3]; + t4_read_indirect(padap, ma_fli->ireg_addr, ma_fli->ireg_data, + buff, ma_fli->ireg_offset_range, + ma_fli->ireg_local_offset); + ma_indr++; + } + + n = sizeof(t6_ma_ireg_array2) / (IREG_NUM_ELEM * sizeof(u32)); + for (i = 0; i < n; i++) { + struct ireg_field *ma_fli = &ma_indr->tp_pio; + u32 *buff = ma_indr->outbuf; + + ma_fli->ireg_addr = t6_ma_ireg_array2[i][0]; + ma_fli->ireg_data = t6_ma_ireg_array2[i][1]; + ma_fli->ireg_local_offset = t6_ma_ireg_array2[i][2]; + for (j = 0; j < t6_ma_ireg_array2[i][3]; j++) { + t4_read_indirect(padap, ma_fli->ireg_addr, + ma_fli->ireg_data, buff, 1, + ma_fli->ireg_local_offset); + buff++; + ma_fli->ireg_local_offset += 0x20; + } + ma_indr++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_up_cim_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *up_cim; + int i, rc, n; + u32 size; + + n = sizeof(t5_up_cim_reg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + up_cim = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < n; i++) { + struct ireg_field *up_cim_reg = &up_cim->tp_pio; + u32 *buff = up_cim->outbuf; + + if (is_t5(padap->params.chip)) { + up_cim_reg->ireg_addr = t5_up_cim_reg_array[i][0]; + up_cim_reg->ireg_data = t5_up_cim_reg_array[i][1]; + up_cim_reg->ireg_local_offset = + t5_up_cim_reg_array[i][2]; + up_cim_reg->ireg_offset_range = + t5_up_cim_reg_array[i][3]; + } else if (is_t6(padap->params.chip)) { + up_cim_reg->ireg_addr = t6_up_cim_reg_array[i][0]; + up_cim_reg->ireg_data = t6_up_cim_reg_array[i][1]; + up_cim_reg->ireg_local_offset = + t6_up_cim_reg_array[i][2]; + up_cim_reg->ireg_offset_range = + t6_up_cim_reg_array[i][3]; + } + + rc = t4_cim_read(padap, up_cim_reg->ireg_local_offset, + up_cim_reg->ireg_offset_range, buff); + if (rc) { + cudbg_put_buff(&temp_buff, dbg_buff); + return rc; + } + up_cim++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_mbox_log(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_mbox_log *mboxlog = NULL; + struct cudbg_buffer temp_buff = { 0 }; + struct mbox_cmd_log *log = NULL; + struct mbox_cmd *entry; + unsigned int entry_idx; + u16 mbox_cmds; + int i, k, rc; + u64 flit; + u32 size; + + log = padap->mbox_log; + mbox_cmds = padap->mbox_log->size; + size = sizeof(struct cudbg_mbox_log) * mbox_cmds; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + mboxlog = (struct cudbg_mbox_log *)temp_buff.data; + for (k = 0; k < mbox_cmds; k++) { + entry_idx = log->cursor + k; + if (entry_idx >= log->size) + entry_idx -= log->size; + + entry = mbox_cmd_log_entry(log, entry_idx); + /* skip over unused entries */ + if (entry->timestamp == 0) + continue; + + memcpy(&mboxlog->entry, entry, sizeof(struct mbox_cmd)); + for (i = 0; i < MBOX_LEN / 8; i++) { + flit = entry->cmd[i]; + mboxlog->hi[i] = (u32)(flit >> 32); + mboxlog->lo[i] = (u32)flit; + } + mboxlog++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} + +int cudbg_collect_hma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err) +{ + struct adapter *padap = pdbg_init->adap; + struct cudbg_buffer temp_buff = { 0 }; + struct ireg_buf *hma_indr; + int i, rc, n; + u32 size; + + if (CHELSIO_CHIP_VERSION(padap->params.chip) < CHELSIO_T6) + return CUDBG_STATUS_ENTITY_NOT_FOUND; + + n = sizeof(t6_hma_ireg_array) / (IREG_NUM_ELEM * sizeof(u32)); + size = sizeof(struct ireg_buf) * n; + rc = cudbg_get_buff(dbg_buff, size, &temp_buff); + if (rc) + return rc; + + hma_indr = (struct ireg_buf *)temp_buff.data; + for (i = 0; i < n; i++) { + struct ireg_field *hma_fli = &hma_indr->tp_pio; + u32 *buff = hma_indr->outbuf; + + hma_fli->ireg_addr = t6_hma_ireg_array[i][0]; + hma_fli->ireg_data = t6_hma_ireg_array[i][1]; + hma_fli->ireg_local_offset = t6_hma_ireg_array[i][2]; + hma_fli->ireg_offset_range = t6_hma_ireg_array[i][3]; + t4_read_indirect(padap, hma_fli->ireg_addr, hma_fli->ireg_data, + buff, hma_fli->ireg_offset_range, + hma_fli->ireg_local_offset); + hma_indr++; + } + cudbg_write_and_release_buff(&temp_buff, dbg_buff); + return rc; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h new file mode 100644 index 000000000000..c4440c1d0142 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.h @@ -0,0 +1,103 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CUDBG_LIB_H__ +#define __CUDBG_LIB_H__ + +int cudbg_collect_reg_dump(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_fw_devlog(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_tp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_tp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_ulp(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_sge0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_sge1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_ibq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp2(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ulp3(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_sge(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_cim_obq_ncsi(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_edc0_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_edc1_meminfo(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_tp_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_sge_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_obq_sge_rx_q0(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_obq_sge_rx_q1(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pcie_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_pm_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_ma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_up_cim_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_mbox_log(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); +int cudbg_collect_hma_indirect(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); + +struct cudbg_entity_hdr *cudbg_get_entity_hdr(void *outbuf, int i); +void cudbg_align_debug_buffer(struct cudbg_buffer *dbg_buff, + struct cudbg_entity_hdr *entity_hdr); +#endif /* __CUDBG_LIB_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h new file mode 100644 index 000000000000..b150c5d1f7c0 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib_common.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CUDBG_LIB_COMMON_H__ +#define __CUDBG_LIB_COMMON_H__ + +#define CUDBG_SIGNATURE 67856866 /* CUDB in ascii */ + +enum cudbg_dump_type { + CUDBG_DUMP_TYPE_MINI = 1, +}; + +enum cudbg_compression_type { + CUDBG_COMPRESSION_NONE = 1, +}; + +struct cudbg_hdr { + u32 signature; + u32 hdr_len; + u16 major_ver; + u16 minor_ver; + u32 data_len; + u32 hdr_flags; + u16 max_entities; + u8 chip_ver; + u8 dump_type:3; + u8 reserved1:1; + u8 compress_type:4; + u32 reserved[8]; +}; + +struct cudbg_entity_hdr { + u32 entity_type; + u32 start_offset; + u32 size; + int hdr_flags; + u32 sys_warn; + u32 sys_err; + u8 num_pad; + u8 flag; /* bit 0 is used to indicate ext data */ + u8 reserved1[2]; + u32 next_ext_offset; /* pointer to next extended entity meta data */ + u32 reserved[5]; +}; + +struct cudbg_buffer { + u32 size; + u32 offset; + char *data; +}; + +struct cudbg_error { + int sys_err; + int sys_warn; + int app_err; +}; + +#define CDUMP_MAX_COMP_BUF_SIZE ((64 * 1024) - 1) +#define CUDBG_CHUNK_SIZE ((CDUMP_MAX_COMP_BUF_SIZE / 1024) * 1024) + +int cudbg_get_buff(struct cudbg_buffer *pdbg_buff, u32 size, + struct cudbg_buffer *pin_buff); +void cudbg_put_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pdbg_buff); +void cudbg_update_buff(struct cudbg_buffer *pin_buff, + struct cudbg_buffer *pout_buff); +#endif /* __CUDBG_LIB_COMMON_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 0db3ab6ad094..4eaca05ebd3a 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -909,6 +909,9 @@ struct adapter { /* TC flower offload */ DECLARE_HASHTABLE(flower_anymatch_tbl, 9); struct timer_list flower_stats_timer; + + /* Ethtool Dump */ + struct ethtool_dump eth_dump; }; /* Support for "sched-class" command to allow a TX Scheduling Class to be @@ -1456,7 +1459,7 @@ unsigned int qtimer_val(const struct adapter *adap, int t4_init_devlog_params(struct adapter *adapter); int t4_init_sge_params(struct adapter *adapter); -int t4_init_tp_params(struct adapter *adap); +int t4_init_tp_params(struct adapter *adap, bool sleep_ok); int t4_filter_field_shift(const struct adapter *adap, int filter_sel); int t4_init_rss_mode(struct adapter *adap, int mbox); int t4_init_portinfo(struct port_info *pi, int mbox, @@ -1470,14 +1473,15 @@ int t4_config_glbl_rss(struct adapter *adapter, int mbox, unsigned int mode, int t4_config_vi_rss(struct adapter *adapter, int mbox, unsigned int viid, unsigned int flags, unsigned int defq); int t4_read_rss(struct adapter *adapter, u16 *entries); -void t4_read_rss_key(struct adapter *adapter, u32 *key); -void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx); +void t4_read_rss_key(struct adapter *adapter, u32 *key, bool sleep_ok); +void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx, + bool sleep_ok); void t4_read_rss_pf_config(struct adapter *adapter, unsigned int index, - u32 *valp); + u32 *valp, bool sleep_ok); void t4_read_rss_vf_config(struct adapter *adapter, unsigned int index, - u32 *vfl, u32 *vfh); -u32 t4_read_rss_pf_map(struct adapter *adapter); -u32 t4_read_rss_pf_mask(struct adapter *adapter); + u32 *vfl, u32 *vfh, bool sleep_ok); +u32 t4_read_rss_pf_map(struct adapter *adapter, bool sleep_ok); +u32 t4_read_rss_pf_mask(struct adapter *adapter, bool sleep_ok); unsigned int t4_get_mps_bg_map(struct adapter *adapter, int pidx); unsigned int t4_get_tp_ch_map(struct adapter *adapter, int pidx); @@ -1508,14 +1512,18 @@ void t4_read_cong_tbl(struct adapter *adap, u16 incr[NMTUS][NCCTRL_WIN]); void t4_tp_wr_bits_indirect(struct adapter *adap, unsigned int addr, unsigned int mask, unsigned int val); void t4_tp_read_la(struct adapter *adap, u64 *la_buf, unsigned int *wrptr); -void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st); -void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st); -void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st); -void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st); +void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st, + bool sleep_ok); +void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st, + bool sleep_ok); +void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st, + bool sleep_ok); +void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st, + bool sleep_ok); void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, - struct tp_tcp_stats *v6); + struct tp_tcp_stats *v6, bool sleep_ok); void t4_get_fcoe_stats(struct adapter *adap, unsigned int idx, - struct tp_fcoe_stats *st); + struct tp_fcoe_stats *st, bool sleep_ok); void t4_load_mtus(struct adapter *adap, const unsigned short *mtus, const unsigned short *alpha, const unsigned short *beta); @@ -1624,6 +1632,13 @@ void t4_idma_monitor(struct adapter *adapter, int hz, int ticks); int t4_set_vf_mac_acl(struct adapter *adapter, unsigned int vf, unsigned int naddr, u8 *addr); +void t4_tp_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok); +void t4_tp_tm_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok); +void t4_tp_mib_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok); + void t4_uld_mem_free(struct adapter *adap); int t4_uld_mem_alloc(struct adapter *adap); void t4_uld_clean_up(struct adapter *adap); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c new file mode 100644 index 000000000000..9d97080a9d17 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c @@ -0,0 +1,296 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#include "t4_regs.h" +#include "cxgb4.h" +#include "cxgb4_cudbg.h" +#include "cudbg_entity.h" + +static const struct cxgb4_collect_entity cxgb4_collect_mem_dump[] = { + { CUDBG_EDC0, cudbg_collect_edc0_meminfo }, + { CUDBG_EDC1, cudbg_collect_edc1_meminfo }, +}; + +static const struct cxgb4_collect_entity cxgb4_collect_hw_dump[] = { + { CUDBG_MBOX_LOG, cudbg_collect_mbox_log }, + { CUDBG_DEV_LOG, cudbg_collect_fw_devlog }, + { CUDBG_REG_DUMP, cudbg_collect_reg_dump }, + { CUDBG_CIM_IBQ_TP0, cudbg_collect_cim_ibq_tp0 }, + { CUDBG_CIM_IBQ_TP1, cudbg_collect_cim_ibq_tp1 }, + { CUDBG_CIM_IBQ_ULP, cudbg_collect_cim_ibq_ulp }, + { CUDBG_CIM_IBQ_SGE0, cudbg_collect_cim_ibq_sge0 }, + { CUDBG_CIM_IBQ_SGE1, cudbg_collect_cim_ibq_sge1 }, + { CUDBG_CIM_IBQ_NCSI, cudbg_collect_cim_ibq_ncsi }, + { CUDBG_CIM_OBQ_ULP0, cudbg_collect_cim_obq_ulp0 }, + { CUDBG_CIM_OBQ_ULP1, cudbg_collect_cim_obq_ulp1 }, + { CUDBG_CIM_OBQ_ULP2, cudbg_collect_cim_obq_ulp2 }, + { CUDBG_CIM_OBQ_ULP3, cudbg_collect_cim_obq_ulp3 }, + { CUDBG_CIM_OBQ_SGE, cudbg_collect_cim_obq_sge }, + { CUDBG_CIM_OBQ_NCSI, cudbg_collect_cim_obq_ncsi }, + { CUDBG_TP_INDIRECT, cudbg_collect_tp_indirect }, + { CUDBG_SGE_INDIRECT, cudbg_collect_sge_indirect }, + { CUDBG_CIM_OBQ_RXQ0, cudbg_collect_obq_sge_rx_q0 }, + { CUDBG_CIM_OBQ_RXQ1, cudbg_collect_obq_sge_rx_q1 }, + { CUDBG_PCIE_INDIRECT, cudbg_collect_pcie_indirect }, + { CUDBG_PM_INDIRECT, cudbg_collect_pm_indirect }, + { CUDBG_MA_INDIRECT, cudbg_collect_ma_indirect }, + { CUDBG_UP_CIM_INDIRECT, cudbg_collect_up_cim_indirect }, + { CUDBG_HMA_INDIRECT, cudbg_collect_hma_indirect }, +}; + +static u32 cxgb4_get_entity_length(struct adapter *adap, u32 entity) +{ + u32 value, n = 0, len = 0; + + switch (entity) { + case CUDBG_REG_DUMP: + switch (CHELSIO_CHIP_VERSION(adap->params.chip)) { + case CHELSIO_T4: + len = T4_REGMAP_SIZE; + break; + case CHELSIO_T5: + case CHELSIO_T6: + len = T5_REGMAP_SIZE; + break; + default: + break; + } + break; + case CUDBG_DEV_LOG: + len = adap->params.devlog.size; + break; + case CUDBG_CIM_IBQ_TP0: + case CUDBG_CIM_IBQ_TP1: + case CUDBG_CIM_IBQ_ULP: + case CUDBG_CIM_IBQ_SGE0: + case CUDBG_CIM_IBQ_SGE1: + case CUDBG_CIM_IBQ_NCSI: + len = CIM_IBQ_SIZE * 4 * sizeof(u32); + break; + case CUDBG_CIM_OBQ_ULP0: + case CUDBG_CIM_OBQ_ULP1: + case CUDBG_CIM_OBQ_ULP2: + case CUDBG_CIM_OBQ_ULP3: + case CUDBG_CIM_OBQ_SGE: + case CUDBG_CIM_OBQ_NCSI: + case CUDBG_CIM_OBQ_RXQ0: + case CUDBG_CIM_OBQ_RXQ1: + len = 6 * CIM_OBQ_SIZE * 4 * sizeof(u32); + break; + case CUDBG_EDC0: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & EDRAM0_ENABLE_F) { + value = t4_read_reg(adap, MA_EDRAM0_BAR_A); + len = EDRAM0_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + case CUDBG_EDC1: + value = t4_read_reg(adap, MA_TARGET_MEM_ENABLE_A); + if (value & EDRAM1_ENABLE_F) { + value = t4_read_reg(adap, MA_EDRAM1_BAR_A); + len = EDRAM1_SIZE_G(value); + } + len = cudbg_mbytes_to_bytes(len); + break; + case CUDBG_TP_INDIRECT: + switch (CHELSIO_CHIP_VERSION(adap->params.chip)) { + case CHELSIO_T5: + n = sizeof(t5_tp_pio_array) + + sizeof(t5_tp_tm_pio_array) + + sizeof(t5_tp_mib_index_array); + break; + case CHELSIO_T6: + n = sizeof(t6_tp_pio_array) + + sizeof(t6_tp_tm_pio_array) + + sizeof(t6_tp_mib_index_array); + break; + default: + break; + } + n = n / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n; + break; + case CUDBG_SGE_INDIRECT: + len = sizeof(struct ireg_buf) * 2; + break; + case CUDBG_PCIE_INDIRECT: + n = sizeof(t5_pcie_pdbg_array) / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n * 2; + break; + case CUDBG_PM_INDIRECT: + n = sizeof(t5_pm_rx_array) / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n * 2; + break; + case CUDBG_MA_INDIRECT: + if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) { + n = sizeof(t6_ma_ireg_array) / + (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n * 2; + } + break; + case CUDBG_UP_CIM_INDIRECT: + n = sizeof(t5_up_cim_reg_array) / (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n; + break; + case CUDBG_MBOX_LOG: + len = sizeof(struct cudbg_mbox_log) * adap->mbox_log->size; + break; + case CUDBG_HMA_INDIRECT: + if (CHELSIO_CHIP_VERSION(adap->params.chip) > CHELSIO_T5) { + n = sizeof(t6_hma_ireg_array) / + (IREG_NUM_ELEM * sizeof(u32)); + len = sizeof(struct ireg_buf) * n; + } + break; + default: + break; + } + + return len; +} + +u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag) +{ + u32 i, entity; + u32 len = 0; + + if (flag & CXGB4_ETH_DUMP_HW) { + for (i = 0; i < ARRAY_SIZE(cxgb4_collect_hw_dump); i++) { + entity = cxgb4_collect_hw_dump[i].entity; + len += cxgb4_get_entity_length(adap, entity); + } + } + + if (flag & CXGB4_ETH_DUMP_MEM) { + for (i = 0; i < ARRAY_SIZE(cxgb4_collect_mem_dump); i++) { + entity = cxgb4_collect_mem_dump[i].entity; + len += cxgb4_get_entity_length(adap, entity); + } + } + + return len; +} + +static void cxgb4_cudbg_collect_entity(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + const struct cxgb4_collect_entity *e_arr, + u32 arr_size, void *buf, u32 *tot_size) +{ + struct adapter *adap = pdbg_init->adap; + struct cudbg_error cudbg_err = { 0 }; + struct cudbg_entity_hdr *entity_hdr; + u32 entity_size, i; + u32 total_size = 0; + int ret; + + for (i = 0; i < arr_size; i++) { + const struct cxgb4_collect_entity *e = &e_arr[i]; + + /* Skip entities that won't fit in output buffer */ + entity_size = cxgb4_get_entity_length(adap, e->entity); + if (entity_size > + pdbg_init->outbuf_size - *tot_size - total_size) + continue; + + entity_hdr = cudbg_get_entity_hdr(buf, e->entity); + entity_hdr->entity_type = e->entity; + entity_hdr->start_offset = dbg_buff->offset; + memset(&cudbg_err, 0, sizeof(struct cudbg_error)); + ret = e->collect_cb(pdbg_init, dbg_buff, &cudbg_err); + if (ret) { + entity_hdr->size = 0; + dbg_buff->offset = entity_hdr->start_offset; + } else { + cudbg_align_debug_buffer(dbg_buff, entity_hdr); + } + + /* Log error and continue with next entity */ + if (cudbg_err.sys_err) + ret = CUDBG_SYSTEM_ERROR; + + entity_hdr->hdr_flags = ret; + entity_hdr->sys_err = cudbg_err.sys_err; + entity_hdr->sys_warn = cudbg_err.sys_warn; + total_size += entity_hdr->size; + } + + *tot_size += total_size; +} + +int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size, + u32 flag) +{ + struct cudbg_init cudbg_init = { 0 }; + struct cudbg_buffer dbg_buff = { 0 }; + u32 size, min_size, total_size = 0; + struct cudbg_hdr *cudbg_hdr; + + size = *buf_size; + + cudbg_init.adap = adap; + cudbg_init.outbuf = buf; + cudbg_init.outbuf_size = size; + + dbg_buff.data = buf; + dbg_buff.size = size; + dbg_buff.offset = 0; + + cudbg_hdr = (struct cudbg_hdr *)buf; + cudbg_hdr->signature = CUDBG_SIGNATURE; + cudbg_hdr->hdr_len = sizeof(struct cudbg_hdr); + cudbg_hdr->major_ver = CUDBG_MAJOR_VERSION; + cudbg_hdr->minor_ver = CUDBG_MINOR_VERSION; + cudbg_hdr->max_entities = CUDBG_MAX_ENTITY; + cudbg_hdr->chip_ver = adap->params.chip; + cudbg_hdr->dump_type = CUDBG_DUMP_TYPE_MINI; + cudbg_hdr->compress_type = CUDBG_COMPRESSION_NONE; + + min_size = sizeof(struct cudbg_hdr) + + sizeof(struct cudbg_entity_hdr) * + cudbg_hdr->max_entities; + if (size < min_size) + return -ENOMEM; + + dbg_buff.offset += min_size; + total_size = dbg_buff.offset; + + if (flag & CXGB4_ETH_DUMP_HW) + cxgb4_cudbg_collect_entity(&cudbg_init, &dbg_buff, + cxgb4_collect_hw_dump, + ARRAY_SIZE(cxgb4_collect_hw_dump), + buf, + &total_size); + + if (flag & CXGB4_ETH_DUMP_MEM) + cxgb4_cudbg_collect_entity(&cudbg_init, &dbg_buff, + cxgb4_collect_mem_dump, + ARRAY_SIZE(cxgb4_collect_mem_dump), + buf, + &total_size); + + cudbg_hdr->data_len = total_size; + *buf_size = total_size; + return 0; +} + +void cxgb4_init_ethtool_dump(struct adapter *adapter) +{ + adapter->eth_dump.flag = CXGB4_ETH_DUMP_NONE; + adapter->eth_dump.version = adapter->params.fw_vers; + adapter->eth_dump.len = 0; +} diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h new file mode 100644 index 000000000000..c099b5aa2214 --- /dev/null +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.h @@ -0,0 +1,44 @@ +/* + * Copyright (C) 2017 Chelsio Communications. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * The full GNU General Public License is included in this distribution in + * the file called "COPYING". + * + */ + +#ifndef __CXGB4_CUDBG_H__ +#define __CXGB4_CUDBG_H__ + +#include "cudbg_if.h" +#include "cudbg_lib_common.h" +#include "cudbg_lib.h" + +typedef int (*cudbg_collect_callback_t)(struct cudbg_init *pdbg_init, + struct cudbg_buffer *dbg_buff, + struct cudbg_error *cudbg_err); + +struct cxgb4_collect_entity { + enum cudbg_dbg_entity_type entity; + cudbg_collect_callback_t collect_cb; +}; + +enum CXGB4_ETHTOOL_DUMP_FLAGS { + CXGB4_ETH_DUMP_NONE = ETH_FW_DUMP_DISABLE, + CXGB4_ETH_DUMP_MEM = (1 << 0), /* On-Chip Memory Dumps */ + CXGB4_ETH_DUMP_HW = (1 << 1), /* various FW and HW dumps */ +}; + +u32 cxgb4_get_dump_length(struct adapter *adap, u32 flag); +int cxgb4_cudbg_collect(struct adapter *adap, void *buf, u32 *buf_size, + u32 flag); +void cxgb4_init_ethtool_dump(struct adapter *adapter); +#endif /* __CXGB4_CUDBG_H__ */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 76540b0e082d..917663b35603 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -2211,7 +2211,7 @@ static int rss_key_show(struct seq_file *seq, void *v) { u32 key[10]; - t4_read_rss_key(seq->private, key); + t4_read_rss_key(seq->private, key, true); seq_printf(seq, "%08x%08x%08x%08x%08x%08x%08x%08x%08x%08x\n", key[9], key[8], key[7], key[6], key[5], key[4], key[3], key[2], key[1], key[0]); @@ -2248,7 +2248,7 @@ static ssize_t rss_key_write(struct file *file, const char __user *buf, } } - t4_write_rss_key(adap, key, -1); + t4_write_rss_key(adap, key, -1, true); return count; } @@ -2325,12 +2325,13 @@ static int rss_pf_config_open(struct inode *inode, struct file *file) return -ENOMEM; pfconf = (struct rss_pf_conf *)p->data; - rss_pf_map = t4_read_rss_pf_map(adapter); - rss_pf_mask = t4_read_rss_pf_mask(adapter); + rss_pf_map = t4_read_rss_pf_map(adapter, true); + rss_pf_mask = t4_read_rss_pf_mask(adapter, true); for (pf = 0; pf < 8; pf++) { pfconf[pf].rss_pf_map = rss_pf_map; pfconf[pf].rss_pf_mask = rss_pf_mask; - t4_read_rss_pf_config(adapter, pf, &pfconf[pf].rss_pf_config); + t4_read_rss_pf_config(adapter, pf, &pfconf[pf].rss_pf_config, + true); } return 0; } @@ -2393,7 +2394,7 @@ static int rss_vf_config_open(struct inode *inode, struct file *file) vfconf = (struct rss_vf_conf *)p->data; for (vf = 0; vf < vfcount; vf++) { t4_read_rss_vf_config(adapter, vf, &vfconf[vf].rss_vf_vfl, - &vfconf[vf].rss_vf_vfh); + &vfconf[vf].rss_vf_vfh, true); } return 0; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c index a71af1e587e2..1b7f6b9ccc8b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c @@ -21,6 +21,7 @@ #include "cxgb4.h" #include "t4_regs.h" #include "t4fw_api.h" +#include "cxgb4_cudbg.h" #define EEPROM_MAGIC 0x38E2F10C @@ -335,10 +336,10 @@ static void collect_adapter_stats(struct adapter *adap, struct adapter_stats *s) memset(s, 0, sizeof(*s)); spin_lock(&adap->stats_lock); - t4_tp_get_tcp_stats(adap, &v4, &v6); - t4_tp_get_rdma_stats(adap, &rdma_stats); - t4_get_usm_stats(adap, &usm_stats); - t4_tp_get_err_stats(adap, &err_stats); + t4_tp_get_tcp_stats(adap, &v4, &v6, false); + t4_tp_get_rdma_stats(adap, &rdma_stats, false); + t4_get_usm_stats(adap, &usm_stats, false); + t4_tp_get_err_stats(adap, &err_stats, false); spin_unlock(&adap->stats_lock); s->db_drop = adap->db_stats.db_drop; @@ -388,9 +389,9 @@ static void collect_channel_stats(struct adapter *adap, struct channel_stats *s, memset(s, 0, sizeof(*s)); spin_lock(&adap->stats_lock); - t4_tp_get_cpl_stats(adap, &cpl_stats); - t4_tp_get_err_stats(adap, &err_stats); - t4_get_fcoe_stats(adap, i, &fcoe_stats); + t4_tp_get_cpl_stats(adap, &cpl_stats, false); + t4_tp_get_err_stats(adap, &err_stats, false); + t4_get_fcoe_stats(adap, i, &fcoe_stats, false); spin_unlock(&adap->stats_lock); s->cpl_req = cpl_stats.req[i]; @@ -1374,6 +1375,56 @@ static int get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *info, return -EOPNOTSUPP; } +static int set_dump(struct net_device *dev, struct ethtool_dump *eth_dump) +{ + struct adapter *adapter = netdev2adap(dev); + u32 len = 0; + + len = sizeof(struct cudbg_hdr) + + sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY; + len += cxgb4_get_dump_length(adapter, eth_dump->flag); + + adapter->eth_dump.flag = eth_dump->flag; + adapter->eth_dump.len = len; + return 0; +} + +static int get_dump_flag(struct net_device *dev, struct ethtool_dump *eth_dump) +{ + struct adapter *adapter = netdev2adap(dev); + + eth_dump->flag = adapter->eth_dump.flag; + eth_dump->len = adapter->eth_dump.len; + eth_dump->version = adapter->eth_dump.version; + return 0; +} + +static int get_dump_data(struct net_device *dev, struct ethtool_dump *eth_dump, + void *buf) +{ + struct adapter *adapter = netdev2adap(dev); + u32 len = 0; + int ret = 0; + + if (adapter->eth_dump.flag == CXGB4_ETH_DUMP_NONE) + return -ENOENT; + + len = sizeof(struct cudbg_hdr) + + sizeof(struct cudbg_entity_hdr) * CUDBG_MAX_ENTITY; + len += cxgb4_get_dump_length(adapter, adapter->eth_dump.flag); + if (eth_dump->len < len) + return -ENOMEM; + + ret = cxgb4_cudbg_collect(adapter, buf, &len, adapter->eth_dump.flag); + if (ret) + return ret; + + eth_dump->flag = adapter->eth_dump.flag; + eth_dump->len = len; + eth_dump->version = adapter->eth_dump.version; + return 0; +} + static const struct ethtool_ops cxgb_ethtool_ops = { .get_link_ksettings = get_link_ksettings, .set_link_ksettings = set_link_ksettings, @@ -1404,7 +1455,10 @@ static const struct ethtool_ops cxgb_ethtool_ops = { .get_rxfh = get_rss_table, .set_rxfh = set_rss_table, .flash_device = set_flash, - .get_ts_info = get_ts_info + .get_ts_info = get_ts_info, + .set_dump = set_dump, + .get_dump_flag = get_dump_flag, + .get_dump_data = get_dump_data, }; void cxgb4_set_ethtool_ops(struct net_device *netdev) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index fe4cbe22d5d7..8d97ae6039aa 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -81,6 +81,7 @@ #include "cxgb4_tc_u32.h" #include "cxgb4_tc_flower.h" #include "cxgb4_ptp.h" +#include "cxgb4_cudbg.h" char cxgb4_driver_name[] = KBUILD_MODNAME; @@ -1638,7 +1639,7 @@ void cxgb4_get_tcp_stats(struct pci_dev *pdev, struct tp_tcp_stats *v4, struct adapter *adap = pci_get_drvdata(pdev); spin_lock(&adap->stats_lock); - t4_tp_get_tcp_stats(adap, v4, v6); + t4_tp_get_tcp_stats(adap, v4, v6, false); spin_unlock(&adap->stats_lock); } EXPORT_SYMBOL(cxgb4_get_tcp_stats); @@ -4076,7 +4077,7 @@ static int adap_init0(struct adapter *adap) } t4_init_sge_params(adap); adap->flags |= FW_OK; - t4_init_tp_params(adap); + t4_init_tp_params(adap, true); return 0; /* @@ -5035,6 +5036,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) cxgb4_set_ethtool_ops(netdev); } + cxgb4_init_ethtool_dump(adapter); + pci_set_drvdata(pdev, adapter); if (adapter->flags & FW_OK) { diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index b65ce26ff72f..8fa40f9e75c4 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -5052,23 +5052,26 @@ static unsigned int t4_use_ldst(struct adapter *adap) } /** - * t4_fw_tp_pio_rw - Access TP PIO through LDST - * @adap: the adapter - * @vals: where the indirect register values are stored/written - * @nregs: how many indirect registers to read/write - * @start_idx: index of first indirect register to read/write - * @rw: Read (1) or Write (0) + * t4_tp_fw_ldst_rw - Access TP indirect register through LDST + * @adap: the adapter + * @cmd: TP fw ldst address space type + * @vals: where the indirect register values are stored/written + * @nregs: how many indirect registers to read/write + * @start_idx: index of first indirect register to read/write + * @rw: Read (1) or Write (0) + * @sleep_ok: if true we may sleep while awaiting command completion * - * Access TP PIO registers through LDST + * Access TP indirect registers through LDST */ -static void t4_fw_tp_pio_rw(struct adapter *adap, u32 *vals, unsigned int nregs, - unsigned int start_index, unsigned int rw) +static int t4_tp_fw_ldst_rw(struct adapter *adap, int cmd, u32 *vals, + unsigned int nregs, unsigned int start_index, + unsigned int rw, bool sleep_ok) { - int ret, i; - int cmd = FW_LDST_ADDRSPC_TP_PIO; + int ret = 0; + unsigned int i; struct fw_ldst_cmd c; - for (i = 0 ; i < nregs; i++) { + for (i = 0; i < nregs; i++) { memset(&c, 0, sizeof(c)); c.op_to_addrspace = cpu_to_be32(FW_CMD_OP_V(FW_LDST_CMD) | FW_CMD_REQUEST_F | @@ -5079,26 +5082,147 @@ static void t4_fw_tp_pio_rw(struct adapter *adap, u32 *vals, unsigned int nregs, c.u.addrval.addr = cpu_to_be32(start_index + i); c.u.addrval.val = rw ? 0 : cpu_to_be32(vals[i]); - ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c); - if (!ret && rw) + ret = t4_wr_mbox_meat(adap, adap->mbox, &c, sizeof(c), &c, + sleep_ok); + if (ret) + return ret; + + if (rw) vals[i] = be32_to_cpu(c.u.addrval.val); } + return 0; +} + +/** + * t4_tp_indirect_rw - Read/Write TP indirect register through LDST or backdoor + * @adap: the adapter + * @reg_addr: Address Register + * @reg_data: Data register + * @buff: where the indirect register values are stored/written + * @nregs: how many indirect registers to read/write + * @start_index: index of first indirect register to read/write + * @rw: READ(1) or WRITE(0) + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read/Write TP indirect registers through LDST if possible. + * Else, use backdoor access + **/ +static void t4_tp_indirect_rw(struct adapter *adap, u32 reg_addr, u32 reg_data, + u32 *buff, u32 nregs, u32 start_index, int rw, + bool sleep_ok) +{ + int rc = -EINVAL; + int cmd; + + switch (reg_addr) { + case TP_PIO_ADDR_A: + cmd = FW_LDST_ADDRSPC_TP_PIO; + break; + case TP_TM_PIO_ADDR_A: + cmd = FW_LDST_ADDRSPC_TP_TM_PIO; + break; + case TP_MIB_INDEX_A: + cmd = FW_LDST_ADDRSPC_TP_MIB; + break; + default: + goto indirect_access; + } + + if (t4_use_ldst(adap)) + rc = t4_tp_fw_ldst_rw(adap, cmd, buff, nregs, start_index, rw, + sleep_ok); + +indirect_access: + + if (rc) { + if (rw) + t4_read_indirect(adap, reg_addr, reg_data, buff, nregs, + start_index); + else + t4_write_indirect(adap, reg_addr, reg_data, buff, nregs, + start_index); + } +} + +/** + * t4_tp_pio_read - Read TP PIO registers + * @adap: the adapter + * @buff: where the indirect register values are written + * @nregs: how many indirect registers to read + * @start_index: index of first indirect register to read + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read TP PIO Registers + **/ +void t4_tp_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, buff, nregs, + start_index, 1, sleep_ok); +} + +/** + * t4_tp_pio_write - Write TP PIO registers + * @adap: the adapter + * @buff: where the indirect register values are stored + * @nregs: how many indirect registers to write + * @start_index: index of first indirect register to write + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Write TP PIO Registers + **/ +static void t4_tp_pio_write(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, buff, nregs, + start_index, 0, sleep_ok); +} + +/** + * t4_tp_tm_pio_read - Read TP TM PIO registers + * @adap: the adapter + * @buff: where the indirect register values are written + * @nregs: how many indirect registers to read + * @start_index: index of first indirect register to read + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read TP TM PIO Registers + **/ +void t4_tp_tm_pio_read(struct adapter *adap, u32 *buff, u32 nregs, + u32 start_index, bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_TM_PIO_ADDR_A, TP_TM_PIO_DATA_A, buff, + nregs, start_index, 1, sleep_ok); +} + +/** + * t4_tp_mib_read - Read TP MIB registers + * @adap: the adapter + * @buff: where the indirect register values are written + * @nregs: how many indirect registers to read + * @start_index: index of first indirect register to read + * @sleep_ok: if true we may sleep while awaiting command completion + * + * Read TP MIB Registers + **/ +void t4_tp_mib_read(struct adapter *adap, u32 *buff, u32 nregs, u32 start_index, + bool sleep_ok) +{ + t4_tp_indirect_rw(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, buff, nregs, + start_index, 1, sleep_ok); } /** * t4_read_rss_key - read the global RSS key * @adap: the adapter * @key: 10-entry array holding the 320-bit RSS key + * @sleep_ok: if true we may sleep while awaiting command completion * * Reads the global 320-bit RSS key. */ -void t4_read_rss_key(struct adapter *adap, u32 *key) +void t4_read_rss_key(struct adapter *adap, u32 *key, bool sleep_ok) { - if (t4_use_ldst(adap)) - t4_fw_tp_pio_rw(adap, key, 10, TP_RSS_SECRET_KEY0_A, 1); - else - t4_read_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, key, 10, - TP_RSS_SECRET_KEY0_A); + t4_tp_pio_read(adap, key, 10, TP_RSS_SECRET_KEY0_A, sleep_ok); } /** @@ -5106,12 +5230,14 @@ void t4_read_rss_key(struct adapter *adap, u32 *key) * @adap: the adapter * @key: 10-entry array holding the 320-bit RSS key * @idx: which RSS key to write + * @sleep_ok: if true we may sleep while awaiting command completion * * Writes one of the RSS keys with the given 320-bit value. If @idx is * 0..15 the corresponding entry in the RSS key table is written, * otherwise the global RSS key is written. */ -void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx) +void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx, + bool sleep_ok) { u8 rss_key_addr_cnt = 16; u32 vrt = t4_read_reg(adap, TP_RSS_CONFIG_VRT_A); @@ -5124,11 +5250,7 @@ void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx) (vrt & KEYEXTEND_F) && (KEYMODE_G(vrt) == 3)) rss_key_addr_cnt = 32; - if (t4_use_ldst(adap)) - t4_fw_tp_pio_rw(adap, (void *)key, 10, TP_RSS_SECRET_KEY0_A, 0); - else - t4_write_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, key, 10, - TP_RSS_SECRET_KEY0_A); + t4_tp_pio_write(adap, (void *)key, 10, TP_RSS_SECRET_KEY0_A, sleep_ok); if (idx >= 0 && idx < rss_key_addr_cnt) { if (rss_key_addr_cnt > 16) @@ -5146,19 +5268,15 @@ void t4_write_rss_key(struct adapter *adap, const u32 *key, int idx) * @adapter: the adapter * @index: the entry in the PF RSS table to read * @valp: where to store the returned value + * @sleep_ok: if true we may sleep while awaiting command completion * * Reads the PF RSS Configuration Table at the specified index and returns * the value found there. */ void t4_read_rss_pf_config(struct adapter *adapter, unsigned int index, - u32 *valp) + u32 *valp, bool sleep_ok) { - if (t4_use_ldst(adapter)) - t4_fw_tp_pio_rw(adapter, valp, 1, - TP_RSS_PF0_CONFIG_A + index, 1); - else - t4_read_indirect(adapter, TP_PIO_ADDR_A, TP_PIO_DATA_A, - valp, 1, TP_RSS_PF0_CONFIG_A + index); + t4_tp_pio_read(adapter, valp, 1, TP_RSS_PF0_CONFIG_A + index, sleep_ok); } /** @@ -5167,12 +5285,13 @@ void t4_read_rss_pf_config(struct adapter *adapter, unsigned int index, * @index: the entry in the VF RSS table to read * @vfl: where to store the returned VFL * @vfh: where to store the returned VFH + * @sleep_ok: if true we may sleep while awaiting command completion * * Reads the VF RSS Configuration Table at the specified index and returns * the (VFL, VFH) values found there. */ void t4_read_rss_vf_config(struct adapter *adapter, unsigned int index, - u32 *vfl, u32 *vfh) + u32 *vfl, u32 *vfh, bool sleep_ok) { u32 vrt, mask, data; @@ -5193,50 +5312,37 @@ void t4_read_rss_vf_config(struct adapter *adapter, unsigned int index, /* Grab the VFL/VFH values ... */ - if (t4_use_ldst(adapter)) { - t4_fw_tp_pio_rw(adapter, vfl, 1, TP_RSS_VFL_CONFIG_A, 1); - t4_fw_tp_pio_rw(adapter, vfh, 1, TP_RSS_VFH_CONFIG_A, 1); - } else { - t4_read_indirect(adapter, TP_PIO_ADDR_A, TP_PIO_DATA_A, - vfl, 1, TP_RSS_VFL_CONFIG_A); - t4_read_indirect(adapter, TP_PIO_ADDR_A, TP_PIO_DATA_A, - vfh, 1, TP_RSS_VFH_CONFIG_A); - } + t4_tp_pio_read(adapter, vfl, 1, TP_RSS_VFL_CONFIG_A, sleep_ok); + t4_tp_pio_read(adapter, vfh, 1, TP_RSS_VFH_CONFIG_A, sleep_ok); } /** * t4_read_rss_pf_map - read PF RSS Map * @adapter: the adapter + * @sleep_ok: if true we may sleep while awaiting command completion * * Reads the PF RSS Map register and returns its value. */ -u32 t4_read_rss_pf_map(struct adapter *adapter) +u32 t4_read_rss_pf_map(struct adapter *adapter, bool sleep_ok) { u32 pfmap; - if (t4_use_ldst(adapter)) - t4_fw_tp_pio_rw(adapter, &pfmap, 1, TP_RSS_PF_MAP_A, 1); - else - t4_read_indirect(adapter, TP_PIO_ADDR_A, TP_PIO_DATA_A, - &pfmap, 1, TP_RSS_PF_MAP_A); + t4_tp_pio_read(adapter, &pfmap, 1, TP_RSS_PF_MAP_A, sleep_ok); return pfmap; } /** * t4_read_rss_pf_mask - read PF RSS Mask * @adapter: the adapter + * @sleep_ok: if true we may sleep while awaiting command completion * * Reads the PF RSS Mask register and returns its value. */ -u32 t4_read_rss_pf_mask(struct adapter *adapter) +u32 t4_read_rss_pf_mask(struct adapter *adapter, bool sleep_ok) { u32 pfmask; - if (t4_use_ldst(adapter)) - t4_fw_tp_pio_rw(adapter, &pfmask, 1, TP_RSS_PF_MSK_A, 1); - else - t4_read_indirect(adapter, TP_PIO_ADDR_A, TP_PIO_DATA_A, - &pfmask, 1, TP_RSS_PF_MSK_A); + t4_tp_pio_read(adapter, &pfmask, 1, TP_RSS_PF_MSK_A, sleep_ok); return pfmask; } @@ -5245,12 +5351,13 @@ u32 t4_read_rss_pf_mask(struct adapter *adapter) * @adap: the adapter * @v4: holds the TCP/IP counter values * @v6: holds the TCP/IPv6 counter values + * @sleep_ok: if true we may sleep while awaiting command completion * * Returns the values of TP's TCP/IP and TCP/IPv6 MIB counters. * Either @v4 or @v6 may be %NULL to skip the corresponding stats. */ void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, - struct tp_tcp_stats *v6) + struct tp_tcp_stats *v6, bool sleep_ok) { u32 val[TP_MIB_TCP_RXT_SEG_LO_A - TP_MIB_TCP_OUT_RST_A + 1]; @@ -5259,16 +5366,16 @@ void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, #define STAT64(x) (((u64)STAT(x##_HI) << 32) | STAT(x##_LO)) if (v4) { - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, val, - ARRAY_SIZE(val), TP_MIB_TCP_OUT_RST_A); + t4_tp_mib_read(adap, val, ARRAY_SIZE(val), + TP_MIB_TCP_OUT_RST_A, sleep_ok); v4->tcp_out_rsts = STAT(OUT_RST); v4->tcp_in_segs = STAT64(IN_SEG); v4->tcp_out_segs = STAT64(OUT_SEG); v4->tcp_retrans_segs = STAT64(RXT_SEG); } if (v6) { - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, val, - ARRAY_SIZE(val), TP_MIB_TCP_V6OUT_RST_A); + t4_tp_mib_read(adap, val, ARRAY_SIZE(val), + TP_MIB_TCP_V6OUT_RST_A, sleep_ok); v6->tcp_out_rsts = STAT(OUT_RST); v6->tcp_in_segs = STAT64(IN_SEG); v6->tcp_out_segs = STAT64(OUT_SEG); @@ -5283,63 +5390,66 @@ void t4_tp_get_tcp_stats(struct adapter *adap, struct tp_tcp_stats *v4, * t4_tp_get_err_stats - read TP's error MIB counters * @adap: the adapter * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion * * Returns the values of TP's error counters. */ -void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st) +void t4_tp_get_err_stats(struct adapter *adap, struct tp_err_stats *st, + bool sleep_ok) { int nchan = adap->params.arch.nchan; - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->mac_in_errs, nchan, TP_MIB_MAC_IN_ERR_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->hdr_in_errs, nchan, TP_MIB_HDR_IN_ERR_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->tcp_in_errs, nchan, TP_MIB_TCP_IN_ERR_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->tnl_cong_drops, nchan, TP_MIB_TNL_CNG_DROP_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->ofld_chan_drops, nchan, TP_MIB_OFD_CHN_DROP_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->tnl_tx_drops, nchan, TP_MIB_TNL_DROP_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->ofld_vlan_drops, nchan, TP_MIB_OFD_VLN_DROP_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - st->tcp6_in_errs, nchan, TP_MIB_TCP_V6IN_ERR_0_A); - - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, - &st->ofld_no_neigh, 2, TP_MIB_OFD_ARP_DROP_A); + t4_tp_mib_read(adap, st->mac_in_errs, nchan, TP_MIB_MAC_IN_ERR_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->hdr_in_errs, nchan, TP_MIB_HDR_IN_ERR_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->tcp_in_errs, nchan, TP_MIB_TCP_IN_ERR_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->tnl_cong_drops, nchan, + TP_MIB_TNL_CNG_DROP_0_A, sleep_ok); + t4_tp_mib_read(adap, st->ofld_chan_drops, nchan, + TP_MIB_OFD_CHN_DROP_0_A, sleep_ok); + t4_tp_mib_read(adap, st->tnl_tx_drops, nchan, TP_MIB_TNL_DROP_0_A, + sleep_ok); + t4_tp_mib_read(adap, st->ofld_vlan_drops, nchan, + TP_MIB_OFD_VLN_DROP_0_A, sleep_ok); + t4_tp_mib_read(adap, st->tcp6_in_errs, nchan, + TP_MIB_TCP_V6IN_ERR_0_A, sleep_ok); + t4_tp_mib_read(adap, &st->ofld_no_neigh, 2, TP_MIB_OFD_ARP_DROP_A, + sleep_ok); } /** * t4_tp_get_cpl_stats - read TP's CPL MIB counters * @adap: the adapter * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion * * Returns the values of TP's CPL counters. */ -void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st) +void t4_tp_get_cpl_stats(struct adapter *adap, struct tp_cpl_stats *st, + bool sleep_ok) { int nchan = adap->params.arch.nchan; - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, st->req, - nchan, TP_MIB_CPL_IN_REQ_0_A); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, st->rsp, - nchan, TP_MIB_CPL_OUT_RSP_0_A); + t4_tp_mib_read(adap, st->req, nchan, TP_MIB_CPL_IN_REQ_0_A, sleep_ok); + t4_tp_mib_read(adap, st->rsp, nchan, TP_MIB_CPL_OUT_RSP_0_A, sleep_ok); } /** * t4_tp_get_rdma_stats - read TP's RDMA MIB counters * @adap: the adapter * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion * * Returns the values of TP's RDMA counters. */ -void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st) +void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st, + bool sleep_ok) { - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, &st->rqe_dfr_pkt, - 2, TP_MIB_RQE_DFR_PKT_A); + t4_tp_mib_read(adap, &st->rqe_dfr_pkt, 2, TP_MIB_RQE_DFR_PKT_A, + sleep_ok); } /** @@ -5347,20 +5457,24 @@ void t4_tp_get_rdma_stats(struct adapter *adap, struct tp_rdma_stats *st) * @adap: the adapter * @idx: the port index * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion * * Returns the values of TP's FCoE counters for the selected port. */ void t4_get_fcoe_stats(struct adapter *adap, unsigned int idx, - struct tp_fcoe_stats *st) + struct tp_fcoe_stats *st, bool sleep_ok) { u32 val[2]; - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, &st->frames_ddp, - 1, TP_MIB_FCOE_DDP_0_A + idx); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, &st->frames_drop, - 1, TP_MIB_FCOE_DROP_0_A + idx); - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, val, - 2, TP_MIB_FCOE_BYTE_0_HI_A + 2 * idx); + t4_tp_mib_read(adap, &st->frames_ddp, 1, TP_MIB_FCOE_DDP_0_A + idx, + sleep_ok); + + t4_tp_mib_read(adap, &st->frames_drop, 1, + TP_MIB_FCOE_DROP_0_A + idx, sleep_ok); + + t4_tp_mib_read(adap, val, 2, TP_MIB_FCOE_BYTE_0_HI_A + 2 * idx, + sleep_ok); + st->octets_ddp = ((u64)val[0] << 32) | val[1]; } @@ -5368,15 +5482,16 @@ void t4_get_fcoe_stats(struct adapter *adap, unsigned int idx, * t4_get_usm_stats - read TP's non-TCP DDP MIB counters * @adap: the adapter * @st: holds the counter values + * @sleep_ok: if true we may sleep while awaiting command completion * * Returns the values of TP's counters for non-TCP directly-placed packets. */ -void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st) +void t4_get_usm_stats(struct adapter *adap, struct tp_usm_stats *st, + bool sleep_ok) { u32 val[4]; - t4_read_indirect(adap, TP_MIB_INDEX_A, TP_MIB_DATA_A, val, 4, - TP_MIB_USM_PKTS_A); + t4_tp_mib_read(adap, val, 4, TP_MIB_USM_PKTS_A, sleep_ok); st->frames = val[0]; st->drops = val[1]; st->octets = ((u64)val[2] << 32) | val[3]; @@ -8205,7 +8320,7 @@ struct flash_desc { u32 size_mb; }; -static int get_flash_params(struct adapter *adap) +static int t4_get_flash_params(struct adapter *adap) { /* Table for non-Numonix supported flash parts. Numonix parts are left * to the preexisting code. All flash parts have 64KB sectors. @@ -8214,40 +8329,136 @@ static int get_flash_params(struct adapter *adap) { 0x150201, 4 << 20 }, /* Spansion 4MB S25FL032P */ }; + unsigned int part, manufacturer; + unsigned int density, size; + u32 flashid = 0; int ret; - u32 info; + + /* Issue a Read ID Command to the Flash part. We decode supported + * Flash parts and their sizes from this. There's a newer Query + * Command which can retrieve detailed geometry information but many + * Flash parts don't support it. + */ ret = sf1_write(adap, 1, 1, 0, SF_RD_ID); if (!ret) - ret = sf1_read(adap, 3, 0, 1, &info); + ret = sf1_read(adap, 3, 0, 1, &flashid); t4_write_reg(adap, SF_OP_A, 0); /* unlock SF */ if (ret) return ret; - for (ret = 0; ret < ARRAY_SIZE(supported_flash); ++ret) - if (supported_flash[ret].vendor_and_model_id == info) { - adap->params.sf_size = supported_flash[ret].size_mb; + /* Check to see if it's one of our non-standard supported Flash parts. + */ + for (part = 0; part < ARRAY_SIZE(supported_flash); part++) + if (supported_flash[part].vendor_and_model_id == flashid) { + adap->params.sf_size = supported_flash[part].size_mb; adap->params.sf_nsec = adap->params.sf_size / SF_SEC_SIZE; - return 0; + goto found; } - if ((info & 0xff) != 0x20) /* not a Numonix flash */ + /* Decode Flash part size. The code below looks repetative with + * common encodings, but that's not guaranteed in the JEDEC + * specification for the Read JADEC ID command. The only thing that + * we're guaranteed by the JADEC specification is where the + * Manufacturer ID is in the returned result. After that each + * Manufacturer ~could~ encode things completely differently. + * Note, all Flash parts must have 64KB sectors. + */ + manufacturer = flashid & 0xff; + switch (manufacturer) { + case 0x20: { /* Micron/Numonix */ + /* This Density -> Size decoding table is taken from Micron + * Data Sheets. + */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x14: /* 1MB */ + size = 1 << 20; + break; + case 0x15: /* 2MB */ + size = 1 << 21; + break; + case 0x16: /* 4MB */ + size = 1 << 22; + break; + case 0x17: /* 8MB */ + size = 1 << 23; + break; + case 0x18: /* 16MB */ + size = 1 << 24; + break; + case 0x19: /* 32MB */ + size = 1 << 25; + break; + case 0x20: /* 64MB */ + size = 1 << 26; + break; + case 0x21: /* 128MB */ + size = 1 << 27; + break; + case 0x22: /* 256MB */ + size = 1 << 28; + break; + + default: + dev_err(adap->pdev_dev, "Micron Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); return -EINVAL; - info >>= 16; /* log2 of size */ - if (info >= 0x14 && info < 0x18) - adap->params.sf_nsec = 1 << (info - 16); - else if (info == 0x18) - adap->params.sf_nsec = 64; - else + } + break; + } + case 0xc2: { /* Macronix */ + /* This Density -> Size decoding table is taken from Macronix + * Data Sheets. + */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x17: /* 8MB */ + size = 1 << 23; + break; + case 0x18: /* 16MB */ + size = 1 << 24; + break; + default: + dev_err(adap->pdev_dev, "Macronix Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); return -EINVAL; - adap->params.sf_size = 1 << info; - adap->params.sf_fw_start = - t4_read_reg(adap, CIM_BOOT_CFG_A) & BOOTADDR_M; + } + } + case 0xef: { /* Winbond */ + /* This Density -> Size decoding table is taken from Winbond + * Data Sheets. + */ + density = (flashid >> 16) & 0xff; + switch (density) { + case 0x17: /* 8MB */ + size = 1 << 23; + break; + case 0x18: /* 16MB */ + size = 1 << 24; + break; + default: + dev_err(adap->pdev_dev, "Winbond Flash Part has bad size, ID = %#x, Density code = %#x\n", + flashid, density); + return -EINVAL; + } + break; + } + default: + dev_err(adap->pdev_dev, "Unsupported Flash Part, ID = %#x\n", + flashid); + return -EINVAL; + } + + /* Store decoded Flash size and fall through into vetting code. */ + adap->params.sf_size = size; + adap->params.sf_nsec = size / SF_SEC_SIZE; +found: if (adap->params.sf_size < FLASH_MIN_SIZE) - dev_warn(adap->pdev_dev, "WARNING!!! FLASH size %#x < %#x!!!\n", - adap->params.sf_size, FLASH_MIN_SIZE); + dev_warn(adap->pdev_dev, "WARNING: Flash Part ID %#x, size %#x < %#x\n", + flashid, adap->params.sf_size, FLASH_MIN_SIZE); return 0; } @@ -8285,7 +8496,7 @@ int t4_prep_adapter(struct adapter *adapter) get_pci_mode(adapter, &adapter->params.pci); pl_rev = REV_G(t4_read_reg(adapter, PL_REV_A)); - ret = get_flash_params(adapter); + ret = t4_get_flash_params(adapter); if (ret < 0) { dev_err(adapter->pdev_dev, "error %d identifying flash\n", ret); return ret; @@ -8567,10 +8778,11 @@ int t4_init_sge_params(struct adapter *adapter) /** * t4_init_tp_params - initialize adap->params.tp * @adap: the adapter + * @sleep_ok: if true we may sleep while awaiting command completion * * Initialize various fields of the adapter's TP Parameters structure. */ -int t4_init_tp_params(struct adapter *adap) +int t4_init_tp_params(struct adapter *adap, bool sleep_ok) { int chan; u32 v; @@ -8586,19 +8798,11 @@ int t4_init_tp_params(struct adapter *adap) /* Cache the adapter's Compressed Filter Mode and global Incress * Configuration. */ - if (t4_use_ldst(adap)) { - t4_fw_tp_pio_rw(adap, &adap->params.tp.vlan_pri_map, 1, - TP_VLAN_PRI_MAP_A, 1); - t4_fw_tp_pio_rw(adap, &adap->params.tp.ingress_config, 1, - TP_INGRESS_CONFIG_A, 1); - } else { - t4_read_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, - &adap->params.tp.vlan_pri_map, 1, - TP_VLAN_PRI_MAP_A); - t4_read_indirect(adap, TP_PIO_ADDR_A, TP_PIO_DATA_A, - &adap->params.tp.ingress_config, 1, - TP_INGRESS_CONFIG_A); - } + t4_tp_pio_read(adap, &adap->params.tp.vlan_pri_map, 1, + TP_VLAN_PRI_MAP_A, sleep_ok); + t4_tp_pio_read(adap, &adap->params.tp.ingress_config, 1, + TP_INGRESS_CONFIG_A, sleep_ok); + /* For T6, cache the adapter's compressed error vector * and passing outer header info for encapsulated packets. */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 633e9751a25e..8c22bb8c9fbf 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -181,6 +181,8 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x50a7), /* Custom T580-CR */ CH_PCI_ID_TABLE_FENTRY(0x50a8), /* Custom T580-KR */ CH_PCI_ID_TABLE_FENTRY(0x50a9), /* Custom T580-KR */ + CH_PCI_ID_TABLE_FENTRY(0x50aa), /* Custom T580-CR */ + CH_PCI_ID_TABLE_FENTRY(0x50ab), /* Custom T520-CR */ /* T6 adapters: */ diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h index dac90837842b..82614e078f50 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_regs.h @@ -1447,6 +1447,8 @@ #define LKPTBLQUEUE0_M 0x3ffU #define LKPTBLQUEUE0_G(x) (((x) >> LKPTBLQUEUE0_S) & LKPTBLQUEUE0_M) +#define TP_TM_PIO_ADDR_A 0x7e18 +#define TP_TM_PIO_DATA_A 0x7e1c #define TP_PIO_ADDR_A 0x7e40 #define TP_PIO_DATA_A 0x7e44 #define TP_MIB_INDEX_A 0x7e50 diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig index 9d7cb0387bf7..30000b6aa7b8 100644 --- a/drivers/net/ethernet/hisilicon/Kconfig +++ b/drivers/net/ethernet/hisilicon/Kconfig @@ -78,7 +78,7 @@ config HNS_ENET config HNS3 tristate "Hisilicon Network Subsystem Support HNS3 (Framework)" - depends on PCI + depends on PCI ---help--- This selects the framework support for Hisilicon Network Subsystem 3. This layer facilitates clients like ENET, RoCE and user-space ethernet @@ -87,7 +87,7 @@ config HNS3 config HNS3_HCLGE tristate "Hisilicon HNS3 HCLGE Acceleration Engine & Compatibility Layer Support" - depends on PCI_MSI + depends on PCI_MSI depends on HNS3 ---help--- This selects the HNS3_HCLGE network acceleration engine & its hardware @@ -96,7 +96,7 @@ config HNS3_HCLGE config HNS3_ENET tristate "Hisilicon HNS3 Ethernet Device Support" - depends on 64BIT && PCI + depends on 64BIT && PCI depends on HNS3 && HNS3_HCLGE ---help--- This selects the Ethernet Driver for Hisilicon Network Subsystem 3 for hip08 diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index c677530841cf..575f50df340c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -339,6 +339,10 @@ struct hnae3_ae_ops { u8 *hfunc); int (*set_rss)(struct hnae3_handle *handle, const u32 *indir, const u8 *key, const u8 hfunc); + int (*set_rss_tuple)(struct hnae3_handle *handle, + struct ethtool_rxnfc *cmd); + int (*get_rss_tuple)(struct hnae3_handle *handle, + struct ethtool_rxnfc *cmd); int (*get_tc_size)(struct hnae3_handle *handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 8b511e6e0ce9..60960e588b5f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -85,6 +85,15 @@ static int hclge_init_cmd_queue(struct hclge_dev *hdev, int ring_type) return 0; } +void hclge_cmd_reuse_desc(struct hclge_desc *desc, bool is_read) +{ + desc->flag = cpu_to_le16(HCLGE_CMD_FLAG_NO_INTR | HCLGE_CMD_FLAG_IN); + if (is_read) + desc->flag |= cpu_to_le16(HCLGE_CMD_FLAG_WR); + else + desc->flag &= cpu_to_le16(~HCLGE_CMD_FLAG_WR); +} + void hclge_cmd_setup_basic_desc(struct hclge_desc *desc, enum hclge_opcode_type opcode, bool is_read) { @@ -208,7 +217,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num) * which will be use for hardware to write back */ ntc = hw->cmq.csq.next_to_use; - opcode = desc[0].opcode; + opcode = le16_to_cpu(desc[0].opcode); while (handle < num) { desc_to_use = &hw->cmq.csq.desc[hw->cmq.csq.next_to_use]; *desc_to_use = desc[handle]; @@ -225,7 +234,7 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num) * If the command is sync, wait for the firmware to write back, * if multi descriptors to be sent, use the first one to check */ - if (HCLGE_SEND_SYNC(desc->flag)) { + if (HCLGE_SEND_SYNC(le16_to_cpu(desc->flag))) { do { if (hclge_cmd_csq_done(hw)) break; @@ -244,9 +253,9 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num) pr_debug("Get cmd desc:\n"); if (likely(!hclge_is_special_opcode(opcode))) - desc_ret = desc[handle].retval; + desc_ret = le16_to_cpu(desc[handle].retval); else - desc_ret = desc[0].retval; + desc_ret = le16_to_cpu(desc[0].retval); if ((enum hclge_cmd_return_status)desc_ret == HCLGE_CMD_EXEC_SUCCESS) @@ -276,15 +285,15 @@ int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num) return retval; } -enum hclge_cmd_status hclge_cmd_query_firmware_version(struct hclge_hw *hw, - u32 *version) +static enum hclge_cmd_status hclge_cmd_query_firmware_version( + struct hclge_hw *hw, u32 *version) { - struct hclge_query_version *resp; + struct hclge_query_version_cmd *resp; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_FW_VER, 1); - resp = (struct hclge_query_version *)desc.data; + resp = (struct hclge_query_version_cmd *)desc.data; ret = hclge_cmd_send(hw, &desc, 1); if (!ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h index 6b6d28eff664..b4373345c2b4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h @@ -221,12 +221,12 @@ enum hclge_opcode_type { #define HCLGE_RCB_INIT_QUERY_TIMEOUT 10 #define HCLGE_RCB_INIT_FLAG_EN_B 0 #define HCLGE_RCB_INIT_FLAG_FINI_B 8 -struct hclge_config_rcb_init { +struct hclge_config_rcb_init_cmd { __le16 rcb_init_flag; u8 rsv[22]; }; -struct hclge_tqp_map { +struct hclge_tqp_map_cmd { __le16 tqp_id; /* Absolute tqp id for in this pf */ u8 tqp_vf; /* VF id */ #define HCLGE_TQP_MAP_TYPE_PF 0 @@ -246,15 +246,15 @@ enum hclge_int_type { HCLGE_INT_EVENT, }; -struct hclge_ctrl_vector_chain { +struct hclge_ctrl_vector_chain_cmd { u8 int_vector_id; u8 int_cause_num; #define HCLGE_INT_TYPE_S 0 -#define HCLGE_INT_TYPE_M 0x3 +#define HCLGE_INT_TYPE_M GENMASK(1, 0) #define HCLGE_TQP_ID_S 2 -#define HCLGE_TQP_ID_M (0x7ff << HCLGE_TQP_ID_S) +#define HCLGE_TQP_ID_M GENMASK(12, 2) #define HCLGE_INT_GL_IDX_S 13 -#define HCLGE_INT_GL_IDX_M (0x3 << HCLGE_INT_GL_IDX_S) +#define HCLGE_INT_GL_IDX_M GENMASK(14, 13) __le16 tqp_type_and_id[HCLGE_VECTOR_ELEMENTS_PER_CMD]; u8 vfid; u8 rsv; @@ -263,18 +263,18 @@ struct hclge_ctrl_vector_chain { #define HCLGE_TC_NUM 8 #define HCLGE_TC0_PRI_BUF_EN_B 15 /* Bit 15 indicate enable or not */ #define HCLGE_BUF_UNIT_S 7 /* Buf size is united by 128 bytes */ -struct hclge_tx_buff_alloc { +struct hclge_tx_buff_alloc_cmd { __le16 tx_pkt_buff[HCLGE_TC_NUM]; u8 tx_buff_rsv[8]; }; -struct hclge_rx_priv_buff { +struct hclge_rx_priv_buff_cmd { __le16 buf_num[HCLGE_TC_NUM]; __le16 shared_buf; u8 rsv[6]; }; -struct hclge_query_version { +struct hclge_query_version_cmd { __le32 firmware; __le32 firmware_rsv[5]; }; @@ -328,14 +328,14 @@ struct hclge_pkt_buf_alloc { }; #define HCLGE_RX_COM_WL_EN_B 15 -struct hclge_rx_com_wl_buf { +struct hclge_rx_com_wl_buf_cmd { __le16 high_wl; __le16 low_wl; u8 rsv[20]; }; #define HCLGE_RX_PKT_EN_B 15 -struct hclge_rx_pkt_buf { +struct hclge_rx_pkt_buf_cmd { __le16 high_pkt; __le16 low_pkt; u8 rsv[20]; @@ -348,7 +348,7 @@ struct hclge_rx_pkt_buf { #define HCLGE_PF_MAC_NUM_MASK 0x3 #define HCLGE_PF_STATE_MAIN BIT(HCLGE_PF_STATE_MAIN_B) #define HCLGE_PF_STATE_DONE BIT(HCLGE_PF_STATE_DONE_B) -struct hclge_func_status { +struct hclge_func_status_cmd { __le32 vf_rst_state[4]; u8 pf_state; u8 mac_id; @@ -359,7 +359,7 @@ struct hclge_func_status { u8 rsv[2]; }; -struct hclge_pf_res { +struct hclge_pf_res_cmd { __le16 tqp_num; __le16 buf_size; __le16 msixcap_localid_ba_nic; @@ -372,30 +372,30 @@ struct hclge_pf_res { }; #define HCLGE_CFG_OFFSET_S 0 -#define HCLGE_CFG_OFFSET_M 0xfffff /* Byte (8-10.3) */ +#define HCLGE_CFG_OFFSET_M GENMASK(19, 0) #define HCLGE_CFG_RD_LEN_S 24 -#define HCLGE_CFG_RD_LEN_M (0xf << HCLGE_CFG_RD_LEN_S) +#define HCLGE_CFG_RD_LEN_M GENMASK(27, 24) #define HCLGE_CFG_RD_LEN_BYTES 16 #define HCLGE_CFG_RD_LEN_UNIT 4 #define HCLGE_CFG_VMDQ_S 0 -#define HCLGE_CFG_VMDQ_M (0xff << HCLGE_CFG_VMDQ_S) +#define HCLGE_CFG_VMDQ_M GENMASK(7, 0) #define HCLGE_CFG_TC_NUM_S 8 -#define HCLGE_CFG_TC_NUM_M (0xff << HCLGE_CFG_TC_NUM_S) +#define HCLGE_CFG_TC_NUM_M GENMASK(15, 8) #define HCLGE_CFG_TQP_DESC_N_S 16 -#define HCLGE_CFG_TQP_DESC_N_M (0xffff << HCLGE_CFG_TQP_DESC_N_S) +#define HCLGE_CFG_TQP_DESC_N_M GENMASK(31, 16) #define HCLGE_CFG_PHY_ADDR_S 0 -#define HCLGE_CFG_PHY_ADDR_M (0x1f << HCLGE_CFG_PHY_ADDR_S) +#define HCLGE_CFG_PHY_ADDR_M GENMASK(4, 0) #define HCLGE_CFG_MEDIA_TP_S 8 -#define HCLGE_CFG_MEDIA_TP_M (0xff << HCLGE_CFG_MEDIA_TP_S) +#define HCLGE_CFG_MEDIA_TP_M GENMASK(15, 8) #define HCLGE_CFG_RX_BUF_LEN_S 16 -#define HCLGE_CFG_RX_BUF_LEN_M (0xffff << HCLGE_CFG_RX_BUF_LEN_S) +#define HCLGE_CFG_RX_BUF_LEN_M GENMASK(31, 16) #define HCLGE_CFG_MAC_ADDR_H_S 0 -#define HCLGE_CFG_MAC_ADDR_H_M (0xffff << HCLGE_CFG_MAC_ADDR_H_S) +#define HCLGE_CFG_MAC_ADDR_H_M GENMASK(15, 0) #define HCLGE_CFG_DEFAULT_SPEED_S 16 -#define HCLGE_CFG_DEFAULT_SPEED_M (0xff << HCLGE_CFG_DEFAULT_SPEED_S) +#define HCLGE_CFG_DEFAULT_SPEED_M GENMASK(23, 16) -struct hclge_cfg_param { +struct hclge_cfg_param_cmd { __le32 offset; __le32 rsv; __le32 param[4]; @@ -405,7 +405,7 @@ struct hclge_cfg_param { #define HCLGE_DESC_NUM 0x40 #define HCLGE_ALLOC_VALID_B 0 -struct hclge_vf_num { +struct hclge_vf_num_cmd { u8 alloc_valid; u8 rsv[23]; }; @@ -413,13 +413,13 @@ struct hclge_vf_num { #define HCLGE_RSS_DEFAULT_OUTPORT_B 4 #define HCLGE_RSS_HASH_KEY_OFFSET_B 4 #define HCLGE_RSS_HASH_KEY_NUM 16 -struct hclge_rss_config { +struct hclge_rss_config_cmd { u8 hash_config; u8 rsv[7]; u8 hash_key[HCLGE_RSS_HASH_KEY_NUM]; }; -struct hclge_rss_input_tuple { +struct hclge_rss_input_tuple_cmd { u8 ipv4_tcp_en; u8 ipv4_udp_en; u8 ipv4_sctp_en; @@ -433,26 +433,26 @@ struct hclge_rss_input_tuple { #define HCLGE_RSS_CFG_TBL_SIZE 16 -struct hclge_rss_indirection_table { - u16 start_table_index; - u16 rss_set_bitmap; +struct hclge_rss_indirection_table_cmd { + __le16 start_table_index; + __le16 rss_set_bitmap; u8 rsv[4]; u8 rss_result[HCLGE_RSS_CFG_TBL_SIZE]; }; #define HCLGE_RSS_TC_OFFSET_S 0 -#define HCLGE_RSS_TC_OFFSET_M (0x3ff << HCLGE_RSS_TC_OFFSET_S) +#define HCLGE_RSS_TC_OFFSET_M GENMASK(9, 0) #define HCLGE_RSS_TC_SIZE_S 12 -#define HCLGE_RSS_TC_SIZE_M (0x7 << HCLGE_RSS_TC_SIZE_S) +#define HCLGE_RSS_TC_SIZE_M GENMASK(14, 12) #define HCLGE_RSS_TC_VALID_B 15 -struct hclge_rss_tc_mode { - u16 rss_tc_mode[HCLGE_MAX_TC_NUM]; +struct hclge_rss_tc_mode_cmd { + __le16 rss_tc_mode[HCLGE_MAX_TC_NUM]; u8 rsv[8]; }; #define HCLGE_LINK_STS_B 0 #define HCLGE_LINK_STATUS BIT(HCLGE_LINK_STS_B) -struct hclge_link_status { +struct hclge_link_status_cmd { u8 status; u8 rsv[23]; }; @@ -467,7 +467,7 @@ struct hclge_promisc_param { #define HCLGE_PROMISC_EN_UC 0x1 #define HCLGE_PROMISC_EN_MC 0x2 #define HCLGE_PROMISC_EN_BC 0x4 -struct hclge_promisc_cfg { +struct hclge_promisc_cfg_cmd { u8 flag; u8 vf_id; __le16 rsv0; @@ -495,18 +495,18 @@ enum hclge_promisc_type { #define HCLGE_MAC_TX_UNDER_MIN_ERR_B 21 #define HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B 22 -struct hclge_config_mac_mode { +struct hclge_config_mac_mode_cmd { __le32 txrx_pad_fcs_loop_en; u8 rsv[20]; }; #define HCLGE_CFG_SPEED_S 0 -#define HCLGE_CFG_SPEED_M (0x3f << HCLGE_CFG_SPEED_S) +#define HCLGE_CFG_SPEED_M GENMASK(5, 0) #define HCLGE_CFG_DUPLEX_B 7 #define HCLGE_CFG_DUPLEX_M BIT(HCLGE_CFG_DUPLEX_B) -struct hclge_config_mac_speed_dup { +struct hclge_config_mac_speed_dup_cmd { u8 speed_dup; #define HCLGE_CFG_MAC_SPEED_CHANGE_EN_B 0 @@ -518,17 +518,17 @@ struct hclge_config_mac_speed_dup { #define HCLGE_QUERY_AN_B 0 #define HCLGE_QUERY_DUPLEX_B 2 -#define HCLGE_QUERY_SPEED_M (0x1f << HCLGE_QUERY_SPEED_S) +#define HCLGE_QUERY_SPEED_M GENMASK(4, 0) #define HCLGE_QUERY_AN_M BIT(HCLGE_QUERY_AN_B) #define HCLGE_QUERY_DUPLEX_M BIT(HCLGE_QUERY_DUPLEX_B) -struct hclge_query_an_speed_dup { +struct hclge_query_an_speed_dup_cmd { u8 an_syn_dup_speed; u8 pause; u8 rsv[23]; }; -#define HCLGE_RING_ID_MASK 0x3ff +#define HCLGE_RING_ID_MASK GENMASK(9, 0) #define HCLGE_TQP_ENABLE_B 0 #define HCLGE_MAC_CFG_AN_EN_B 0 @@ -539,7 +539,7 @@ struct hclge_query_an_speed_dup { #define HCLGE_MAC_CFG_AN_EN BIT(HCLGE_MAC_CFG_AN_EN_B) -struct hclge_config_auto_neg { +struct hclge_config_auto_neg_cmd { __le32 cfg_an_cmd_flag; u8 rsv[20]; }; @@ -548,7 +548,7 @@ struct hclge_config_auto_neg { #define HCLGE_MAC_MAX_MTU 9728 #define HCLGE_MAC_UPLINK_PORT 0x100 -struct hclge_config_max_frm_size { +struct hclge_config_max_frm_size_cmd { __le16 max_frm_size; u8 rsv[22]; }; @@ -565,10 +565,10 @@ enum hclge_mac_vlan_tbl_opcode { #define HCLGE_MAC_EPORT_SW_EN_B 0xc #define HCLGE_MAC_EPORT_TYPE_B 0xb #define HCLGE_MAC_EPORT_VFID_S 0x3 -#define HCLGE_MAC_EPORT_VFID_M (0xff << HCLGE_MAC_EPORT_VFID_S) +#define HCLGE_MAC_EPORT_VFID_M GENMASK(10, 3) #define HCLGE_MAC_EPORT_PFID_S 0x0 -#define HCLGE_MAC_EPORT_PFID_M (0x7 << HCLGE_MAC_EPORT_PFID_S) -struct hclge_mac_vlan_tbl_entry { +#define HCLGE_MAC_EPORT_PFID_M GENMASK(2, 0) +struct hclge_mac_vlan_tbl_entry_cmd { u8 flags; u8 resp_code; __le16 vlan_tag; @@ -583,15 +583,15 @@ struct hclge_mac_vlan_tbl_entry { }; #define HCLGE_CFG_MTA_MAC_SEL_S 0x0 -#define HCLGE_CFG_MTA_MAC_SEL_M (0x3 << HCLGE_CFG_MTA_MAC_SEL_S) +#define HCLGE_CFG_MTA_MAC_SEL_M GENMASK(1, 0) #define HCLGE_CFG_MTA_MAC_EN_B 0x7 -struct hclge_mta_filter_mode { +struct hclge_mta_filter_mode_cmd { u8 dmac_sel_en; /* Use lowest 2 bit as sel_mode, bit 7 as enable */ u8 rsv[23]; }; #define HCLGE_CFG_FUNC_MTA_ACCEPT_B 0x0 -struct hclge_cfg_func_mta_filter { +struct hclge_cfg_func_mta_filter_cmd { u8 accept; /* Only used lowest 1 bit */ u8 function_id; u8 rsv[22]; @@ -599,14 +599,14 @@ struct hclge_cfg_func_mta_filter { #define HCLGE_CFG_MTA_ITEM_ACCEPT_B 0x0 #define HCLGE_CFG_MTA_ITEM_IDX_S 0x0 -#define HCLGE_CFG_MTA_ITEM_IDX_M (0xfff << HCLGE_CFG_MTA_ITEM_IDX_S) -struct hclge_cfg_func_mta_item { - u16 item_idx; /* Only used lowest 12 bit */ +#define HCLGE_CFG_MTA_ITEM_IDX_M GENMASK(11, 0) +struct hclge_cfg_func_mta_item_cmd { + __le16 item_idx; /* Only used lowest 12 bit */ u8 accept; /* Only used lowest 1 bit */ u8 rsv[21]; }; -struct hclge_mac_vlan_add { +struct hclge_mac_vlan_add_cmd { __le16 flags; __le16 mac_addr_hi16; __le32 mac_addr_lo32; @@ -619,7 +619,7 @@ struct hclge_mac_vlan_add { }; #define HNS3_MAC_VLAN_CFG_FLAG_BIT 0 -struct hclge_mac_vlan_remove { +struct hclge_mac_vlan_remove_cmd { __le16 flags; __le16 mac_addr_hi16; __le32 mac_addr_lo32; @@ -631,21 +631,21 @@ struct hclge_mac_vlan_remove { u8 rsv[4]; }; -struct hclge_vlan_filter_ctrl { +struct hclge_vlan_filter_ctrl_cmd { u8 vlan_type; u8 vlan_fe; u8 rsv[22]; }; -struct hclge_vlan_filter_pf_cfg { +struct hclge_vlan_filter_pf_cfg_cmd { u8 vlan_offset; u8 vlan_cfg; u8 rsv[2]; u8 vlan_offset_bitmap[20]; }; -struct hclge_vlan_filter_vf_cfg { - u16 vlan_id; +struct hclge_vlan_filter_vf_cfg_cmd { + __le16 vlan_id; u8 resp_code; u8 rsv; u8 vlan_cfg; @@ -653,14 +653,14 @@ struct hclge_vlan_filter_vf_cfg { u8 vf_bitmap[16]; }; -struct hclge_cfg_com_tqp_queue { +struct hclge_cfg_com_tqp_queue_cmd { __le16 tqp_id; __le16 stream_id; u8 enable; u8 rsv[19]; }; -struct hclge_cfg_tx_queue_pointer { +struct hclge_cfg_tx_queue_pointer_cmd { __le16 tqp_id; __le16 tx_tail; __le16 tx_head; @@ -670,12 +670,12 @@ struct hclge_cfg_tx_queue_pointer { }; #define HCLGE_TSO_MSS_MIN_S 0 -#define HCLGE_TSO_MSS_MIN_M (0x3FFF << HCLGE_TSO_MSS_MIN_S) +#define HCLGE_TSO_MSS_MIN_M GENMASK(13, 0) #define HCLGE_TSO_MSS_MAX_S 16 -#define HCLGE_TSO_MSS_MAX_M (0x3FFF << HCLGE_TSO_MSS_MAX_S) +#define HCLGE_TSO_MSS_MAX_M GENMASK(29, 16) -struct hclge_cfg_tso_status { +struct hclge_cfg_tso_status_cmd { __le16 tso_mss_min; __le16 tso_mss_max; u8 rsv[20]; @@ -685,7 +685,7 @@ struct hclge_cfg_tso_status { #define HCLGE_TSO_MSS_MAX 9668 #define HCLGE_TQP_RESET_B 0 -struct hclge_reset_tqp_queue { +struct hclge_reset_tqp_queue_cmd { __le16 tqp_id; u8 reset_req; u8 ready_to_reset; @@ -739,6 +739,7 @@ struct hclge_hw; int hclge_cmd_send(struct hclge_hw *hw, struct hclge_desc *desc, int num); void hclge_cmd_setup_basic_desc(struct hclge_desc *desc, enum hclge_opcode_type opcode, bool is_read); +void hclge_cmd_reuse_desc(struct hclge_desc *desc, bool is_read); int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev, struct hclge_promisc_param *param); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 1a13614af3de..c322b4534148 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -362,7 +362,7 @@ static int hclge_64_bit_update_stats(struct hclge_dev *hdev) #define HCLGE_64_BIT_RTN_DATANUM 4 u64 *data = (u64 *)(&hdev->hw_stats.all_64_bit_stats); struct hclge_desc desc[HCLGE_64_BIT_CMD_NUM]; - u64 *desc_data; + __le64 *desc_data; int i, k, n; int ret; @@ -376,14 +376,14 @@ static int hclge_64_bit_update_stats(struct hclge_dev *hdev) for (i = 0; i < HCLGE_64_BIT_CMD_NUM; i++) { if (unlikely(i == 0)) { - desc_data = (u64 *)(&desc[i].data[0]); + desc_data = (__le64 *)(&desc[i].data[0]); n = HCLGE_64_BIT_RTN_DATANUM - 1; } else { - desc_data = (u64 *)(&desc[i]); + desc_data = (__le64 *)(&desc[i]); n = HCLGE_64_BIT_RTN_DATANUM; } for (k = 0; k < n; k++) { - *data++ += cpu_to_le64(*desc_data); + *data++ += le64_to_cpu(*desc_data); desc_data++; } } @@ -411,7 +411,7 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev) struct hclge_desc desc[HCLGE_32_BIT_CMD_NUM]; struct hclge_32_bit_stats *all_32_bit_stats; - u32 *desc_data; + __le32 *desc_data; int i, k, n; u64 *data; int ret; @@ -431,21 +431,27 @@ static int hclge_32_bit_update_stats(struct hclge_dev *hdev) hclge_reset_partial_32bit_counter(all_32_bit_stats); for (i = 0; i < HCLGE_32_BIT_CMD_NUM; i++) { if (unlikely(i == 0)) { + __le16 *desc_data_16bit; + all_32_bit_stats->igu_rx_err_pkt += - cpu_to_le32(desc[i].data[0]); + le32_to_cpu(desc[i].data[0]); + + desc_data_16bit = (__le16 *)&desc[i].data[1]; all_32_bit_stats->igu_rx_no_eof_pkt += - cpu_to_le32(desc[i].data[1] & 0xffff); + le16_to_cpu(*desc_data_16bit); + + desc_data_16bit++; all_32_bit_stats->igu_rx_no_sof_pkt += - cpu_to_le32((desc[i].data[1] >> 16) & 0xffff); + le16_to_cpu(*desc_data_16bit); - desc_data = (u32 *)(&desc[i].data[2]); + desc_data = &desc[i].data[2]; n = HCLGE_32_BIT_RTN_DATANUM - 4; } else { - desc_data = (u32 *)(&desc[i]); + desc_data = (__le32 *)&desc[i]; n = HCLGE_32_BIT_RTN_DATANUM; } for (k = 0; k < n; k++) { - *data++ += cpu_to_le32(*desc_data); + *data++ += le32_to_cpu(*desc_data); desc_data++; } } @@ -460,7 +466,7 @@ static int hclge_mac_update_stats(struct hclge_dev *hdev) u64 *data = (u64 *)(&hdev->hw_stats.mac_stats); struct hclge_desc desc[HCLGE_MAC_CMD_NUM]; - u64 *desc_data; + __le64 *desc_data; int i, k, n; int ret; @@ -475,14 +481,14 @@ static int hclge_mac_update_stats(struct hclge_dev *hdev) for (i = 0; i < HCLGE_MAC_CMD_NUM; i++) { if (unlikely(i == 0)) { - desc_data = (u64 *)(&desc[i].data[0]); + desc_data = (__le64 *)(&desc[i].data[0]); n = HCLGE_RTN_DATA_NUM - 2; } else { - desc_data = (u64 *)(&desc[i]); + desc_data = (__le64 *)(&desc[i]); n = HCLGE_RTN_DATA_NUM; } for (k = 0; k < n; k++) { - *data++ += cpu_to_le64(*desc_data); + *data++ += le64_to_cpu(*desc_data); desc_data++; } } @@ -508,7 +514,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle) HCLGE_OPC_QUERY_RX_STATUS, true); - desc[0].data[0] = (tqp->index & 0x1ff); + desc[0].data[0] = cpu_to_le32((tqp->index & 0x1ff)); ret = hclge_cmd_send(&hdev->hw, desc, 1); if (ret) { dev_err(&hdev->pdev->dev, @@ -517,7 +523,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle) return ret; } tqp->tqp_stats.rcb_rx_ring_pktnum_rcd += - cpu_to_le32(desc[0].data[4]); + le32_to_cpu(desc[0].data[4]); } for (i = 0; i < kinfo->num_tqps; i++) { @@ -528,7 +534,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle) HCLGE_OPC_QUERY_TX_STATUS, true); - desc[0].data[0] = (tqp->index & 0x1ff); + desc[0].data[0] = cpu_to_le32((tqp->index & 0x1ff)); ret = hclge_cmd_send(&hdev->hw, desc, 1); if (ret) { dev_err(&hdev->pdev->dev, @@ -537,7 +543,7 @@ static int hclge_tqps_update_stats(struct hnae3_handle *handle) return ret; } tqp->tqp_stats.rcb_tx_ring_pktnum_rcd += - cpu_to_le32(desc[0].data[4]); + le32_to_cpu(desc[0].data[4]); } return 0; @@ -552,12 +558,12 @@ static u64 *hclge_tqps_get_stats(struct hnae3_handle *handle, u64 *data) for (i = 0; i < kinfo->num_tqps; i++) { tqp = container_of(kinfo->tqp[i], struct hclge_tqp, q); - *buff++ = cpu_to_le64(tqp->tqp_stats.rcb_tx_ring_pktnum_rcd); + *buff++ = tqp->tqp_stats.rcb_tx_ring_pktnum_rcd; } for (i = 0; i < kinfo->num_tqps; i++) { tqp = container_of(kinfo->tqp[i], struct hclge_tqp, q); - *buff++ = cpu_to_le64(tqp->tqp_stats.rcb_rx_ring_pktnum_rcd); + *buff++ = tqp->tqp_stats.rcb_rx_ring_pktnum_rcd; } return buff; @@ -820,7 +826,7 @@ static void hclge_get_stats(struct hnae3_handle *handle, u64 *data) } static int hclge_parse_func_status(struct hclge_dev *hdev, - struct hclge_func_status *status) + struct hclge_func_status_cmd *status) { if (!(status->pf_state & HCLGE_PF_STATE_DONE)) return -EINVAL; @@ -837,13 +843,13 @@ static int hclge_parse_func_status(struct hclge_dev *hdev, static int hclge_query_function_status(struct hclge_dev *hdev) { - struct hclge_func_status *req; + struct hclge_func_status_cmd *req; struct hclge_desc desc; int timeout = 0; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_FUNC_STATUS, true); - req = (struct hclge_func_status *)desc.data; + req = (struct hclge_func_status_cmd *)desc.data; do { ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -868,7 +874,7 @@ static int hclge_query_function_status(struct hclge_dev *hdev) static int hclge_query_pf_resource(struct hclge_dev *hdev) { - struct hclge_pf_res *req; + struct hclge_pf_res_cmd *req; struct hclge_desc desc; int ret; @@ -880,7 +886,7 @@ static int hclge_query_pf_resource(struct hclge_dev *hdev) return ret; } - req = (struct hclge_pf_res *)desc.data; + req = (struct hclge_pf_res_cmd *)desc.data; hdev->num_tqps = __le16_to_cpu(req->tqp_num); hdev->pkt_buf_size = __le16_to_cpu(req->buf_size) << HCLGE_BUF_UNIT_S; @@ -938,12 +944,12 @@ static int hclge_parse_speed(int speed_cmd, int *speed) static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc) { - struct hclge_cfg_param *req; + struct hclge_cfg_param_cmd *req; u64 mac_addr_tmp_high; u64 mac_addr_tmp; int i; - req = (struct hclge_cfg_param *)desc[0].data; + req = (struct hclge_cfg_param_cmd *)desc[0].data; /* get the configuration */ cfg->vmdq_vport_num = hnae_get_field(__le32_to_cpu(req->param[0]), @@ -978,7 +984,7 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc) for (i = 0; i < ETH_ALEN; i++) cfg->mac_addr[i] = (mac_addr_tmp >> (8 * i)) & 0xff; - req = (struct hclge_cfg_param *)desc[1].data; + req = (struct hclge_cfg_param_cmd *)desc[1].data; cfg->numa_node_map = __le32_to_cpu(req->param[0]); } @@ -989,20 +995,21 @@ static void hclge_parse_cfg(struct hclge_cfg *cfg, struct hclge_desc *desc) static int hclge_get_cfg(struct hclge_dev *hdev, struct hclge_cfg *hcfg) { struct hclge_desc desc[HCLGE_PF_CFG_DESC_NUM]; - struct hclge_cfg_param *req; + struct hclge_cfg_param_cmd *req; int i, ret; for (i = 0; i < HCLGE_PF_CFG_DESC_NUM; i++) { - req = (struct hclge_cfg_param *)desc[i].data; + u32 offset = 0; + + req = (struct hclge_cfg_param_cmd *)desc[i].data; hclge_cmd_setup_basic_desc(&desc[i], HCLGE_OPC_GET_CFG_PARAM, true); - hnae_set_field(req->offset, HCLGE_CFG_OFFSET_M, + hnae_set_field(offset, HCLGE_CFG_OFFSET_M, HCLGE_CFG_OFFSET_S, i * HCLGE_CFG_RD_LEN_BYTES); /* Len should be united by 4 bytes when send to hardware */ - hnae_set_field(req->offset, HCLGE_CFG_RD_LEN_M, - HCLGE_CFG_RD_LEN_S, + hnae_set_field(offset, HCLGE_CFG_RD_LEN_M, HCLGE_CFG_RD_LEN_S, HCLGE_CFG_RD_LEN_BYTES / HCLGE_CFG_RD_LEN_UNIT); - req->offset = cpu_to_le32(req->offset); + req->offset = cpu_to_le32(offset); } ret = hclge_cmd_send(&hdev->hw, desc, HCLGE_PF_CFG_DESC_NUM); @@ -1099,16 +1106,23 @@ static int hclge_configure(struct hclge_dev *hdev) static int hclge_config_tso(struct hclge_dev *hdev, int tso_mss_min, int tso_mss_max) { - struct hclge_cfg_tso_status *req; + struct hclge_cfg_tso_status_cmd *req; struct hclge_desc desc; + u16 tso_mss; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TSO_GENERIC_CONFIG, false); - req = (struct hclge_cfg_tso_status *)desc.data; - hnae_set_field(req->tso_mss_min, HCLGE_TSO_MSS_MIN_M, + req = (struct hclge_cfg_tso_status_cmd *)desc.data; + + tso_mss = 0; + hnae_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M, HCLGE_TSO_MSS_MIN_S, tso_mss_min); - hnae_set_field(req->tso_mss_max, HCLGE_TSO_MSS_MIN_M, + req->tso_mss_min = cpu_to_le16(tso_mss); + + tso_mss = 0; + hnae_set_field(tso_mss, HCLGE_TSO_MSS_MIN_M, HCLGE_TSO_MSS_MIN_S, tso_mss_max); + req->tso_mss_max = cpu_to_le16(tso_mss); return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -1144,15 +1158,15 @@ static int hclge_alloc_tqps(struct hclge_dev *hdev) static int hclge_map_tqps_to_func(struct hclge_dev *hdev, u16 func_id, u16 tqp_pid, u16 tqp_vid, bool is_pf) { - struct hclge_tqp_map *req; + struct hclge_tqp_map_cmd *req; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_SET_TQP_MAP, false); - req = (struct hclge_tqp_map *)desc.data; + req = (struct hclge_tqp_map_cmd *)desc.data; req->tqp_id = cpu_to_le16(tqp_pid); - req->tqp_vf = cpu_to_le16(func_id); + req->tqp_vf = func_id; req->tqp_flag = !is_pf << HCLGE_TQP_MAP_TYPE_B | 1 << HCLGE_TQP_MAP_EN_B; req->tqp_vid = cpu_to_le16(tqp_vid); @@ -1340,12 +1354,12 @@ static int hclge_cmd_alloc_tx_buff(struct hclge_dev *hdev, /* TX buffer size is unit by 128 byte */ #define HCLGE_BUF_SIZE_UNIT_SHIFT 7 #define HCLGE_BUF_SIZE_UPDATE_EN_MSK BIT(15) - struct hclge_tx_buff_alloc *req; + struct hclge_tx_buff_alloc_cmd *req; struct hclge_desc desc; int ret; u8 i; - req = (struct hclge_tx_buff_alloc *)desc.data; + req = (struct hclge_tx_buff_alloc_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_TX_BUFF_ALLOC, 0); for (i = 0; i < HCLGE_TC_NUM; i++) { @@ -1536,8 +1550,8 @@ static int hclge_tx_buffer_calc(struct hclge_dev *hdev, * @buf_alloc: pointer to buffer calculation data * @return: 0: calculate sucessful, negative: fail */ -int hclge_rx_buffer_calc(struct hclge_dev *hdev, - struct hclge_pkt_buf_alloc *buf_alloc) +static int hclge_rx_buffer_calc(struct hclge_dev *hdev, + struct hclge_pkt_buf_alloc *buf_alloc) { u32 rx_all = hdev->pkt_buf_size; int no_pfc_priv_num, pfc_priv_num; @@ -1672,13 +1686,13 @@ int hclge_rx_buffer_calc(struct hclge_dev *hdev, static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev, struct hclge_pkt_buf_alloc *buf_alloc) { - struct hclge_rx_priv_buff *req; + struct hclge_rx_priv_buff_cmd *req; struct hclge_desc desc; int ret; int i; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RX_PRIV_BUFF_ALLOC, false); - req = (struct hclge_rx_priv_buff *)desc.data; + req = (struct hclge_rx_priv_buff_cmd *)desc.data; /* Alloc private buffer TCs */ for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { @@ -1687,7 +1701,7 @@ static int hclge_rx_priv_buf_alloc(struct hclge_dev *hdev, req->buf_num[i] = cpu_to_le16(priv->buf_size >> HCLGE_BUF_UNIT_S); req->buf_num[i] |= - cpu_to_le16(true << HCLGE_TC0_PRI_BUF_EN_B); + cpu_to_le16(1 << HCLGE_TC0_PRI_BUF_EN_B); } req->shared_buf = @@ -2000,11 +2014,11 @@ static void hclge_check_speed_dup(struct hclge_dev *hdev, int duplex, int speed) int hclge_cfg_mac_speed_dup(struct hclge_dev *hdev, int speed, u8 duplex) { - struct hclge_config_mac_speed_dup *req; + struct hclge_config_mac_speed_dup_cmd *req; struct hclge_desc desc; int ret; - req = (struct hclge_config_mac_speed_dup *)desc.data; + req = (struct hclge_config_mac_speed_dup_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_SPEED_DUP, false); @@ -2075,12 +2089,12 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed, static int hclge_query_mac_an_speed_dup(struct hclge_dev *hdev, int *speed, u8 *duplex) { - struct hclge_query_an_speed_dup *req; + struct hclge_query_an_speed_dup_cmd *req; struct hclge_desc desc; int speed_tmp; int ret; - req = (struct hclge_query_an_speed_dup *)desc.data; + req = (struct hclge_query_an_speed_dup_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_AN_RESULT, true); ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -2108,11 +2122,11 @@ static int hclge_query_mac_an_speed_dup(struct hclge_dev *hdev, int *speed, static int hclge_query_autoneg_result(struct hclge_dev *hdev) { struct hclge_mac *mac = &hdev->hw.mac; - struct hclge_query_an_speed_dup *req; + struct hclge_query_an_speed_dup_cmd *req; struct hclge_desc desc; int ret; - req = (struct hclge_query_an_speed_dup *)desc.data; + req = (struct hclge_query_an_speed_dup_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_AN_RESULT, true); ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -2129,14 +2143,16 @@ static int hclge_query_autoneg_result(struct hclge_dev *hdev) static int hclge_set_autoneg_en(struct hclge_dev *hdev, bool enable) { - struct hclge_config_auto_neg *req; + struct hclge_config_auto_neg_cmd *req; struct hclge_desc desc; + u32 flag = 0; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_AN_MODE, false); - req = (struct hclge_config_auto_neg *)desc.data; - hnae_set_bit(req->cfg_an_cmd_flag, HCLGE_MAC_CFG_AN_EN_B, !!enable); + req = (struct hclge_config_auto_neg_cmd *)desc.data; + hnae_set_bit(flag, HCLGE_MAC_CFG_AN_EN_B, !!enable); + req->cfg_an_cmd_flag = cpu_to_le32(flag); ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { @@ -2214,7 +2230,7 @@ static void hclge_task_schedule(struct hclge_dev *hdev) static int hclge_get_mac_link_status(struct hclge_dev *hdev) { - struct hclge_link_status *req; + struct hclge_link_status_cmd *req; struct hclge_desc desc; int link_status; int ret; @@ -2227,7 +2243,7 @@ static int hclge_get_mac_link_status(struct hclge_dev *hdev) return ret; } - req = (struct hclge_link_status *)desc.data; + req = (struct hclge_link_status_cmd *)desc.data; link_status = req->status & HCLGE_LINK_STATUS; return !!link_status; @@ -2451,7 +2467,7 @@ static u32 hclge_get_rss_indir_size(struct hnae3_handle *handle) static int hclge_get_rss_algo(struct hclge_dev *hdev) { - struct hclge_rss_config *req; + struct hclge_rss_config_cmd *req; struct hclge_desc desc; int rss_hash_algo; int ret; @@ -2465,7 +2481,7 @@ static int hclge_get_rss_algo(struct hclge_dev *hdev) return ret; } - req = (struct hclge_rss_config *)desc.data; + req = (struct hclge_rss_config_cmd *)desc.data; rss_hash_algo = (req->hash_config & HCLGE_RSS_HASH_ALGO_MASK); if (rss_hash_algo == HCLGE_RSS_HASH_ALGO_TOEPLITZ) @@ -2477,13 +2493,13 @@ static int hclge_get_rss_algo(struct hclge_dev *hdev) static int hclge_set_rss_algo_key(struct hclge_dev *hdev, const u8 hfunc, const u8 *key) { - struct hclge_rss_config *req; + struct hclge_rss_config_cmd *req; struct hclge_desc desc; int key_offset; int key_size; int ret; - req = (struct hclge_rss_config *)desc.data; + req = (struct hclge_rss_config_cmd *)desc.data; for (key_offset = 0; key_offset < 3; key_offset++) { hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_GENERIC_CONFIG, @@ -2514,19 +2530,20 @@ static int hclge_set_rss_algo_key(struct hclge_dev *hdev, static int hclge_set_rss_indir_table(struct hclge_dev *hdev, const u32 *indir) { - struct hclge_rss_indirection_table *req; + struct hclge_rss_indirection_table_cmd *req; struct hclge_desc desc; int i, j; int ret; - req = (struct hclge_rss_indirection_table *)desc.data; + req = (struct hclge_rss_indirection_table_cmd *)desc.data; for (i = 0; i < HCLGE_RSS_CFG_TBL_NUM; i++) { hclge_cmd_setup_basic_desc (&desc, HCLGE_OPC_RSS_INDIR_TABLE, false); - req->start_table_index = i * HCLGE_RSS_CFG_TBL_SIZE; - req->rss_set_bitmap = HCLGE_RSS_SET_BITMAP_MSK; + req->start_table_index = + cpu_to_le16(i * HCLGE_RSS_CFG_TBL_SIZE); + req->rss_set_bitmap = cpu_to_le16(HCLGE_RSS_SET_BITMAP_MSK); for (j = 0; j < HCLGE_RSS_CFG_TBL_SIZE; j++) req->rss_result[j] = @@ -2546,21 +2563,24 @@ static int hclge_set_rss_indir_table(struct hclge_dev *hdev, const u32 *indir) static int hclge_set_rss_tc_mode(struct hclge_dev *hdev, u16 *tc_valid, u16 *tc_size, u16 *tc_offset) { - struct hclge_rss_tc_mode *req; + struct hclge_rss_tc_mode_cmd *req; struct hclge_desc desc; int ret; int i; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_TC_MODE, false); - req = (struct hclge_rss_tc_mode *)desc.data; + req = (struct hclge_rss_tc_mode_cmd *)desc.data; for (i = 0; i < HCLGE_MAX_TC_NUM; i++) { - hnae_set_bit(req->rss_tc_mode[i], HCLGE_RSS_TC_VALID_B, - (tc_valid[i] & 0x1)); - hnae_set_field(req->rss_tc_mode[i], HCLGE_RSS_TC_SIZE_M, + u16 mode = 0; + + hnae_set_bit(mode, HCLGE_RSS_TC_VALID_B, (tc_valid[i] & 0x1)); + hnae_set_field(mode, HCLGE_RSS_TC_SIZE_M, HCLGE_RSS_TC_SIZE_S, tc_size[i]); - hnae_set_field(req->rss_tc_mode[i], HCLGE_RSS_TC_OFFSET_M, + hnae_set_field(mode, HCLGE_RSS_TC_OFFSET_M, HCLGE_RSS_TC_OFFSET_S, tc_offset[i]); + + req->rss_tc_mode[i] = cpu_to_le16(mode); } ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -2575,15 +2595,13 @@ static int hclge_set_rss_tc_mode(struct hclge_dev *hdev, u16 *tc_valid, static int hclge_set_rss_input_tuple(struct hclge_dev *hdev) { -#define HCLGE_RSS_INPUT_TUPLE_OTHER 0xf -#define HCLGE_RSS_INPUT_TUPLE_SCTP 0x1f - struct hclge_rss_input_tuple *req; + struct hclge_rss_input_tuple_cmd *req; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, false); - req = (struct hclge_rss_input_tuple *)desc.data; + req = (struct hclge_rss_input_tuple_cmd *)desc.data; req->ipv4_tcp_en = HCLGE_RSS_INPUT_TUPLE_OTHER; req->ipv4_udp_en = HCLGE_RSS_INPUT_TUPLE_OTHER; req->ipv4_sctp_en = HCLGE_RSS_INPUT_TUPLE_SCTP; @@ -2657,6 +2675,161 @@ static int hclge_set_rss(struct hnae3_handle *handle, const u32 *indir, return ret; } +static u8 hclge_get_rss_hash_bits(struct ethtool_rxnfc *nfc) +{ + u8 hash_sets = nfc->data & RXH_L4_B_0_1 ? HCLGE_S_PORT_BIT : 0; + + if (nfc->data & RXH_L4_B_2_3) + hash_sets |= HCLGE_D_PORT_BIT; + else + hash_sets &= ~HCLGE_D_PORT_BIT; + + if (nfc->data & RXH_IP_SRC) + hash_sets |= HCLGE_S_IP_BIT; + else + hash_sets &= ~HCLGE_S_IP_BIT; + + if (nfc->data & RXH_IP_DST) + hash_sets |= HCLGE_D_IP_BIT; + else + hash_sets &= ~HCLGE_D_IP_BIT; + + if (nfc->flow_type == SCTP_V4_FLOW || nfc->flow_type == SCTP_V6_FLOW) + hash_sets |= HCLGE_V_TAG_BIT; + + return hash_sets; +} + +static int hclge_set_rss_tuple(struct hnae3_handle *handle, + struct ethtool_rxnfc *nfc) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + struct hclge_rss_input_tuple_cmd *req; + struct hclge_desc desc; + u8 tuple_sets; + int ret; + + if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST | + RXH_L4_B_0_1 | RXH_L4_B_2_3)) + return -EINVAL; + + req = (struct hclge_rss_input_tuple_cmd *)desc.data; + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "Read rss tuple fail, status = %d\n", ret); + return ret; + } + + hclge_cmd_reuse_desc(&desc, false); + + tuple_sets = hclge_get_rss_hash_bits(nfc); + switch (nfc->flow_type) { + case TCP_V4_FLOW: + req->ipv4_tcp_en = tuple_sets; + break; + case TCP_V6_FLOW: + req->ipv6_tcp_en = tuple_sets; + break; + case UDP_V4_FLOW: + req->ipv4_udp_en = tuple_sets; + break; + case UDP_V6_FLOW: + req->ipv6_udp_en = tuple_sets; + break; + case SCTP_V4_FLOW: + req->ipv4_sctp_en = tuple_sets; + break; + case SCTP_V6_FLOW: + if ((nfc->data & RXH_L4_B_0_1) || + (nfc->data & RXH_L4_B_2_3)) + return -EINVAL; + + req->ipv6_sctp_en = tuple_sets; + break; + case IPV4_FLOW: + req->ipv4_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER; + break; + case IPV6_FLOW: + req->ipv6_fragment_en = HCLGE_RSS_INPUT_TUPLE_OTHER; + break; + default: + return -EINVAL; + } + + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) + dev_err(&hdev->pdev->dev, + "Set rss tuple fail, status = %d\n", ret); + + return ret; +} + +static int hclge_get_rss_tuple(struct hnae3_handle *handle, + struct ethtool_rxnfc *nfc) +{ + struct hclge_vport *vport = hclge_get_vport(handle); + struct hclge_dev *hdev = vport->back; + struct hclge_rss_input_tuple_cmd *req; + struct hclge_desc desc; + u8 tuple_sets; + int ret; + + nfc->data = 0; + + req = (struct hclge_rss_input_tuple_cmd *)desc.data; + hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RSS_INPUT_TUPLE, true); + ret = hclge_cmd_send(&hdev->hw, &desc, 1); + if (ret) { + dev_err(&hdev->pdev->dev, + "Read rss tuple fail, status = %d\n", ret); + return ret; + } + + switch (nfc->flow_type) { + case TCP_V4_FLOW: + tuple_sets = req->ipv4_tcp_en; + break; + case UDP_V4_FLOW: + tuple_sets = req->ipv4_udp_en; + break; + case TCP_V6_FLOW: + tuple_sets = req->ipv6_tcp_en; + break; + case UDP_V6_FLOW: + tuple_sets = req->ipv6_udp_en; + break; + case SCTP_V4_FLOW: + tuple_sets = req->ipv4_sctp_en; + break; + case SCTP_V6_FLOW: + tuple_sets = req->ipv6_sctp_en; + break; + case IPV4_FLOW: + case IPV6_FLOW: + tuple_sets = HCLGE_S_IP_BIT | HCLGE_D_IP_BIT; + break; + default: + return -EINVAL; + } + + if (!tuple_sets) + return 0; + + if (tuple_sets & HCLGE_D_PORT_BIT) + nfc->data |= RXH_L4_B_2_3; + if (tuple_sets & HCLGE_S_PORT_BIT) + nfc->data |= RXH_L4_B_0_1; + if (tuple_sets & HCLGE_D_IP_BIT) + nfc->data |= RXH_IP_DST; + if (tuple_sets & HCLGE_S_IP_BIT) + nfc->data |= RXH_IP_SRC; + + return 0; +} + static int hclge_get_tc_size(struct hnae3_handle *handle) { struct hclge_vport *vport = hclge_get_vport(handle); @@ -2750,7 +2923,7 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, struct hnae3_ring_chain_node *ring_chain) { struct hclge_dev *hdev = vport->back; - struct hclge_ctrl_vector_chain *req; + struct hclge_ctrl_vector_chain_cmd *req; struct hnae3_ring_chain_node *node; struct hclge_desc desc; int ret; @@ -2758,20 +2931,21 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_ADD_RING_TO_VECTOR, false); - req = (struct hclge_ctrl_vector_chain *)desc.data; + req = (struct hclge_ctrl_vector_chain_cmd *)desc.data; req->int_vector_id = vector_id; i = 0; for (node = ring_chain; node; node = node->next) { - hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_TYPE_M, - HCLGE_INT_TYPE_S, + u16 type_and_id = 0; + + hnae_set_field(type_and_id, HCLGE_INT_TYPE_M, HCLGE_INT_TYPE_S, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); - hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, - HCLGE_TQP_ID_S, node->tqp_index); - hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + hnae_set_field(type_and_id, HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, + node->tqp_index); + hnae_set_field(type_and_id, HCLGE_INT_GL_IDX_M, HCLGE_INT_GL_IDX_S, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); - req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->tqp_type_and_id[i] = cpu_to_le16(type_and_id); req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { @@ -2807,9 +2981,9 @@ int hclge_map_vport_ring_to_vector(struct hclge_vport *vport, int vector_id, return 0; } -int hclge_map_handle_ring_to_vector(struct hnae3_handle *handle, - int vector, - struct hnae3_ring_chain_node *ring_chain) +static int hclge_map_handle_ring_to_vector( + struct hnae3_handle *handle, int vector, + struct hnae3_ring_chain_node *ring_chain) { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; @@ -2831,7 +3005,7 @@ static int hclge_unmap_ring_from_vector( { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; - struct hclge_ctrl_vector_chain *req; + struct hclge_ctrl_vector_chain_cmd *req; struct hnae3_ring_chain_node *node; struct hclge_desc desc; int i, vector_id; @@ -2846,21 +3020,22 @@ static int hclge_unmap_ring_from_vector( hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_DEL_RING_TO_VECTOR, false); - req = (struct hclge_ctrl_vector_chain *)desc.data; + req = (struct hclge_ctrl_vector_chain_cmd *)desc.data; req->int_vector_id = vector_id; i = 0; for (node = ring_chain; node; node = node->next) { - hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_TYPE_M, - HCLGE_INT_TYPE_S, + u16 type_and_id = 0; + + hnae_set_field(type_and_id, HCLGE_INT_TYPE_M, HCLGE_INT_TYPE_S, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); - hnae_set_field(req->tqp_type_and_id[i], HCLGE_TQP_ID_M, - HCLGE_TQP_ID_S, node->tqp_index); - hnae_set_field(req->tqp_type_and_id[i], HCLGE_INT_GL_IDX_M, + hnae_set_field(type_and_id, HCLGE_TQP_ID_M, HCLGE_TQP_ID_S, + node->tqp_index); + hnae_set_field(type_and_id, HCLGE_INT_GL_IDX_M, HCLGE_INT_GL_IDX_S, hnae_get_bit(node->flag, HNAE3_RING_TYPE_B)); - req->tqp_type_and_id[i] = cpu_to_le16(req->tqp_type_and_id[i]); + req->tqp_type_and_id[i] = cpu_to_le16(type_and_id); req->vfid = vport->vport_id; if (++i >= HCLGE_VECTOR_ELEMENTS_PER_CMD) { @@ -2898,13 +3073,13 @@ static int hclge_unmap_ring_from_vector( int hclge_cmd_set_promisc_mode(struct hclge_dev *hdev, struct hclge_promisc_param *param) { - struct hclge_promisc_cfg *req; + struct hclge_promisc_cfg_cmd *req; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_PROMISC_MODE, false); - req = (struct hclge_promisc_cfg *)desc.data; + req = (struct hclge_promisc_cfg_cmd *)desc.data; req->vf_id = param->vf_id; req->flag = (param->enable << HCLGE_PROMISC_EN_B); @@ -2946,29 +3121,27 @@ static void hclge_set_promisc_mode(struct hnae3_handle *handle, u32 en) static void hclge_cfg_mac_mode(struct hclge_dev *hdev, bool enable) { struct hclge_desc desc; - struct hclge_config_mac_mode *req = - (struct hclge_config_mac_mode *)desc.data; + struct hclge_config_mac_mode_cmd *req = + (struct hclge_config_mac_mode_cmd *)desc.data; + u32 loop_en = 0; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAC_MODE, false); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_TX_EN_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_RX_EN_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_PAD_TX_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_PAD_RX_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_1588_TX_B, 0); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_1588_RX_B, 0); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_APP_LP_B, 0); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_LINE_LP_B, 0); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_FCS_TX_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, HCLGE_MAC_RX_FCS_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, - HCLGE_MAC_RX_FCS_STRIP_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, - HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, - HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, enable); - hnae_set_bit(req->txrx_pad_fcs_loop_en, - HCLGE_MAC_TX_UNDER_MIN_ERR_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_TX_EN_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_RX_EN_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_PAD_TX_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_PAD_RX_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_1588_TX_B, 0); + hnae_set_bit(loop_en, HCLGE_MAC_1588_RX_B, 0); + hnae_set_bit(loop_en, HCLGE_MAC_APP_LP_B, 0); + hnae_set_bit(loop_en, HCLGE_MAC_LINE_LP_B, 0); + hnae_set_bit(loop_en, HCLGE_MAC_FCS_TX_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_RX_FCS_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_RX_FCS_STRIP_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_TX_OVERSIZE_TRUNCATE_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_RX_OVERSIZE_TRUNCATE_B, enable); + hnae_set_bit(loop_en, HCLGE_MAC_TX_UNDER_MIN_ERR_B, enable); + req->txrx_pad_fcs_loop_en = cpu_to_le32(loop_en); ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) @@ -2980,8 +3153,8 @@ static int hclge_tqp_enable(struct hclge_dev *hdev, int tqp_id, int stream_id, bool enable) { struct hclge_desc desc; - struct hclge_cfg_com_tqp_queue *req = - (struct hclge_cfg_com_tqp_queue *)desc.data; + struct hclge_cfg_com_tqp_queue_cmd *req = + (struct hclge_cfg_com_tqp_queue_cmd *)desc.data; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CFG_COM_TQP_QUEUE, false); @@ -3145,16 +3318,16 @@ static int hclge_update_desc_vfid(struct hclge_desc *desc, int vfid, bool clr) word_num = vfid / 32; bit_num = vfid % 32; if (clr) - desc[1].data[word_num] &= ~(1 << bit_num); + desc[1].data[word_num] &= cpu_to_le32(~(1 << bit_num)); else - desc[1].data[word_num] |= (1 << bit_num); + desc[1].data[word_num] |= cpu_to_le32(1 << bit_num); } else { word_num = (vfid - 192) / 32; bit_num = vfid % 32; if (clr) - desc[2].data[word_num] &= ~(1 << bit_num); + desc[2].data[word_num] &= cpu_to_le32(~(1 << bit_num)); else - desc[2].data[word_num] |= (1 << bit_num); + desc[2].data[word_num] |= cpu_to_le32(1 << bit_num); } return 0; @@ -3174,7 +3347,7 @@ static bool hclge_is_all_function_id_zero(struct hclge_desc *desc) return true; } -static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry *new_req, +static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry_cmd *new_req, const u8 *addr) { const unsigned char *mac_addr = addr; @@ -3186,8 +3359,8 @@ static void hclge_prepare_mac_addr(struct hclge_mac_vlan_tbl_entry *new_req, new_req->mac_addr_lo16 = cpu_to_le16(low_val & 0xffff); } -u16 hclge_get_mac_addr_to_mta_index(struct hclge_vport *vport, - const u8 *addr) +static u16 hclge_get_mac_addr_to_mta_index(struct hclge_vport *vport, + const u8 *addr) { u16 high_val = addr[1] | (addr[0] << 8); struct hclge_dev *hdev = vport->back; @@ -3201,11 +3374,11 @@ static int hclge_set_mta_filter_mode(struct hclge_dev *hdev, enum hclge_mta_dmac_sel_type mta_mac_sel, bool enable) { - struct hclge_mta_filter_mode *req; + struct hclge_mta_filter_mode_cmd *req; struct hclge_desc desc; int ret; - req = (struct hclge_mta_filter_mode *)desc.data; + req = (struct hclge_mta_filter_mode_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_MAC_MODE_CFG, false); hnae_set_bit(req->dmac_sel_en, HCLGE_CFG_MTA_MAC_EN_B, @@ -3228,11 +3401,11 @@ int hclge_cfg_func_mta_filter(struct hclge_dev *hdev, u8 func_id, bool enable) { - struct hclge_cfg_func_mta_filter *req; + struct hclge_cfg_func_mta_filter_cmd *req; struct hclge_desc desc; int ret; - req = (struct hclge_cfg_func_mta_filter *)desc.data; + req = (struct hclge_cfg_func_mta_filter_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_MAC_FUNC_CFG, false); hnae_set_bit(req->accept, HCLGE_CFG_FUNC_MTA_ACCEPT_B, @@ -3255,17 +3428,18 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport, bool enable) { struct hclge_dev *hdev = vport->back; - struct hclge_cfg_func_mta_item *req; + struct hclge_cfg_func_mta_item_cmd *req; struct hclge_desc desc; + u16 item_idx = 0; int ret; - req = (struct hclge_cfg_func_mta_item *)desc.data; + req = (struct hclge_cfg_func_mta_item_cmd *)desc.data; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MTA_TBL_ITEM_CFG, false); hnae_set_bit(req->accept, HCLGE_CFG_MTA_ITEM_ACCEPT_B, enable); - hnae_set_field(req->item_idx, HCLGE_CFG_MTA_ITEM_IDX_M, + hnae_set_field(item_idx, HCLGE_CFG_MTA_ITEM_IDX_M, HCLGE_CFG_MTA_ITEM_IDX_S, idx); - req->item_idx = cpu_to_le16(req->item_idx); + req->item_idx = cpu_to_le16(item_idx); ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { @@ -3279,16 +3453,17 @@ static int hclge_set_mta_table_item(struct hclge_vport *vport, } static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport, - struct hclge_mac_vlan_tbl_entry *req) + struct hclge_mac_vlan_tbl_entry_cmd *req) { struct hclge_dev *hdev = vport->back; struct hclge_desc desc; u8 resp_code; + u16 retval; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MAC_VLAN_REMOVE, false); - memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry)); + memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry_cmd)); ret = hclge_cmd_send(&hdev->hw, &desc, 1); if (ret) { @@ -3297,19 +3472,21 @@ static int hclge_remove_mac_vlan_tbl(struct hclge_vport *vport, ret); return ret; } - resp_code = (desc.data[0] >> 8) & 0xff; + resp_code = (le32_to_cpu(desc.data[0]) >> 8) & 0xff; + retval = le16_to_cpu(desc.retval); - return hclge_get_mac_vlan_cmd_status(vport, desc.retval, resp_code, + return hclge_get_mac_vlan_cmd_status(vport, retval, resp_code, HCLGE_MAC_VLAN_REMOVE); } static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport, - struct hclge_mac_vlan_tbl_entry *req, + struct hclge_mac_vlan_tbl_entry_cmd *req, struct hclge_desc *desc, bool is_mc) { struct hclge_dev *hdev = vport->back; u8 resp_code; + u16 retval; int ret; hclge_cmd_setup_basic_desc(&desc[0], HCLGE_OPC_MAC_VLAN_ADD, true); @@ -3317,7 +3494,7 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport, desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT); memcpy(desc[0].data, req, - sizeof(struct hclge_mac_vlan_tbl_entry)); + sizeof(struct hclge_mac_vlan_tbl_entry_cmd)); hclge_cmd_setup_basic_desc(&desc[1], HCLGE_OPC_MAC_VLAN_ADD, true); @@ -3329,7 +3506,7 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport, } else { memcpy(desc[0].data, req, - sizeof(struct hclge_mac_vlan_tbl_entry)); + sizeof(struct hclge_mac_vlan_tbl_entry_cmd)); ret = hclge_cmd_send(&hdev->hw, desc, 1); } if (ret) { @@ -3338,19 +3515,21 @@ static int hclge_lookup_mac_vlan_tbl(struct hclge_vport *vport, ret); return ret; } - resp_code = (desc[0].data[0] >> 8) & 0xff; + resp_code = (le32_to_cpu(desc[0].data[0]) >> 8) & 0xff; + retval = le16_to_cpu(desc[0].retval); - return hclge_get_mac_vlan_cmd_status(vport, desc[0].retval, resp_code, + return hclge_get_mac_vlan_cmd_status(vport, retval, resp_code, HCLGE_MAC_VLAN_LKUP); } static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport, - struct hclge_mac_vlan_tbl_entry *req, + struct hclge_mac_vlan_tbl_entry_cmd *req, struct hclge_desc *mc_desc) { struct hclge_dev *hdev = vport->back; int cfg_status; u8 resp_code; + u16 retval; int ret; if (!mc_desc) { @@ -3359,10 +3538,13 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport, hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_MAC_VLAN_ADD, false); - memcpy(desc.data, req, sizeof(struct hclge_mac_vlan_tbl_entry)); + memcpy(desc.data, req, + sizeof(struct hclge_mac_vlan_tbl_entry_cmd)); ret = hclge_cmd_send(&hdev->hw, &desc, 1); - resp_code = (desc.data[0] >> 8) & 0xff; - cfg_status = hclge_get_mac_vlan_cmd_status(vport, desc.retval, + resp_code = (le32_to_cpu(desc.data[0]) >> 8) & 0xff; + retval = le16_to_cpu(desc.retval); + + cfg_status = hclge_get_mac_vlan_cmd_status(vport, retval, resp_code, HCLGE_MAC_VLAN_ADD); } else { @@ -3373,11 +3555,12 @@ static int hclge_add_mac_vlan_tbl(struct hclge_vport *vport, mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_WR); mc_desc[2].flag &= cpu_to_le16(~HCLGE_CMD_FLAG_NEXT); memcpy(mc_desc[0].data, req, - sizeof(struct hclge_mac_vlan_tbl_entry)); + sizeof(struct hclge_mac_vlan_tbl_entry_cmd)); ret = hclge_cmd_send(&hdev->hw, mc_desc, 3); - resp_code = (mc_desc[0].data[0] >> 8) & 0xff; - cfg_status = hclge_get_mac_vlan_cmd_status(vport, - mc_desc[0].retval, + resp_code = (le32_to_cpu(mc_desc[0].data[0]) >> 8) & 0xff; + retval = le16_to_cpu(mc_desc[0].retval); + + cfg_status = hclge_get_mac_vlan_cmd_status(vport, retval, resp_code, HCLGE_MAC_VLAN_ADD); } @@ -3404,8 +3587,9 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { struct hclge_dev *hdev = vport->back; - struct hclge_mac_vlan_tbl_entry req; + struct hclge_mac_vlan_tbl_entry_cmd req; enum hclge_cmd_status status; + u16 egress_port = 0; /* mac addr check */ if (is_zero_ether_addr(addr) || @@ -3425,15 +3609,15 @@ int hclge_add_uc_addr_common(struct hclge_vport *vport, hnae_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT0_EN_B, 0); hnae_set_bit(req.entry_type, HCLGE_MAC_VLAN_BIT1_EN_B, 0); hnae_set_bit(req.mc_mac_en, HCLGE_MAC_VLAN_BIT0_EN_B, 0); - hnae_set_bit(req.egress_port, - HCLGE_MAC_EPORT_SW_EN_B, 0); - hnae_set_bit(req.egress_port, - HCLGE_MAC_EPORT_TYPE_B, 0); - hnae_set_field(req.egress_port, HCLGE_MAC_EPORT_VFID_M, + + hnae_set_bit(egress_port, HCLGE_MAC_EPORT_SW_EN_B, 0); + hnae_set_bit(egress_port, HCLGE_MAC_EPORT_TYPE_B, 0); + hnae_set_field(egress_port, HCLGE_MAC_EPORT_VFID_M, HCLGE_MAC_EPORT_VFID_S, vport->vport_id); - hnae_set_field(req.egress_port, HCLGE_MAC_EPORT_PFID_M, + hnae_set_field(egress_port, HCLGE_MAC_EPORT_PFID_M, HCLGE_MAC_EPORT_PFID_S, 0); - req.egress_port = cpu_to_le16(req.egress_port); + + req.egress_port = cpu_to_le16(egress_port); hclge_prepare_mac_addr(&req, addr); @@ -3454,7 +3638,7 @@ int hclge_rm_uc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { struct hclge_dev *hdev = vport->back; - struct hclge_mac_vlan_tbl_entry req; + struct hclge_mac_vlan_tbl_entry_cmd req; enum hclge_cmd_status status; /* mac addr check */ @@ -3488,7 +3672,7 @@ int hclge_add_mc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { struct hclge_dev *hdev = vport->back; - struct hclge_mac_vlan_tbl_entry req; + struct hclge_mac_vlan_tbl_entry_cmd req; struct hclge_desc desc[3]; u16 tbl_idx; int status; @@ -3539,7 +3723,7 @@ int hclge_rm_mc_addr_common(struct hclge_vport *vport, const unsigned char *addr) { struct hclge_dev *hdev = vport->back; - struct hclge_mac_vlan_tbl_entry req; + struct hclge_mac_vlan_tbl_entry_cmd req; enum hclge_cmd_status status; struct hclge_desc desc[3]; u16 tbl_idx; @@ -3622,13 +3806,13 @@ static int hclge_set_mac_addr(struct hnae3_handle *handle, void *p) static int hclge_set_vlan_filter_ctrl(struct hclge_dev *hdev, u8 vlan_type, bool filter_en) { - struct hclge_vlan_filter_ctrl *req; + struct hclge_vlan_filter_ctrl_cmd *req; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_VLAN_FILTER_CTRL, false); - req = (struct hclge_vlan_filter_ctrl *)desc.data; + req = (struct hclge_vlan_filter_ctrl_cmd *)desc.data; req->vlan_type = vlan_type; req->vlan_fe = filter_en; @@ -3646,8 +3830,8 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid, bool is_kill, u16 vlan, u8 qos, __be16 proto) { #define HCLGE_MAX_VF_BYTES 16 - struct hclge_vlan_filter_vf_cfg *req0; - struct hclge_vlan_filter_vf_cfg *req1; + struct hclge_vlan_filter_vf_cfg_cmd *req0; + struct hclge_vlan_filter_vf_cfg_cmd *req1; struct hclge_desc desc[2]; u8 vf_byte_val; u8 vf_byte_off; @@ -3663,10 +3847,10 @@ int hclge_set_vf_vlan_common(struct hclge_dev *hdev, int vfid, vf_byte_off = vfid / 8; vf_byte_val = 1 << (vfid % 8); - req0 = (struct hclge_vlan_filter_vf_cfg *)desc[0].data; - req1 = (struct hclge_vlan_filter_vf_cfg *)desc[1].data; + req0 = (struct hclge_vlan_filter_vf_cfg_cmd *)desc[0].data; + req1 = (struct hclge_vlan_filter_vf_cfg_cmd *)desc[1].data; - req0->vlan_id = vlan; + req0->vlan_id = cpu_to_le16(vlan); req0->vlan_cfg = is_kill; if (vf_byte_off < HCLGE_MAX_VF_BYTES) @@ -3707,7 +3891,7 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle, { struct hclge_vport *vport = hclge_get_vport(handle); struct hclge_dev *hdev = vport->back; - struct hclge_vlan_filter_pf_cfg *req; + struct hclge_vlan_filter_pf_cfg_cmd *req; struct hclge_desc desc; u8 vlan_offset_byte_val; u8 vlan_offset_byte; @@ -3720,7 +3904,7 @@ static int hclge_set_port_vlan_filter(struct hnae3_handle *handle, vlan_offset_byte = (vlan_id % 160) / 8; vlan_offset_byte_val = 1 << (vlan_id % 8); - req = (struct hclge_vlan_filter_pf_cfg *)desc.data; + req = (struct hclge_vlan_filter_pf_cfg_cmd *)desc.data; req->vlan_offset = vlan_offset_160; req->vlan_cfg = is_kill; req->vlan_offset_bitmap[vlan_offset_byte] = vlan_offset_byte_val; @@ -3782,7 +3966,7 @@ static int hclge_init_vlan_config(struct hclge_dev *hdev) static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) { struct hclge_vport *vport = hclge_get_vport(handle); - struct hclge_config_max_frm_size *req; + struct hclge_config_max_frm_size_cmd *req; struct hclge_dev *hdev = vport->back; struct hclge_desc desc; int ret; @@ -3793,7 +3977,7 @@ static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) hdev->mps = new_mtu; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_CONFIG_MAX_FRM_SIZE, false); - req = (struct hclge_config_max_frm_size *)desc.data; + req = (struct hclge_config_max_frm_size_cmd *)desc.data; req->max_frm_size = cpu_to_le16(new_mtu); ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -3808,13 +3992,13 @@ static int hclge_set_mtu(struct hnae3_handle *handle, int new_mtu) static int hclge_send_reset_tqp_cmd(struct hclge_dev *hdev, u16 queue_id, bool enable) { - struct hclge_reset_tqp_queue *req; + struct hclge_reset_tqp_queue_cmd *req; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RESET_TQP_QUEUE, false); - req = (struct hclge_reset_tqp_queue *)desc.data; + req = (struct hclge_reset_tqp_queue_cmd *)desc.data; req->tqp_id = cpu_to_le16(queue_id & HCLGE_RING_ID_MASK); hnae_set_bit(req->reset_req, HCLGE_TQP_RESET_B, enable); @@ -3830,13 +4014,13 @@ static int hclge_send_reset_tqp_cmd(struct hclge_dev *hdev, u16 queue_id, static int hclge_get_reset_status(struct hclge_dev *hdev, u16 queue_id) { - struct hclge_reset_tqp_queue *req; + struct hclge_reset_tqp_queue_cmd *req; struct hclge_desc desc; int ret; hclge_cmd_setup_basic_desc(&desc, HCLGE_OPC_RESET_TQP_QUEUE, true); - req = (struct hclge_reset_tqp_queue *)desc.data; + req = (struct hclge_reset_tqp_queue_cmd *)desc.data; req->tqp_id = cpu_to_le16(queue_id & HCLGE_RING_ID_MASK); ret = hclge_cmd_send(&hdev->hw, &desc, 1); @@ -4313,6 +4497,8 @@ static const struct hnae3_ae_ops hclge_ops = { .get_rss_indir_size = hclge_get_rss_indir_size, .get_rss = hclge_get_rss, .set_rss = hclge_set_rss, + .set_rss_tuple = hclge_set_rss_tuple, + .get_rss_tuple = hclge_get_rss_tuple, .get_tc_size = hclge_get_tc_size, .get_mac_addr = hclge_get_mac_addr, .set_mac_addr = hclge_set_mac_addr, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index 7c66c00e8a3e..a7c018c7b0ec 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h @@ -32,7 +32,7 @@ #define HCLGE_VECTOR_VF_OFFSET 0x100000 #define HCLGE_RSS_IND_TBL_SIZE 512 -#define HCLGE_RSS_SET_BITMAP_MSK 0xffff +#define HCLGE_RSS_SET_BITMAP_MSK GENMASK(15, 0) #define HCLGE_RSS_KEY_SIZE 40 #define HCLGE_RSS_HASH_ALGO_TOEPLITZ 0 #define HCLGE_RSS_HASH_ALGO_SIMPLE 1 @@ -41,6 +41,14 @@ #define HCLGE_RSS_CFG_TBL_NUM \ (HCLGE_RSS_IND_TBL_SIZE / HCLGE_RSS_CFG_TBL_SIZE) +#define HCLGE_RSS_INPUT_TUPLE_OTHER GENMASK(3, 0) +#define HCLGE_RSS_INPUT_TUPLE_SCTP GENMASK(4, 0) +#define HCLGE_D_PORT_BIT BIT(0) +#define HCLGE_S_PORT_BIT BIT(1) +#define HCLGE_D_IP_BIT BIT(2) +#define HCLGE_S_IP_BIT BIT(3) +#define HCLGE_V_TAG_BIT BIT(4) + #define HCLGE_RSS_TC_SIZE_0 1 #define HCLGE_RSS_TC_SIZE_1 2 #define HCLGE_RSS_TC_SIZE_2 4 @@ -65,7 +73,7 @@ #define HCLGE_PHY_CSS_REG 17 #define HCLGE_PHY_MDIX_CTRL_S (5) -#define HCLGE_PHY_MDIX_CTRL_M (3 << HCLGE_PHY_MDIX_CTRL_S) +#define HCLGE_PHY_MDIX_CTRL_M GENMASK(6, 5) #define HCLGE_PHY_MDIX_STATUS_B (6) #define HCLGE_PHY_SPEED_DUP_RESOLVE_B (11) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c index 359ee670d1e1..1ae6eae82eb3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_tm.c @@ -283,6 +283,7 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev, struct hclge_pg_shapping_cmd *shap_cfg_cmd; enum hclge_opcode_type opcode; struct hclge_desc desc; + u32 shapping_para = 0; opcode = bucket ? HCLGE_OPC_TM_PG_P_SHAPPING : HCLGE_OPC_TM_PG_C_SHAPPING; @@ -292,11 +293,13 @@ static int hclge_tm_pg_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pg_id = pg_id; - hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_B, ir_b); - hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_U, ir_u); - hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, IR_S, ir_s); - hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_B, bs_b); - hclge_tm_set_field(shap_cfg_cmd->pg_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shapping_para, IR_B, ir_b); + hclge_tm_set_field(shapping_para, IR_U, ir_u); + hclge_tm_set_field(shapping_para, IR_S, ir_s); + hclge_tm_set_field(shapping_para, BS_B, bs_b); + hclge_tm_set_field(shapping_para, BS_S, bs_s); + + shap_cfg_cmd->pg_shapping_para = cpu_to_le32(shapping_para); return hclge_cmd_send(&hdev->hw, &desc, 1); } @@ -337,6 +340,7 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev, struct hclge_pri_shapping_cmd *shap_cfg_cmd; enum hclge_opcode_type opcode; struct hclge_desc desc; + u32 shapping_para = 0; opcode = bucket ? HCLGE_OPC_TM_PRI_P_SHAPPING : HCLGE_OPC_TM_PRI_C_SHAPPING; @@ -347,11 +351,13 @@ static int hclge_tm_pri_shapping_cfg(struct hclge_dev *hdev, shap_cfg_cmd->pri_id = pri_id; - hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_B, ir_b); - hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_U, ir_u); - hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, IR_S, ir_s); - hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_B, bs_b); - hclge_tm_set_field(shap_cfg_cmd->pri_shapping_para, BS_S, bs_s); + hclge_tm_set_field(shapping_para, IR_B, ir_b); + hclge_tm_set_field(shapping_para, IR_U, ir_u); + hclge_tm_set_field(shapping_para, IR_S, ir_s); + hclge_tm_set_field(shapping_para, BS_B, bs_b); + hclge_tm_set_field(shapping_para, BS_S, bs_s); + + shap_cfg_cmd->pri_shapping_para = cpu_to_le32(shapping_para); return hclge_cmd_send(&hdev->hw, &desc, 1); } diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c index 9832172bfb08..925619a7c50a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_dcbnl.c @@ -13,8 +13,7 @@ static int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(ndev); if (h->kinfo.dcb_ops->ieee_getets) return h->kinfo.dcb_ops->ieee_getets(h, ets); @@ -25,8 +24,7 @@ int hns3_dcbnl_ieee_getets(struct net_device *ndev, struct ieee_ets *ets) static int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(ndev); if (h->kinfo.dcb_ops->ieee_setets) return h->kinfo.dcb_ops->ieee_setets(h, ets); @@ -37,8 +35,7 @@ int hns3_dcbnl_ieee_setets(struct net_device *ndev, struct ieee_ets *ets) static int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(ndev); if (h->kinfo.dcb_ops->ieee_getpfc) return h->kinfo.dcb_ops->ieee_getpfc(h, pfc); @@ -49,8 +46,7 @@ int hns3_dcbnl_ieee_getpfc(struct net_device *ndev, struct ieee_pfc *pfc) static int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(ndev); if (h->kinfo.dcb_ops->ieee_setpfc) return h->kinfo.dcb_ops->ieee_setpfc(h, pfc); @@ -61,8 +57,7 @@ int hns3_dcbnl_ieee_setpfc(struct net_device *ndev, struct ieee_pfc *pfc) /* DCBX configuration */ static u8 hns3_dcbnl_getdcbx(struct net_device *ndev) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(ndev); if (h->kinfo.dcb_ops->getdcbx) return h->kinfo.dcb_ops->getdcbx(h); @@ -73,8 +68,7 @@ static u8 hns3_dcbnl_getdcbx(struct net_device *ndev) /* return 0 if successful, otherwise fail */ static u8 hns3_dcbnl_setdcbx(struct net_device *ndev, u8 mode) { - struct hns3_nic_priv *priv = netdev_priv(ndev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(ndev); if (h->kinfo.dcb_ops->setdcbx) return h->kinfo.dcb_ops->setdcbx(h, mode); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c index c31506514e5d..ba550c1b5b01 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.c @@ -24,7 +24,7 @@ #include "hnae3.h" #include "hns3_enet.h" -const char hns3_driver_name[] = "hns3"; +static const char hns3_driver_name[] = "hns3"; const char hns3_driver_version[] = VERMAGIC_STRING; static const char hns3_driver_string[] = "Hisilicon Ethernet Network Driver for Hip08 Family"; @@ -198,8 +198,7 @@ static void hns3_vector_gl_rl_init(struct hns3_enet_tqp_vector *tqp_vector) static int hns3_nic_set_real_num_queue(struct net_device *netdev) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); struct hnae3_knic_private_info *kinfo = &h->kinfo; unsigned int queue_size = kinfo->rss_size * kinfo->num_tc; int ret; @@ -305,24 +304,10 @@ static int hns3_nic_net_stop(struct net_device *netdev) return 0; } -void hns3_set_multicast_list(struct net_device *netdev) -{ - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; - struct netdev_hw_addr *ha = NULL; - - if (h->ae_algo->ops->set_mc_addr) { - netdev_for_each_mc_addr(ha, netdev) - if (h->ae_algo->ops->set_mc_addr(h, ha->addr)) - netdev_err(netdev, "set multicast fail\n"); - } -} - static int hns3_nic_uc_sync(struct net_device *netdev, const unsigned char *addr) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo->ops->add_uc_addr) return h->ae_algo->ops->add_uc_addr(h, addr); @@ -333,8 +318,7 @@ static int hns3_nic_uc_sync(struct net_device *netdev, static int hns3_nic_uc_unsync(struct net_device *netdev, const unsigned char *addr) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo->ops->rm_uc_addr) return h->ae_algo->ops->rm_uc_addr(h, addr); @@ -345,8 +329,7 @@ static int hns3_nic_uc_unsync(struct net_device *netdev, static int hns3_nic_mc_sync(struct net_device *netdev, const unsigned char *addr) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo->ops->add_mc_addr) return h->ae_algo->ops->add_mc_addr(h, addr); @@ -357,8 +340,7 @@ static int hns3_nic_mc_sync(struct net_device *netdev, static int hns3_nic_mc_unsync(struct net_device *netdev, const unsigned char *addr) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo->ops->rm_mc_addr) return h->ae_algo->ops->rm_mc_addr(h, addr); @@ -366,10 +348,9 @@ static int hns3_nic_mc_unsync(struct net_device *netdev, return 0; } -void hns3_nic_set_rx_mode(struct net_device *netdev) +static void hns3_nic_set_rx_mode(struct net_device *netdev) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo->ops->set_promisc_mode) { if (netdev->flags & IFF_PROMISC) @@ -768,7 +749,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, if (type == DESC_TYPE_SKB) { skb = (struct sk_buff *)priv; - paylen = cpu_to_le16(skb->len); + paylen = skb->len; if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_mac_len(skb); @@ -802,7 +783,7 @@ static int hns3_fill_desc(struct hns3_enet_ring *ring, void *priv, cpu_to_le32(ol_type_vlan_len_msec); desc->tx.type_cs_vlan_tso_len = cpu_to_le32(type_cs_vlan_tso); - desc->tx.paylen = cpu_to_le16(paylen); + desc->tx.paylen = cpu_to_le32(paylen); desc->tx.mss = cpu_to_le16(mss); } @@ -1025,8 +1006,7 @@ out_net_tx_busy: static int hns3_nic_net_set_mac_address(struct net_device *netdev, void *p) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); struct sockaddr *mac_addr = p; int ret; @@ -1208,8 +1188,7 @@ static void hns3_nic_udp_tunnel_del(struct net_device *netdev, static int hns3_setup_tc(struct net_device *netdev, u8 tc) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); struct hnae3_knic_private_info *kinfo = &h->kinfo; unsigned int i; int ret; @@ -1259,8 +1238,7 @@ static int hns3_nic_setup_tc(struct net_device *dev, enum tc_setup_type type, static int hns3_vlan_rx_add_vid(struct net_device *netdev, __be16 proto, u16 vid) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); int ret = -EIO; if (h->ae_algo->ops->set_vlan_filter) @@ -1272,8 +1250,7 @@ static int hns3_vlan_rx_add_vid(struct net_device *netdev, static int hns3_vlan_rx_kill_vid(struct net_device *netdev, __be16 proto, u16 vid) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); int ret = -EIO; if (h->ae_algo->ops->set_vlan_filter) @@ -1285,8 +1262,7 @@ static int hns3_vlan_rx_kill_vid(struct net_device *netdev, static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, u8 qos, __be16 vlan_proto) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); int ret = -EIO; if (h->ae_algo->ops->set_vf_vlan_filter) @@ -1298,8 +1274,7 @@ static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); bool if_running = netif_running(netdev); int ret; @@ -2609,7 +2584,7 @@ static void hns3_fini_ring(struct hns3_enet_ring *ring) ring->next_to_use = 0; } -int hns3_buf_size2type(u32 buf_size) +static int hns3_buf_size2type(u32 buf_size) { int bd_size_type; @@ -2662,7 +2637,7 @@ static void hns3_init_ring_hw(struct hns3_enet_ring *ring) } } -static int hns3_init_all_ring(struct hns3_nic_priv *priv) +int hns3_init_all_ring(struct hns3_nic_priv *priv) { struct hnae3_handle *h = priv->ae_handle; int ring_num = h->kinfo.num_tqps * 2; @@ -2686,12 +2661,12 @@ static int hns3_init_all_ring(struct hns3_nic_priv *priv) out_when_alloc_ring_memory: for (j = i - 1; j >= 0; j--) - hns3_fini_ring(priv->ring_data[i].ring); + hns3_fini_ring(priv->ring_data[j].ring); return -ENOMEM; } -static int hns3_uninit_all_ring(struct hns3_nic_priv *priv) +int hns3_uninit_all_ring(struct hns3_nic_priv *priv) { struct hnae3_handle *h = priv->ae_handle; int i; @@ -2921,7 +2896,7 @@ err_out: return ret; } -const struct hnae3_client_ops client_ops = { +static const struct hnae3_client_ops client_ops = { .init_instance = hns3_client_init, .uninit_instance = hns3_client_uninit, .link_status_change = hns3_link_status_change, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h index 481eada73e2d..66599890b4d4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_enet.h @@ -76,6 +76,8 @@ enum hns3_nic_state { #define HNS3_RING_NAME_LEN 16 #define HNS3_BUFFER_SIZE_2048 2048 #define HNS3_RING_MAX_PENDING 32768 +#define HNS3_RING_MIN_PENDING 8 +#define HNS3_RING_BD_MULTIPLE 8 #define HNS3_MAX_MTU 9728 #define HNS3_BD_SIZE_512_TYPE 0 @@ -587,9 +589,14 @@ static inline void hns3_write_reg(void __iomem *base, u32 reg, u32 value) #define hns3_for_each_ring(pos, head) \ for (pos = (head).ring; pos; pos = pos->next) +#define hns3_get_handle(ndev) \ + (((struct hns3_nic_priv *)netdev_priv(ndev))->ae_handle) + void hns3_ethtool_set_ops(struct net_device *netdev); int hns3_clean_tx_ring(struct hns3_enet_ring *ring, int budget); +int hns3_init_all_ring(struct hns3_nic_priv *priv); +int hns3_uninit_all_ring(struct hns3_nic_priv *priv); #ifdef CONFIG_HNS3_DCB void hns3_dcbnl_setup(struct hnae3_handle *handle); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_ethtool.c index d636399232fb..ddbd7f30c6a4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hns3_ethtool.c @@ -102,8 +102,7 @@ static void hns3_driv_to_eth_caps(u32 caps, struct ethtool_link_ksettings *cmd, static int hns3_get_sset_count(struct net_device *netdev, int stringset) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); const struct hnae3_ae_ops *ops = h->ae_algo->ops; if (!ops->get_sset_count) @@ -164,8 +163,7 @@ static u8 *hns3_get_strings_tqps(struct hnae3_handle *handle, u8 *data) static void hns3_get_strings(struct net_device *netdev, u32 stringset, u8 *data) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); const struct hnae3_ae_ops *ops = h->ae_algo->ops; char *buff = (char *)data; @@ -217,11 +215,10 @@ static u64 *hns3_get_stats_tqps(struct hnae3_handle *handle, u64 *data) * @stats: statistics info. * @data: statistics data. */ -void hns3_get_stats(struct net_device *netdev, struct ethtool_stats *stats, - u64 *data) +static void hns3_get_stats(struct net_device *netdev, + struct ethtool_stats *stats, u64 *data) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); u64 *p = data; if (!h->ae_algo->ops->get_stats || !h->ae_algo->ops->update_stats) { @@ -262,10 +259,7 @@ static void hns3_get_drvinfo(struct net_device *netdev, static u32 hns3_get_link(struct net_device *netdev) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h; - - h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo && h->ae_algo->ops && h->ae_algo->ops->get_status) return h->ae_algo->ops->get_status(h); @@ -277,7 +271,8 @@ static void hns3_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *param) { struct hns3_nic_priv *priv = netdev_priv(netdev); - int queue_num = priv->ae_handle->kinfo.num_tqps; + struct hnae3_handle *h = priv->ae_handle; + int queue_num = h->kinfo.num_tqps; param->tx_max_pending = HNS3_RING_MAX_PENDING; param->rx_max_pending = HNS3_RING_MAX_PENDING; @@ -289,8 +284,7 @@ static void hns3_get_ringparam(struct net_device *netdev, static void hns3_get_pauseparam(struct net_device *netdev, struct ethtool_pauseparam *param) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (h->ae_algo && h->ae_algo->ops && h->ae_algo->ops->get_pauseparam) h->ae_algo->ops->get_pauseparam(h, ¶m->autoneg, @@ -300,8 +294,7 @@ static void hns3_get_pauseparam(struct net_device *netdev, static int hns3_get_link_ksettings(struct net_device *netdev, struct ethtool_link_ksettings *cmd) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); u32 supported_caps; u32 advertised_caps; u8 media_type = HNAE3_MEDIA_TYPE_UNKNOWN; @@ -392,8 +385,7 @@ static int hns3_get_link_ksettings(struct net_device *netdev, static u32 hns3_get_rss_key_size(struct net_device *netdev) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_rss_key_size) @@ -404,8 +396,7 @@ static u32 hns3_get_rss_key_size(struct net_device *netdev) static u32 hns3_get_rss_indir_size(struct net_device *netdev) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_rss_indir_size) @@ -417,8 +408,7 @@ static u32 hns3_get_rss_indir_size(struct net_device *netdev) static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key, u8 *hfunc) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_rss) return -EOPNOTSUPP; @@ -429,8 +419,7 @@ static int hns3_get_rss(struct net_device *netdev, u32 *indir, u8 *key, static int hns3_set_rss(struct net_device *netdev, const u32 *indir, const u8 *key, const u8 hfunc) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->set_rss) return -EOPNOTSUPP; @@ -454,16 +443,17 @@ static int hns3_get_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd, u32 *rule_locs) { - struct hns3_nic_priv *priv = netdev_priv(netdev); - struct hnae3_handle *h = priv->ae_handle; + struct hnae3_handle *h = hns3_get_handle(netdev); if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->get_tc_size) return -EOPNOTSUPP; switch (cmd->cmd) { case ETHTOOL_GRXRINGS: - cmd->data = h->ae_algo->ops->get_tc_size(h); + cmd->data = h->kinfo.num_tc * h->kinfo.rss_size; break; + case ETHTOOL_GRXFH: + return h->ae_algo->ops->get_rss_tuple(h, cmd); default: return -EOPNOTSUPP; } @@ -471,15 +461,108 @@ static int hns3_get_rxnfc(struct net_device *netdev, return 0; } +static int hns3_change_all_ring_bd_num(struct hns3_nic_priv *priv, + u32 new_desc_num) +{ + struct hnae3_handle *h = priv->ae_handle; + int i; + + h->kinfo.num_desc = new_desc_num; + + for (i = 0; i < h->kinfo.num_tqps * 2; i++) + priv->ring_data[i].ring->desc_num = new_desc_num; + + return hns3_init_all_ring(priv); +} + +static int hns3_set_ringparam(struct net_device *ndev, + struct ethtool_ringparam *param) +{ + struct hns3_nic_priv *priv = netdev_priv(ndev); + struct hnae3_handle *h = priv->ae_handle; + bool if_running = netif_running(ndev); + u32 old_desc_num, new_desc_num; + int ret; + + if (param->rx_mini_pending || param->rx_jumbo_pending) + return -EINVAL; + + if (param->tx_pending != param->rx_pending) { + netdev_err(ndev, + "Descriptors of tx and rx must be equal"); + return -EINVAL; + } + + if (param->tx_pending > HNS3_RING_MAX_PENDING || + param->tx_pending < HNS3_RING_MIN_PENDING) { + netdev_err(ndev, + "Descriptors requested (Tx/Rx: %d) out of range [%d-%d]\n", + param->tx_pending, HNS3_RING_MIN_PENDING, + HNS3_RING_MAX_PENDING); + return -EINVAL; + } + + new_desc_num = param->tx_pending; + + /* Hardware requires that its descriptors must be multiple of eight */ + new_desc_num = ALIGN(new_desc_num, HNS3_RING_BD_MULTIPLE); + old_desc_num = h->kinfo.num_desc; + if (old_desc_num == new_desc_num) + return 0; + + netdev_info(ndev, + "Changing descriptor count from %d to %d.\n", + old_desc_num, new_desc_num); + + if (if_running) + dev_close(ndev); + + ret = hns3_uninit_all_ring(priv); + if (ret) + return ret; + + ret = hns3_change_all_ring_bd_num(priv, new_desc_num); + if (ret) { + ret = hns3_change_all_ring_bd_num(priv, old_desc_num); + if (ret) { + netdev_err(ndev, + "Revert to old bd num fail, ret=%d.\n", ret); + return ret; + } + } + + if (if_running) + ret = dev_open(ndev); + + return ret; +} + +static int hns3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd) +{ + struct hnae3_handle *h = hns3_get_handle(netdev); + + if (!h->ae_algo || !h->ae_algo->ops || !h->ae_algo->ops->set_rss_tuple) + return -EOPNOTSUPP; + + switch (cmd->cmd) { + case ETHTOOL_SRXFH: + return h->ae_algo->ops->set_rss_tuple(h, cmd); + default: + return -EOPNOTSUPP; + } +} + static const struct ethtool_ops hns3_ethtool_ops = { .get_drvinfo = hns3_get_drvinfo, .get_link = hns3_get_link, .get_ringparam = hns3_get_ringparam, + .set_ringparam = hns3_set_ringparam, .get_pauseparam = hns3_get_pauseparam, .get_strings = hns3_get_strings, .get_ethtool_stats = hns3_get_stats, .get_sset_count = hns3_get_sset_count, .get_rxnfc = hns3_get_rxnfc, + .set_rxnfc = hns3_set_rxnfc, .get_rxfh_key_size = hns3_get_rss_key_size, .get_rxfh_indir_size = hns3_get_rss_indir_size, .get_rxfh = hns3_get_rss, diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h index 0641c0098738..afb7ebe20b24 100644 --- a/drivers/net/ethernet/intel/e1000e/defines.h +++ b/drivers/net/ethernet/intel/e1000e/defines.h @@ -398,6 +398,7 @@ #define E1000_ICR_LSC 0x00000004 /* Link Status Change */ #define E1000_ICR_RXSEQ 0x00000008 /* Rx sequence error */ #define E1000_ICR_RXDMT0 0x00000010 /* Rx desc min. threshold (0) */ +#define E1000_ICR_RXO 0x00000040 /* Receiver Overrun */ #define E1000_ICR_RXT0 0x00000080 /* Rx timer intr (ring 0) */ #define E1000_ICR_ECCER 0x00400000 /* Uncorrectable ECC Error */ /* If this bit asserted, the driver should claim the interrupt */ diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h b/drivers/net/ethernet/intel/e1000e/e1000.h index 98e68888abb1..2311b31bdcac 100644 --- a/drivers/net/ethernet/intel/e1000e/e1000.h +++ b/drivers/net/ethernet/intel/e1000e/e1000.h @@ -94,10 +94,6 @@ struct e1000_info; */ #define E1000_CHECK_RESET_COUNT 25 -#define DEFAULT_RDTR 0 -#define DEFAULT_RADV 8 -#define BURST_RDTR 0x20 -#define BURST_RADV 0x20 #define PCICFG_DESC_RING_STATUS 0xe4 #define FLUSH_DESC_REQUIRED 0x100 diff --git a/drivers/net/ethernet/intel/e1000e/mac.c b/drivers/net/ethernet/intel/e1000e/mac.c index b322011ec282..f457c5703d0c 100644 --- a/drivers/net/ethernet/intel/e1000e/mac.c +++ b/drivers/net/ethernet/intel/e1000e/mac.c @@ -410,6 +410,9 @@ void e1000e_clear_hw_cntrs_base(struct e1000_hw *hw) * Checks to see of the link status of the hardware has changed. If a * change in link status has been detected, then we read the PHY registers * to get the current speed/duplex if link exists. + * + * Returns a negative error code (-E1000_ERR_*) or 0 (link down) or 1 (link + * up). **/ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) { @@ -423,7 +426,7 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * Change or Rx Sequence Error interrupt. */ if (!mac->get_link_status) - return 0; + return 1; /* First we want to see if the MII Status Register reports * link. If so, then we want to get the current speed/duplex @@ -461,10 +464,12 @@ s32 e1000e_check_for_copper_link(struct e1000_hw *hw) * different link partner. */ ret_val = e1000e_config_fc_after_link_up(hw); - if (ret_val) + if (ret_val) { e_dbg("Error configuring flow control\n"); + return ret_val; + } - return ret_val; + return 1; } /** diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c index 8436c5f2c3e8..bf8f38f76953 100644 --- a/drivers/net/ethernet/intel/e1000e/netdev.c +++ b/drivers/net/ethernet/intel/e1000e/netdev.c @@ -1071,7 +1071,8 @@ next_desc: } static void e1000_put_txbuf(struct e1000_ring *tx_ring, - struct e1000_buffer *buffer_info) + struct e1000_buffer *buffer_info, + bool drop) { struct e1000_adapter *adapter = tx_ring->adapter; @@ -1085,7 +1086,10 @@ static void e1000_put_txbuf(struct e1000_ring *tx_ring, buffer_info->dma = 0; } if (buffer_info->skb) { - dev_kfree_skb_any(buffer_info->skb); + if (drop) + dev_kfree_skb_any(buffer_info->skb); + else + dev_consume_skb_any(buffer_info->skb); buffer_info->skb = NULL; } buffer_info->time_stamp = 0; @@ -1199,7 +1203,7 @@ static void e1000e_tx_hwtstamp_work(struct work_struct *work) wmb(); /* force write prior to skb_tstamp_tx */ skb_tstamp_tx(skb, &shhwtstamps); - dev_kfree_skb_any(skb); + dev_consume_skb_any(skb); } else if (time_after(jiffies, adapter->tx_hwtstamp_start + adapter->tx_timeout_factor * HZ)) { dev_kfree_skb_any(adapter->tx_hwtstamp_skb); @@ -1254,7 +1258,7 @@ static bool e1000_clean_tx_irq(struct e1000_ring *tx_ring) } } - e1000_put_txbuf(tx_ring, buffer_info); + e1000_put_txbuf(tx_ring, buffer_info, false); tx_desc->upper.data = 0; i++; @@ -1910,14 +1914,30 @@ static irqreturn_t e1000_msix_other(int __always_unused irq, void *data) struct net_device *netdev = data; struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_hw *hw = &adapter->hw; + u32 icr; + bool enable = true; + + icr = er32(ICR); + if (icr & E1000_ICR_RXO) { + ew32(ICR, E1000_ICR_RXO); + enable = false; + /* napi poll will re-enable Other, make sure it runs */ + if (napi_schedule_prep(&adapter->napi)) { + adapter->total_rx_bytes = 0; + adapter->total_rx_packets = 0; + __napi_schedule(&adapter->napi); + } + } + if (icr & E1000_ICR_LSC) { + ew32(ICR, E1000_ICR_LSC); + hw->mac.get_link_status = true; + /* guard against interrupt when we're going down */ + if (!test_bit(__E1000_DOWN, &adapter->state)) + mod_timer(&adapter->watchdog_timer, jiffies + 1); + } - hw->mac.get_link_status = true; - - /* guard against interrupt when we're going down */ - if (!test_bit(__E1000_DOWN, &adapter->state)) { - mod_timer(&adapter->watchdog_timer, jiffies + 1); + if (enable && !test_bit(__E1000_DOWN, &adapter->state)) ew32(IMS, E1000_IMS_OTHER); - } return IRQ_HANDLED; } @@ -2421,7 +2441,7 @@ static void e1000_clean_tx_ring(struct e1000_ring *tx_ring) for (i = 0; i < tx_ring->count; i++) { buffer_info = &tx_ring->buffer_info[i]; - e1000_put_txbuf(tx_ring, buffer_info); + e1000_put_txbuf(tx_ring, buffer_info, false); } netdev_reset_queue(adapter->netdev); @@ -2687,7 +2707,8 @@ static int e1000e_poll(struct napi_struct *napi, int weight) napi_complete_done(napi, work_done); if (!test_bit(__E1000_DOWN, &adapter->state)) { if (adapter->msix_entries) - ew32(IMS, adapter->rx_ring->ims_val); + ew32(IMS, adapter->rx_ring->ims_val | + E1000_IMS_OTHER); else e1000_irq_enable(adapter); } @@ -3004,8 +3025,8 @@ static void e1000_configure_tx(struct e1000_adapter *adapter) hw->mac.ops.config_collision_dist(hw); - /* SPT and CNP Si errata workaround to avoid data corruption */ - if (hw->mac.type >= e1000_pch_spt) { + /* SPT and KBL Si errata workaround to avoid data corruption */ + if (hw->mac.type == e1000_pch_spt) { u32 reg_val; reg_val = er32(IOSFPC); @@ -3013,7 +3034,9 @@ static void e1000_configure_tx(struct e1000_adapter *adapter) ew32(IOSFPC, reg_val); reg_val = er32(TARC(0)); - reg_val |= E1000_TARC0_CB_MULTIQ_3_REQ; + /* SPT and KBL Si errata workaround to avoid Tx hang */ + reg_val &= ~BIT(28); + reg_val |= BIT(29); ew32(TARC(0), reg_val); } } @@ -3223,14 +3246,6 @@ static void e1000_configure_rx(struct e1000_adapter *adapter) */ ew32(RXDCTL(0), E1000_RXDCTL_DMA_BURST_ENABLE); ew32(RXDCTL(1), E1000_RXDCTL_DMA_BURST_ENABLE); - - /* override the delay timers for enabling bursting, only if - * the value was not set by the user via module options - */ - if (adapter->rx_int_delay == DEFAULT_RDTR) - adapter->rx_int_delay = BURST_RDTR; - if (adapter->rx_abs_int_delay == DEFAULT_RADV) - adapter->rx_abs_int_delay = BURST_RADV; } /* set the Receive Delay Timer Register */ @@ -4204,7 +4219,7 @@ static void e1000e_trigger_lsc(struct e1000_adapter *adapter) struct e1000_hw *hw = &adapter->hw; if (adapter->msix_entries) - ew32(ICS, E1000_ICS_OTHER); + ew32(ICS, E1000_ICS_LSC | E1000_ICS_OTHER); else ew32(ICS, E1000_ICS_LSC); } @@ -5074,14 +5089,14 @@ static bool e1000e_has_link(struct e1000_adapter *adapter) /* get_link_status is set on LSC (link status) interrupt or * Rx sequence error interrupt. get_link_status will stay - * false until the check_for_link establishes link + * true until the check_for_link establishes link * for copper adapters ONLY */ switch (hw->phy.media_type) { case e1000_media_type_copper: if (hw->mac.get_link_status) { ret_val = hw->mac.ops.check_for_link(hw); - link_active = !hw->mac.get_link_status; + link_active = ret_val > 0; } else { link_active = true; } @@ -5092,14 +5107,14 @@ static bool e1000e_has_link(struct e1000_adapter *adapter) break; case e1000_media_type_internal_serdes: ret_val = hw->mac.ops.check_for_link(hw); - link_active = adapter->hw.mac.serdes_has_link; + link_active = hw->mac.serdes_has_link; break; default: case e1000_media_type_unknown: break; } - if ((ret_val == E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) && + if ((ret_val == -E1000_ERR_PHY) && (hw->phy.type == e1000_phy_igp_3) && (er32(CTRL) & E1000_PHY_CTRL_GBE_DISABLE)) { /* See e1000_kmrn_lock_loss_workaround_ich8lan() */ e_info("Gigabit has been disabled, downgrading speed\n"); @@ -5614,7 +5629,7 @@ dma_error: i += tx_ring->count; i--; buffer_info = &tx_ring->buffer_info[i]; - e1000_put_txbuf(tx_ring, buffer_info); + e1000_put_txbuf(tx_ring, buffer_info, true); } return 0; @@ -7408,7 +7423,7 @@ static void e1000_remove(struct pci_dev *pdev) if (adapter->flags & FLAG_HAS_HW_TIMESTAMP) { cancel_work_sync(&adapter->tx_hwtstamp_work); if (adapter->tx_hwtstamp_skb) { - dev_kfree_skb_any(adapter->tx_hwtstamp_skb); + dev_consume_skb_any(adapter->tx_hwtstamp_skb); adapter->tx_hwtstamp_skb = NULL; } } diff --git a/drivers/net/ethernet/intel/e1000e/param.c b/drivers/net/ethernet/intel/e1000e/param.c index 6d8c39abee16..47da51864543 100644 --- a/drivers/net/ethernet/intel/e1000e/param.c +++ b/drivers/net/ethernet/intel/e1000e/param.c @@ -73,17 +73,25 @@ E1000_PARAM(TxAbsIntDelay, "Transmit Absolute Interrupt Delay"); /* Receive Interrupt Delay in units of 1.024 microseconds * hardware will likely hang if you set this to anything but zero. * + * Burst variant is used as default if device has FLAG2_DMA_BURST. + * * Valid Range: 0-65535 */ E1000_PARAM(RxIntDelay, "Receive Interrupt Delay"); +#define DEFAULT_RDTR 0 +#define BURST_RDTR 0x20 #define MAX_RXDELAY 0xFFFF #define MIN_RXDELAY 0 /* Receive Absolute Interrupt Delay in units of 1.024 microseconds * + * Burst variant is used as default if device has FLAG2_DMA_BURST. + * * Valid Range: 0-65535 */ E1000_PARAM(RxAbsIntDelay, "Receive Absolute Interrupt Delay"); +#define DEFAULT_RADV 8 +#define BURST_RADV 0x20 #define MAX_RXABSDELAY 0xFFFF #define MIN_RXABSDELAY 0 @@ -297,6 +305,9 @@ void e1000e_check_options(struct e1000_adapter *adapter) .max = MAX_RXDELAY } } }; + if (adapter->flags2 & FLAG2_DMA_BURST) + opt.def = BURST_RDTR; + if (num_RxIntDelay > bd) { adapter->rx_int_delay = RxIntDelay[bd]; e1000_validate_option(&adapter->rx_int_delay, &opt, @@ -307,7 +318,7 @@ void e1000e_check_options(struct e1000_adapter *adapter) } /* Receive Absolute Interrupt Delay */ { - static const struct e1000_option opt = { + static struct e1000_option opt = { .type = range_option, .name = "Receive Absolute Interrupt Delay", .err = "using default of " @@ -317,6 +328,9 @@ void e1000e_check_options(struct e1000_adapter *adapter) .max = MAX_RXABSDELAY } } }; + if (adapter->flags2 & FLAG2_DMA_BURST) + opt.def = BURST_RADV; + if (num_RxAbsIntDelay > bd) { adapter->rx_abs_int_delay = RxAbsIntDelay[bd]; e1000_validate_option(&adapter->rx_abs_int_delay, &opt, diff --git a/drivers/net/ethernet/intel/e1000e/phy.c b/drivers/net/ethernet/intel/e1000e/phy.c index d78d47b41a71..86ff0969efb6 100644 --- a/drivers/net/ethernet/intel/e1000e/phy.c +++ b/drivers/net/ethernet/intel/e1000e/phy.c @@ -1744,6 +1744,7 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations, s32 ret_val = 0; u16 i, phy_status; + *success = false; for (i = 0; i < iterations; i++) { /* Some PHYs require the MII_BMSR register to be read * twice due to the link bit being sticky. No harm doing @@ -1763,16 +1764,16 @@ s32 e1000e_phy_has_link_generic(struct e1000_hw *hw, u32 iterations, ret_val = e1e_rphy(hw, MII_BMSR, &phy_status); if (ret_val) break; - if (phy_status & BMSR_LSTATUS) + if (phy_status & BMSR_LSTATUS) { + *success = true; break; + } if (usec_interval >= 1000) msleep(usec_interval / 1000); else udelay(usec_interval); } - *success = (i < iterations); - return ret_val; } diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 439c63cb2a0c..8139b4ee1dc3 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -350,7 +350,7 @@ struct i40e_pf { u16 num_vmdq_vsis; /* num vmdq vsis this PF has set up */ u16 num_vmdq_qps; /* num queue pairs per vmdq pool */ u16 num_vmdq_msix; /* num queue vectors per vmdq pool */ - u16 num_req_vfs; /* num VFs requested for this VF */ + u16 num_req_vfs; /* num VFs requested for this PF */ u16 num_vf_qps; /* num queue pairs per VF */ u16 num_lan_qps; /* num lan queues this PF has set up */ u16 num_lan_msix; /* num queue vectors for the base PF vsi */ @@ -403,55 +403,57 @@ struct i40e_pf { struct timer_list service_timer; struct work_struct service_task; - u64 hw_features; -#define I40E_HW_RSS_AQ_CAPABLE BIT_ULL(0) -#define I40E_HW_128_QP_RSS_CAPABLE BIT_ULL(1) -#define I40E_HW_ATR_EVICT_CAPABLE BIT_ULL(2) -#define I40E_HW_WB_ON_ITR_CAPABLE BIT_ULL(3) -#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE BIT_ULL(4) -#define I40E_HW_NO_PCI_LINK_CHECK BIT_ULL(5) -#define I40E_HW_100M_SGMII_CAPABLE BIT_ULL(6) -#define I40E_HW_NO_DCB_SUPPORT BIT_ULL(7) -#define I40E_HW_USE_SET_LLDP_MIB BIT_ULL(8) -#define I40E_HW_GENEVE_OFFLOAD_CAPABLE BIT_ULL(9) -#define I40E_HW_PTP_L4_CAPABLE BIT_ULL(10) -#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE BIT_ULL(11) -#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE BIT_ULL(12) -#define I40E_HW_HAVE_CRT_RETIMER BIT_ULL(13) -#define I40E_HW_OUTER_UDP_CSUM_CAPABLE BIT_ULL(14) -#define I40E_HW_PHY_CONTROLS_LEDS BIT_ULL(15) -#define I40E_HW_STOP_FW_LLDP BIT_ULL(16) -#define I40E_HW_PORT_ID_VALID BIT_ULL(17) -#define I40E_HW_RESTART_AUTONEG BIT_ULL(18) - - u64 flags; -#define I40E_FLAG_RX_CSUM_ENABLED BIT_ULL(1) -#define I40E_FLAG_MSI_ENABLED BIT_ULL(2) -#define I40E_FLAG_MSIX_ENABLED BIT_ULL(3) -#define I40E_FLAG_HW_ATR_EVICT_ENABLED BIT_ULL(4) -#define I40E_FLAG_RSS_ENABLED BIT_ULL(6) -#define I40E_FLAG_VMDQ_ENABLED BIT_ULL(7) -#define I40E_FLAG_IWARP_ENABLED BIT_ULL(10) -#define I40E_FLAG_FILTER_SYNC BIT_ULL(15) -#define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT_ULL(16) -#define I40E_FLAG_SRIOV_ENABLED BIT_ULL(19) -#define I40E_FLAG_DCB_ENABLED BIT_ULL(20) -#define I40E_FLAG_FD_SB_ENABLED BIT_ULL(21) -#define I40E_FLAG_FD_ATR_ENABLED BIT_ULL(22) -#define I40E_FLAG_FD_SB_AUTO_DISABLED BIT_ULL(23) -#define I40E_FLAG_FD_ATR_AUTO_DISABLED BIT_ULL(24) -#define I40E_FLAG_PTP BIT_ULL(25) -#define I40E_FLAG_MFP_ENABLED BIT_ULL(26) -#define I40E_FLAG_UDP_FILTER_SYNC BIT_ULL(27) -#define I40E_FLAG_DCB_CAPABLE BIT_ULL(29) -#define I40E_FLAG_VEB_STATS_ENABLED BIT_ULL(37) -#define I40E_FLAG_LINK_POLLING_ENABLED BIT_ULL(39) -#define I40E_FLAG_VEB_MODE_ENABLED BIT_ULL(40) -#define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT_ULL(51) -#define I40E_FLAG_CLIENT_RESET BIT_ULL(54) -#define I40E_FLAG_TEMP_LINK_POLLING BIT_ULL(55) -#define I40E_FLAG_CLIENT_L2_CHANGE BIT_ULL(56) -#define I40E_FLAG_LEGACY_RX BIT_ULL(58) + u32 hw_features; +#define I40E_HW_RSS_AQ_CAPABLE BIT(0) +#define I40E_HW_128_QP_RSS_CAPABLE BIT(1) +#define I40E_HW_ATR_EVICT_CAPABLE BIT(2) +#define I40E_HW_WB_ON_ITR_CAPABLE BIT(3) +#define I40E_HW_MULTIPLE_TCP_UDP_RSS_PCTYPE BIT(4) +#define I40E_HW_NO_PCI_LINK_CHECK BIT(5) +#define I40E_HW_100M_SGMII_CAPABLE BIT(6) +#define I40E_HW_NO_DCB_SUPPORT BIT(7) +#define I40E_HW_USE_SET_LLDP_MIB BIT(8) +#define I40E_HW_GENEVE_OFFLOAD_CAPABLE BIT(9) +#define I40E_HW_PTP_L4_CAPABLE BIT(10) +#define I40E_HW_WOL_MC_MAGIC_PKT_WAKE BIT(11) +#define I40E_HW_MPLS_HDR_OFFLOAD_CAPABLE BIT(12) +#define I40E_HW_HAVE_CRT_RETIMER BIT(13) +#define I40E_HW_OUTER_UDP_CSUM_CAPABLE BIT(14) +#define I40E_HW_PHY_CONTROLS_LEDS BIT(15) +#define I40E_HW_STOP_FW_LLDP BIT(16) +#define I40E_HW_PORT_ID_VALID BIT(17) +#define I40E_HW_RESTART_AUTONEG BIT(18) + + u32 flags; +#define I40E_FLAG_RX_CSUM_ENABLED BIT(0) +#define I40E_FLAG_MSI_ENABLED BIT(1) +#define I40E_FLAG_MSIX_ENABLED BIT(2) +#define I40E_FLAG_RSS_ENABLED BIT(3) +#define I40E_FLAG_VMDQ_ENABLED BIT(4) +#define I40E_FLAG_FILTER_SYNC BIT(5) +#define I40E_FLAG_SRIOV_ENABLED BIT(6) +#define I40E_FLAG_DCB_CAPABLE BIT(7) +#define I40E_FLAG_DCB_ENABLED BIT(8) +#define I40E_FLAG_FD_SB_ENABLED BIT(9) +#define I40E_FLAG_FD_ATR_ENABLED BIT(10) +#define I40E_FLAG_FD_SB_AUTO_DISABLED BIT(11) +#define I40E_FLAG_FD_ATR_AUTO_DISABLED BIT(12) +#define I40E_FLAG_MFP_ENABLED BIT(13) +#define I40E_FLAG_UDP_FILTER_SYNC BIT(14) +#define I40E_FLAG_HW_ATR_EVICT_ENABLED BIT(15) +#define I40E_FLAG_VEB_MODE_ENABLED BIT(16) +#define I40E_FLAG_VEB_STATS_ENABLED BIT(17) +#define I40E_FLAG_LINK_POLLING_ENABLED BIT(18) +#define I40E_FLAG_TRUE_PROMISC_SUPPORT BIT(19) +#define I40E_FLAG_TEMP_LINK_POLLING BIT(20) +#define I40E_FLAG_LEGACY_RX BIT(21) +#define I40E_FLAG_PTP BIT(22) +#define I40E_FLAG_IWARP_ENABLED BIT(23) +#define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT(24) +#define I40E_FLAG_CLIENT_L2_CHANGE BIT(25) +#define I40E_FLAG_CLIENT_RESET BIT(26) +#define I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED BIT(27) +#define I40E_FLAG_SOURCE_PRUNING_DISABLED BIT(28) struct i40e_client_instance *cinst; bool stat_offsets_loaded; @@ -947,9 +949,6 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector) struct i40e_hw *hw = &pf->hw; u32 val; - /* definitely clear the PBA here, as this function is meant to - * clean out all previous interrupts AND enable the interrupt - */ val = I40E_PFINT_DYN_CTLN_INTENA_MASK | I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (I40E_ITR_NONE << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT); @@ -958,7 +957,7 @@ static inline void i40e_irq_dynamic_enable(struct i40e_vsi *vsi, int vector) } void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf); -void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba); +void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf); int i40e_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); int i40e_open(struct net_device *netdev); int i40e_close(struct net_device *netdev); diff --git a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h index 4c85ea9cd89a..a8f65aed5421 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40e/i40e_adminq_cmd.h @@ -1771,9 +1771,10 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_25GBASE_CR = 0x20, I40E_PHY_TYPE_25GBASE_SR = 0x21, I40E_PHY_TYPE_25GBASE_LR = 0x22, + I40E_PHY_TYPE_MAX, + I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP = 0xFD, I40E_PHY_TYPE_EMPTY = 0xFE, I40E_PHY_TYPE_DEFAULT = 0xFF, - I40E_PHY_TYPE_MAX }; #define I40E_LINK_SPEED_100MB_SHIFT 0x1 diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index 60542beda7ad..53aad378d49c 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -1567,30 +1567,46 @@ i40e_status i40e_aq_get_phy_capabilities(struct i40e_hw *hw, struct i40e_aq_desc desc; i40e_status status; u16 abilities_size = sizeof(struct i40e_aq_get_phy_abilities_resp); + u16 max_delay = I40E_MAX_PHY_TIMEOUT, total_delay = 0; if (!abilities) return I40E_ERR_PARAM; - i40e_fill_default_direct_cmd_desc(&desc, - i40e_aqc_opc_get_phy_abilities); + do { + i40e_fill_default_direct_cmd_desc(&desc, + i40e_aqc_opc_get_phy_abilities); - desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); - if (abilities_size > I40E_AQ_LARGE_BUF) - desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_BUF); + if (abilities_size > I40E_AQ_LARGE_BUF) + desc.flags |= cpu_to_le16((u16)I40E_AQ_FLAG_LB); - if (qualified_modules) - desc.params.external.param0 |= + if (qualified_modules) + desc.params.external.param0 |= cpu_to_le32(I40E_AQ_PHY_REPORT_QUALIFIED_MODULES); - if (report_init) - desc.params.external.param0 |= + if (report_init) + desc.params.external.param0 |= cpu_to_le32(I40E_AQ_PHY_REPORT_INITIAL_VALUES); - status = i40e_asq_send_command(hw, &desc, abilities, abilities_size, - cmd_details); + status = i40e_asq_send_command(hw, &desc, abilities, + abilities_size, cmd_details); - if (hw->aq.asq_last_status == I40E_AQ_RC_EIO) - status = I40E_ERR_UNKNOWN_PHY; + if (status) + break; + + if (hw->aq.asq_last_status == I40E_AQ_RC_EIO) { + status = I40E_ERR_UNKNOWN_PHY; + break; + } else if (hw->aq.asq_last_status == I40E_AQ_RC_EAGAIN) { + usleep_range(1000, 2000); + total_delay++; + status = I40E_ERR_TIMEOUT; + } + } while ((hw->aq.asq_last_status != I40E_AQ_RC_OK) && + (total_delay < max_delay)); + + if (status) + return status; if (report_init) { if (hw->mac.type == I40E_MAC_XL710 && diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index 8f326f87a815..6f2725fc50a1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -278,8 +278,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) rx_ring->netdev, rx_ring->rx_bi); dev_info(&pf->pdev->dev, - " rx_rings[%i]: state = %li, queue_index = %d, reg_idx = %d\n", - i, rx_ring->state, + " rx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n", + i, *rx_ring->state, rx_ring->queue_index, rx_ring->reg_idx); dev_info(&pf->pdev->dev, @@ -334,8 +334,8 @@ static void i40e_dbg_dump_vsi_seid(struct i40e_pf *pf, int seid) tx_ring->netdev, tx_ring->tx_bi); dev_info(&pf->pdev->dev, - " tx_rings[%i]: state = %li, queue_index = %d, reg_idx = %d\n", - i, tx_ring->state, + " tx_rings[%i]: state = %lu, queue_index = %d, reg_idx = %d\n", + i, *tx_ring->state, tx_ring->queue_index, tx_ring->reg_idx); dev_info(&pf->pdev->dev, diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 1136d02e2e95..afd3ca8d9851 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -227,6 +227,8 @@ static const struct i40e_priv_flags i40e_gstrings_priv_flags[] = { I40E_PRIV_FLAG("veb-stats", I40E_FLAG_VEB_STATS_ENABLED, 0), I40E_PRIV_FLAG("hw-atr-eviction", I40E_FLAG_HW_ATR_EVICT_ENABLED, 0), I40E_PRIV_FLAG("legacy-rx", I40E_FLAG_LEGACY_RX, 0), + I40E_PRIV_FLAG("disable-source-pruning", + I40E_FLAG_SOURCE_PRUNING_DISABLED, 0), }; #define I40E_PRIV_FLAGS_STR_LEN ARRAY_SIZE(i40e_gstrings_priv_flags) @@ -2008,7 +2010,9 @@ static int i40e_set_phys_id(struct net_device *netdev, if (!(pf->hw_features & I40E_HW_PHY_CONTROLS_LEDS)) { pf->led_status = i40e_led_get(hw); } else { - i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, NULL); + if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) + i40e_aq_set_phy_debug(hw, I40E_PHY_DEBUG_ALL, + NULL); ret = i40e_led_get_phy(hw, &temp_status, &pf->phy_led_val); pf->led_status = temp_status; @@ -2033,7 +2037,8 @@ static int i40e_set_phys_id(struct net_device *netdev, ret = i40e_led_set_phy(hw, false, pf->led_status, (pf->phy_led_val | I40E_PHY_LED_MODE_ORIG)); - i40e_aq_set_phy_debug(hw, 0, NULL); + if (!(hw->flags & I40E_HW_FLAG_AQ_PHY_ACCESS_CAPABLE)) + i40e_aq_set_phy_debug(hw, 0, NULL); } break; default: @@ -4090,7 +4095,7 @@ static int i40e_set_priv_flags(struct net_device *dev, u32 flags) struct i40e_netdev_priv *np = netdev_priv(dev); struct i40e_vsi *vsi = np->vsi; struct i40e_pf *pf = vsi->back; - u64 orig_flags, new_flags, changed_flags; + u32 orig_flags, new_flags, changed_flags; u32 i, j; orig_flags = READ_ONCE(pf->flags); @@ -4142,12 +4147,12 @@ flags_complete: return -EOPNOTSUPP; /* Compare and exchange the new flags into place. If we failed, that - * is if cmpxchg64 returns anything but the old value, this means that + * is if cmpxchg returns anything but the old value, this means that * something else has modified the flags variable since we copied it * originally. We'll just punt with an error and log something in the * message buffer. */ - if (cmpxchg64(&pf->flags, orig_flags, new_flags) != orig_flags) { + if (cmpxchg(&pf->flags, orig_flags, new_flags) != orig_flags) { dev_warn(&pf->pdev->dev, "Unable to update pf->flags as it was modified by another thread...\n"); return -EAGAIN; @@ -4189,8 +4194,9 @@ flags_complete: /* Issue reset to cause things to take effect, as additional bits * are added we will need to create a mask of bits requiring reset */ - if ((changed_flags & I40E_FLAG_VEB_STATS_ENABLED) || - ((changed_flags & I40E_FLAG_LEGACY_RX) && netif_running(dev))) + if (changed_flags & (I40E_FLAG_VEB_STATS_ENABLED | + I40E_FLAG_LEGACY_RX | + I40E_FLAG_SOURCE_PRUNING_DISABLED)) i40e_do_reset(pf, BIT(__I40E_PF_RESET_REQUESTED), true); return 0; diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 3f9e89b054ec..4de52001a2b9 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1776,11 +1776,6 @@ static void i40e_set_rx_mode(struct net_device *netdev) vsi->flags |= I40E_VSI_FLAG_FILTER_CHANGED; vsi->back->flags |= I40E_FLAG_FILTER_SYNC; } - - /* schedule our worker thread which will take care of - * applying the new filter changes - */ - i40e_service_event_schedule(vsi->back); } /** @@ -2884,22 +2879,18 @@ static void i40e_vsi_free_rx_resources(struct i40e_vsi *vsi) **/ static void i40e_config_xps_tx_ring(struct i40e_ring *ring) { - struct i40e_vsi *vsi = ring->vsi; + int cpu; if (!ring->q_vector || !ring->netdev) return; - if ((vsi->tc_config.numtc <= 1) && - !test_and_set_bit(__I40E_TX_XPS_INIT_DONE, &ring->state)) { - netif_set_xps_queue(ring->netdev, - get_cpu_mask(ring->q_vector->v_idx), - ring->queue_index); - } + /* We only initialize XPS once, so as not to overwrite user settings */ + if (test_and_set_bit(__I40E_TX_XPS_INIT_DONE, ring->state)) + return; - /* schedule our worker thread which will take care of - * applying the new filter changes - */ - i40e_service_event_schedule(vsi->back); + cpu = cpumask_local_spread(ring->q_vector->v_idx, -1); + netif_set_xps_queue(ring->netdev, get_cpu_mask(cpu), + ring->queue_index); } /** @@ -3009,7 +3000,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) struct i40e_hmc_obj_rxq rx_ctx; i40e_status err = 0; - ring->state = 0; + bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(rx_ctx)); @@ -3034,7 +3025,7 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) if (hw->revision_id == 0) rx_ctx.lrxqthresh = 0; else - rx_ctx.lrxqthresh = 2; + rx_ctx.lrxqthresh = 1; rx_ctx.crcstrip = 1; rx_ctx.l2tsel = 1; /* this controls whether VLAN is stripped from inner headers */ @@ -3407,15 +3398,14 @@ void i40e_irq_dynamic_disable_icr0(struct i40e_pf *pf) /** * i40e_irq_dynamic_enable_icr0 - Enable default interrupt generation for icr0 * @pf: board private structure - * @clearpba: true when all pending interrupt events should be cleared **/ -void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf, bool clearpba) +void i40e_irq_dynamic_enable_icr0(struct i40e_pf *pf) { struct i40e_hw *hw = &pf->hw; u32 val; val = I40E_PFINT_DYN_CTL0_INTENA_MASK | - (clearpba ? I40E_PFINT_DYN_CTL0_CLEARPBA_MASK : 0) | + I40E_PFINT_DYN_CTL0_CLEARPBA_MASK | (I40E_ITR_NONE << I40E_PFINT_DYN_CTL0_ITR_INDX_SHIFT); wr32(hw, I40E_PFINT_DYN_CTL0, val); @@ -3482,6 +3472,7 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) int tx_int_idx = 0; int vector, err; int irq_num; + int cpu; for (vector = 0; vector < q_vectors; vector++) { struct i40e_q_vector *q_vector = vsi->q_vectors[vector]; @@ -3517,10 +3508,14 @@ static int i40e_vsi_request_irq_msix(struct i40e_vsi *vsi, char *basename) q_vector->affinity_notify.notify = i40e_irq_affinity_notify; q_vector->affinity_notify.release = i40e_irq_affinity_release; irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify); - /* get_cpu_mask returns a static constant mask with - * a permanent lifetime so it's ok to use here. + /* Spread affinity hints out across online CPUs. + * + * get_cpu_mask returns a static constant mask with + * a permanent lifetime so it's ok to pass to + * irq_set_affinity_hint without making a copy. */ - irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx)); + cpu = cpumask_local_spread(q_vector->v_idx, -1); + irq_set_affinity_hint(irq_num, get_cpu_mask(cpu)); } vsi->irqs_ready = true; @@ -3596,7 +3591,7 @@ static int i40e_vsi_enable_irq(struct i40e_vsi *vsi) for (i = 0; i < vsi->num_q_vectors; i++) i40e_irq_dynamic_enable(vsi, i); } else { - i40e_irq_dynamic_enable_icr0(pf, true); + i40e_irq_dynamic_enable_icr0(pf); } i40e_flush(&pf->hw); @@ -3745,7 +3740,7 @@ enable_intr: wr32(hw, I40E_PFINT_ICR0_ENA, ena_mask); if (!test_bit(__I40E_DOWN, pf->state)) { i40e_service_event_schedule(pf); - i40e_irq_dynamic_enable_icr0(pf, false); + i40e_irq_dynamic_enable_icr0(pf); } return ret; @@ -6231,6 +6226,7 @@ void i40e_fdir_check_and_reenable(struct i40e_pf *pf) hlist_del(&filter->fdir_node); kfree(filter); pf->fdir_pf_active_filters--; + pf->fd_inv = 0; } } } @@ -6557,12 +6553,26 @@ static void i40e_handle_link_event(struct i40e_pf *pf, */ i40e_link_event(pf); - /* check for unqualified module, if link is down */ - if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) && - (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) && - (!(status->link_info & I40E_AQ_LINK_UP))) + /* Check if module meets thermal requirements */ + if (status->phy_type == I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP) { + dev_err(&pf->pdev->dev, + "Rx/Tx is disabled on this device because the module does not meet thermal requirements.\n"); dev_err(&pf->pdev->dev, - "The driver failed to link because an unqualified module was detected.\n"); + "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); + } else { + /* check for unqualified module, if link is down, suppress + * the message if link was forced to be down. + */ + if ((status->link_info & I40E_AQ_MEDIA_AVAILABLE) && + (!(status->an_info & I40E_AQ_QUALIFIED_MODULE)) && + (!(status->link_info & I40E_AQ_LINK_UP)) && + (!(pf->flags & I40E_FLAG_LINK_DOWN_ON_CLOSE_ENABLED))) { + dev_err(&pf->pdev->dev, + "Rx/Tx is disabled on this device because an unsupported SFP module type was detected.\n"); + dev_err(&pf->pdev->dev, + "Refer to the Intel(R) Ethernet Adapters and Devices User Guide for a list of supported modules.\n"); + } + } } /** @@ -7678,7 +7688,7 @@ static int i40e_set_num_rings_in_vsi(struct i40e_vsi *vsi) /** * i40e_vsi_alloc_arrays - Allocate queue and vector pointer arrays for the vsi - * @type: VSI pointer + * @vsi: VSI pointer * @alloc_qvectors: a bool to specify if q_vectors need to be allocated. * * On error: returns error code (negative) @@ -8439,7 +8449,7 @@ static int i40e_setup_misc_vector(struct i40e_pf *pf) i40e_flush(hw); - i40e_irq_dynamic_enable_icr0(pf, true); + i40e_irq_dynamic_enable_icr0(pf); return err; } @@ -8967,8 +8977,8 @@ static int i40e_sw_init(struct i40e_pf *pf) I40E_FLAG_MSIX_ENABLED; /* Set default ITR */ - pf->rx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF; - pf->tx_itr_default = I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF; + pf->rx_itr_default = I40E_ITR_RX_DEF; + pf->tx_itr_default = I40E_ITR_TX_DEF; /* Depending on PF configurations, it is possible that the RSS * maximum might end up larger than the available queues @@ -9068,6 +9078,11 @@ static int i40e_sw_init(struct i40e_pf *pf) (pf->hw.aq.fw_maj_ver >= 5))) pf->hw_features |= I40E_HW_USE_SET_LLDP_MIB; + /* Enable PTP L4 if FW > v6.0 */ + if (pf->hw.mac.type == I40E_MAC_XL710 && + pf->hw.aq.fw_maj_ver >= 6) + pf->hw_features |= I40E_HW_PTP_L4_CAPABLE; + if (pf->hw.func_caps.vmdq) { pf->num_vmdq_vsis = I40E_DEFAULT_NUM_VMDQ_VSI; pf->flags |= I40E_FLAG_VMDQ_ENABLED; @@ -9903,6 +9918,31 @@ static int i40e_add_vsi(struct i40e_vsi *vsi) enabled_tc = i40e_pf_get_tc_map(pf); + /* Source pruning is enabled by default, so the flag is + * negative logic - if it's set, we need to fiddle with + * the VSI to disable source pruning. + */ + if (pf->flags & I40E_FLAG_SOURCE_PRUNING_DISABLED) { + memset(&ctxt, 0, sizeof(ctxt)); + ctxt.seid = pf->main_vsi_seid; + ctxt.pf_num = pf->hw.pf_id; + ctxt.vf_num = 0; + ctxt.info.valid_sections |= + cpu_to_le16(I40E_AQ_VSI_PROP_SWITCH_VALID); + ctxt.info.switch_id = + cpu_to_le16(I40E_AQ_VSI_SW_ID_FLAG_LOCAL_LB); + ret = i40e_aq_update_vsi_params(hw, &ctxt, NULL); + if (ret) { + dev_info(&pf->pdev->dev, + "update vsi failed, err %s aq_err %s\n", + i40e_stat_str(&pf->hw, ret), + i40e_aq_str(&pf->hw, + pf->hw.aq.asq_last_status)); + ret = -ENOENT; + goto err; + } + } + /* MFP mode setup queue map and update VSI */ if ((pf->flags & I40E_FLAG_MFP_ENABLED) && !(pf->hw.func_caps.iscsi)) { /* NIC type PF */ @@ -12000,6 +12040,28 @@ static pci_ers_result_t i40e_pci_error_slot_reset(struct pci_dev *pdev) } /** + * i40e_pci_error_reset_prepare - prepare device driver for pci reset + * @pdev: PCI device information struct + */ +static void i40e_pci_error_reset_prepare(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + i40e_prep_for_reset(pf, false); +} + +/** + * i40e_pci_error_reset_done - pci reset done, device driver reset can begin + * @pdev: PCI device information struct + */ +static void i40e_pci_error_reset_done(struct pci_dev *pdev) +{ + struct i40e_pf *pf = pci_get_drvdata(pdev); + + i40e_reset_and_rebuild(pf, false, false); +} + +/** * i40e_pci_error_resume - restart operations after PCI error recovery * @pdev: PCI device information struct * @@ -12189,6 +12251,8 @@ static int i40e_resume(struct device *dev) static const struct pci_error_handlers i40e_err_handler = { .error_detected = i40e_pci_error_detected, .slot_reset = i40e_pci_error_slot_reset, + .reset_prepare = i40e_pci_error_reset_prepare, + .reset_done = i40e_pci_error_reset_done, .resume = i40e_pci_error_resume, }; diff --git a/drivers/net/ethernet/intel/i40e/i40e_nvm.c b/drivers/net/ethernet/intel/i40e/i40e_nvm.c index 57505b1df98d..151d9cfb6ea4 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_nvm.c +++ b/drivers/net/ethernet/intel/i40e/i40e_nvm.c @@ -311,13 +311,10 @@ static i40e_status i40e_read_nvm_word_aq(struct i40e_hw *hw, u16 offset, static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, u16 *data) { - i40e_status ret_code = 0; - if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE) - ret_code = i40e_read_nvm_word_aq(hw, offset, data); - else - ret_code = i40e_read_nvm_word_srctl(hw, offset, data); - return ret_code; + return i40e_read_nvm_word_aq(hw, offset, data); + + return i40e_read_nvm_word_srctl(hw, offset, data); } /** @@ -331,7 +328,7 @@ static i40e_status __i40e_read_nvm_word(struct i40e_hw *hw, i40e_status i40e_read_nvm_word(struct i40e_hw *hw, u16 offset, u16 *data) { - i40e_status ret_code = 0; + i40e_status ret_code; ret_code = i40e_acquire_nvm(hw, I40E_RESOURCE_READ); if (ret_code) @@ -446,13 +443,10 @@ static i40e_status __i40e_read_nvm_buffer(struct i40e_hw *hw, u16 offset, u16 *words, u16 *data) { - i40e_status ret_code = 0; - if (hw->flags & I40E_HW_FLAG_AQ_SRCTL_ACCESS_ENABLE) - ret_code = i40e_read_nvm_buffer_aq(hw, offset, words, data); - else - ret_code = i40e_read_nvm_buffer_srctl(hw, offset, words, data); - return ret_code; + return i40e_read_nvm_buffer_aq(hw, offset, words, data); + + return i40e_read_nvm_buffer_srctl(hw, offset, words, data); } /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_register.h b/drivers/net/ethernet/intel/i40e/i40e_register.h index 86ca27f72f02..c234758dad15 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_register.h +++ b/drivers/net/ethernet/intel/i40e/i40e_register.h @@ -2794,7 +2794,7 @@ #define I40E_GLV_RUPP_MAX_INDEX 383 #define I40E_GLV_RUPP_RUPP_SHIFT 0 #define I40E_GLV_RUPP_RUPP_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_RUPP_RUPP_SHIFT) -#define I40E_GLV_TEPC(_VSI) (0x00344000 + ((_VSI) * 4)) /* _i=0...383 */ /* Reset: CORER */ +#define I40E_GLV_TEPC(_i) (0x00344000 + ((_i) * 8)) /* _i=0...383 */ /* Reset: CORER */ #define I40E_GLV_TEPC_MAX_INDEX 383 #define I40E_GLV_TEPC_TEPC_SHIFT 0 #define I40E_GLV_TEPC_TEPC_MASK I40E_MASK(0xFFFFFFFF, I40E_GLV_TEPC_TEPC_SHIFT) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index d9fdf69bbc6e..a23306f04e00 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1372,6 +1372,15 @@ bool i40e_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) union i40e_rx_desc *rx_desc; struct i40e_rx_buffer *bi; + /* Hardware only fetches new descriptors in cache lines of 8, + * essentially ignoring the lower 3 bits of the tail register. We want + * to ensure our tail writes are aligned to avoid unnecessary work. We + * can't simply round down the cleaned count, since we might fail to + * allocate some buffers. What we really want is to ensure that + * next_to_used + cleaned_count produces an aligned value. + */ + cleaned_count -= (ntu + cleaned_count) & 0x7; + /* do nothing if no valid netdev defined */ if (!rx_ring->netdev || !cleaned_count) return false; @@ -2202,9 +2211,7 @@ static u32 i40e_buildreg_itr(const int type, const u16 itr) u32 val; val = I40E_PFINT_DYN_CTLN_INTENA_MASK | - /* Don't clear PBA because that can cause lost interrupts that - * came in while we were cleaning/polling - */ + I40E_PFINT_DYN_CTLN_CLEARPBA_MASK | (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) | (itr << I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT); @@ -2241,7 +2248,7 @@ static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, /* If we don't have MSIX, then we only need to re-enable icr0 */ if (!(vsi->back->flags & I40E_FLAG_MSIX_ENABLED)) { - i40e_irq_dynamic_enable_icr0(vsi->back, false); + i40e_irq_dynamic_enable_icr0(vsi->back); return; } @@ -3167,38 +3174,12 @@ static inline int i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb, /* write last descriptor with EOP bit */ td_cmd |= I40E_TX_DESC_CMD_EOP; - /* We can OR these values together as they both are checked against - * 4 below and at this point desc_count will be used as a boolean value - * after this if/else block. + /* We OR these values together to check both against 4 (WB_STRIDE) + * below. This is safe since we don't re-use desc_count afterwards. */ desc_count |= ++tx_ring->packet_stride; - /* Algorithm to optimize tail and RS bit setting: - * if queue is stopped - * mark RS bit - * reset packet counter - * else if xmit_more is supported and is true - * advance packet counter to 4 - * reset desc_count to 0 - * - * if desc_count >= 4 - * mark RS bit - * reset packet counter - * if desc_count > 0 - * update tail - * - * Note: If there are less than 4 descriptors - * pending and interrupts were disabled the service task will - * trigger a force WB. - */ - if (netif_xmit_stopped(txring_txq(tx_ring))) { - goto do_rs; - } else if (skb->xmit_more) { - /* set stride to arm on next packet and reset desc_count */ - tx_ring->packet_stride = WB_STRIDE; - desc_count = 0; - } else if (desc_count >= WB_STRIDE) { -do_rs: + if (desc_count >= WB_STRIDE) { /* write last descriptor with RS bit set */ td_cmd |= I40E_TX_DESC_CMD_RS; tx_ring->packet_stride = 0; @@ -3219,7 +3200,7 @@ do_rs: first->next_to_watch = tx_desc; /* notify HW of packet */ - if (desc_count) { + if (netif_xmit_stopped(txring_txq(tx_ring)) || !skb->xmit_more) { writel(i, tx_ring->tail); /* we need this if more than one processor can write to our tail diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h index 2f848bc5e391..ff57ae451524 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h @@ -38,8 +38,10 @@ #define I40E_ITR_8K 0x003E #define I40E_ITR_4K 0x007A #define I40E_MAX_INTRL 0x3B /* reg uses 4 usec resolution */ -#define I40E_ITR_RX_DEF I40E_ITR_20K -#define I40E_ITR_TX_DEF I40E_ITR_20K +#define I40E_ITR_RX_DEF (ITR_REG_TO_USEC(I40E_ITR_20K) | \ + I40E_ITR_DYNAMIC) +#define I40E_ITR_TX_DEF (ITR_REG_TO_USEC(I40E_ITR_20K) | \ + I40E_ITR_DYNAMIC) #define I40E_ITR_DYNAMIC 0x8000 /* use top bit as a flag */ #define I40E_MIN_INT_RATE 250 /* ~= 1000000 / (I40E_MAX_ITR * 2) */ #define I40E_MAX_INT_RATE 500000 /* == 1000000 / (I40E_MIN_ITR * 2) */ @@ -206,7 +208,7 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc, } /* How many Rx Buffers do we bundle into one write to the hardware ? */ -#define I40E_RX_BUFFER_WRITE 16 /* Must be power of 2 */ +#define I40E_RX_BUFFER_WRITE 32 /* Must be power of 2 */ #define I40E_RX_INCREMENT(r, i) \ do { \ (i)++; \ @@ -342,6 +344,7 @@ struct i40e_rx_queue_stats { enum i40e_ring_state_t { __I40E_TX_FDIR_INIT_DONE, __I40E_TX_XPS_INIT_DONE, + __I40E_RING_STATE_NBITS /* must be last */ }; /* some useful defines for virtchannel interface, which @@ -366,7 +369,7 @@ struct i40e_ring { struct i40e_tx_buffer *tx_bi; struct i40e_rx_buffer *rx_bi; }; - unsigned long state; + DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS); u16 queue_index; /* Queue number of ring */ u8 dcb_tc; /* Traffic class of ring */ u8 __iomem *tail; diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 4b32b1d38a66..0410fcbdbb94 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -46,6 +46,9 @@ /* Max default timeout in ms, */ #define I40E_MAX_NVM_TIMEOUT 18000 +/* Max timeout in ms for the phy to respond */ +#define I40E_MAX_PHY_TIMEOUT 500 + /* Switch from ms to the 1usec global time (this is the GTIME resolution) */ #define I40E_MS_TO_GTIME(time) ((time) * 1000) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 04568137e029..0c4fa225c7be 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -273,7 +273,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, struct i40e_hw *hw = &pf->hw; u16 vsi_queue_id, pf_queue_id; enum i40e_queue_type qtype; - u16 next_q, vector_id; + u16 next_q, vector_id, size; u32 reg, reg_idx; u16 itr_idx = 0; @@ -303,9 +303,11 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, vsi_queue_id + 1)); } - next_q = find_first_bit(&linklistmap, - (I40E_MAX_VSI_QP * - I40E_VIRTCHNL_SUPPORTED_QTYPES)); + size = I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES; + next_q = find_first_bit(&linklistmap, size); + if (unlikely(next_q == size)) + goto irq_list_done; + vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES; qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES; pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, vsi_queue_id); @@ -313,7 +315,7 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, wr32(hw, reg_idx, reg); - while (next_q < (I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES)) { + while (next_q < size) { switch (qtype) { case I40E_QUEUE_TYPE_RX: reg_idx = I40E_QINT_RQCTL(pf_queue_id); @@ -327,12 +329,8 @@ static void i40e_config_irq_link_list(struct i40e_vf *vf, u16 vsi_id, break; } - next_q = find_next_bit(&linklistmap, - (I40E_MAX_VSI_QP * - I40E_VIRTCHNL_SUPPORTED_QTYPES), - next_q + 1); - if (next_q < - (I40E_MAX_VSI_QP * I40E_VIRTCHNL_SUPPORTED_QTYPES)) { + next_q = find_next_bit(&linklistmap, size, next_q + 1); + if (next_q < size) { vsi_queue_id = next_q / I40E_VIRTCHNL_SUPPORTED_QTYPES; qtype = next_q % I40E_VIRTCHNL_SUPPORTED_QTYPES; pf_queue_id = i40e_vc_get_pf_queue_id(vf, vsi_id, @@ -639,7 +637,7 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, rx_ctx.dsize = 1; /* default values */ - rx_ctx.lrxqthresh = 2; + rx_ctx.lrxqthresh = 1; rx_ctx.crcstrip = 1; rx_ctx.prefena = 1; rx_ctx.l2tsel = 1; @@ -1358,7 +1356,7 @@ err_alloc: i40e_free_vfs(pf); err_iov: /* Re-enable interrupt 0. */ - i40e_irq_dynamic_enable_icr0(pf, false); + i40e_irq_dynamic_enable_icr0(pf); return ret; } @@ -2883,6 +2881,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) struct i40e_mac_filter *f; struct i40e_vf *vf; int ret = 0; + struct hlist_node *h; int bkt; /* validate the request */ @@ -2921,7 +2920,7 @@ int i40e_ndo_set_vf_mac(struct net_device *netdev, int vf_id, u8 *mac) /* Delete all the filters for this VSI - we're going to kill it * anyway. */ - hash_for_each(vsi->mac_filter_hash, bkt, f, hlist) + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) __i40e_del_filter(vsi, f); spin_unlock_bh(&vsi->mac_filter_hash_lock); diff --git a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h index ed5602f4bbcd..60c892f559b9 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_adminq_cmd.h @@ -1767,9 +1767,10 @@ enum i40e_aq_phy_type { I40E_PHY_TYPE_25GBASE_CR = 0x20, I40E_PHY_TYPE_25GBASE_SR = 0x21, I40E_PHY_TYPE_25GBASE_LR = 0x22, + I40E_PHY_TYPE_MAX, + I40E_PHY_TYPE_NOT_SUPPORTED_HIGH_TEMP = 0xFD, I40E_PHY_TYPE_EMPTY = 0xFE, I40E_PHY_TYPE_DEFAULT = 0xFF, - I40E_PHY_TYPE_MAX }; #define I40E_LINK_SPEED_100MB_SHIFT 0x1 diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c index 37e1de886d48..6806ada11490 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c @@ -711,6 +711,15 @@ bool i40evf_alloc_rx_buffers(struct i40e_ring *rx_ring, u16 cleaned_count) union i40e_rx_desc *rx_desc; struct i40e_rx_buffer *bi; + /* Hardware only fetches new descriptors in cache lines of 8, + * essentially ignoring the lower 3 bits of the tail register. We want + * to ensure our tail writes are aligned to avoid unnecessary work. We + * can't simply round down the cleaned count, since we might fail to + * allocate some buffers. What we really want is to ensure that + * next_to_used + cleaned_count produces an aligned value. + */ + cleaned_count -= (ntu + cleaned_count) & 0x7; + /* do nothing if no valid netdev defined */ if (!rx_ring->netdev || !cleaned_count) return false; @@ -1409,9 +1418,7 @@ static u32 i40e_buildreg_itr(const int type, const u16 itr) u32 val; val = I40E_VFINT_DYN_CTLN1_INTENA_MASK | - /* Don't clear PBA because that can cause lost interrupts that - * came in while we were cleaning/polling - */ + I40E_VFINT_DYN_CTLN1_CLEARPBA_MASK | (type << I40E_VFINT_DYN_CTLN1_ITR_INDX_SHIFT) | (itr << I40E_VFINT_DYN_CTLN1_INTERVAL_SHIFT); diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h index 0d9f98bc07bd..8d26c85d12e1 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.h @@ -38,8 +38,10 @@ #define I40E_ITR_8K 0x003E #define I40E_ITR_4K 0x007A #define I40E_MAX_INTRL 0x3B /* reg uses 4 usec resolution */ -#define I40E_ITR_RX_DEF I40E_ITR_20K -#define I40E_ITR_TX_DEF I40E_ITR_20K +#define I40E_ITR_RX_DEF (ITR_REG_TO_USEC(I40E_ITR_20K) | \ + I40E_ITR_DYNAMIC) +#define I40E_ITR_TX_DEF (ITR_REG_TO_USEC(I40E_ITR_20K) | \ + I40E_ITR_DYNAMIC) #define I40E_ITR_DYNAMIC 0x8000 /* use top bit as a flag */ #define I40E_MIN_INT_RATE 250 /* ~= 1000000 / (I40E_MAX_ITR * 2) */ #define I40E_MAX_INT_RATE 500000 /* == 1000000 / (I40E_MIN_ITR * 2) */ @@ -189,7 +191,7 @@ static inline bool i40e_test_staterr(union i40e_rx_desc *rx_desc, } /* How many Rx Buffers do we bundle into one write to the hardware ? */ -#define I40E_RX_BUFFER_WRITE 16 /* Must be power of 2 */ +#define I40E_RX_BUFFER_WRITE 32 /* Must be power of 2 */ #define I40E_RX_INCREMENT(r, i) \ do { \ (i)++; \ @@ -325,6 +327,7 @@ struct i40e_rx_queue_stats { enum i40e_ring_state_t { __I40E_TX_FDIR_INIT_DONE, __I40E_TX_XPS_INIT_DONE, + __I40E_RING_STATE_NBITS /* must be last */ }; /* some useful defines for virtchannel interface, which @@ -348,7 +351,7 @@ struct i40e_ring { struct i40e_tx_buffer *tx_bi; struct i40e_rx_buffer *rx_bi; }; - unsigned long state; + DECLARE_BITMAP(state, __I40E_RING_STATE_NBITS); u16 queue_index; /* Queue number of ring */ u8 dcb_tc; /* Traffic class of ring */ u8 __iomem *tail; diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h b/drivers/net/ethernet/intel/i40evf/i40e_type.h index 9364b67fff9c..213b773dfad6 100644 --- a/drivers/net/ethernet/intel/i40evf/i40e_type.h +++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h @@ -46,6 +46,9 @@ /* Max default timeout in ms, */ #define I40E_MAX_NVM_TIMEOUT 18000 +/* Max timeout in ms for the phy to respond */ +#define I40E_MAX_PHY_TIMEOUT 500 + /* Switch from ms to the 1usec global time (this is the GTIME resolution) */ #define I40E_MS_TO_GTIME(time) ((time) * 1000) diff --git a/drivers/net/ethernet/intel/i40evf/i40evf.h b/drivers/net/ethernet/intel/i40evf/i40evf.h index 5982362c5643..de0af521d602 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf.h +++ b/drivers/net/ethernet/intel/i40evf/i40evf.h @@ -222,22 +222,22 @@ struct i40evf_adapter { u32 flags; #define I40EVF_FLAG_RX_CSUM_ENABLED BIT(0) -#define I40EVF_FLAG_IMIR_ENABLED BIT(5) -#define I40EVF_FLAG_MQ_CAPABLE BIT(6) -#define I40EVF_FLAG_PF_COMMS_FAILED BIT(8) -#define I40EVF_FLAG_RESET_PENDING BIT(9) -#define I40EVF_FLAG_RESET_NEEDED BIT(10) -#define I40EVF_FLAG_WB_ON_ITR_CAPABLE BIT(11) -#define I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE BIT(12) -#define I40EVF_FLAG_ADDR_SET_BY_PF BIT(13) -#define I40EVF_FLAG_SERVICE_CLIENT_REQUESTED BIT(14) -#define I40EVF_FLAG_CLIENT_NEEDS_OPEN BIT(15) -#define I40EVF_FLAG_CLIENT_NEEDS_CLOSE BIT(16) -#define I40EVF_FLAG_CLIENT_NEEDS_L2_PARAMS BIT(17) -#define I40EVF_FLAG_PROMISC_ON BIT(18) -#define I40EVF_FLAG_ALLMULTI_ON BIT(19) -#define I40EVF_FLAG_LEGACY_RX BIT(20) -#define I40EVF_FLAG_REINIT_ITR_NEEDED BIT(21) +#define I40EVF_FLAG_IMIR_ENABLED BIT(1) +#define I40EVF_FLAG_MQ_CAPABLE BIT(2) +#define I40EVF_FLAG_PF_COMMS_FAILED BIT(3) +#define I40EVF_FLAG_RESET_PENDING BIT(4) +#define I40EVF_FLAG_RESET_NEEDED BIT(5) +#define I40EVF_FLAG_WB_ON_ITR_CAPABLE BIT(6) +#define I40EVF_FLAG_OUTER_UDP_CSUM_CAPABLE BIT(7) +#define I40EVF_FLAG_ADDR_SET_BY_PF BIT(8) +#define I40EVF_FLAG_SERVICE_CLIENT_REQUESTED BIT(9) +#define I40EVF_FLAG_CLIENT_NEEDS_OPEN BIT(10) +#define I40EVF_FLAG_CLIENT_NEEDS_CLOSE BIT(11) +#define I40EVF_FLAG_CLIENT_NEEDS_L2_PARAMS BIT(12) +#define I40EVF_FLAG_PROMISC_ON BIT(13) +#define I40EVF_FLAG_ALLMULTI_ON BIT(14) +#define I40EVF_FLAG_LEGACY_RX BIT(15) +#define I40EVF_FLAG_REINIT_ITR_NEEDED BIT(16) /* duplicates for common code */ #define I40E_FLAG_DCB_ENABLED 0 #define I40E_FLAG_RX_CSUM_ENABLED I40EVF_FLAG_RX_CSUM_ENABLED diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c b/drivers/net/ethernet/intel/i40evf/i40evf_main.c index f2f1e754c2ce..5bcbd46e2f6c 100644 --- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c +++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c @@ -515,6 +515,7 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename) unsigned int vector, q_vectors; unsigned int rx_int_idx = 0, tx_int_idx = 0; int irq_num, err; + int cpu; i40evf_irq_disable(adapter); /* Decrement for Other and TCP Timer vectors */ @@ -553,10 +554,12 @@ i40evf_request_traffic_irqs(struct i40evf_adapter *adapter, char *basename) q_vector->affinity_notify.release = i40evf_irq_affinity_release; irq_set_affinity_notifier(irq_num, &q_vector->affinity_notify); - /* get_cpu_mask returns a static constant mask with - * a permanent lifetime so it's ok to use here. + /* Spread the IRQ affinity hints across online CPUs. Note that + * get_cpu_mask returns a mask with a permanent lifetime so + * it's safe to use as a hint for irq_set_affinity_hint. */ - irq_set_affinity_hint(irq_num, get_cpu_mask(q_vector->v_idx)); + cpu = cpumask_local_spread(q_vector->v_idx, -1); + irq_set_affinity_hint(irq_num, get_cpu_mask(cpu)); } return 0; @@ -877,6 +880,8 @@ i40evf_mac_filter *i40evf_add_filter(struct i40evf_adapter *adapter, list_add_tail(&f->list, &adapter->mac_filter_list); f->add = true; adapter->aq_required |= I40EVF_FLAG_AQ_ADD_MAC_FILTER; + } else { + f->remove = false; } clear_bit(__I40EVF_IN_CRITICAL_TASK, &adapter->crit_section); @@ -1218,7 +1223,7 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter) tx_ring->netdev = adapter->netdev; tx_ring->dev = &adapter->pdev->dev; tx_ring->count = adapter->tx_desc_count; - tx_ring->tx_itr_setting = (I40E_ITR_DYNAMIC | I40E_ITR_TX_DEF); + tx_ring->tx_itr_setting = I40E_ITR_TX_DEF; if (adapter->flags & I40EVF_FLAG_WB_ON_ITR_CAPABLE) tx_ring->flags |= I40E_TXR_FLAGS_WB_ON_ITR; @@ -1227,7 +1232,7 @@ static int i40evf_alloc_queues(struct i40evf_adapter *adapter) rx_ring->netdev = adapter->netdev; rx_ring->dev = &adapter->pdev->dev; rx_ring->count = adapter->rx_desc_count; - rx_ring->rx_itr_setting = (I40E_ITR_DYNAMIC | I40E_ITR_RX_DEF); + rx_ring->rx_itr_setting = I40E_ITR_RX_DEF; } adapter->num_active_queues = num_active_queues; @@ -2420,10 +2425,6 @@ out_err: return features & ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK); } -#define I40EVF_VLAN_FEATURES (NETIF_F_HW_VLAN_CTAG_TX |\ - NETIF_F_HW_VLAN_CTAG_RX |\ - NETIF_F_HW_VLAN_CTAG_FILTER) - /** * i40evf_fix_features - fix up the netdev feature bits * @netdev: our net device @@ -2436,9 +2437,11 @@ static netdev_features_t i40evf_fix_features(struct net_device *netdev, { struct i40evf_adapter *adapter = netdev_priv(netdev); - features &= ~I40EVF_VLAN_FEATURES; - if (adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN) - features |= I40EVF_VLAN_FEATURES; + if (!(adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN)) + features &= ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX | + NETIF_F_HW_VLAN_CTAG_FILTER); + return features; } @@ -2569,9 +2572,17 @@ int i40evf_process_config(struct i40evf_adapter *adapter) */ hw_features = hw_enc_features; + /* Enable VLAN features if supported */ + if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN) + hw_features |= (NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_CTAG_RX); + netdev->hw_features |= hw_features; - netdev->features |= hw_features | I40EVF_VLAN_FEATURES; + netdev->features |= hw_features; + + if (vfres->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_VLAN) + netdev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; adapter->vsi.id = adapter->vsi_res->vsi_id; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fd4a46b03cc8..837d9b46a390 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3162,6 +3162,8 @@ static int igb_sw_init(struct igb_adapter *adapter) /* Setup and initialize a copy of the hw vlan table array */ adapter->shadow_vfta = kcalloc(E1000_VLAN_FILTER_TBL_SIZE, sizeof(u32), GFP_ATOMIC); + if (!adapter->shadow_vfta) + return -ENOMEM; /* This call may decrease the number of queues */ if (igb_init_interrupt_scheme(adapter, true)) { diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h index dd5578756ae0..468c3555a629 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h @@ -275,6 +275,7 @@ struct ixgbe_rx_queue_stats { u64 rsc_count; u64 rsc_flush; u64 non_eop_descs; + u64 alloc_rx_page; u64 alloc_rx_page_failed; u64 alloc_rx_buff_failed; u64 csum_err; @@ -434,8 +435,15 @@ static inline unsigned int ixgbe_rx_pg_order(struct ixgbe_ring *ring) } #define ixgbe_rx_pg_size(_ring) (PAGE_SIZE << ixgbe_rx_pg_order(_ring)) +#define IXGBE_ITR_ADAPTIVE_MIN_INC 2 +#define IXGBE_ITR_ADAPTIVE_MIN_USECS 10 +#define IXGBE_ITR_ADAPTIVE_MAX_USECS 126 +#define IXGBE_ITR_ADAPTIVE_LATENCY 0x80 +#define IXGBE_ITR_ADAPTIVE_BULK 0x00 + struct ixgbe_ring_container { struct ixgbe_ring *ring; /* pointer to linked list of rings */ + unsigned long next_update; /* jiffies value of last update */ unsigned int total_bytes; /* total bytes processed this int */ unsigned int total_packets; /* total packets processed this int */ u16 work_limit; /* total work allowed per interrupt */ @@ -655,6 +663,7 @@ struct ixgbe_adapter { u64 rsc_total_count; u64 rsc_total_flush; u64 non_eop_descs; + u32 alloc_rx_page; u32 alloc_rx_page_failed; u32 alloc_rx_buff_failed; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c index 523f9d05a810..8a32eb7d47b9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_82598.c @@ -175,31 +175,9 @@ static s32 ixgbe_init_phy_ops_82598(struct ixgbe_hw *hw) **/ static s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw) { -#ifndef CONFIG_SPARC - u32 regval; - u32 i; -#endif s32 ret_val; ret_val = ixgbe_start_hw_generic(hw); - -#ifndef CONFIG_SPARC - /* Disable relaxed ordering */ - for (i = 0; ((i < hw->mac.max_tx_queues) && - (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) { - regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL(i)); - regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; - IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL(i), regval); - } - - for (i = 0; ((i < hw->mac.max_rx_queues) && - (i < IXGBE_DCA_MAX_QUEUES_82598)); i++) { - regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i)); - regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN | - IXGBE_DCA_RXCTRL_HEAD_WRO_EN); - IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval); - } -#endif if (ret_val) return ret_val; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c index 2c19070d2a0b..9bef255f6a18 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_common.c @@ -366,25 +366,6 @@ s32 ixgbe_start_hw_gen2(struct ixgbe_hw *hw) } IXGBE_WRITE_FLUSH(hw); -#ifndef CONFIG_ARCH_WANT_RELAX_ORDER - /* Disable relaxed ordering */ - for (i = 0; i < hw->mac.max_tx_queues; i++) { - u32 regval; - - regval = IXGBE_READ_REG(hw, IXGBE_DCA_TXCTRL_82599(i)); - regval &= ~IXGBE_DCA_TXCTRL_DESC_WRO_EN; - IXGBE_WRITE_REG(hw, IXGBE_DCA_TXCTRL_82599(i), regval); - } - - for (i = 0; i < hw->mac.max_rx_queues; i++) { - u32 regval; - - regval = IXGBE_READ_REG(hw, IXGBE_DCA_RXCTRL(i)); - regval &= ~(IXGBE_DCA_RXCTRL_DATA_WRO_EN | - IXGBE_DCA_RXCTRL_HEAD_WRO_EN); - IXGBE_WRITE_REG(hw, IXGBE_DCA_RXCTRL(i), regval); - } -#endif return 0; } @@ -3800,10 +3781,10 @@ s32 ixgbe_set_fw_drv_ver_generic(struct ixgbe_hw *hw, u8 maj, u8 min, fw_cmd.ver_build = build; fw_cmd.ver_sub = sub; fw_cmd.hdr.checksum = 0; - fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd, - (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len)); fw_cmd.pad = 0; fw_cmd.pad2 = 0; + fw_cmd.hdr.checksum = ixgbe_calculate_checksum((u8 *)&fw_cmd, + (FW_CEM_HDR_LEN + fw_cmd.hdr.buf_len)); for (i = 0; i <= FW_CEM_MAX_RETRIES; i++) { ret_val = ixgbe_host_interface_command(hw, &fw_cmd, @@ -4100,8 +4081,8 @@ bool ixgbe_mng_present(struct ixgbe_hw *hw) return false; fwsm = IXGBE_READ_REG(hw, IXGBE_FWSM(hw)); - fwsm &= IXGBE_FWSM_MODE_MASK; - return fwsm == IXGBE_FWSM_FW_MODE_PT; + + return !!(fwsm & IXGBE_FWSM_FW_MODE_PT); } /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index 72c565712a5f..0aad1c2a3667 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -104,6 +104,7 @@ static const struct ixgbe_stats ixgbe_gstrings_stats[] = { {"tx_flow_control_xoff", IXGBE_STAT(stats.lxofftxc)}, {"rx_flow_control_xoff", IXGBE_STAT(stats.lxoffrxc)}, {"rx_csum_offload_errors", IXGBE_STAT(hw_csum_rx_error)}, + {"alloc_rx_page", IXGBE_STAT(alloc_rx_page)}, {"alloc_rx_page_failed", IXGBE_STAT(alloc_rx_page_failed)}, {"alloc_rx_buff_failed", IXGBE_STAT(alloc_rx_buff_failed)}, {"rx_no_dma_resources", IXGBE_STAT(hw_rx_no_dma_resources)}, @@ -1048,7 +1049,7 @@ static int ixgbe_set_ringparam(struct net_device *netdev, { struct ixgbe_adapter *adapter = netdev_priv(netdev); struct ixgbe_ring *temp_ring; - int i, err = 0; + int i, j, err = 0; u32 new_rx_count, new_tx_count; if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending)) @@ -1085,8 +1086,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev, } /* allocate temporary buffer to store rings in */ - i = max_t(int, adapter->num_tx_queues, adapter->num_rx_queues); - i = max_t(int, i, adapter->num_xdp_queues); + i = max_t(int, adapter->num_tx_queues + adapter->num_xdp_queues, + adapter->num_rx_queues); temp_ring = vmalloc(i * sizeof(struct ixgbe_ring)); if (!temp_ring) { @@ -1118,8 +1119,8 @@ static int ixgbe_set_ringparam(struct net_device *netdev, } } - for (i = 0; i < adapter->num_xdp_queues; i++) { - memcpy(&temp_ring[i], adapter->xdp_ring[i], + for (j = 0; j < adapter->num_xdp_queues; j++, i++) { + memcpy(&temp_ring[i], adapter->xdp_ring[j], sizeof(struct ixgbe_ring)); temp_ring[i].count = new_tx_count; @@ -1139,10 +1140,10 @@ static int ixgbe_set_ringparam(struct net_device *netdev, memcpy(adapter->tx_ring[i], &temp_ring[i], sizeof(struct ixgbe_ring)); } - for (i = 0; i < adapter->num_xdp_queues; i++) { - ixgbe_free_tx_resources(adapter->xdp_ring[i]); + for (j = 0; j < adapter->num_xdp_queues; j++, i++) { + ixgbe_free_tx_resources(adapter->xdp_ring[j]); - memcpy(adapter->xdp_ring[i], &temp_ring[i], + memcpy(adapter->xdp_ring[j], &temp_ring[i], sizeof(struct ixgbe_ring)); } @@ -1916,8 +1917,6 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring, unsigned int size) { union ixgbe_adv_rx_desc *rx_desc; - struct ixgbe_rx_buffer *rx_buffer; - struct ixgbe_tx_buffer *tx_buffer; u16 rx_ntc, tx_ntc, count = 0; /* initialize next to clean and descriptor values */ @@ -1925,7 +1924,38 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring, tx_ntc = tx_ring->next_to_clean; rx_desc = IXGBE_RX_DESC(rx_ring, rx_ntc); + while (tx_ntc != tx_ring->next_to_use) { + union ixgbe_adv_tx_desc *tx_desc; + struct ixgbe_tx_buffer *tx_buffer; + + tx_desc = IXGBE_TX_DESC(tx_ring, tx_ntc); + + /* if DD is not set transmit has not completed */ + if (!(tx_desc->wb.status & cpu_to_le32(IXGBE_TXD_STAT_DD))) + return count; + + /* unmap buffer on Tx side */ + tx_buffer = &tx_ring->tx_buffer_info[tx_ntc]; + + /* Free all the Tx ring sk_buffs */ + dev_kfree_skb_any(tx_buffer->skb); + + /* unmap skb header data */ + dma_unmap_single(tx_ring->dev, + dma_unmap_addr(tx_buffer, dma), + dma_unmap_len(tx_buffer, len), + DMA_TO_DEVICE); + dma_unmap_len_set(tx_buffer, len, 0); + + /* increment Tx next to clean counter */ + tx_ntc++; + if (tx_ntc == tx_ring->count) + tx_ntc = 0; + } + while (rx_desc->wb.upper.length) { + struct ixgbe_rx_buffer *rx_buffer; + /* check Rx buffer */ rx_buffer = &rx_ring->rx_buffer_info[rx_ntc]; @@ -1938,6 +1968,8 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring, /* verify contents of skb */ if (ixgbe_check_lbtest_frame(rx_buffer, size)) count++; + else + break; /* sync Rx buffer for device write */ dma_sync_single_for_device(rx_ring->dev, @@ -1945,26 +1977,10 @@ static u16 ixgbe_clean_test_rings(struct ixgbe_ring *rx_ring, ixgbe_rx_bufsz(rx_ring), DMA_FROM_DEVICE); - /* unmap buffer on Tx side */ - tx_buffer = &tx_ring->tx_buffer_info[tx_ntc]; - - /* Free all the Tx ring sk_buffs */ - dev_kfree_skb_any(tx_buffer->skb); - - /* unmap skb header data */ - dma_unmap_single(tx_ring->dev, - dma_unmap_addr(tx_buffer, dma), - dma_unmap_len(tx_buffer, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buffer, len, 0); - - /* increment Rx/Tx next to clean counters */ + /* increment Rx next to clean counter */ rx_ntc++; if (rx_ntc == rx_ring->count) rx_ntc = 0; - tx_ntc++; - if (tx_ntc == tx_ring->count) - tx_ntc = 0; /* fetch next descriptor */ rx_desc = IXGBE_RX_DESC(rx_ring, rx_ntc); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c index f1bfae0c41d0..8e2a957aca18 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c @@ -806,6 +806,7 @@ static void ixgbe_add_ring(struct ixgbe_ring *ring, ring->next = head->ring; head->ring = ring; head->count++; + head->next_update = jiffies + 1; } /** @@ -879,8 +880,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, /* initialize work limits */ q_vector->tx.work_limit = adapter->tx_work_limit; - /* initialize pointer to rings */ - ring = q_vector->ring; + /* Initialize setting for adaptive ITR */ + q_vector->tx.itr = IXGBE_ITR_ADAPTIVE_MAX_USECS | + IXGBE_ITR_ADAPTIVE_LATENCY; + q_vector->rx.itr = IXGBE_ITR_ADAPTIVE_MAX_USECS | + IXGBE_ITR_ADAPTIVE_LATENCY; /* intialize ITR */ if (txr_count && !rxr_count) { @@ -897,6 +901,9 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter, q_vector->itr = adapter->rx_itr_setting; } + /* initialize pointer to rings */ + ring = q_vector->ring; + while (txr_count) { /* assign generic ring traits */ ring->dev = &adapter->pdev->dev; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 3942c6208745..7683c14024aa 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1620,6 +1620,7 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring, bi->page = page; bi->page_offset = ixgbe_rx_offset(rx_ring); bi->pagecnt_bias = 1; + rx_ring->rx_stats.alloc_rx_page++; return true; } @@ -2539,50 +2540,174 @@ enum latency_range { static void ixgbe_update_itr(struct ixgbe_q_vector *q_vector, struct ixgbe_ring_container *ring_container) { - int bytes = ring_container->total_bytes; - int packets = ring_container->total_packets; - u32 timepassed_us; - u64 bytes_perint; - u8 itr_setting = ring_container->itr; + unsigned int itr = IXGBE_ITR_ADAPTIVE_MIN_USECS | + IXGBE_ITR_ADAPTIVE_LATENCY; + unsigned int avg_wire_size, packets, bytes; + unsigned long next_update = jiffies; - if (packets == 0) + /* If we don't have any rings just leave ourselves set for maximum + * possible latency so we take ourselves out of the equation. + */ + if (!ring_container->ring) return; - /* simple throttlerate management - * 0-10MB/s lowest (100000 ints/s) - * 10-20MB/s low (20000 ints/s) - * 20-1249MB/s bulk (12000 ints/s) + /* If we didn't update within up to 1 - 2 jiffies we can assume + * that either packets are coming in so slow there hasn't been + * any work, or that there is so much work that NAPI is dealing + * with interrupt moderation and we don't need to do anything. */ - /* what was last interrupt timeslice? */ - timepassed_us = q_vector->itr >> 2; - if (timepassed_us == 0) - return; + if (time_after(next_update, ring_container->next_update)) + goto clear_counts; - bytes_perint = bytes / timepassed_us; /* bytes/usec */ + packets = ring_container->total_packets; - switch (itr_setting) { - case lowest_latency: - if (bytes_perint > 10) - itr_setting = low_latency; - break; - case low_latency: - if (bytes_perint > 20) - itr_setting = bulk_latency; - else if (bytes_perint <= 10) - itr_setting = lowest_latency; + /* We have no packets to actually measure against. This means + * either one of the other queues on this vector is active or + * we are a Tx queue doing TSO with too high of an interrupt rate. + * + * When this occurs just tick up our delay by the minimum value + * and hope that this extra delay will prevent us from being called + * without any work on our queue. + */ + if (!packets) { + itr = (q_vector->itr >> 2) + IXGBE_ITR_ADAPTIVE_MIN_INC; + if (itr > IXGBE_ITR_ADAPTIVE_MAX_USECS) + itr = IXGBE_ITR_ADAPTIVE_MAX_USECS; + itr += ring_container->itr & IXGBE_ITR_ADAPTIVE_LATENCY; + goto clear_counts; + } + + bytes = ring_container->total_bytes; + + /* If packets are less than 4 or bytes are less than 9000 assume + * insufficient data to use bulk rate limiting approach. We are + * likely latency driven. + */ + if (packets < 4 && bytes < 9000) { + itr = IXGBE_ITR_ADAPTIVE_LATENCY; + goto adjust_by_size; + } + + /* Between 4 and 48 we can assume that our current interrupt delay + * is only slightly too low. As such we should increase it by a small + * fixed amount. + */ + if (packets < 48) { + itr = (q_vector->itr >> 2) + IXGBE_ITR_ADAPTIVE_MIN_INC; + if (itr > IXGBE_ITR_ADAPTIVE_MAX_USECS) + itr = IXGBE_ITR_ADAPTIVE_MAX_USECS; + goto clear_counts; + } + + /* Between 48 and 96 is our "goldilocks" zone where we are working + * out "just right". Just report that our current ITR is good for us. + */ + if (packets < 96) { + itr = q_vector->itr >> 2; + goto clear_counts; + } + + /* If packet count is 96 or greater we are likely looking at a slight + * overrun of the delay we want. Try halving our delay to see if that + * will cut the number of packets in half per interrupt. + */ + if (packets < 256) { + itr = q_vector->itr >> 3; + if (itr < IXGBE_ITR_ADAPTIVE_MIN_USECS) + itr = IXGBE_ITR_ADAPTIVE_MIN_USECS; + goto clear_counts; + } + + /* The paths below assume we are dealing with a bulk ITR since number + * of packets is 256 or greater. We are just going to have to compute + * a value and try to bring the count under control, though for smaller + * packet sizes there isn't much we can do as NAPI polling will likely + * be kicking in sooner rather than later. + */ + itr = IXGBE_ITR_ADAPTIVE_BULK; + +adjust_by_size: + /* If packet counts are 256 or greater we can assume we have a gross + * overestimation of what the rate should be. Instead of trying to fine + * tune it just use the formula below to try and dial in an exact value + * give the current packet size of the frame. + */ + avg_wire_size = bytes / packets; + + /* The following is a crude approximation of: + * wmem_default / (size + overhead) = desired_pkts_per_int + * rate / bits_per_byte / (size + ethernet overhead) = pkt_rate + * (desired_pkt_rate / pkt_rate) * usecs_per_sec = ITR value + * + * Assuming wmem_default is 212992 and overhead is 640 bytes per + * packet, (256 skb, 64 headroom, 320 shared info), we can reduce the + * formula down to + * + * (170 * (size + 24)) / (size + 640) = ITR + * + * We first do some math on the packet size and then finally bitshift + * by 8 after rounding up. We also have to account for PCIe link speed + * difference as ITR scales based on this. + */ + if (avg_wire_size <= 60) { + /* Start at 50k ints/sec */ + avg_wire_size = 5120; + } else if (avg_wire_size <= 316) { + /* 50K ints/sec to 16K ints/sec */ + avg_wire_size *= 40; + avg_wire_size += 2720; + } else if (avg_wire_size <= 1084) { + /* 16K ints/sec to 9.2K ints/sec */ + avg_wire_size *= 15; + avg_wire_size += 11452; + } else if (avg_wire_size <= 1980) { + /* 9.2K ints/sec to 8K ints/sec */ + avg_wire_size *= 5; + avg_wire_size += 22420; + } else { + /* plateau at a limit of 8K ints/sec */ + avg_wire_size = 32256; + } + + /* If we are in low latency mode half our delay which doubles the rate + * to somewhere between 100K to 16K ints/sec + */ + if (itr & IXGBE_ITR_ADAPTIVE_LATENCY) + avg_wire_size >>= 1; + + /* Resultant value is 256 times larger than it needs to be. This + * gives us room to adjust the value as needed to either increase + * or decrease the value based on link speeds of 10G, 2.5G, 1G, etc. + * + * Use addition as we have already recorded the new latency flag + * for the ITR value. + */ + switch (q_vector->adapter->link_speed) { + case IXGBE_LINK_SPEED_10GB_FULL: + case IXGBE_LINK_SPEED_100_FULL: + default: + itr += DIV_ROUND_UP(avg_wire_size, + IXGBE_ITR_ADAPTIVE_MIN_INC * 256) * + IXGBE_ITR_ADAPTIVE_MIN_INC; break; - case bulk_latency: - if (bytes_perint <= 20) - itr_setting = low_latency; + case IXGBE_LINK_SPEED_2_5GB_FULL: + case IXGBE_LINK_SPEED_1GB_FULL: + case IXGBE_LINK_SPEED_10_FULL: + itr += DIV_ROUND_UP(avg_wire_size, + IXGBE_ITR_ADAPTIVE_MIN_INC * 64) * + IXGBE_ITR_ADAPTIVE_MIN_INC; break; } - /* clear work counters since we have the values we need */ +clear_counts: + /* write back value */ + ring_container->itr = itr; + + /* next update should occur within next jiffy */ + ring_container->next_update = next_update + 1; + ring_container->total_bytes = 0; ring_container->total_packets = 0; - - /* write updated itr to ring container */ - ring_container->itr = itr_setting; } /** @@ -2624,34 +2749,19 @@ void ixgbe_write_eitr(struct ixgbe_q_vector *q_vector) static void ixgbe_set_itr(struct ixgbe_q_vector *q_vector) { - u32 new_itr = q_vector->itr; - u8 current_itr; + u32 new_itr; ixgbe_update_itr(q_vector, &q_vector->tx); ixgbe_update_itr(q_vector, &q_vector->rx); - current_itr = max(q_vector->rx.itr, q_vector->tx.itr); + /* use the smallest value of new ITR delay calculations */ + new_itr = min(q_vector->rx.itr, q_vector->tx.itr); - switch (current_itr) { - /* counts and packets in update_itr are dependent on these numbers */ - case lowest_latency: - new_itr = IXGBE_100K_ITR; - break; - case low_latency: - new_itr = IXGBE_20K_ITR; - break; - case bulk_latency: - new_itr = IXGBE_12K_ITR; - break; - default: - break; - } + /* Clear latency flag if set, shift into correct position */ + new_itr &= ~IXGBE_ITR_ADAPTIVE_LATENCY; + new_itr <<= 2; if (new_itr != q_vector->itr) { - /* do an exponential smoothing */ - new_itr = (10 * new_itr * q_vector->itr) / - ((9 * new_itr) + q_vector->itr); - /* save the algorithm value here */ q_vector->itr = new_itr; @@ -4904,7 +5014,7 @@ static void ixgbe_clear_udp_tunnel_port(struct ixgbe_adapter *adapter, u32 mask) IXGBE_FLAG_GENEVE_OFFLOAD_CAPABLE))) return; - vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) && ~mask; + vxlanctrl = IXGBE_READ_REG(hw, IXGBE_VXLANCTRL) & ~mask; IXGBE_WRITE_REG(hw, IXGBE_VXLANCTRL, vxlanctrl); if (mask & IXGBE_VXLANCTRL_VXLAN_UDPPORT_MASK) @@ -6794,6 +6904,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) u32 i, missed_rx = 0, mpc, bprc, lxon, lxoff, xon_off_tot; u64 non_eop_descs = 0, restart_queue = 0, tx_busy = 0; u64 alloc_rx_page_failed = 0, alloc_rx_buff_failed = 0; + u64 alloc_rx_page = 0; u64 bytes = 0, packets = 0, hw_csum_rx_error = 0; if (test_bit(__IXGBE_DOWN, &adapter->state) || @@ -6814,6 +6925,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) for (i = 0; i < adapter->num_rx_queues; i++) { struct ixgbe_ring *rx_ring = adapter->rx_ring[i]; non_eop_descs += rx_ring->rx_stats.non_eop_descs; + alloc_rx_page += rx_ring->rx_stats.alloc_rx_page; alloc_rx_page_failed += rx_ring->rx_stats.alloc_rx_page_failed; alloc_rx_buff_failed += rx_ring->rx_stats.alloc_rx_buff_failed; hw_csum_rx_error += rx_ring->rx_stats.csum_err; @@ -6821,6 +6933,7 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) packets += rx_ring->stats.packets; } adapter->non_eop_descs = non_eop_descs; + adapter->alloc_rx_page = alloc_rx_page; adapter->alloc_rx_page_failed = alloc_rx_page_failed; adapter->alloc_rx_buff_failed = alloc_rx_buff_failed; adapter->hw_csum_rx_error = hw_csum_rx_error; @@ -8552,6 +8665,10 @@ static int ixgbe_ioctl(struct net_device *netdev, struct ifreq *req, int cmd) return ixgbe_ptp_set_ts_config(adapter, req); case SIOCGHWTSTAMP: return ixgbe_ptp_get_ts_config(adapter, req); + case SIOCGMIIPHY: + if (!adapter->hw.phy.ops.read_reg) + return -EOPNOTSUPP; + /* fall through */ default: return mdio_mii_ioctl(&adapter->hw.phy.mdio, if_mii(req), cmd); } @@ -9758,6 +9875,17 @@ static void ixgbe_fwd_del(struct net_device *pdev, void *priv) limit = find_last_bit(&adapter->fwd_bitmask, 32); adapter->ring_feature[RING_F_VMDQ].limit = limit + 1; ixgbe_fwd_ring_down(fwd_adapter->netdev, fwd_adapter); + + /* go back to full RSS if we're done with our VMQs */ + if (adapter->ring_feature[RING_F_VMDQ].limit == 1) { + int rss = min_t(int, ixgbe_max_rss_indices(adapter), + num_online_cpus()); + + adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED; + adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED; + adapter->ring_feature[RING_F_RSS].limit = rss; + } + ixgbe_setup_tc(pdev, netdev_get_num_tc(pdev)); netdev_dbg(pdev, "pool %i:%i queues %i:%i VSI bitmask %lx\n", fwd_adapter->pool, adapter->num_rx_pools, @@ -10737,6 +10865,9 @@ skip_bad_vf_detection: if (!test_bit(__IXGBE_SERVICE_INITED, &adapter->state)) return PCI_ERS_RESULT_DISCONNECT; + if (!netif_device_present(netdev)) + return PCI_ERS_RESULT_DISCONNECT; + rtnl_lock(); netif_device_detach(netdev); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c index 6ea0d6a5fb90..b8c5fd2a2115 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x540.c @@ -619,12 +619,6 @@ s32 ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask) usleep_range(5000, 10000); } - /* Failed to get SW only semaphore */ - if (swmask == IXGBE_GSSR_SW_MNG_SM) { - hw_dbg(hw, "Failed to get SW only semaphore\n"); - return IXGBE_ERR_SWFW_SYNC; - } - /* If the resource is not released by the FW/HW the SW can assume that * the FW/HW malfunctions. In that case the SW should set the SW bit(s) * of the requested resource(s) while ignoring the corresponding FW/HW @@ -647,7 +641,8 @@ s32 ixgbe_acquire_swfw_sync_X540(struct ixgbe_hw *hw, u32 mask) */ if (swfw_sync & swmask) { u32 rmask = IXGBE_GSSR_EEP_SM | IXGBE_GSSR_PHY0_SM | - IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM; + IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM | + IXGBE_GSSR_SW_MNG_SM; if (swi2c_mask) rmask |= IXGBE_GSSR_I2C_MASK; @@ -763,6 +758,8 @@ static void ixgbe_release_swfw_sync_semaphore(struct ixgbe_hw *hw) **/ void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw) { + u32 rmask; + /* First try to grab the semaphore but we don't need to bother * looking to see whether we got the lock or not since we do * the same thing regardless of whether we got the lock or not. @@ -771,6 +768,14 @@ void ixgbe_init_swfw_sync_X540(struct ixgbe_hw *hw) */ ixgbe_get_swfw_sync_semaphore(hw); ixgbe_release_swfw_sync_semaphore(hw); + + /* Acquire and release all software resources. */ + rmask = IXGBE_GSSR_EEP_SM | IXGBE_GSSR_PHY0_SM | + IXGBE_GSSR_PHY1_SM | IXGBE_GSSR_MAC_CSR_SM | + IXGBE_GSSR_SW_MNG_SM | IXGBE_GSSR_I2C_MASK; + + ixgbe_acquire_swfw_sync_X540(hw, rmask); + ixgbe_release_swfw_sync_X540(hw, rmask); } /** diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c index 19fbb2f28ea4..cb7da5f9c4da 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_x550.c @@ -900,6 +900,8 @@ static s32 ixgbe_read_ee_hostif_buffer_X550(struct ixgbe_hw *hw, /* convert offset from words to bytes */ buffer.address = cpu_to_be32((offset + current_word) * 2); buffer.length = cpu_to_be16(words_to_read * 2); + buffer.pad2 = 0; + buffer.pad3 = 0; status = ixgbe_hic_unlocked(hw, (u32 *)&buffer, sizeof(buffer), IXGBE_HI_COMMAND_TIMEOUT); @@ -3192,6 +3194,9 @@ static s32 ixgbe_init_phy_ops_X550em(struct ixgbe_hw *hw) /* Identify the PHY or SFP module */ ret_val = phy->ops.identify(hw); + if (ret_val == IXGBE_ERR_SFP_NOT_SUPPORTED || + ret_val == IXGBE_ERR_PHY_ADDR_INVALID) + return ret_val; /* Setup function pointers based on detected hardware */ ixgbe_init_mac_link_ops_X550em(hw); @@ -3394,9 +3399,10 @@ static s32 ixgbe_reset_hw_X550em(struct ixgbe_hw *hw) ixgbe_clear_tx_pending(hw); /* PHY ops must be identified and initialized prior to reset */ - - /* Identify PHY and related function pointers */ status = hw->phy.ops.init(hw); + if (status == IXGBE_ERR_SFP_NOT_SUPPORTED || + status == IXGBE_ERR_PHY_ADDR_INVALID) + return status; /* start the external PHY */ if (hw->phy.type == ixgbe_phy_x550em_ext_t) { @@ -3884,7 +3890,7 @@ static const struct ixgbe_mac_operations mac_ops_X550EM_x_fw = { .write_iosf_sb_reg = ixgbe_write_iosf_sb_reg_x550, }; -static struct ixgbe_mac_operations mac_ops_x550em_a = { +static const struct ixgbe_mac_operations mac_ops_x550em_a = { X550_COMMON_MAC .led_on = ixgbe_led_on_t_x550em, .led_off = ixgbe_led_off_t_x550em, @@ -3905,7 +3911,7 @@ static struct ixgbe_mac_operations mac_ops_x550em_a = { .write_iosf_sb_reg = ixgbe_write_iosf_sb_reg_x550a, }; -static struct ixgbe_mac_operations mac_ops_x550em_a_fw = { +static const struct ixgbe_mac_operations mac_ops_x550em_a_fw = { X550_COMMON_MAC .led_on = ixgbe_led_on_generic, .led_off = ixgbe_led_off_generic, diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 3d4e4a5d00d1..bf1f04164885 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1742,13 +1742,18 @@ static int mlx4_en_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) return err; } +static int mlx4_en_get_max_num_rx_rings(struct net_device *dev) +{ + return min_t(int, num_online_cpus(), MAX_RX_RINGS); +} + static void mlx4_en_get_channels(struct net_device *dev, struct ethtool_channels *channel) { struct mlx4_en_priv *priv = netdev_priv(dev); - channel->max_rx = MAX_RX_RINGS; - channel->max_tx = MLX4_EN_MAX_TX_RING_P_UP; + channel->max_rx = mlx4_en_get_max_num_rx_rings(dev); + channel->max_tx = priv->mdev->profile.max_num_tx_rings_p_up; channel->rx_count = priv->rx_ring_num; channel->tx_count = priv->tx_ring_num[TX] / @@ -1777,7 +1782,7 @@ static int mlx4_en_set_channels(struct net_device *dev, mutex_lock(&mdev->state_lock); xdp_count = priv->tx_ring_num[TX_XDP] ? channel->rx_count : 0; if (channel->tx_count * priv->prof->num_up + xdp_count > - MAX_TX_RINGS) { + priv->mdev->profile.max_num_tx_rings_p_up * priv->prof->num_up) { err = -EINVAL; en_err(priv, "Total number of TX and XDP rings (%d) exceeds the maximum supported (%d)\n", diff --git a/drivers/net/ethernet/mellanox/mlx4/en_main.c b/drivers/net/ethernet/mellanox/mlx4/en_main.c index 686e18de9a97..2c2965497ed3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_main.c @@ -153,7 +153,7 @@ static void mlx4_en_get_profile(struct mlx4_en_dev *mdev) int i; params->udp_rss = udp_rss; - params->num_tx_rings_p_up = mlx4_low_memory_profile() ? + params->max_num_tx_rings_p_up = mlx4_low_memory_profile() ? MLX4_EN_MIN_TX_RING_P_UP : min_t(int, num_online_cpus(), MLX4_EN_MAX_TX_RING_P_UP); @@ -170,8 +170,8 @@ static void mlx4_en_get_profile(struct mlx4_en_dev *mdev) params->prof[i].tx_ring_size = MLX4_EN_DEF_TX_RING_SIZE; params->prof[i].rx_ring_size = MLX4_EN_DEF_RX_RING_SIZE; params->prof[i].num_up = MLX4_EN_NUM_UP_LOW; - params->prof[i].num_tx_rings_p_up = params->num_tx_rings_p_up; - params->prof[i].tx_ring_num[TX] = params->num_tx_rings_p_up * + params->prof[i].num_tx_rings_p_up = params->max_num_tx_rings_p_up; + params->prof[i].tx_ring_num[TX] = params->max_num_tx_rings_p_up * params->prof[i].num_up; params->prof[i].rss_rings = 0; params->prof[i].inline_thold = inline_thold; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 9c218f1cfc6c..d611df2f274d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -1752,6 +1752,7 @@ int mlx4_en_start_port(struct net_device *dev) mlx4_en_arm_cq(priv, cq); } else { + mlx4_en_init_tx_xdp_ring_descs(priv, tx_ring); mlx4_en_init_recycle_ring(priv, i); /* XDP TX CQ should never be armed */ } @@ -3305,7 +3306,7 @@ int mlx4_en_init_netdev(struct mlx4_en_dev *mdev, int port, priv->pflags = MLX4_EN_PRIV_FLAGS_BLUEFLAME; priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE | MLX4_WQE_CTRL_SOLICITED); - priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up; + priv->num_tx_rings_p_up = mdev->profile.max_num_tx_rings_p_up; priv->tx_work_limit = MLX4_EN_DEFAULT_TX_WORK; netdev_rss_key_fill(priv->rss_key, sizeof(priv->rss_key)); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_resources.c b/drivers/net/ethernet/mellanox/mlx4/en_resources.c index 5a47f9669621..6883ac75d37f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_resources.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_resources.c @@ -53,7 +53,7 @@ void mlx4_en_fill_qp_context(struct mlx4_en_priv *priv, int size, int stride, if (is_tx) { context->sq_size_stride = ilog2(size) << 3 | (ilog2(stride) - 4); if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_PORT_REMAP) - context->params2 |= MLX4_QP_BIT_FPP; + context->params2 |= cpu_to_be32(MLX4_QP_BIT_FPP); } else { context->sq_size_stride = ilog2(TXBB_SIZE) - 4; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 8f9cb8abc497..92aec17f4b4d 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -254,8 +254,7 @@ void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev) DEF_RX_RINGS)); num_rx_rings = mlx4_low_memory_profile() ? MIN_RX_RINGS : - min_t(int, num_of_eqs, - netif_get_num_default_rss_queues()); + min_t(int, num_of_eqs, num_online_cpus()); mdev->profile.prof[i].rx_ring_num = rounddown_pow_of_two(num_rx_rings); } @@ -779,7 +778,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud case XDP_PASS: break; case XDP_TX: - if (likely(!mlx4_en_xmit_frame(ring, frags, dev, + if (likely(!mlx4_en_xmit_frame(ring, frags, priv, length, cq_ring, &doorbell_pending))) { frags[0].page = NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 8a32a8f7f9c0..596445a4a241 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -718,7 +718,7 @@ void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring) #else iowrite32be( #endif - ring->doorbell_qpn, + (__force u32)ring->doorbell_qpn, ring->bf.uar->map + MLX4_SEND_DOORBELL); } @@ -1085,13 +1085,35 @@ tx_drop: #define MLX4_EN_XDP_TX_REAL_SZ (((CTRL_SIZE + MLX4_EN_XDP_TX_NRTXBB * DS_SIZE) \ / 16) & 0x3f) +void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring) +{ + int i; + + for (i = 0; i < ring->size; i++) { + struct mlx4_en_tx_info *tx_info = &ring->tx_info[i]; + struct mlx4_en_tx_desc *tx_desc = ring->buf + + (i << LOG_TXBB_SIZE); + + tx_info->map0_byte_count = PAGE_SIZE; + tx_info->nr_txbb = MLX4_EN_XDP_TX_NRTXBB; + tx_info->data_offset = offsetof(struct mlx4_en_tx_desc, data); + tx_info->ts_requested = 0; + tx_info->nr_maps = 1; + tx_info->linear = 1; + tx_info->inl = 0; + + tx_desc->data.lkey = ring->mr_key; + tx_desc->ctrl.qpn_vlan.fence_size = MLX4_EN_XDP_TX_REAL_SZ; + tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; + } +} + netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, struct mlx4_en_rx_alloc *frame, - struct net_device *dev, unsigned int length, + struct mlx4_en_priv *priv, unsigned int length, int tx_ind, bool *doorbell_pending) { - struct mlx4_en_priv *priv = netdev_priv(dev); - union mlx4_wqe_qpn_vlan qpn_vlan = {}; struct mlx4_en_tx_desc *tx_desc; struct mlx4_en_tx_info *tx_info; struct mlx4_wqe_data_seg *data; @@ -1123,25 +1145,16 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, tx_info->page = frame->page; frame->page = NULL; tx_info->map0_dma = dma; - tx_info->map0_byte_count = PAGE_SIZE; - tx_info->nr_txbb = MLX4_EN_XDP_TX_NRTXBB; tx_info->nr_bytes = max_t(unsigned int, length, ETH_ZLEN); - tx_info->data_offset = offsetof(struct mlx4_en_tx_desc, data); - tx_info->ts_requested = 0; - tx_info->nr_maps = 1; - tx_info->linear = 1; - tx_info->inl = 0; dma_sync_single_range_for_device(priv->ddev, dma, frame->page_offset, length, PCI_DMA_TODEVICE); data->addr = cpu_to_be64(dma + frame->page_offset); - data->lkey = ring->mr_key; dma_wmb(); data->byte_count = cpu_to_be32(length); /* tx completion can avoid cache line miss for common cases */ - tx_desc->ctrl.srcrb_flags = priv->ctrl_flags; op_own = cpu_to_be32(MLX4_OPCODE_SEND) | ((ring->prod & ring->size) ? @@ -1152,10 +1165,13 @@ netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, ring->prod += MLX4_EN_XDP_TX_NRTXBB; - qpn_vlan.fence_size = MLX4_EN_XDP_TX_REAL_SZ; + /* Ensure new descriptor hits memory + * before setting ownership of this descriptor to HW + */ + dma_wmb(); + tx_desc->ctrl.owner_opcode = op_own; + ring->xmit_more++; - mlx4_en_tx_write_desc(ring, tx_desc, qpn_vlan, TXBB_SIZE, 0, - op_own, false, false); *doorbell_pending = true; return NETDEV_TX_OK; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 16c09949afd5..634f603f941c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -57,12 +57,12 @@ MODULE_PARM_DESC(enable_qos, "Enable Enhanced QoS support (default: off)"); #define MLX4_GET(dest, source, offset) \ do { \ void *__p = (char *) (source) + (offset); \ - u64 val; \ - switch (sizeof(dest)) { \ + __be64 val; \ + switch (sizeof(dest)) { \ case 1: (dest) = *(u8 *) __p; break; \ case 2: (dest) = be16_to_cpup(__p); break; \ case 4: (dest) = be32_to_cpup(__p); break; \ - case 8: val = get_unaligned((u64 *)__p); \ + case 8: val = get_unaligned((__be64 *)__p); \ (dest) = be64_to_cpu(val); break; \ default: __buggy_use_of_MLX4_GET(); \ } \ diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h index fdb3ad0cbe54..1856e279a7e0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h @@ -399,7 +399,7 @@ struct mlx4_en_profile { u32 active_ports; u32 small_pkt_int; u8 no_reset; - u8 num_tx_rings_p_up; + u8 max_num_tx_rings_p_up; struct mlx4_en_port_profile prof[MLX4_MAX_PORTS + 1]; }; @@ -693,7 +693,7 @@ u16 mlx4_en_select_queue(struct net_device *dev, struct sk_buff *skb, netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t mlx4_en_xmit_frame(struct mlx4_en_rx_ring *rx_ring, struct mlx4_en_rx_alloc *frame, - struct net_device *dev, unsigned int length, + struct mlx4_en_priv *priv, unsigned int length, int tx_ind, bool *doorbell_pending); void mlx4_en_xmit_doorbell(struct mlx4_en_tx_ring *ring); bool mlx4_en_rx_recycle(struct mlx4_en_rx_ring *ring, @@ -705,6 +705,8 @@ int mlx4_en_create_tx_ring(struct mlx4_en_priv *priv, int node, int queue_index); void mlx4_en_destroy_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring **pring); +void mlx4_en_init_tx_xdp_ring_descs(struct mlx4_en_priv *priv, + struct mlx4_en_tx_ring *ring); int mlx4_en_activate_tx_ring(struct mlx4_en_priv *priv, struct mlx4_en_tx_ring *ring, int cq, int user_prio); diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 728a2fb1f5c0..203320923340 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -925,7 +925,7 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, context->flags &= cpu_to_be32(~(0xf << 28)); context->flags |= cpu_to_be32(states[i + 1] << 28); if (states[i + 1] != MLX4_QP_STATE_RTR) - context->params2 &= ~MLX4_QP_BIT_FPP; + context->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP); err = mlx4_qp_modify(dev, mtt, states[i], states[i + 1], context, 0, 0, qp); if (err) { diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index fabb53379727..04304dd894c6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -3185,7 +3185,7 @@ static int verify_qp_parameters(struct mlx4_dev *dev, optpar = be32_to_cpu(*(__be32 *) inbox->buf); if (slave != mlx4_master_func_num(dev)) { - qp_ctx->params2 &= ~MLX4_QP_BIT_FPP; + qp_ctx->params2 &= ~cpu_to_be32(MLX4_QP_BIT_FPP); /* setting QP rate-limit is disallowed for VFs */ if (qp_ctx->rate_limit_params) return -EPERM; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index cc13d3dbd366..5ec6d3e8dc89 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -1081,6 +1081,9 @@ int mlx5e_ethtool_get_ts_info(struct mlx5e_priv *priv, int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, struct ethtool_flash *flash); +int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data); + /* mlx5e generic netdev management API */ struct net_device* mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index cc11bbbd0309..2a32102e7648 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -3108,8 +3108,8 @@ static int mlx5e_setup_tc_cls_flower(struct net_device *dev, } #endif -static int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, - void *type_data) +int mlx5e_setup_tc(struct net_device *dev, enum tc_setup_type type, + void *type_data) { switch (type) { #ifdef CONFIG_MLX5_ESWITCH diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 45e03c427faf..765fc74fbb1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -34,6 +34,7 @@ #include <linux/mlx5/fs.h> #include <net/switchdev.h> #include <net/pkt_cls.h> +#include <net/act_api.h> #include <net/netevent.h> #include <net/arp.h> @@ -667,14 +668,6 @@ mlx5e_rep_setup_tc_cls_flower(struct net_device *dev, cls_flower->common.chain_index) return -EOPNOTSUPP; - if (cls_flower->egress_dev) { - struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; - - dev = mlx5_eswitch_get_uplink_netdev(esw); - return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - cls_flower); - } - switch (cls_flower->command) { case TC_CLSFLOWER_REPLACE: return mlx5e_configure_flower(priv, cls_flower); @@ -698,6 +691,14 @@ static int mlx5e_rep_setup_tc(struct net_device *dev, enum tc_setup_type type, } } +static int mlx5e_rep_setup_tc_cb(enum tc_setup_type type, void *type_data, + void *cb_priv) +{ + struct net_device *dev = cb_priv; + + return mlx5e_setup_tc(dev, type, type_data); +} + bool mlx5e_is_uplink_rep(struct mlx5e_priv *priv) { struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; @@ -1017,15 +1018,24 @@ mlx5e_vport_rep_load(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep) goto err_detach_netdev; } + err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb, + mlx5_eswitch_get_uplink_netdev(esw)); + if (err) + goto err_neigh_cleanup; + err = register_netdev(netdev); if (err) { pr_warn("Failed to register representor netdev for vport %d\n", rep->vport); - goto err_neigh_cleanup; + goto err_egdev_cleanup; } return 0; +err_egdev_cleanup: + tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb, + mlx5_eswitch_get_uplink_netdev(esw)); + err_neigh_cleanup: mlx5e_rep_neigh_cleanup(rpriv); @@ -1047,7 +1057,8 @@ mlx5e_vport_rep_unload(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep) void *ppriv = priv->ppriv; unregister_netdev(rep->netdev); - + tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb, + mlx5_eswitch_get_uplink_netdev(esw)); mlx5e_rep_neigh_cleanup(rpriv); mlx5e_detach_netdev(priv); mlx5e_destroy_netdev(priv); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 5a7bea688ec8..7a136ae2547a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -145,10 +145,10 @@ static struct init_tree_node { } }; -enum fs_i_mutex_lock_class { - FS_MUTEX_GRANDPARENT, - FS_MUTEX_PARENT, - FS_MUTEX_CHILD +enum fs_i_lock_class { + FS_LOCK_GRANDPARENT, + FS_LOCK_PARENT, + FS_LOCK_CHILD }; static const struct rhashtable_params rhash_fte = { @@ -168,10 +168,16 @@ static const struct rhashtable_params rhash_fg = { }; -static void del_rule(struct fs_node *node); -static void del_flow_table(struct fs_node *node); -static void del_flow_group(struct fs_node *node); -static void del_fte(struct fs_node *node); +static void del_hw_flow_table(struct fs_node *node); +static void del_hw_flow_group(struct fs_node *node); +static void del_hw_fte(struct fs_node *node); +static void del_sw_flow_table(struct fs_node *node); +static void del_sw_flow_group(struct fs_node *node); +static void del_sw_fte(struct fs_node *node); +/* Delete rule (destination) is special case that + * requires to lock the FTE for all the deletion process. + */ +static void del_sw_hw_rule(struct fs_node *node); static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, struct mlx5_flow_destination *d2); static struct mlx5_flow_rule * @@ -179,14 +185,16 @@ find_flow_rule(struct fs_fte *fte, struct mlx5_flow_destination *dest); static void tree_init_node(struct fs_node *node, - unsigned int refcount, - void (*remove_func)(struct fs_node *)) + void (*del_hw_func)(struct fs_node *), + void (*del_sw_func)(struct fs_node *)) { - atomic_set(&node->refcount, refcount); + atomic_set(&node->refcount, 1); INIT_LIST_HEAD(&node->list); INIT_LIST_HEAD(&node->children); - mutex_init(&node->lock); - node->remove_func = remove_func; + init_rwsem(&node->lock); + node->del_hw_func = del_hw_func; + node->del_sw_func = del_sw_func; + node->active = false; } static void tree_add_node(struct fs_node *node, struct fs_node *parent) @@ -202,50 +210,70 @@ static void tree_add_node(struct fs_node *node, struct fs_node *parent) node->root = parent->root; } -static void tree_get_node(struct fs_node *node) +static int tree_get_node(struct fs_node *node) { - atomic_inc(&node->refcount); + return atomic_add_unless(&node->refcount, 1, 0); } -static void nested_lock_ref_node(struct fs_node *node, - enum fs_i_mutex_lock_class class) +static void nested_down_read_ref_node(struct fs_node *node, + enum fs_i_lock_class class) { if (node) { - mutex_lock_nested(&node->lock, class); + down_read_nested(&node->lock, class); atomic_inc(&node->refcount); } } -static void lock_ref_node(struct fs_node *node) +static void nested_down_write_ref_node(struct fs_node *node, + enum fs_i_lock_class class) { if (node) { - mutex_lock(&node->lock); + down_write_nested(&node->lock, class); atomic_inc(&node->refcount); } } -static void unlock_ref_node(struct fs_node *node) +static void down_write_ref_node(struct fs_node *node) { if (node) { - atomic_dec(&node->refcount); - mutex_unlock(&node->lock); + down_write(&node->lock); + atomic_inc(&node->refcount); } } +static void up_read_ref_node(struct fs_node *node) +{ + atomic_dec(&node->refcount); + up_read(&node->lock); +} + +static void up_write_ref_node(struct fs_node *node) +{ + atomic_dec(&node->refcount); + up_write(&node->lock); +} + static void tree_put_node(struct fs_node *node) { struct fs_node *parent_node = node->parent; - lock_ref_node(parent_node); if (atomic_dec_and_test(&node->refcount)) { - if (parent_node) + if (node->del_hw_func) + node->del_hw_func(node); + if (parent_node) { + /* Only root namespace doesn't have parent and we just + * need to free its node. + */ + down_write_ref_node(parent_node); list_del_init(&node->list); - if (node->remove_func) - node->remove_func(node); - kfree(node); + if (node->del_sw_func) + node->del_sw_func(node); + up_write_ref_node(parent_node); + } else { + kfree(node); + } node = NULL; } - unlock_ref_node(parent_node); if (!node && parent_node) tree_put_node(parent_node); } @@ -362,6 +390,15 @@ static struct mlx5_flow_root_namespace *find_root(struct fs_node *node) return container_of(ns, struct mlx5_flow_root_namespace, ns); } +static inline struct mlx5_flow_steering *get_steering(struct fs_node *node) +{ + struct mlx5_flow_root_namespace *root = find_root(node); + + if (root) + return root->dev->priv.steering; + return NULL; +} + static inline struct mlx5_core_dev *get_dev(struct fs_node *node) { struct mlx5_flow_root_namespace *root = find_root(node); @@ -371,26 +408,36 @@ static inline struct mlx5_core_dev *get_dev(struct fs_node *node) return NULL; } -static void del_flow_table(struct fs_node *node) +static void del_hw_flow_table(struct fs_node *node) { struct mlx5_flow_table *ft; struct mlx5_core_dev *dev; - struct fs_prio *prio; int err; fs_get_obj(ft, node); dev = get_dev(&ft->node); - err = mlx5_cmd_destroy_flow_table(dev, ft); - if (err) - mlx5_core_warn(dev, "flow steering can't destroy ft\n"); - ida_destroy(&ft->fte_allocator); + if (node->active) { + err = mlx5_cmd_destroy_flow_table(dev, ft); + if (err) + mlx5_core_warn(dev, "flow steering can't destroy ft\n"); + } +} + +static void del_sw_flow_table(struct fs_node *node) +{ + struct mlx5_flow_table *ft; + struct fs_prio *prio; + + fs_get_obj(ft, node); + rhltable_destroy(&ft->fgs_hash); fs_get_obj(prio, ft->node.parent); prio->num_ft--; + kfree(ft); } -static void del_rule(struct fs_node *node) +static void del_sw_hw_rule(struct fs_node *node) { struct mlx5_flow_rule *rule; struct mlx5_flow_table *ft; @@ -406,7 +453,6 @@ static void del_rule(struct fs_node *node) fs_get_obj(fg, fte->node.parent); fs_get_obj(ft, fg->node.parent); trace_mlx5_fs_del_rule(rule); - list_del(&rule->node.list); if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) { mutex_lock(&rule->dest_attr.ft->lock); list_del(&rule->next_ft); @@ -434,117 +480,203 @@ out: "%s can't del rule fg id=%d fte_index=%d\n", __func__, fg->id, fte->index); } + kfree(rule); } -static void destroy_fte(struct fs_fte *fte, struct mlx5_flow_group *fg) +static void del_hw_fte(struct fs_node *node) { struct mlx5_flow_table *ft; - int ret; + struct mlx5_flow_group *fg; + struct mlx5_core_dev *dev; + struct fs_fte *fte; + int err; - ret = rhashtable_remove_fast(&fg->ftes_hash, &fte->hash, rhash_fte); - WARN_ON(ret); - fte->status = 0; + fs_get_obj(fte, node); + fs_get_obj(fg, fte->node.parent); fs_get_obj(ft, fg->node.parent); - ida_simple_remove(&ft->fte_allocator, fte->index); + + trace_mlx5_fs_del_fte(fte); + dev = get_dev(&ft->node); + if (node->active) { + err = mlx5_cmd_delete_fte(dev, ft, + fte->index); + if (err) + mlx5_core_warn(dev, + "flow steering can't delete fte in index %d of flow group id %d\n", + fte->index, fg->id); + } } -static void del_fte(struct fs_node *node) +static void del_sw_fte(struct fs_node *node) { - struct mlx5_flow_table *ft; + struct mlx5_flow_steering *steering = get_steering(node); struct mlx5_flow_group *fg; - struct mlx5_core_dev *dev; struct fs_fte *fte; int err; fs_get_obj(fte, node); fs_get_obj(fg, fte->node.parent); - fs_get_obj(ft, fg->node.parent); - trace_mlx5_fs_del_fte(fte); - - dev = get_dev(&ft->node); - err = mlx5_cmd_delete_fte(dev, ft, - fte->index); - if (err) - mlx5_core_warn(dev, - "flow steering can't delete fte in index %d of flow group id %d\n", - fte->index, fg->id); - destroy_fte(fte, fg); + err = rhashtable_remove_fast(&fg->ftes_hash, + &fte->hash, + rhash_fte); + WARN_ON(err); + ida_simple_remove(&fg->fte_allocator, fte->index - fg->start_index); + kmem_cache_free(steering->ftes_cache, fte); } -static void del_flow_group(struct fs_node *node) +static void del_hw_flow_group(struct fs_node *node) { struct mlx5_flow_group *fg; struct mlx5_flow_table *ft; struct mlx5_core_dev *dev; - int err; fs_get_obj(fg, node); fs_get_obj(ft, fg->node.parent); dev = get_dev(&ft->node); trace_mlx5_fs_del_fg(fg); - if (ft->autogroup.active) - ft->autogroup.num_groups--; + if (fg->node.active && mlx5_cmd_destroy_flow_group(dev, ft, fg->id)) + mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n", + fg->id, ft->id); +} + +static void del_sw_flow_group(struct fs_node *node) +{ + struct mlx5_flow_steering *steering = get_steering(node); + struct mlx5_flow_group *fg; + struct mlx5_flow_table *ft; + int err; + + fs_get_obj(fg, node); + fs_get_obj(ft, fg->node.parent); rhashtable_destroy(&fg->ftes_hash); + ida_destroy(&fg->fte_allocator); + if (ft->autogroup.active) + ft->autogroup.num_groups--; err = rhltable_remove(&ft->fgs_hash, &fg->hash, rhash_fg); WARN_ON(err); - if (mlx5_cmd_destroy_flow_group(dev, ft, fg->id)) - mlx5_core_warn(dev, "flow steering can't destroy fg %d of ft %d\n", - fg->id, ft->id); + kmem_cache_free(steering->fgs_cache, fg); +} + +static int insert_fte(struct mlx5_flow_group *fg, struct fs_fte *fte) +{ + int index; + int ret; + + index = ida_simple_get(&fg->fte_allocator, 0, fg->max_ftes, GFP_KERNEL); + if (index < 0) + return index; + + fte->index = index + fg->start_index; + ret = rhashtable_insert_fast(&fg->ftes_hash, + &fte->hash, + rhash_fte); + if (ret) + goto err_ida_remove; + + tree_add_node(&fte->node, &fg->node); + list_add_tail(&fte->node.list, &fg->node.children); + return 0; + +err_ida_remove: + ida_simple_remove(&fg->fte_allocator, index); + return ret; } -static struct fs_fte *alloc_fte(struct mlx5_flow_act *flow_act, +static struct fs_fte *alloc_fte(struct mlx5_flow_table *ft, u32 *match_value, - unsigned int index) + struct mlx5_flow_act *flow_act) { + struct mlx5_flow_steering *steering = get_steering(&ft->node); struct fs_fte *fte; - fte = kzalloc(sizeof(*fte), GFP_KERNEL); + fte = kmem_cache_zalloc(steering->ftes_cache, GFP_KERNEL); if (!fte) return ERR_PTR(-ENOMEM); memcpy(fte->val, match_value, sizeof(fte->val)); fte->node.type = FS_TYPE_FLOW_ENTRY; fte->flow_tag = flow_act->flow_tag; - fte->index = index; fte->action = flow_act->action; fte->encap_id = flow_act->encap_id; fte->modify_id = flow_act->modify_id; + tree_init_node(&fte->node, del_hw_fte, del_sw_fte); + return fte; } -static struct mlx5_flow_group *alloc_flow_group(u32 *create_fg_in) +static void dealloc_flow_group(struct mlx5_flow_steering *steering, + struct mlx5_flow_group *fg) +{ + rhashtable_destroy(&fg->ftes_hash); + kmem_cache_free(steering->fgs_cache, fg); +} + +static struct mlx5_flow_group *alloc_flow_group(struct mlx5_flow_steering *steering, + u8 match_criteria_enable, + void *match_criteria, + int start_index, + int end_index) { struct mlx5_flow_group *fg; - void *match_criteria = MLX5_ADDR_OF(create_flow_group_in, - create_fg_in, match_criteria); - u8 match_criteria_enable = MLX5_GET(create_flow_group_in, - create_fg_in, - match_criteria_enable); int ret; - fg = kzalloc(sizeof(*fg), GFP_KERNEL); + fg = kmem_cache_zalloc(steering->fgs_cache, GFP_KERNEL); if (!fg) return ERR_PTR(-ENOMEM); ret = rhashtable_init(&fg->ftes_hash, &rhash_fte); if (ret) { - kfree(fg); + kmem_cache_free(steering->fgs_cache, fg); return ERR_PTR(ret); - } +} + ida_init(&fg->fte_allocator); fg->mask.match_criteria_enable = match_criteria_enable; memcpy(&fg->mask.match_criteria, match_criteria, sizeof(fg->mask.match_criteria)); fg->node.type = FS_TYPE_FLOW_GROUP; - fg->start_index = MLX5_GET(create_flow_group_in, create_fg_in, - start_flow_index); - fg->max_ftes = MLX5_GET(create_flow_group_in, create_fg_in, - end_flow_index) - fg->start_index + 1; + fg->start_index = start_index; + fg->max_ftes = end_index - start_index + 1; + + return fg; +} + +static struct mlx5_flow_group *alloc_insert_flow_group(struct mlx5_flow_table *ft, + u8 match_criteria_enable, + void *match_criteria, + int start_index, + int end_index, + struct list_head *prev) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *fg; + int ret; + + fg = alloc_flow_group(steering, match_criteria_enable, match_criteria, + start_index, end_index); + if (IS_ERR(fg)) + return fg; + + /* initialize refcnt, add to parent list */ + ret = rhltable_insert(&ft->fgs_hash, + &fg->hash, + rhash_fg); + if (ret) { + dealloc_flow_group(steering, fg); + return ERR_PTR(ret); + } + + tree_init_node(&fg->node, del_hw_flow_group, del_sw_flow_group); + tree_add_node(&fg->node, &ft->node); + /* Add node to group list */ + list_add(&fg->node.list, prev); + atomic_inc(&ft->node.version); + return fg; } @@ -575,7 +707,6 @@ static struct mlx5_flow_table *alloc_flow_table(int level, u16 vport, int max_ft ft->flags = flags; INIT_LIST_HEAD(&ft->fwd_rules); mutex_init(&ft->lock); - ida_init(&ft->fte_allocator); return ft; } @@ -724,7 +855,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, fs_get_obj(fte, rule->node.parent); if (!(fte->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST)) return -EINVAL; - lock_ref_node(&fte->node); + down_write_ref_node(&fte->node); fs_get_obj(fg, fte->node.parent); fs_get_obj(ft, fg->node.parent); @@ -733,7 +864,7 @@ static int _mlx5_modify_rule_destination(struct mlx5_flow_rule *rule, ft, fg->id, modify_mask, fte); - unlock_ref_node(&fte->node); + up_write_ref_node(&fte->node); return err; } @@ -870,7 +1001,7 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa goto unlock_root; } - tree_init_node(&ft->node, 1, del_flow_table); + tree_init_node(&ft->node, del_hw_flow_table, del_sw_flow_table); log_table_sz = ft->max_fte ? ilog2(ft->max_fte) : 0; next_ft = find_next_chained_ft(fs_prio); err = mlx5_cmd_create_flow_table(root->dev, ft->vport, ft->op_mod, ft->type, @@ -882,17 +1013,17 @@ static struct mlx5_flow_table *__mlx5_create_flow_table(struct mlx5_flow_namespa err = connect_flow_table(root->dev, ft, fs_prio); if (err) goto destroy_ft; - lock_ref_node(&fs_prio->node); + ft->node.active = true; + down_write_ref_node(&fs_prio->node); tree_add_node(&ft->node, &fs_prio->node); list_add_flow_table(ft, fs_prio); fs_prio->num_ft++; - unlock_ref_node(&fs_prio->node); + up_write_ref_node(&fs_prio->node); mutex_unlock(&root->chain_lock); return ft; destroy_ft: mlx5_cmd_destroy_flow_table(root->dev, ft); free_ft: - ida_destroy(&ft->fte_allocator); kfree(ft); unlock_root: mutex_unlock(&root->chain_lock); @@ -960,54 +1091,6 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, } EXPORT_SYMBOL(mlx5_create_auto_grouped_flow_table); -/* Flow table should be locked */ -static struct mlx5_flow_group *create_flow_group_common(struct mlx5_flow_table *ft, - u32 *fg_in, - struct list_head - *prev_fg, - bool is_auto_fg) -{ - struct mlx5_flow_group *fg; - struct mlx5_core_dev *dev = get_dev(&ft->node); - int err; - - if (!dev) - return ERR_PTR(-ENODEV); - - fg = alloc_flow_group(fg_in); - if (IS_ERR(fg)) - return fg; - - err = rhltable_insert(&ft->fgs_hash, &fg->hash, rhash_fg); - if (err) - goto err_free_fg; - - err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id); - if (err) - goto err_remove_fg; - - if (ft->autogroup.active) - ft->autogroup.num_groups++; - /* Add node to tree */ - tree_init_node(&fg->node, !is_auto_fg, del_flow_group); - tree_add_node(&fg->node, &ft->node); - /* Add node to group list */ - list_add(&fg->node.list, prev_fg); - - trace_mlx5_fs_add_fg(fg); - return fg; - -err_remove_fg: - WARN_ON(rhltable_remove(&ft->fgs_hash, - &fg->hash, - rhash_fg)); -err_free_fg: - rhashtable_destroy(&fg->ftes_hash); - kfree(fg); - - return ERR_PTR(err); -} - struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, u32 *fg_in) { @@ -1016,7 +1099,13 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, u8 match_criteria_enable = MLX5_GET(create_flow_group_in, fg_in, match_criteria_enable); + int start_index = MLX5_GET(create_flow_group_in, fg_in, + start_flow_index); + int end_index = MLX5_GET(create_flow_group_in, fg_in, + end_flow_index); + struct mlx5_core_dev *dev = get_dev(&ft->node); struct mlx5_flow_group *fg; + int err; if (!check_valid_mask(match_criteria_enable, match_criteria)) return ERR_PTR(-EINVAL); @@ -1024,9 +1113,21 @@ struct mlx5_flow_group *mlx5_create_flow_group(struct mlx5_flow_table *ft, if (ft->autogroup.active) return ERR_PTR(-EPERM); - lock_ref_node(&ft->node); - fg = create_flow_group_common(ft, fg_in, ft->node.children.prev, false); - unlock_ref_node(&ft->node); + down_write_ref_node(&ft->node); + fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria, + start_index, end_index, + ft->node.children.prev); + up_write_ref_node(&ft->node); + if (IS_ERR(fg)) + return fg; + + err = mlx5_cmd_create_flow_group(dev, ft, fg_in, &fg->id); + if (err) { + tree_put_node(&fg->node); + return ERR_PTR(err); + } + trace_mlx5_fs_add_fg(fg); + fg->node.active = true; return fg; } @@ -1111,7 +1212,7 @@ create_flow_handle(struct fs_fte *fte, /* Add dest to dests list- we need flow tables to be in the * end of the list for forward to next prio rules. */ - tree_init_node(&rule->node, 1, del_rule); + tree_init_node(&rule->node, NULL, del_sw_hw_rule); if (dest && dest[i].type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) list_add(&rule->node.list, &fte->node.children); @@ -1167,7 +1268,9 @@ add_rule_fte(struct fs_fte *fte, if (err) goto free_handle; + fte->node.active = true; fte->status |= FS_FTE_STATUS_EXISTING; + atomic_inc(&fte->node.version); out: return handle; @@ -1177,59 +1280,17 @@ free_handle: return ERR_PTR(err); } -static struct fs_fte *create_fte(struct mlx5_flow_group *fg, - u32 *match_value, - struct mlx5_flow_act *flow_act) -{ - struct mlx5_flow_table *ft; - struct fs_fte *fte; - int index; - int ret; - - fs_get_obj(ft, fg->node.parent); - index = ida_simple_get(&ft->fte_allocator, fg->start_index, - fg->start_index + fg->max_ftes, - GFP_KERNEL); - if (index < 0) - return ERR_PTR(index); - - fte = alloc_fte(flow_act, match_value, index); - if (IS_ERR(fte)) { - ret = PTR_ERR(fte); - goto err_alloc; - } - ret = rhashtable_insert_fast(&fg->ftes_hash, &fte->hash, rhash_fte); - if (ret) - goto err_hash; - - return fte; - -err_hash: - kfree(fte); -err_alloc: - ida_simple_remove(&ft->fte_allocator, index); - return ERR_PTR(ret); -} - -static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft, - u8 match_criteria_enable, - u32 *match_criteria) +static struct mlx5_flow_group *alloc_auto_flow_group(struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec) { - int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); struct list_head *prev = &ft->node.children; - unsigned int candidate_index = 0; struct mlx5_flow_group *fg; - void *match_criteria_addr; + unsigned int candidate_index = 0; unsigned int group_size = 0; - u32 *in; if (!ft->autogroup.active) return ERR_PTR(-ENOENT); - in = kvzalloc(inlen, GFP_KERNEL); - if (!in) - return ERR_PTR(-ENOMEM); - if (ft->autogroup.num_groups < ft->autogroup.required_groups) /* We save place for flow groups in addition to max types */ group_size = ft->max_fte / (ft->autogroup.required_groups + 1); @@ -1247,25 +1308,55 @@ static struct mlx5_flow_group *create_autogroup(struct mlx5_flow_table *ft, prev = &fg->node.list; } - if (candidate_index + group_size > ft->max_fte) { - fg = ERR_PTR(-ENOSPC); + if (candidate_index + group_size > ft->max_fte) + return ERR_PTR(-ENOSPC); + + fg = alloc_insert_flow_group(ft, + spec->match_criteria_enable, + spec->match_criteria, + candidate_index, + candidate_index + group_size - 1, + prev); + if (IS_ERR(fg)) goto out; - } + + ft->autogroup.num_groups++; + +out: + return fg; +} + +static int create_auto_flow_group(struct mlx5_flow_table *ft, + struct mlx5_flow_group *fg) +{ + struct mlx5_core_dev *dev = get_dev(&ft->node); + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + void *match_criteria_addr; + int err; + u32 *in; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) + return -ENOMEM; MLX5_SET(create_flow_group_in, in, match_criteria_enable, - match_criteria_enable); - MLX5_SET(create_flow_group_in, in, start_flow_index, candidate_index); - MLX5_SET(create_flow_group_in, in, end_flow_index, candidate_index + - group_size - 1); + fg->mask.match_criteria_enable); + MLX5_SET(create_flow_group_in, in, start_flow_index, fg->start_index); + MLX5_SET(create_flow_group_in, in, end_flow_index, fg->start_index + + fg->max_ftes - 1); match_criteria_addr = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); - memcpy(match_criteria_addr, match_criteria, - MLX5_ST_SZ_BYTES(fte_match_param)); + memcpy(match_criteria_addr, fg->mask.match_criteria, + sizeof(fg->mask.match_criteria)); + + err = mlx5_cmd_create_flow_group(dev, ft, in, &fg->id); + if (!err) { + fg->node.active = true; + trace_mlx5_fs_add_fg(fg); + } - fg = create_flow_group_common(ft, in, prev, true); -out: kvfree(in); - return fg; + return err; } static bool mlx5_flow_dests_cmp(struct mlx5_flow_destination *d1, @@ -1340,60 +1431,30 @@ static struct mlx5_flow_handle *add_rule_fg(struct mlx5_flow_group *fg, struct fs_fte *fte) { struct mlx5_flow_handle *handle; - struct mlx5_flow_table *ft; + int old_action; int i; + int ret; - if (fte) { - int old_action; - int ret; - - nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD); - ret = check_conflicting_ftes(fte, flow_act); - if (ret) { - handle = ERR_PTR(ret); - goto unlock_fte; - } - - old_action = fte->action; - fte->action |= flow_act->action; - handle = add_rule_fte(fte, fg, dest, dest_num, - old_action != flow_act->action); - if (IS_ERR(handle)) { - fte->action = old_action; - goto unlock_fte; - } else { - trace_mlx5_fs_set_fte(fte, false); - goto add_rules; - } - } - fs_get_obj(ft, fg->node.parent); + ret = check_conflicting_ftes(fte, flow_act); + if (ret) + return ERR_PTR(ret); - fte = create_fte(fg, match_value, flow_act); - if (IS_ERR(fte)) - return (void *)fte; - tree_init_node(&fte->node, 0, del_fte); - nested_lock_ref_node(&fte->node, FS_MUTEX_CHILD); - handle = add_rule_fte(fte, fg, dest, dest_num, false); + old_action = fte->action; + fte->action |= flow_act->action; + handle = add_rule_fte(fte, fg, dest, dest_num, + old_action != flow_act->action); if (IS_ERR(handle)) { - unlock_ref_node(&fte->node); - destroy_fte(fte, fg); - kfree(fte); + fte->action = old_action; return handle; } + trace_mlx5_fs_set_fte(fte, false); - tree_add_node(&fte->node, &fg->node); - /* fte list isn't sorted */ - list_add_tail(&fte->node.list, &fg->node.children); - trace_mlx5_fs_set_fte(fte, true); -add_rules: for (i = 0; i < handle->num_rules; i++) { if (atomic_read(&handle->rule[i]->node.refcount) == 1) { tree_add_node(&handle->rule[i]->node, &fte->node); trace_mlx5_fs_add_rule(handle->rule[i]); } } -unlock_fte: - unlock_ref_node(&fte->node); return handle; } @@ -1441,93 +1502,197 @@ static bool dest_is_valid(struct mlx5_flow_destination *dest, return true; } -static struct mlx5_flow_handle * -try_add_to_existing_fg(struct mlx5_flow_table *ft, - struct mlx5_flow_spec *spec, - struct mlx5_flow_act *flow_act, - struct mlx5_flow_destination *dest, - int dest_num) -{ +struct match_list { + struct list_head list; struct mlx5_flow_group *g; - struct mlx5_flow_handle *rule = ERR_PTR(-ENOENT); +}; + +struct match_list_head { + struct list_head list; + struct match_list first; +}; + +static void free_match_list(struct match_list_head *head) +{ + if (!list_empty(&head->list)) { + struct match_list *iter, *match_tmp; + + list_del(&head->first.list); + tree_put_node(&head->first.g->node); + list_for_each_entry_safe(iter, match_tmp, &head->list, + list) { + tree_put_node(&iter->g->node); + list_del(&iter->list); + kfree(iter); + } + } +} + +static int build_match_list(struct match_list_head *match_head, + struct mlx5_flow_table *ft, + struct mlx5_flow_spec *spec) +{ struct rhlist_head *tmp, *list; - struct match_list { - struct list_head list; - struct mlx5_flow_group *g; - } match_list, *iter; - LIST_HEAD(match_head); + struct mlx5_flow_group *g; + int err = 0; rcu_read_lock(); + INIT_LIST_HEAD(&match_head->list); /* Collect all fgs which has a matching match_criteria */ list = rhltable_lookup(&ft->fgs_hash, spec, rhash_fg); + /* RCU is atomic, we can't execute FW commands here */ rhl_for_each_entry_rcu(g, tmp, list, hash) { struct match_list *curr_match; - if (likely(list_empty(&match_head))) { - match_list.g = g; - list_add_tail(&match_list.list, &match_head); + if (likely(list_empty(&match_head->list))) { + if (!tree_get_node(&g->node)) + continue; + match_head->first.g = g; + list_add_tail(&match_head->first.list, + &match_head->list); continue; } - curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC); + curr_match = kmalloc(sizeof(*curr_match), GFP_ATOMIC); if (!curr_match) { - rcu_read_unlock(); - rule = ERR_PTR(-ENOMEM); - goto free_list; + free_match_list(match_head); + err = -ENOMEM; + goto out; + } + if (!tree_get_node(&g->node)) { + kfree(curr_match); + continue; } curr_match->g = g; - list_add_tail(&curr_match->list, &match_head); + list_add_tail(&curr_match->list, &match_head->list); } +out: rcu_read_unlock(); + return err; +} + +static u64 matched_fgs_get_version(struct list_head *match_head) +{ + struct match_list *iter; + u64 version = 0; + + list_for_each_entry(iter, match_head, list) + version += (u64)atomic_read(&iter->g->node.version); + return version; +} + +static struct mlx5_flow_handle * +try_add_to_existing_fg(struct mlx5_flow_table *ft, + struct list_head *match_head, + struct mlx5_flow_spec *spec, + struct mlx5_flow_act *flow_act, + struct mlx5_flow_destination *dest, + int dest_num, + int ft_version) +{ + struct mlx5_flow_steering *steering = get_steering(&ft->node); + struct mlx5_flow_group *g; + struct mlx5_flow_handle *rule; + struct match_list *iter; + bool take_write = false; + struct fs_fte *fte; + u64 version; + int err; + + fte = alloc_fte(ft, spec->match_value, flow_act); + if (IS_ERR(fte)) + return ERR_PTR(-ENOMEM); + list_for_each_entry(iter, match_head, list) { + nested_down_read_ref_node(&iter->g->node, FS_LOCK_PARENT); + ida_pre_get(&iter->g->fte_allocator, GFP_KERNEL); + } + +search_again_locked: + version = matched_fgs_get_version(match_head); /* Try to find a fg that already contains a matching fte */ - list_for_each_entry(iter, &match_head, list) { - struct fs_fte *fte; + list_for_each_entry(iter, match_head, list) { + struct fs_fte *fte_tmp; g = iter->g; - nested_lock_ref_node(&g->node, FS_MUTEX_PARENT); - fte = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value, - rhash_fte); - if (fte) { - rule = add_rule_fg(g, spec->match_value, - flow_act, dest, dest_num, fte); - unlock_ref_node(&g->node); - goto free_list; + fte_tmp = rhashtable_lookup_fast(&g->ftes_hash, spec->match_value, + rhash_fte); + if (!fte_tmp || !tree_get_node(&fte_tmp->node)) + continue; + + nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); + if (!take_write) { + list_for_each_entry(iter, match_head, list) + up_read_ref_node(&iter->g->node); + } else { + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); } - unlock_ref_node(&g->node); + + rule = add_rule_fg(g, spec->match_value, + flow_act, dest, dest_num, fte_tmp); + up_write_ref_node(&fte_tmp->node); + tree_put_node(&fte_tmp->node); + kmem_cache_free(steering->ftes_cache, fte); + return rule; } /* No group with matching fte found. Try to add a new fte to any * matching fg. */ - list_for_each_entry(iter, &match_head, list) { - g = iter->g; - nested_lock_ref_node(&g->node, FS_MUTEX_PARENT); - rule = add_rule_fg(g, spec->match_value, - flow_act, dest, dest_num, NULL); - if (!IS_ERR(rule) || PTR_ERR(rule) != -ENOSPC) { - unlock_ref_node(&g->node); - goto free_list; - } - unlock_ref_node(&g->node); + if (!take_write) { + list_for_each_entry(iter, match_head, list) + up_read_ref_node(&iter->g->node); + list_for_each_entry(iter, match_head, list) + nested_down_write_ref_node(&iter->g->node, + FS_LOCK_PARENT); + take_write = true; } -free_list: - if (!list_empty(&match_head)) { - struct match_list *match_tmp; + /* Check the ft version, for case that new flow group + * was added while the fgs weren't locked + */ + if (atomic_read(&ft->node.version) != ft_version) { + rule = ERR_PTR(-EAGAIN); + goto out; + } - /* The most common case is having one FG. Since we want to - * optimize this case, we save the first on the stack. - * Therefore, no need to free it. - */ - list_del(&list_first_entry(&match_head, typeof(*iter), list)->list); - list_for_each_entry_safe(iter, match_tmp, &match_head, list) { - list_del(&iter->list); - kfree(iter); + /* Check the fgs version, for case the new FTE with the + * same values was added while the fgs weren't locked + */ + if (version != matched_fgs_get_version(match_head)) + goto search_again_locked; + + list_for_each_entry(iter, match_head, list) { + g = iter->g; + + if (!g->node.active) + continue; + err = insert_fte(g, fte); + if (err) { + if (err == -ENOSPC) + continue; + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + kmem_cache_free(steering->ftes_cache, fte); + return ERR_PTR(err); } - } + nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD); + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + rule = add_rule_fg(g, spec->match_value, + flow_act, dest, dest_num, fte); + up_write_ref_node(&fte->node); + tree_put_node(&fte->node); + return rule; + } + rule = ERR_PTR(-ENOENT); +out: + list_for_each_entry(iter, match_head, list) + up_write_ref_node(&iter->g->node); + kmem_cache_free(steering->ftes_cache, fte); return rule; } @@ -1539,8 +1704,14 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft, int dest_num) { + struct mlx5_flow_steering *steering = get_steering(&ft->node); struct mlx5_flow_group *g; struct mlx5_flow_handle *rule; + struct match_list_head match_head; + bool take_write = false; + struct fs_fte *fte; + int version; + int err; int i; if (!check_valid_spec(spec)) @@ -1550,33 +1721,73 @@ _mlx5_add_flow_rules(struct mlx5_flow_table *ft, if (!dest_is_valid(&dest[i], flow_act->action, ft)) return ERR_PTR(-EINVAL); } + nested_down_read_ref_node(&ft->node, FS_LOCK_GRANDPARENT); +search_again_locked: + version = atomic_read(&ft->node.version); + + /* Collect all fgs which has a matching match_criteria */ + err = build_match_list(&match_head, ft, spec); + if (err) + return ERR_PTR(err); + + if (!take_write) + up_read_ref_node(&ft->node); + + rule = try_add_to_existing_fg(ft, &match_head.list, spec, flow_act, dest, + dest_num, version); + free_match_list(&match_head); + if (!IS_ERR(rule) || + (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) + return rule; + + if (!take_write) { + nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT); + take_write = true; + } - nested_lock_ref_node(&ft->node, FS_MUTEX_GRANDPARENT); - rule = try_add_to_existing_fg(ft, spec, flow_act, dest, dest_num); - if (!IS_ERR(rule)) - goto unlock; + if (PTR_ERR(rule) == -EAGAIN || + version != atomic_read(&ft->node.version)) + goto search_again_locked; - g = create_autogroup(ft, spec->match_criteria_enable, - spec->match_criteria); + g = alloc_auto_flow_group(ft, spec); if (IS_ERR(g)) { rule = (void *)g; - goto unlock; + up_write_ref_node(&ft->node); + return rule; } - rule = add_rule_fg(g, spec->match_value, flow_act, dest, - dest_num, NULL); - if (IS_ERR(rule)) { - /* Remove assumes refcount > 0 and autogroup creates a group - * with a refcount = 0. - */ - unlock_ref_node(&ft->node); - tree_get_node(&g->node); - tree_remove_node(&g->node); - return rule; + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + up_write_ref_node(&ft->node); + + err = create_auto_flow_group(ft, g); + if (err) + goto err_release_fg; + + fte = alloc_fte(ft, spec->match_value, flow_act); + if (IS_ERR(fte)) { + err = PTR_ERR(fte); + goto err_release_fg; } -unlock: - unlock_ref_node(&ft->node); + + err = insert_fte(g, fte); + if (err) { + kmem_cache_free(steering->ftes_cache, fte); + goto err_release_fg; + } + + nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD); + up_write_ref_node(&g->node); + rule = add_rule_fg(g, spec->match_value, flow_act, dest, + dest_num, fte); + up_write_ref_node(&fte->node); + tree_put_node(&fte->node); + tree_put_node(&g->node); return rule; + +err_release_fg: + up_write_ref_node(&g->node); + tree_put_node(&g->node); + return ERR_PTR(err); } static bool fwd_next_prio_supported(struct mlx5_flow_table *ft) @@ -1817,7 +2028,7 @@ static struct fs_prio *fs_create_prio(struct mlx5_flow_namespace *ns, return ERR_PTR(-ENOMEM); fs_prio->node.type = FS_TYPE_PRIO; - tree_init_node(&fs_prio->node, 1, NULL); + tree_init_node(&fs_prio->node, NULL, NULL); tree_add_node(&fs_prio->node, &ns->node); fs_prio->num_levels = num_levels; fs_prio->prio = prio; @@ -1843,7 +2054,7 @@ static struct mlx5_flow_namespace *fs_create_namespace(struct fs_prio *prio) return ERR_PTR(-ENOMEM); fs_init_namespace(ns); - tree_init_node(&ns->node, 1, NULL); + tree_init_node(&ns->node, NULL, NULL); tree_add_node(&ns->node, &prio->node); list_add_tail(&ns->node.list, &prio->node.children); @@ -1968,7 +2179,7 @@ static struct mlx5_flow_root_namespace *create_root_ns(struct mlx5_flow_steering ns = &root_ns->ns; fs_init_namespace(ns); mutex_init(&root_ns->chain_lock); - tree_init_node(&ns->node, 1, NULL); + tree_init_node(&ns->node, NULL, NULL); tree_add_node(&ns->node, NULL); return root_ns; @@ -2066,8 +2277,10 @@ static void clean_tree(struct fs_node *node) struct fs_node *iter; struct fs_node *temp; + tree_get_node(node); list_for_each_entry_safe(iter, temp, &node->children, list) clean_tree(iter); + tree_put_node(node); tree_remove_node(node); } } @@ -2091,6 +2304,8 @@ void mlx5_cleanup_fs(struct mlx5_core_dev *dev) cleanup_root_ns(steering->sniffer_rx_root_ns); cleanup_root_ns(steering->sniffer_tx_root_ns); mlx5_cleanup_fc_stats(dev); + kmem_cache_destroy(steering->ftes_cache); + kmem_cache_destroy(steering->fgs_cache); kfree(steering); } @@ -2196,6 +2411,16 @@ int mlx5_init_fs(struct mlx5_core_dev *dev) steering->dev = dev; dev->priv.steering = steering; + steering->fgs_cache = kmem_cache_create("mlx5_fs_fgs", + sizeof(struct mlx5_flow_group), 0, + 0, NULL); + steering->ftes_cache = kmem_cache_create("mlx5_fs_ftes", sizeof(struct fs_fte), 0, + 0, NULL); + if (!steering->ftes_cache || !steering->fgs_cache) { + err = -ENOMEM; + goto err; + } + if ((((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && (MLX5_CAP_GEN(dev, nic_flow_table))) || ((MLX5_CAP_GEN(dev, port_type) == MLX5_CAP_PORT_TYPE_IB) && diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h index 48dd78975062..7a01277519e5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -66,6 +66,8 @@ enum fs_fte_status { struct mlx5_flow_steering { struct mlx5_core_dev *dev; + struct kmem_cache *fgs_cache; + struct kmem_cache *ftes_cache; struct mlx5_flow_root_namespace *root_ns; struct mlx5_flow_root_namespace *fdb_root_ns; struct mlx5_flow_root_namespace *esw_egress_root_ns; @@ -81,9 +83,12 @@ struct fs_node { struct fs_node *parent; struct fs_node *root; /* lock the node for writing and traversing */ - struct mutex lock; + struct rw_semaphore lock; atomic_t refcount; - void (*remove_func)(struct fs_node *); + bool active; + void (*del_hw_func)(struct fs_node *); + void (*del_sw_func)(struct fs_node *); + atomic_t version; }; struct mlx5_flow_rule { @@ -120,7 +125,6 @@ struct mlx5_flow_table { /* FWD rules that point on this flow table */ struct list_head fwd_rules; u32 flags; - struct ida fte_allocator; struct rhltable fgs_hash; }; @@ -200,6 +204,7 @@ struct mlx5_flow_group { struct mlx5_flow_group_mask mask; u32 start_index; u32 max_ftes; + struct ida fte_allocator; u32 id; struct rhashtable ftes_hash; struct rhlist_head hash; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 5cd4df08ce97..321988ac57cc 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -53,6 +53,7 @@ #include <linux/notifier.h> #include <linux/dcbnl.h> #include <linux/inetdevice.h> +#include <linux/netlink.h> #include <net/switchdev.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_mirred.h> @@ -4298,7 +4299,8 @@ static int mlxsw_sp_netdevice_port_upper_event(struct net_device *lower_dev, if (info->linking) err = mlxsw_sp_port_bridge_join(mlxsw_sp_port, lower_dev, - upper_dev); + upper_dev, + extack); else mlxsw_sp_port_bridge_leave(mlxsw_sp_port, lower_dev, @@ -4389,18 +4391,25 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, { struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); struct netdev_notifier_changeupper_info *info = ptr; + struct netlink_ext_ack *extack; struct net_device *upper_dev; int err = 0; + extack = netdev_notifier_info_to_extack(&info->info); + switch (event) { case NETDEV_PRECHANGEUPPER: upper_dev = info->upper_dev; - if (!netif_is_bridge_master(upper_dev)) + if (!netif_is_bridge_master(upper_dev)) { + NL_SET_ERR_MSG(extack, "spectrum: VLAN devices only support bridge and VRF uppers"); return -EINVAL; + } if (!info->linking) break; - if (netdev_has_any_upper_dev(upper_dev)) + if (netdev_has_any_upper_dev(upper_dev)) { + NL_SET_ERR_MSG(extack, "spectrum: Enslaving a port to a device that already has an upper device is not supported"); return -EINVAL; + } break; case NETDEV_CHANGEUPPER: upper_dev = info->upper_dev; @@ -4408,7 +4417,8 @@ static int mlxsw_sp_netdevice_port_vlan_event(struct net_device *vlan_dev, if (info->linking) err = mlxsw_sp_port_bridge_join(mlxsw_sp_port, vlan_dev, - upper_dev); + upper_dev, + extack); else mlxsw_sp_port_bridge_leave(mlxsw_sp_port, vlan_dev, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index ae67e6046098..8e45183dc9bb 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -326,7 +326,8 @@ void mlxsw_sp_port_vlan_bridge_leave(struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan); int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *brport_dev, - struct net_device *br_dev); + struct net_device *br_dev, + struct netlink_ext_ack *extack); void mlxsw_sp_port_bridge_leave(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *brport_dev, struct net_device *br_dev); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index e0f8ea4ed7af..6a356f4b99a3 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -3640,20 +3640,6 @@ static int mlxsw_sp_fib_lpm_tree_link(struct mlxsw_sp *mlxsw_sp, static void mlxsw_sp_fib_lpm_tree_unlink(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_fib *fib) { - struct mlxsw_sp_prefix_usage req_prefix_usage = {{ 0 } }; - struct mlxsw_sp_lpm_tree *lpm_tree; - - /* Aggregate prefix lengths across all virtual routers to make - * sure we only have used prefix lengths in the LPM tree. - */ - mlxsw_sp_vrs_prefixes(mlxsw_sp, fib->proto, &req_prefix_usage); - lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage, - fib->proto); - if (IS_ERR(lpm_tree)) - goto err_tree_get; - mlxsw_sp_vrs_lpm_tree_replace(mlxsw_sp, fib, lpm_tree); - -err_tree_get: if (!mlxsw_sp_prefix_usage_none(&fib->prefix_usage)) return; mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, fib); @@ -5957,7 +5943,7 @@ static int mlxsw_sp_rif_vlan_fid_op(struct mlxsw_sp_rif *rif, return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ritr), ritr_pl); } -static u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp) +u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp) { return mlxsw_core_max_ports(mlxsw_sp->core) + 1; } diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h index 3d449180b035..3f2d840cb285 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.h @@ -70,6 +70,7 @@ u16 mlxsw_sp_rif_index(const struct mlxsw_sp_rif *rif); u16 mlxsw_sp_ipip_lb_rif_index(const struct mlxsw_sp_rif_ipip_lb *rif); u16 mlxsw_sp_ipip_lb_ul_vr_id(const struct mlxsw_sp_rif_ipip_lb *rif); int mlxsw_sp_rif_dev_ifindex(const struct mlxsw_sp_rif *rif); +u8 mlxsw_sp_router_port(const struct mlxsw_sp *mlxsw_sp); const struct net_device *mlxsw_sp_rif_dev(const struct mlxsw_sp_rif *rif); int mlxsw_sp_rif_counter_value_get(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_rif *rif, diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 0f9eac5f4ebf..7b8548e25ae7 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -46,8 +46,10 @@ #include <linux/workqueue.h> #include <linux/jiffies.h> #include <linux/rtnetlink.h> +#include <linux/netlink.h> #include <net/switchdev.h> +#include "spectrum_router.h" #include "spectrum.h" #include "core.h" #include "reg.h" @@ -78,7 +80,8 @@ struct mlxsw_sp_bridge_device { struct list_head ports_list; struct list_head mids_list; u8 vlan_enabled:1, - multicast_enabled:1; + multicast_enabled:1, + mrouter:1; const struct mlxsw_sp_bridge_ops *ops; }; @@ -107,7 +110,8 @@ struct mlxsw_sp_bridge_vlan { struct mlxsw_sp_bridge_ops { int (*port_join)(struct mlxsw_sp_bridge_device *bridge_device, struct mlxsw_sp_bridge_port *bridge_port, - struct mlxsw_sp_port *mlxsw_sp_port); + struct mlxsw_sp_port *mlxsw_sp_port, + struct netlink_ext_ack *extack); void (*port_leave)(struct mlxsw_sp_bridge_device *bridge_device, struct mlxsw_sp_bridge_port *bridge_port, struct mlxsw_sp_port *mlxsw_sp_port); @@ -168,6 +172,7 @@ mlxsw_sp_bridge_device_create(struct mlxsw_sp_bridge *bridge, bridge_device->dev = br_dev; bridge_device->vlan_enabled = vlan_enabled; bridge_device->multicast_enabled = br_multicast_enabled(br_dev); + bridge_device->mrouter = br_multicast_router(br_dev); INIT_LIST_HEAD(&bridge_device->ports_list); if (vlan_enabled) { bridge->vlan_enabled_exists = true; @@ -810,6 +815,60 @@ static int mlxsw_sp_port_mc_disabled_set(struct mlxsw_sp_port *mlxsw_sp_port, return 0; } +static int mlxsw_sp_smid_router_port_set(struct mlxsw_sp *mlxsw_sp, + u16 mid_idx, bool add) +{ + char *smid_pl; + int err; + + smid_pl = kmalloc(MLXSW_REG_SMID_LEN, GFP_KERNEL); + if (!smid_pl) + return -ENOMEM; + + mlxsw_reg_smid_pack(smid_pl, mid_idx, + mlxsw_sp_router_port(mlxsw_sp), add); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); + kfree(smid_pl); + return err; +} + +static void +mlxsw_sp_bridge_mrouter_update_mdb(struct mlxsw_sp *mlxsw_sp, + struct mlxsw_sp_bridge_device *bridge_device, + bool add) +{ + struct mlxsw_sp_mid *mid; + + list_for_each_entry(mid, &bridge_device->mids_list, list) + mlxsw_sp_smid_router_port_set(mlxsw_sp, mid->mid, add); +} + +static int +mlxsw_sp_port_attr_br_mrouter_set(struct mlxsw_sp_port *mlxsw_sp_port, + struct switchdev_trans *trans, + struct net_device *orig_dev, + bool is_mrouter) +{ + struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; + struct mlxsw_sp_bridge_device *bridge_device; + + if (switchdev_trans_ph_prepare(trans)) + return 0; + + /* It's possible we failed to enslave the port, yet this + * operation is executed due to it being deferred. + */ + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, orig_dev); + if (!bridge_device) + return 0; + + if (bridge_device->mrouter != is_mrouter) + mlxsw_sp_bridge_mrouter_update_mdb(mlxsw_sp, bridge_device, + is_mrouter); + bridge_device->mrouter = is_mrouter; + return 0; +} + static int mlxsw_sp_port_attr_set(struct net_device *dev, const struct switchdev_attr *attr, struct switchdev_trans *trans) @@ -847,6 +906,11 @@ static int mlxsw_sp_port_attr_set(struct net_device *dev, attr->orig_dev, attr->u.mc_disabled); break; + case SWITCHDEV_ATTR_ID_BRIDGE_MROUTER: + err = mlxsw_sp_port_attr_br_mrouter_set(mlxsw_sp_port, trans, + attr->orig_dev, + attr->u.mrouter); + break; default: err = -EOPNOTSUPP; break; @@ -1241,7 +1305,8 @@ static int mlxsw_sp_port_mdb_op(struct mlxsw_sp *mlxsw_sp, const char *addr, } static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx, - long *ports_bitmap) + long *ports_bitmap, + bool set_router_port) { char *smid_pl; int err, i; @@ -1256,9 +1321,15 @@ static int mlxsw_sp_port_smid_full_entry(struct mlxsw_sp *mlxsw_sp, u16 mid_idx, mlxsw_reg_smid_port_mask_set(smid_pl, i, 1); } + mlxsw_reg_smid_port_mask_set(smid_pl, + mlxsw_sp_router_port(mlxsw_sp), 1); + for_each_set_bit(i, ports_bitmap, mlxsw_core_max_ports(mlxsw_sp->core)) mlxsw_reg_smid_port_set(smid_pl, i, 1); + mlxsw_reg_smid_port_set(smid_pl, mlxsw_sp_router_port(mlxsw_sp), + set_router_port); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(smid), smid_pl); kfree(smid_pl); return err; @@ -1362,7 +1433,8 @@ mlxsw_sp_mc_write_mdb_entry(struct mlxsw_sp *mlxsw_sp, mlxsw_sp_mc_get_mrouters_bitmap(flood_bitmap, bridge_device, mlxsw_sp); mid->mid = mid_idx; - err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap); + err = mlxsw_sp_port_smid_full_entry(mlxsw_sp, mid_idx, flood_bitmap, + bridge_device->mrouter); kfree(flood_bitmap); if (err) return false; @@ -1735,12 +1807,15 @@ static const struct switchdev_ops mlxsw_sp_port_switchdev_ops = { static int mlxsw_sp_bridge_8021q_port_join(struct mlxsw_sp_bridge_device *bridge_device, struct mlxsw_sp_bridge_port *bridge_port, - struct mlxsw_sp_port *mlxsw_sp_port) + struct mlxsw_sp_port *mlxsw_sp_port, + struct netlink_ext_ack *extack) { struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; - if (is_vlan_dev(bridge_port->dev)) + if (is_vlan_dev(bridge_port->dev)) { + NL_SET_ERR_MSG(extack, "spectrum: Can not enslave a VLAN device to a VLAN-aware bridge"); return -EINVAL; + } mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, 1); if (WARN_ON(!mlxsw_sp_port_vlan)) @@ -1797,13 +1872,16 @@ mlxsw_sp_port_is_br_member(const struct mlxsw_sp_port *mlxsw_sp_port, static int mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device, struct mlxsw_sp_bridge_port *bridge_port, - struct mlxsw_sp_port *mlxsw_sp_port) + struct mlxsw_sp_port *mlxsw_sp_port, + struct netlink_ext_ack *extack) { struct mlxsw_sp_port_vlan *mlxsw_sp_port_vlan; u16 vid; - if (!is_vlan_dev(bridge_port->dev)) + if (!is_vlan_dev(bridge_port->dev)) { + NL_SET_ERR_MSG(extack, "spectrum: Only VLAN devices can be enslaved to a VLAN-unaware bridge"); return -EINVAL; + } vid = vlan_dev_vlan_id(bridge_port->dev); mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_vid(mlxsw_sp_port, vid); @@ -1811,7 +1889,7 @@ mlxsw_sp_bridge_8021d_port_join(struct mlxsw_sp_bridge_device *bridge_device, return -EINVAL; if (mlxsw_sp_port_is_br_member(mlxsw_sp_port, bridge_device->dev)) { - netdev_err(mlxsw_sp_port->dev, "Can't bridge VLAN uppers of the same port\n"); + NL_SET_ERR_MSG(extack, "spectrum: Can not bridge VLAN uppers of the same port"); return -EINVAL; } @@ -1854,7 +1932,8 @@ static const struct mlxsw_sp_bridge_ops mlxsw_sp_bridge_8021d_ops = { int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, struct net_device *brport_dev, - struct net_device *br_dev) + struct net_device *br_dev, + struct netlink_ext_ack *extack) { struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_bridge_device *bridge_device; @@ -1867,7 +1946,7 @@ int mlxsw_sp_port_bridge_join(struct mlxsw_sp_port *mlxsw_sp_port, bridge_device = bridge_port->bridge_device; err = bridge_device->ops->port_join(bridge_device, bridge_port, - mlxsw_sp_port); + mlxsw_sp_port, extack); if (err) goto err_port_join; diff --git a/drivers/net/ethernet/neterion/vxge/vxge-traffic.c b/drivers/net/ethernet/neterion/vxge/vxge-traffic.c index 5f630a24e491..0c3b5dea2858 100644 --- a/drivers/net/ethernet/neterion/vxge/vxge-traffic.c +++ b/drivers/net/ethernet/neterion/vxge/vxge-traffic.c @@ -1209,9 +1209,6 @@ void vxge_hw_ring_rxd_pre_post(struct __vxge_hw_ring *ring, void *rxdh) void vxge_hw_ring_rxd_post_post(struct __vxge_hw_ring *ring, void *rxdh) { struct vxge_hw_ring_rxd_1 *rxdp = (struct vxge_hw_ring_rxd_1 *)rxdh; - struct __vxge_hw_channel *channel; - - channel = &ring->channel; rxdp->control_0 = VXGE_HW_RING_RXD_LIST_OWN_ADAPTER; @@ -1359,11 +1356,8 @@ exit: enum vxge_hw_status vxge_hw_ring_handle_tcode( struct __vxge_hw_ring *ring, void *rxdh, u8 t_code) { - struct __vxge_hw_channel *channel; enum vxge_hw_status status = VXGE_HW_OK; - channel = &ring->channel; - /* If the t_code is not supported and if the * t_code is other than 0x5 (unparseable packet * such as unknown UPV6 header), Drop it !!! @@ -1399,10 +1393,6 @@ exit: static void __vxge_hw_non_offload_db_post(struct __vxge_hw_fifo *fifo, u64 txdl_ptr, u32 num_txds, u32 no_snoop) { - struct __vxge_hw_channel *channel; - - channel = &fifo->channel; - writeq(VXGE_HW_NODBW_TYPE(VXGE_HW_NODBW_TYPE_NODBW) | VXGE_HW_NODBW_LAST_TXD_NUMBER(num_txds) | VXGE_HW_NODBW_GET_NO_SNOOP(no_snoop), @@ -1506,9 +1496,6 @@ void vxge_hw_fifo_txdl_buffer_set(struct __vxge_hw_fifo *fifo, { struct __vxge_hw_fifo_txdl_priv *txdl_priv; struct vxge_hw_fifo_txd *txdp, *txdp_last; - struct __vxge_hw_channel *channel; - - channel = &fifo->channel; txdl_priv = __vxge_hw_fifo_txdl_priv(fifo, txdlh); txdp = (struct vxge_hw_fifo_txd *)txdlh + txdl_priv->frags; @@ -1554,9 +1541,6 @@ void vxge_hw_fifo_txdl_post(struct __vxge_hw_fifo *fifo, void *txdlh) struct __vxge_hw_fifo_txdl_priv *txdl_priv; struct vxge_hw_fifo_txd *txdp_last; struct vxge_hw_fifo_txd *txdp_first; - struct __vxge_hw_channel *channel; - - channel = &fifo->channel; txdl_priv = __vxge_hw_fifo_txdl_priv(fifo, txdlh); txdp_first = txdlh; @@ -1672,10 +1656,7 @@ enum vxge_hw_status vxge_hw_fifo_handle_tcode(struct __vxge_hw_fifo *fifo, void *txdlh, enum vxge_hw_fifo_tcode t_code) { - struct __vxge_hw_channel *channel; - enum vxge_hw_status status = VXGE_HW_OK; - channel = &fifo->channel; if (((t_code & 0x7) < 0) || ((t_code & 0x7) > 0x4)) { status = VXGE_HW_ERR_INVALID_TCODE; diff --git a/drivers/net/ethernet/netronome/nfp/Makefile b/drivers/net/ethernet/netronome/nfp/Makefile index becaacf1554d..bd3b2bd408bc 100644 --- a/drivers/net/ethernet/netronome/nfp/Makefile +++ b/drivers/net/ethernet/netronome/nfp/Makefile @@ -14,6 +14,7 @@ nfp-objs := \ nfpcore/nfp_resource.o \ nfpcore/nfp_rtsym.o \ nfpcore/nfp_target.o \ + nfp_asm.o \ nfp_app.o \ nfp_app_nic.o \ nfp_devlink.o \ diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 239dfbe8a0a1..23fb11a41cc4 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -110,150 +110,7 @@ nfp_prog_offset_to_index(struct nfp_prog *nfp_prog, unsigned int offset) return offset - nfp_prog->start_off; } -/* --- SW reg --- */ -struct nfp_insn_ur_regs { - enum alu_dst_ab dst_ab; - u16 dst; - u16 areg, breg; - bool swap; - bool wr_both; -}; - -struct nfp_insn_re_regs { - enum alu_dst_ab dst_ab; - u8 dst; - u8 areg, breg; - bool swap; - bool wr_both; - bool i8; -}; - -static u16 nfp_swreg_to_unreg(u32 swreg, bool is_dst) -{ - u16 val = FIELD_GET(NN_REG_VAL, swreg); - - switch (FIELD_GET(NN_REG_TYPE, swreg)) { - case NN_REG_GPR_A: - case NN_REG_GPR_B: - case NN_REG_GPR_BOTH: - return val; - case NN_REG_NNR: - return UR_REG_NN | val; - case NN_REG_XFER: - return UR_REG_XFR | val; - case NN_REG_IMM: - if (val & ~0xff) { - pr_err("immediate too large\n"); - return 0; - } - return UR_REG_IMM_encode(val); - case NN_REG_NONE: - return is_dst ? UR_REG_NO_DST : REG_NONE; - default: - pr_err("unrecognized reg encoding %08x\n", swreg); - return 0; - } -} - -static int -swreg_to_unrestricted(u32 dst, u32 lreg, u32 rreg, struct nfp_insn_ur_regs *reg) -{ - memset(reg, 0, sizeof(*reg)); - - /* Decode destination */ - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM) - return -EFAULT; - - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_B) - reg->dst_ab = ALU_DST_B; - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_BOTH) - reg->wr_both = true; - reg->dst = nfp_swreg_to_unreg(dst, true); - - /* Decode source operands */ - if (FIELD_GET(NN_REG_TYPE, lreg) == FIELD_GET(NN_REG_TYPE, rreg)) - return -EFAULT; - - if (FIELD_GET(NN_REG_TYPE, lreg) == NN_REG_GPR_B || - FIELD_GET(NN_REG_TYPE, rreg) == NN_REG_GPR_A) { - reg->areg = nfp_swreg_to_unreg(rreg, false); - reg->breg = nfp_swreg_to_unreg(lreg, false); - reg->swap = true; - } else { - reg->areg = nfp_swreg_to_unreg(lreg, false); - reg->breg = nfp_swreg_to_unreg(rreg, false); - } - - return 0; -} - -static u16 nfp_swreg_to_rereg(u32 swreg, bool is_dst, bool has_imm8, bool *i8) -{ - u16 val = FIELD_GET(NN_REG_VAL, swreg); - - switch (FIELD_GET(NN_REG_TYPE, swreg)) { - case NN_REG_GPR_A: - case NN_REG_GPR_B: - case NN_REG_GPR_BOTH: - return val; - case NN_REG_XFER: - return RE_REG_XFR | val; - case NN_REG_IMM: - if (val & ~(0x7f | has_imm8 << 7)) { - pr_err("immediate too large\n"); - return 0; - } - *i8 = val & 0x80; - return RE_REG_IMM_encode(val & 0x7f); - case NN_REG_NONE: - return is_dst ? RE_REG_NO_DST : REG_NONE; - default: - pr_err("unrecognized reg encoding\n"); - return 0; - } -} - -static int -swreg_to_restricted(u32 dst, u32 lreg, u32 rreg, struct nfp_insn_re_regs *reg, - bool has_imm8) -{ - memset(reg, 0, sizeof(*reg)); - - /* Decode destination */ - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM) - return -EFAULT; - - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_B) - reg->dst_ab = ALU_DST_B; - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_GPR_BOTH) - reg->wr_both = true; - reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL); - - /* Decode source operands */ - if (FIELD_GET(NN_REG_TYPE, lreg) == FIELD_GET(NN_REG_TYPE, rreg)) - return -EFAULT; - - if (FIELD_GET(NN_REG_TYPE, lreg) == NN_REG_GPR_B || - FIELD_GET(NN_REG_TYPE, rreg) == NN_REG_GPR_A) { - reg->areg = nfp_swreg_to_rereg(rreg, false, has_imm8, ®->i8); - reg->breg = nfp_swreg_to_rereg(lreg, false, has_imm8, ®->i8); - reg->swap = true; - } else { - reg->areg = nfp_swreg_to_rereg(lreg, false, has_imm8, ®->i8); - reg->breg = nfp_swreg_to_rereg(rreg, false, has_imm8, ®->i8); - } - - return 0; -} - /* --- Emitters --- */ -static const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = { - [CMD_TGT_WRITE8] = { 0x00, 0x42 }, - [CMD_TGT_READ8] = { 0x01, 0x43 }, - [CMD_TGT_READ_LE] = { 0x01, 0x40 }, - [CMD_TGT_READ_SWAP_LE] = { 0x03, 0x40 }, -}; - static void __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync) @@ -281,7 +138,7 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, static void emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, - u8 mode, u8 xfer, u32 lreg, u32 rreg, u8 size, bool sync) + u8 mode, u8 xfer, swreg lreg, swreg rreg, u8 size, bool sync) { struct nfp_insn_re_regs reg; int err; @@ -296,6 +153,11 @@ emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, nfp_prog->error = -EFAULT; return; } + if (reg.dst_lmextn || reg.src_lmextn) { + pr_err("cmd can't use LMextn\n"); + nfp_prog->error = -EFAULT; + return; + } __emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync); } @@ -341,7 +203,7 @@ emit_br(struct nfp_prog *nfp_prog, enum br_mask mask, u16 addr, u8 defer) static void __emit_br_byte(struct nfp_prog *nfp_prog, u8 areg, u8 breg, bool imm8, - u8 byte, bool equal, u16 addr, u8 defer) + u8 byte, bool equal, u16 addr, u8 defer, bool src_lmextn) { u16 addr_lo, addr_hi; u64 insn; @@ -357,32 +219,34 @@ __emit_br_byte(struct nfp_prog *nfp_prog, u8 areg, u8 breg, bool imm8, FIELD_PREP(OP_BB_EQ, equal) | FIELD_PREP(OP_BB_DEFBR, defer) | FIELD_PREP(OP_BB_ADDR_LO, addr_lo) | - FIELD_PREP(OP_BB_ADDR_HI, addr_hi); + FIELD_PREP(OP_BB_ADDR_HI, addr_hi) | + FIELD_PREP(OP_BB_SRC_LMEXTN, src_lmextn); nfp_prog_push(nfp_prog, insn); } static void emit_br_byte_neq(struct nfp_prog *nfp_prog, - u32 dst, u8 imm, u8 byte, u16 addr, u8 defer) + swreg src, u8 imm, u8 byte, u16 addr, u8 defer) { struct nfp_insn_re_regs reg; int err; - err = swreg_to_restricted(reg_none(), dst, reg_imm(imm), ®, true); + err = swreg_to_restricted(reg_none(), src, reg_imm(imm), ®, true); if (err) { nfp_prog->error = err; return; } __emit_br_byte(nfp_prog, reg.areg, reg.breg, reg.i8, byte, false, addr, - defer); + defer, reg.src_lmextn); } static void __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi, enum immed_width width, bool invert, - enum immed_shift shift, bool wr_both) + enum immed_shift shift, bool wr_both, + bool dst_lmextn, bool src_lmextn) { u64 insn; @@ -393,19 +257,21 @@ __emit_immed(struct nfp_prog *nfp_prog, u16 areg, u16 breg, u16 imm_hi, FIELD_PREP(OP_IMMED_WIDTH, width) | FIELD_PREP(OP_IMMED_INV, invert) | FIELD_PREP(OP_IMMED_SHIFT, shift) | - FIELD_PREP(OP_IMMED_WR_AB, wr_both); + FIELD_PREP(OP_IMMED_WR_AB, wr_both) | + FIELD_PREP(OP_IMMED_SRC_LMEXTN, src_lmextn) | + FIELD_PREP(OP_IMMED_DST_LMEXTN, dst_lmextn); nfp_prog_push(nfp_prog, insn); } static void -emit_immed(struct nfp_prog *nfp_prog, u32 dst, u16 imm, +emit_immed(struct nfp_prog *nfp_prog, swreg dst, u16 imm, enum immed_width width, bool invert, enum immed_shift shift) { struct nfp_insn_ur_regs reg; int err; - if (FIELD_GET(NN_REG_TYPE, dst) == NN_REG_IMM) { + if (swreg_type(dst) == NN_REG_IMM) { nfp_prog->error = -EFAULT; return; } @@ -417,13 +283,15 @@ emit_immed(struct nfp_prog *nfp_prog, u32 dst, u16 imm, } __emit_immed(nfp_prog, reg.areg, reg.breg, imm >> 8, width, - invert, shift, reg.wr_both); + invert, shift, reg.wr_both, + reg.dst_lmextn, reg.src_lmextn); } static void __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab, enum shf_sc sc, u8 shift, - u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both) + u16 areg, enum shf_op op, u16 breg, bool i8, bool sw, bool wr_both, + bool dst_lmextn, bool src_lmextn) { u64 insn; @@ -445,14 +313,16 @@ __emit_shf(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab, FIELD_PREP(OP_SHF_SHIFT, shift) | FIELD_PREP(OP_SHF_OP, op) | FIELD_PREP(OP_SHF_DST_AB, dst_ab) | - FIELD_PREP(OP_SHF_WR_AB, wr_both); + FIELD_PREP(OP_SHF_WR_AB, wr_both) | + FIELD_PREP(OP_SHF_SRC_LMEXTN, src_lmextn) | + FIELD_PREP(OP_SHF_DST_LMEXTN, dst_lmextn); nfp_prog_push(nfp_prog, insn); } static void -emit_shf(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum shf_op op, u32 rreg, - enum shf_sc sc, u8 shift) +emit_shf(struct nfp_prog *nfp_prog, swreg dst, + swreg lreg, enum shf_op op, swreg rreg, enum shf_sc sc, u8 shift) { struct nfp_insn_re_regs reg; int err; @@ -464,12 +334,14 @@ emit_shf(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum shf_op op, u32 rreg, } __emit_shf(nfp_prog, reg.dst, reg.dst_ab, sc, shift, - reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both); + reg.areg, op, reg.breg, reg.i8, reg.swap, reg.wr_both, + reg.dst_lmextn, reg.src_lmextn); } static void __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab, - u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both) + u16 areg, enum alu_op op, u16 breg, bool swap, bool wr_both, + bool dst_lmextn, bool src_lmextn) { u64 insn; @@ -480,13 +352,16 @@ __emit_alu(struct nfp_prog *nfp_prog, u16 dst, enum alu_dst_ab dst_ab, FIELD_PREP(OP_ALU_SW, swap) | FIELD_PREP(OP_ALU_OP, op) | FIELD_PREP(OP_ALU_DST_AB, dst_ab) | - FIELD_PREP(OP_ALU_WR_AB, wr_both); + FIELD_PREP(OP_ALU_WR_AB, wr_both) | + FIELD_PREP(OP_ALU_SRC_LMEXTN, src_lmextn) | + FIELD_PREP(OP_ALU_DST_LMEXTN, dst_lmextn); nfp_prog_push(nfp_prog, insn); } static void -emit_alu(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum alu_op op, u32 rreg) +emit_alu(struct nfp_prog *nfp_prog, swreg dst, + swreg lreg, enum alu_op op, swreg rreg) { struct nfp_insn_ur_regs reg; int err; @@ -498,13 +373,15 @@ emit_alu(struct nfp_prog *nfp_prog, u32 dst, u32 lreg, enum alu_op op, u32 rreg) } __emit_alu(nfp_prog, reg.dst, reg.dst_ab, - reg.areg, op, reg.breg, reg.swap, reg.wr_both); + reg.areg, op, reg.breg, reg.swap, reg.wr_both, + reg.dst_lmextn, reg.src_lmextn); } static void __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc, u8 areg, u8 bmask, u8 breg, u8 shift, bool imm8, - bool zero, bool swap, bool wr_both) + bool zero, bool swap, bool wr_both, + bool dst_lmextn, bool src_lmextn) { u64 insn; @@ -517,33 +394,42 @@ __emit_ld_field(struct nfp_prog *nfp_prog, enum shf_sc sc, FIELD_PREP(OP_LDF_ZF, zero) | FIELD_PREP(OP_LDF_BMASK, bmask) | FIELD_PREP(OP_LDF_SHF, shift) | - FIELD_PREP(OP_LDF_WR_AB, wr_both); + FIELD_PREP(OP_LDF_WR_AB, wr_both) | + FIELD_PREP(OP_LDF_SRC_LMEXTN, src_lmextn) | + FIELD_PREP(OP_LDF_DST_LMEXTN, dst_lmextn); nfp_prog_push(nfp_prog, insn); } static void -emit_ld_field_any(struct nfp_prog *nfp_prog, enum shf_sc sc, u8 shift, - u32 dst, u8 bmask, u32 src, bool zero) +emit_ld_field_any(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src, + enum shf_sc sc, u8 shift, bool zero) { struct nfp_insn_re_regs reg; int err; - err = swreg_to_restricted(reg_none(), dst, src, ®, true); + /* Note: ld_field is special as it uses one of the src regs as dst */ + err = swreg_to_restricted(dst, dst, src, ®, true); if (err) { nfp_prog->error = err; return; } __emit_ld_field(nfp_prog, sc, reg.areg, bmask, reg.breg, shift, - reg.i8, zero, reg.swap, reg.wr_both); + reg.i8, zero, reg.swap, reg.wr_both, + reg.dst_lmextn, reg.src_lmextn); } static void -emit_ld_field(struct nfp_prog *nfp_prog, u32 dst, u8 bmask, u32 src, +emit_ld_field(struct nfp_prog *nfp_prog, swreg dst, u8 bmask, swreg src, enum shf_sc sc, u8 shift) { - emit_ld_field_any(nfp_prog, sc, shift, dst, bmask, src, false); + emit_ld_field_any(nfp_prog, dst, bmask, src, sc, shift, false); +} + +static void emit_nop(struct nfp_prog *nfp_prog) +{ + __emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0); } /* --- Wrappers --- */ @@ -565,7 +451,7 @@ static bool pack_immed(u32 imm, u16 *val, enum immed_shift *shift) return true; } -static void wrp_immed(struct nfp_prog *nfp_prog, u32 dst, u32 imm) +static void wrp_immed(struct nfp_prog *nfp_prog, swreg dst, u32 imm) { enum immed_shift shift; u16 val; @@ -586,7 +472,7 @@ static void wrp_immed(struct nfp_prog *nfp_prog, u32 dst, u32 imm) * If the @imm is small enough encode it directly in operand and return * otherwise load @imm to a spare register and return its encoding. */ -static u32 ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg) +static swreg ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg) { if (FIELD_FIT(UR_REG_IMM_MAX, imm)) return reg_imm(imm); @@ -599,7 +485,7 @@ static u32 ur_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg) * If the @imm is small enough encode it directly in operand and return * otherwise load @imm to a spare register and return its encoding. */ -static u32 re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, u32 tmp_reg) +static swreg re_load_imm_any(struct nfp_prog *nfp_prog, u32 imm, swreg tmp_reg) { if (FIELD_FIT(RE_REG_IMM_MAX, imm)) return reg_imm(imm); @@ -618,78 +504,134 @@ wrp_br_special(struct nfp_prog *nfp_prog, enum br_mask mask, FIELD_PREP(OP_BR_SPECIAL, special); } +static void wrp_mov(struct nfp_prog *nfp_prog, swreg dst, swreg src) +{ + emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, src); +} + static void wrp_reg_mov(struct nfp_prog *nfp_prog, u16 dst, u16 src) { - emit_alu(nfp_prog, reg_both(dst), reg_none(), ALU_OP_NONE, reg_b(src)); + wrp_mov(nfp_prog, reg_both(dst), reg_b(src)); } static int -construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, - u16 src, bool src_valid, u8 size) +data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size) { unsigned int i; u16 shift, sz; - u32 tmp_reg; /* We load the value from the address indicated in @offset and then * shift out the data we don't need. Note: this is big endian! */ - sz = size < 4 ? 4 : size; + sz = max(size, 4); shift = size < 4 ? 4 - size : 0; - if (src_valid) { - /* Calculate the true offset (src_reg + imm) */ - tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog)); - emit_alu(nfp_prog, imm_both(nfp_prog), - reg_a(src), ALU_OP_ADD, tmp_reg); - /* Check packet length (size guaranteed to fit b/c it's u8) */ - emit_alu(nfp_prog, imm_a(nfp_prog), - imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size)); - emit_alu(nfp_prog, reg_none(), - NFP_BPF_ABI_LEN, ALU_OP_SUB, imm_a(nfp_prog)); - wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT); - /* Load data */ - emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0, - pkt_reg(nfp_prog), imm_b(nfp_prog), sz - 1, true); - } else { - /* Check packet length */ - tmp_reg = ur_load_imm_any(nfp_prog, offset + size, - imm_a(nfp_prog)); - emit_alu(nfp_prog, reg_none(), - NFP_BPF_ABI_LEN, ALU_OP_SUB, tmp_reg); - wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT); - /* Load data */ - tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog)); - emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0, - pkt_reg(nfp_prog), tmp_reg, sz - 1, true); - } + emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0, + pptr_reg(nfp_prog), offset, sz - 1, true); i = 0; if (shift) - emit_shf(nfp_prog, reg_both(0), reg_none(), SHF_OP_NONE, + emit_shf(nfp_prog, reg_both(dst_gpr), reg_none(), SHF_OP_NONE, reg_xfer(0), SHF_SC_R_SHF, shift * 8); else for (; i * 4 < size; i++) - emit_alu(nfp_prog, reg_both(i), - reg_none(), ALU_OP_NONE, reg_xfer(i)); + wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i)); + + if (i < 2) + wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0); + + return 0; +} + +static int +data_ld_host_order(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset, + u8 dst_gpr, int size) +{ + unsigned int i; + u8 mask, sz; + + /* We load the value from the address indicated in @offset and then + * mask out the data we don't need. Note: this is little endian! + */ + sz = max(size, 4); + mask = size < 4 ? GENMASK(size - 1, 0) : 0; + + emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, + reg_a(src_gpr), offset, sz / 4 - 1, true); + + i = 0; + if (mask) + emit_ld_field_any(nfp_prog, reg_both(dst_gpr), mask, + reg_xfer(0), SHF_SC_NONE, 0, true); + else + for (; i * 4 < size; i++) + wrp_mov(nfp_prog, reg_both(dst_gpr + i), reg_xfer(i)); if (i < 2) - wrp_immed(nfp_prog, reg_both(1), 0); + wrp_immed(nfp_prog, reg_both(dst_gpr + 1), 0); return 0; } +static int +construct_data_ind_ld(struct nfp_prog *nfp_prog, u16 offset, u16 src, u8 size) +{ + swreg tmp_reg; + + /* Calculate the true offset (src_reg + imm) */ + tmp_reg = ur_load_imm_any(nfp_prog, offset, imm_b(nfp_prog)); + emit_alu(nfp_prog, imm_both(nfp_prog), reg_a(src), ALU_OP_ADD, tmp_reg); + + /* Check packet length (size guaranteed to fit b/c it's u8) */ + emit_alu(nfp_prog, imm_a(nfp_prog), + imm_a(nfp_prog), ALU_OP_ADD, reg_imm(size)); + emit_alu(nfp_prog, reg_none(), + plen_reg(nfp_prog), ALU_OP_SUB, imm_a(nfp_prog)); + wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT); + + /* Load data */ + return data_ld(nfp_prog, imm_b(nfp_prog), 0, size); +} + static int construct_data_ld(struct nfp_prog *nfp_prog, u16 offset, u8 size) { - return construct_data_ind_ld(nfp_prog, offset, 0, false, size); + swreg tmp_reg; + + /* Check packet length */ + tmp_reg = ur_load_imm_any(nfp_prog, offset + size, imm_a(nfp_prog)); + emit_alu(nfp_prog, reg_none(), plen_reg(nfp_prog), ALU_OP_SUB, tmp_reg); + wrp_br_special(nfp_prog, BR_BLO, OP_BR_GO_ABORT); + + /* Load data */ + tmp_reg = re_load_imm_any(nfp_prog, offset, imm_b(nfp_prog)); + return data_ld(nfp_prog, tmp_reg, 0, size); +} + +static int +data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset, + u8 src_gpr, u8 size) +{ + unsigned int i; + + for (i = 0; i * 4 < size; i++) + wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i)); + + emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0, + reg_a(dst_gpr), offset, size - 1, true); + + return 0; } -static int wrp_set_mark(struct nfp_prog *nfp_prog, u8 src) +static int +data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset, + u64 imm, u8 size) { - emit_alu(nfp_prog, NFP_BPF_ABI_MARK, - reg_none(), ALU_OP_NONE, reg_b(src)); - emit_alu(nfp_prog, NFP_BPF_ABI_FLAGS, - NFP_BPF_ABI_FLAGS, ALU_OP_OR, reg_imm(NFP_BPF_ABI_FLAG_MARK)); + wrp_immed(nfp_prog, reg_xfer(0), imm); + if (size == 8) + wrp_immed(nfp_prog, reg_xfer(1), imm >> 32); + + emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0, + reg_a(dst_gpr), offset, size - 1, true); return 0; } @@ -697,7 +639,7 @@ static int wrp_set_mark(struct nfp_prog *nfp_prog, u8 src) static void wrp_alu_imm(struct nfp_prog *nfp_prog, u8 dst, enum alu_op alu_op, u32 imm) { - u32 tmp_reg; + swreg tmp_reg; if (alu_op == ALU_OP_AND) { if (!imm) @@ -815,7 +757,7 @@ wrp_cmp_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, const struct bpf_insn *insn = &meta->insn; u64 imm = insn->imm; /* sign extend */ u8 reg = insn->dst_reg * 2; - u32 tmp_reg; + swreg tmp_reg; if (insn->off < 0) /* TODO */ return -EOPNOTSUPP; @@ -844,7 +786,10 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, enum br_mask br_mask, bool swap) { const struct bpf_insn *insn = &meta->insn; - u8 areg = insn->src_reg * 2, breg = insn->dst_reg * 2; + u8 areg, breg; + + areg = insn->dst_reg * 2; + breg = insn->src_reg * 2; if (insn->off < 0) /* TODO */ return -EOPNOTSUPP; @@ -863,6 +808,14 @@ wrp_cmp_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, return 0; } +static void wrp_end32(struct nfp_prog *nfp_prog, swreg reg_in, u8 gpr_out) +{ + emit_ld_field(nfp_prog, reg_both(gpr_out), 0xf, reg_in, + SHF_SC_R_ROT, 8); + emit_ld_field(nfp_prog, reg_both(gpr_out), 0x5, reg_a(gpr_out), + SHF_SC_R_ROT, 16); +} + /* --- Callbacks --- */ static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { @@ -967,12 +920,24 @@ static int sub_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; - - if (insn->imm != 32) - return 1; /* TODO */ - - wrp_reg_mov(nfp_prog, insn->dst_reg * 2 + 1, insn->dst_reg * 2); - wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2), 0); + u8 dst = insn->dst_reg * 2; + + if (insn->imm < 32) { + emit_shf(nfp_prog, reg_both(dst + 1), + reg_a(dst + 1), SHF_OP_NONE, reg_b(dst), + SHF_SC_R_DSHF, 32 - insn->imm); + emit_shf(nfp_prog, reg_both(dst), + reg_none(), SHF_OP_NONE, reg_b(dst), + SHF_SC_L_SHF, insn->imm); + } else if (insn->imm == 32) { + wrp_reg_mov(nfp_prog, dst + 1, dst); + wrp_immed(nfp_prog, reg_both(dst), 0); + } else if (insn->imm > 32) { + emit_shf(nfp_prog, reg_both(dst + 1), + reg_none(), SHF_OP_NONE, reg_b(dst), + SHF_SC_L_SHF, insn->imm - 32); + wrp_immed(nfp_prog, reg_both(dst), 0); + } return 0; } @@ -980,12 +945,24 @@ static int shl_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int shr_imm64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; - - if (insn->imm != 32) - return 1; /* TODO */ - - wrp_reg_mov(nfp_prog, insn->dst_reg * 2, insn->dst_reg * 2 + 1); - wrp_immed(nfp_prog, reg_both(insn->dst_reg * 2 + 1), 0); + u8 dst = insn->dst_reg * 2; + + if (insn->imm < 32) { + emit_shf(nfp_prog, reg_both(dst), + reg_a(dst + 1), SHF_OP_NONE, reg_b(dst), + SHF_SC_R_DSHF, insn->imm); + emit_shf(nfp_prog, reg_both(dst + 1), + reg_none(), SHF_OP_NONE, reg_b(dst + 1), + SHF_SC_R_SHF, insn->imm); + } else if (insn->imm == 32) { + wrp_reg_mov(nfp_prog, dst, dst + 1); + wrp_immed(nfp_prog, reg_both(dst + 1), 0); + } else if (insn->imm > 32) { + emit_shf(nfp_prog, reg_both(dst), + reg_none(), SHF_OP_NONE, reg_b(dst + 1), + SHF_SC_R_SHF, insn->imm - 32); + wrp_immed(nfp_prog, reg_both(dst + 1), 0); + } return 0; } @@ -1075,6 +1052,35 @@ static int shl_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) return 0; } +static int end_reg32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + const struct bpf_insn *insn = &meta->insn; + u8 gpr = insn->dst_reg * 2; + + switch (insn->imm) { + case 16: + emit_ld_field(nfp_prog, reg_both(gpr), 0x9, reg_b(gpr), + SHF_SC_R_ROT, 8); + emit_ld_field(nfp_prog, reg_both(gpr), 0xe, reg_a(gpr), + SHF_SC_R_SHF, 16); + + wrp_immed(nfp_prog, reg_both(gpr + 1), 0); + break; + case 32: + wrp_end32(nfp_prog, reg_a(gpr), gpr); + wrp_immed(nfp_prog, reg_both(gpr + 1), 0); + break; + case 64: + wrp_mov(nfp_prog, imm_a(nfp_prog), reg_b(gpr + 1)); + + wrp_end32(nfp_prog, reg_a(gpr), gpr + 1); + wrp_end32(nfp_prog, imm_a(nfp_prog), gpr); + break; + } + + return 0; +} + static int imm_ld8_part2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { wrp_immed(nfp_prog, reg_both(nfp_meta_prev(meta)->insn.dst_reg * 2 + 1), @@ -1111,82 +1117,209 @@ static int data_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int data_ind_ld1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { return construct_data_ind_ld(nfp_prog, meta->insn.imm, - meta->insn.src_reg * 2, true, 1); + meta->insn.src_reg * 2, 1); } static int data_ind_ld2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { return construct_data_ind_ld(nfp_prog, meta->insn.imm, - meta->insn.src_reg * 2, true, 2); + meta->insn.src_reg * 2, 2); } static int data_ind_ld4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { return construct_data_ind_ld(nfp_prog, meta->insn.imm, - meta->insn.src_reg * 2, true, 4); + meta->insn.src_reg * 2, 4); } -static int mem_ldx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_ldx_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + u8 size) { - if (meta->insn.off == offsetof(struct sk_buff, len)) - emit_alu(nfp_prog, reg_both(meta->insn.dst_reg * 2), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_LEN); - else + swreg dst = reg_both(meta->insn.dst_reg * 2); + + switch (meta->insn.off) { + case offsetof(struct sk_buff, len): + if (size != FIELD_SIZEOF(struct sk_buff, len)) + return -EOPNOTSUPP; + wrp_mov(nfp_prog, dst, plen_reg(nfp_prog)); + break; + case offsetof(struct sk_buff, data): + if (size != sizeof(void *)) + return -EOPNOTSUPP; + wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog)); + break; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + if (size != sizeof(void *)) + return -EOPNOTSUPP; + emit_alu(nfp_prog, dst, + plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog)); + break; + default: return -EOPNOTSUPP; + } + + wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0); return 0; } -static int mem_ldx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_ldx_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + u8 size) { - u32 dst = reg_both(meta->insn.dst_reg * 2); + swreg dst = reg_both(meta->insn.dst_reg * 2); - if (meta->insn.off != offsetof(struct xdp_md, data) && - meta->insn.off != offsetof(struct xdp_md, data_end)) + if (size != sizeof(void *)) + return -EINVAL; + + switch (meta->insn.off) { + case offsetof(struct xdp_buff, data): + wrp_mov(nfp_prog, dst, pptr_reg(nfp_prog)); + break; + case offsetof(struct xdp_buff, data_end): + emit_alu(nfp_prog, dst, + plen_reg(nfp_prog), ALU_OP_ADD, pptr_reg(nfp_prog)); + break; + default: return -EOPNOTSUPP; + } - emit_alu(nfp_prog, dst, reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT); + wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0); - if (meta->insn.off == offsetof(struct xdp_md, data)) - return 0; + return 0; +} - emit_alu(nfp_prog, dst, dst, ALU_OP_ADD, NFP_BPF_ABI_LEN); +static int +mem_ldx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) +{ + swreg tmp_reg; - return 0; + tmp_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); + + return data_ld_host_order(nfp_prog, meta->insn.src_reg * 2, tmp_reg, + meta->insn.dst_reg * 2, size); +} + +static int +mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) +{ + if (meta->ptr.type == PTR_TO_CTX) { + if (nfp_prog->act == NN_ACT_XDP) + return mem_ldx_xdp(nfp_prog, meta, size); + else + return mem_ldx_skb(nfp_prog, meta, size); + } + + if (meta->ptr.type == PTR_TO_PACKET) + return mem_ldx_data(nfp_prog, meta, size); + + return -EOPNOTSUPP; +} + +static int mem_ldx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_ldx(nfp_prog, meta, 1); +} + +static int mem_ldx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_ldx(nfp_prog, meta, 2); } static int mem_ldx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - int ret; + return mem_ldx(nfp_prog, meta, 4); +} - if (nfp_prog->act == NN_ACT_XDP) - ret = mem_ldx4_xdp(nfp_prog, meta); - else - ret = mem_ldx4_skb(nfp_prog, meta); +static int mem_ldx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_ldx(nfp_prog, meta, 8); +} - wrp_immed(nfp_prog, reg_both(meta->insn.dst_reg * 2 + 1), 0); +static int +mem_st_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) +{ + u64 imm = meta->insn.imm; /* sign extend */ + swreg off_reg; - return ret; + off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); + + return data_st_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg, + imm, size); } -static int mem_stx4_skb(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_st(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) { - if (meta->insn.off == offsetof(struct sk_buff, mark)) - return wrp_set_mark(nfp_prog, meta->insn.src_reg * 2); + if (meta->ptr.type == PTR_TO_PACKET) + return mem_st_data(nfp_prog, meta, size); return -EOPNOTSUPP; } -static int mem_stx4_xdp(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_st1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_st(nfp_prog, meta, 1); +} + +static int mem_st2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + return mem_st(nfp_prog, meta, 2); +} + +static int mem_st4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_st(nfp_prog, meta, 4); +} + +static int mem_st8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_st(nfp_prog, meta, 8); +} + +static int +mem_stx_data(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) +{ + swreg off_reg; + + off_reg = re_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog)); + + return data_stx_host_order(nfp_prog, meta->insn.dst_reg * 2, off_reg, + meta->insn.src_reg * 2, size); +} + +static int +mem_stx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + unsigned int size) +{ + if (meta->ptr.type == PTR_TO_PACKET) + return mem_stx_data(nfp_prog, meta, size); + return -EOPNOTSUPP; } +static int mem_stx1(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_stx(nfp_prog, meta, 1); +} + +static int mem_stx2(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_stx(nfp_prog, meta, 2); +} + static int mem_stx4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - if (nfp_prog->act == NN_ACT_XDP) - return mem_stx4_xdp(nfp_prog, meta); - return mem_stx4_skb(nfp_prog, meta); + return mem_stx(nfp_prog, meta, 4); +} + +static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +{ + return mem_stx(nfp_prog, meta, 8); } static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) @@ -1202,8 +1335,10 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; u64 imm = insn->imm; /* sign extend */ - u32 or1 = reg_a(insn->dst_reg * 2), or2 = reg_b(insn->dst_reg * 2 + 1); - u32 tmp_reg; + swreg or1, or2, tmp_reg; + + or1 = reg_a(insn->dst_reg * 2); + or2 = reg_b(insn->dst_reg * 2 + 1); if (insn->off < 0) /* TODO */ return -EOPNOTSUPP; @@ -1230,29 +1365,29 @@ static int jeq_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int jgt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_imm(nfp_prog, meta, BR_BLO, false); + return wrp_cmp_imm(nfp_prog, meta, BR_BLO, true); } static int jge_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_imm(nfp_prog, meta, BR_BHS, true); + return wrp_cmp_imm(nfp_prog, meta, BR_BHS, false); } static int jlt_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_imm(nfp_prog, meta, BR_BHS, false); + return wrp_cmp_imm(nfp_prog, meta, BR_BLO, false); } static int jle_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_imm(nfp_prog, meta, BR_BLO, true); + return wrp_cmp_imm(nfp_prog, meta, BR_BHS, true); } static int jset_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; u64 imm = insn->imm; /* sign extend */ - u32 tmp_reg; + swreg tmp_reg; if (insn->off < 0) /* TODO */ return -EOPNOTSUPP; @@ -1283,7 +1418,7 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { const struct bpf_insn *insn = &meta->insn; u64 imm = insn->imm; /* sign extend */ - u32 tmp_reg; + swreg tmp_reg; if (insn->off < 0) /* TODO */ return -EOPNOTSUPP; @@ -1292,6 +1427,7 @@ static int jne_imm(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) emit_alu(nfp_prog, reg_none(), reg_a(insn->dst_reg * 2), ALU_OP_OR, reg_b(insn->dst_reg * 2 + 1)); emit_br(nfp_prog, BR_BNE, insn->off, 0); + return 0; } tmp_reg = ur_load_imm_any(nfp_prog, imm & ~0U, imm_b(nfp_prog)); @@ -1327,22 +1463,22 @@ static int jeq_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) static int jgt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_reg(nfp_prog, meta, BR_BLO, false); + return wrp_cmp_reg(nfp_prog, meta, BR_BLO, true); } static int jge_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_reg(nfp_prog, meta, BR_BHS, true); + return wrp_cmp_reg(nfp_prog, meta, BR_BHS, false); } static int jlt_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_reg(nfp_prog, meta, BR_BHS, false); + return wrp_cmp_reg(nfp_prog, meta, BR_BLO, false); } static int jle_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { - return wrp_cmp_reg(nfp_prog, meta, BR_BLO, true); + return wrp_cmp_reg(nfp_prog, meta, BR_BHS, true); } static int jset_reg(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) @@ -1390,6 +1526,7 @@ static const instr_cb_t instr_cb[256] = { [BPF_ALU | BPF_SUB | BPF_X] = sub_reg, [BPF_ALU | BPF_SUB | BPF_K] = sub_imm, [BPF_ALU | BPF_LSH | BPF_K] = shl_imm, + [BPF_ALU | BPF_END | BPF_X] = end_reg32, [BPF_LD | BPF_IMM | BPF_DW] = imm_ld8, [BPF_LD | BPF_ABS | BPF_B] = data_ld1, [BPF_LD | BPF_ABS | BPF_H] = data_ld2, @@ -1397,8 +1534,18 @@ static const instr_cb_t instr_cb[256] = { [BPF_LD | BPF_IND | BPF_B] = data_ind_ld1, [BPF_LD | BPF_IND | BPF_H] = data_ind_ld2, [BPF_LD | BPF_IND | BPF_W] = data_ind_ld4, + [BPF_LDX | BPF_MEM | BPF_B] = mem_ldx1, + [BPF_LDX | BPF_MEM | BPF_H] = mem_ldx2, [BPF_LDX | BPF_MEM | BPF_W] = mem_ldx4, + [BPF_LDX | BPF_MEM | BPF_DW] = mem_ldx8, + [BPF_STX | BPF_MEM | BPF_B] = mem_stx1, + [BPF_STX | BPF_MEM | BPF_H] = mem_stx2, [BPF_STX | BPF_MEM | BPF_W] = mem_stx4, + [BPF_STX | BPF_MEM | BPF_DW] = mem_stx8, + [BPF_ST | BPF_MEM | BPF_B] = mem_st1, + [BPF_ST | BPF_MEM | BPF_H] = mem_st2, + [BPF_ST | BPF_MEM | BPF_W] = mem_st4, + [BPF_ST | BPF_MEM | BPF_DW] = mem_st8, [BPF_JMP | BPF_JA | BPF_K] = jump, [BPF_JMP | BPF_JEQ | BPF_K] = jeq_imm, [BPF_JMP | BPF_JGT | BPF_K] = jgt_imm, @@ -1510,8 +1657,9 @@ static int nfp_fixup_branches(struct nfp_prog *nfp_prog) static void nfp_intro(struct nfp_prog *nfp_prog) { - emit_alu(nfp_prog, pkt_reg(nfp_prog), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_PKT); + wrp_immed(nfp_prog, plen_reg(nfp_prog), GENMASK(13, 0)); + emit_alu(nfp_prog, plen_reg(nfp_prog), + plen_reg(nfp_prog), ALU_OP_AND, pv_len(nfp_prog)); } static void nfp_outro_tc_legacy(struct nfp_prog *nfp_prog) @@ -1534,8 +1682,7 @@ static void nfp_outro_tc_legacy(struct nfp_prog *nfp_prog) * ife + tx 0x24 -> redir, count as stat1 */ emit_br_byte_neq(nfp_prog, reg_b(0), 0xff, 0, nfp_prog->tgt_done, 2); - emit_alu(nfp_prog, reg_a(0), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS); + wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS); emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16); emit_br(nfp_prog, BR_UNC, nfp_prog->tgt_done, 1); @@ -1562,8 +1709,7 @@ static void nfp_outro_tc_da(struct nfp_prog *nfp_prog) emit_br_def(nfp_prog, nfp_prog->tgt_done, 2); - emit_alu(nfp_prog, reg_a(0), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS); + wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS); emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x11), SHF_SC_L_SHF, 16); /* Target for normal exits */ @@ -1572,8 +1718,7 @@ static void nfp_outro_tc_da(struct nfp_prog *nfp_prog) /* if R0 > 7 jump to abort */ emit_alu(nfp_prog, reg_none(), reg_imm(7), ALU_OP_SUB, reg_b(0)); emit_br(nfp_prog, BR_BLO, nfp_prog->tgt_abort, 0); - emit_alu(nfp_prog, reg_a(0), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS); + wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS); wrp_immed(nfp_prog, reg_b(2), 0x41221211); wrp_immed(nfp_prog, reg_b(3), 0x41001211); @@ -1610,8 +1755,7 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog) emit_br_def(nfp_prog, nfp_prog->tgt_done, 2); - emit_alu(nfp_prog, reg_a(0), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS); + wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS); emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_imm(0x82), SHF_SC_L_SHF, 16); /* Target for normal exits */ @@ -1632,8 +1776,7 @@ static void nfp_outro_xdp(struct nfp_prog *nfp_prog) emit_br_def(nfp_prog, nfp_prog->tgt_done, 2); - emit_alu(nfp_prog, reg_a(0), - reg_none(), ALU_OP_NONE, NFP_BPF_ABI_FLAGS); + wrp_mov(nfp_prog, reg_a(0), NFP_BPF_ABI_FLAGS); emit_ld_field(nfp_prog, reg_a(0), 0xc, reg_b(2), SHF_SC_L_SHF, 16); } @@ -1656,7 +1799,7 @@ static void nfp_outro(struct nfp_prog *nfp_prog) static int nfp_translate(struct nfp_prog *nfp_prog) { struct nfp_insn_meta *meta; - int err; + int i, err; nfp_intro(nfp_prog); if (nfp_prog->error) @@ -1688,6 +1831,11 @@ static int nfp_translate(struct nfp_prog *nfp_prog) if (nfp_prog->error) return nfp_prog->error; + for (i = 0; i < NFP_USTORE_PREFETCH_WINDOW; i++) + emit_nop(nfp_prog); + if (nfp_prog->error) + return nfp_prog->error; + return nfp_fixup_branches(nfp_prog); } @@ -1737,38 +1885,6 @@ static void nfp_bpf_opt_reg_init(struct nfp_prog *nfp_prog) } } -/* Try to rename registers so that program uses only low ones */ -static int nfp_bpf_opt_reg_rename(struct nfp_prog *nfp_prog) -{ - bool reg_used[MAX_BPF_REG] = {}; - u8 tgt_reg[MAX_BPF_REG] = {}; - struct nfp_insn_meta *meta; - unsigned int i, j; - - list_for_each_entry(meta, &nfp_prog->insns, l) { - if (meta->skip) - continue; - - reg_used[meta->insn.src_reg] = true; - reg_used[meta->insn.dst_reg] = true; - } - - for (i = 0, j = 0; i < ARRAY_SIZE(tgt_reg); i++) { - if (!reg_used[i]) - continue; - - tgt_reg[i] = j++; - } - nfp_prog->num_regs = j; - - list_for_each_entry(meta, &nfp_prog->insns, l) { - meta->insn.src_reg = tgt_reg[meta->insn.src_reg]; - meta->insn.dst_reg = tgt_reg[meta->insn.dst_reg]; - } - - return 0; -} - /* Remove masking after load since our load guarantees this is not needed */ static void nfp_bpf_opt_ld_mask(struct nfp_prog *nfp_prog) { @@ -1845,20 +1961,33 @@ static void nfp_bpf_opt_ld_shift(struct nfp_prog *nfp_prog) static int nfp_bpf_optimize(struct nfp_prog *nfp_prog) { - int ret; - nfp_bpf_opt_reg_init(nfp_prog); - ret = nfp_bpf_opt_reg_rename(nfp_prog); - if (ret) - return ret; - nfp_bpf_opt_ld_mask(nfp_prog); nfp_bpf_opt_ld_shift(nfp_prog); return 0; } +static int nfp_bpf_ustore_calc(struct nfp_prog *nfp_prog, __le64 *ustore) +{ + int i; + + for (i = 0; i < nfp_prog->prog_len; i++) { + int err; + + err = nfp_ustore_check_valid_no_ecc(nfp_prog->prog[i]); + if (err) + return err; + + nfp_prog->prog[i] = nfp_ustore_calc_ecc_insn(nfp_prog->prog[i]); + + ustore[i] = cpu_to_le64(nfp_prog->prog[i]); + } + + return 0; +} + /** * nfp_bpf_jit() - translate BPF code into NFP assembly * @filter: kernel BPF filter struct @@ -1899,10 +2028,8 @@ nfp_bpf_jit(struct bpf_prog *filter, void *prog_mem, if (ret) goto out; - if (nfp_prog->num_regs <= 7) - nfp_prog->regs_per_thread = 16; - else - nfp_prog->regs_per_thread = 32; + nfp_prog->num_regs = MAX_BPF_REG; + nfp_prog->regs_per_thread = 32; nfp_prog->prog = prog_mem; nfp_prog->__prog_alloc_len = prog_sz; @@ -1912,10 +2039,13 @@ nfp_bpf_jit(struct bpf_prog *filter, void *prog_mem, pr_err("Translation failed with error %d (translated: %u)\n", ret, nfp_prog->n_translated); ret = -EINVAL; + goto out; } + ret = nfp_bpf_ustore_calc(nfp_prog, (__force __le64 *)prog_mem); + res->n_instr = nfp_prog->prog_len; - res->dense_mode = nfp_prog->num_regs <= 7; + res->dense_mode = false; out: nfp_prog_free(nfp_prog); diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c index be2cf10a2cd7..6e74f8db1cc1 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c @@ -42,9 +42,11 @@ static bool nfp_net_ebpf_capable(struct nfp_net *nn) { +#ifdef __LITTLE_ENDIAN if (nn->cap & NFP_NET_CFG_CTRL_BPF && nn_readb(nn, NFP_NET_CFG_BPF_ABI) == NFP_NET_BPF_ABI) return true; +#endif return false; } @@ -89,14 +91,6 @@ nfp_bpf_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id) struct nfp_net_bpf_priv *priv; int ret; - /* Limit to single port, otherwise it's just a NIC */ - if (id > 0) { - nfp_warn(app->cpp, - "BPF NIC doesn't support more than one port right now\n"); - nn->port = nfp_port_alloc(app, NFP_PORT_INVALID, nn->dp.netdev); - return PTR_ERR_OR_ZERO(nn->port); - } - priv = kmalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index 4051e943f363..d77e88a45409 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -36,9 +36,11 @@ #include <linux/bitfield.h> #include <linux/bpf.h> +#include <linux/bpf_verifier.h> #include <linux/list.h> #include <linux/types.h> +#include "../nfp_asm.h" #include "../nfp_net.h" /* For branch fixup logic use up-most byte of branch instruction as scratch @@ -53,9 +55,13 @@ enum br_special { }; enum static_regs { - STATIC_REG_PKT = 1, -#define REG_PKT_BANK ALU_DST_A - STATIC_REG_IMM = 2, /* Bank AB */ + STATIC_REG_IMM = 21, /* Bank AB */ + STATIC_REG_PKT_LEN = 22, /* Bank B */ +}; + +enum pkt_vec { + PKT_VEC_PKT_LEN = 0, + PKT_VEC_PKT_PTR = 2, }; enum nfp_bpf_action_type { @@ -65,39 +71,17 @@ enum nfp_bpf_action_type { NN_ACT_XDP, }; -/* Software register representation, hardware encoding in asm.h */ -#define NN_REG_TYPE GENMASK(31, 24) -#define NN_REG_VAL GENMASK(7, 0) - -enum nfp_bpf_reg_type { - NN_REG_GPR_A = BIT(0), - NN_REG_GPR_B = BIT(1), - NN_REG_NNR = BIT(2), - NN_REG_XFER = BIT(3), - NN_REG_IMM = BIT(4), - NN_REG_NONE = BIT(5), -}; - -#define NN_REG_GPR_BOTH (NN_REG_GPR_A | NN_REG_GPR_B) - -#define reg_both(x) ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_BOTH)) -#define reg_a(x) ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_A)) -#define reg_b(x) ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_GPR_B)) -#define reg_nnr(x) ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_NNR)) -#define reg_xfer(x) ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_XFER)) -#define reg_imm(x) ((x) | FIELD_PREP(NN_REG_TYPE, NN_REG_IMM)) -#define reg_none() (FIELD_PREP(NN_REG_TYPE, NN_REG_NONE)) +#define pv_len(np) reg_lm(1, PKT_VEC_PKT_LEN) +#define pv_ctm_ptr(np) reg_lm(1, PKT_VEC_PKT_PTR) -#define pkt_reg(np) reg_a((np)->regs_per_thread - STATIC_REG_PKT) -#define imm_a(np) reg_a((np)->regs_per_thread - STATIC_REG_IMM) -#define imm_b(np) reg_b((np)->regs_per_thread - STATIC_REG_IMM) -#define imm_both(np) reg_both((np)->regs_per_thread - STATIC_REG_IMM) +#define plen_reg(np) reg_b(STATIC_REG_PKT_LEN) +#define pptr_reg(np) pv_ctm_ptr(np) +#define imm_a(np) reg_a(STATIC_REG_IMM) +#define imm_b(np) reg_b(STATIC_REG_IMM) +#define imm_both(np) reg_both(STATIC_REG_IMM) -#define NFP_BPF_ABI_FLAGS reg_nnr(0) +#define NFP_BPF_ABI_FLAGS reg_imm(0) #define NFP_BPF_ABI_FLAG_MARK 1 -#define NFP_BPF_ABI_MARK reg_nnr(1) -#define NFP_BPF_ABI_PKT reg_nnr(2) -#define NFP_BPF_ABI_LEN reg_nnr(3) struct nfp_prog; struct nfp_insn_meta; @@ -113,6 +97,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); /** * struct nfp_insn_meta - BPF instruction wrapper * @insn: BPF instruction + * @ptr: pointer type for memory operations * @off: index of first generated machine instruction (in nfp_prog.prog) * @n: eBPF instruction number * @skip: skip this instruction (optimized out) @@ -121,6 +106,7 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *); */ struct nfp_insn_meta { struct bpf_insn insn; + struct bpf_reg_state ptr; unsigned int off; unsigned short n; bool skip; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 5b783a91b115..e361c0e3b788 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -112,12 +112,19 @@ nfp_bpf_check_exit(struct nfp_prog *nfp_prog, } static int -nfp_bpf_check_ctx_ptr(struct nfp_prog *nfp_prog, - const struct bpf_verifier_env *env, u8 reg) +nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + const struct bpf_verifier_env *env, u8 reg) { - if (env->cur_state.regs[reg].type != PTR_TO_CTX) + if (env->cur_state.regs[reg].type != PTR_TO_CTX && + env->cur_state.regs[reg].type != PTR_TO_PACKET) return -EINVAL; + if (meta->ptr.type != NOT_INIT && + meta->ptr.type != env->cur_state.regs[reg].type) + return -EINVAL; + + meta->ptr = env->cur_state.regs[reg]; + return 0; } @@ -145,11 +152,11 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) return nfp_bpf_check_exit(priv->prog, env); if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_LDX | BPF_MEM)) - return nfp_bpf_check_ctx_ptr(priv->prog, env, - meta->insn.src_reg); + return nfp_bpf_check_ptr(priv->prog, meta, env, + meta->insn.src_reg); if ((meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM)) - return nfp_bpf_check_ctx_ptr(priv->prog, env, - meta->insn.dst_reg); + return nfp_bpf_check_ptr(priv->prog, meta, env, + meta->insn.dst_reg); return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c index 38f3835ae176..1194c47ef827 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/action.c +++ b/drivers/net/ethernet/netronome/nfp/flower/action.c @@ -36,6 +36,7 @@ #include <net/switchdev.h> #include <net/tc_act/tc_gact.h> #include <net/tc_act/tc_mirred.h> +#include <net/tc_act/tc_pedit.h> #include <net/tc_act/tc_vlan.h> #include <net/tc_act/tc_tunnel_key.h> @@ -223,6 +224,247 @@ nfp_fl_set_vxlan(struct nfp_fl_set_vxlan *set_vxlan, return 0; } +static void nfp_fl_set_helper32(u32 value, u32 mask, u8 *p_exact, u8 *p_mask) +{ + u32 oldvalue = get_unaligned((u32 *)p_exact); + u32 oldmask = get_unaligned((u32 *)p_mask); + + value &= mask; + value |= oldvalue & ~mask; + + put_unaligned(oldmask | mask, (u32 *)p_mask); + put_unaligned(value, (u32 *)p_exact); +} + +static int +nfp_fl_set_eth(const struct tc_action *action, int idx, u32 off, + struct nfp_fl_set_eth *set_eth) +{ + u16 tmp_set_eth_op; + u32 exact, mask; + + if (off + 4 > ETH_ALEN * 2) + return -EOPNOTSUPP; + + mask = ~tcf_pedit_mask(action, idx); + exact = tcf_pedit_val(action, idx); + + if (exact & ~mask) + return -EOPNOTSUPP; + + nfp_fl_set_helper32(exact, mask, &set_eth->eth_addr_val[off], + &set_eth->eth_addr_mask[off]); + + set_eth->reserved = cpu_to_be16(0); + tmp_set_eth_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, + sizeof(*set_eth) >> NFP_FL_LW_SIZ) | + FIELD_PREP(NFP_FL_ACT_JMP_ID, + NFP_FL_ACTION_OPCODE_SET_ETHERNET); + set_eth->a_op = cpu_to_be16(tmp_set_eth_op); + + return 0; +} + +static int +nfp_fl_set_ip4(const struct tc_action *action, int idx, u32 off, + struct nfp_fl_set_ip4_addrs *set_ip_addr) +{ + u16 tmp_set_ipv4_op; + __be32 exact, mask; + + /* We are expecting tcf_pedit to return a big endian value */ + mask = (__force __be32)~tcf_pedit_mask(action, idx); + exact = (__force __be32)tcf_pedit_val(action, idx); + + if (exact & ~mask) + return -EOPNOTSUPP; + + switch (off) { + case offsetof(struct iphdr, daddr): + set_ip_addr->ipv4_dst_mask = mask; + set_ip_addr->ipv4_dst = exact; + break; + case offsetof(struct iphdr, saddr): + set_ip_addr->ipv4_src_mask = mask; + set_ip_addr->ipv4_src = exact; + break; + default: + return -EOPNOTSUPP; + } + + set_ip_addr->reserved = cpu_to_be16(0); + tmp_set_ipv4_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, + sizeof(*set_ip_addr) >> NFP_FL_LW_SIZ) | + FIELD_PREP(NFP_FL_ACT_JMP_ID, + NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS); + set_ip_addr->a_op = cpu_to_be16(tmp_set_ipv4_op); + + return 0; +} + +static void +nfp_fl_set_ip6_helper(int opcode_tag, int idx, __be32 exact, __be32 mask, + struct nfp_fl_set_ipv6_addr *ip6) +{ + u16 tmp_set_op; + + ip6->ipv6[idx % 4].mask = mask; + ip6->ipv6[idx % 4].exact = exact; + + ip6->reserved = cpu_to_be16(0); + tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, sizeof(*ip6) >> + NFP_FL_LW_SIZ) | + FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode_tag); + ip6->a_op = cpu_to_be16(tmp_set_op); +} + +static int +nfp_fl_set_ip6(const struct tc_action *action, int idx, u32 off, + struct nfp_fl_set_ipv6_addr *ip_dst, + struct nfp_fl_set_ipv6_addr *ip_src) +{ + __be32 exact, mask; + + /* We are expecting tcf_pedit to return a big endian value */ + mask = (__force __be32)~tcf_pedit_mask(action, idx); + exact = (__force __be32)tcf_pedit_val(action, idx); + + if (exact & ~mask) + return -EOPNOTSUPP; + + if (off < offsetof(struct ipv6hdr, saddr)) + return -EOPNOTSUPP; + else if (off < offsetof(struct ipv6hdr, daddr)) + nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_SRC, idx, + exact, mask, ip_src); + else if (off < offsetof(struct ipv6hdr, daddr) + + sizeof(struct in6_addr)) + nfp_fl_set_ip6_helper(NFP_FL_ACTION_OPCODE_SET_IPV6_DST, idx, + exact, mask, ip_dst); + else + return -EOPNOTSUPP; + + return 0; +} + +static int +nfp_fl_set_tport(const struct tc_action *action, int idx, u32 off, + struct nfp_fl_set_tport *set_tport, int opcode) +{ + u32 exact, mask; + u16 tmp_set_op; + + if (off) + return -EOPNOTSUPP; + + mask = ~tcf_pedit_mask(action, idx); + exact = tcf_pedit_val(action, idx); + + if (exact & ~mask) + return -EOPNOTSUPP; + + nfp_fl_set_helper32(exact, mask, set_tport->tp_port_val, + set_tport->tp_port_mask); + + set_tport->reserved = cpu_to_be16(0); + tmp_set_op = FIELD_PREP(NFP_FL_ACT_LEN_LW, + sizeof(*set_tport) >> NFP_FL_LW_SIZ); + tmp_set_op |= FIELD_PREP(NFP_FL_ACT_JMP_ID, opcode); + set_tport->a_op = cpu_to_be16(tmp_set_op); + + return 0; +} + +static int +nfp_fl_pedit(const struct tc_action *action, char *nfp_action, int *a_len) +{ + struct nfp_fl_set_ipv6_addr set_ip6_dst, set_ip6_src; + struct nfp_fl_set_ip4_addrs set_ip_addr; + struct nfp_fl_set_tport set_tport; + struct nfp_fl_set_eth set_eth; + enum pedit_header_type htype; + int idx, nkeys, err; + size_t act_size; + u32 offset, cmd; + + memset(&set_ip6_dst, 0, sizeof(set_ip6_dst)); + memset(&set_ip6_src, 0, sizeof(set_ip6_src)); + memset(&set_ip_addr, 0, sizeof(set_ip_addr)); + memset(&set_tport, 0, sizeof(set_tport)); + memset(&set_eth, 0, sizeof(set_eth)); + nkeys = tcf_pedit_nkeys(action); + + for (idx = 0; idx < nkeys; idx++) { + cmd = tcf_pedit_cmd(action, idx); + htype = tcf_pedit_htype(action, idx); + offset = tcf_pedit_offset(action, idx); + + if (cmd != TCA_PEDIT_KEY_EX_CMD_SET) + return -EOPNOTSUPP; + + switch (htype) { + case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: + err = nfp_fl_set_eth(action, idx, offset, &set_eth); + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: + err = nfp_fl_set_ip4(action, idx, offset, &set_ip_addr); + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: + err = nfp_fl_set_ip6(action, idx, offset, &set_ip6_dst, + &set_ip6_src); + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: + err = nfp_fl_set_tport(action, idx, offset, &set_tport, + NFP_FL_ACTION_OPCODE_SET_TCP); + break; + case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: + err = nfp_fl_set_tport(action, idx, offset, &set_tport, + NFP_FL_ACTION_OPCODE_SET_UDP); + break; + default: + return -EOPNOTSUPP; + } + if (err) + return err; + } + + if (set_eth.a_op) { + act_size = sizeof(set_eth); + memcpy(nfp_action, &set_eth, act_size); + *a_len += act_size; + } else if (set_ip_addr.a_op) { + act_size = sizeof(set_ip_addr); + memcpy(nfp_action, &set_ip_addr, act_size); + *a_len += act_size; + } else if (set_ip6_dst.a_op && set_ip6_src.a_op) { + /* TC compiles set src and dst IPv6 address as a single action, + * the hardware requires this to be 2 separate actions. + */ + act_size = sizeof(set_ip6_src); + memcpy(nfp_action, &set_ip6_src, act_size); + *a_len += act_size; + + act_size = sizeof(set_ip6_dst); + memcpy(&nfp_action[sizeof(set_ip6_src)], &set_ip6_dst, + act_size); + *a_len += act_size; + } else if (set_ip6_dst.a_op) { + act_size = sizeof(set_ip6_dst); + memcpy(nfp_action, &set_ip6_dst, act_size); + *a_len += act_size; + } else if (set_ip6_src.a_op) { + act_size = sizeof(set_ip6_src); + memcpy(nfp_action, &set_ip6_src, act_size); + *a_len += act_size; + } else if (set_tport.a_op) { + act_size = sizeof(set_tport); + memcpy(nfp_action, &set_tport, act_size); + *a_len += act_size; + } + + return 0; +} + static int nfp_flower_loop_action(const struct tc_action *a, struct nfp_fl_payload *nfp_fl, int *a_len, @@ -301,6 +543,9 @@ nfp_flower_loop_action(const struct tc_action *a, } else if (is_tcf_tunnel_release(a)) { /* Tunnel decap is handled by default so accept action. */ return 0; + } else if (is_tcf_pedit(a)) { + if (nfp_fl_pedit(a, &nfp_fl->action_data[*a_len], a_len)) + return -EOPNOTSUPP; } else { /* Currently we do not handle any other actions. */ return -EOPNOTSUPP; diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h index 504ddaa21701..f7b7242a22bc 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h +++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h @@ -57,6 +57,11 @@ #define NFP_FLOWER_MASK_VLAN_CFI BIT(12) #define NFP_FLOWER_MASK_VLAN_VID GENMASK(11, 0) +#define NFP_FLOWER_MASK_MPLS_LB GENMASK(31, 12) +#define NFP_FLOWER_MASK_MPLS_TC GENMASK(11, 9) +#define NFP_FLOWER_MASK_MPLS_BOS BIT(8) +#define NFP_FLOWER_MASK_MPLS_Q BIT(0) + #define NFP_FL_SC_ACT_DROP 0x80000000 #define NFP_FL_SC_ACT_USER 0x7D000000 #define NFP_FL_SC_ACT_POPV 0x6A000000 @@ -72,6 +77,12 @@ #define NFP_FL_ACTION_OPCODE_PUSH_VLAN 1 #define NFP_FL_ACTION_OPCODE_POP_VLAN 2 #define NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL 6 +#define NFP_FL_ACTION_OPCODE_SET_ETHERNET 7 +#define NFP_FL_ACTION_OPCODE_SET_IPV4_ADDRS 9 +#define NFP_FL_ACTION_OPCODE_SET_IPV6_SRC 11 +#define NFP_FL_ACTION_OPCODE_SET_IPV6_DST 12 +#define NFP_FL_ACTION_OPCODE_SET_UDP 14 +#define NFP_FL_ACTION_OPCODE_SET_TCP 15 #define NFP_FL_ACTION_OPCODE_PRE_TUNNEL 17 #define NFP_FL_ACTION_OPCODE_NUM 32 @@ -102,6 +113,38 @@ enum nfp_flower_tun_type { NFP_FL_TUNNEL_VXLAN = 2, }; +struct nfp_fl_set_eth { + __be16 a_op; + __be16 reserved; + u8 eth_addr_mask[ETH_ALEN * 2]; + u8 eth_addr_val[ETH_ALEN * 2]; +}; + +struct nfp_fl_set_ip4_addrs { + __be16 a_op; + __be16 reserved; + __be32 ipv4_src_mask; + __be32 ipv4_src; + __be32 ipv4_dst_mask; + __be32 ipv4_dst; +}; + +struct nfp_fl_set_ipv6_addr { + __be16 a_op; + __be16 reserved; + struct { + __be32 mask; + __be32 exact; + } ipv6[4]; +}; + +struct nfp_fl_set_tport { + __be16 a_op; + __be16 reserved; + u8 tp_port_mask[4]; + u8 tp_port_val[4]; +}; + struct nfp_fl_output { __be16 a_op; __be16 flags; diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index 865a815ab92a..60614d4f0e22 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -111,8 +111,21 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *frame, ether_addr_copy(frame->mac_src, &addr->src[0]); } - if (mask_version) - frame->mpls_lse = cpu_to_be32(~0); + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *mpls; + u32 t_mpls; + + mpls = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_MPLS, + target); + + t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, mpls->mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, mpls->mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, mpls->mpls_bos) | + NFP_FLOWER_MASK_MPLS_Q; + + frame->mpls_lse = cpu_to_be32(t_mpls); + } } static void @@ -143,7 +156,6 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame, struct flow_dissector_key_ipv4_addrs *addr; struct flow_dissector_key_basic *basic; - /* Wildcard TOS/TTL for now. */ memset(frame, 0, sizeof(struct nfp_flower_ipv4)); if (dissector_uses_key(flow->dissector, @@ -161,6 +173,16 @@ nfp_flower_compile_ipv4(struct nfp_flower_ipv4 *frame, target); frame->proto = basic->ip_proto; } + + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) { + struct flow_dissector_key_ip *flow_ip; + + flow_ip = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_IP, + target); + frame->tos = flow_ip->tos; + frame->ttl = flow_ip->ttl; + } } static void @@ -172,7 +194,6 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame, struct flow_dissector_key_ipv6_addrs *addr; struct flow_dissector_key_basic *basic; - /* Wildcard LABEL/TOS/TTL for now. */ memset(frame, 0, sizeof(struct nfp_flower_ipv6)); if (dissector_uses_key(flow->dissector, @@ -190,6 +211,16 @@ nfp_flower_compile_ipv6(struct nfp_flower_ipv6 *frame, target); frame->proto = basic->ip_proto; } + + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) { + struct flow_dissector_key_ip *flow_ip; + + flow_ip = skb_flow_dissector_target(flow->dissector, + FLOW_DISSECTOR_KEY_IP, + target); + frame->tos = flow_ip->tos; + frame->ttl = flow_ip->ttl; + } } static void diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index 3d9537ebdea4..6f239c27964e 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -57,6 +57,7 @@ BIT(FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS) | \ BIT(FLOW_DISSECTOR_KEY_ENC_CONTROL) | \ BIT(FLOW_DISSECTOR_KEY_ENC_PORTS) | \ + BIT(FLOW_DISSECTOR_KEY_MPLS) | \ BIT(FLOW_DISSECTOR_KEY_IP)) #define NFP_FLOWER_WHITELIST_TUN_DISSECTOR \ @@ -134,7 +135,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, { struct flow_dissector_key_basic *mask_basic = NULL; struct flow_dissector_key_basic *key_basic = NULL; - struct flow_dissector_key_ip *mask_ip = NULL; u32 key_layer_two; u8 key_layer; int key_size; @@ -206,28 +206,15 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, flow->key); } - if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_IP)) - mask_ip = skb_flow_dissector_target(flow->dissector, - FLOW_DISSECTOR_KEY_IP, - flow->mask); - if (mask_basic && mask_basic->n_proto) { /* Ethernet type is present in the key. */ switch (key_basic->n_proto) { case cpu_to_be16(ETH_P_IP): - if (mask_ip && mask_ip->tos) - return -EOPNOTSUPP; - if (mask_ip && mask_ip->ttl) - return -EOPNOTSUPP; key_layer |= NFP_FLOWER_LAYER_IPV4; key_size += sizeof(struct nfp_flower_ipv4); break; case cpu_to_be16(ETH_P_IPV6): - if (mask_ip && mask_ip->tos) - return -EOPNOTSUPP; - if (mask_ip && mask_ip->ttl) - return -EOPNOTSUPP; key_layer |= NFP_FLOWER_LAYER_IPV6; key_size += sizeof(struct nfp_flower_ipv6); break; @@ -238,11 +225,6 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, case cpu_to_be16(ETH_P_ARP): return -EOPNOTSUPP; - /* Currently we do not offload MPLS. */ - case cpu_to_be16(ETH_P_MPLS_UC): - case cpu_to_be16(ETH_P_MPLS_MC): - return -EOPNOTSUPP; - /* Will be included in layer 2. */ case cpu_to_be16(ETH_P_8021Q): break; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.c b/drivers/net/ethernet/netronome/nfp/nfp_app.c index 82c290763529..5d9e2eba5b49 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include <linux/bug.h> #include <linux/skbuff.h> #include <linux/slab.h> diff --git a/drivers/net/ethernet/netronome/nfp/nfp_app.h b/drivers/net/ethernet/netronome/nfp/nfp_app.h index af640b5c2108..857bb33020ba 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_app.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_app.h @@ -36,6 +36,8 @@ #include <net/devlink.h> +#include <trace/events/devlink.h> + #include "nfp_net_repr.h" struct bpf_prog; @@ -271,11 +273,17 @@ static inline int nfp_app_xdp_offload(struct nfp_app *app, struct nfp_net *nn, static inline bool nfp_app_ctrl_tx(struct nfp_app *app, struct sk_buff *skb) { + trace_devlink_hwmsg(priv_to_devlink(app->pf), false, 0, + skb->data, skb->len); + return nfp_ctrl_tx(app->ctrl, skb); } static inline void nfp_app_ctrl_rx(struct nfp_app *app, struct sk_buff *skb) { + trace_devlink_hwmsg(priv_to_devlink(app->pf), true, 0, + skb->data, skb->len); + app->type->ctrl_msg_rx(app, skb); } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c new file mode 100644 index 000000000000..830f6de25f47 --- /dev/null +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c @@ -0,0 +1,257 @@ +/* + * Copyright (C) 2016-2017 Netronome Systems, Inc. + * + * This software is dual licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree or the BSD 2-Clause License provided below. You have the + * option to license this software under the complete terms of either license. + * + * The BSD 2-Clause License: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * 1. Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * 2. Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <linux/bitops.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "nfp_asm.h" + +const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = { + [CMD_TGT_WRITE8_SWAP] = { 0x02, 0x42 }, + [CMD_TGT_READ8] = { 0x01, 0x43 }, + [CMD_TGT_READ32] = { 0x00, 0x5c }, + [CMD_TGT_READ32_LE] = { 0x01, 0x5c }, + [CMD_TGT_READ32_SWAP] = { 0x02, 0x5c }, + [CMD_TGT_READ_LE] = { 0x01, 0x40 }, + [CMD_TGT_READ_SWAP_LE] = { 0x03, 0x40 }, +}; + +static u16 nfp_swreg_to_unreg(swreg reg, bool is_dst) +{ + bool lm_id, lm_dec = false; + u16 val = swreg_value(reg); + + switch (swreg_type(reg)) { + case NN_REG_GPR_A: + case NN_REG_GPR_B: + case NN_REG_GPR_BOTH: + return val; + case NN_REG_NNR: + return UR_REG_NN | val; + case NN_REG_XFER: + return UR_REG_XFR | val; + case NN_REG_LMEM: + lm_id = swreg_lm_idx(reg); + + switch (swreg_lm_mode(reg)) { + case NN_LM_MOD_NONE: + if (val & ~UR_REG_LM_IDX_MAX) { + pr_err("LM offset too large\n"); + return 0; + } + return UR_REG_LM | FIELD_PREP(UR_REG_LM_IDX, lm_id) | + val; + case NN_LM_MOD_DEC: + lm_dec = true; + /* fall through */ + case NN_LM_MOD_INC: + if (val) { + pr_err("LM offset in inc/dev mode\n"); + return 0; + } + return UR_REG_LM | UR_REG_LM_POST_MOD | + FIELD_PREP(UR_REG_LM_IDX, lm_id) | + FIELD_PREP(UR_REG_LM_POST_MOD_DEC, lm_dec); + default: + pr_err("bad LM mode for unrestricted operands %d\n", + swreg_lm_mode(reg)); + return 0; + } + case NN_REG_IMM: + if (val & ~0xff) { + pr_err("immediate too large\n"); + return 0; + } + return UR_REG_IMM_encode(val); + case NN_REG_NONE: + return is_dst ? UR_REG_NO_DST : REG_NONE; + } + + pr_err("unrecognized reg encoding %08x\n", reg); + return 0; +} + +int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg, + struct nfp_insn_ur_regs *reg) +{ + memset(reg, 0, sizeof(*reg)); + + /* Decode destination */ + if (swreg_type(dst) == NN_REG_IMM) + return -EFAULT; + + if (swreg_type(dst) == NN_REG_GPR_B) + reg->dst_ab = ALU_DST_B; + if (swreg_type(dst) == NN_REG_GPR_BOTH) + reg->wr_both = true; + reg->dst = nfp_swreg_to_unreg(dst, true); + + /* Decode source operands */ + if (swreg_type(lreg) == swreg_type(rreg)) + return -EFAULT; + + if (swreg_type(lreg) == NN_REG_GPR_B || + swreg_type(rreg) == NN_REG_GPR_A) { + reg->areg = nfp_swreg_to_unreg(rreg, false); + reg->breg = nfp_swreg_to_unreg(lreg, false); + reg->swap = true; + } else { + reg->areg = nfp_swreg_to_unreg(lreg, false); + reg->breg = nfp_swreg_to_unreg(rreg, false); + } + + reg->dst_lmextn = swreg_lmextn(dst); + reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg); + + return 0; +} + +static u16 nfp_swreg_to_rereg(swreg reg, bool is_dst, bool has_imm8, bool *i8) +{ + u16 val = swreg_value(reg); + bool lm_id; + + switch (swreg_type(reg)) { + case NN_REG_GPR_A: + case NN_REG_GPR_B: + case NN_REG_GPR_BOTH: + return val; + case NN_REG_XFER: + return RE_REG_XFR | val; + case NN_REG_LMEM: + lm_id = swreg_lm_idx(reg); + + if (swreg_lm_mode(reg) != NN_LM_MOD_NONE) { + pr_err("bad LM mode for restricted operands %d\n", + swreg_lm_mode(reg)); + return 0; + } + + if (val & ~RE_REG_LM_IDX_MAX) { + pr_err("LM offset too large\n"); + return 0; + } + + return RE_REG_LM | FIELD_PREP(RE_REG_LM_IDX, lm_id) | val; + case NN_REG_IMM: + if (val & ~(0x7f | has_imm8 << 7)) { + pr_err("immediate too large\n"); + return 0; + } + *i8 = val & 0x80; + return RE_REG_IMM_encode(val & 0x7f); + case NN_REG_NONE: + return is_dst ? RE_REG_NO_DST : REG_NONE; + case NN_REG_NNR: + pr_err("NNRs used with restricted encoding\n"); + return 0; + } + + pr_err("unrecognized reg encoding\n"); + return 0; +} + +int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg, + struct nfp_insn_re_regs *reg, bool has_imm8) +{ + memset(reg, 0, sizeof(*reg)); + + /* Decode destination */ + if (swreg_type(dst) == NN_REG_IMM) + return -EFAULT; + + if (swreg_type(dst) == NN_REG_GPR_B) + reg->dst_ab = ALU_DST_B; + if (swreg_type(dst) == NN_REG_GPR_BOTH) + reg->wr_both = true; + reg->dst = nfp_swreg_to_rereg(dst, true, false, NULL); + + /* Decode source operands */ + if (swreg_type(lreg) == swreg_type(rreg)) + return -EFAULT; + + if (swreg_type(lreg) == NN_REG_GPR_B || + swreg_type(rreg) == NN_REG_GPR_A) { + reg->areg = nfp_swreg_to_rereg(rreg, false, has_imm8, ®->i8); + reg->breg = nfp_swreg_to_rereg(lreg, false, has_imm8, ®->i8); + reg->swap = true; + } else { + reg->areg = nfp_swreg_to_rereg(lreg, false, has_imm8, ®->i8); + reg->breg = nfp_swreg_to_rereg(rreg, false, has_imm8, ®->i8); + } + + reg->dst_lmextn = swreg_lmextn(dst); + reg->src_lmextn = swreg_lmextn(lreg) | swreg_lmextn(rreg); + + return 0; +} + +#define NFP_USTORE_ECC_POLY_WORDS 7 +#define NFP_USTORE_OP_BITS 45 + +static const u64 nfp_ustore_ecc_polynomials[NFP_USTORE_ECC_POLY_WORDS] = { + 0x0ff800007fffULL, + 0x11f801ff801fULL, + 0x1e387e0781e1ULL, + 0x17cb8e388e22ULL, + 0x1af5b2c93244ULL, + 0x1f56d5525488ULL, + 0x0daf69a46910ULL, +}; + +static bool parity(u64 value) +{ + return hweight64(value) & 1; +} + +int nfp_ustore_check_valid_no_ecc(u64 insn) +{ + if (insn & ~GENMASK_ULL(NFP_USTORE_OP_BITS, 0)) + return -EINVAL; + + return 0; +} + +u64 nfp_ustore_calc_ecc_insn(u64 insn) +{ + u8 ecc = 0; + int i; + + for (i = 0; i < NFP_USTORE_ECC_POLY_WORDS; i++) + ecc |= parity(nfp_ustore_ecc_polynomials[i] & insn) << i; + + return insn | (u64)ecc << NFP_USTORE_OP_BITS; +} diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h index d2b535739d2b..86e7daee6099 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h @@ -34,6 +34,8 @@ #ifndef __NFP_ASM_H__ #define __NFP_ASM_H__ 1 +#include <linux/bitfield.h> +#include <linux/bug.h> #include <linux/types.h> #define REG_NONE 0 @@ -43,23 +45,31 @@ #define RE_REG_IMM_encode(x) \ (RE_REG_IMM | ((x) & 0x1f) | (((x) & 0x60) << 1)) #define RE_REG_IMM_MAX 0x07fULL +#define RE_REG_LM 0x050 +#define RE_REG_LM_IDX 0x008 +#define RE_REG_LM_IDX_MAX 0x7 #define RE_REG_XFR 0x080 #define UR_REG_XFR 0x180 +#define UR_REG_LM 0x200 +#define UR_REG_LM_IDX 0x020 +#define UR_REG_LM_POST_MOD 0x010 +#define UR_REG_LM_POST_MOD_DEC 0x001 +#define UR_REG_LM_IDX_MAX 0xf #define UR_REG_NN 0x280 #define UR_REG_NO_DST 0x300 #define UR_REG_IMM UR_REG_NO_DST #define UR_REG_IMM_encode(x) (UR_REG_IMM | (x)) #define UR_REG_IMM_MAX 0x0ffULL -#define OP_BR_BASE 0x0d800000020ULL -#define OP_BR_BASE_MASK 0x0f8000c3ce0ULL -#define OP_BR_MASK 0x0000000001fULL -#define OP_BR_EV_PIP 0x00000000300ULL -#define OP_BR_CSS 0x0000003c000ULL -#define OP_BR_DEFBR 0x00000300000ULL -#define OP_BR_ADDR_LO 0x007ffc00000ULL -#define OP_BR_ADDR_HI 0x10000000000ULL +#define OP_BR_BASE 0x0d800000020ULL +#define OP_BR_BASE_MASK 0x0f8000c3ce0ULL +#define OP_BR_MASK 0x0000000001fULL +#define OP_BR_EV_PIP 0x00000000300ULL +#define OP_BR_CSS 0x0000003c000ULL +#define OP_BR_DEFBR 0x00000300000ULL +#define OP_BR_ADDR_LO 0x007ffc00000ULL +#define OP_BR_ADDR_HI 0x10000000000ULL #define nfp_is_br(_insn) \ (((_insn) & OP_BR_BASE_MASK) == OP_BR_BASE) @@ -82,30 +92,33 @@ enum br_ctx_signal_state { BR_CSS_NONE = 2, }; -#define OP_BBYTE_BASE 0x0c800000000ULL -#define OP_BB_A_SRC 0x000000000ffULL -#define OP_BB_BYTE 0x00000000300ULL -#define OP_BB_B_SRC 0x0000003fc00ULL -#define OP_BB_I8 0x00000040000ULL -#define OP_BB_EQ 0x00000080000ULL -#define OP_BB_DEFBR 0x00000300000ULL -#define OP_BB_ADDR_LO 0x007ffc00000ULL -#define OP_BB_ADDR_HI 0x10000000000ULL - -#define OP_BALU_BASE 0x0e800000000ULL -#define OP_BA_A_SRC 0x000000003ffULL -#define OP_BA_B_SRC 0x000000ffc00ULL -#define OP_BA_DEFBR 0x00000300000ULL -#define OP_BA_ADDR_HI 0x0007fc00000ULL - -#define OP_IMMED_A_SRC 0x000000003ffULL -#define OP_IMMED_B_SRC 0x000000ffc00ULL -#define OP_IMMED_IMM 0x0000ff00000ULL -#define OP_IMMED_WIDTH 0x00060000000ULL -#define OP_IMMED_INV 0x00080000000ULL -#define OP_IMMED_SHIFT 0x00600000000ULL -#define OP_IMMED_BASE 0x0f000000000ULL -#define OP_IMMED_WR_AB 0x20000000000ULL +#define OP_BBYTE_BASE 0x0c800000000ULL +#define OP_BB_A_SRC 0x000000000ffULL +#define OP_BB_BYTE 0x00000000300ULL +#define OP_BB_B_SRC 0x0000003fc00ULL +#define OP_BB_I8 0x00000040000ULL +#define OP_BB_EQ 0x00000080000ULL +#define OP_BB_DEFBR 0x00000300000ULL +#define OP_BB_ADDR_LO 0x007ffc00000ULL +#define OP_BB_ADDR_HI 0x10000000000ULL +#define OP_BB_SRC_LMEXTN 0x40000000000ULL + +#define OP_BALU_BASE 0x0e800000000ULL +#define OP_BA_A_SRC 0x000000003ffULL +#define OP_BA_B_SRC 0x000000ffc00ULL +#define OP_BA_DEFBR 0x00000300000ULL +#define OP_BA_ADDR_HI 0x0007fc00000ULL + +#define OP_IMMED_A_SRC 0x000000003ffULL +#define OP_IMMED_B_SRC 0x000000ffc00ULL +#define OP_IMMED_IMM 0x0000ff00000ULL +#define OP_IMMED_WIDTH 0x00060000000ULL +#define OP_IMMED_INV 0x00080000000ULL +#define OP_IMMED_SHIFT 0x00600000000ULL +#define OP_IMMED_BASE 0x0f000000000ULL +#define OP_IMMED_WR_AB 0x20000000000ULL +#define OP_IMMED_SRC_LMEXTN 0x40000000000ULL +#define OP_IMMED_DST_LMEXTN 0x80000000000ULL enum immed_width { IMMED_WIDTH_ALL = 0, @@ -119,17 +132,19 @@ enum immed_shift { IMMED_SHIFT_2B = 2, }; -#define OP_SHF_BASE 0x08000000000ULL -#define OP_SHF_A_SRC 0x000000000ffULL -#define OP_SHF_SC 0x00000000300ULL -#define OP_SHF_B_SRC 0x0000003fc00ULL -#define OP_SHF_I8 0x00000040000ULL -#define OP_SHF_SW 0x00000080000ULL -#define OP_SHF_DST 0x0000ff00000ULL -#define OP_SHF_SHIFT 0x001f0000000ULL -#define OP_SHF_OP 0x00e00000000ULL -#define OP_SHF_DST_AB 0x01000000000ULL -#define OP_SHF_WR_AB 0x20000000000ULL +#define OP_SHF_BASE 0x08000000000ULL +#define OP_SHF_A_SRC 0x000000000ffULL +#define OP_SHF_SC 0x00000000300ULL +#define OP_SHF_B_SRC 0x0000003fc00ULL +#define OP_SHF_I8 0x00000040000ULL +#define OP_SHF_SW 0x00000080000ULL +#define OP_SHF_DST 0x0000ff00000ULL +#define OP_SHF_SHIFT 0x001f0000000ULL +#define OP_SHF_OP 0x00e00000000ULL +#define OP_SHF_DST_AB 0x01000000000ULL +#define OP_SHF_WR_AB 0x20000000000ULL +#define OP_SHF_SRC_LMEXTN 0x40000000000ULL +#define OP_SHF_DST_LMEXTN 0x80000000000ULL enum shf_op { SHF_OP_NONE = 0, @@ -139,19 +154,22 @@ enum shf_op { enum shf_sc { SHF_SC_R_ROT = 0, + SHF_SC_NONE = SHF_SC_R_ROT, SHF_SC_R_SHF = 1, SHF_SC_L_SHF = 2, SHF_SC_R_DSHF = 3, }; -#define OP_ALU_A_SRC 0x000000003ffULL -#define OP_ALU_B_SRC 0x000000ffc00ULL -#define OP_ALU_DST 0x0003ff00000ULL -#define OP_ALU_SW 0x00040000000ULL -#define OP_ALU_OP 0x00f80000000ULL -#define OP_ALU_DST_AB 0x01000000000ULL -#define OP_ALU_BASE 0x0a000000000ULL -#define OP_ALU_WR_AB 0x20000000000ULL +#define OP_ALU_A_SRC 0x000000003ffULL +#define OP_ALU_B_SRC 0x000000ffc00ULL +#define OP_ALU_DST 0x0003ff00000ULL +#define OP_ALU_SW 0x00040000000ULL +#define OP_ALU_OP 0x00f80000000ULL +#define OP_ALU_DST_AB 0x01000000000ULL +#define OP_ALU_BASE 0x0a000000000ULL +#define OP_ALU_WR_AB 0x20000000000ULL +#define OP_ALU_SRC_LMEXTN 0x40000000000ULL +#define OP_ALU_DST_LMEXTN 0x80000000000ULL enum alu_op { ALU_OP_NONE = 0x00, @@ -170,26 +188,28 @@ enum alu_dst_ab { ALU_DST_B = 1, }; -#define OP_LDF_BASE 0x0c000000000ULL -#define OP_LDF_A_SRC 0x000000000ffULL -#define OP_LDF_SC 0x00000000300ULL -#define OP_LDF_B_SRC 0x0000003fc00ULL -#define OP_LDF_I8 0x00000040000ULL -#define OP_LDF_SW 0x00000080000ULL -#define OP_LDF_ZF 0x00000100000ULL -#define OP_LDF_BMASK 0x0000f000000ULL -#define OP_LDF_SHF 0x001f0000000ULL -#define OP_LDF_WR_AB 0x20000000000ULL - -#define OP_CMD_A_SRC 0x000000000ffULL -#define OP_CMD_CTX 0x00000000300ULL -#define OP_CMD_B_SRC 0x0000003fc00ULL -#define OP_CMD_TOKEN 0x000000c0000ULL -#define OP_CMD_XFER 0x00001f00000ULL -#define OP_CMD_CNT 0x0000e000000ULL -#define OP_CMD_SIG 0x000f0000000ULL -#define OP_CMD_TGT_CMD 0x07f00000000ULL -#define OP_CMD_MODE 0x1c0000000000ULL +#define OP_LDF_BASE 0x0c000000000ULL +#define OP_LDF_A_SRC 0x000000000ffULL +#define OP_LDF_SC 0x00000000300ULL +#define OP_LDF_B_SRC 0x0000003fc00ULL +#define OP_LDF_I8 0x00000040000ULL +#define OP_LDF_SW 0x00000080000ULL +#define OP_LDF_ZF 0x00000100000ULL +#define OP_LDF_BMASK 0x0000f000000ULL +#define OP_LDF_SHF 0x001f0000000ULL +#define OP_LDF_WR_AB 0x20000000000ULL +#define OP_LDF_SRC_LMEXTN 0x40000000000ULL +#define OP_LDF_DST_LMEXTN 0x80000000000ULL + +#define OP_CMD_A_SRC 0x000000000ffULL +#define OP_CMD_CTX 0x00000000300ULL +#define OP_CMD_B_SRC 0x0000003fc00ULL +#define OP_CMD_TOKEN 0x000000c0000ULL +#define OP_CMD_XFER 0x00001f00000ULL +#define OP_CMD_CNT 0x0000e000000ULL +#define OP_CMD_SIG 0x000f0000000ULL +#define OP_CMD_TGT_CMD 0x07f00000000ULL +#define OP_CMD_MODE 0x1c0000000000ULL struct cmd_tgt_act { u8 token; @@ -198,12 +218,17 @@ struct cmd_tgt_act { enum cmd_tgt_map { CMD_TGT_READ8, - CMD_TGT_WRITE8, + CMD_TGT_WRITE8_SWAP, + CMD_TGT_READ32, + CMD_TGT_READ32_LE, + CMD_TGT_READ32_SWAP, CMD_TGT_READ_LE, CMD_TGT_READ_SWAP_LE, __CMD_TGT_MAP_SIZE, }; +extern const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE]; + enum cmd_mode { CMD_MODE_40b_AB = 0, CMD_MODE_40b_BA = 1, @@ -215,11 +240,13 @@ enum cmd_ctx_swap { CMD_CTX_NO_SWAP = 3, }; -#define OP_LCSR_BASE 0x0fc00000000ULL -#define OP_LCSR_A_SRC 0x000000003ffULL -#define OP_LCSR_B_SRC 0x000000ffc00ULL -#define OP_LCSR_WRITE 0x00000200000ULL -#define OP_LCSR_ADDR 0x001ffc00000ULL +#define OP_LCSR_BASE 0x0fc00000000ULL +#define OP_LCSR_A_SRC 0x000000003ffULL +#define OP_LCSR_B_SRC 0x000000ffc00ULL +#define OP_LCSR_WRITE 0x00000200000ULL +#define OP_LCSR_ADDR 0x001ffc00000ULL +#define OP_LCSR_SRC_LMEXTN 0x40000000000ULL +#define OP_LCSR_DST_LMEXTN 0x80000000000ULL enum lcsr_wr_src { LCSR_WR_AREG, @@ -227,7 +254,122 @@ enum lcsr_wr_src { LCSR_WR_IMM, }; -#define OP_CARB_BASE 0x0e000000000ULL -#define OP_CARB_OR 0x00000010000ULL +#define OP_CARB_BASE 0x0e000000000ULL +#define OP_CARB_OR 0x00000010000ULL + +/* Software register representation, independent of operand type */ +#define NN_REG_TYPE GENMASK(31, 24) +#define NN_REG_LM_IDX GENMASK(23, 22) +#define NN_REG_LM_IDX_HI BIT(23) +#define NN_REG_LM_IDX_LO BIT(22) +#define NN_REG_LM_MOD GENMASK(21, 20) +#define NN_REG_VAL GENMASK(7, 0) + +enum nfp_bpf_reg_type { + NN_REG_GPR_A = BIT(0), + NN_REG_GPR_B = BIT(1), + NN_REG_GPR_BOTH = NN_REG_GPR_A | NN_REG_GPR_B, + NN_REG_NNR = BIT(2), + NN_REG_XFER = BIT(3), + NN_REG_IMM = BIT(4), + NN_REG_NONE = BIT(5), + NN_REG_LMEM = BIT(6), +}; + +enum nfp_bpf_lm_mode { + NN_LM_MOD_NONE = 0, + NN_LM_MOD_INC, + NN_LM_MOD_DEC, +}; + +#define reg_both(x) __enc_swreg((x), NN_REG_GPR_BOTH) +#define reg_a(x) __enc_swreg((x), NN_REG_GPR_A) +#define reg_b(x) __enc_swreg((x), NN_REG_GPR_B) +#define reg_nnr(x) __enc_swreg((x), NN_REG_NNR) +#define reg_xfer(x) __enc_swreg((x), NN_REG_XFER) +#define reg_imm(x) __enc_swreg((x), NN_REG_IMM) +#define reg_none() __enc_swreg(0, NN_REG_NONE) +#define reg_lm(x, off) __enc_swreg_lm((x), NN_LM_MOD_NONE, (off)) +#define reg_lm_inc(x) __enc_swreg_lm((x), NN_LM_MOD_INC, 0) +#define reg_lm_dec(x) __enc_swreg_lm((x), NN_LM_MOD_DEC, 0) +#define __reg_lm(x, mod, off) __enc_swreg_lm((x), (mod), (off)) + +typedef __u32 __bitwise swreg; + +static inline swreg __enc_swreg(u16 id, u8 type) +{ + return (__force swreg)(id | FIELD_PREP(NN_REG_TYPE, type)); +} + +static inline swreg __enc_swreg_lm(u8 id, enum nfp_bpf_lm_mode mode, u8 off) +{ + WARN_ON(id > 3 || (off && mode != NN_LM_MOD_NONE)); + + return (__force swreg)(FIELD_PREP(NN_REG_TYPE, NN_REG_LMEM) | + FIELD_PREP(NN_REG_LM_IDX, id) | + FIELD_PREP(NN_REG_LM_MOD, mode) | + off); +} + +static inline u32 swreg_raw(swreg reg) +{ + return (__force u32)reg; +} + +static inline enum nfp_bpf_reg_type swreg_type(swreg reg) +{ + return FIELD_GET(NN_REG_TYPE, swreg_raw(reg)); +} + +static inline u16 swreg_value(swreg reg) +{ + return FIELD_GET(NN_REG_VAL, swreg_raw(reg)); +} + +static inline bool swreg_lm_idx(swreg reg) +{ + return FIELD_GET(NN_REG_LM_IDX_LO, swreg_raw(reg)); +} + +static inline bool swreg_lmextn(swreg reg) +{ + return FIELD_GET(NN_REG_LM_IDX_HI, swreg_raw(reg)); +} + +static inline enum nfp_bpf_lm_mode swreg_lm_mode(swreg reg) +{ + return FIELD_GET(NN_REG_LM_MOD, swreg_raw(reg)); +} + +struct nfp_insn_ur_regs { + enum alu_dst_ab dst_ab; + u16 dst; + u16 areg, breg; + bool swap; + bool wr_both; + bool dst_lmextn; + bool src_lmextn; +}; + +struct nfp_insn_re_regs { + enum alu_dst_ab dst_ab; + u8 dst; + u8 areg, breg; + bool swap; + bool wr_both; + bool i8; + bool dst_lmextn; + bool src_lmextn; +}; + +int swreg_to_unrestricted(swreg dst, swreg lreg, swreg rreg, + struct nfp_insn_ur_regs *reg); +int swreg_to_restricted(swreg dst, swreg lreg, swreg rreg, + struct nfp_insn_re_regs *reg, bool has_imm8); + +#define NFP_USTORE_PREFETCH_WINDOW 8 + +int nfp_ustore_check_valid_no_ecc(u64 insn); +u64 nfp_ustore_calc_ecc_insn(u64 insn); #endif diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h index b0a452ba9039..782d452e0fc2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_ctrl.h @@ -255,7 +255,7 @@ * @NFP_NET_CFG_BPF_ADDR: DMA address of the buffer with JITed BPF code */ #define NFP_NET_CFG_BPF_ABI 0x0080 -#define NFP_NET_BPF_ABI 1 +#define NFP_NET_BPF_ABI 2 #define NFP_NET_CFG_BPF_CAP 0x0081 #define NFP_NET_BPF_CAP_RELO (1 << 0) /* seamless reload */ #define NFP_NET_CFG_BPF_MAX_LEN 0x0082 diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index 8f6ccc0c39e5..6e15d3c10ebf 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -2308,7 +2308,7 @@ static int qed_dcbnl_ieee_setapp(struct qed_dev *cdev, struct dcb_app *app) DP_VERBOSE(hwfn, QED_MSG_DCB, "selector = %d protocol = %d pri = %d\n", app->selector, app->protocol, app->priority); - if (app->priority < 0 || app->priority >= QED_MAX_PFC_PRIORITIES) { + if (app->priority >= QED_MAX_PFC_PRIORITIES) { DP_INFO(hwfn, "Invalid priority %d\n", app->priority); return -EINVAL; } diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c index 8fc9c811f6e3..b2b1f87864ef 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c @@ -1415,7 +1415,12 @@ int qed_iwarp_alloc(struct qed_hwfn *p_hwfn) void qed_iwarp_resc_free(struct qed_hwfn *p_hwfn) { + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + qed_rdma_bmap_free(p_hwfn, &p_hwfn->p_rdma_info->tcp_cid_map, 1); + kfree(iwarp_info->mpa_bufs); + kfree(iwarp_info->partial_fpdus); + kfree(iwarp_info->mpa_intermediate_buf); } int qed_iwarp_accept(void *rdma_cxt, struct qed_iwarp_accept_in *iparams) @@ -1713,6 +1718,569 @@ qed_iwarp_parse_rx_pkt(struct qed_hwfn *p_hwfn, return 0; } +static struct qed_iwarp_fpdu *qed_iwarp_get_curr_fpdu(struct qed_hwfn *p_hwfn, + u16 cid) +{ + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + struct qed_iwarp_fpdu *partial_fpdu; + u32 idx; + + idx = cid - qed_cxt_get_proto_cid_start(p_hwfn, PROTOCOLID_IWARP); + if (idx >= iwarp_info->max_num_partial_fpdus) { + DP_ERR(p_hwfn, "Invalid cid %x max_num_partial_fpdus=%x\n", cid, + iwarp_info->max_num_partial_fpdus); + return NULL; + } + + partial_fpdu = &iwarp_info->partial_fpdus[idx]; + + return partial_fpdu; +} + +enum qed_iwarp_mpa_pkt_type { + QED_IWARP_MPA_PKT_PACKED, + QED_IWARP_MPA_PKT_PARTIAL, + QED_IWARP_MPA_PKT_UNALIGNED +}; + +#define QED_IWARP_INVALID_FPDU_LENGTH 0xffff +#define QED_IWARP_MPA_FPDU_LENGTH_SIZE (2) +#define QED_IWARP_MPA_CRC32_DIGEST_SIZE (4) + +/* Pad to multiple of 4 */ +#define QED_IWARP_PDU_DATA_LEN_WITH_PAD(data_len) ALIGN(data_len, 4) +#define QED_IWARP_FPDU_LEN_WITH_PAD(_mpa_len) \ + (QED_IWARP_PDU_DATA_LEN_WITH_PAD((_mpa_len) + \ + QED_IWARP_MPA_FPDU_LENGTH_SIZE) + \ + QED_IWARP_MPA_CRC32_DIGEST_SIZE) + +/* fpdu can be fragmented over maximum 3 bds: header, partial mpa, unaligned */ +#define QED_IWARP_MAX_BDS_PER_FPDU 3 + +char *pkt_type_str[] = { + "QED_IWARP_MPA_PKT_PACKED", + "QED_IWARP_MPA_PKT_PARTIAL", + "QED_IWARP_MPA_PKT_UNALIGNED" +}; + +static int +qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct qed_iwarp_ll2_buff *buf); + +static enum qed_iwarp_mpa_pkt_type +qed_iwarp_mpa_classify(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + u16 tcp_payload_len, u8 *mpa_data) +{ + enum qed_iwarp_mpa_pkt_type pkt_type; + u16 mpa_len; + + if (fpdu->incomplete_bytes) { + pkt_type = QED_IWARP_MPA_PKT_UNALIGNED; + goto out; + } + + /* special case of one byte remaining... + * lower byte will be read next packet + */ + if (tcp_payload_len == 1) { + fpdu->fpdu_length = *mpa_data << BITS_PER_BYTE; + pkt_type = QED_IWARP_MPA_PKT_PARTIAL; + goto out; + } + + mpa_len = ntohs(*((u16 *)(mpa_data))); + fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len); + + if (fpdu->fpdu_length <= tcp_payload_len) + pkt_type = QED_IWARP_MPA_PKT_PACKED; + else + pkt_type = QED_IWARP_MPA_PKT_PARTIAL; + +out: + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA_ALIGN: %s: fpdu_length=0x%x tcp_payload_len:0x%x\n", + pkt_type_str[pkt_type], fpdu->fpdu_length, tcp_payload_len); + + return pkt_type; +} + +static void +qed_iwarp_init_fpdu(struct qed_iwarp_ll2_buff *buf, + struct qed_iwarp_fpdu *fpdu, + struct unaligned_opaque_data *pkt_data, + u16 tcp_payload_size, u8 placement_offset) +{ + fpdu->mpa_buf = buf; + fpdu->pkt_hdr = buf->data_phys_addr + placement_offset; + fpdu->pkt_hdr_size = pkt_data->tcp_payload_offset; + fpdu->mpa_frag = buf->data_phys_addr + pkt_data->first_mpa_offset; + fpdu->mpa_frag_virt = (u8 *)(buf->data) + pkt_data->first_mpa_offset; + + if (tcp_payload_size == 1) + fpdu->incomplete_bytes = QED_IWARP_INVALID_FPDU_LENGTH; + else if (tcp_payload_size < fpdu->fpdu_length) + fpdu->incomplete_bytes = fpdu->fpdu_length - tcp_payload_size; + else + fpdu->incomplete_bytes = 0; /* complete fpdu */ + + fpdu->mpa_frag_len = fpdu->fpdu_length - fpdu->incomplete_bytes; +} + +static int +qed_iwarp_cp_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct unaligned_opaque_data *pkt_data, + struct qed_iwarp_ll2_buff *buf, u16 tcp_payload_size) +{ + u8 *tmp_buf = p_hwfn->p_rdma_info->iwarp.mpa_intermediate_buf; + int rc; + + /* need to copy the data from the partial packet stored in fpdu + * to the new buf, for this we also need to move the data currently + * placed on the buf. The assumption is that the buffer is big enough + * since fpdu_length <= mss, we use an intermediate buffer since + * we may need to copy the new data to an overlapping location + */ + if ((fpdu->mpa_frag_len + tcp_payload_size) > (u16)buf->buff_size) { + DP_ERR(p_hwfn, + "MPA ALIGN: Unexpected: buffer is not large enough for split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n", + buf->buff_size, fpdu->mpa_frag_len, + tcp_payload_size, fpdu->incomplete_bytes); + return -EINVAL; + } + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "MPA ALIGN Copying fpdu: [%p, %d] [%p, %d]\n", + fpdu->mpa_frag_virt, fpdu->mpa_frag_len, + (u8 *)(buf->data) + pkt_data->first_mpa_offset, + tcp_payload_size); + + memcpy(tmp_buf, fpdu->mpa_frag_virt, fpdu->mpa_frag_len); + memcpy(tmp_buf + fpdu->mpa_frag_len, + (u8 *)(buf->data) + pkt_data->first_mpa_offset, + tcp_payload_size); + + rc = qed_iwarp_recycle_pkt(p_hwfn, fpdu, fpdu->mpa_buf); + if (rc) + return rc; + + /* If we managed to post the buffer copy the data to the new buffer + * o/w this will occur in the next round... + */ + memcpy((u8 *)(buf->data), tmp_buf, + fpdu->mpa_frag_len + tcp_payload_size); + + fpdu->mpa_buf = buf; + /* fpdu->pkt_hdr remains as is */ + /* fpdu->mpa_frag is overridden with new buf */ + fpdu->mpa_frag = buf->data_phys_addr; + fpdu->mpa_frag_virt = buf->data; + fpdu->mpa_frag_len += tcp_payload_size; + + fpdu->incomplete_bytes -= tcp_payload_size; + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA ALIGN: split fpdu buff_size = %d mpa_frag_len = %d, tcp_payload_size = %d, incomplete_bytes = %d\n", + buf->buff_size, fpdu->mpa_frag_len, tcp_payload_size, + fpdu->incomplete_bytes); + + return 0; +} + +static void +qed_iwarp_update_fpdu_length(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, u8 *mpa_data) +{ + u16 mpa_len; + + /* Update incomplete packets if needed */ + if (fpdu->incomplete_bytes == QED_IWARP_INVALID_FPDU_LENGTH) { + /* Missing lower byte is now available */ + mpa_len = fpdu->fpdu_length | *mpa_data; + fpdu->fpdu_length = QED_IWARP_FPDU_LEN_WITH_PAD(mpa_len); + fpdu->mpa_frag_len = fpdu->fpdu_length; + /* one byte of hdr */ + fpdu->incomplete_bytes = fpdu->fpdu_length - 1; + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: Partial header mpa_len=%x fpdu_length=%x incomplete_bytes=%x\n", + mpa_len, fpdu->fpdu_length, fpdu->incomplete_bytes); + } +} + +#define QED_IWARP_IS_RIGHT_EDGE(_curr_pkt) \ + (GET_FIELD((_curr_pkt)->flags, \ + UNALIGNED_OPAQUE_DATA_PKT_REACHED_WIN_RIGHT_EDGE)) + +/* This function is used to recycle a buffer using the ll2 drop option. It + * uses the mechanism to ensure that all buffers posted to tx before this one + * were completed. The buffer sent here will be sent as a cookie in the tx + * completion function and can then be reposted to rx chain when done. The flow + * that requires this is the flow where a FPDU splits over more than 3 tcp + * segments. In this case the driver needs to re-post a rx buffer instead of + * the one received, but driver can't simply repost a buffer it copied from + * as there is a case where the buffer was originally a packed FPDU, and is + * partially posted to FW. Driver needs to ensure FW is done with it. + */ +static int +qed_iwarp_recycle_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct qed_iwarp_ll2_buff *buf) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + u8 ll2_handle; + int rc; + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + tx_pkt.num_of_bds = 1; + tx_pkt.tx_dest = QED_LL2_TX_DEST_DROP; + tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; + tx_pkt.first_frag = fpdu->pkt_hdr; + tx_pkt.first_frag_len = fpdu->pkt_hdr_size; + buf->piggy_buf = NULL; + tx_pkt.cookie = buf; + + ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle; + + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true); + if (rc) + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't drop packet rc=%d\n", rc); + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: send drop tx packet [%lx, 0x%x], buf=%p, rc=%d\n", + (unsigned long int)tx_pkt.first_frag, + tx_pkt.first_frag_len, buf, rc); + + return rc; +} + +static int +qed_iwarp_win_right_edge(struct qed_hwfn *p_hwfn, struct qed_iwarp_fpdu *fpdu) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + u8 ll2_handle; + int rc; + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + tx_pkt.num_of_bds = 1; + tx_pkt.tx_dest = QED_LL2_TX_DEST_LB; + tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; + + tx_pkt.first_frag = fpdu->pkt_hdr; + tx_pkt.first_frag_len = fpdu->pkt_hdr_size; + tx_pkt.enable_ip_cksum = true; + tx_pkt.enable_l4_cksum = true; + tx_pkt.calc_ip_len = true; + /* vlan overload with enum iwarp_ll2_tx_queues */ + tx_pkt.vlan = IWARP_LL2_ALIGNED_RIGHT_TRIMMED_TX_QUEUE; + + ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle; + + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true); + if (rc) + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send right edge rc=%d\n", rc); + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: Sent right edge FPDU num_bds=%d [%lx, 0x%x], rc=%d\n", + tx_pkt.num_of_bds, + (unsigned long int)tx_pkt.first_frag, + tx_pkt.first_frag_len, rc); + + return rc; +} + +static int +qed_iwarp_send_fpdu(struct qed_hwfn *p_hwfn, + struct qed_iwarp_fpdu *fpdu, + struct unaligned_opaque_data *curr_pkt, + struct qed_iwarp_ll2_buff *buf, + u16 tcp_payload_size, enum qed_iwarp_mpa_pkt_type pkt_type) +{ + struct qed_ll2_tx_pkt_info tx_pkt; + u8 ll2_handle; + int rc; + + memset(&tx_pkt, 0, sizeof(tx_pkt)); + + /* An unaligned packet means it's split over two tcp segments. So the + * complete packet requires 3 bds, one for the header, one for the + * part of the fpdu of the first tcp segment, and the last fragment + * will point to the remainder of the fpdu. A packed pdu, requires only + * two bds, one for the header and one for the data. + */ + tx_pkt.num_of_bds = (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED) ? 3 : 2; + tx_pkt.tx_dest = QED_LL2_TX_DEST_LB; + tx_pkt.l4_hdr_offset_w = fpdu->pkt_hdr_size >> 2; /* offset in words */ + + /* Send the mpa_buf only with the last fpdu (in case of packed) */ + if (pkt_type == QED_IWARP_MPA_PKT_UNALIGNED || + tcp_payload_size <= fpdu->fpdu_length) + tx_pkt.cookie = fpdu->mpa_buf; + + tx_pkt.first_frag = fpdu->pkt_hdr; + tx_pkt.first_frag_len = fpdu->pkt_hdr_size; + tx_pkt.enable_ip_cksum = true; + tx_pkt.enable_l4_cksum = true; + tx_pkt.calc_ip_len = true; + /* vlan overload with enum iwarp_ll2_tx_queues */ + tx_pkt.vlan = IWARP_LL2_ALIGNED_TX_QUEUE; + + /* special case of unaligned packet and not packed, need to send + * both buffers as cookie to release. + */ + if (tcp_payload_size == fpdu->incomplete_bytes) + fpdu->mpa_buf->piggy_buf = buf; + + ll2_handle = p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle; + + /* Set first fragment to header */ + rc = qed_ll2_prepare_tx_packet(p_hwfn, ll2_handle, &tx_pkt, true); + if (rc) + goto out; + + /* Set second fragment to first part of packet */ + rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, ll2_handle, + fpdu->mpa_frag, + fpdu->mpa_frag_len); + if (rc) + goto out; + + if (!fpdu->incomplete_bytes) + goto out; + + /* Set third fragment to second part of the packet */ + rc = qed_ll2_set_fragment_of_tx_packet(p_hwfn, + ll2_handle, + buf->data_phys_addr + + curr_pkt->first_mpa_offset, + fpdu->incomplete_bytes); +out: + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "MPA_ALIGN: Sent FPDU num_bds=%d first_frag_len=%x, mpa_frag_len=0x%x, incomplete_bytes:0x%x rc=%d\n", + tx_pkt.num_of_bds, + tx_pkt.first_frag_len, + fpdu->mpa_frag_len, + fpdu->incomplete_bytes, rc); + + return rc; +} + +static void +qed_iwarp_mpa_get_data(struct qed_hwfn *p_hwfn, + struct unaligned_opaque_data *curr_pkt, + u32 opaque_data0, u32 opaque_data1) +{ + u64 opaque_data; + + opaque_data = HILO_64(opaque_data1, opaque_data0); + *curr_pkt = *((struct unaligned_opaque_data *)&opaque_data); + + curr_pkt->first_mpa_offset = curr_pkt->tcp_payload_offset + + le16_to_cpu(curr_pkt->first_mpa_offset); + curr_pkt->cid = le32_to_cpu(curr_pkt->cid); +} + +/* This function is called when an unaligned or incomplete MPA packet arrives + * driver needs to align the packet, perhaps using previous data and send + * it down to FW once it is aligned. + */ +static int +qed_iwarp_process_mpa_pkt(struct qed_hwfn *p_hwfn, + struct qed_iwarp_ll2_mpa_buf *mpa_buf) +{ + struct unaligned_opaque_data *curr_pkt = &mpa_buf->data; + struct qed_iwarp_ll2_buff *buf = mpa_buf->ll2_buf; + enum qed_iwarp_mpa_pkt_type pkt_type; + struct qed_iwarp_fpdu *fpdu; + int rc = -EINVAL; + u8 *mpa_data; + + fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, curr_pkt->cid & 0xffff); + if (!fpdu) { /* something corrupt with cid, post rx back */ + DP_ERR(p_hwfn, "Invalid cid, drop and post back to rx cid=%x\n", + curr_pkt->cid); + goto err; + } + + do { + mpa_data = ((u8 *)(buf->data) + curr_pkt->first_mpa_offset); + + pkt_type = qed_iwarp_mpa_classify(p_hwfn, fpdu, + mpa_buf->tcp_payload_len, + mpa_data); + + switch (pkt_type) { + case QED_IWARP_MPA_PKT_PARTIAL: + qed_iwarp_init_fpdu(buf, fpdu, + curr_pkt, + mpa_buf->tcp_payload_len, + mpa_buf->placement_offset); + + if (!QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) { + mpa_buf->tcp_payload_len = 0; + break; + } + + rc = qed_iwarp_win_right_edge(p_hwfn, fpdu); + + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send FPDU:reset rc=%d\n", rc); + memset(fpdu, 0, sizeof(*fpdu)); + break; + } + + mpa_buf->tcp_payload_len = 0; + break; + case QED_IWARP_MPA_PKT_PACKED: + qed_iwarp_init_fpdu(buf, fpdu, + curr_pkt, + mpa_buf->tcp_payload_len, + mpa_buf->placement_offset); + + rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf, + mpa_buf->tcp_payload_len, + pkt_type); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send FPDU:reset rc=%d\n", rc); + memset(fpdu, 0, sizeof(*fpdu)); + break; + } + + mpa_buf->tcp_payload_len -= fpdu->fpdu_length; + curr_pkt->first_mpa_offset += fpdu->fpdu_length; + break; + case QED_IWARP_MPA_PKT_UNALIGNED: + qed_iwarp_update_fpdu_length(p_hwfn, fpdu, mpa_data); + if (mpa_buf->tcp_payload_len < fpdu->incomplete_bytes) { + /* special handling of fpdu split over more + * than 2 segments + */ + if (QED_IWARP_IS_RIGHT_EDGE(curr_pkt)) { + rc = qed_iwarp_win_right_edge(p_hwfn, + fpdu); + /* packet will be re-processed later */ + if (rc) + return rc; + } + + rc = qed_iwarp_cp_pkt(p_hwfn, fpdu, curr_pkt, + buf, + mpa_buf->tcp_payload_len); + if (rc) /* packet will be re-processed later */ + return rc; + + mpa_buf->tcp_payload_len = 0; + break; + } + + rc = qed_iwarp_send_fpdu(p_hwfn, fpdu, curr_pkt, buf, + mpa_buf->tcp_payload_len, + pkt_type); + if (rc) { + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, + "Can't send FPDU:delay rc=%d\n", rc); + /* don't reset fpdu -> we need it for next + * classify + */ + break; + } + + mpa_buf->tcp_payload_len -= fpdu->incomplete_bytes; + curr_pkt->first_mpa_offset += fpdu->incomplete_bytes; + /* The framed PDU was sent - no more incomplete bytes */ + fpdu->incomplete_bytes = 0; + break; + } + } while (mpa_buf->tcp_payload_len && !rc); + + return rc; + +err: + qed_iwarp_ll2_post_rx(p_hwfn, + buf, + p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle); + return rc; +} + +static void qed_iwarp_process_pending_pkts(struct qed_hwfn *p_hwfn) +{ + struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; + struct qed_iwarp_ll2_mpa_buf *mpa_buf = NULL; + int rc; + + while (!list_empty(&iwarp_info->mpa_buf_pending_list)) { + mpa_buf = list_first_entry(&iwarp_info->mpa_buf_pending_list, + struct qed_iwarp_ll2_mpa_buf, + list_entry); + + rc = qed_iwarp_process_mpa_pkt(p_hwfn, mpa_buf); + + /* busy means break and continue processing later, don't + * remove the buf from the pending list. + */ + if (rc == -EBUSY) + break; + + list_del(&mpa_buf->list_entry); + list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_list); + + if (rc) { /* different error, don't continue */ + DP_NOTICE(p_hwfn, "process pkts failed rc=%d\n", rc); + break; + } + } +} + +static void +qed_iwarp_ll2_comp_mpa_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) +{ + struct qed_iwarp_ll2_mpa_buf *mpa_buf; + struct qed_iwarp_info *iwarp_info; + struct qed_hwfn *p_hwfn = cxt; + + iwarp_info = &p_hwfn->p_rdma_info->iwarp; + mpa_buf = list_first_entry(&iwarp_info->mpa_buf_list, + struct qed_iwarp_ll2_mpa_buf, list_entry); + if (!mpa_buf) { + DP_ERR(p_hwfn, "No free mpa buf\n"); + goto err; + } + + list_del(&mpa_buf->list_entry); + qed_iwarp_mpa_get_data(p_hwfn, &mpa_buf->data, + data->opaque_data_0, data->opaque_data_1); + + DP_VERBOSE(p_hwfn, + QED_MSG_RDMA, + "LL2 MPA CompRx payload_len:0x%x\tfirst_mpa_offset:0x%x\ttcp_payload_offset:0x%x\tflags:0x%x\tcid:0x%x\n", + data->length.packet_length, mpa_buf->data.first_mpa_offset, + mpa_buf->data.tcp_payload_offset, mpa_buf->data.flags, + mpa_buf->data.cid); + + mpa_buf->ll2_buf = data->cookie; + mpa_buf->tcp_payload_len = data->length.packet_length - + mpa_buf->data.first_mpa_offset; + mpa_buf->data.first_mpa_offset += data->u.placement_offset; + mpa_buf->placement_offset = data->u.placement_offset; + + list_add_tail(&mpa_buf->list_entry, &iwarp_info->mpa_buf_pending_list); + + qed_iwarp_process_pending_pkts(p_hwfn); + return; +err: + qed_iwarp_ll2_post_rx(p_hwfn, data->cookie, + iwarp_info->ll2_mpa_handle); +} + static void qed_iwarp_ll2_comp_syn_pkt(void *cxt, struct qed_ll2_comp_rx_data *data) { @@ -1855,10 +2423,25 @@ static void qed_iwarp_ll2_comp_tx_pkt(void *cxt, u8 connection_handle, bool b_last_fragment, bool b_last_packet) { struct qed_iwarp_ll2_buff *buffer = cookie; + struct qed_iwarp_ll2_buff *piggy; struct qed_hwfn *p_hwfn = cxt; + if (!buffer) /* can happen in packed mpa unaligned... */ + return; + /* this was originally an rx packet, post it back */ + piggy = buffer->piggy_buf; + if (piggy) { + buffer->piggy_buf = NULL; + qed_iwarp_ll2_post_rx(p_hwfn, piggy, connection_handle); + } + qed_iwarp_ll2_post_rx(p_hwfn, buffer, connection_handle); + + if (connection_handle == p_hwfn->p_rdma_info->iwarp.ll2_mpa_handle) + qed_iwarp_process_pending_pkts(p_hwfn); + + return; } static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle, @@ -1871,12 +2454,44 @@ static void qed_iwarp_ll2_rel_tx_pkt(void *cxt, u8 connection_handle, if (!buffer) return; + if (buffer->piggy_buf) { + dma_free_coherent(&p_hwfn->cdev->pdev->dev, + buffer->piggy_buf->buff_size, + buffer->piggy_buf->data, + buffer->piggy_buf->data_phys_addr); + + kfree(buffer->piggy_buf); + } + dma_free_coherent(&p_hwfn->cdev->pdev->dev, buffer->buff_size, buffer->data, buffer->data_phys_addr); kfree(buffer); } +/* The only slowpath for iwarp ll2 is unalign flush. When this completion + * is received, need to reset the FPDU. + */ +void +qed_iwarp_ll2_slowpath(void *cxt, + u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1) +{ + struct unaligned_opaque_data unalign_data; + struct qed_hwfn *p_hwfn = cxt; + struct qed_iwarp_fpdu *fpdu; + + qed_iwarp_mpa_get_data(p_hwfn, &unalign_data, + opaque_data_0, opaque_data_1); + + DP_VERBOSE(p_hwfn, QED_MSG_RDMA, "(0x%x) Flush fpdu\n", + unalign_data.cid); + + fpdu = qed_iwarp_get_curr_fpdu(p_hwfn, (u16)unalign_data.cid); + if (fpdu) + memset(fpdu, 0, sizeof(*fpdu)); +} + static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) { struct qed_iwarp_info *iwarp_info = &p_hwfn->p_rdma_info->iwarp; @@ -1902,6 +2517,16 @@ static int qed_iwarp_ll2_stop(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; } + if (iwarp_info->ll2_mpa_handle != QED_IWARP_HANDLE_INVAL) { + rc = qed_ll2_terminate_connection(p_hwfn, + iwarp_info->ll2_mpa_handle); + if (rc) + DP_INFO(p_hwfn, "Failed to terminate mpa connection\n"); + + qed_ll2_release_connection(p_hwfn, iwarp_info->ll2_mpa_handle); + iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL; + } + qed_llh_remove_mac_filter(p_hwfn, p_ptt, p_hwfn->p_rdma_info->iwarp.mac_addr); return rc; @@ -1953,12 +2578,15 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, struct qed_iwarp_info *iwarp_info; struct qed_ll2_acquire_data data; struct qed_ll2_cbs cbs; + u32 mpa_buff_size; u16 n_ooo_bufs; int rc = 0; + int i; iwarp_info = &p_hwfn->p_rdma_info->iwarp; iwarp_info->ll2_syn_handle = QED_IWARP_HANDLE_INVAL; iwarp_info->ll2_ooo_handle = QED_IWARP_HANDLE_INVAL; + iwarp_info->ll2_mpa_handle = QED_IWARP_HANDLE_INVAL; iwarp_info->max_mtu = params->max_mtu; @@ -2029,6 +2657,68 @@ qed_iwarp_ll2_start(struct qed_hwfn *p_hwfn, if (rc) goto err; + /* Start Unaligned MPA connection */ + cbs.rx_comp_cb = qed_iwarp_ll2_comp_mpa_pkt; + cbs.slowpath_cb = qed_iwarp_ll2_slowpath; + + memset(&data, 0, sizeof(data)); + data.input.conn_type = QED_LL2_TYPE_IWARP; + data.input.mtu = params->max_mtu; + /* FW requires that once a packet arrives OOO, it must have at + * least 2 rx buffers available on the unaligned connection + * for handling the case that it is a partial fpdu. + */ + data.input.rx_num_desc = n_ooo_bufs * 2; + data.input.tx_num_desc = data.input.rx_num_desc; + data.input.tx_max_bds_per_packet = QED_IWARP_MAX_BDS_PER_FPDU; + data.p_connection_handle = &iwarp_info->ll2_mpa_handle; + data.input.secondary_queue = true; + data.cbs = &cbs; + + rc = qed_ll2_acquire_connection(p_hwfn, &data); + if (rc) + goto err; + + rc = qed_ll2_establish_connection(p_hwfn, iwarp_info->ll2_mpa_handle); + if (rc) + goto err; + + mpa_buff_size = QED_IWARP_MAX_BUF_SIZE(params->max_mtu); + rc = qed_iwarp_ll2_alloc_buffers(p_hwfn, + data.input.rx_num_desc, + mpa_buff_size, + iwarp_info->ll2_mpa_handle); + if (rc) + goto err; + + iwarp_info->partial_fpdus = kcalloc((u16)p_hwfn->p_rdma_info->num_qps, + sizeof(*iwarp_info->partial_fpdus), + GFP_KERNEL); + if (!iwarp_info->partial_fpdus) + goto err; + + iwarp_info->max_num_partial_fpdus = (u16)p_hwfn->p_rdma_info->num_qps; + + iwarp_info->mpa_intermediate_buf = kzalloc(mpa_buff_size, GFP_KERNEL); + if (!iwarp_info->mpa_intermediate_buf) + goto err; + + /* The mpa_bufs array serves for pending RX packets received on the + * mpa ll2 that don't have place on the tx ring and require later + * processing. We can't fail on allocation of such a struct therefore + * we allocate enough to take care of all rx packets + */ + iwarp_info->mpa_bufs = kcalloc(data.input.rx_num_desc, + sizeof(*iwarp_info->mpa_bufs), + GFP_KERNEL); + if (!iwarp_info->mpa_bufs) + goto err; + + INIT_LIST_HEAD(&iwarp_info->mpa_buf_pending_list); + INIT_LIST_HEAD(&iwarp_info->mpa_buf_list); + for (i = 0; i < data.input.rx_num_desc; i++) + list_add_tail(&iwarp_info->mpa_bufs[i].list_entry, + &iwarp_info->mpa_buf_list); return rc; err: qed_iwarp_ll2_stop(p_hwfn, p_ptt); diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h index 9e2bfde894df..c1ecd743305f 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.h @@ -55,15 +55,43 @@ enum qed_iwarp_qp_state qed_roce2iwarp_state(enum qed_roce_qp_state state); #define QED_IWARP_HANDLE_INVAL (0xff) struct qed_iwarp_ll2_buff { + struct qed_iwarp_ll2_buff *piggy_buf; void *data; dma_addr_t data_phys_addr; u32 buff_size; }; +struct qed_iwarp_ll2_mpa_buf { + struct list_head list_entry; + struct qed_iwarp_ll2_buff *ll2_buf; + struct unaligned_opaque_data data; + u16 tcp_payload_len; + u8 placement_offset; +}; + +/* In some cases a fpdu will arrive with only one byte of the header, in this + * case the fpdu_length will be partial (contain only higher byte and + * incomplete bytes will contain the invalid value + */ +#define QED_IWARP_INVALID_INCOMPLETE_BYTES 0xffff + +struct qed_iwarp_fpdu { + struct qed_iwarp_ll2_buff *mpa_buf; + void *mpa_frag_virt; + dma_addr_t mpa_frag; + dma_addr_t pkt_hdr; + u16 mpa_frag_len; + u16 fpdu_length; + u16 incomplete_bytes; + u8 pkt_hdr_size; +}; + struct qed_iwarp_info { struct list_head listen_list; /* qed_iwarp_listener */ struct list_head ep_list; /* qed_iwarp_ep */ struct list_head ep_free_list; /* pre-allocated ep's */ + struct list_head mpa_buf_list; /* list of mpa_bufs */ + struct list_head mpa_buf_pending_list; spinlock_t iw_lock; /* for iwarp resources */ spinlock_t qp_lock; /* for teardown races */ u32 rcv_wnd_scale; @@ -73,9 +101,14 @@ struct qed_iwarp_info { u8 tcp_flags; u8 ll2_syn_handle; u8 ll2_ooo_handle; + u8 ll2_mpa_handle; u8 peer2peer; enum mpa_negotiation_mode mpa_rev; enum mpa_rtr_type rtr_type; + struct qed_iwarp_fpdu *partial_fpdus; + struct qed_iwarp_ll2_mpa_buf *mpa_bufs; + u8 *mpa_intermediate_buf; + u16 max_num_partial_fpdus; }; enum qed_iwarp_ep_state { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 250afa5486cf..047f556ca62e 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -423,6 +423,41 @@ static void qed_ll2_rxq_parse_reg(struct qed_hwfn *p_hwfn, } static int +qed_ll2_handle_slowpath(struct qed_hwfn *p_hwfn, + struct qed_ll2_info *p_ll2_conn, + union core_rx_cqe_union *p_cqe, + unsigned long *p_lock_flags) +{ + struct qed_ll2_rx_queue *p_rx = &p_ll2_conn->rx_queue; + struct core_rx_slow_path_cqe *sp_cqe; + + sp_cqe = &p_cqe->rx_cqe_sp; + if (sp_cqe->ramrod_cmd_id != CORE_RAMROD_RX_QUEUE_FLUSH) { + DP_NOTICE(p_hwfn, + "LL2 - unexpected Rx CQE slowpath ramrod_cmd_id:%d\n", + sp_cqe->ramrod_cmd_id); + return -EINVAL; + } + + if (!p_ll2_conn->cbs.slowpath_cb) { + DP_NOTICE(p_hwfn, + "LL2 - received RX_QUEUE_FLUSH but no callback was provided\n"); + return -EINVAL; + } + + spin_unlock_irqrestore(&p_rx->lock, *p_lock_flags); + + p_ll2_conn->cbs.slowpath_cb(p_ll2_conn->cbs.cookie, + p_ll2_conn->my_id, + le32_to_cpu(sp_cqe->opaque_data.data[0]), + le32_to_cpu(sp_cqe->opaque_data.data[1])); + + spin_lock_irqsave(&p_rx->lock, *p_lock_flags); + + return 0; +} + +static int qed_ll2_rxq_handle_completion(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_conn, union core_rx_cqe_union *p_cqe, @@ -495,8 +530,8 @@ static int qed_ll2_rxq_completion(struct qed_hwfn *p_hwfn, void *cookie) switch (cqe->rx_cqe_sp.type) { case CORE_RX_CQE_TYPE_SLOW_PATH: - DP_NOTICE(p_hwfn, "LL2 - unexpected Rx CQE slowpath\n"); - rc = -EINVAL; + rc = qed_ll2_handle_slowpath(p_hwfn, p_ll2_conn, + cqe, &flags); break; case CORE_RX_CQE_TYPE_GSI_OFFLOAD: case CORE_RX_CQE_TYPE_REGULAR: @@ -894,7 +929,7 @@ static int qed_sp_ll2_rx_queue_start(struct qed_hwfn *p_hwfn, p_ramrod->drop_ttl0_flg = p_ll2_conn->input.rx_drop_ttl0_flg; p_ramrod->inner_vlan_removal_en = p_ll2_conn->input.rx_vlan_removal_en; p_ramrod->queue_id = p_ll2_conn->queue_id; - p_ramrod->main_func_queue = (conn_type == QED_LL2_TYPE_OOO) ? 0 : 1; + p_ramrod->main_func_queue = p_ll2_conn->main_func_queue ? 1 : 0; if ((IS_MF_DEFAULT(p_hwfn) || IS_MF_SI(p_hwfn)) && p_ramrod->main_func_queue && (conn_type != QED_LL2_TYPE_ROCE) && @@ -1105,6 +1140,7 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn, struct qed_ll2_info *p_ll2_info) { struct qed_ll2_tx_packet *p_descq; + u32 desc_size; u32 capacity; int rc = 0; @@ -1122,13 +1158,17 @@ static int qed_ll2_acquire_connection_tx(struct qed_hwfn *p_hwfn, goto out; capacity = qed_chain_get_capacity(&p_ll2_info->tx_queue.txq_chain); - p_descq = kcalloc(capacity, sizeof(struct qed_ll2_tx_packet), - GFP_KERNEL); + /* First element is part of the packet, rest are flexibly added */ + desc_size = (sizeof(*p_descq) + + (p_ll2_info->input.tx_max_bds_per_packet - 1) * + sizeof(p_descq->bds_set)); + + p_descq = kcalloc(capacity, desc_size, GFP_KERNEL); if (!p_descq) { rc = -ENOMEM; goto out; } - p_ll2_info->tx_queue.descq_array = p_descq; + p_ll2_info->tx_queue.descq_mem = p_descq; DP_VERBOSE(p_hwfn, QED_MSG_LL2, "Allocated LL2 Txq [Type %08x] with 0x%08x buffers\n", @@ -1209,6 +1249,7 @@ qed_ll2_set_cbs(struct qed_ll2_info *p_ll2_info, const struct qed_ll2_cbs *cbs) p_ll2_info->cbs.rx_release_cb = cbs->rx_release_cb; p_ll2_info->cbs.tx_comp_cb = cbs->tx_comp_cb; p_ll2_info->cbs.tx_release_cb = cbs->tx_release_cb; + p_ll2_info->cbs.slowpath_cb = cbs->slowpath_cb; p_ll2_info->cbs.cookie = cbs->cookie; return 0; @@ -1260,6 +1301,11 @@ int qed_ll2_acquire_connection(void *cxt, struct qed_ll2_acquire_data *data) p_ll2_info->tx_dest = (data->input.tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW : CORE_TX_DEST_LB; + if (data->input.conn_type == QED_LL2_TYPE_OOO || + data->input.secondary_queue) + p_ll2_info->main_func_queue = false; + else + p_ll2_info->main_func_queue = true; /* Correct maximum number of Tx BDs */ p_tx_max = &p_ll2_info->input.tx_max_bds_per_packet; @@ -1359,11 +1405,13 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle) { struct qed_hwfn *p_hwfn = cxt; struct qed_ll2_info *p_ll2_conn; + struct qed_ll2_tx_packet *p_pkt; struct qed_ll2_rx_queue *p_rx; struct qed_ll2_tx_queue *p_tx; struct qed_ptt *p_ptt; int rc = -EINVAL; u32 i, capacity; + u32 desc_size; u8 qid; p_ptt = qed_ptt_acquire(p_hwfn); @@ -1397,9 +1445,15 @@ int qed_ll2_establish_connection(void *cxt, u8 connection_handle) INIT_LIST_HEAD(&p_tx->sending_descq); spin_lock_init(&p_tx->lock); capacity = qed_chain_get_capacity(&p_tx->txq_chain); - for (i = 0; i < capacity; i++) - list_add_tail(&p_tx->descq_array[i].list_entry, - &p_tx->free_descq); + /* First element is part of the packet, rest are flexibly added */ + desc_size = (sizeof(*p_pkt) + + (p_ll2_conn->input.tx_max_bds_per_packet - 1) * + sizeof(p_pkt->bds_set)); + + for (i = 0; i < capacity; i++) { + p_pkt = p_tx->descq_mem + desc_size * i; + list_add_tail(&p_pkt->list_entry, &p_tx->free_descq); + } p_tx->cur_completing_bd_idx = 0; p_tx->bds_idx = 0; p_tx->b_completing_packet = false; @@ -1579,11 +1633,28 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, roce_flavor = (pkt->qed_roce_flavor == QED_LL2_ROCE) ? CORE_ROCE : CORE_RROCE; - tx_dest = (pkt->tx_dest == QED_LL2_TX_DEST_NW) ? CORE_TX_DEST_NW - : CORE_TX_DEST_LB; + switch (pkt->tx_dest) { + case QED_LL2_TX_DEST_NW: + tx_dest = CORE_TX_DEST_NW; + break; + case QED_LL2_TX_DEST_LB: + tx_dest = CORE_TX_DEST_LB; + break; + case QED_LL2_TX_DEST_DROP: + tx_dest = CORE_TX_DEST_DROP; + break; + default: + tx_dest = CORE_TX_DEST_LB; + break; + } start_bd = (struct core_tx_bd *)qed_chain_produce(p_tx_chain); - start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan); + if (QED_IS_IWARP_PERSONALITY(p_hwfn) && + p_ll2->input.conn_type == QED_LL2_TYPE_OOO) + start_bd->nw_vlan_or_lb_echo = + cpu_to_le16(IWARP_LL2_IN_ORDER_TX_QUEUE); + else + start_bd->nw_vlan_or_lb_echo = cpu_to_le16(pkt->vlan); SET_FIELD(start_bd->bitfield1, CORE_TX_BD_L4_HDR_OFFSET_W, cpu_to_le16(pkt->l4_hdr_offset_w)); SET_FIELD(start_bd->bitfield1, CORE_TX_BD_TX_DST, tx_dest); @@ -1591,6 +1662,9 @@ qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, SET_FIELD(bd_data, CORE_TX_BD_DATA_START_BD, 0x1); SET_FIELD(bd_data, CORE_TX_BD_DATA_NBDS, pkt->num_of_bds); SET_FIELD(bd_data, CORE_TX_BD_DATA_ROCE_FLAV, roce_flavor); + SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_CSUM, !!(pkt->enable_ip_cksum)); + SET_FIELD(bd_data, CORE_TX_BD_DATA_L4_CSUM, !!(pkt->enable_l4_cksum)); + SET_FIELD(bd_data, CORE_TX_BD_DATA_IP_LEN, !!(pkt->calc_ip_len)); start_bd->bd_data.as_bitfield = cpu_to_le16(bd_data); DMA_REGPAIR_LE(start_bd->addr, pkt->first_frag); start_bd->nbytes = cpu_to_le16(pkt->first_frag_len); @@ -1698,7 +1772,7 @@ int qed_ll2_prepare_tx_packet(void *cxt, p_tx = &p_ll2_conn->tx_queue; p_tx_chain = &p_tx->txq_chain; - if (pkt->num_of_bds > CORE_LL2_TX_MAX_BDS_PER_PACKET) + if (pkt->num_of_bds > p_ll2_conn->input.tx_max_bds_per_packet) return -EIO; spin_lock_irqsave(&p_tx->lock, flags); @@ -1858,7 +1932,7 @@ void qed_ll2_release_connection(void *cxt, u8 connection_handle) qed_int_unregister_cb(p_hwfn, p_ll2_conn->tx_queue.tx_sb_index); } - kfree(p_ll2_conn->tx_queue.descq_array); + kfree(p_ll2_conn->tx_queue.descq_mem); qed_chain_free(p_hwfn->cdev, &p_ll2_conn->tx_queue.txq_chain); kfree(p_ll2_conn->rx_queue.descq_array); diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.h b/drivers/net/ethernet/qlogic/qed/qed_ll2.h index a822528e9c63..f65817012e97 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.h +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.h @@ -63,17 +63,14 @@ struct qed_ll2_rx_packet { struct qed_ll2_tx_packet { struct list_head list_entry; u16 bd_used; - u16 vlan; - u16 l4_hdr_offset_w; - u8 bd_flags; bool notify_fw; void *cookie; - + /* Flexible Array of bds_set determined by max_bds_per_packet */ struct { struct core_tx_bd *txq_bd; dma_addr_t tx_frag; u16 frag_len; - } bds_set[ETH_TX_MAX_BDS_PER_NON_LSO_PACKET]; + } bds_set[1]; }; struct qed_ll2_rx_queue { @@ -101,7 +98,7 @@ struct qed_ll2_tx_queue { struct list_head active_descq; struct list_head free_descq; struct list_head sending_descq; - struct qed_ll2_tx_packet *descq_array; + void *descq_mem; /* memory for variable sized qed_ll2_tx_packet*/ struct qed_ll2_tx_packet *cur_send_packet; struct qed_ll2_tx_packet cur_completing_packet; u16 cur_completing_bd_idx; @@ -124,6 +121,7 @@ struct qed_ll2_info { bool b_active; enum core_tx_dest tx_dest; u8 tx_stats_en; + bool main_func_queue; struct qed_ll2_rx_queue rx_queue; struct qed_ll2_tx_queue tx_queue; struct qed_ll2_cbs cbs; diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index 3ed9033e56db..9cbb27263742 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -309,22 +309,12 @@ void emac_mac_mode_config(struct emac_adapter *adpt) /* Config descriptor rings */ static void emac_mac_dma_rings_config(struct emac_adapter *adpt) { - static const unsigned short tpd_q_offset[] = { - EMAC_DESC_CTRL_8, EMAC_H1TPD_BASE_ADDR_LO, - EMAC_H2TPD_BASE_ADDR_LO, EMAC_H3TPD_BASE_ADDR_LO}; - static const unsigned short rfd_q_offset[] = { - EMAC_DESC_CTRL_2, EMAC_DESC_CTRL_10, - EMAC_DESC_CTRL_12, EMAC_DESC_CTRL_13}; - static const unsigned short rrd_q_offset[] = { - EMAC_DESC_CTRL_5, EMAC_DESC_CTRL_14, - EMAC_DESC_CTRL_15, EMAC_DESC_CTRL_16}; - /* TPD (Transmit Packet Descriptor) */ writel(upper_32_bits(adpt->tx_q.tpd.dma_addr), adpt->base + EMAC_DESC_CTRL_1); writel(lower_32_bits(adpt->tx_q.tpd.dma_addr), - adpt->base + tpd_q_offset[0]); + adpt->base + EMAC_DESC_CTRL_8); writel(adpt->tx_q.tpd.count & TPD_RING_SIZE_BMSK, adpt->base + EMAC_DESC_CTRL_9); @@ -334,9 +324,9 @@ static void emac_mac_dma_rings_config(struct emac_adapter *adpt) adpt->base + EMAC_DESC_CTRL_0); writel(lower_32_bits(adpt->rx_q.rfd.dma_addr), - adpt->base + rfd_q_offset[0]); + adpt->base + EMAC_DESC_CTRL_2); writel(lower_32_bits(adpt->rx_q.rrd.dma_addr), - adpt->base + rrd_q_offset[0]); + adpt->base + EMAC_DESC_CTRL_5); writel(adpt->rx_q.rfd.count & RFD_RING_SIZE_BMSK, adpt->base + EMAC_DESC_CTRL_3); diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c index 29ba37a08372..e8ab512ee7e3 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c @@ -68,10 +68,10 @@ static void emac_sgmii_link_init(struct emac_adapter *adpt) writel(val, phy->base + EMAC_SGMII_PHY_AUTONEG_CFG2); } -static int emac_sgmii_irq_clear(struct emac_adapter *adpt, u32 irq_bits) +static int emac_sgmii_irq_clear(struct emac_adapter *adpt, u8 irq_bits) { struct emac_sgmii *phy = &adpt->phy; - u32 status; + u8 status; writel_relaxed(irq_bits, phy->base + EMAC_SGMII_PHY_INTERRUPT_CLEAR); writel_relaxed(IRQ_GLOBAL_CLEAR, phy->base + EMAC_SGMII_PHY_IRQ_CMD); @@ -86,9 +86,8 @@ static int emac_sgmii_irq_clear(struct emac_adapter *adpt, u32 irq_bits) EMAC_SGMII_PHY_INTERRUPT_STATUS, status, !(status & irq_bits), 1, SGMII_PHY_IRQ_CLR_WAIT_TIME)) { - netdev_err(adpt->netdev, - "error: failed clear SGMII irq: status:0x%x bits:0x%x\n", - status, irq_bits); + net_err_ratelimited("%s: failed to clear SGMII irq: status:0x%x bits:0x%x\n", + adpt->netdev->name, status, irq_bits); return -EIO; } @@ -109,7 +108,7 @@ static irqreturn_t emac_sgmii_interrupt(int irq, void *data) { struct emac_adapter *adpt = data; struct emac_sgmii *phy = &adpt->phy; - u32 status; + u8 status; status = readl(phy->base + EMAC_SGMII_PHY_INTERRUPT_STATUS); status &= SGMII_ISR_MASK; @@ -139,10 +138,8 @@ static irqreturn_t emac_sgmii_interrupt(int irq, void *data) atomic_set(&phy->decode_error_count, 0); } - if (emac_sgmii_irq_clear(adpt, status)) { - netdev_warn(adpt->netdev, "failed to clear SGMII interrupt\n"); + if (emac_sgmii_irq_clear(adpt, status)) schedule_work(&adpt->work_thread); - } return IRQ_HANDLED; } diff --git a/drivers/net/ethernet/qualcomm/emac/emac.c b/drivers/net/ethernet/qualcomm/emac/emac.c index f477ba29c569..70c92b649b29 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac.c @@ -148,9 +148,8 @@ static irqreturn_t emac_isr(int _irq, void *data) goto exit; if (status & ISR_ERROR) { - netif_warn(adpt, intr, adpt->netdev, - "warning: error irq status 0x%lx\n", - status & ISR_ERROR); + net_err_ratelimited("%s: error interrupt 0x%lx\n", + adpt->netdev->name, status & ISR_ERROR); /* reset MAC */ schedule_work(&adpt->work_thread); } @@ -169,7 +168,8 @@ static irqreturn_t emac_isr(int _irq, void *data) emac_mac_tx_process(adpt, &adpt->tx_q); if (status & ISR_OVER) - net_warn_ratelimited("warning: TX/RX overflow\n"); + net_warn_ratelimited("%s: TX/RX overflow interrupt\n", + adpt->netdev->name); exit: /* enable the interrupt */ @@ -615,20 +615,11 @@ static int emac_probe(struct platform_device *pdev) u32 reg; int ret; - /* The EMAC itself is capable of 64-bit DMA, so try that first. */ - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + /* The TPD buffer address is limited to 45 bits. */ + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(45)); if (ret) { - /* Some platforms may restrict the EMAC's address bus to less - * then the size of DDR. In this case, we need to try a - * smaller mask. We could try every possible smaller mask, - * but that's overkill. Instead, just fall to 32-bit, which - * should always work. - */ - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); - if (ret) { - dev_err(&pdev->dev, "could not set DMA mask\n"); - return ret; - } + dev_err(&pdev->dev, "could not set DMA mask\n"); + return ret; } netdev = alloc_etherdev(sizeof(struct emac_adapter)); diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c index 8403eea08d0e..71bee1af71ef 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c @@ -61,23 +61,6 @@ rmnet_get_port_rtnl(const struct net_device *real_dev) return rtnl_dereference(real_dev->rx_handler_data); } -static struct rmnet_endpoint* -rmnet_get_endpoint(struct net_device *dev, int config_id) -{ - struct rmnet_endpoint *ep; - struct rmnet_port *port; - - if (!rmnet_is_real_dev_registered(dev)) { - ep = rmnet_vnd_get_endpoint(dev); - } else { - port = rmnet_get_port_rtnl(dev); - - ep = &port->muxed_ep[config_id]; - } - - return ep; -} - static int rmnet_unregister_real_device(struct net_device *real_dev, struct rmnet_port *port) { @@ -98,7 +81,7 @@ static int rmnet_unregister_real_device(struct net_device *real_dev, static int rmnet_register_real_device(struct net_device *real_dev) { struct rmnet_port *port; - int rc; + int rc, entry; ASSERT_RTNL(); @@ -119,27 +102,41 @@ static int rmnet_register_real_device(struct net_device *real_dev) /* hold on to real dev for MAP data */ dev_hold(real_dev); + for (entry = 0; entry < RMNET_MAX_LOGICAL_EP; entry++) + INIT_HLIST_HEAD(&port->muxed_ep[entry]); + netdev_dbg(real_dev, "registered with rmnet\n"); return 0; } -static void rmnet_set_endpoint_config(struct net_device *dev, - u8 mux_id, u8 rmnet_mode, - struct net_device *egress_dev) +static void rmnet_unregister_bridge(struct net_device *dev, + struct rmnet_port *port) { - struct rmnet_endpoint *ep; + struct net_device *rmnet_dev, *bridge_dev; + struct rmnet_port *bridge_port; + + if (port->rmnet_mode != RMNET_EPMODE_BRIDGE) + return; - netdev_dbg(dev, "id %d mode %d dev %s\n", - mux_id, rmnet_mode, egress_dev->name); + /* bridge slave handling */ + if (!port->nr_rmnet_devs) { + rmnet_dev = netdev_master_upper_dev_get_rcu(dev); + netdev_upper_dev_unlink(dev, rmnet_dev); - ep = rmnet_get_endpoint(dev, mux_id); - /* This config is cleared on every set, so its ok to not - * clear it on a device delete. - */ - memset(ep, 0, sizeof(struct rmnet_endpoint)); - ep->rmnet_mode = rmnet_mode; - ep->egress_dev = egress_dev; - ep->mux_id = mux_id; + bridge_dev = port->bridge_ep; + + bridge_port = rmnet_get_port_rtnl(bridge_dev); + bridge_port->bridge_ep = NULL; + bridge_port->rmnet_mode = RMNET_EPMODE_VND; + } else { + bridge_dev = port->bridge_ep; + + bridge_port = rmnet_get_port_rtnl(bridge_dev); + rmnet_dev = netdev_master_upper_dev_get_rcu(bridge_dev); + netdev_upper_dev_unlink(bridge_dev, rmnet_dev); + + rmnet_unregister_real_device(bridge_dev, bridge_port); + } } static int rmnet_newlink(struct net *src_net, struct net_device *dev, @@ -153,6 +150,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, RMNET_EGRESS_FORMAT_MAP; struct net_device *real_dev; int mode = RMNET_EPMODE_VND; + struct rmnet_endpoint *ep; struct rmnet_port *port; int err = 0; u16 mux_id; @@ -164,6 +162,10 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, if (!data[IFLA_VLAN_ID]) return -EINVAL; + ep = kzalloc(sizeof(*ep), GFP_ATOMIC); + if (!ep) + return -ENOMEM; + mux_id = nla_get_u16(data[IFLA_VLAN_ID]); err = rmnet_register_real_device(real_dev); @@ -171,7 +173,7 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, goto err0; port = rmnet_get_port_rtnl(real_dev); - err = rmnet_vnd_newlink(mux_id, dev, port, real_dev); + err = rmnet_vnd_newlink(mux_id, dev, port, real_dev, ep); if (err) goto err1; @@ -183,13 +185,13 @@ static int rmnet_newlink(struct net *src_net, struct net_device *dev, ingress_format, egress_format); port->egress_data_format = egress_format; port->ingress_data_format = ingress_format; + port->rmnet_mode = mode; - rmnet_set_endpoint_config(real_dev, mux_id, mode, dev); - rmnet_set_endpoint_config(dev, mux_id, mode, real_dev); + hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]); return 0; err2: - rmnet_vnd_dellink(mux_id, port); + rmnet_vnd_dellink(mux_id, port, ep); err1: rmnet_unregister_real_device(real_dev, port); err0: @@ -199,6 +201,7 @@ err0: static void rmnet_dellink(struct net_device *dev, struct list_head *head) { struct net_device *real_dev; + struct rmnet_endpoint *ep; struct rmnet_port *port; u8 mux_id; @@ -212,8 +215,15 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head) port = rmnet_get_port_rtnl(real_dev); mux_id = rmnet_vnd_get_mux(dev); - rmnet_vnd_dellink(mux_id, port); netdev_upper_dev_unlink(dev, real_dev); + + ep = rmnet_get_endpoint(port, mux_id); + if (ep) { + hlist_del_init_rcu(&ep->hlnode); + rmnet_unregister_bridge(dev, port); + rmnet_vnd_dellink(mux_id, port, ep); + kfree(ep); + } rmnet_unregister_real_device(real_dev, port); unregister_netdevice_queue(dev, head); @@ -222,11 +232,16 @@ static void rmnet_dellink(struct net_device *dev, struct list_head *head) static int rmnet_dev_walk_unreg(struct net_device *rmnet_dev, void *data) { struct rmnet_walk_data *d = data; + struct rmnet_endpoint *ep; u8 mux_id; mux_id = rmnet_vnd_get_mux(rmnet_dev); - - rmnet_vnd_dellink(mux_id, d->port); + ep = rmnet_get_endpoint(d->port, mux_id); + if (ep) { + hlist_del_init_rcu(&ep->hlnode); + rmnet_vnd_dellink(mux_id, d->port, ep); + kfree(ep); + } netdev_upper_dev_unlink(rmnet_dev, d->real_dev); unregister_netdevice_queue(rmnet_dev, d->head); @@ -252,6 +267,8 @@ static void rmnet_force_unassociate_device(struct net_device *dev) d.port = port; rcu_read_lock(); + rmnet_unregister_bridge(dev, port); + netdev_walk_all_lower_dev_rcu(real_dev, rmnet_dev_walk_unreg, &d); rcu_read_unlock(); unregister_netdevice_many(&list); @@ -324,6 +341,77 @@ struct rmnet_port *rmnet_get_port(struct net_device *real_dev) return NULL; } +struct rmnet_endpoint *rmnet_get_endpoint(struct rmnet_port *port, u8 mux_id) +{ + struct rmnet_endpoint *ep; + + hlist_for_each_entry_rcu(ep, &port->muxed_ep[mux_id], hlnode) { + if (ep->mux_id == mux_id) + return ep; + } + + return NULL; +} + +int rmnet_add_bridge(struct net_device *rmnet_dev, + struct net_device *slave_dev, + struct netlink_ext_ack *extack) +{ + struct rmnet_priv *priv = netdev_priv(rmnet_dev); + struct net_device *real_dev = priv->real_dev; + struct rmnet_port *port, *slave_port; + int err; + + port = rmnet_get_port(real_dev); + + /* If there is more than one rmnet dev attached, its probably being + * used for muxing. Skip the briding in that case + */ + if (port->nr_rmnet_devs > 1) + return -EINVAL; + + if (rmnet_is_real_dev_registered(slave_dev)) + return -EBUSY; + + err = rmnet_register_real_device(slave_dev); + if (err) + return -EBUSY; + + err = netdev_master_upper_dev_link(slave_dev, rmnet_dev, NULL, NULL, + extack); + if (err) + return -EINVAL; + + slave_port = rmnet_get_port(slave_dev); + slave_port->rmnet_mode = RMNET_EPMODE_BRIDGE; + slave_port->bridge_ep = real_dev; + + port->rmnet_mode = RMNET_EPMODE_BRIDGE; + port->bridge_ep = slave_dev; + + netdev_dbg(slave_dev, "registered with rmnet as slave\n"); + return 0; +} + +int rmnet_del_bridge(struct net_device *rmnet_dev, + struct net_device *slave_dev) +{ + struct rmnet_priv *priv = netdev_priv(rmnet_dev); + struct net_device *real_dev = priv->real_dev; + struct rmnet_port *port, *slave_port; + + port = rmnet_get_port(real_dev); + port->rmnet_mode = RMNET_EPMODE_VND; + port->bridge_ep = NULL; + + netdev_upper_dev_unlink(slave_dev, rmnet_dev); + slave_port = rmnet_get_port(slave_dev); + rmnet_unregister_real_device(slave_dev, slave_port); + + netdev_dbg(slave_dev, "removed from rmnet as slave\n"); + return 0; +} + /* Startup/Shutdown */ static int __init rmnet_init(void) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h index dde4e9f14f4a..60115e69e415 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h @@ -20,13 +20,10 @@ #define RMNET_MAX_LOGICAL_EP 255 -/* Information about the next device to deliver the packet to. - * Exact usage of this parameter depends on the rmnet_mode. - */ struct rmnet_endpoint { - u8 rmnet_mode; u8 mux_id; struct net_device *egress_dev; + struct hlist_node hlnode; }; /* One instance of this structure is instantiated for each real_dev associated @@ -34,22 +31,26 @@ struct rmnet_endpoint { */ struct rmnet_port { struct net_device *dev; - struct rmnet_endpoint local_ep; - struct rmnet_endpoint muxed_ep[RMNET_MAX_LOGICAL_EP]; u32 ingress_data_format; u32 egress_data_format; - struct net_device *rmnet_devices[RMNET_MAX_LOGICAL_EP]; u8 nr_rmnet_devs; + u8 rmnet_mode; + struct hlist_head muxed_ep[RMNET_MAX_LOGICAL_EP]; + struct net_device *bridge_ep; }; extern struct rtnl_link_ops rmnet_link_ops; struct rmnet_priv { - struct rmnet_endpoint local_ep; u8 mux_id; struct net_device *real_dev; }; struct rmnet_port *rmnet_get_port(struct net_device *real_dev); - +struct rmnet_endpoint *rmnet_get_endpoint(struct rmnet_port *port, u8 mux_id); +int rmnet_add_bridge(struct net_device *rmnet_dev, + struct net_device *slave_dev, + struct netlink_ext_ack *extack); +int rmnet_del_bridge(struct net_device *rmnet_dev, + struct net_device *slave_dev); #endif /* _RMNET_CONFIG_H_ */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c index 540c7622dcb1..df3d2d16ce55 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c @@ -44,56 +44,18 @@ static void rmnet_set_skb_proto(struct sk_buff *skb) /* Generic handler */ static rx_handler_result_t -rmnet_bridge_handler(struct sk_buff *skb, struct rmnet_endpoint *ep) +rmnet_deliver_skb(struct sk_buff *skb) { - if (!ep->egress_dev) - kfree_skb(skb); - else - rmnet_egress_handler(skb, ep); + skb_reset_transport_header(skb); + skb_reset_network_header(skb); + rmnet_vnd_rx_fixup(skb, skb->dev); + skb->pkt_type = PACKET_HOST; + skb_set_mac_header(skb, 0); + netif_receive_skb(skb); return RX_HANDLER_CONSUMED; } -static rx_handler_result_t -rmnet_deliver_skb(struct sk_buff *skb, struct rmnet_endpoint *ep) -{ - switch (ep->rmnet_mode) { - case RMNET_EPMODE_NONE: - return RX_HANDLER_PASS; - - case RMNET_EPMODE_BRIDGE: - return rmnet_bridge_handler(skb, ep); - - case RMNET_EPMODE_VND: - skb_reset_transport_header(skb); - skb_reset_network_header(skb); - rmnet_vnd_rx_fixup(skb, skb->dev); - - skb->pkt_type = PACKET_HOST; - skb_set_mac_header(skb, 0); - netif_receive_skb(skb); - return RX_HANDLER_CONSUMED; - - default: - kfree_skb(skb); - return RX_HANDLER_CONSUMED; - } -} - -static rx_handler_result_t -rmnet_ingress_deliver_packet(struct sk_buff *skb, - struct rmnet_port *port) -{ - if (!port) { - kfree_skb(skb); - return RX_HANDLER_CONSUMED; - } - - skb->dev = port->local_ep.egress_dev; - - return rmnet_deliver_skb(skb, &port->local_ep); -} - /* MAP handler */ static rx_handler_result_t @@ -109,19 +71,18 @@ __rmnet_map_ingress_handler(struct sk_buff *skb, & RMNET_INGRESS_FORMAT_MAP_COMMANDS) return rmnet_map_command(skb, port); - kfree_skb(skb); - return RX_HANDLER_CONSUMED; + goto free_skb; } mux_id = RMNET_MAP_GET_MUX_ID(skb); len = RMNET_MAP_GET_LENGTH(skb) - RMNET_MAP_GET_PAD(skb); - if (mux_id >= RMNET_MAX_LOGICAL_EP) { - kfree_skb(skb); - return RX_HANDLER_CONSUMED; - } + if (mux_id >= RMNET_MAX_LOGICAL_EP) + goto free_skb; - ep = &port->muxed_ep[mux_id]; + ep = rmnet_get_endpoint(port, mux_id); + if (!ep) + goto free_skb; if (port->ingress_data_format & RMNET_INGRESS_FORMAT_DEMUXING) skb->dev = ep->egress_dev; @@ -130,7 +91,11 @@ __rmnet_map_ingress_handler(struct sk_buff *skb, skb_pull(skb, sizeof(struct rmnet_map_header)); skb_trim(skb, len); rmnet_set_skb_proto(skb); - return rmnet_deliver_skb(skb, ep); + return rmnet_deliver_skb(skb); + +free_skb: + kfree_skb(skb); + return RX_HANDLER_CONSUMED; } static rx_handler_result_t @@ -154,8 +119,7 @@ rmnet_map_ingress_handler(struct sk_buff *skb, } static int rmnet_map_egress_handler(struct sk_buff *skb, - struct rmnet_port *port, - struct rmnet_endpoint *ep, + struct rmnet_port *port, u8 mux_id, struct net_device *orig_dev) { int required_headroom, additional_header_len; @@ -174,10 +138,10 @@ static int rmnet_map_egress_handler(struct sk_buff *skb, return RMNET_MAP_CONSUMED; if (port->egress_data_format & RMNET_EGRESS_FORMAT_MUXING) { - if (ep->mux_id == 0xff) + if (mux_id == 0xff) map_header->mux_id = 0; else - map_header->mux_id = ep->mux_id; + map_header->mux_id = mux_id; } skb->protocol = htons(ETH_P_MAP); @@ -185,6 +149,17 @@ static int rmnet_map_egress_handler(struct sk_buff *skb, return RMNET_MAP_SUCCESS; } +static rx_handler_result_t +rmnet_bridge_handler(struct sk_buff *skb, struct net_device *bridge_dev) +{ + if (bridge_dev) { + skb->dev = bridge_dev; + dev_queue_xmit(skb); + } + + return RX_HANDLER_CONSUMED; +} + /* Ingress / Egress Entry Points */ /* Processes packet as per ingress data format for receiving device. Logical @@ -193,10 +168,10 @@ static int rmnet_map_egress_handler(struct sk_buff *skb, */ rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb) { - struct rmnet_port *port; + int rc = RX_HANDLER_CONSUMED; struct sk_buff *skb = *pskb; + struct rmnet_port *port; struct net_device *dev; - int rc; if (!skb) return RX_HANDLER_CONSUMED; @@ -204,28 +179,14 @@ rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb) dev = skb->dev; port = rmnet_get_port(dev); - if (port->ingress_data_format & RMNET_INGRESS_FORMAT_MAP) { - rc = rmnet_map_ingress_handler(skb, port); - } else { - switch (ntohs(skb->protocol)) { - case ETH_P_MAP: - if (port->local_ep.rmnet_mode == - RMNET_EPMODE_BRIDGE) { - rc = rmnet_ingress_deliver_packet(skb, port); - } else { - kfree_skb(skb); - rc = RX_HANDLER_CONSUMED; - } - break; - - case ETH_P_IP: - case ETH_P_IPV6: - rc = rmnet_ingress_deliver_packet(skb, port); - break; - - default: - rc = RX_HANDLER_PASS; - } + switch (port->rmnet_mode) { + case RMNET_EPMODE_VND: + if (port->ingress_data_format & RMNET_INGRESS_FORMAT_MAP) + rc = rmnet_map_ingress_handler(skb, port); + break; + case RMNET_EPMODE_BRIDGE: + rc = rmnet_bridge_handler(skb, port->bridge_ep); + break; } return rc; @@ -235,14 +196,17 @@ rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb) * for egress device configured in logical endpoint. Packet is then transmitted * on the egress device. */ -void rmnet_egress_handler(struct sk_buff *skb, - struct rmnet_endpoint *ep) +void rmnet_egress_handler(struct sk_buff *skb) { struct net_device *orig_dev; struct rmnet_port *port; + struct rmnet_priv *priv; + u8 mux_id; orig_dev = skb->dev; - skb->dev = ep->egress_dev; + priv = netdev_priv(orig_dev); + skb->dev = priv->real_dev; + mux_id = priv->mux_id; port = rmnet_get_port(skb->dev); if (!port) { @@ -251,7 +215,7 @@ void rmnet_egress_handler(struct sk_buff *skb, } if (port->egress_data_format & RMNET_EGRESS_FORMAT_MAP) { - switch (rmnet_map_egress_handler(skb, port, ep, orig_dev)) { + switch (rmnet_map_egress_handler(skb, port, mux_id, orig_dev)) { case RMNET_MAP_CONSUMED: return; @@ -264,8 +228,7 @@ void rmnet_egress_handler(struct sk_buff *skb, } } - if (ep->rmnet_mode == RMNET_EPMODE_VND) - rmnet_vnd_tx_fixup(skb, orig_dev); + rmnet_vnd_tx_fixup(skb, orig_dev); dev_queue_xmit(skb); } diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h index f2638cf5693c..3537e4ceedb3 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h @@ -18,8 +18,7 @@ #include "rmnet_config.h" -void rmnet_egress_handler(struct sk_buff *skb, - struct rmnet_endpoint *ep); +void rmnet_egress_handler(struct sk_buff *skb); rx_handler_result_t rmnet_rx_handler(struct sk_buff **pskb); diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c index d1ea5e21b982..74d362f71cce 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c @@ -17,7 +17,7 @@ #include "rmnet_vnd.h" static u8 rmnet_map_do_flow_control(struct sk_buff *skb, - struct rmnet_port *rdinfo, + struct rmnet_port *port, int enable) { struct rmnet_map_control_command *cmd; @@ -37,7 +37,7 @@ static u8 rmnet_map_do_flow_control(struct sk_buff *skb, return RX_HANDLER_CONSUMED; } - ep = &rdinfo->muxed_ep[mux_id]; + ep = rmnet_get_endpoint(port, mux_id); vnd = ep->egress_dev; ip_family = cmd->flow_control.ip_family; diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h index 7967198fdd90..49102f922b31 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h @@ -19,23 +19,15 @@ #define RMNET_TX_QUEUE_LEN 1000 /* Constants */ -#define RMNET_EGRESS_FORMAT__RESERVED__ BIT(0) #define RMNET_EGRESS_FORMAT_MAP BIT(1) #define RMNET_EGRESS_FORMAT_AGGREGATION BIT(2) #define RMNET_EGRESS_FORMAT_MUXING BIT(3) -#define RMNET_EGRESS_FORMAT_MAP_CKSUMV3 BIT(4) -#define RMNET_EGRESS_FORMAT_MAP_CKSUMV4 BIT(5) -#define RMNET_INGRESS_FIX_ETHERNET BIT(0) #define RMNET_INGRESS_FORMAT_MAP BIT(1) #define RMNET_INGRESS_FORMAT_DEAGGREGATION BIT(2) #define RMNET_INGRESS_FORMAT_DEMUXING BIT(3) #define RMNET_INGRESS_FORMAT_MAP_COMMANDS BIT(4) -#define RMNET_INGRESS_FORMAT_MAP_CKSUMV3 BIT(5) -#define RMNET_INGRESS_FORMAT_MAP_CKSUMV4 BIT(6) -/* Pass the frame up the stack with no modifications to skb->dev */ -#define RMNET_EPMODE_NONE (0) /* Replace skb->dev to a virtual rmnet device and pass up the stack */ #define RMNET_EPMODE_VND (1) /* Pass the frame directly to another device with dev_queue_xmit() */ diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index 7f90d5587653..12bd0bbd5235 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -45,8 +45,8 @@ static netdev_tx_t rmnet_vnd_start_xmit(struct sk_buff *skb, struct rmnet_priv *priv; priv = netdev_priv(dev); - if (priv->local_ep.egress_dev) { - rmnet_egress_handler(skb, &priv->local_ep); + if (priv->real_dev) { + rmnet_egress_handler(skb); } else { dev->stats.tx_dropped++; kfree_skb(skb); @@ -74,6 +74,8 @@ static const struct net_device_ops rmnet_vnd_ops = { .ndo_start_xmit = rmnet_vnd_start_xmit, .ndo_change_mtu = rmnet_vnd_change_mtu, .ndo_get_iflink = rmnet_vnd_get_iflink, + .ndo_add_slave = rmnet_add_bridge, + .ndo_del_slave = rmnet_del_bridge, }; /* Called by kernel whenever a new rmnet<n> device is created. Sets MTU, @@ -100,17 +102,19 @@ void rmnet_vnd_setup(struct net_device *rmnet_dev) int rmnet_vnd_newlink(u8 id, struct net_device *rmnet_dev, struct rmnet_port *port, - struct net_device *real_dev) + struct net_device *real_dev, + struct rmnet_endpoint *ep) { struct rmnet_priv *priv; int rc; - if (port->rmnet_devices[id]) + if (ep->egress_dev) return -EINVAL; rc = register_netdevice(rmnet_dev); if (!rc) { - port->rmnet_devices[id] = rmnet_dev; + ep->egress_dev = rmnet_dev; + ep->mux_id = id; port->nr_rmnet_devs++; rmnet_dev->rtnl_link_ops = &rmnet_link_ops; @@ -125,12 +129,13 @@ int rmnet_vnd_newlink(u8 id, struct net_device *rmnet_dev, return rc; } -int rmnet_vnd_dellink(u8 id, struct rmnet_port *port) +int rmnet_vnd_dellink(u8 id, struct rmnet_port *port, + struct rmnet_endpoint *ep) { - if (id >= RMNET_MAX_LOGICAL_EP || !port->rmnet_devices[id]) + if (id >= RMNET_MAX_LOGICAL_EP || !ep->egress_dev) return -EINVAL; - port->rmnet_devices[id] = NULL; + ep->egress_dev = NULL; port->nr_rmnet_devs--; return 0; } @@ -143,21 +148,6 @@ u8 rmnet_vnd_get_mux(struct net_device *rmnet_dev) return priv->mux_id; } -/* Gets the logical endpoint configuration for a RmNet virtual network device - * node. Caller should confirm that devices is a RmNet VND before calling. - */ -struct rmnet_endpoint *rmnet_vnd_get_endpoint(struct net_device *rmnet_dev) -{ - struct rmnet_priv *priv; - - if (!rmnet_dev) - return NULL; - - priv = netdev_priv(rmnet_dev); - - return &priv->local_ep; -} - int rmnet_vnd_do_flow_control(struct net_device *rmnet_dev, int enable) { netdev_dbg(rmnet_dev, "Setting VND TX queue state to %d\n", enable); diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h index 8a4042f0f6bf..71e4c3286951 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h @@ -17,11 +17,12 @@ #define _RMNET_VND_H_ int rmnet_vnd_do_flow_control(struct net_device *dev, int enable); -struct rmnet_endpoint *rmnet_vnd_get_endpoint(struct net_device *dev); int rmnet_vnd_newlink(u8 id, struct net_device *rmnet_dev, struct rmnet_port *port, - struct net_device *real_dev); -int rmnet_vnd_dellink(u8 id, struct rmnet_port *port); + struct net_device *real_dev, + struct rmnet_endpoint *ep); +int rmnet_vnd_dellink(u8 id, struct rmnet_port *port, + struct rmnet_endpoint *ep); void rmnet_vnd_rx_fixup(struct sk_buff *skb, struct net_device *dev); void rmnet_vnd_tx_fixup(struct sk_buff *skb, struct net_device *dev); u8 rmnet_vnd_get_mux(struct net_device *rmnet_dev); diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c index a8822a756e08..2b962d349f5f 100644 --- a/drivers/net/ethernet/renesas/ravb_main.c +++ b/drivers/net/ethernet/renesas/ravb_main.c @@ -1354,20 +1354,15 @@ static void ravb_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) { struct ravb_private *priv = netdev_priv(ndev); - wol->supported = 0; - wol->wolopts = 0; - - if (priv->clk) { - wol->supported = WAKE_MAGIC; - wol->wolopts = priv->wol_enabled ? WAKE_MAGIC : 0; - } + wol->supported = WAKE_MAGIC; + wol->wolopts = priv->wol_enabled ? WAKE_MAGIC : 0; } static int ravb_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol) { struct ravb_private *priv = netdev_priv(ndev); - if (!priv->clk || wol->wolopts & ~WAKE_MAGIC) + if (wol->wolopts & ~WAKE_MAGIC) return -EOPNOTSUPP; priv->wol_enabled = !!(wol->wolopts & WAKE_MAGIC); @@ -1962,22 +1957,12 @@ MODULE_DEVICE_TABLE(of, ravb_match_table); static int ravb_set_gti(struct net_device *ndev) { - + struct ravb_private *priv = netdev_priv(ndev); struct device *dev = ndev->dev.parent; - struct device_node *np = dev->of_node; unsigned long rate; - struct clk *clk; uint64_t inc; - clk = of_clk_get(np, 0); - if (IS_ERR(clk)) { - dev_err(dev, "could not get clock\n"); - return PTR_ERR(clk); - } - - rate = clk_get_rate(clk); - clk_put(clk); - + rate = clk_get_rate(priv->clk); if (!rate) return -EINVAL; @@ -2126,10 +2111,11 @@ static int ravb_probe(struct platform_device *pdev) priv->chip_id = chip_id; - /* Get clock, if not found that's OK but Wake-On-Lan is unavailable */ priv->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(priv->clk)) - priv->clk = NULL; + if (IS_ERR(priv->clk)) { + error = PTR_ERR(priv->clk); + goto out_release; + } /* Set function */ ndev->netdev_ops = &ravb_netdev_ops; @@ -2197,8 +2183,7 @@ static int ravb_probe(struct platform_device *pdev) if (error) goto out_napi_del; - if (priv->clk) - device_set_wakeup_capable(&pdev->dev, 1); + device_set_wakeup_capable(&pdev->dev, 1); /* Print device information */ netdev_info(ndev, "Base address at %#x, %pM, IRQ %d.\n", diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h b/drivers/net/ethernet/stmicro/stmmac/common.h index e82b4b70b7be..e1e5ac053760 100644 --- a/drivers/net/ethernet/stmicro/stmmac/common.h +++ b/drivers/net/ethernet/stmicro/stmmac/common.h @@ -442,8 +442,9 @@ struct stmmac_dma_ops { void (*dma_mode)(void __iomem *ioaddr, int txmode, int rxmode, int rxfifosz); void (*dma_rx_mode)(void __iomem *ioaddr, int mode, u32 channel, - int fifosz); - void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel); + int fifosz, u8 qmode); + void (*dma_tx_mode)(void __iomem *ioaddr, int mode, u32 channel, + int fifosz, u8 qmode); /* To track extra statistic (if supported) */ void (*dma_diagnostic_fr) (void *data, struct stmmac_extra_stats *x, void __iomem *ioaddr); diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c index 866444b6c82f..2c6d7c69c8f7 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-ipq806x.c @@ -51,15 +51,11 @@ #define NSS_COMMON_CLK_SRC_CTRL_RGMII(x) 1 #define NSS_COMMON_CLK_SRC_CTRL_SGMII(x) ((x >= 2) ? 1 : 0) -#define NSS_COMMON_MACSEC_CTL 0x28 -#define NSS_COMMON_MACSEC_CTL_EXT_BYPASS_EN(x) (1 << x) - #define NSS_COMMON_GMAC_CTL(x) (0x30 + (x * 4)) #define NSS_COMMON_GMAC_CTL_CSYS_REQ BIT(19) #define NSS_COMMON_GMAC_CTL_PHY_IFACE_SEL BIT(16) #define NSS_COMMON_GMAC_CTL_IFG_LIMIT_OFFSET 8 #define NSS_COMMON_GMAC_CTL_IFG_OFFSET 0 -#define NSS_COMMON_GMAC_CTL_IFG_MASK 0x3f #define NSS_COMMON_CLK_DIV_RGMII_1000 1 #define NSS_COMMON_CLK_DIV_RGMII_100 9 @@ -68,9 +64,6 @@ #define NSS_COMMON_CLK_DIV_SGMII_100 4 #define NSS_COMMON_CLK_DIV_SGMII_10 49 -#define QSGMII_PCS_MODE_CTL 0x68 -#define QSGMII_PCS_MODE_CTL_AUTONEG_EN(x) BIT((x * 8) + 7) - #define QSGMII_PCS_CAL_LCKDT_CTL 0x120 #define QSGMII_PCS_CAL_LCKDT_CTL_RST BIT(19) @@ -83,15 +76,10 @@ #define QSGMII_PHY_TX_DRIVER_EN BIT(3) #define QSGMII_PHY_QSGMII_EN BIT(7) #define QSGMII_PHY_PHASE_LOOP_GAIN_OFFSET 12 -#define QSGMII_PHY_PHASE_LOOP_GAIN_MASK 0x7 #define QSGMII_PHY_RX_DC_BIAS_OFFSET 18 -#define QSGMII_PHY_RX_DC_BIAS_MASK 0x3 #define QSGMII_PHY_RX_INPUT_EQU_OFFSET 20 -#define QSGMII_PHY_RX_INPUT_EQU_MASK 0x3 #define QSGMII_PHY_CDR_PI_SLEW_OFFSET 22 -#define QSGMII_PHY_CDR_PI_SLEW_MASK 0x3 #define QSGMII_PHY_TX_DRV_AMP_OFFSET 28 -#define QSGMII_PHY_TX_DRV_AMP_MASK 0xf struct ipq806x_gmac { struct platform_device *pdev; @@ -217,7 +205,7 @@ static int ipq806x_gmac_of_parse(struct ipq806x_gmac *gmac) * code and keep it consistent with the Linux convention, we'll number * them from 0 to 3 here. */ - if (gmac->id < 0 || gmac->id > 3) { + if (gmac->id > 3) { dev_err(dev, "invalid gmac id\n"); return -EINVAL; } diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index d74cedf2a397..aeda3ab2d761 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -225,6 +225,8 @@ enum power_event { #define MTL_CHAN_RX_DEBUG(x) (MTL_CHANX_BASE_ADDR(x) + 0x38) #define MTL_OP_MODE_RSF BIT(5) +#define MTL_OP_MODE_TXQEN_MASK GENMASK(3, 2) +#define MTL_OP_MODE_TXQEN_AV BIT(2) #define MTL_OP_MODE_TXQEN BIT(3) #define MTL_OP_MODE_TSF BIT(1) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index e84831e1b63b..c110f6850ffa 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -191,7 +191,7 @@ static void dwmac4_rx_watchdog(void __iomem *ioaddr, u32 riwt, u32 number_chan) } static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode, - u32 channel, int fifosz) + u32 channel, int fifosz, u8 qmode) { unsigned int rqs = fifosz / 256 - 1; u32 mtl_rx_op, mtl_rx_int; @@ -218,8 +218,10 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode, mtl_rx_op &= ~MTL_OP_MODE_RQS_MASK; mtl_rx_op |= rqs << MTL_OP_MODE_RQS_SHIFT; - /* enable flow control only if each channel gets 4 KiB or more FIFO */ - if (fifosz >= 4096) { + /* Enable flow control only if each channel gets 4 KiB or more FIFO and + * only if channel is not an AVB channel. + */ + if ((fifosz >= 4096) && (qmode != MTL_QUEUE_AVB)) { unsigned int rfd, rfa; mtl_rx_op |= MTL_OP_MODE_EHFC; @@ -271,9 +273,10 @@ static void dwmac4_dma_rx_chan_op_mode(void __iomem *ioaddr, int mode, } static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode, - u32 channel) + u32 channel, int fifosz, u8 qmode) { u32 mtl_tx_op = readl(ioaddr + MTL_CHAN_TX_OP_MODE(channel)); + unsigned int tqs = fifosz / 256 - 1; if (mode == SF_DMA_MODE) { pr_debug("GMAC: enable TX store and forward mode\n"); @@ -306,12 +309,18 @@ static void dwmac4_dma_tx_chan_op_mode(void __iomem *ioaddr, int mode, * For an IP with DWC_EQOS_NUM_TXQ > 1, the fields TXQEN and TQS are R/W * with reset values: TXQEN off, TQS 256 bytes. * - * Write the bits in both cases, since it will have no effect when RO. - * For DWC_EQOS_NUM_TXQ > 1, the top bits in MTL_OP_MODE_TQS_MASK might - * be RO, however, writing the whole TQS field will result in a value - * equal to DWC_EQOS_TXFIFO_SIZE, just like for DWC_EQOS_NUM_TXQ == 1. + * TXQEN must be written for multi-channel operation and TQS must + * reflect the available fifo size per queue (total fifo size / number + * of enabled queues). */ - mtl_tx_op |= MTL_OP_MODE_TXQEN | MTL_OP_MODE_TQS_MASK; + mtl_tx_op &= ~MTL_OP_MODE_TXQEN_MASK; + if (qmode != MTL_QUEUE_AVB) + mtl_tx_op |= MTL_OP_MODE_TXQEN; + else + mtl_tx_op |= MTL_OP_MODE_TXQEN_AV; + mtl_tx_op &= ~MTL_OP_MODE_TQS_MASK; + mtl_tx_op |= tqs << MTL_OP_MODE_TQS_SHIFT; + writel(mtl_tx_op, ioaddr + MTL_CHAN_TX_OP_MODE(channel)); } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index f41661a04f23..0e1b0a3d7b76 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -1750,12 +1750,20 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) u32 rx_channels_count = priv->plat->rx_queues_to_use; u32 tx_channels_count = priv->plat->tx_queues_to_use; int rxfifosz = priv->plat->rx_fifo_size; + int txfifosz = priv->plat->tx_fifo_size; u32 txmode = 0; u32 rxmode = 0; u32 chan = 0; + u8 qmode = 0; if (rxfifosz == 0) rxfifosz = priv->dma_cap.rx_fifo_size; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + + /* Adjust for real per queue fifo size */ + rxfifosz /= rx_channels_count; + txfifosz /= tx_channels_count; if (priv->plat->force_thresh_dma_mode) { txmode = tc; @@ -1778,12 +1786,19 @@ static void stmmac_dma_operation_mode(struct stmmac_priv *priv) /* configure all channels */ if (priv->synopsys_id >= DWMAC_CORE_4_00) { - for (chan = 0; chan < rx_channels_count; chan++) + for (chan = 0; chan < rx_channels_count; chan++) { + qmode = priv->plat->rx_queues_cfg[chan].mode_to_use; + priv->hw->dma->dma_rx_mode(priv->ioaddr, rxmode, chan, - rxfifosz); + rxfifosz, qmode); + } - for (chan = 0; chan < tx_channels_count; chan++) - priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan); + for (chan = 0; chan < tx_channels_count; chan++) { + qmode = priv->plat->tx_queues_cfg[chan].mode_to_use; + + priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan, + txfifosz, qmode); + } } else { priv->hw->dma->dma_mode(priv->ioaddr, txmode, rxmode, rxfifosz); @@ -1946,15 +1961,27 @@ static void stmmac_tx_err(struct stmmac_priv *priv, u32 chan) static void stmmac_set_dma_operation_mode(struct stmmac_priv *priv, u32 txmode, u32 rxmode, u32 chan) { + u8 rxqmode = priv->plat->rx_queues_cfg[chan].mode_to_use; + u8 txqmode = priv->plat->tx_queues_cfg[chan].mode_to_use; + u32 rx_channels_count = priv->plat->rx_queues_to_use; + u32 tx_channels_count = priv->plat->tx_queues_to_use; int rxfifosz = priv->plat->rx_fifo_size; + int txfifosz = priv->plat->tx_fifo_size; if (rxfifosz == 0) rxfifosz = priv->dma_cap.rx_fifo_size; + if (txfifosz == 0) + txfifosz = priv->dma_cap.tx_fifo_size; + + /* Adjust for real per queue fifo size */ + rxfifosz /= rx_channels_count; + txfifosz /= tx_channels_count; if (priv->synopsys_id >= DWMAC_CORE_4_00) { priv->hw->dma->dma_rx_mode(priv->ioaddr, rxmode, chan, - rxfifosz); - priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan); + rxfifosz, rxqmode); + priv->hw->dma->dma_tx_mode(priv->ioaddr, txmode, chan, + txfifosz, txqmode); } else { priv->hw->dma->dma_mode(priv->ioaddr, txmode, rxmode, rxfifosz); diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 6f550e15a41c..a81335e8ebe8 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -704,6 +704,14 @@ struct netvsc_reconfig { u32 event; }; +/* L4 hash bits for different protocols */ +#define HV_TCP4_L4HASH 1 +#define HV_TCP6_L4HASH 2 +#define HV_UDP4_L4HASH 4 +#define HV_UDP6_L4HASH 8 +#define HV_DEFAULT_L4HASH (HV_TCP4_L4HASH | HV_TCP6_L4HASH | HV_UDP4_L4HASH | \ + HV_UDP6_L4HASH) + /* The context of the netvsc device */ struct net_device_context { /* point back to our device context */ @@ -726,10 +734,9 @@ struct net_device_context { u32 tx_send_table[VRSS_SEND_TAB_SIZE]; /* Ethtool settings */ - bool udp4_l4_hash; - bool udp6_l4_hash; u8 duplex; u32 speed; + u32 l4_hash; /* L4 hash settings */ struct netvsc_ethtool_stats eth_stats; /* State to manage the associated VF interface. */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index dfb986421ec6..44746de3dd4c 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -203,7 +203,7 @@ static inline u32 netvsc_get_hash( const struct net_device_context *ndc) { struct flow_keys flow; - u32 hash; + u32 hash, pkt_proto = 0; static u32 hashrnd __read_mostly; net_get_random_once(&hashrnd, sizeof(hashrnd)); @@ -211,11 +211,25 @@ static inline u32 netvsc_get_hash( if (!skb_flow_dissect_flow_keys(skb, &flow, 0)) return 0; - if (flow.basic.ip_proto == IPPROTO_TCP || - (flow.basic.ip_proto == IPPROTO_UDP && - ((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) || - (flow.basic.n_proto == htons(ETH_P_IPV6) && - ndc->udp6_l4_hash)))) { + switch (flow.basic.ip_proto) { + case IPPROTO_TCP: + if (flow.basic.n_proto == htons(ETH_P_IP)) + pkt_proto = HV_TCP4_L4HASH; + else if (flow.basic.n_proto == htons(ETH_P_IPV6)) + pkt_proto = HV_TCP6_L4HASH; + + break; + + case IPPROTO_UDP: + if (flow.basic.n_proto == htons(ETH_P_IP)) + pkt_proto = HV_UDP4_L4HASH; + else if (flow.basic.n_proto == htons(ETH_P_IPV6)) + pkt_proto = HV_UDP6_L4HASH; + + break; + } + + if (pkt_proto & ndc->l4_hash) { return skb_get_hash(skb); } else { if (flow.basic.n_proto == htons(ETH_P_IP)) @@ -898,8 +912,7 @@ static void netvsc_init_settings(struct net_device *dev) { struct net_device_context *ndc = netdev_priv(dev); - ndc->udp4_l4_hash = true; - ndc->udp6_l4_hash = true; + ndc->l4_hash = HV_DEFAULT_L4HASH; ndc->speed = SPEED_UNKNOWN; ndc->duplex = DUPLEX_FULL; @@ -1245,23 +1258,32 @@ static int netvsc_get_rss_hash_opts(struct net_device_context *ndc, struct ethtool_rxnfc *info) { + const u32 l4_flag = RXH_L4_B_0_1 | RXH_L4_B_2_3; + info->data = RXH_IP_SRC | RXH_IP_DST; switch (info->flow_type) { case TCP_V4_FLOW: + if (ndc->l4_hash & HV_TCP4_L4HASH) + info->data |= l4_flag; + + break; + case TCP_V6_FLOW: - info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + if (ndc->l4_hash & HV_TCP6_L4HASH) + info->data |= l4_flag; + break; case UDP_V4_FLOW: - if (ndc->udp4_l4_hash) - info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + if (ndc->l4_hash & HV_UDP4_L4HASH) + info->data |= l4_flag; break; case UDP_V6_FLOW: - if (ndc->udp6_l4_hash) - info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; + if (ndc->l4_hash & HV_UDP6_L4HASH) + info->data |= l4_flag; break; @@ -1302,23 +1324,51 @@ static int netvsc_set_rss_hash_opts(struct net_device_context *ndc, { if (info->data == (RXH_IP_SRC | RXH_IP_DST | RXH_L4_B_0_1 | RXH_L4_B_2_3)) { - if (info->flow_type == UDP_V4_FLOW) - ndc->udp4_l4_hash = true; - else if (info->flow_type == UDP_V6_FLOW) - ndc->udp6_l4_hash = true; - else + switch (info->flow_type) { + case TCP_V4_FLOW: + ndc->l4_hash |= HV_TCP4_L4HASH; + break; + + case TCP_V6_FLOW: + ndc->l4_hash |= HV_TCP6_L4HASH; + break; + + case UDP_V4_FLOW: + ndc->l4_hash |= HV_UDP4_L4HASH; + break; + + case UDP_V6_FLOW: + ndc->l4_hash |= HV_UDP6_L4HASH; + break; + + default: return -EOPNOTSUPP; + } return 0; } if (info->data == (RXH_IP_SRC | RXH_IP_DST)) { - if (info->flow_type == UDP_V4_FLOW) - ndc->udp4_l4_hash = false; - else if (info->flow_type == UDP_V6_FLOW) - ndc->udp6_l4_hash = false; - else + switch (info->flow_type) { + case TCP_V4_FLOW: + ndc->l4_hash &= ~HV_TCP4_L4HASH; + break; + + case TCP_V6_FLOW: + ndc->l4_hash &= ~HV_TCP6_L4HASH; + break; + + case UDP_V4_FLOW: + ndc->l4_hash &= ~HV_UDP4_L4HASH; + break; + + case UDP_V6_FLOW: + ndc->l4_hash &= ~HV_UDP6_L4HASH; + break; + + default: return -EOPNOTSUPP; + } return 0; } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 57c3856bab05..3cf67db513e2 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -407,7 +407,7 @@ static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, * while the packets use the mac-addr on the physical device. */ return dev_hard_header(skb, phy_dev, type, daddr, - saddr ? : dev->dev_addr, len); + saddr ? : phy_dev->dev_addr, len); } static const struct header_ops ipvlan_header_ops = { @@ -730,6 +730,11 @@ static int ipvlan_device_event(struct notifier_block *unused, ipvlan_adjust_mtu(ipvlan, dev); break; + case NETDEV_CHANGEADDR: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + ether_addr_copy(ipvlan->dev->dev_addr, dev->dev_addr); + break; + case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig index cd931cf9dcc2..8125412c8814 100644 --- a/drivers/net/phy/Kconfig +++ b/drivers/net/phy/Kconfig @@ -277,6 +277,11 @@ config DAVICOM_PHY ---help--- Currently supports dm9161e and dm9131 +config DP83822_PHY + tristate "Texas Instruments DP83822 PHY" + ---help--- + Supports the DP83822 PHY. + config DP83848_PHY tristate "Texas Instruments DP83848 PHY" ---help--- @@ -366,6 +371,11 @@ config REALTEK_PHY ---help--- Supports the Realtek 821x PHY. +config RENESAS_PHY + tristate "Driver for Renesas PHYs" + ---help--- + Supports the Renesas PHYs uPD60620 and uPD60620A. + config ROCKCHIP_PHY tristate "Driver for Rockchip Ethernet PHYs" ---help--- diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile index 416df92fbf4f..4b983be83566 100644 --- a/drivers/net/phy/Makefile +++ b/drivers/net/phy/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_CICADA_PHY) += cicada.o obj-$(CONFIG_CORTINA_PHY) += cortina.o obj-$(CONFIG_DAVICOM_PHY) += davicom.o obj-$(CONFIG_DP83640_PHY) += dp83640.o +obj-$(CONFIG_DP83822_PHY) += dp83822.o obj-$(CONFIG_DP83848_PHY) += dp83848.o obj-$(CONFIG_DP83867_PHY) += dp83867.o obj-$(CONFIG_FIXED_PHY) += fixed_phy.o @@ -72,6 +73,7 @@ obj-$(CONFIG_MICROSEMI_PHY) += mscc.o obj-$(CONFIG_NATIONAL_PHY) += national.o obj-$(CONFIG_QSEMI_PHY) += qsemi.o obj-$(CONFIG_REALTEK_PHY) += realtek.o +obj-$(CONFIG_RENESAS_PHY) += uPD60620.o obj-$(CONFIG_ROCKCHIP_PHY) += rockchip.o obj-$(CONFIG_SMSC_PHY) += smsc.o obj-$(CONFIG_STE10XP) += ste10Xp.o diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c index c1e52b9dc58d..5f93e6add563 100644 --- a/drivers/net/phy/at803x.c +++ b/drivers/net/phy/at803x.c @@ -167,7 +167,7 @@ static int at803x_set_wol(struct phy_device *phydev, mac = (const u8 *) ndev->dev_addr; if (!is_valid_ether_addr(mac)) - return -EFAULT; + return -EINVAL; for (i = 0; i < 3; i++) { phy_write(phydev, AT803X_MMD_ACCESS_CONTROL, diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index 1e9ad30a35c8..d7ed69deabfb 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -43,6 +43,12 @@ static int bcm54210e_config_init(struct phy_device *phydev) val &= ~BCM54810_SHD_CLK_CTL_GTXCLK_EN; bcm_phy_write_shadow(phydev, BCM54810_SHD_CLK_CTL, val); + if (phydev->dev_flags & PHY_BRCM_EN_MASTER_MODE) { + val = phy_read(phydev, MII_CTRL1000); + val |= CTL1000_AS_MASTER | CTL1000_ENABLE_MASTER; + phy_write(phydev, MII_CTRL1000, val); + } + return 0; } diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c new file mode 100644 index 000000000000..14335d14e9e4 --- /dev/null +++ b/drivers/net/phy/dp83822.c @@ -0,0 +1,344 @@ +/* + * Driver for the Texas Instruments DP83822 PHY + * + * Copyright (C) 2017 Texas Instruments Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/ethtool.h> +#include <linux/etherdevice.h> +#include <linux/kernel.h> +#include <linux/mii.h> +#include <linux/module.h> +#include <linux/of.h> +#include <linux/phy.h> +#include <linux/netdevice.h> + +#define DP83822_PHY_ID 0x2000a240 +#define DP83822_DEVADDR 0x1f + +#define MII_DP83822_PHYSCR 0x11 +#define MII_DP83822_MISR1 0x12 +#define MII_DP83822_MISR2 0x13 +#define MII_DP83822_RESET_CTRL 0x1f + +#define DP83822_HW_RESET BIT(15) +#define DP83822_SW_RESET BIT(14) + +/* PHYSCR Register Fields */ +#define DP83822_PHYSCR_INT_OE BIT(0) /* Interrupt Output Enable */ +#define DP83822_PHYSCR_INTEN BIT(1) /* Interrupt Enable */ + +/* MISR1 bits */ +#define DP83822_RX_ERR_HF_INT_EN BIT(0) +#define DP83822_FALSE_CARRIER_HF_INT_EN BIT(1) +#define DP83822_ANEG_COMPLETE_INT_EN BIT(2) +#define DP83822_DUP_MODE_CHANGE_INT_EN BIT(3) +#define DP83822_SPEED_CHANGED_INT_EN BIT(4) +#define DP83822_LINK_STAT_INT_EN BIT(5) +#define DP83822_ENERGY_DET_INT_EN BIT(6) +#define DP83822_LINK_QUAL_INT_EN BIT(7) + +/* MISR2 bits */ +#define DP83822_JABBER_DET_INT_EN BIT(0) +#define DP83822_WOL_PKT_INT_EN BIT(1) +#define DP83822_SLEEP_MODE_INT_EN BIT(2) +#define DP83822_MDI_XOVER_INT_EN BIT(3) +#define DP83822_LB_FIFO_INT_EN BIT(4) +#define DP83822_PAGE_RX_INT_EN BIT(5) +#define DP83822_ANEG_ERR_INT_EN BIT(6) +#define DP83822_EEE_ERROR_CHANGE_INT_EN BIT(7) + +/* INT_STAT1 bits */ +#define DP83822_WOL_INT_EN BIT(4) +#define DP83822_WOL_INT_STAT BIT(12) + +#define MII_DP83822_RXSOP1 0x04a5 +#define MII_DP83822_RXSOP2 0x04a6 +#define MII_DP83822_RXSOP3 0x04a7 + +/* WoL Registers */ +#define MII_DP83822_WOL_CFG 0x04a0 +#define MII_DP83822_WOL_STAT 0x04a1 +#define MII_DP83822_WOL_DA1 0x04a2 +#define MII_DP83822_WOL_DA2 0x04a3 +#define MII_DP83822_WOL_DA3 0x04a4 + +/* WoL bits */ +#define DP83822_WOL_MAGIC_EN BIT(0) +#define DP83822_WOL_SECURE_ON BIT(5) +#define DP83822_WOL_EN BIT(7) +#define DP83822_WOL_INDICATION_SEL BIT(8) +#define DP83822_WOL_CLR_INDICATION BIT(11) + +static int dp83822_ack_interrupt(struct phy_device *phydev) +{ + int err; + + err = phy_read(phydev, MII_DP83822_MISR1); + if (err < 0) + return err; + + err = phy_read(phydev, MII_DP83822_MISR2); + if (err < 0) + return err; + + return 0; +} + +static int dp83822_set_wol(struct phy_device *phydev, + struct ethtool_wolinfo *wol) +{ + struct net_device *ndev = phydev->attached_dev; + u16 value; + const u8 *mac; + + if (wol->wolopts & (WAKE_MAGIC | WAKE_MAGICSECURE)) { + mac = (const u8 *)ndev->dev_addr; + + if (!is_valid_ether_addr(mac)) + return -EINVAL; + + /* MAC addresses start with byte 5, but stored in mac[0]. + * 822 PHYs store bytes 4|5, 2|3, 0|1 + */ + phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA1, + (mac[1] << 8) | mac[0]); + phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA2, + (mac[3] << 8) | mac[2]); + phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_DA3, + (mac[5] << 8) | mac[4]); + + value = phy_read_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_WOL_CFG); + if (wol->wolopts & WAKE_MAGIC) + value |= DP83822_WOL_MAGIC_EN; + else + value &= ~DP83822_WOL_MAGIC_EN; + + if (wol->wolopts & WAKE_MAGICSECURE) { + phy_write_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_RXSOP1, + (wol->sopass[1] << 8) | wol->sopass[0]); + phy_write_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_RXSOP2, + (wol->sopass[3] << 8) | wol->sopass[2]); + phy_write_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_RXSOP3, + (wol->sopass[5] << 8) | wol->sopass[4]); + value |= DP83822_WOL_SECURE_ON; + } else { + value &= ~DP83822_WOL_SECURE_ON; + } + + value |= (DP83822_WOL_EN | DP83822_WOL_INDICATION_SEL | + DP83822_WOL_CLR_INDICATION); + phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, + value); + } else { + value = phy_read_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_WOL_CFG); + value &= ~DP83822_WOL_EN; + phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, + value); + } + + return 0; +} + +static void dp83822_get_wol(struct phy_device *phydev, + struct ethtool_wolinfo *wol) +{ + int value; + u16 sopass_val; + + wol->supported = (WAKE_MAGIC | WAKE_MAGICSECURE); + wol->wolopts = 0; + + value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + + if (value & DP83822_WOL_MAGIC_EN) + wol->wolopts |= WAKE_MAGIC; + + if (value & DP83822_WOL_SECURE_ON) { + sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_RXSOP1); + wol->sopass[0] = (sopass_val & 0xff); + wol->sopass[1] = (sopass_val >> 8); + + sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_RXSOP2); + wol->sopass[2] = (sopass_val & 0xff); + wol->sopass[3] = (sopass_val >> 8); + + sopass_val = phy_read_mmd(phydev, DP83822_DEVADDR, + MII_DP83822_RXSOP3); + wol->sopass[4] = (sopass_val & 0xff); + wol->sopass[5] = (sopass_val >> 8); + + wol->wolopts |= WAKE_MAGICSECURE; + } + + /* WoL is not enabled so set wolopts to 0 */ + if (!(value & DP83822_WOL_EN)) + wol->wolopts = 0; +} + +static int dp83822_config_intr(struct phy_device *phydev) +{ + int misr_status; + int physcr_status; + int err; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + misr_status = phy_read(phydev, MII_DP83822_MISR1); + if (misr_status < 0) + return misr_status; + + misr_status |= (DP83822_RX_ERR_HF_INT_EN | + DP83822_FALSE_CARRIER_HF_INT_EN | + DP83822_ANEG_COMPLETE_INT_EN | + DP83822_DUP_MODE_CHANGE_INT_EN | + DP83822_SPEED_CHANGED_INT_EN | + DP83822_LINK_STAT_INT_EN | + DP83822_ENERGY_DET_INT_EN | + DP83822_LINK_QUAL_INT_EN); + + err = phy_write(phydev, MII_DP83822_MISR1, misr_status); + if (err < 0) + return err; + + misr_status = phy_read(phydev, MII_DP83822_MISR2); + if (misr_status < 0) + return misr_status; + + misr_status |= (DP83822_JABBER_DET_INT_EN | + DP83822_WOL_PKT_INT_EN | + DP83822_SLEEP_MODE_INT_EN | + DP83822_MDI_XOVER_INT_EN | + DP83822_LB_FIFO_INT_EN | + DP83822_PAGE_RX_INT_EN | + DP83822_ANEG_ERR_INT_EN | + DP83822_EEE_ERROR_CHANGE_INT_EN); + + err = phy_write(phydev, MII_DP83822_MISR2, misr_status); + if (err < 0) + return err; + + physcr_status = phy_read(phydev, MII_DP83822_PHYSCR); + if (physcr_status < 0) + return physcr_status; + + physcr_status |= DP83822_PHYSCR_INT_OE | DP83822_PHYSCR_INTEN; + + } else { + err = phy_write(phydev, MII_DP83822_MISR1, 0); + if (err < 0) + return err; + + err = phy_write(phydev, MII_DP83822_MISR1, 0); + if (err < 0) + return err; + + physcr_status = phy_read(phydev, MII_DP83822_PHYSCR); + if (physcr_status < 0) + return physcr_status; + + physcr_status &= ~DP83822_PHYSCR_INTEN; + } + + return phy_write(phydev, MII_DP83822_PHYSCR, physcr_status); +} + +static int dp83822_config_init(struct phy_device *phydev) +{ + int err; + int value; + + err = genphy_config_init(phydev); + if (err < 0) + return err; + + value = DP83822_WOL_MAGIC_EN | DP83822_WOL_SECURE_ON | DP83822_WOL_EN; + + return phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, + value); +} + +static int dp83822_phy_reset(struct phy_device *phydev) +{ + int err; + + err = phy_write(phydev, MII_DP83822_RESET_CTRL, DP83822_HW_RESET); + if (err < 0) + return err; + + dp83822_config_init(phydev); + + return 0; +} + +static int dp83822_suspend(struct phy_device *phydev) +{ + int value; + + value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + + if (!(value & DP83822_WOL_EN)) + genphy_suspend(phydev); + + return 0; +} + +static int dp83822_resume(struct phy_device *phydev) +{ + int value; + + genphy_resume(phydev); + + value = phy_read_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG); + + phy_write_mmd(phydev, DP83822_DEVADDR, MII_DP83822_WOL_CFG, value | + DP83822_WOL_CLR_INDICATION); + + return 0; +} + +static struct phy_driver dp83822_driver[] = { + { + .phy_id = DP83822_PHY_ID, + .phy_id_mask = 0xfffffff0, + .name = "TI DP83822", + .features = PHY_BASIC_FEATURES, + .flags = PHY_HAS_INTERRUPT, + .config_init = dp83822_config_init, + .soft_reset = dp83822_phy_reset, + .get_wol = dp83822_get_wol, + .set_wol = dp83822_set_wol, + .ack_interrupt = dp83822_ack_interrupt, + .config_intr = dp83822_config_intr, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .suspend = dp83822_suspend, + .resume = dp83822_resume, + }, +}; +module_phy_driver(dp83822_driver); + +static struct mdio_device_id __maybe_unused dp83822_tbl[] = { + { DP83822_PHY_ID, 0xfffffff0 }, + { }, +}; +MODULE_DEVICE_TABLE(mdio, dp83822_tbl); + +MODULE_DESCRIPTION("Texas Instruments DP83822 PHY driver"); +MODULE_AUTHOR("Dan Murphy <dmurphy@ti.com"); +MODULE_LICENSE("GPL"); diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c index 3de4fe4dda77..3966d43c5146 100644 --- a/drivers/net/phy/dp83848.c +++ b/drivers/net/phy/dp83848.c @@ -20,7 +20,6 @@ #define TI_DP83620_PHY_ID 0x20005ce0 #define NS_DP83848C_PHY_ID 0x20005c90 #define TLK10X_PHY_ID 0x2000a210 -#define TI_DP83822_PHY_ID 0x2000a240 /* Registers */ #define DP83848_MICR 0x11 /* MII Interrupt Control Register */ @@ -80,7 +79,6 @@ static struct mdio_device_id __maybe_unused dp83848_tbl[] = { { NS_DP83848C_PHY_ID, 0xfffffff0 }, { TI_DP83620_PHY_ID, 0xfffffff0 }, { TLK10X_PHY_ID, 0xfffffff0 }, - { TI_DP83822_PHY_ID, 0xfffffff0 }, { } }; MODULE_DEVICE_TABLE(mdio, dp83848_tbl); @@ -110,7 +108,6 @@ static struct phy_driver dp83848_driver[] = { DP83848_PHY_DRIVER(NS_DP83848C_PHY_ID, "NS DP83848C 10/100 Mbps PHY"), DP83848_PHY_DRIVER(TI_DP83620_PHY_ID, "TI DP83620 10/100 Mbps PHY"), DP83848_PHY_DRIVER(TLK10X_PHY_ID, "TI TLK10X 10/100 Mbps PHY"), - DP83848_PHY_DRIVER(TI_DP83822_PHY_ID, "TI DP83822 10/100 Mbps PHY"), }; module_phy_driver(dp83848_driver); diff --git a/drivers/net/phy/uPD60620.c b/drivers/net/phy/uPD60620.c new file mode 100644 index 000000000000..96b33475ea5e --- /dev/null +++ b/drivers/net/phy/uPD60620.c @@ -0,0 +1,109 @@ +/* + * Driver for the Renesas PHY uPD60620. + * + * Copyright (C) 2015 Softing Industrial Automation GmbH + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/phy.h> + +#define UPD60620_PHY_ID 0xb8242824 + +/* Extended Registers and values */ +/* PHY Special Control/Status */ +#define PHY_PHYSCR 0x1F /* PHY.31 */ +#define PHY_PHYSCR_10MB 0x0004 /* PHY speed = 10mb */ +#define PHY_PHYSCR_100MB 0x0008 /* PHY speed = 100mb */ +#define PHY_PHYSCR_DUPLEX 0x0010 /* PHY Duplex */ + +/* PHY Special Modes */ +#define PHY_SPM 0x12 /* PHY.18 */ + +/* Init PHY */ + +static int upd60620_config_init(struct phy_device *phydev) +{ + /* Enable support for passive HUBs (could be a strap option) */ + /* PHYMODE: All speeds, HD in parallel detect */ + return phy_write(phydev, PHY_SPM, 0x0180 | phydev->mdio.addr); +} + +/* Get PHY status from common registers */ + +static int upd60620_read_status(struct phy_device *phydev) +{ + int phy_state; + + /* Read negotiated state */ + phy_state = phy_read(phydev, MII_BMSR); + if (phy_state < 0) + return phy_state; + + phydev->link = 0; + phydev->lp_advertising = 0; + phydev->pause = 0; + phydev->asym_pause = 0; + + if (phy_state & (BMSR_ANEGCOMPLETE | BMSR_LSTATUS)) { + phy_state = phy_read(phydev, PHY_PHYSCR); + if (phy_state < 0) + return phy_state; + + if (phy_state & (PHY_PHYSCR_10MB | PHY_PHYSCR_100MB)) { + phydev->link = 1; + phydev->speed = SPEED_10; + phydev->duplex = DUPLEX_HALF; + + if (phy_state & PHY_PHYSCR_100MB) + phydev->speed = SPEED_100; + if (phy_state & PHY_PHYSCR_DUPLEX) + phydev->duplex = DUPLEX_FULL; + + phy_state = phy_read(phydev, MII_LPA); + if (phy_state < 0) + return phy_state; + + phydev->lp_advertising + = mii_lpa_to_ethtool_lpa_t(phy_state); + + if (phydev->duplex == DUPLEX_FULL) { + if (phy_state & LPA_PAUSE_CAP) + phydev->pause = 1; + if (phy_state & LPA_PAUSE_ASYM) + phydev->asym_pause = 1; + } + } + } + return 0; +} + +MODULE_DESCRIPTION("Renesas uPD60620 PHY driver"); +MODULE_AUTHOR("Bernd Edlinger <bernd.edlinger@hotmail.de>"); +MODULE_LICENSE("GPL"); + +static struct phy_driver upd60620_driver[1] = { { + .phy_id = UPD60620_PHY_ID, + .phy_id_mask = 0xfffffffe, + .name = "Renesas uPD60620", + .features = PHY_BASIC_FEATURES, + .flags = 0, + .config_init = upd60620_config_init, + .config_aneg = genphy_config_aneg, + .read_status = upd60620_read_status, +} }; + +module_phy_driver(upd60620_driver); + +static struct mdio_device_id __maybe_unused upd60620_tbl[] = { + { UPD60620_PHY_ID, 0xfffffffe }, + { } +}; + +MODULE_DEVICE_TABLE(mdio, upd60620_tbl); diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index c3f77e3b7819..e365866600ba 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1339,7 +1339,17 @@ ppp_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats64) static int ppp_dev_init(struct net_device *dev) { + struct ppp *ppp; + netdev_lockdep_set_classes(dev); + + ppp = netdev_priv(dev); + /* Let the netdevice take a reference on the ppp file. This ensures + * that ppp_destroy_interface() won't run before the device gets + * unregistered. + */ + atomic_inc(&ppp->file.refcnt); + return 0; } @@ -1362,6 +1372,15 @@ static void ppp_dev_uninit(struct net_device *dev) wake_up_interruptible(&ppp->file.rwait); } +static void ppp_dev_priv_destructor(struct net_device *dev) +{ + struct ppp *ppp; + + ppp = netdev_priv(dev); + if (atomic_dec_and_test(&ppp->file.refcnt)) + ppp_destroy_interface(ppp); +} + static const struct net_device_ops ppp_netdev_ops = { .ndo_init = ppp_dev_init, .ndo_uninit = ppp_dev_uninit, @@ -1387,6 +1406,7 @@ static void ppp_setup(struct net_device *dev) dev->tx_queue_len = 3; dev->type = ARPHRD_PPP; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; + dev->priv_destructor = ppp_dev_priv_destructor; netif_keep_dst(dev); } diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index 29c7e2ec0dcb..52ea80bcd639 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -560,6 +560,7 @@ static const struct driver_info wwan_info = { #define NVIDIA_VENDOR_ID 0x0955 #define HP_VENDOR_ID 0x03f0 #define MICROSOFT_VENDOR_ID 0x045e +#define UBLOX_VENDOR_ID 0x1546 static const struct usb_device_id products[] = { /* BLACKLIST !! @@ -869,6 +870,18 @@ static const struct usb_device_id products[] = { USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&zte_cdc_info, }, { + /* U-blox TOBY-L2 */ + USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1143, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, +}, { + /* U-blox SARA-U2 */ + USB_DEVICE_AND_INTERFACE_INFO(UBLOX_VENDOR_ID, 0x1104, USB_CLASS_COMM, + USB_CDC_SUBCLASS_ETHERNET, + USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&wwan_info, +}, { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), .driver_info = (unsigned long) &cdc_info, diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb2aad078637..5a14cc7f28ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2136,7 +2136,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ns->uuid) || + if (uuid_is_null(&ns->uuid) && !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) return 0; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cb73bc8cad3b..3f5a04c586ce 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -94,7 +94,7 @@ struct nvme_dev { struct mutex shutdown_lock; bool subsystem; void __iomem *cmb; - dma_addr_t cmb_dma_addr; + pci_bus_addr_t cmb_bus_addr; u64 cmb_size; u32 cmbsz; u32 cmbloc; @@ -1226,7 +1226,7 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); - nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; + nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), @@ -1527,7 +1527,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); void __iomem *cmb; - dma_addr_t dma_addr; + int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) @@ -1540,7 +1540,8 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); offset = szu * NVME_CMB_OFST(dev->cmbloc); - bar_size = pci_resource_len(pdev, NVME_CMB_BIR(dev->cmbloc)); + bar = NVME_CMB_BIR(dev->cmbloc); + bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) return NULL; @@ -1553,12 +1554,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - dma_addr = pci_resource_start(pdev, NVME_CMB_BIR(dev->cmbloc)) + offset; - cmb = ioremap_wc(dma_addr, size); + cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); if (!cmb) return NULL; - dev->cmb_dma_addr = dma_addr; + dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; return cmb; } diff --git a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c index 785fb42f6650..2799a6b08f73 100644 --- a/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c +++ b/drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c @@ -3767,7 +3767,7 @@ static int ibmvscsis_write_pending(struct se_cmd *se_cmd) */ if ((vscsi->flags & (CLIENT_FAILED | RESPONSE_Q_DOWN))) { pr_err("write_pending failed since: %d\n", vscsi->flags); - return 0; + return -EIO; } rc = srp_transfer_data(cmd, &vio_iu(iue)->srp.cmd, ibmvscsis_rdma, diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index bd4605a34f54..c62e8d111fd9 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2851,9 +2851,6 @@ EXPORT_SYMBOL_GPL(iscsi_session_setup); /** * iscsi_session_teardown - destroy session, host, and cls_session * @cls_session: iscsi session - * - * The driver must have called iscsi_remove_session before - * calling this. */ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) { @@ -2863,6 +2860,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) iscsi_pool_free(&session->cmdpool); + iscsi_remove_session(cls_session); + kfree(session->password); kfree(session->password_in); kfree(session->username); @@ -2877,7 +2876,8 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->portal_type); kfree(session->discovery_parent_type); - iscsi_destroy_session(cls_session); + iscsi_free_session(cls_session); + iscsi_host_dec_session_cnt(shost); module_put(owner); } diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index e7818afeda2b..15590a063ad9 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -956,6 +956,9 @@ static int scsi_add_lun(struct scsi_device *sdev, unsigned char *inq_result, if (*bflags & BLIST_NO_DIF) sdev->no_dif = 1; + if (*bflags & BLIST_UNMAP_LIMIT_WS) + sdev->unmap_limit_for_ws = 1; + sdev->eh_timeout = SCSI_DEFAULT_EH_TIMEOUT; if (*bflags & BLIST_TRY_VPD_PAGES) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 0190aeff5f7f..7404d26895f5 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -2211,22 +2211,6 @@ void iscsi_free_session(struct iscsi_cls_session *session) EXPORT_SYMBOL_GPL(iscsi_free_session); /** - * iscsi_destroy_session - destroy iscsi session - * @session: iscsi_session - * - * Can be called by a LLD or iscsi_transport. There must not be - * any running connections. - */ -int iscsi_destroy_session(struct iscsi_cls_session *session) -{ - iscsi_remove_session(session); - ISCSI_DBG_TRANS_SESSION(session, "Completing session destruction\n"); - iscsi_free_session(session); - return 0; -} -EXPORT_SYMBOL_GPL(iscsi_destroy_session); - -/** * iscsi_create_conn - create iscsi class connection * @session: iscsi cls session * @dd_size: private driver data size diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index fb9f8b5f4673..d175c5c5ccf8 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -715,13 +715,21 @@ static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) break; case SD_LBP_WS16: - max_blocks = min_not_zero(sdkp->max_ws_blocks, - (u32)SD_MAX_WS16_BLOCKS); + if (sdkp->device->unmap_limit_for_ws) + max_blocks = sdkp->max_unmap_blocks; + else + max_blocks = sdkp->max_ws_blocks; + + max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: - max_blocks = min_not_zero(sdkp->max_ws_blocks, - (u32)SD_MAX_WS10_BLOCKS); + if (sdkp->device->unmap_limit_for_ws) + max_blocks = sdkp->max_unmap_blocks; + else + max_blocks = sdkp->max_ws_blocks; + + max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: @@ -3099,8 +3107,6 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_security(sdkp, buffer); } - sdkp->first_scan = 0; - /* * We now have all cache related info, determine how we deal * with flush requests. @@ -3115,7 +3121,7 @@ static int sd_revalidate_disk(struct gendisk *disk) q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); /* - * Use the device's preferred I/O size for reads and writes + * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, or * garbage. */ @@ -3129,8 +3135,19 @@ static int sd_revalidate_disk(struct gendisk *disk) rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), (sector_t)BLK_DEF_MAX_SECTORS); - /* Combine with controller limits */ - q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q)); + /* Do not exceed controller limit */ + rw_max = min(rw_max, queue_max_hw_sectors(q)); + + /* + * Only update max_sectors if previously unset or if the current value + * exceeds the capabilities of the hardware. + */ + if (sdkp->first_scan || + q->limits.max_sectors > q->limits.max_dev_sectors || + q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors = rw_max; + + sdkp->first_scan = 0; set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); diff --git a/drivers/thunderbolt/nhi.c b/drivers/thunderbolt/nhi.c index 0e79eebfcbb7..419a7a90bce0 100644 --- a/drivers/thunderbolt/nhi.c +++ b/drivers/thunderbolt/nhi.c @@ -1144,5 +1144,5 @@ static void __exit nhi_unload(void) tb_domain_exit(); } -module_init(nhi_init); +fs_initcall(nhi_init); module_exit(nhi_unload); diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c index f2d06f6f7be9..138027537d29 100644 --- a/drivers/thunderbolt/xdomain.c +++ b/drivers/thunderbolt/xdomain.c @@ -1487,6 +1487,9 @@ int tb_register_property_dir(const char *key, struct tb_property_dir *dir) { int ret; + if (WARN_ON(!xdomain_property_dir)) + return -EAGAIN; + if (!key || strlen(key) > 8) return -EINVAL; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 58585ec8699e..68677d930e20 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -436,8 +436,8 @@ static bool vhost_exceeds_maxpend(struct vhost_net *net) struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX]; struct vhost_virtqueue *vq = &nvq->vq; - return (nvq->upend_idx + vq->num - VHOST_MAX_PEND) % UIO_MAXIOV - == nvq->done_idx; + return (nvq->upend_idx + UIO_MAXIOV - nvq->done_idx) % UIO_MAXIOV > + min_t(unsigned int, VHOST_MAX_PEND, vq->num >> 2); } /* Expects to be always run from workqueue - which acts as @@ -480,11 +480,6 @@ static void handle_tx(struct vhost_net *net) if (zcopy) vhost_zerocopy_signal_used(net, vq); - /* If more outstanding DMAs, queue the work. - * Handle upend_idx wrap around - */ - if (unlikely(vhost_exceeds_maxpend(net))) - break; head = vhost_net_tx_get_vq_desc(net, vq, vq->iov, ARRAY_SIZE(vq->iov), @@ -519,8 +514,7 @@ static void handle_tx(struct vhost_net *net) len = msg_data_left(&msg); zcopy_used = zcopy && len >= VHOST_GOODCOPY_LEN - && (nvq->upend_idx + 1) % UIO_MAXIOV != - nvq->done_idx + && !vhost_exceeds_maxpend(net) && vhost_net_tx_select_zcopy(net); /* use msg_control to pass vhost zerocopy ubuf info to skb */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 899ddaeeacec..8fc690384c58 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -722,7 +722,7 @@ struct btrfs_delayed_root; * Indicate that a whole-filesystem exclusive operation is running * (device replace, resize, device add/delete, balance) */ -#define BTRFS_FS_EXCL_OP 14 +#define BTRFS_FS_EXCL_OP 16 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 12ab19a4b93e..970190cd347e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2801,7 +2801,7 @@ static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, } } - bio = btrfs_bio_alloc(bdev, sector << 9); + bio = btrfs_bio_alloc(bdev, (u64)sector << 9); bio_add_page(bio, page, page_size, offset); bio->bi_end_io = end_io_func; bio->bi_private = tree; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84edfc60d87a..f23c820daaed 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -734,12 +734,13 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = req->r_inode; ihold(inode); } else { - /* req->r_dentry is non-null for LSSNAP request. - * fall-thru */ - WARN_ON_ONCE(!req->r_dentry); + /* req->r_dentry is non-null for LSSNAP request */ + rcu_read_lock(); + inode = get_nonsnap_parent(req->r_dentry); + rcu_read_unlock(); + dout("__choose_mds using snapdir's parent %p\n", inode); } - } - if (!inode && req->r_dentry) { + } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 1ffc8b426c1c..7fc0b850c352 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -374,12 +374,10 @@ static int build_snap_context(struct ceph_snap_realm *realm, realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - if (realm->cached_context) { - ceph_put_snap_context(realm->cached_context); - /* queue realm for cap_snap creation */ - list_add_tail(&realm->dirty_item, dirty_realms); - } + ceph_put_snap_context(realm->cached_context); realm->cached_context = snapc; + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); return 0; fail: diff --git a/fs/namespace.c b/fs/namespace.c index 54059b142d6b..3b601f115b6c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -468,7 +468,9 @@ static inline int may_write_real(struct file *file) /* File refers to upper, writable layer? */ upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); - if (upperdentry && file_inode(file) == d_inode(upperdentry)) + if (upperdentry && + (file_inode(file) == d_inode(upperdentry) || + file_inode(file) == d_inode(dentry))) return 0; /* Lower layer: can't write to real file, sorry... */ diff --git a/fs/nfs/client.c b/fs/nfs/client.c index efebe6cf4378..22880ef6d8dd 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,7 +218,6 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); - rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -888,6 +887,7 @@ struct nfs_server *nfs_alloc_server(void) ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); return server; } diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 44c638b7876c..508126eb49f9 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -745,7 +745,8 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg) struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); dprintk("--> %s\n", __func__); - nfs4_fl_put_deviceid(fl->dsaddr); + if (fl->dsaddr != NULL) + nfs4_fl_put_deviceid(fl->dsaddr); /* This assumes a single RW lseg */ if (lseg->pls_range.iomode == IOMODE_RW) { struct nfs4_filelayout *flo; diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index dd5d27da8c0c..30426c1a1bbd 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -274,7 +274,7 @@ static struct key *nfs_idmap_request_key(const char *name, size_t namelen, ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); - if (ret <= 0) + if (ret < 0) return ERR_PTR(ret); rkey = request_key(&key_type_id_resolver, desc, ""); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6c61e2b99635..f90090e8c959 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -8399,8 +8399,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || - nfs4_stateid_match_other(&lgp->args.stateid, - &lgp->args.ctx->state->stateid)) { + !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { spin_unlock(&inode->i_lock); exception->state = lgp->args.ctx->state; exception->stateid = &lgp->args.stateid; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 37c8af003275..14ed9791ec9c 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1842,8 +1842,8 @@ static void encode_create_session(struct xdr_stream *xdr, * Assumes OPEN is the biggest non-idempotent compound. * 2 is the verifier. */ - max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + - RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT; + max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE + 2) + * XDR_UNIT + RPC_MAX_AUTH_SIZE; encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr); p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index aad97b30d5e6..c441f9387a1b 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -561,10 +561,8 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c) c->tmpfile = true; err = ovl_copy_up_locked(c); } else { - err = -EIO; - if (lock_rename(c->workdir, c->destdir) != NULL) { - pr_err("overlayfs: failed to lock workdir+upperdir\n"); - } else { + err = ovl_lock_rename_workdir(c->workdir, c->destdir); + if (!err) { err = ovl_copy_up_locked(c); unlock_rename(c->workdir, c->destdir); } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 3309b1912241..cc961a3bd3bd 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -216,26 +216,6 @@ out_unlock: return err; } -static int ovl_lock_rename_workdir(struct dentry *workdir, - struct dentry *upperdir) -{ - /* Workdir should not be the same as upperdir */ - if (workdir == upperdir) - goto err; - - /* Workdir should not be subdir of upperdir and vice versa */ - if (lock_rename(workdir, upperdir) != NULL) - goto err_unlock; - - return 0; - -err_unlock: - unlock_rename(workdir, upperdir); -err: - pr_err("overlayfs: failed to lock workdir+upperdir\n"); - return -EIO; -} - static struct dentry *ovl_clear_empty(struct dentry *dentry, struct list_head *list) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index c3addd1114f1..654bea1a5ac9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -506,6 +506,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len); if (IS_ERR(index)) { + err = PTR_ERR(index); pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d4e8c1a08fb0..c706a6f99928 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -235,6 +235,7 @@ bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); int ovl_nlink_start(struct dentry *dentry, bool *locked); void ovl_nlink_end(struct dentry *dentry, bool locked); +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir); static inline bool ovl_is_impuredir(struct dentry *dentry) { diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 878a750986dd..25d9b5adcd42 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -37,6 +37,9 @@ struct ovl_fs { bool noxattr; /* sb common to all layers */ struct super_block *same_sb; + /* Did we take the inuse lock? */ + bool upperdir_locked; + bool workdir_locked; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 62e9b22a2077..0f85ee9c3268 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -988,6 +988,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, struct path *lowerstack, unsigned int numlower) { int err; + struct dentry *index = NULL; struct inode *dir = dentry->d_inode; struct path path = { .mnt = mnt, .dentry = dentry }; LIST_HEAD(list); @@ -1007,8 +1008,6 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, inode_lock_nested(dir, I_MUTEX_PARENT); list_for_each_entry(p, &list, l_node) { - struct dentry *index; - if (p->name[0] == '.') { if (p->len == 1) continue; @@ -1018,6 +1017,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, index = lookup_one_len(p->name, dentry, p->len); if (IS_ERR(index)) { err = PTR_ERR(index); + index = NULL; break; } err = ovl_verify_index(index, lowerstack, numlower); @@ -1029,7 +1029,9 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, break; } dput(index); + index = NULL; } + dput(index); inode_unlock(dir); out: ovl_cache_free(&list); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index fd5ea4facc62..092d150643c1 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -211,9 +211,10 @@ static void ovl_put_super(struct super_block *sb) dput(ufs->indexdir); dput(ufs->workdir); - ovl_inuse_unlock(ufs->workbasedir); + if (ufs->workdir_locked) + ovl_inuse_unlock(ufs->workbasedir); dput(ufs->workbasedir); - if (ufs->upper_mnt) + if (ufs->upper_mnt && ufs->upperdir_locked) ovl_inuse_unlock(ufs->upper_mnt->mnt_root); mntput(ufs->upper_mnt); for (i = 0; i < ufs->numlower; i++) @@ -881,9 +882,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_upperpath; err = -EBUSY; - if (!ovl_inuse_trylock(upperpath.dentry)) { - pr_err("overlayfs: upperdir is in-use by another mount\n"); + if (ovl_inuse_trylock(upperpath.dentry)) { + ufs->upperdir_locked = true; + } else if (ufs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); goto out_put_upperpath; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } err = ovl_mount_dir(ufs->config.workdir, &workpath); @@ -901,9 +906,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } err = -EBUSY; - if (!ovl_inuse_trylock(workpath.dentry)) { - pr_err("overlayfs: workdir is in-use by another mount\n"); + if (ovl_inuse_trylock(workpath.dentry)) { + ufs->workdir_locked = true; + } else if (ufs->config.index) { + pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); goto out_put_workpath; + } else { + pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); } ufs->workbasedir = workpath.dentry; @@ -1156,11 +1165,13 @@ out_put_lowerpath: out_free_lowertmp: kfree(lowertmp); out_unlock_workdentry: - ovl_inuse_unlock(workpath.dentry); + if (ufs->workdir_locked) + ovl_inuse_unlock(workpath.dentry); out_put_workpath: path_put(&workpath); out_unlock_upperdentry: - ovl_inuse_unlock(upperpath.dentry); + if (ufs->upperdir_locked) + ovl_inuse_unlock(upperpath.dentry); out_put_upperpath: path_put(&upperpath); out_free_config: diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 117794582f9f..b9b239fa5cfd 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -430,7 +430,7 @@ void ovl_inuse_unlock(struct dentry *dentry) } } -/* Called must hold OVL_I(inode)->oi_lock */ +/* Caller must hold OVL_I(inode)->lock */ static void ovl_cleanup_index(struct dentry *dentry) { struct inode *dir = ovl_indexdir(dentry->d_sb)->d_inode; @@ -469,6 +469,9 @@ static void ovl_cleanup_index(struct dentry *dentry) err = PTR_ERR(index); if (!IS_ERR(index)) err = ovl_cleanup(dir, index); + else + index = NULL; + inode_unlock(dir); if (err) goto fail; @@ -557,3 +560,22 @@ void ovl_nlink_end(struct dentry *dentry, bool locked) mutex_unlock(&OVL_I(d_inode(dentry))->lock); } } + +int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index bc6c6e10a969..e9db7fc95b70 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -2122,11 +2122,31 @@ xfs_swap_extents( ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK; + } + + /* Swap the cow forks. */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + xfs_extnum_t extnum; + + ASSERT(ip->i_cformat == XFS_DINODE_FMT_EXTENTS); + ASSERT(tip->i_cformat == XFS_DINODE_FMT_EXTENTS); + + extnum = ip->i_cnextents; + ip->i_cnextents = tip->i_cnextents; + tip->i_cnextents = extnum; + cowfp = ip->i_cowfp; ip->i_cowfp = tip->i_cowfp; tip->i_cowfp = cowfp; - xfs_inode_set_cowblocks_tag(ip); - xfs_inode_set_cowblocks_tag(tip); + + if (ip->i_cowfp && ip->i_cnextents) + xfs_inode_set_cowblocks_tag(ip); + else + xfs_inode_clear_cowblocks_tag(ip); + if (tip->i_cowfp && tip->i_cnextents) + xfs_inode_set_cowblocks_tag(tip); + else + xfs_inode_clear_cowblocks_tag(tip); } xfs_trans_log_inode(tp, ip, src_log_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 3246815c24d6..37e603bf1591 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -736,7 +736,13 @@ xfs_reflink_end_cow( /* If there is a hole at end_fsb - 1 go to the previous extent */ if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || got.br_startoff > end_fsb) { - ASSERT(idx > 0); + /* + * In case of racing, overlapping AIO writes no COW extents + * might be left by the time I/O completes for the loser of + * the race. In that case we are done. + */ + if (idx <= 0) + goto out_cancel; xfs_iext_get_extent(ifp, --idx, &got); } @@ -809,6 +815,7 @@ next_extent: out_defer: xfs_defer_cancel(&dfops); +out_cancel: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a67daea731ab..4373125de1f3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -56,7 +56,7 @@ struct bpf_map { struct work_struct work; atomic_t usercnt; struct bpf_map *inner_map_meta; - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; }; /* function argument constraints */ @@ -189,7 +189,7 @@ struct bpf_prog_aux { struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ - u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; union { struct work_struct work; struct rcu_head rcu; @@ -407,6 +407,11 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } +static inline int bpf_obj_get_user(const char __user *pathname) +{ + return -EOPNOTSUPP; +} + static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b8d200f60a40..f00ef751c1c5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -115,6 +115,21 @@ struct bpf_insn_aux_data { #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define BPF_VERIFIER_TMP_LOG_SIZE 1024 + +struct bpf_verifer_log { + u32 level; + char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; + char __user *ubuf; + u32 len_used; + u32 len_total; +}; + +static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) +{ + return log->len_used >= log->len_total - 1; +} + struct bpf_verifier_env; struct bpf_ext_analyzer_ops { int (*insn_hook)(struct bpf_verifier_env *env, @@ -139,6 +154,8 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ + + struct bpf_verifer_log log; }; int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index abcda9b458ab..9ac9e3e3d1e5 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -63,6 +63,7 @@ #define PHY_BRCM_EXT_IBND_TX_ENABLE 0x00002000 #define PHY_BRCM_CLEAR_RGMII_MODE 0x00004000 #define PHY_BRCM_DIS_TXCRXC_NOENRGY 0x00008000 +#define PHY_BRCM_EN_MASTER_MODE 0x00010000 /* Broadcom BCM7xxx specific workarounds */ #define PHY_BRCM_7XXX_REV(x) (((x) >> 8) & 0xff) diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index 3cd18ac0697f..02639ebea2f0 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -49,6 +49,7 @@ struct br_ip_list { #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) #define BR_BCAST_FLOOD BIT(14) +#define BR_NEIGH_SUPPRESS BIT(15) #define BR_DEFAULT_AGEING_TIME (300 * HZ) @@ -63,6 +64,7 @@ int br_multicast_list_adjacent(struct net_device *dev, bool br_multicast_has_querier_anywhere(struct net_device *dev, int proto); bool br_multicast_has_querier_adjacent(struct net_device *dev, int proto); bool br_multicast_enabled(const struct net_device *dev); +bool br_multicast_router(const struct net_device *dev); #else static inline int br_multicast_list_adjacent(struct net_device *dev, struct list_head *br_ip_list) @@ -83,6 +85,10 @@ static inline bool br_multicast_enabled(const struct net_device *dev) { return false; } +static inline bool br_multicast_router(const struct net_device *dev) +{ + return false; +} #endif #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_BRIDGE_VLAN_FILTERING) diff --git a/include/linux/if_phonet.h b/include/linux/if_phonet.h index bbcdb0a767d8..a118ee4a8428 100644 --- a/include/linux/if_phonet.h +++ b/include/linux/if_phonet.h @@ -10,5 +10,5 @@ #include <uapi/linux/if_phonet.h> -extern struct header_ops phonet_header_ops; +extern const struct header_ops phonet_header_ops; #endif diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index f3f2d07feb2a..9a43763a68ad 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -316,7 +316,7 @@ struct mmc_host { #define MMC_CAP_UHS_SDR50 (1 << 18) /* Host supports UHS SDR50 mode */ #define MMC_CAP_UHS_SDR104 (1 << 19) /* Host supports UHS SDR104 mode */ #define MMC_CAP_UHS_DDR50 (1 << 20) /* Host supports UHS DDR50 mode */ -#define MMC_CAP_NO_BOUNCE_BUFF (1 << 21) /* Disable bounce buffers on host */ +/* (1 << 21) is free for reuse */ #define MMC_CAP_DRIVER_TYPE_A (1 << 23) /* Host supports Driver Type A */ #define MMC_CAP_DRIVER_TYPE_C (1 << 24) /* Host supports Driver Type C */ #define MMC_CAP_DRIVER_TYPE_D (1 << 25) /* Host supports Driver Type D */ diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 2c2a5514b0df..528b24c78308 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -108,9 +108,10 @@ struct ebt_table { #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \ ~(__alignof__(struct _xt_align)-1)) -extern struct ebt_table *ebt_register_table(struct net *net, - const struct ebt_table *table, - const struct nf_hook_ops *); +extern int ebt_register_table(struct net *net, + const struct ebt_table *table, + const struct nf_hook_ops *ops, + struct ebt_table **res); extern void ebt_unregister_table(struct net *net, struct ebt_table *table, const struct nf_hook_ops *); extern unsigned int ebt_do_table(struct sk_buff *skb, diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a36abe2da13e..27e249ed7c5c 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -12,11 +12,31 @@ #ifdef CONFIG_LOCKUP_DETECTOR void lockup_detector_init(void); +void lockup_detector_soft_poweroff(void); +void lockup_detector_cleanup(void); +bool is_hardlockup(void); + +extern int watchdog_user_enabled; +extern int nmi_watchdog_user_enabled; +extern int soft_watchdog_user_enabled; +extern int watchdog_thresh; +extern unsigned long watchdog_enabled; + +extern struct cpumask watchdog_cpumask; +extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP +extern int sysctl_softlockup_all_cpu_backtrace; +extern int sysctl_hardlockup_all_cpu_backtrace; #else -static inline void lockup_detector_init(void) -{ -} -#endif +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif /* !CONFIG_SMP */ + +#else /* CONFIG_LOCKUP_DETECTOR */ +static inline void lockup_detector_init(void) { } +static inline void lockup_detector_soft_poweroff(void) { } +static inline void lockup_detector_cleanup(void) { } +#endif /* !CONFIG_LOCKUP_DETECTOR */ #ifdef CONFIG_SOFTLOCKUP_DETECTOR extern void touch_softlockup_watchdog_sched(void); @@ -24,29 +44,17 @@ extern void touch_softlockup_watchdog(void); extern void touch_softlockup_watchdog_sync(void); extern void touch_all_softlockup_watchdogs(void); extern unsigned int softlockup_panic; -extern int soft_watchdog_enabled; -extern atomic_t watchdog_park_in_progress; #else -static inline void touch_softlockup_watchdog_sched(void) -{ -} -static inline void touch_softlockup_watchdog(void) -{ -} -static inline void touch_softlockup_watchdog_sync(void) -{ -} -static inline void touch_all_softlockup_watchdogs(void) -{ -} +static inline void touch_softlockup_watchdog_sched(void) { } +static inline void touch_softlockup_watchdog(void) { } +static inline void touch_softlockup_watchdog_sync(void) { } +static inline void touch_all_softlockup_watchdogs(void) { } #endif #ifdef CONFIG_DETECT_HUNG_TASK void reset_hung_task_detector(void); #else -static inline void reset_hung_task_detector(void) -{ -} +static inline void reset_hung_task_detector(void) { } #endif /* @@ -54,12 +62,12 @@ static inline void reset_hung_task_detector(void) * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. + * 'watchdog_user_enabled', 'nmi_watchdog_user_enabled' and + * 'soft_watchdog_user_enabled' are variables that are only used as an + * 'interface' between the parameters in /proc/sys/kernel and the internal + * state bits in 'watchdog_enabled'. The 'watchdog_thresh' variable is + * handled differently because its value is not boolean, and the lockup + * detectors are 'suspended' while 'watchdog_thresh' is equal zero. */ #define NMI_WATCHDOG_ENABLED_BIT 0 #define SOFT_WATCHDOG_ENABLED_BIT 1 @@ -73,17 +81,41 @@ extern unsigned int hardlockup_panic; static inline void hardlockup_detector_disable(void) {} #endif +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +# define NMI_WATCHDOG_SYSCTL_PERM 0644 +#else +# define NMI_WATCHDOG_SYSCTL_PERM 0444 +#endif + #if defined(CONFIG_HARDLOCKUP_DETECTOR_PERF) extern void arch_touch_nmi_watchdog(void); +extern void hardlockup_detector_perf_stop(void); +extern void hardlockup_detector_perf_restart(void); +extern void hardlockup_detector_perf_disable(void); +extern void hardlockup_detector_perf_enable(void); +extern void hardlockup_detector_perf_cleanup(void); +extern int hardlockup_detector_perf_init(void); #else -#if !defined(CONFIG_HAVE_NMI_WATCHDOG) +static inline void hardlockup_detector_perf_stop(void) { } +static inline void hardlockup_detector_perf_restart(void) { } +static inline void hardlockup_detector_perf_disable(void) { } +static inline void hardlockup_detector_perf_enable(void) { } +static inline void hardlockup_detector_perf_cleanup(void) { } +# if !defined(CONFIG_HAVE_NMI_WATCHDOG) +static inline int hardlockup_detector_perf_init(void) { return -ENODEV; } static inline void arch_touch_nmi_watchdog(void) {} +# else +static inline int hardlockup_detector_perf_init(void) { return 0; } +# endif #endif -#endif + +void watchdog_nmi_stop(void); +void watchdog_nmi_start(void); +int watchdog_nmi_probe(void); /** * touch_nmi_watchdog - restart NMI watchdog timeout. - * + * * If the architecture supports the NMI watchdog, touch_nmi_watchdog() * may be used to reset the timeout - for code which intentionally * disables interrupts for a long time. This call is stateless. @@ -153,22 +185,6 @@ static inline bool trigger_single_cpu_backtrace(int cpu) u64 hw_nmi_get_sample_period(int watchdog_thresh); #endif -#ifdef CONFIG_LOCKUP_DETECTOR -extern int nmi_watchdog_enabled; -extern int watchdog_user_enabled; -extern int watchdog_thresh; -extern unsigned long watchdog_enabled; -extern struct cpumask watchdog_cpumask; -extern unsigned long *watchdog_cpumask_bits; -extern int __read_mostly watchdog_suspended; -#ifdef CONFIG_SMP -extern int sysctl_softlockup_all_cpu_backtrace; -extern int sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 -#endif - #if defined(CONFIG_HARDLOCKUP_CHECK_TIMESTAMP) && \ defined(CONFIG_HARDLOCKUP_DETECTOR) void watchdog_update_hrtimer_threshold(u64 period); @@ -176,7 +192,6 @@ void watchdog_update_hrtimer_threshold(u64 period); static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif -extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); @@ -188,18 +203,6 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int lockup_detector_suspend(void); -extern void lockup_detector_resume(void); -#else -static inline int lockup_detector_suspend(void) -{ - return 0; -} - -static inline void lockup_detector_resume(void) -{ -} -#endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include <asm/nmi.h> diff --git a/include/linux/once.h b/include/linux/once.h index 9c98aaa87cbc..724724918e8b 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -5,7 +5,7 @@ #include <linux/jump_label.h> bool __do_once_start(bool *done, unsigned long *flags); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags); /* Call a function exactly once. The idea of DO_ONCE() is to perform @@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key, ({ \ bool ___ret = false; \ static bool ___done = false; \ - static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \ - if (static_key_true(&___once_key)) { \ + static DEFINE_STATIC_KEY_TRUE(___once_key); \ + if (static_branch_unlikely(&___once_key)) { \ unsigned long ___flags; \ ___ret = __do_once_start(&___done, &___flags); \ if (unlikely(___ret)) { \ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8e22f24ded6a..79b18a20cf5d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -806,6 +806,7 @@ struct perf_output_handle { struct bpf_perf_event_data_kern { struct pt_regs *regs; struct perf_sample_data *data; + struct perf_event *event; }; #ifdef CONFIG_CGROUP_PERF @@ -884,7 +885,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, void *context); extern void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu); -int perf_event_read_local(struct perf_event *event, u64 *value); +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running); extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); @@ -1286,7 +1288,8 @@ static inline const struct perf_event_attr *perf_event_attrs(struct perf_event * { return ERR_PTR(-EINVAL); } -static inline int perf_event_read_local(struct perf_event *event, u64 *value) +static inline int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { return -EINVAL; } diff --git a/include/linux/qed/qed_ll2_if.h b/include/linux/qed/qed_ll2_if.h index 89fa0bbd54f3..e755954d85fd 100644 --- a/include/linux/qed/qed_ll2_if.h +++ b/include/linux/qed/qed_ll2_if.h @@ -64,6 +64,7 @@ enum qed_ll2_roce_flavor_type { enum qed_ll2_tx_dest { QED_LL2_TX_DEST_NW, /* Light L2 TX Destination to the Network */ QED_LL2_TX_DEST_LB, /* Light L2 TX Destination to the Loopback */ + QED_LL2_TX_DEST_DROP, /* Light L2 Drop the TX packet */ QED_LL2_TX_DEST_MAX }; @@ -150,11 +151,16 @@ void (*qed_ll2_release_tx_packet_cb)(void *cxt, dma_addr_t first_frag_addr, bool b_last_fragment, bool b_last_packet); +typedef +void (*qed_ll2_slowpath_cb)(void *cxt, u8 connection_handle, + u32 opaque_data_0, u32 opaque_data_1); + struct qed_ll2_cbs { qed_ll2_complete_rx_packet_cb rx_comp_cb; qed_ll2_release_rx_packet_cb rx_release_cb; qed_ll2_complete_tx_packet_cb tx_comp_cb; qed_ll2_release_tx_packet_cb tx_release_cb; + qed_ll2_slowpath_cb slowpath_cb; void *cookie; }; @@ -171,6 +177,7 @@ struct qed_ll2_acquire_data_inputs { enum qed_ll2_tx_dest tx_dest; enum qed_ll2_error_handle ai_err_packet_too_big; enum qed_ll2_error_handle ai_err_no_buf; + bool secondary_queue; u8 gsi_enable; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 01a985937867..03634ec2f918 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3158,6 +3158,12 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) return __skb_grow(skb, len); } +#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode) +#define skb_rb_first(root) rb_to_skb(rb_first(root)) +#define skb_rb_last(root) rb_to_skb(rb_last(root)) +#define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) +#define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next; \ skb != (struct sk_buff *)(queue); \ @@ -3172,6 +3178,18 @@ static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) for (; skb != (struct sk_buff *)(queue); \ skb = skb->next) +#define skb_rbtree_walk(skb, root) \ + for (skb = skb_rb_first(root); skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from(skb) \ + for (; skb != NULL; \ + skb = skb_rb_next(skb)) + +#define skb_rbtree_walk_from_safe(skb, tmp) \ + for (; tmp = skb ? skb_rb_next(skb) : NULL, (skb != NULL); \ + skb = tmp) + #define skb_queue_walk_from_safe(queue, skb, tmp) \ for (tmp = skb->next; \ skb != (struct sk_buff *)(queue); \ diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index 12910cf19869..c149aa7bedf3 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -55,7 +55,7 @@ smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) } void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *); +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *); #endif diff --git a/include/net/act_api.h b/include/net/act_api.h index b944e0eb93be..f5e8c9048fb0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -93,8 +93,7 @@ struct tc_action_ops { int (*walk)(struct net *, struct sk_buff *, struct netlink_callback *, int, const struct tc_action_ops *); void (*stats_update)(struct tc_action *, u64, u32, u64); - int (*get_dev)(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev); + struct net_device *(*get_dev)(const struct tc_action *a); }; struct tc_action_net { @@ -175,4 +174,38 @@ static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes, #endif } +typedef int tc_setup_cb_t(enum tc_setup_type type, + void *type_data, void *cb_priv); + +#ifdef CONFIG_NET_CLS_ACT +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv); +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop); +#else +static inline +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + return 0; +} + +static inline +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ +} + +static inline +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + return 0; +} +#endif + #endif diff --git a/include/net/dsa.h b/include/net/dsa.h index 10dceccd9ce8..2746741f74cf 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -291,7 +291,6 @@ struct dsa_switch_ops { enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); int (*setup)(struct dsa_switch *ds); - int (*set_addr)(struct dsa_switch *ds, u8 *addr); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* @@ -471,4 +470,54 @@ static inline int dsa_switch_resume(struct dsa_switch *ds) } #endif /* CONFIG_PM_SLEEP */ +enum dsa_notifier_type { + DSA_PORT_REGISTER, + DSA_PORT_UNREGISTER, +}; + +struct dsa_notifier_info { + struct net_device *dev; +}; + +struct dsa_notifier_register_info { + struct dsa_notifier_info info; /* must be first */ + struct net_device *master; + unsigned int port_number; + unsigned int switch_number; +}; + +static inline struct net_device * +dsa_notifier_info_to_dev(const struct dsa_notifier_info *info) +{ + return info->dev; +} + +#if IS_ENABLED(CONFIG_NET_DSA) +int register_dsa_notifier(struct notifier_block *nb); +int unregister_dsa_notifier(struct notifier_block *nb); +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info); +#else +static inline int register_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int unregister_dsa_notifier(struct notifier_block *nb) +{ + return 0; +} + +static inline int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + return NOTIFY_DONE; +} +#endif + +/* Broadcom tag specific helpers to insert and extract queue/port number */ +#define BRCM_TAG_SET_PORT_QUEUE(p, q) ((p) << 8 | q) +#define BRCM_TAG_GET_PORT(v) ((v) >> 8) +#define BRCM_TAG_GET_QUEUE(v) ((v) & 0xff) + #endif diff --git a/include/net/dst.h b/include/net/dst.h index 06a6765da074..204c19e25456 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -101,7 +101,7 @@ struct dst_entry { union { struct dst_entry *next; struct rtable __rcu *rt_next; - struct rt6_info *rt6_next; + struct rt6_info __rcu *rt6_next; struct dn_route __rcu *dn_next; }; }; diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 9fba2ebf6dda..87a0bb8d449f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -87,6 +87,7 @@ static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a, void metadata_dst_free(struct metadata_dst *); struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags); +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags); diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index d060d711a624..10c913816032 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -29,6 +29,14 @@ #define FIB6_TABLE_HASHSZ 1 #endif +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) pr_debug(x) +#else +#define RT6_TRACE(x...) do { ; } while (0) +#endif + struct rt6_info; struct fib6_config { @@ -60,25 +68,30 @@ struct fib6_config { }; struct fib6_node { - struct fib6_node *parent; - struct fib6_node *left; - struct fib6_node *right; + struct fib6_node __rcu *parent; + struct fib6_node __rcu *left; + struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES - struct fib6_node *subtree; + struct fib6_node __rcu *subtree; #endif - struct rt6_info *leaf; + struct rt6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; - struct rt6_info *rr_ptr; + struct rt6_info __rcu *rr_ptr; struct rcu_head rcu; }; +struct fib6_gc_args { + int timeout; + int more; +}; + #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL #else -#define FIB6_SUBTREE(fn) ((fn)->subtree) +#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif struct mx6_config { @@ -98,6 +111,22 @@ struct rt6key { struct fib6_table; +struct rt6_exception_bucket { + struct hlist_head chain; + int depth; +}; + +struct rt6_exception { + struct hlist_node hlist; + struct rt6_info *rt6i; + unsigned long stamp; + struct rcu_head rcu; +}; + +#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 +#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) +#define FIB6_MAX_DEPTH 5 + struct rt6_info { struct dst_entry dst; @@ -134,14 +163,25 @@ struct rt6_info { struct inet6_dev *rt6i_idev; struct rt6_info * __percpu *rt6i_pcpu; + struct rt6_exception_bucket __rcu *rt6i_exception_bucket; u32 rt6i_metric; u32 rt6i_pmtu; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + u8 exception_bucket_flushed:1, + unused:7; }; +#define for_each_fib6_node_rt_rcu(fn) \ + for (rt = rcu_dereference((fn)->leaf); rt; \ + rt = rcu_dereference(rt->dst.rt6_next)) + +#define for_each_fib6_walker_rt(w) \ + for (rt = (w)->leaf; rt; \ + rt = rcu_dereference_protected(rt->dst.rt6_next, 1)) + static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; @@ -188,6 +228,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, if (fn) { *cookie = fn->fn_sernum; + /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ + smp_rmb(); status = true; } @@ -248,7 +290,6 @@ struct fib6_walker { struct fib6_node *root, *node; struct rt6_info *leaf; enum fib6_walk_state state; - bool prune; unsigned int skip; unsigned int count; int (*func)(struct fib6_walker *); @@ -256,12 +297,15 @@ struct fib6_walker { }; struct rt6_statistics { - __u32 fib_nodes; - __u32 fib_route_nodes; - __u32 fib_rt_alloc; /* permanent routes */ - __u32 fib_rt_entries; /* rt entries in table */ - __u32 fib_rt_cache; /* cache routes */ - __u32 fib_discarded_routes; + __u32 fib_nodes; /* all fib6 nodes */ + __u32 fib_route_nodes; /* intermediate nodes */ + __u32 fib_rt_entries; /* rt entries in fib table */ + __u32 fib_rt_cache; /* cached rt entries in exception table */ + __u32 fib_discarded_routes; /* total number of routes delete */ + + /* The following stats are not protected by any lock */ + atomic_t fib_rt_alloc; /* total number of routes alloced */ + atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 @@ -277,7 +321,7 @@ struct rt6_statistics { struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; - rwlock_t tb6_lock; + spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; @@ -325,7 +369,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len); + const struct in6_addr *saddr, int src_len, + bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg), void *arg); @@ -358,6 +403,8 @@ void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb); +void fib6_update_sernum(struct rt6_info *rt); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES int fib6_rules_init(void); void fib6_rules_cleanup(void); diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index ee96f402cb75..a0087fb9864b 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -95,6 +95,11 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack); int ip6_ins_rt(struct rt6_info *); int ip6_del_rt(struct rt6_info *); +void rt6_flush_exceptions(struct rt6_info *rt); +int rt6_remove_exception_rt(struct rt6_info *rt); +void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args, + unsigned long now); + static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt, const struct in6_addr *daddr, unsigned int prefs, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 6eac5cf8f1e6..3cda3b521c36 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -300,8 +300,8 @@ static inline void fl6_sock_release(struct ip6_flowlabel *fl) void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len); +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h index 039cc29cb4a8..51e1a2a45d02 100644 --- a/include/net/phonet/phonet.h +++ b/include/net/phonet/phonet.h @@ -108,8 +108,10 @@ struct phonet_protocol { int sock_type; }; -int phonet_proto_register(unsigned int protocol, struct phonet_protocol *pp); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp); +int phonet_proto_register(unsigned int protocol, + const struct phonet_protocol *pp); +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp); int phonet_sysctl_init(void); void phonet_sysctl_exit(void); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index e80edd8879ef..f5263743076b 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -204,8 +204,6 @@ void tcf_exts_destroy(struct tcf_exts *exts); void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src); int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev); /** * struct tcf_pkt_info - packet information @@ -405,6 +403,9 @@ tcf_match_indev(struct sk_buff *skb, int ifindex) } #endif /* CONFIG_NET_CLS_IND */ +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop); + struct tc_cls_common_offload { u32 chain_index; __be16 protocol; @@ -514,7 +515,6 @@ struct tc_cls_flower_offload { struct fl_flow_key *mask; struct fl_flow_key *key; struct tcf_exts *exts; - bool egress_dev; }; enum tc_matchall_command { diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..4827094f1db4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -60,7 +60,7 @@ #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> - +#include <linux/rbtree.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> @@ -397,7 +397,10 @@ struct sock { int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; - struct sk_buff *sk_send_head; + union { + struct sk_buff *sk_send_head; + struct rb_root tcp_rtx_queue; + }; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d767b7991887..d756fbe46625 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -51,6 +51,7 @@ enum switchdev_attr_id { SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME, SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING, SWITCHDEV_ATTR_ID_BRIDGE_MC_DISABLED, + SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, }; struct switchdev_attr { diff --git a/include/net/tc_act/tc_ife.h b/include/net/tc_act/tc_ife.h index 30ba459ddd34..c7fb99c3f76c 100644 --- a/include/net/tc_act/tc_ife.h +++ b/include/net/tc_act/tc_ife.h @@ -6,12 +6,18 @@ #include <linux/rtnetlink.h> #include <linux/module.h> -struct tcf_ife_info { - struct tc_action common; +struct tcf_ife_params { u8 eth_dst[ETH_ALEN]; u8 eth_src[ETH_ALEN]; u16 eth_type; u16 flags; + + struct rcu_head rcu; +}; + +struct tcf_ife_info { + struct tc_action common; + struct tcf_ife_params __rcu *params; /* list of metaids allowed */ struct list_head metalist; }; @@ -40,7 +46,7 @@ struct tcf_meta_ops { struct module *owner; }; -#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ifemeta" __stringify_1(metan)) +#define MODULE_ALIAS_IFE_META(metan) MODULE_ALIAS("ife-meta-" metan) int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi); int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi); diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 604bc31e23ab..21a656569840 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -10,6 +10,7 @@ struct tcf_mirred { int tcfm_ifindex; bool tcfm_mac_header_xmit; struct net_device __rcu *tcfm_dev; + struct net *net; struct list_head tcfm_list; }; #define to_mirred(a) ((struct tcf_mirred *)a) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3b16f353b539..3b3b9b968e2d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); -int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t); +enum tcp_queue { + TCP_FRAG_IN_WRITE_QUEUE, + TCP_FRAG_IN_RTX_QUEUE, +}; +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, + unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); @@ -1606,19 +1612,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) skb->_skb_refdst = _save; \ } -/* write queue abstraction */ -static inline void tcp_write_queue_purge(struct sock *sk) -{ - struct sk_buff *skb; +void tcp_write_queue_purge(struct sock *sk); - tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { - tcp_skb_tsorted_anchor_cleanup(skb); - sk_wmem_free_skb(sk, skb); - } - INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); - sk_mem_reclaim(sk); - tcp_clear_all_retrans_hints(tcp_sk(sk)); +static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) +{ + return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) @@ -1631,30 +1629,12 @@ static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) return skb_peek_tail(&sk->sk_write_queue); } -static inline struct sk_buff *tcp_write_queue_next(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_next(&sk->sk_write_queue, skb); -} - -static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk, - const struct sk_buff *skb) -{ - return skb_queue_prev(&sk->sk_write_queue, skb); -} - -#define tcp_for_write_queue(skb, sk) \ - skb_queue_walk(&(sk)->sk_write_queue, skb) - -#define tcp_for_write_queue_from(skb, sk) \ - skb_queue_walk_from(&(sk)->sk_write_queue, skb) - #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { - return sk->sk_send_head; + return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, @@ -1663,29 +1643,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk, return skb_queue_is_last(&sk->sk_write_queue, skb); } -static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb) +static inline bool tcp_write_queue_empty(const struct sock *sk) { - if (tcp_skb_is_last(sk, skb)) - sk->sk_send_head = NULL; - else - sk->sk_send_head = tcp_write_queue_next(sk, skb); + return skb_queue_empty(&sk->sk_write_queue); +} + +static inline bool tcp_rtx_queue_empty(const struct sock *sk) +{ + return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); +} + +static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) +{ + return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked) { - if (sk->sk_send_head == skb_unlinked) { - sk->sk_send_head = NULL; + if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); - } + if (tcp_sk(sk)->highest_sack == skb_unlinked) tcp_sk(sk)->highest_sack = NULL; } -static inline void tcp_init_send_head(struct sock *sk) -{ - sk->sk_send_head = NULL; -} - static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); @@ -1696,8 +1677,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb __tcp_add_write_queue_tail(sk, skb); /* Queue it, remembering where we must start sending. */ - if (sk->sk_send_head == NULL) { - sk->sk_send_head = skb; + if (sk->sk_write_queue.next == skb) { tcp_chrono_start(sk, TCP_CHRONO_BUSY); if (tcp_sk(sk)->highest_sack == NULL) @@ -1705,40 +1685,33 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb } } -static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb) -{ - __skb_queue_head(&sk->sk_write_queue, skb); -} - -/* Insert buff after skb on the write queue of sk. */ -static inline void tcp_insert_write_queue_after(struct sk_buff *skb, - struct sk_buff *buff, - struct sock *sk) -{ - __skb_queue_after(&sk->sk_write_queue, skb, buff); -} - /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); - - if (sk->sk_send_head == skb) - sk->sk_send_head = new; } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { - list_del(&skb->tcp_tsorted_anchor); tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } -static inline bool tcp_write_queue_empty(struct sock *sk) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); + +static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { - return skb_queue_empty(&sk->sk_write_queue); + tcp_skb_tsorted_anchor_cleanup(skb); + rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); +} + +static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) +{ + list_del(&skb->tcp_tsorted_anchor); + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) @@ -1767,8 +1740,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { - tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL : - tcp_write_queue_next(sk, skb); + struct sk_buff *next = skb_rb_next(skb); + + tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) @@ -1778,7 +1752,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk) static inline void tcp_highest_sack_reset(struct sock *sk) { - tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk); + struct sk_buff *skb = tcp_rtx_queue_head(sk); + + tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk); } /* Called when old skb is about to be deleted (to be combined with new skb) */ @@ -1948,7 +1924,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { - const struct sk_buff *skb = tcp_write_queue_head(sk); + const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 82e93ee94708..67c5a9f223f7 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -192,6 +192,7 @@ struct scsi_device { unsigned no_dif:1; /* T10 PI (DIF) should be disabled */ unsigned broken_fua:1; /* Don't set FUA bit */ unsigned lun_in_cdb:1; /* Store LUN bits in CDB[1] */ + unsigned unmap_limit_for_ws:1; /* Use the UNMAP limit for WRITE SAME */ atomic_t disk_events_disable_depth; /* disable depth for disk events */ diff --git a/include/scsi/scsi_devinfo.h b/include/scsi/scsi_devinfo.h index 9592570e092a..36b03013d629 100644 --- a/include/scsi/scsi_devinfo.h +++ b/include/scsi/scsi_devinfo.h @@ -29,5 +29,6 @@ #define BLIST_TRY_VPD_PAGES 0x10000000 /* Attempt to read VPD pages */ #define BLIST_NO_RSOC 0x20000000 /* don't try to issue RSOC */ #define BLIST_MAX_1024 0x40000000 /* maximum 1024 sector cdb length */ +#define BLIST_UNMAP_LIMIT_WS 0x80000000 /* Use UNMAP limit for WRITE SAME */ #endif diff --git a/include/scsi/scsi_transport_iscsi.h b/include/scsi/scsi_transport_iscsi.h index 6183d20a01fb..b266d2a3bcb1 100644 --- a/include/scsi/scsi_transport_iscsi.h +++ b/include/scsi/scsi_transport_iscsi.h @@ -434,7 +434,6 @@ extern struct iscsi_cls_session *iscsi_create_session(struct Scsi_Host *shost, unsigned int target_id); extern void iscsi_remove_session(struct iscsi_cls_session *session); extern void iscsi_free_session(struct iscsi_cls_session *session); -extern int iscsi_destroy_session(struct iscsi_cls_session *session); extern struct iscsi_cls_conn *iscsi_create_conn(struct iscsi_cls_session *sess, int dd_size, uint32_t cid); extern int iscsi_destroy_conn(struct iscsi_cls_conn *conn); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6082faf5fd2a..6db9e1d679cd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -641,6 +641,21 @@ union bpf_attr { * @xdp_md: pointer to xdp_md * @delta: An positive/negative integer to be added to xdp_md.data_meta * Return: 0 on success or negative on error + * + * int bpf_perf_event_read_value(map, flags, buf, buf_size) + * read perf event counter value and perf event enabled/running time + * @map: pointer to perf_event_array map + * @flags: index of event in the map or bitmask flags + * @buf: buf to fill + * @buf_size: size of the buf + * Return: 0 on success or negative error code + * + * int bpf_perf_prog_read_value(ctx, buf, buf_size) + * read perf prog attached perf event counter and enabled/running time + * @ctx: pointer to ctx + * @buf: buf to fill + * @buf_size: size of the buf + * Return : 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -697,7 +712,9 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ - FN(xdp_adjust_meta), + FN(xdp_adjust_meta), \ + FN(perf_event_read_value), \ + FN(perf_prog_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -741,7 +758,9 @@ enum bpf_func_id { #define BPF_F_ZERO_CSUM_TX (1ULL << 1) #define BPF_F_DONT_FRAGMENT (1ULL << 2) -/* BPF_FUNC_perf_event_output and BPF_FUNC_perf_event_read flags. */ +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ #define BPF_F_INDEX_MASK 0xffffffffULL #define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK /* BPF_FUNC_perf_event_output for sk_buff input context. */ @@ -869,7 +888,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -879,7 +898,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops @@ -934,4 +953,10 @@ enum { #define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ #define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index cd580fc0e58f..b037e0ab1975 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -327,6 +327,7 @@ enum { IFLA_BRPORT_VLAN_TUNNEL, IFLA_BRPORT_BCAST_FLOOD, IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 2e520883c054..a2f48c01365e 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -84,6 +84,7 @@ enum tunnel_encap_types { TUNNEL_ENCAP_NONE, TUNNEL_ENCAP_FOU, TUNNEL_ENCAP_GUE, + TUNNEL_ENCAP_MPLS, }; #define TUNNEL_ENCAP_FLAG_CSUM (1<<0) diff --git a/include/uapi/linux/netfilter/xt_bpf.h b/include/uapi/linux/netfilter/xt_bpf.h index b97725af2ac0..da161b56c79e 100644 --- a/include/uapi/linux/netfilter/xt_bpf.h +++ b/include/uapi/linux/netfilter/xt_bpf.h @@ -23,6 +23,7 @@ enum xt_bpf_modes { XT_BPF_MODE_FD_PINNED, XT_BPF_MODE_FD_ELF, }; +#define XT_BPF_MODE_PATH_PINNED XT_BPF_MODE_FD_PINNED struct xt_bpf_info_v1 { __u16 mode; diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 156ee4cab82e..0cd6f8833147 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -359,6 +359,7 @@ enum ovs_tunnel_key_attr { OVS_TUNNEL_KEY_ATTR_IPV6_SRC, /* struct in6_addr src IPv6 address. */ OVS_TUNNEL_KEY_ATTR_IPV6_DST, /* struct in6_addr dst IPv6 address. */ OVS_TUNNEL_KEY_ATTR_PAD, + OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, /* be32 ERSPAN index. */ __OVS_TUNNEL_KEY_ATTR_MAX }; @@ -806,6 +807,7 @@ struct ovs_action_push_eth { * packet. * @OVS_ACTION_ATTR_POP_ETH: Pop the outermost Ethernet header off the * packet. + * @OVS_ACTION_ATTR_CT_CLEAR: Clear conntrack state from the packet. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -835,6 +837,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_TRUNC, /* u32 struct ovs_action_trunc. */ OVS_ACTION_ATTR_PUSH_ETH, /* struct ovs_action_push_eth. */ OVS_ACTION_ATTR_POP_ETH, /* No argument. */ + OVS_ACTION_ATTR_CT_CLEAR, /* No argument. */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 9d76c566f66e..179af64846e0 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -4,10 +4,45 @@ #include <linux/socket.h> #include <linux/types.h> +#define QRTR_NODE_BCAST 0xffffffffu +#define QRTR_PORT_CTRL 0xfffffffeu + struct sockaddr_qrtr { __kernel_sa_family_t sq_family; __u32 sq_node; __u32 sq_port; }; +enum qrtr_pkt_type { + QRTR_TYPE_DATA = 1, + QRTR_TYPE_HELLO = 2, + QRTR_TYPE_BYE = 3, + QRTR_TYPE_NEW_SERVER = 4, + QRTR_TYPE_DEL_SERVER = 5, + QRTR_TYPE_DEL_CLIENT = 6, + QRTR_TYPE_RESUME_TX = 7, + QRTR_TYPE_EXIT = 8, + QRTR_TYPE_PING = 9, + QRTR_TYPE_NEW_LOOKUP = 10, + QRTR_TYPE_DEL_LOOKUP = 11, +}; + +struct qrtr_ctrl_pkt { + __le32 cmd; + + union { + struct { + __le32 service; + __le32 instance; + __le32 node; + __le32 port; + } server; + + struct { + __le32 node; + __le32 port; + } client; + }; +} __packed; + #endif /* _LINUX_QRTR_H */ diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 3d7a2b352a62..69038c29e8a9 100644 --- a/include/uapi/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h @@ -9,13 +9,13 @@ #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */ #define TCA_INGRESS_REDIR 3 /* packet redirect to INGRESS*/ #define TCA_INGRESS_MIRROR 4 /* mirror packet to INGRESS */ - + struct tc_mirred { tc_gen; int eaction; /* one of IN/EGRESS_MIRROR/REDIR */ __u32 ifindex; /* ifindex of egress port */ }; - + enum { TCA_MIRRED_UNSPEC, TCA_MIRRED_TM, @@ -24,5 +24,5 @@ enum { __TCA_MIRRED_MAX }; #define TCA_MIRRED_MAX (__TCA_MIRRED_MAX - 1) - + #endif diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h index 5351b08c897a..ef41c11a7f38 100644 --- a/include/uapi/linux/tipc.h +++ b/include/uapi/linux/tipc.h @@ -231,6 +231,21 @@ struct sockaddr_tipc { #define TIPC_SOCK_RECVQ_DEPTH 132 /* Default: none (read only) */ #define TIPC_MCAST_BROADCAST 133 /* Default: TIPC selects. No arg */ #define TIPC_MCAST_REPLICAST 134 /* Default: TIPC selects. No arg */ +#define TIPC_GROUP_JOIN 135 /* Takes struct tipc_group_req* */ +#define TIPC_GROUP_LEAVE 136 /* No argument */ + +/* + * Flag values + */ +#define TIPC_GROUP_LOOPBACK 0x1 /* Receive copy of sent msg when match */ +#define TIPC_GROUP_MEMBER_EVTS 0x2 /* Receive membership events in socket */ + +struct tipc_group_req { + __u32 type; /* group id */ + __u32 instance; /* member id */ + __u32 scope; /* zone/cluster/node */ + __u32 flags; +}; /* * Maximum sizes of TIPC bearer-related names (including terminating NULL) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 897daa005b23..53fb09f92e3f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,6 +2,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +obj-$(CONFIG_BPF_SYSCALL) += disasm.o ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o ifeq ($(CONFIG_STREAM_PARSER),y) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 98c0f00c3f5e..68d866628be0 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, ee = ERR_PTR(-EOPNOTSUPP); event = perf_file->private_data; - if (perf_event_read_local(event, &value) == -EOPNOTSUPP) + if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP) goto err_out; ee = bpf_event_entry_gen(perf_file, map_file); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e88abc0865d5..3db5a17fcfe8 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -192,7 +192,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, struct cgroup_subsys_state *css; struct bpf_prog_list *pl; bool pl_was_allocated; - u32 old_flags; int err; if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) @@ -239,7 +238,6 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = prog; } - old_flags = cgrp->bpf.flags[type]; cgrp->bpf.flags[type] = flags; /* allocate and recompute effective prog arrays */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c6be15ae83ee..248961af2421 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -309,12 +309,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { + const char *end = sym + KSYM_NAME_LEN; + BUILD_BUG_ON(sizeof("bpf_prog_") + - sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN); + sizeof(prog->tag) * 2 + + /* name has been null terminated. + * We should need +1 for the '_' preceding + * the name. However, the null character + * is double counted between the name and the + * sizeof("bpf_prog_") above, so we omit + * the +1 here. + */ + sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); - *sym = 0; + if (prog->aux->name[0]) + snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); + else + *sym = 0; } static __always_inline unsigned long diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c new file mode 100644 index 000000000000..e682850c9715 --- /dev/null +++ b/kernel/bpf/disasm.c @@ -0,0 +1,214 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/bpf.h> + +#include "disasm.h" + +#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +static const char * const func_id_str[] = { + __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +}; +#undef __BPF_FUNC_STR_FN + +const char *func_id_name(int id) +{ + BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + + if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) + return func_id_str[id]; + else + return "unknown"; +} + +const char *const bpf_class_string[8] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +const char *const bpf_alu_string[16] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[16] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JLT >> 4] = "<", + [BPF_JGE >> 4] = ">=", + [BPF_JLE >> 4] = "<=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSLT >> 4] = "s<", + [BPF_JSGE >> 4] = "s>=", + [BPF_JSLE >> 4] = "s<=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_end_insn(bpf_insn_print_cb verbose, + struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", + insn->imm, insn->dst_reg); +} + +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_OP(insn->code) == BPF_END) { + if (class == BPF_ALU64) + verbose(env, "BUG_alu64_%02x\n", insn->code); + else + print_bpf_end_insn(verbose, env, insn); + } else if (BPF_OP(insn->code) == BPF_NEG) { + verbose(env, "(%02x) r%d = %s-r%d\n", + insn->code, insn->dst_reg, + class == BPF_ALU ? "(u32) " : "", + insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + } else { + verbose(env, "(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose(env, "BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_st_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose(env, "BUG_ldx_%02x\n", insn->code); + return; + } + verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !allow_ptr_leaks) + imm = 0; + + verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); + } else { + verbose(env, "BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose(env, "(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose(env, "(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose(env, "(%02x) %s\n", + insn->code, bpf_class_string[class]); + } +} diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h new file mode 100644 index 000000000000..8de977e420b6 --- /dev/null +++ b/kernel/bpf/disasm.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef __BPF_DISASM_H__ +#define __BPF_DISASM_H__ + +#include <linux/bpf.h> +#include <linux/kernel.h> +#include <linux/stringify.h> + +extern const char *const bpf_alu_string[16]; +extern const char *const bpf_class_string[8]; + +const char *func_id_name(int id); + +struct bpf_verifier_env; +typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, + const char *, ...); +void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, + const struct bpf_insn *insn, bool allow_ptr_leaks); + +#endif diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index e833ed914358..be1dde967208 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -363,6 +363,7 @@ out: putname(pname); return ret; } +EXPORT_SYMBOL_GPL(bpf_obj_get_user); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0048cb24ba7b..d124e702e040 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -322,6 +322,8 @@ static int bpf_obj_name_cpy(char *dst, const char *src) { const char *end = src + BPF_OBJ_NAME_LEN; + memset(dst, 0, BPF_OBJ_NAME_LEN); + /* Copy all isalnum() and '_' char */ while (src < end && *src) { if (!isalnum(*src) && *src != '_') @@ -333,9 +335,6 @@ static int bpf_obj_name_cpy(char *dst, const char *src) if (src == end) return -EINVAL; - /* '\0' terminates dst */ - *dst = 0; - return 0; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52b022310f6a..9755279d94cb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -21,6 +21,8 @@ #include <linux/vmalloc.h> #include <linux/stringify.h> +#include "disasm.h" + /* bpf_check() is a static code analyzer that walks eBPF program * instruction by instruction and updates register/stack state. * All paths of conditional branches are analyzed until 'bpf_exit' insn. @@ -153,28 +155,36 @@ struct bpf_call_arg_meta { int access_size; }; -/* verbose verifier prints what it's seeing - * bpf_check() is called under lock, so no race to access these global vars - */ -static u32 log_level, log_size, log_len; -static char *log_buf; - static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. * verbose() is used to dump the verification trace to the log, so the user * can figure out what's wrong with the program */ -static __printf(1, 2) void verbose(const char *fmt, ...) +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) { + struct bpf_verifer_log *log = &env->log; + unsigned int n; va_list args; - if (log_level == 0 || log_len >= log_size - 1) + if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) return; va_start(args, fmt); - log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); va_end(args); + + WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, + "verifier log line truncated - local buffer too short\n"); + + n = min(log->len_total - log->len_used - 1, n); + log->kbuf[n] = '\0'; + + if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) + log->len_used += n; + else + log->ubuf = NULL; } static bool type_is_pkt_pointer(enum bpf_reg_type type) @@ -197,23 +207,8 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; -#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) -static const char * const func_id_str[] = { - __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) -}; -#undef __BPF_FUNC_STR_FN - -static const char *func_id_name(int id) -{ - BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); - - if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) - return func_id_str[id]; - else - return "unknown"; -} - -static void print_verifier_state(struct bpf_verifier_state *state) +static void print_verifier_state(struct bpf_verifier_env *env, + struct bpf_verifier_state *state) { struct bpf_reg_state *reg; enum bpf_reg_type t; @@ -224,21 +219,21 @@ static void print_verifier_state(struct bpf_verifier_state *state) t = reg->type; if (t == NOT_INIT) continue; - verbose(" R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d=%s", i, reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ - verbose("%lld", reg->var_off.value + reg->off); + verbose(env, "%lld", reg->var_off.value + reg->off); } else { - verbose("(id=%d", reg->id); + verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) - verbose(",off=%d", reg->off); + verbose(env, ",off=%d", reg->off); if (type_is_pkt_pointer(t)) - verbose(",r=%d", reg->range); + verbose(env, ",r=%d", reg->range); else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || t == PTR_TO_MAP_VALUE_OR_NULL) - verbose(",ks=%d,vs=%d", + verbose(env, ",ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); if (tnum_is_const(reg->var_off)) { @@ -246,218 +241,38 @@ static void print_verifier_state(struct bpf_verifier_state *state) * could be a pointer whose offset is too big * for reg->off */ - verbose(",imm=%llx", reg->var_off.value); + verbose(env, ",imm=%llx", reg->var_off.value); } else { if (reg->smin_value != reg->umin_value && reg->smin_value != S64_MIN) - verbose(",smin_value=%lld", + verbose(env, ",smin_value=%lld", (long long)reg->smin_value); if (reg->smax_value != reg->umax_value && reg->smax_value != S64_MAX) - verbose(",smax_value=%lld", + verbose(env, ",smax_value=%lld", (long long)reg->smax_value); if (reg->umin_value != 0) - verbose(",umin_value=%llu", + verbose(env, ",umin_value=%llu", (unsigned long long)reg->umin_value); if (reg->umax_value != U64_MAX) - verbose(",umax_value=%llu", + verbose(env, ",umax_value=%llu", (unsigned long long)reg->umax_value); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(",var_off=%s", tn_buf); + verbose(env, ",var_off=%s", tn_buf); } } - verbose(")"); + verbose(env, ")"); } } for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, + verbose(env, " fp%d=%s", -MAX_BPF_STACK + i, reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); } - verbose("\n"); -} - -static const char *const bpf_class_string[] = { - [BPF_LD] = "ld", - [BPF_LDX] = "ldx", - [BPF_ST] = "st", - [BPF_STX] = "stx", - [BPF_ALU] = "alu", - [BPF_JMP] = "jmp", - [BPF_RET] = "BUG", - [BPF_ALU64] = "alu64", -}; - -static const char *const bpf_alu_string[16] = { - [BPF_ADD >> 4] = "+=", - [BPF_SUB >> 4] = "-=", - [BPF_MUL >> 4] = "*=", - [BPF_DIV >> 4] = "/=", - [BPF_OR >> 4] = "|=", - [BPF_AND >> 4] = "&=", - [BPF_LSH >> 4] = "<<=", - [BPF_RSH >> 4] = ">>=", - [BPF_NEG >> 4] = "neg", - [BPF_MOD >> 4] = "%=", - [BPF_XOR >> 4] = "^=", - [BPF_MOV >> 4] = "=", - [BPF_ARSH >> 4] = "s>>=", - [BPF_END >> 4] = "endian", -}; - -static const char *const bpf_ldst_string[] = { - [BPF_W >> 3] = "u32", - [BPF_H >> 3] = "u16", - [BPF_B >> 3] = "u8", - [BPF_DW >> 3] = "u64", -}; - -static const char *const bpf_jmp_string[16] = { - [BPF_JA >> 4] = "jmp", - [BPF_JEQ >> 4] = "==", - [BPF_JGT >> 4] = ">", - [BPF_JLT >> 4] = "<", - [BPF_JGE >> 4] = ">=", - [BPF_JLE >> 4] = "<=", - [BPF_JSET >> 4] = "&", - [BPF_JNE >> 4] = "!=", - [BPF_JSGT >> 4] = "s>", - [BPF_JSLT >> 4] = "s<", - [BPF_JSGE >> 4] = "s>=", - [BPF_JSLE >> 4] = "s<=", - [BPF_CALL >> 4] = "call", - [BPF_EXIT >> 4] = "exit", -}; - -static void print_bpf_end_insn(const struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, - BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", - insn->imm, insn->dst_reg); -} - -static void print_bpf_insn(const struct bpf_verifier_env *env, - const struct bpf_insn *insn) -{ - u8 class = BPF_CLASS(insn->code); - - if (class == BPF_ALU || class == BPF_ALU64) { - if (BPF_OP(insn->code) == BPF_END) { - if (class == BPF_ALU64) - verbose("BUG_alu64_%02x\n", insn->code); - else - print_bpf_end_insn(env, insn); - } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose("(%02x) r%d = %s-r%d\n", - insn->code, insn->dst_reg, - class == BPF_ALU ? "(u32) " : "", - insn->dst_reg); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) %sr%d %s %sr%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->src_reg); - } else { - verbose("(%02x) %sr%d %s %s%d\n", - insn->code, class == BPF_ALU ? "(u32) " : "", - insn->dst_reg, - bpf_alu_string[BPF_OP(insn->code) >> 4], - class == BPF_ALU ? "(u32) " : "", - insn->imm); - } - } else if (class == BPF_STX) { - if (BPF_MODE(insn->code) == BPF_MEM) - verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) - verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, insn->off, - insn->src_reg); - else - verbose("BUG_%02x\n", insn->code); - } else if (class == BPF_ST) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_st_%02x\n", insn->code); - return; - } - verbose("(%02x) *(%s *)(r%d %+d) = %d\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->dst_reg, - insn->off, insn->imm); - } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM) { - verbose("BUG_ldx_%02x\n", insn->code); - return; - } - verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", - insn->code, insn->dst_reg, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->off); - } else if (class == BPF_LD) { - if (BPF_MODE(insn->code) == BPF_ABS) { - verbose("(%02x) r0 = *(%s *)skb[%d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", - insn->code, - bpf_ldst_string[BPF_SIZE(insn->code) >> 3], - insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM && - BPF_SIZE(insn->code) == BPF_DW) { - /* At this point, we already made sure that the second - * part of the ldimm64 insn is accessible. - */ - u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; - bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; - - if (map_ptr && !env->allow_ptr_leaks) - imm = 0; - - verbose("(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); - } else { - verbose("BUG_ld_%02x\n", insn->code); - return; - } - } else if (class == BPF_JMP) { - u8 opcode = BPF_OP(insn->code); - - if (opcode == BPF_CALL) { - verbose("(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); - } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose("(%02x) goto pc%+d\n", - insn->code, insn->off); - } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose("(%02x) exit\n", insn->code); - } else if (BPF_SRC(insn->code) == BPF_X) { - verbose("(%02x) if r%d %s r%d goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->src_reg, insn->off); - } else { - verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", - insn->code, insn->dst_reg, - bpf_jmp_string[BPF_OP(insn->code) >> 4], - insn->imm, insn->off); - } - } else { - verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); - } + verbose(env, "\n"); } static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) @@ -495,7 +310,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, env->head = elem; env->stack_size++; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { - verbose("BPF program is too complex\n"); + verbose(env, "BPF program is too complex\n"); goto err; } return &elem->st; @@ -533,10 +348,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } -static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_known_zero(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_known_zero(regs, %u)\n", regno); + verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -646,10 +462,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) __mark_reg_unbounded(reg); } -static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_unknown(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_unknown(regs, %u)\n", regno); + verbose(env, "mark_reg_unknown(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -664,10 +481,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg) reg->type = NOT_INIT; } -static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) +static void mark_reg_not_init(struct bpf_verifier_env *env, + struct bpf_reg_state *regs, u32 regno) { if (WARN_ON(regno >= MAX_BPF_REG)) { - verbose("mark_reg_not_init(regs, %u)\n", regno); + verbose(env, "mark_reg_not_init(regs, %u)\n", regno); /* Something bad happened, let's kill all regs */ for (regno = 0; regno < MAX_BPF_REG; regno++) __mark_reg_not_init(regs + regno); @@ -676,22 +494,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno) __mark_reg_not_init(regs + regno); } -static void init_reg_state(struct bpf_reg_state *regs) +static void init_reg_state(struct bpf_verifier_env *env, + struct bpf_reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { - mark_reg_not_init(regs, i); + mark_reg_not_init(env, regs, i); regs[i].live = REG_LIVE_NONE; } /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; - mark_reg_known_zero(regs, BPF_REG_FP); + mark_reg_known_zero(env, regs, BPF_REG_FP); /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; - mark_reg_known_zero(regs, BPF_REG_1); + mark_reg_known_zero(env, regs, BPF_REG_1); } enum reg_arg_type { @@ -704,6 +523,10 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; + if (regno == BPF_REG_FP) + /* We don't need to worry about FP liveness because it's read-only */ + return; + while (parent) { /* if read wasn't screened by an earlier write ... */ if (state->regs[regno].live & REG_LIVE_WRITTEN) @@ -721,26 +544,26 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *regs = env->cur_state.regs; if (regno >= MAX_BPF_REG) { - verbose("R%d is invalid\n", regno); + verbose(env, "R%d is invalid\n", regno); return -EINVAL; } if (t == SRC_OP) { /* check whether register used as source operand can be read */ if (regs[regno].type == NOT_INIT) { - verbose("R%d !read_ok\n", regno); + verbose(env, "R%d !read_ok\n", regno); return -EACCES; } mark_reg_read(&env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { - verbose("frame pointer is read only\n"); + verbose(env, "frame pointer is read only\n"); return -EACCES; } regs[regno].live |= REG_LIVE_WRITTEN; if (t == DST_OP) - mark_reg_unknown(regs, regno); + mark_reg_unknown(env, regs, regno); } return 0; } @@ -765,7 +588,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_state *state, int off, +static int check_stack_write(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; @@ -778,7 +602,7 @@ static int check_stack_write(struct bpf_verifier_state *state, int off, /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } @@ -813,7 +637,8 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo } } -static int check_stack_read(struct bpf_verifier_state *state, int off, int size, +static int check_stack_read(struct bpf_verifier_env *env, + struct bpf_verifier_state *state, int off, int size, int value_regno) { u8 *slot_type; @@ -823,12 +648,12 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, if (slot_type[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { - verbose("invalid size of register spill\n"); + verbose(env, "invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { if (slot_type[i] != STACK_SPILL) { - verbose("corrupted spill memory\n"); + verbose(env, "corrupted spill memory\n"); return -EACCES; } } @@ -844,14 +669,14 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } else { for (i = 0; i < size; i++) { if (slot_type[i] != STACK_MISC) { - verbose("invalid read from stack off %d+%d size %d\n", + verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } } if (value_regno >= 0) /* have read misc data from the stack */ - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -863,7 +688,7 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, struct bpf_map *map = env->cur_state.regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { - verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", map->value_size, off, size); return -EACCES; } @@ -882,8 +707,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * need to try adding each of min_value and max_value to off * to make sure our theoretical access will be safe. */ - if (log_level) - print_verifier_state(state); + if (env->log.level) + print_verifier_state(env, state); /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our @@ -891,13 +716,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * will have a set floor within our range. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->smin_value + off, size); if (err) { - verbose("R%d min value is outside of the array range\n", regno); + verbose(env, "R%d min value is outside of the array range\n", + regno); return err; } @@ -906,13 +732,14 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, * If reg->umax_value + off could overflow, treat that as unbounded too. */ if (reg->umax_value >= BPF_MAX_VAR_OFF) { - verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n", + verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", regno); return -EACCES; } err = __check_map_access(env, regno, reg->umax_value + off, size); if (err) - verbose("R%d max value is outside of the array range\n", regno); + verbose(env, "R%d max value is outside of the array range\n", + regno); return err; } @@ -951,7 +778,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { - verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", + verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", off, size, regno, reg->id, reg->off, reg->range); return -EACCES; } @@ -974,18 +801,48 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, * detail to prove they're safe. */ if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", + verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; } err = __check_packet_access(env, regno, off, size); if (err) { - verbose("R%d offset is outside of the packet\n", regno); + verbose(env, "R%d offset is outside of the packet\n", regno); return err; } return err; } +static bool analyzer_is_valid_access(struct bpf_verifier_env *env, int off, + struct bpf_insn_access_aux *info) +{ + switch (env->prog->type) { + case BPF_PROG_TYPE_XDP: + switch (off) { + case offsetof(struct xdp_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct xdp_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + case BPF_PROG_TYPE_SCHED_CLS: + switch (off) { + case offsetof(struct sk_buff, data): + info->reg_type = PTR_TO_PACKET; + return true; + case offsetof(struct sk_buff, cb) + + offsetof(struct bpf_skb_data_end, data_end): + info->reg_type = PTR_TO_PACKET_END; + return true; + } + return false; + default: + return false; + } +} + /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type) @@ -994,12 +851,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, .reg_type = *reg_type, }; - /* for analyzer ctx accesses are already validated and converted */ - if (env->analyzer_ops) - return 0; - - if (env->prog->aux->ops->is_valid_access && - env->prog->aux->ops->is_valid_access(off, size, t, &info)) { + if (env->analyzer_ops) { + if (analyzer_is_valid_access(env, off, &info)) { + *reg_type = info.reg_type; + return 0; + } + } else if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -1016,7 +874,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return 0; } - verbose("invalid bpf_context access off=%d size=%d\n", off, size); + verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); return -EACCES; } @@ -1034,7 +892,8 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); } -static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, +static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, int off, int size, bool strict) { struct tnum reg_off; @@ -1059,7 +918,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned packet access off %d+%s+%d+%d size %d\n", + verbose(env, + "misaligned packet access off %d+%s+%d+%d size %d\n", ip_align, tn_buf, reg->off, off, size); return -EACCES; } @@ -1067,7 +927,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg, return 0; } -static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, +static int check_generic_ptr_alignment(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, const char *pointer_desc, int off, int size, bool strict) { @@ -1082,7 +943,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("misaligned %saccess off %s+%d+%d size %d\n", + verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", pointer_desc, tn_buf, reg->off, off, size); return -EACCES; } @@ -1103,7 +964,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, /* Special case, because of NET_IP_ALIGN. Given metadata sits * right in front, treat it the very same way. */ - return check_pkt_ptr_alignment(reg, off, size, strict); + return check_pkt_ptr_alignment(env, reg, off, size, strict); case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1116,7 +977,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, default: break; } - return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict); + return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, + strict); } /* check whether memory at (regno + off) is accessible for t = (read | write) @@ -1148,20 +1010,20 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (reg->type == PTR_TO_MAP_VALUE) { if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into map\n", value_regno); + verbose(env, "R%d leaks addr into map\n", value_regno); return -EACCES; } err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into ctx\n", value_regno); + verbose(env, "R%d leaks addr into ctx\n", value_regno); return -EACCES; } /* ctx accesses must be at a fixed offset, so that we can @@ -1171,7 +1033,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable ctx access var_off=%s off=%d size=%d", + verbose(env, + "variable ctx access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } @@ -1183,9 +1046,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * case, we know the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); + mark_reg_known_zero(env, state->regs, + value_regno); state->regs[value_regno].id = 0; state->regs[value_regno].off = 0; state->regs[value_regno].range = 0; @@ -1201,13 +1065,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", + verbose(env, "variable stack access var_off=%s off=%d size=%d", tn_buf, off, size); return -EACCES; } off += reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); + verbose(env, "invalid stack off=%d size=%d\n", off, + size); return -EACCES; } @@ -1218,29 +1083,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!env->allow_ptr_leaks && state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); + verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; } - err = check_stack_write(state, off, size, value_regno); + err = check_stack_write(env, state, off, size, + value_regno); } else { - err = check_stack_read(state, off, size, value_regno); + err = check_stack_read(env, state, off, size, + value_regno); } } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { - verbose("cannot write into packet\n"); + verbose(env, "cannot write into packet\n"); return -EACCES; } if (t == BPF_WRITE && value_regno >= 0 && is_pointer_value(env, value_regno)) { - verbose("R%d leaks addr into packet\n", value_regno); + verbose(env, "R%d leaks addr into packet\n", + value_regno); return -EACCES; } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(env, state->regs, value_regno); } else { - verbose("R%d invalid mem access '%s'\n", - regno, reg_type_str[reg->type]); + verbose(env, "R%d invalid mem access '%s'\n", regno, + reg_type_str[reg->type]); return -EACCES; } @@ -1260,7 +1128,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || insn->imm != 0) { - verbose("BPF_XADD uses reserved fields\n"); + verbose(env, "BPF_XADD uses reserved fields\n"); return -EINVAL; } @@ -1275,7 +1143,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d leaks addr into mem\n", insn->src_reg); + verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; } @@ -1316,7 +1184,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, register_is_null(regs[regno])) return 0; - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[regs[regno].type], reg_type_str[PTR_TO_STACK]); return -EACCES; @@ -1327,13 +1195,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); - verbose("invalid variable stack read R%d var_off=%s\n", + verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } off = regs[regno].off + regs[regno].var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size <= 0) { - verbose("invalid stack type R%d off=%d access_size=%d\n", + verbose(env, "invalid stack type R%d off=%d access_size=%d\n", regno, off, access_size); return -EACCES; } @@ -1349,7 +1217,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, for (i = 0; i < access_size; i++) { if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { - verbose("invalid indirect read from stack off %d+%d size %d\n", + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; } @@ -1392,7 +1260,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (arg_type == ARG_ANYTHING) { if (is_pointer_value(env, regno)) { - verbose("R%d leaks addr into helper function\n", regno); + verbose(env, "R%d leaks addr into helper function\n", + regno); return -EACCES; } return 0; @@ -1400,7 +1269,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, if (type_is_pkt_pointer(type) && !may_access_direct_pkt_data(env, meta, BPF_READ)) { - verbose("helper access to the packet is not allowed\n"); + verbose(env, "helper access to the packet is not allowed\n"); return -EACCES; } @@ -1438,7 +1307,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, goto err_type; meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; } else { - verbose("unsupported arg_type %d\n", arg_type); + verbose(env, "unsupported arg_type %d\n", arg_type); return -EFAULT; } @@ -1456,7 +1325,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * we have to check map_key here. Otherwise it means * that kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->key\n"); + verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1472,7 +1341,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (!meta->map_ptr) { /* kernel subsystem misconfigured verifier */ - verbose("invalid map_ptr to access map->value\n"); + verbose(env, "invalid map_ptr to access map->value\n"); return -EACCES; } if (type_is_pkt_pointer(type)) @@ -1492,7 +1361,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, */ if (regno == 0) { /* kernel subsystem misconfigured verifier */ - verbose("ARG_CONST_SIZE cannot be first argument\n"); + verbose(env, + "ARG_CONST_SIZE cannot be first argument\n"); return -EACCES; } @@ -1509,7 +1379,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, meta = NULL; if (reg->smin_value < 0) { - verbose("R%d min value is negative, either use unsigned or 'var &= const'\n", + verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", regno); return -EACCES; } @@ -1523,7 +1393,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, } if (reg->umax_value >= BPF_MAX_VAR_SIZ) { - verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", regno); return -EACCES; } @@ -1534,12 +1404,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return err; err_type: - verbose("R%d type=%s expected=%s\n", regno, + verbose(env, "R%d type=%s expected=%s\n", regno, reg_type_str[type], reg_type_str[expected_type]); return -EACCES; } -static int check_map_func_compatibility(struct bpf_map *map, int func_id) +static int check_map_func_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, int func_id) { if (!map) return 0; @@ -1552,7 +1423,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_MAP_TYPE_PERF_EVENT_ARRAY: if (func_id != BPF_FUNC_perf_event_read && - func_id != BPF_FUNC_perf_event_output) + func_id != BPF_FUNC_perf_event_output && + func_id != BPF_FUNC_perf_event_read_value) goto error; break; case BPF_MAP_TYPE_STACK_TRACE: @@ -1595,6 +1467,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: + case BPF_FUNC_perf_event_read_value: if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) goto error; break; @@ -1625,7 +1498,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) return 0; error: - verbose("cannot pass map_type %d into func %s#%d\n", + verbose(env, "cannot pass map_type %d into func %s#%d\n", map->map_type, func_id_name(func_id), func_id); return -EINVAL; } @@ -1659,7 +1532,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) for (i = 0; i < MAX_BPF_REG; i++) if (reg_is_pkt_pointer_any(®s[i])) - mark_reg_unknown(regs, i); + mark_reg_unknown(env, regs, i); for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { if (state->stack_slot_type[i] != STACK_SPILL) @@ -1681,7 +1554,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* find function prototype */ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { - verbose("invalid func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "invalid func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } @@ -1689,13 +1563,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) fn = env->prog->aux->ops->get_func_proto(func_id); if (!fn) { - verbose("unknown func %s#%d\n", func_id_name(func_id), func_id); + verbose(env, "unknown func %s#%d\n", func_id_name(func_id), + func_id); return -EINVAL; } /* eBPF programs must be GPL compatible to use GPL-ed functions */ if (!env->prog->gpl_compatible && fn->gpl_only) { - verbose("cannot call GPL only function from proprietary program\n"); + verbose(env, "cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1709,7 +1584,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) */ err = check_raw_mode(fn); if (err) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); return err; } @@ -1742,14 +1617,14 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } /* update return register (already marked as written above) */ if (fn->ret_type == RET_INTEGER) { /* sets type to SCALAR_VALUE */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { @@ -1757,14 +1632,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ - mark_reg_known_zero(regs, BPF_REG_0); + mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].off = 0; /* remember map_ptr, so that check_map_access() * can check 'value_size' boundary of memory access * to map element returned from bpf_map_lookup_elem() */ if (meta.map_ptr == NULL) { - verbose("kernel subsystem misconfigured verifier\n"); + verbose(env, + "kernel subsystem misconfigured verifier\n"); return -EINVAL; } regs[BPF_REG_0].map_ptr = meta.map_ptr; @@ -1775,12 +1651,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) else if (insn_aux->map_ptr != meta.map_ptr) insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { - verbose("unknown return type %d of func %s#%d\n", + verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } - err = check_map_func_compatibility(meta.map_ptr, func_id); + err = check_map_func_compatibility(env, meta.map_ptr, func_id); if (err) return err; @@ -1839,39 +1715,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad sbounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: known but bad ubounds\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, + "verifier internal error: known but bad ubounds\n"); return -EINVAL; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ if (!env->allow_ptr_leaks) - verbose("R%d 32-bit pointer arithmetic prohibited\n", + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", dst); return -EACCES; } @@ -1936,7 +1815,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) - verbose("R%d tried to subtract pointer from scalar\n", + verbose(env, "R%d tried to subtract pointer from scalar\n", dst); return -EACCES; } @@ -1946,7 +1825,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, */ if (ptr_reg->type == PTR_TO_STACK) { if (!env->allow_ptr_leaks) - verbose("R%d subtraction from stack pointer prohibited\n", + verbose(env, "R%d subtraction from stack pointer prohibited\n", dst); return -EACCES; } @@ -2001,13 +1880,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * ptr &= ~3 which would reduce min_value by 3.) */ if (!env->allow_ptr_leaks) - verbose("R%d bitwise operator %s on pointer prohibited\n", + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) - verbose("R%d pointer arithmetic with %s operator prohibited\n", + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; } @@ -2173,7 +2052,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* We lose all sign bit information (except what we can pick @@ -2201,7 +2080,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, /* Shifts greater than 63 are undefined. This includes * shifts by a negative number. */ - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } /* BPF_RSH is an unsigned shift, so make the appropriate casts */ @@ -2229,7 +2108,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; default: - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); break; } @@ -2261,12 +2140,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * an arbitrary scalar. */ if (!env->allow_ptr_leaks) { - verbose("R%d pointer %s pointer prohibited\n", + verbose(env, "R%d pointer %s pointer prohibited\n", insn->dst_reg, bpf_alu_string[opcode >> 4]); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); return 0; } else { /* scalar += pointer @@ -2318,13 +2197,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: unexpected ptr_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); - verbose("verifier internal error: no src_reg\n"); + print_verifier_state(env, &env->cur_state); + verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); @@ -2342,14 +2221,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) != 0 || insn->src_reg != BPF_REG_0 || insn->off != 0 || insn->imm != 0) { - verbose("BPF_NEG uses reserved fields\n"); + verbose(env, "BPF_NEG uses reserved fields\n"); return -EINVAL; } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || BPF_CLASS(insn->code) == BPF_ALU64) { - verbose("BPF_END uses reserved fields\n"); + verbose(env, "BPF_END uses reserved fields\n"); return -EINVAL; } } @@ -2360,7 +2239,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer arithmetic prohibited\n", + verbose(env, "R%d pointer arithmetic prohibited\n", insn->dst_reg); return -EACCES; } @@ -2374,7 +2253,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } @@ -2384,7 +2263,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_MOV uses reserved fields\n"); + verbose(env, "BPF_MOV uses reserved fields\n"); return -EINVAL; } } @@ -2400,14 +2279,16 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * copy register state to dest reg */ regs[insn->dst_reg] = regs[insn->src_reg]; + regs[insn->dst_reg].live |= REG_LIVE_WRITTEN; } else { /* R1 = (u32) R2 */ if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d partial copy of pointer\n", + verbose(env, + "R%d partial copy of pointer\n", insn->src_reg); return -EACCES; } - mark_reg_unknown(regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->dst_reg); /* high 32 bits are known zero. */ regs[insn->dst_reg].var_off = tnum_cast( regs[insn->dst_reg].var_off, 4); @@ -2422,14 +2303,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else if (opcode > BPF_END) { - verbose("invalid BPF_ALU opcode %x\n", opcode); + verbose(env, "invalid BPF_ALU opcode %x\n", opcode); return -EINVAL; } else { /* all other ALU ops: and, sub, xor, add, ... */ if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } /* check src1 operand */ @@ -2438,7 +2319,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0) { - verbose("BPF_ALU uses reserved fields\n"); + verbose(env, "BPF_ALU uses reserved fields\n"); return -EINVAL; } } @@ -2450,7 +2331,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) if ((opcode == BPF_MOD || opcode == BPF_DIV) && BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { - verbose("div by zero\n"); + verbose(env, "div by zero\n"); return -EINVAL; } @@ -2459,7 +2340,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; if (insn->imm < 0 || insn->imm >= size) { - verbose("invalid shift %d\n", insn->imm); + verbose(env, "invalid shift %d\n", insn->imm); return -EINVAL; } } @@ -2812,13 +2693,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, int err; if (opcode > BPF_JSLE) { - verbose("invalid BPF_JMP opcode %x\n", opcode); + verbose(env, "invalid BPF_JMP opcode %x\n", opcode); return -EINVAL; } if (BPF_SRC(insn->code) == BPF_X) { if (insn->imm != 0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } @@ -2828,13 +2709,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return err; if (is_pointer_value(env, insn->src_reg)) { - verbose("R%d pointer comparison prohibited\n", + verbose(env, "R%d pointer comparison prohibited\n", insn->src_reg); return -EACCES; } } else { if (insn->src_reg != BPF_REG_0) { - verbose("BPF_JMP uses reserved fields\n"); + verbose(env, "BPF_JMP uses reserved fields\n"); return -EINVAL; } } @@ -2946,11 +2827,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, find_good_pkt_pointers(this_branch, ®s[insn->src_reg], PTR_TO_PACKET_META); } else if (is_pointer_value(env, insn->dst_reg)) { - verbose("R%d pointer comparison prohibited\n", insn->dst_reg); + verbose(env, "R%d pointer comparison prohibited\n", + insn->dst_reg); return -EACCES; } - if (log_level) - print_verifier_state(this_branch); + if (env->log.level) + print_verifier_state(env, this_branch); return 0; } @@ -2969,11 +2851,11 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) int err; if (BPF_SIZE(insn->code) != BPF_DW) { - verbose("invalid BPF_LD_IMM insn\n"); + verbose(env, "invalid BPF_LD_IMM insn\n"); return -EINVAL; } if (insn->off != 0) { - verbose("BPF_LD_IMM64 uses reserved fields\n"); + verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); return -EINVAL; } @@ -3031,14 +2913,14 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) int i, err; if (!may_access_skb(env->prog->type)) { - verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); + verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); return -EINVAL; } if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { - verbose("BPF_LD_[ABS|IND] uses reserved fields\n"); + verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); return -EINVAL; } @@ -3048,7 +2930,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; if (regs[BPF_REG_6].type != PTR_TO_CTX) { - verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + verbose(env, + "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); return -EINVAL; } @@ -3061,7 +2944,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) /* reset caller saved regs to unreadable */ for (i = 0; i < CALLER_SAVED_REGS; i++) { - mark_reg_not_init(regs, caller_saved[i]); + mark_reg_not_init(env, regs, caller_saved[i]); check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); } @@ -3069,7 +2952,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) * the value fetched from the packet. * Already marked as written above. */ - mark_reg_unknown(regs, BPF_REG_0); + mark_reg_unknown(env, regs, BPF_REG_0); return 0; } @@ -3089,22 +2972,22 @@ static int check_return_code(struct bpf_verifier_env *env) reg = &env->cur_state.regs[BPF_REG_0]; if (reg->type != SCALAR_VALUE) { - verbose("At program exit the register R0 is not a known value (%s)\n", + verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); return -EINVAL; } if (!tnum_in(range, reg->var_off)) { - verbose("At program exit the register R0 "); + verbose(env, "At program exit the register R0 "); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("has value %s", tn_buf); + verbose(env, "has value %s", tn_buf); } else { - verbose("has unknown scalar value"); + verbose(env, "has unknown scalar value"); } - verbose(" should have been 0 or 1\n"); + verbose(env, " should have been 0 or 1\n"); return -EINVAL; } return 0; @@ -3170,7 +3053,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) return 0; if (w < 0 || w >= env->prog->len) { - verbose("jump out of range from insn %d to %d\n", t, w); + verbose(env, "jump out of range from insn %d to %d\n", t, w); return -EINVAL; } @@ -3187,13 +3070,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) insn_stack[cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - verbose("back-edge from insn %d to %d\n", t, w); + verbose(env, "back-edge from insn %d to %d\n", t, w); return -EINVAL; } else if (insn_state[w] == EXPLORED) { /* forward- or cross-edge */ insn_state[t] = DISCOVERED | e; } else { - verbose("insn state internal bug\n"); + verbose(env, "insn state internal bug\n"); return -EFAULT; } return 0; @@ -3287,7 +3170,7 @@ peek_stack: mark_explored: insn_state[t] = EXPLORED; if (cur_stack-- <= 0) { - verbose("pop stack internal bug\n"); + verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } @@ -3296,7 +3179,7 @@ mark_explored: check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { - verbose("unreachable insn %d\n", i); + verbose(env, "unreachable insn %d\n", i); ret = -EINVAL; goto err_free; } @@ -3677,7 +3560,7 @@ static int do_check(struct bpf_verifier_env *env) int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + init_reg_state(env, regs); state->parent = NULL; insn_idx = 0; for (;;) { @@ -3686,7 +3569,7 @@ static int do_check(struct bpf_verifier_env *env) int err; if (insn_idx >= insn_cnt) { - verbose("invalid insn idx %d insn_cnt %d\n", + verbose(env, "invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } @@ -3695,7 +3578,8 @@ static int do_check(struct bpf_verifier_env *env) class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { - verbose("BPF program is too large. Processed %d insn\n", + verbose(env, + "BPF program is too large. Processed %d insn\n", insn_processed); return -E2BIG; } @@ -3705,12 +3589,12 @@ static int do_check(struct bpf_verifier_env *env) return err; if (err == 1) { /* found equivalent state, can prune the search */ - if (log_level) { + if (env->log.level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", + verbose(env, "\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else - verbose("%d: safe\n", insn_idx); + verbose(env, "%d: safe\n", insn_idx); } goto process_bpf_exit; } @@ -3718,19 +3602,20 @@ static int do_check(struct bpf_verifier_env *env) if (need_resched()) cond_resched(); - if (log_level > 1 || (log_level && do_print_state)) { - if (log_level > 1) - verbose("%d:", insn_idx); + if (env->log.level > 1 || (env->log.level && do_print_state)) { + if (env->log.level > 1) + verbose(env, "%d:", insn_idx); else - verbose("\nfrom %d to %d:", + verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + print_verifier_state(env, &env->cur_state); do_print_state = false; } - if (log_level) { - verbose("%d: ", insn_idx); - print_bpf_insn(env, insn); + if (env->log.level) { + verbose(env, "%d: ", insn_idx); + print_bpf_insn(verbose, env, insn, + env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -3786,7 +3671,7 @@ static int do_check(struct bpf_verifier_env *env) * src_reg == stack|map in some other branch. * Reject it. */ - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } @@ -3826,14 +3711,14 @@ static int do_check(struct bpf_verifier_env *env) } else if (dst_reg_type != *prev_dst_type && (dst_reg_type == PTR_TO_CTX || *prev_dst_type == PTR_TO_CTX)) { - verbose("same insn cannot be used with different pointers\n"); + verbose(env, "same insn cannot be used with different pointers\n"); return -EINVAL; } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { - verbose("BPF_ST uses reserved fields\n"); + verbose(env, "BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ @@ -3856,7 +3741,7 @@ static int do_check(struct bpf_verifier_env *env) insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_CALL uses reserved fields\n"); + verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } @@ -3869,7 +3754,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_JA uses reserved fields\n"); + verbose(env, "BPF_JA uses reserved fields\n"); return -EINVAL; } @@ -3881,7 +3766,7 @@ static int do_check(struct bpf_verifier_env *env) insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { - verbose("BPF_EXIT uses reserved fields\n"); + verbose(env, "BPF_EXIT uses reserved fields\n"); return -EINVAL; } @@ -3896,7 +3781,7 @@ static int do_check(struct bpf_verifier_env *env) return err; if (is_pointer_value(env, BPF_REG_0)) { - verbose("R0 leaks addr as return value\n"); + verbose(env, "R0 leaks addr as return value\n"); return -EACCES; } @@ -3931,19 +3816,19 @@ process_bpf_exit: insn_idx++; } else { - verbose("invalid BPF_LD mode\n"); + verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; } } else { - verbose("unknown insn class %d\n", class); + verbose(env, "unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } - verbose("processed %d insns, stack depth %d\n", - insn_processed, env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth %d\n", insn_processed, + env->prog->aux->stack_depth); return 0; } @@ -3955,7 +3840,8 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } -static int check_map_prog_compatibility(struct bpf_map *map, +static int check_map_prog_compatibility(struct bpf_verifier_env *env, + struct bpf_map *map, struct bpf_prog *prog) { @@ -3966,12 +3852,12 @@ static int check_map_prog_compatibility(struct bpf_map *map, */ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { if (!check_map_prealloc(map)) { - verbose("perf_event programs can only use preallocated hash map\n"); + verbose(env, "perf_event programs can only use preallocated hash map\n"); return -EINVAL; } if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta)) { - verbose("perf_event programs can only use preallocated inner hash map\n"); + verbose(env, "perf_event programs can only use preallocated inner hash map\n"); return -EINVAL; } } @@ -3994,14 +3880,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++, insn++) { if (BPF_CLASS(insn->code) == BPF_LDX && (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { - verbose("BPF_LDX uses reserved fields\n"); + verbose(env, "BPF_LDX uses reserved fields\n"); return -EINVAL; } if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { - verbose("BPF_STX uses reserved fields\n"); + verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } @@ -4012,7 +3898,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) if (i == insn_cnt - 1 || insn[1].code != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].off != 0) { - verbose("invalid bpf_ld_imm64 insn\n"); + verbose(env, "invalid bpf_ld_imm64 insn\n"); return -EINVAL; } @@ -4021,19 +3907,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) goto next_insn; if (insn->src_reg != BPF_PSEUDO_MAP_FD) { - verbose("unrecognized bpf_ld_imm64 insn\n"); + verbose(env, + "unrecognized bpf_ld_imm64 insn\n"); return -EINVAL; } f = fdget(insn->imm); map = __bpf_map_get(f); if (IS_ERR(map)) { - verbose("fd %d is not pointing to valid bpf_map\n", + verbose(env, "fd %d is not pointing to valid bpf_map\n", insn->imm); return PTR_ERR(map); } - err = check_map_prog_compatibility(map, env->prog); + err = check_map_prog_compatibility(env, map, env->prog); if (err) { fdput(f); return err; @@ -4155,7 +4042,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, env->prog); if (cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); @@ -4203,7 +4090,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) u8 size_code; if (type == BPF_WRITE) { - verbose("bpf verifier narrow ctx access misconfigured\n"); + verbose(env, "bpf verifier narrow ctx access misconfigured\n"); return -EINVAL; } @@ -4222,7 +4109,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) &target_size); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || (ctx_field_size && !target_size)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4304,7 +4191,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { - verbose("bpf verifier is misconfigured\n"); + verbose(env, "bpf verifier is misconfigured\n"); return -EINVAL; } @@ -4348,7 +4235,8 @@ patch_call_imm: * programs to call them, must be real in-kernel functions */ if (!fn->func) { - verbose("kernel subsystem misconfigured func %s#%d\n", + verbose(env, + "kernel subsystem misconfigured func %s#%d\n", func_id_name(insn->imm), insn->imm); return -EFAULT; } @@ -4382,8 +4270,8 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { - char __user *log_ubuf = NULL; struct bpf_verifier_env *env; + struct bpf_verifer_log *log; int ret = -EINVAL; /* 'struct bpf_verifier_env' can be global, but since it's not small, @@ -4392,6 +4280,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; + log = &env->log; env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) * (*prog)->len); @@ -4407,23 +4296,15 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* user requested verbose verifier output * and supplied buffer to store the verification trace */ - log_level = attr->log_level; - log_ubuf = (char __user *) (unsigned long) attr->log_buf; - log_size = attr->log_size; - log_len = 0; + log->level = attr->log_level; + log->ubuf = (char __user *) (unsigned long) attr->log_buf; + log->len_total = attr->log_size; ret = -EINVAL; - /* log_* values have to be sane */ - if (log_size < 128 || log_size > UINT_MAX >> 8 || - log_level == 0 || log_ubuf == NULL) - goto err_unlock; - - ret = -ENOMEM; - log_buf = vmalloc(log_size); - if (!log_buf) + /* log attributes have to be sane */ + if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || + !log->level || !log->ubuf) goto err_unlock; - } else { - log_level = 0; } env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); @@ -4460,17 +4341,11 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); - if (log_level && log_len >= log_size - 1) { - BUG_ON(log_len >= log_size); - /* verifier log exceeded user supplied buffer */ + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; - /* fall through to return what was recorded */ - } - - /* copy verifier log back to user space including trailing zero */ - if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + if (log->level && !log->ubuf) { ret = -EFAULT; - goto free_log_buf; + goto err_release_maps; } if (ret == 0 && env->used_map_cnt) { @@ -4481,7 +4356,7 @@ skip_full_check: if (!env->prog->aux->used_maps) { ret = -ENOMEM; - goto free_log_buf; + goto err_release_maps; } memcpy(env->prog->aux->used_maps, env->used_maps, @@ -4494,9 +4369,7 @@ skip_full_check: convert_pseudo_ld_imm64(env); } -free_log_buf: - if (log_level) - vfree(log_buf); +err_release_maps: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. @@ -4533,8 +4406,6 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); - log_level = 0; - env->strict_alignment = false; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; diff --git a/kernel/cpu.c b/kernel/cpu.c index 8de11a29e495..d851df22f5c5 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,7 @@ #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/nmi.h> #include <linux/smpboot.h> #include <linux/relay.h> #include <linux/slab.h> @@ -897,6 +898,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, out: cpus_write_unlock(); + /* + * Do post unplug cleanup. This is still protected against + * concurrent CPU hotplug via cpu_add_remove_lock. + */ + lockup_detector_cleanup(); return ret; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 6bc21e202ae4..902149f05381 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3684,10 +3684,12 @@ static inline u64 perf_event_count(struct perf_event *event) * will not be local and we cannot read them atomically * - must not have a pmu::count method */ -int perf_event_read_local(struct perf_event *event, u64 *value) +int perf_event_read_local(struct perf_event *event, u64 *value, + u64 *enabled, u64 *running) { unsigned long flags; int ret = 0; + u64 now; /* * Disabling interrupts avoids all counter scheduling (context @@ -3718,13 +3720,21 @@ int perf_event_read_local(struct perf_event *event, u64 *value) goto out; } + now = event->shadow_ctx_time + perf_clock(); + if (enabled) + *enabled = now - event->tstamp_enabled; /* * If the event is currently on this CPU, its either a per-task event, * or local to this CPU. Furthermore it means its ACTIVE (otherwise * oncpu == -1). */ - if (event->oncpu == smp_processor_id()) + if (event->oncpu == smp_processor_id()) { event->pmu->read(event); + if (running) + *running = now - event->tstamp_running; + } else if (running) { + *running = event->total_time_running; + } *value = local64_read(&event->count); out: @@ -8072,6 +8082,7 @@ static void bpf_overflow_handler(struct perf_event *event, struct bpf_perf_event_data_kern ctx = { .data = data, .regs = regs, + .event = event, }; int ret = 0; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 1d71c051a951..5043e7433f4b 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -344,39 +344,30 @@ EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); * by the client, but only by calling this function. * This function can only be called on a registered smp_hotplug_thread. */ -int smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, - const struct cpumask *new) +void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread, + const struct cpumask *new) { struct cpumask *old = plug_thread->cpumask; - cpumask_var_t tmp; + static struct cpumask tmp; unsigned int cpu; - if (!alloc_cpumask_var(&tmp, GFP_KERNEL)) - return -ENOMEM; - - get_online_cpus(); + lockdep_assert_cpus_held(); mutex_lock(&smpboot_threads_lock); /* Park threads that were exclusively enabled on the old mask. */ - cpumask_andnot(tmp, old, new); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, old, new); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_park_thread(plug_thread, cpu); /* Unpark threads that are exclusively enabled on the new mask. */ - cpumask_andnot(tmp, new, old); - for_each_cpu_and(cpu, tmp, cpu_online_mask) + cpumask_andnot(&tmp, new, old); + for_each_cpu_and(cpu, &tmp, cpu_online_mask) smpboot_unpark_thread(plug_thread, cpu); cpumask_copy(old, new); mutex_unlock(&smpboot_threads_lock); - put_online_cpus(); - - free_cpumask_var(tmp); - - return 0; } -EXPORT_SYMBOL_GPL(smpboot_update_cpumask_percpu_thread); static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4da9e622471f..d9c31bc2eaea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -872,9 +872,9 @@ static struct ctl_table kern_table[] = { #if defined(CONFIG_LOCKUP_DETECTOR) { .procname = "watchdog", - .data = &watchdog_user_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_watchdog, .extra1 = &zero, .extra2 = &one, @@ -890,16 +890,12 @@ static struct ctl_table kern_table[] = { }, { .procname = "nmi_watchdog", - .data = &nmi_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &nmi_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = NMI_WATCHDOG_SYSCTL_PERM, .proc_handler = proc_nmi_watchdog, .extra1 = &zero, -#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) .extra2 = &one, -#else - .extra2 = &zero, -#endif }, { .procname = "watchdog_cpumask", @@ -911,9 +907,9 @@ static struct ctl_table kern_table[] = { #ifdef CONFIG_SOFTLOCKUP_DETECTOR { .procname = "soft_watchdog", - .data = &soft_watchdog_enabled, - .maxlen = sizeof (int), - .mode = 0644, + .data = &soft_watchdog_user_enabled, + .maxlen = sizeof(int), + .mode = 0644, .proc_handler = proc_soft_watchdog, .extra1 = &zero, .extra2 = &one, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index dc498b605d5d..04ea5314f2bc 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -255,14 +255,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void) return &bpf_trace_printk_proto; } -BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +static __always_inline int +get_map_perf_counter(struct bpf_map *map, u64 flags, + u64 *value, u64 *enabled, u64 *running) { struct bpf_array *array = container_of(map, struct bpf_array, map); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; - u64 value = 0; - int err; if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; @@ -275,7 +275,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) if (!ee) return -ENOENT; - err = perf_event_read_local(ee->event, &value); + return perf_event_read_local(ee->event, value, enabled, running); +} + +BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags) +{ + u64 value = 0; + int err; + + err = get_map_perf_counter(map, flags, &value, NULL, NULL); /* * this api is ugly since we miss [-22..-2] range of valid * counter values, but that's uapi @@ -293,6 +301,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_event_read_value_proto = { + .func = bpf_perf_event_read_value, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_UNINIT_MEM, + .arg4_type = ARG_CONST_SIZE, +}; + static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); static __always_inline u64 @@ -499,6 +534,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_output_proto; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto; + case BPF_FUNC_perf_event_read_value: + return &bpf_perf_event_read_value_proto; default: return tracing_func_proto(func_id); } @@ -576,6 +613,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx, + struct bpf_perf_event_value *, buf, u32, size) +{ + int err = -EINVAL; + + if (unlikely(size != sizeof(struct bpf_perf_event_value))) + goto clear; + err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled, + &buf->running); + if (unlikely(err)) + goto clear; + return 0; +clear: + memset(buf, 0, size); + return err; +} + +static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = { + .func = bpf_perf_prog_read_value_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -583,6 +646,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; + case BPF_FUNC_perf_prog_read_value: + return &bpf_perf_prog_read_value_proto_tp; default: return tracing_func_proto(func_id); } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index f5d52024f6b7..6bcb854909c0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -29,20 +29,29 @@ #include <linux/kvm_para.h> #include <linux/kthread.h> -/* Watchdog configuration */ -static DEFINE_MUTEX(watchdog_proc_mutex); - -int __read_mostly nmi_watchdog_enabled; +static DEFINE_MUTEX(watchdog_mutex); #if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG) -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED | - NMI_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED | NMI_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 1 #else -unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +# define WATCHDOG_DEFAULT (SOFT_WATCHDOG_ENABLED) +# define NMI_WATCHDOG_DEFAULT 0 #endif +unsigned long __read_mostly watchdog_enabled; +int __read_mostly watchdog_user_enabled = 1; +int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT; +int __read_mostly soft_watchdog_user_enabled = 1; +int __read_mostly watchdog_thresh = 10; +int __read_mostly nmi_watchdog_available; + +struct cpumask watchdog_allowed_mask __read_mostly; + +struct cpumask watchdog_cpumask __read_mostly; +unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); + #ifdef CONFIG_HARDLOCKUP_DETECTOR -/* boot commands */ /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -56,9 +65,9 @@ unsigned int __read_mostly hardlockup_panic = * kernel command line parameters are parsed, because otherwise it is not * possible to override this in hardlockup_panic_setup(). */ -void hardlockup_detector_disable(void) +void __init hardlockup_detector_disable(void) { - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; } static int __init hardlockup_panic_setup(char *str) @@ -68,48 +77,24 @@ static int __init hardlockup_panic_setup(char *str) else if (!strncmp(str, "nopanic", 7)) hardlockup_panic = 0; else if (!strncmp(str, "0", 1)) - watchdog_enabled &= ~NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 0; else if (!strncmp(str, "1", 1)) - watchdog_enabled |= NMI_WATCHDOG_ENABLED; + nmi_watchdog_user_enabled = 1; return 1; } __setup("nmi_watchdog=", hardlockup_panic_setup); -#endif - -#ifdef CONFIG_SOFTLOCKUP_DETECTOR -int __read_mostly soft_watchdog_enabled; -#endif - -int __read_mostly watchdog_user_enabled; -int __read_mostly watchdog_thresh = 10; - -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; +# ifdef CONFIG_SMP int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#endif -struct cpumask watchdog_cpumask __read_mostly; -unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); -/* - * The 'watchdog_running' variable is set to 1 when the watchdog threads - * are registered/started and is set to 0 when the watchdog threads are - * unregistered/stopped, so it is an indicator whether the threads exist. - */ -static int __read_mostly watchdog_running; -/* - * If a subsystem has a need to deactivate the watchdog temporarily, it - * can use the suspend/resume interface to achieve this. The content of - * the 'watchdog_suspended' variable reflects this state. Existing threads - * are parked/unparked by the lockup_detector_{suspend|resume} functions - * (see comment blocks pertaining to those functions for further details). - * - * 'watchdog_suspended' also prevents threads from being registered/started - * or unregistered/stopped via parameters in /proc/sys/kernel, so the state - * of 'watchdog_running' cannot change while the watchdog is deactivated - * temporarily (see related code in 'proc' handlers). - */ -int __read_mostly watchdog_suspended; +static int __init hardlockup_all_cpu_backtrace_setup(char *str) +{ + sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); + return 1; +} +__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); +# endif /* CONFIG_SMP */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* * These functions can be overridden if an architecture implements its @@ -121,36 +106,68 @@ int __read_mostly watchdog_suspended; */ int __weak watchdog_nmi_enable(unsigned int cpu) { + hardlockup_detector_perf_enable(); return 0; } + void __weak watchdog_nmi_disable(unsigned int cpu) { + hardlockup_detector_perf_disable(); } -/* - * watchdog_nmi_reconfigure can be implemented to be notified after any - * watchdog configuration change. The arch hardlockup watchdog should - * respond to the following variables: - * - nmi_watchdog_enabled +/* Return 0, if a NMI watchdog is available. Error code otherwise */ +int __weak __init watchdog_nmi_probe(void) +{ + return hardlockup_detector_perf_init(); +} + +/** + * watchdog_nmi_stop - Stop the watchdog for reconfiguration + * + * The reconfiguration steps are: + * watchdog_nmi_stop(); + * update_variables(); + * watchdog_nmi_start(); + */ +void __weak watchdog_nmi_stop(void) { } + +/** + * watchdog_nmi_start - Start the watchdog after reconfiguration + * + * Counterpart to watchdog_nmi_stop(). + * + * The following variables have been updated in update_variables() and + * contain the currently valid configuration: + * - watchdog_enabled * - watchdog_thresh * - watchdog_cpumask - * - sysctl_hardlockup_all_cpu_backtrace - * - hardlockup_panic - * - watchdog_suspended */ -void __weak watchdog_nmi_reconfigure(void) +void __weak watchdog_nmi_start(void) { } + +/** + * lockup_detector_update_enable - Update the sysctl enable bit + * + * Caller needs to make sure that the NMI/perf watchdogs are off, so this + * can't race with watchdog_nmi_disable(). + */ +static void lockup_detector_update_enable(void) { + watchdog_enabled = 0; + if (!watchdog_user_enabled) + return; + if (nmi_watchdog_available && nmi_watchdog_user_enabled) + watchdog_enabled |= NMI_WATCHDOG_ENABLED; + if (soft_watchdog_user_enabled) + watchdog_enabled |= SOFT_WATCHDOG_ENABLED; } - #ifdef CONFIG_SOFTLOCKUP_DETECTOR -/* Helper for online, unparked cpus. */ -#define for_each_watchdog_cpu(cpu) \ - for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) - -atomic_t watchdog_park_in_progress = ATOMIC_INIT(0); +/* Global variables, exported for sysctl */ +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; +static bool softlockup_threads_initialized __read_mostly; static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -164,50 +181,40 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -unsigned int __read_mostly softlockup_panic = - CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; - static int __init softlockup_panic_setup(char *str) { softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; } __setup("softlockup_panic=", softlockup_panic_setup); static int __init nowatchdog_setup(char *str) { - watchdog_enabled = 0; + watchdog_user_enabled = 0; return 1; } __setup("nowatchdog", nowatchdog_setup); static int __init nosoftlockup_setup(char *str) { - watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED; + soft_watchdog_user_enabled = 0; return 1; } __setup("nosoftlockup", nosoftlockup_setup); #ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; + static int __init softlockup_all_cpu_backtrace_setup(char *str) { - sysctl_softlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); + sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); return 1; } __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = - !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -#endif #endif +static void __lockup_detector_cleanup(void); + /* * Hard-lockup warnings should be triggered after just a few seconds. Soft- * lockups can have false positives under extreme conditions. So we generally @@ -278,11 +285,15 @@ void touch_all_softlockup_watchdogs(void) int cpu; /* - * this is done lockless - * do we care if a 0 races with a timestamp? - * all it means is the softlock check starts one cycle later + * watchdog_mutex cannpt be taken here, as this might be called + * from (soft)interrupt context, so the access to + * watchdog_allowed_cpumask might race with a concurrent update. + * + * The watchdog time stamp can race against a concurrent real + * update as well, the only side effect might be a cycle delay for + * the softlockup check. */ - for_each_watchdog_cpu(cpu) + for_each_cpu(cpu, &watchdog_allowed_mask) per_cpu(watchdog_touch_ts, cpu) = 0; wq_watchdog_touch(-1); } @@ -322,9 +333,6 @@ static void watchdog_interrupt_count(void) __this_cpu_inc(hrtimer_interrupts); } -static int watchdog_enable_all_cpus(void); -static void watchdog_disable_all_cpus(void); - /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { @@ -333,7 +341,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; - if (atomic_read(&watchdog_park_in_progress) != 0) + if (!watchdog_enabled) return HRTIMER_NORESTART; /* kick the hardlockup detector */ @@ -447,32 +455,38 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio) static void watchdog_enable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); - /* kick off the timer for the hardlockup detector */ + /* + * Start the timer first to prevent the NMI watchdog triggering + * before the timer has a chance to fire. + */ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hrtimer->function = watchdog_timer_fn; - - /* Enable the perf event */ - watchdog_nmi_enable(cpu); - - /* done here because hrtimer_start can only pin to smp_processor_id() */ hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL_PINNED); - /* initialize timestamp */ - watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); + /* Initialize timestamp */ __touch_watchdog(); + /* Enable the perf event */ + if (watchdog_enabled & NMI_WATCHDOG_ENABLED) + watchdog_nmi_enable(cpu); + + watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); } static void watchdog_disable(unsigned int cpu) { - struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); + struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); watchdog_set_prio(SCHED_NORMAL, 0); - hrtimer_cancel(hrtimer); - /* disable the perf event */ + /* + * Disable the perf event first. That prevents that a large delay + * between disabling the timer and disabling the perf event causes + * the perf NMI to detect a false positive. + */ watchdog_nmi_disable(cpu); + hrtimer_cancel(hrtimer); } static void watchdog_cleanup(unsigned int cpu, bool online) @@ -499,21 +513,6 @@ static void watchdog(unsigned int cpu) __this_cpu_write(soft_lockup_hrtimer_cnt, __this_cpu_read(hrtimer_interrupts)); __touch_watchdog(); - - /* - * watchdog_nmi_enable() clears the NMI_WATCHDOG_ENABLED bit in the - * failure path. Check for failures that can occur asynchronously - - * for example, when CPUs are on-lined - and shut down the hardware - * perf event on each CPU accordingly. - * - * The only non-obvious place this bit can be cleared is through - * watchdog_nmi_enable(), so a pr_info() is placed there. Placing a - * pr_info here would be too noisy as it would result in a message - * every few seconds if the hardlockup was disabled but the softlockup - * enabled. - */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - watchdog_nmi_disable(cpu); } static struct smp_hotplug_thread watchdog_threads = { @@ -527,295 +526,174 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -/* - * park all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function returns an error if kthread_park() of a watchdog thread - * fails. In this situation, the watchdog threads of some CPUs can already - * be parked and the watchdog threads of other CPUs can still be runnable. - * Callers are expected to handle this special condition as appropriate in - * their context. - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static int watchdog_park_threads(void) +static void softlockup_update_smpboot_threads(void) { - int cpu, ret = 0; + lockdep_assert_held(&watchdog_mutex); - atomic_set(&watchdog_park_in_progress, 1); + if (!softlockup_threads_initialized) + return; - for_each_watchdog_cpu(cpu) { - ret = kthread_park(per_cpu(softlockup_watchdog, cpu)); - if (ret) - break; - } - - atomic_set(&watchdog_park_in_progress, 0); - - return ret; + smpboot_update_cpumask_percpu_thread(&watchdog_threads, + &watchdog_allowed_mask); } -/* - * unpark all watchdog threads that are specified in 'watchdog_cpumask' - * - * This function may only be called in a context that is protected against - * races with CPU hotplug - for example, via get_online_cpus(). - */ -static void watchdog_unpark_threads(void) +/* Temporarily park all watchdog threads */ +static void softlockup_park_all_threads(void) { - int cpu; - - for_each_watchdog_cpu(cpu) - kthread_unpark(per_cpu(softlockup_watchdog, cpu)); + cpumask_clear(&watchdog_allowed_mask); + softlockup_update_smpboot_threads(); } -static int update_watchdog_all_cpus(void) +/* Unpark enabled threads */ +static void softlockup_unpark_threads(void) { - int ret; - - ret = watchdog_park_threads(); - if (ret) - return ret; - - watchdog_unpark_threads(); - - return 0; + cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); + softlockup_update_smpboot_threads(); } -static int watchdog_enable_all_cpus(void) +static void lockup_detector_reconfigure(void) { - int err = 0; - - if (!watchdog_running) { - err = smpboot_register_percpu_thread_cpumask(&watchdog_threads, - &watchdog_cpumask); - if (err) - pr_err("Failed to create watchdog threads, disabled\n"); - else - watchdog_running = 1; - } else { - /* - * Enable/disable the lockup detectors or - * change the sample period 'on the fly'. - */ - err = update_watchdog_all_cpus(); - - if (err) { - watchdog_disable_all_cpus(); - pr_err("Failed to update lockup detectors, disabled\n"); - } - } - - if (err) - watchdog_enabled = 0; - - return err; + cpus_read_lock(); + watchdog_nmi_stop(); + softlockup_park_all_threads(); + set_sample_period(); + lockup_detector_update_enable(); + if (watchdog_enabled && watchdog_thresh) + softlockup_unpark_threads(); + watchdog_nmi_start(); + cpus_read_unlock(); + /* + * Must be called outside the cpus locked section to prevent + * recursive locking in the perf code. + */ + __lockup_detector_cleanup(); } -static void watchdog_disable_all_cpus(void) +/* + * Create the watchdog thread infrastructure and configure the detector(s). + * + * The threads are not unparked as watchdog_allowed_mask is empty. When + * the threads are sucessfully initialized, take the proper locks and + * unpark the threads in the watchdog_cpumask if the watchdog is enabled. + */ +static __init void lockup_detector_setup(void) { - if (watchdog_running) { - watchdog_running = 0; - smpboot_unregister_percpu_thread(&watchdog_threads); - } -} + int ret; -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) -{ - return smpboot_update_cpumask_percpu_thread( - &watchdog_threads, &watchdog_cpumask); -} -#endif + /* + * If sysctl is off and watchdog got disabled on the command line, + * nothing to do here. + */ + lockup_detector_update_enable(); -#else /* SOFTLOCKUP */ -static int watchdog_park_threads(void) -{ - return 0; -} + if (!IS_ENABLED(CONFIG_SYSCTL) && + !(watchdog_enabled && watchdog_thresh)) + return; -static void watchdog_unpark_threads(void) -{ -} + ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads, + &watchdog_allowed_mask); + if (ret) { + pr_err("Failed to initialize soft lockup detector threads\n"); + return; + } -static int watchdog_enable_all_cpus(void) -{ - return 0; + mutex_lock(&watchdog_mutex); + softlockup_threads_initialized = true; + lockup_detector_reconfigure(); + mutex_unlock(&watchdog_mutex); } -static void watchdog_disable_all_cpus(void) +#else /* CONFIG_SOFTLOCKUP_DETECTOR */ +static inline int watchdog_park_threads(void) { return 0; } +static inline void watchdog_unpark_threads(void) { } +static inline int watchdog_enable_all_cpus(void) { return 0; } +static inline void watchdog_disable_all_cpus(void) { } +static void lockup_detector_reconfigure(void) { + cpus_read_lock(); + watchdog_nmi_stop(); + lockup_detector_update_enable(); + watchdog_nmi_start(); + cpus_read_unlock(); } - -#ifdef CONFIG_SYSCTL -static int watchdog_update_cpus(void) +static inline void lockup_detector_setup(void) { - return 0; + lockup_detector_reconfigure(); } -#endif +#endif /* !CONFIG_SOFTLOCKUP_DETECTOR */ -static void set_sample_period(void) +static void __lockup_detector_cleanup(void) { + lockdep_assert_held(&watchdog_mutex); + hardlockup_detector_perf_cleanup(); } -#endif /* SOFTLOCKUP */ -/* - * Suspend the hard and soft lockup detector by parking the watchdog threads. +/** + * lockup_detector_cleanup - Cleanup after cpu hotplug or sysctl changes + * + * Caller must not hold the cpu hotplug rwsem. */ -int lockup_detector_suspend(void) +void lockup_detector_cleanup(void) { - int ret = 0; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - /* - * Multiple suspend requests can be active in parallel (counted by - * the 'watchdog_suspended' variable). If the watchdog threads are - * running, the first caller takes care that they will be parked. - * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related code in 'proc' handlers). - */ - if (watchdog_running && !watchdog_suspended) - ret = watchdog_park_threads(); - - if (ret == 0) - watchdog_suspended++; - else { - watchdog_disable_all_cpus(); - pr_err("Failed to suspend lockup detectors, disabled\n"); - watchdog_enabled = 0; - } - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - - return ret; + mutex_lock(&watchdog_mutex); + __lockup_detector_cleanup(); + mutex_unlock(&watchdog_mutex); } -/* - * Resume the hard and soft lockup detector by unparking the watchdog threads. +/** + * lockup_detector_soft_poweroff - Interface to stop lockup detector(s) + * + * Special interface for parisc. It prevents lockup detector warnings from + * the default pm_poweroff() function which busy loops forever. */ -void lockup_detector_resume(void) +void lockup_detector_soft_poweroff(void) { - mutex_lock(&watchdog_proc_mutex); - - watchdog_suspended--; - /* - * The watchdog threads are unparked if they were previously running - * and if there is no more active suspend request. - */ - if (watchdog_running && !watchdog_suspended) - watchdog_unpark_threads(); - - watchdog_nmi_reconfigure(); - - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + watchdog_enabled = 0; } #ifdef CONFIG_SYSCTL -/* - * Update the run state of the lockup detectors. - */ -static int proc_watchdog_update(void) +/* Propagate any changes to the watchdog threads */ +static void proc_watchdog_update(void) { - int err = 0; - - /* - * Watchdog threads won't be started if they are already active. - * The 'watchdog_running' variable in watchdog_*_all_cpus() takes - * care of this. If those threads are already active, the sample - * period will be updated and the lockup detectors will be enabled - * or disabled 'on the fly'. - */ - if (watchdog_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - - watchdog_nmi_reconfigure(); - - return err; - + /* Remove impossible cpus to keep sysctl output clean. */ + cpumask_and(&watchdog_cpumask, &watchdog_cpumask, cpu_possible_mask); + lockup_detector_reconfigure(); } /* * common function for watchdog, nmi_watchdog and soft_watchdog parameter * - * caller | table->data points to | 'which' contains the flag(s) - * -------------------|-----------------------|----------------------------- - * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED or'ed - * | | with SOFT_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_nmi_watchdog | nmi_watchdog_enabled | NMI_WATCHDOG_ENABLED - * -------------------|-----------------------|----------------------------- - * proc_soft_watchdog | soft_watchdog_enabled | SOFT_WATCHDOG_ENABLED + * caller | table->data points to | 'which' + * -------------------|----------------------------|-------------------------- + * proc_watchdog | watchdog_user_enabled | NMI_WATCHDOG_ENABLED | + * | | SOFT_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_nmi_watchdog | nmi_watchdog_user_enabled | NMI_WATCHDOG_ENABLED + * -------------------|----------------------------|-------------------------- + * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - int *watchdog_param = (int *)table->data; + int err, old, *param = table->data; - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); + mutex_lock(&watchdog_mutex); - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } - - /* - * If the parameter is being read return the state of the corresponding - * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the - * run state of the lockup detectors. - */ if (!write) { - *watchdog_param = (watchdog_enabled & which) != 0; + /* + * On read synchronize the userspace interface. This is a + * racy snapshot. + */ + *param = (watchdog_enabled & which) != 0; err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); } else { + old = READ_ONCE(*param); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err) - goto out; - - /* - * There is a race window between fetching the current value - * from 'watchdog_enabled' and storing the new value. During - * this race window, watchdog_nmi_enable() can sneak in and - * clear the NMI_WATCHDOG_ENABLED bit in 'watchdog_enabled'. - * The 'cmpxchg' detects this race and the loop retries. - */ - do { - old = watchdog_enabled; - /* - * If the parameter value is not zero set the - * corresponding bit(s), else clear it(them). - */ - if (*watchdog_param) - new = old | which; - else - new = old & ~which; - } while (cmpxchg(&watchdog_enabled, old, new) != old); - - /* - * Update the run state of the lockup detectors. There is _no_ - * need to check the value returned by proc_watchdog_update() - * and to restore the previous value of 'watchdog_enabled' as - * both lockup detectors are disabled if proc_watchdog_update() - * returns an error. - */ - if (old == new) - goto out; - - err = proc_watchdog_update(); + if (!err && old != READ_ONCE(*param)) + proc_watchdog_update(); } -out: - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + mutex_unlock(&watchdog_mutex); return err; } @@ -835,6 +713,8 @@ int proc_watchdog(struct ctl_table *table, int write, int proc_nmi_watchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { + if (!nmi_watchdog_available && write) + return -ENOTSUPP; return proc_watchdog_common(NMI_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); } @@ -855,39 +735,17 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old, new; - - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); + int err, old; - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } + mutex_lock(&watchdog_mutex); - old = ACCESS_ONCE(watchdog_thresh); + old = READ_ONCE(watchdog_thresh); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (err || !write) - goto out; - - /* - * Update the sample period. Restore on failure. - */ - new = ACCESS_ONCE(watchdog_thresh); - if (old == new) - goto out; + if (!err && write && old != READ_ONCE(watchdog_thresh)) + proc_watchdog_update(); - set_sample_period(); - err = proc_watchdog_update(); - if (err) { - watchdog_thresh = old; - set_sample_period(); - } -out: - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + mutex_unlock(&watchdog_mutex); return err; } @@ -902,45 +760,19 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write, { int err; - get_online_cpus(); - mutex_lock(&watchdog_proc_mutex); - - if (watchdog_suspended) { - /* no parameter changes allowed while watchdog is suspended */ - err = -EAGAIN; - goto out; - } + mutex_lock(&watchdog_mutex); err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); - if (!err && write) { - /* Remove impossible cpus to keep sysctl output cleaner. */ - cpumask_and(&watchdog_cpumask, &watchdog_cpumask, - cpu_possible_mask); - - if (watchdog_running) { - /* - * Failure would be due to being unable to allocate - * a temporary cpumask, so we are likely not in a - * position to do much else to make things better. - */ - if (watchdog_update_cpus() != 0) - pr_err("cpumask update failed\n"); - } + if (!err && write) + proc_watchdog_update(); - watchdog_nmi_reconfigure(); - } -out: - mutex_unlock(&watchdog_proc_mutex); - put_online_cpus(); + mutex_unlock(&watchdog_mutex); return err; } - #endif /* CONFIG_SYSCTL */ void __init lockup_detector_init(void) { - set_sample_period(); - #ifdef CONFIG_NO_HZ_FULL if (tick_nohz_full_enabled()) { pr_info("Disabling watchdog on nohz_full cores by default\n"); @@ -951,6 +783,7 @@ void __init lockup_detector_init(void) cpumask_copy(&watchdog_cpumask, cpu_possible_mask); #endif - if (watchdog_enabled) - watchdog_enable_all_cpus(); + if (!watchdog_nmi_probe()) + nmi_watchdog_available = true; + lockup_detector_setup(); } diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 3a09ea1b1d3d..71a62ceacdc8 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -21,8 +21,10 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +static struct cpumask dead_events_mask; static unsigned long hardlockup_allcpu_dumped; +static unsigned int watchdog_cpus; void arch_touch_nmi_watchdog(void) { @@ -103,15 +105,12 @@ static struct perf_event_attr wd_hw_attr = { /* Callback function for perf event subsystem */ static void watchdog_overflow_callback(struct perf_event *event, - struct perf_sample_data *data, - struct pt_regs *regs) + struct perf_sample_data *data, + struct pt_regs *regs) { /* Ensure the watchdog never gets throttled */ event->hw.interrupts = 0; - if (atomic_read(&watchdog_park_in_progress) != 0) - return; - if (__this_cpu_read(watchdog_nmi_touch) == true) { __this_cpu_write(watchdog_nmi_touch, false); return; @@ -160,104 +159,131 @@ static void watchdog_overflow_callback(struct perf_event *event, return; } -/* - * People like the simple clean cpu node info on boot. - * Reduce the watchdog noise by only printing messages - * that are different from what cpu0 displayed. - */ -static unsigned long firstcpu_err; -static atomic_t watchdog_cpus; - -int watchdog_nmi_enable(unsigned int cpu) +static int hardlockup_detector_event_create(void) { + unsigned int cpu = smp_processor_id(); struct perf_event_attr *wd_attr; - struct perf_event *event = per_cpu(watchdog_ev, cpu); - int firstcpu = 0; - - /* nothing to do if the hard lockup detector is disabled */ - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto out; - - /* is it already setup and enabled? */ - if (event && event->state > PERF_EVENT_STATE_OFF) - goto out; - - /* it is setup but not enabled */ - if (event != NULL) - goto out_enable; - - if (atomic_inc_return(&watchdog_cpus) == 1) - firstcpu = 1; + struct perf_event *evt; wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); /* Try to register using hardware perf events */ - event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, + watchdog_overflow_callback, NULL); + if (IS_ERR(evt)) { + pr_info("Perf event create on CPU %d failed with %ld\n", cpu, + PTR_ERR(evt)); + return PTR_ERR(evt); + } + this_cpu_write(watchdog_ev, evt); + return 0; +} - /* save the first cpu's error for future comparision */ - if (firstcpu && IS_ERR(event)) - firstcpu_err = PTR_ERR(event); +/** + * hardlockup_detector_perf_enable - Enable the local event + */ +void hardlockup_detector_perf_enable(void) +{ + if (hardlockup_detector_event_create()) + return; - if (!IS_ERR(event)) { - /* only print for the first cpu initialized */ - if (firstcpu || firstcpu_err) - pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); - goto out_save; - } + if (!watchdog_cpus++) + pr_info("Enabled. Permanently consumes one hw-PMU counter.\n"); - /* - * Disable the hard lockup detector if _any_ CPU fails to set up - * set up the hardware perf event. The watchdog() function checks - * the NMI_WATCHDOG_ENABLED bit periodically. - * - * The barriers are for syncing up watchdog_enabled across all the - * cpus, as clear_bit() does not use barriers. - */ - smp_mb__before_atomic(); - clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled); - smp_mb__after_atomic(); - - /* skip displaying the same error again */ - if (!firstcpu && (PTR_ERR(event) == firstcpu_err)) - return PTR_ERR(event); - - /* vary the KERN level based on the returned errno */ - if (PTR_ERR(event) == -EOPNOTSUPP) - pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); - else if (PTR_ERR(event) == -ENOENT) - pr_warn("disabled (cpu%i): hardware events not enabled\n", - cpu); - else - pr_err("disabled (cpu%i): unable to create perf event: %ld\n", - cpu, PTR_ERR(event)); - - pr_info("Shutting down hard lockup detector on all cpus\n"); - - return PTR_ERR(event); - - /* success path */ -out_save: - per_cpu(watchdog_ev, cpu) = event; -out_enable: - perf_event_enable(per_cpu(watchdog_ev, cpu)); -out: - return 0; + perf_event_enable(this_cpu_read(watchdog_ev)); } -void watchdog_nmi_disable(unsigned int cpu) +/** + * hardlockup_detector_perf_disable - Disable the local event + */ +void hardlockup_detector_perf_disable(void) { - struct perf_event *event = per_cpu(watchdog_ev, cpu); + struct perf_event *event = this_cpu_read(watchdog_ev); if (event) { perf_event_disable(event); + cpumask_set_cpu(smp_processor_id(), &dead_events_mask); + watchdog_cpus--; + } +} + +/** + * hardlockup_detector_perf_cleanup - Cleanup disabled events and destroy them + * + * Called from lockup_detector_cleanup(). Serialized by the caller. + */ +void hardlockup_detector_perf_cleanup(void) +{ + int cpu; + + for_each_cpu(cpu, &dead_events_mask) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* + * Required because for_each_cpu() reports unconditionally + * CPU0 as set on UP kernels. Sigh. + */ + if (event) + perf_event_release_kernel(event); per_cpu(watchdog_ev, cpu) = NULL; + } + cpumask_clear(&dead_events_mask); +} + +/** + * hardlockup_detector_perf_stop - Globally stop watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_stop(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_disable(event); + } +} - /* should be in cleanup, but blocks oprofile */ - perf_event_release_kernel(event); +/** + * hardlockup_detector_perf_restart - Globally restart watchdog events + * + * Special interface for x86 to handle the perf HT bug. + */ +void __init hardlockup_detector_perf_restart(void) +{ + int cpu; + + lockdep_assert_cpus_held(); + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return; + + for_each_online_cpu(cpu) { + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) + perf_event_enable(event); + } +} + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + int ret = hardlockup_detector_event_create(); - /* watchdog_nmi_enable() expects this to be zero initially. */ - if (atomic_dec_and_test(&watchdog_cpus)) - firstcpu_err = 0; + if (ret) { + pr_info("Perf NMI watchdog permanently disabled\n"); + } else { + perf_event_release_kernel(this_cpu_read(watchdog_ev)); + this_cpu_write(watchdog_ev, NULL); } + return ret; } diff --git a/lib/once.c b/lib/once.c index 05c8604627eb..831c5a6b0bb2 100644 --- a/lib/once.c +++ b/lib/once.c @@ -5,7 +5,7 @@ struct once_work { struct work_struct work; - struct static_key *key; + struct static_key_true *key; }; static void once_deferred(struct work_struct *w) @@ -14,11 +14,11 @@ static void once_deferred(struct work_struct *w) work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); - static_key_slow_dec(work->key); + static_branch_disable(work->key); kfree(work); } -static void once_disable_jump(struct static_key *key) +static void once_disable_jump(struct static_key_true *key) { struct once_work *w; @@ -51,7 +51,7 @@ bool __do_once_start(bool *done, unsigned long *flags) } EXPORT_SYMBOL(__do_once_start); -void __do_once_done(bool *done, struct static_key *once_key, +void __do_once_done(bool *done, struct static_key_true *once_key, unsigned long *flags) __releases(once_lock) { diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 83ba5483455a..1b659ab652fb 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -916,8 +916,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) u16 tvlv_len = 0; unsigned long send_time; - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) return; /* the interface gets activated here to avoid race conditions between @@ -1264,7 +1264,7 @@ static bool batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, * drops as they can't send and receive at the same time. */ tq_iface_penalty = BATADV_TQ_MAX_VALUE; - if (if_outgoing && (if_incoming == if_outgoing) && + if (if_outgoing && if_incoming == if_outgoing && batadv_is_wifi_hardif(if_outgoing)) tq_iface_penalty = batadv_hop_penalty(BATADV_TQ_MAX_VALUE, bat_priv); @@ -1369,7 +1369,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, ret = BATADV_NEIGH_DUP; } else { set_mark = 0; - if (is_dup && (ret != BATADV_NEIGH_DUP)) + if (is_dup && ret != BATADV_NEIGH_DUP) ret = BATADV_ORIG_DUP; } @@ -1515,7 +1515,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, /* drop packet if sender is not a direct neighbor and if we * don't route towards it */ - if (!is_single_hop_neigh && (!orig_neigh_router)) { + if (!is_single_hop_neigh && !orig_neigh_router) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM via unknown neighbor!\n"); goto out_neigh; @@ -1535,7 +1535,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, sameseq = orig_ifinfo->last_real_seqno == ntohl(ogm_packet->seqno); similar_ttl = (orig_ifinfo->last_ttl - 3) <= ogm_packet->ttl; - if (is_bidirect && ((dup_status == BATADV_NO_DUP) || + if (is_bidirect && (dup_status == BATADV_NO_DUP || (sameseq && similar_ttl))) { batadv_iv_ogm_orig_update(bat_priv, orig_node, orig_ifinfo, ethhdr, @@ -1553,8 +1553,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, /* OGMs from secondary interfaces should only scheduled once * per interface where it has been received, not multiple times */ - if ((ogm_packet->ttl <= 2) && - (if_incoming != if_outgoing)) { + if (ogm_packet->ttl <= 2 && + if_incoming != if_outgoing) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Drop packet: OGM from secondary interface and wrong outgoing interface\n"); goto out_neigh; @@ -1590,7 +1590,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, if_incoming, if_outgoing); out_neigh: - if ((orig_neigh_node) && (!is_single_hop_neigh)) + if (orig_neigh_node && !is_single_hop_neigh) batadv_orig_node_put(orig_neigh_node); out: if (router_ifinfo) @@ -2523,9 +2523,9 @@ batadv_iv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tmp_gw_factor *= 100 * 100; tmp_gw_factor >>= 18; - if ((tmp_gw_factor > max_gw_factor) || - ((tmp_gw_factor == max_gw_factor) && - (tq_avg > max_tq))) { + if (tmp_gw_factor > max_gw_factor || + (tmp_gw_factor == max_gw_factor && + tq_avg > max_tq)) { if (curr_gw) batadv_gw_node_put(curr_gw); curr_gw = gw_node; diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 4e2724c5b33d..93ef1c06227e 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -767,7 +767,7 @@ batadv_v_gw_get_best_gw_node(struct batadv_priv *bat_priv) if (batadv_v_gw_throughput_get(gw_node, &bw) < 0) goto next; - if (curr_gw && (bw <= max_bw)) + if (curr_gw && bw <= max_bw) goto next; if (curr_gw) diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index bd1064d98e16..1de992c58b35 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -134,7 +134,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX; throughput = link_settings.base.speed; - if (throughput && (throughput != SPEED_UNKNOWN)) + if (throughput && throughput != SPEED_UNKNOWN) return throughput * 10; } @@ -263,8 +263,8 @@ static void batadv_v_elp_periodic_work(struct work_struct *work) goto out; /* we are in the process of shutting this interface down */ - if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) || - (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED)) + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE || + hard_iface->if_status == BATADV_IF_TO_BE_REMOVED) goto out; /* the interface was enabled but may not be ready yet */ diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 8be61734fc43..c251445a42a0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -304,8 +304,8 @@ static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv, * due to the store & forward characteristics of WIFI. * Very low throughput values are the exception. */ - if ((throughput > 10) && - (if_incoming == if_outgoing) && + if (throughput > 10 && + if_incoming == if_outgoing && !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX)) return throughput / 2; @@ -455,7 +455,7 @@ static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv, /* drop packets with old seqnos, however accept the first packet after * a host has been rebooted. */ - if ((seq_diff < 0) && !protection_started) + if (seq_diff < 0 && !protection_started) goto out; neigh_node->last_seen = jiffies; @@ -568,8 +568,8 @@ static bool batadv_v_ogm_route_update(struct batadv_priv *bat_priv, router_throughput = router_ifinfo->bat_v.throughput; neigh_throughput = neigh_ifinfo->bat_v.throughput; - if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) && - (router_throughput >= neigh_throughput)) + if (neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF && + router_throughput >= neigh_throughput) goto out; } @@ -621,7 +621,7 @@ batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv, return; /* only unknown & newer OGMs contain TVLVs we are interested in */ - if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT)) + if (seqno_age > 0 && if_outgoing == BATADV_IF_DEFAULT) batadv_tvlv_containers_process(bat_priv, true, orig_node, NULL, NULL, (unsigned char *)(ogm2 + 1), diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index b6cfa78e9381..760c0de72582 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -492,8 +492,8 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, /* this is an hash collision with the temporary selected node. Choose * the one with the lowest address */ - if ((tmp_max == max) && max_orig_node && - (batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0)) + if (tmp_max == max && max_orig_node && + batadv_compare_eth(candidate->orig, max_orig_node->orig) > 0) goto out; ret = true; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index de9955d5224d..10d521f0b17f 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -248,12 +248,12 @@ void batadv_gw_election(struct batadv_priv *bat_priv) } } - if ((curr_gw) && (!next_gw)) { + if (curr_gw && !next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Removing selected gateway - no gateway in range\n"); batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL); - } else if ((!curr_gw) && (next_gw)) { + } else if (!curr_gw && next_gw) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Adding route to gateway %pM (bandwidth: %u.%u/%u.%u MBit, tq: %i)\n", next_gw->orig_node->orig, @@ -411,8 +411,8 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, goto out; } - if ((gw_node->bandwidth_down == ntohl(gateway->bandwidth_down)) && - (gw_node->bandwidth_up == ntohl(gateway->bandwidth_up))) + if (gw_node->bandwidth_down == ntohl(gateway->bandwidth_down) && + gw_node->bandwidth_up == ntohl(gateway->bandwidth_up)) goto out; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 33940c5c74a8..2c26039c23fc 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -56,8 +56,8 @@ bool batadv_parse_throughput(struct net_device *net_dev, char *buff, if (strncasecmp(tmp_ptr, "mbit", 4) == 0) bw_unit_type = BATADV_BW_UNIT_MBIT; - if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) || - (bw_unit_type == BATADV_BW_UNIT_MBIT)) + if (strncasecmp(tmp_ptr, "kbit", 4) == 0 || + bw_unit_type == BATADV_BW_UNIT_MBIT) *tmp_ptr = '\0'; } @@ -190,7 +190,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, if (!up_new) up_new = 1; - if ((down_curr == down_new) && (up_curr == up_new)) + if (down_curr == down_new && up_curr == up_new) return count; batadv_gw_reselect(bat_priv); @@ -224,16 +224,16 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, /* only fetch the tvlv value if the handler wasn't called via the * CIFNOTFND flag and if there is data to fetch */ - if ((flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) || - (tvlv_value_len < sizeof(gateway))) { + if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND || + tvlv_value_len < sizeof(gateway)) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } else { gateway_ptr = tvlv_value; gateway.bandwidth_down = gateway_ptr->bandwidth_down; gateway.bandwidth_up = gateway_ptr->bandwidth_up; - if ((gateway.bandwidth_down == 0) || - (gateway.bandwidth_up == 0)) { + if (gateway.bandwidth_down == 0 || + gateway.bandwidth_up == 0) { gateway.bandwidth_down = 0; gateway.bandwidth_up = 0; } @@ -242,8 +242,8 @@ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig, &gateway); /* restart gateway selection */ - if ((gateway.bandwidth_down != 0) && - (atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT)) + if (gateway.bandwidth_down != 0 && + atomic_read(&bat_priv->gw.mode) == BATADV_GW_MODE_CLIENT) batadv_gw_check_election(bat_priv, orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f7b413b9297e..4e3d5340ad96 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -504,8 +504,8 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->net_dev == net_dev) @@ -568,8 +568,8 @@ int batadv_hardif_min_mtu(struct net_device *soft_iface) rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) continue; if (hard_iface->soft_iface != soft_iface) @@ -654,8 +654,8 @@ out: static void batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface) { - if ((hard_iface->if_status != BATADV_IF_ACTIVE) && - (hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED)) + if (hard_iface->if_status != BATADV_IF_ACTIVE && + hard_iface->if_status != BATADV_IF_TO_BE_ACTIVATED) return; hard_iface->if_status = BATADV_IF_INACTIVE; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 8ead292886d1..bded31121d12 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -132,10 +132,10 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, size_t packet_len; int error; - if ((file->f_flags & O_NONBLOCK) && (socket_client->queue_len == 0)) + if ((file->f_flags & O_NONBLOCK) && socket_client->queue_len == 0) return -EAGAIN; - if ((!buf) || (count < sizeof(struct batadv_icmp_packet))) + if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; if (!access_ok(VERIFY_WRITE, buf, count)) diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index fb381fb26a66..4daed7ad46f2 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -73,8 +73,8 @@ * list traversals just rcu-locked */ struct list_head batadv_hardif_list; -static int (*batadv_rx_handler[256])(struct sk_buff *, - struct batadv_hard_iface *); +static int (*batadv_rx_handler[256])(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); unsigned char batadv_broadcast_addr[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; @@ -540,12 +540,12 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { - int (*curr)(struct sk_buff *, - struct batadv_hard_iface *); + int (*curr)(struct sk_buff *skb, + struct batadv_hard_iface *recv_if); curr = batadv_rx_handler[packet_type]; - if ((curr != batadv_recv_unhandled_packet) && - (curr != batadv_recv_unhandled_unicast_packet)) + if (curr != batadv_recv_unhandled_packet && + curr != batadv_recv_unhandled_unicast_packet) return -EBUSY; batadv_rx_handler[packet_type] = recv_handler; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 05cc7637c064..edb2f239d04d 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2017.3" +#define BATADV_SOURCE_VERSION "2017.4" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index d327670641ac..e553a8770a89 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1126,7 +1126,7 @@ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, bool orig_initialized; if (orig_mcast_enabled && tvlv_value && - (tvlv_value_len >= sizeof(mcast_flags))) + tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; spin_lock_bh(&orig->mcast_handler_lock); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 8e2a4b205257..2967b86c13da 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1062,9 +1062,9 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv, continue; /* don't purge if the interface is not (going) down */ - if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && - (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && - (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + if (if_outgoing->if_status != BATADV_IF_INACTIVE && + if_outgoing->if_status != BATADV_IF_NOT_IN_USE && + if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -1106,9 +1106,9 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv, continue; /* don't purge if the interface is not (going) down */ - if ((if_outgoing->if_status != BATADV_IF_INACTIVE) && - (if_outgoing->if_status != BATADV_IF_NOT_IN_USE) && - (if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED)) + if (if_outgoing->if_status != BATADV_IF_INACTIVE && + if_outgoing->if_status != BATADV_IF_NOT_IN_USE && + if_outgoing->if_status != BATADV_IF_TO_BE_REMOVED) continue; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, @@ -1155,13 +1155,13 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv, last_seen = neigh_node->last_seen; if_incoming = neigh_node->if_incoming; - if ((batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT)) || - (if_incoming->if_status == BATADV_IF_INACTIVE) || - (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || - (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) { - if ((if_incoming->if_status == BATADV_IF_INACTIVE) || - (if_incoming->if_status == BATADV_IF_NOT_IN_USE) || - (if_incoming->if_status == BATADV_IF_TO_BE_REMOVED)) + if (batadv_has_timed_out(last_seen, BATADV_PURGE_TIMEOUT) || + if_incoming->if_status == BATADV_IF_INACTIVE || + if_incoming->if_status == BATADV_IF_NOT_IN_USE || + if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) { + if (if_incoming->if_status == BATADV_IF_INACTIVE || + if_incoming->if_status == BATADV_IF_NOT_IN_USE || + if_incoming->if_status == BATADV_IF_TO_BE_REMOVED) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "neighbor purge: originator %pM, neighbor: %pM, iface: %s\n", orig_node->orig, neigh_node->addr, diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index f10e3ff26f9d..40d9bf3e5bfe 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -93,14 +93,14 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, batadv_orig_ifinfo_put(orig_ifinfo); /* route deleted */ - if ((curr_router) && (!neigh_node)) { + if (curr_router && !neigh_node) { batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Deleting route towards: %pM\n", orig_node->orig); batadv_tt_global_del_orig(bat_priv, orig_node, -1, "Deleted route towards originator"); /* route added */ - } else if ((!curr_router) && (neigh_node)) { + } else if (!curr_router && neigh_node) { batadv_dbg(BATADV_DBG_ROUTES, bat_priv, "Adding route towards: %pM (via %pM)\n", orig_node->orig, neigh_node->addr); @@ -381,7 +381,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb, /* add record route information if not full */ if ((icmph->msg_type == BATADV_ECHO_REPLY || icmph->msg_type == BATADV_ECHO_REQUEST) && - (skb->len >= sizeof(struct batadv_icmp_packet_rr))) { + skb->len >= sizeof(struct batadv_icmp_packet_rr)) { if (skb_linearize(skb) < 0) goto free_skb; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 054a65e6eb68..7895323fd2a7 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -142,7 +142,7 @@ int batadv_send_unicast_skb(struct sk_buff *skb, #ifdef CONFIG_BATMAN_ADV_BATMAN_V hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); - if ((hardif_neigh) && (ret != NET_XMIT_DROP)) + if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; if (hardif_neigh) @@ -615,8 +615,8 @@ batadv_forw_packet_list_steal(struct hlist_head *forw_list, * we delete only packets belonging to the given interface */ if (hard_iface && - (forw_packet->if_incoming != hard_iface) && - (forw_packet->if_outgoing != hard_iface)) + forw_packet->if_incoming != hard_iface && + forw_packet->if_outgoing != hard_iface) continue; hlist_del(&forw_packet->list); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index e7d5fbb6ad53..543d2c3e0f0d 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -69,8 +69,8 @@ int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) int result; /* TODO: We must check if we can release all references to non-payload - * data using __skb_header_release in our skbs to allow skb_cow_header to - * work optimally. This means that those skbs are not allowed to read + * data using __skb_header_release in our skbs to allow skb_cow_header + * to work optimally. This means that those skbs are not allowed to read * or write any data which is before the current position of skb->data * after that call and thus allow other skbs with the same data buffer * to write freely in that area. @@ -160,7 +160,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) static int batadv_interface_change_mtu(struct net_device *dev, int new_mtu) { /* check ranges */ - if ((new_mtu < 68) || (new_mtu > batadv_hardif_min_mtu(dev))) + if (new_mtu < 68 || new_mtu > batadv_hardif_min_mtu(dev)) return -EINVAL; dev->mtu = new_mtu; diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 0ae8b30e4eaa..aa187fd42475 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -925,8 +925,8 @@ static int batadv_store_mesh_iface_finish(struct net_device *net_dev, if (hard_iface->if_status == status_tmp) goto out; - if ((hard_iface->soft_iface) && - (strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0)) + if (hard_iface->soft_iface && + strncmp(hard_iface->soft_iface->name, ifname, IFNAMSIZ) == 0) goto out; if (status_tmp == BATADV_IF_NOT_IN_USE) { diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index bfe8effe9238..4b90033f35a8 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1206,7 +1206,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst, /* send the ack */ r = batadv_send_skb_to_orig(skb, orig_node, NULL); - if (unlikely(r < 0) || (r == NET_XMIT_DROP)) { + if (unlikely(r < 0) || r == NET_XMIT_DROP) { ret = BATADV_TP_REASON_DST_UNREACHABLE; goto out; } diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 40b1ede527ca..4aee55fdcc92 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_stp.o br_stp_bpdu.o \ br_stp_if.o br_stp_timer.o br_netlink.o \ - br_netlink_tunnel.o + br_netlink_tunnel.o br_arp_nd_proxy.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c new file mode 100644 index 000000000000..2cf7716254be --- /dev/null +++ b/net/bridge/br_arp_nd_proxy.c @@ -0,0 +1,469 @@ +/* + * Handle bridge arp/nd proxy/suppress + * + * Copyright (C) 2017 Cumulus Networks + * Copyright (c) 2017 Roopa Prabhu <roopa@cumulusnetworks.com> + * + * Authors: + * Roopa Prabhu <roopa@cumulusnetworks.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/neighbour.h> +#include <net/arp.h> +#include <linux/if_vlan.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_checksum.h> +#endif + +#include "br_private.h" + +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br) +{ + struct net_bridge_port *p; + bool neigh_suppress = false; + + list_for_each_entry(p, &br->port_list, list) { + if (p->flags & BR_NEIGH_SUPPRESS) { + neigh_suppress = true; + break; + } + } + + br->neigh_suppress_enabled = neigh_suppress; +} + +#if IS_ENABLED(CONFIG_INET) +static void br_arp_send(struct net_bridge *br, struct net_bridge_port *p, + struct net_device *dev, __be32 dest_ip, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, + __be16 vlan_proto, u16 vlan_tci) +{ + struct net_bridge_vlan_group *vg; + struct sk_buff *skb; + u16 pvid; + + netdev_dbg(dev, "arp send dev %s dst %pI4 dst_hw %pM src %pI4 src_hw %pM\n", + dev->name, &dest_ip, dest_hw, &src_ip, src_hw); + + if (!vlan_tci) { + arp_send(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + return; + } + + skb = arp_create(ARPOP_REPLY, ETH_P_ARP, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + if (p) + vg = nbp_vlan_group_rcu(p); + else + vg = br_vlan_group_rcu(br); + pvid = br_get_pvid(vg); + if (pvid == (vlan_tci & VLAN_VID_MASK)) + vlan_tci = 0; + + if (vlan_tci) + __vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci); + + if (p) { + arp_xmit(skb); + } else { + skb_reset_mac_header(skb); + __skb_pull(skb, skb_network_offset(skb)); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_HOST; + + netif_rx_ni(skb); + } +} + +static int br_chk_addr_ip(struct net_device *dev, void *data) +{ + __be32 ip = *(__be32 *)data; + struct in_device *in_dev; + __be32 addr = 0; + + in_dev = __in_dev_get_rcu(dev); + if (in_dev) + addr = inet_confirm_addr(dev_net(dev), in_dev, 0, ip, + RT_SCOPE_HOST); + + if (addr == ip) + return 1; + + return 0; +} + +static bool br_is_local_ip(struct net_device *dev, __be32 ip) +{ + if (br_chk_addr_ip(dev, &ip)) + return true; + + /* check if ip is configured on upper dev */ + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip, &ip)) + return true; + + return false; +} + +void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p) +{ + struct net_device *dev = br->dev; + struct net_device *vlandev = dev; + struct neighbour *n; + struct arphdr *parp; + u8 *arpptr, *sha; + __be32 sip, tip; + + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + + if ((dev->flags & IFF_NOARP) || + !pskb_may_pull(skb, arp_hdr_len(dev))) + return; + + parp = arp_hdr(skb); + + if (parp->ar_pro != htons(ETH_P_IP) || + parp->ar_hln != dev->addr_len || + parp->ar_pln != 4) + return; + + arpptr = (u8 *)parp + sizeof(struct arphdr); + sha = arpptr; + arpptr += dev->addr_len; /* sha */ + memcpy(&sip, arpptr, sizeof(sip)); + arpptr += sizeof(sip); + arpptr += dev->addr_len; /* tha */ + memcpy(&tip, arpptr, sizeof(tip)); + + if (ipv4_is_loopback(tip) || + ipv4_is_multicast(tip)) + return; + + if (br->neigh_suppress_enabled) { + if (p && (p->flags & BR_NEIGH_SUPPRESS)) + return; + if (ipv4_is_zeronet(sip) || sip == tip) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + } + + if (parp->ar_op != htons(ARPOP_REQUEST)) + return; + + if (vid != 0) { + vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, + vid); + if (!vlandev) + return; + } + + if (br->neigh_suppress_enabled && br_is_local_ip(vlandev, tip)) { + /* its our local ip, so don't proxy reply + * and don't forward to neigh suppress ports + */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + n = neigh_lookup(&arp_tbl, &tip, vlandev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = br_fdb_find_rcu(br, n->ha, vid); + if (f) { + bool replied = false; + + if ((p && (p->flags & BR_PROXYARP)) || + (f->dst && (f->dst->flags & (BR_PROXYARP_WIFI | + BR_NEIGH_SUPPRESS)))) { + if (!vid) + br_arp_send(br, p, skb->dev, sip, tip, + sha, n->ha, sha, 0, 0); + else + br_arp_send(br, p, skb->dev, sip, tip, + sha, n->ha, sha, + skb->vlan_proto, + skb_vlan_tag_get(skb)); + replied = true; + } + + /* If we have replied or as long as we know the + * mac, indicate to arp replied + */ + if (replied || br->neigh_suppress_enabled) + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } + + neigh_release(n); + } +} +#endif + +#if IS_ENABLED(CONFIG_IPV6) +struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg) +{ + struct nd_msg *m; + + m = skb_header_pointer(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr), sizeof(*msg), msg); + if (!m) + return NULL; + + if (m->icmph.icmp6_code != 0 || + (m->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && + m->icmph.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) + return NULL; + + return m; +} + +static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, + struct sk_buff *request, struct neighbour *n, + __be16 vlan_proto, u16 vlan_tci, struct nd_msg *ns) +{ + struct net_device *dev = request->dev; + struct net_bridge_vlan_group *vg; + struct sk_buff *reply; + struct nd_msg *na; + struct ipv6hdr *pip6; + int na_olen = 8; /* opt hdr + ETH_ALEN for target */ + int ns_olen; + int i, len; + u8 *daddr; + u16 pvid; + + if (!dev) + return; + + len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + + sizeof(*na) + na_olen + dev->needed_tailroom; + + reply = alloc_skb(len, GFP_ATOMIC); + if (!reply) + return; + + reply->protocol = htons(ETH_P_IPV6); + reply->dev = dev; + skb_reserve(reply, LL_RESERVED_SPACE(dev)); + skb_push(reply, sizeof(struct ethhdr)); + skb_set_mac_header(reply, 0); + + daddr = eth_hdr(request)->h_source; + + /* Do we need option processing ? */ + ns_olen = request->len - (skb_network_offset(request) + + sizeof(struct ipv6hdr)) - sizeof(*ns); + for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { + daddr = ns->opt + i + sizeof(struct nd_opt_hdr); + break; + } + } + + /* Ethernet header */ + ether_addr_copy(eth_hdr(reply)->h_dest, daddr); + ether_addr_copy(eth_hdr(reply)->h_source, n->ha); + eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); + reply->protocol = htons(ETH_P_IPV6); + + skb_pull(reply, sizeof(struct ethhdr)); + skb_set_network_header(reply, 0); + skb_put(reply, sizeof(struct ipv6hdr)); + + /* IPv6 header */ + pip6 = ipv6_hdr(reply); + memset(pip6, 0, sizeof(struct ipv6hdr)); + pip6->version = 6; + pip6->priority = ipv6_hdr(request)->priority; + pip6->nexthdr = IPPROTO_ICMPV6; + pip6->hop_limit = 255; + pip6->daddr = ipv6_hdr(request)->saddr; + pip6->saddr = *(struct in6_addr *)n->primary_key; + + skb_pull(reply, sizeof(struct ipv6hdr)); + skb_set_transport_header(reply, 0); + + na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen); + + /* Neighbor Advertisement */ + memset(na, 0, sizeof(*na) + na_olen); + na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; + na->icmph.icmp6_router = 0; /* XXX: should be 1 ? */ + na->icmph.icmp6_override = 1; + na->icmph.icmp6_solicited = 1; + na->target = ns->target; + ether_addr_copy(&na->opt[2], n->ha); + na->opt[0] = ND_OPT_TARGET_LL_ADDR; + na->opt[1] = na_olen >> 3; + + na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, + &pip6->daddr, + sizeof(*na) + na_olen, + IPPROTO_ICMPV6, + csum_partial(na, sizeof(*na) + na_olen, 0)); + + pip6->payload_len = htons(sizeof(*na) + na_olen); + + skb_push(reply, sizeof(struct ipv6hdr)); + skb_push(reply, sizeof(struct ethhdr)); + + reply->ip_summed = CHECKSUM_UNNECESSARY; + + if (p) + vg = nbp_vlan_group_rcu(p); + else + vg = br_vlan_group_rcu(br); + pvid = br_get_pvid(vg); + if (pvid == (vlan_tci & VLAN_VID_MASK)) + vlan_tci = 0; + + if (vlan_tci) + __vlan_hwaccel_put_tag(reply, vlan_proto, vlan_tci); + + netdev_dbg(dev, "nd send dev %s dst %pI6 dst_hw %pM src %pI6 src_hw %pM\n", + dev->name, &pip6->daddr, daddr, &pip6->saddr, n->ha); + + if (p) { + dev_queue_xmit(reply); + } else { + skb_reset_mac_header(reply); + __skb_pull(reply, skb_network_offset(reply)); + reply->ip_summed = CHECKSUM_UNNECESSARY; + reply->pkt_type = PACKET_HOST; + + netif_rx_ni(reply); + } +} + +static int br_chk_addr_ip6(struct net_device *dev, void *data) +{ + struct in6_addr *addr = (struct in6_addr *)data; + + if (ipv6_chk_addr(dev_net(dev), addr, dev, 0)) + return 1; + + return 0; +} + +static bool br_is_local_ip6(struct net_device *dev, struct in6_addr *addr) + +{ + if (br_chk_addr_ip6(dev, addr)) + return true; + + /* check if ip is configured on upper dev */ + if (netdev_walk_all_upper_dev_rcu(dev, br_chk_addr_ip6, addr)) + return true; + + return false; +} + +void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p, struct nd_msg *msg) +{ + struct net_device *dev = br->dev; + struct net_device *vlandev = NULL; + struct in6_addr *saddr, *daddr; + struct ipv6hdr *iphdr; + struct neighbour *n; + + BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; + + if (p && (p->flags & BR_NEIGH_SUPPRESS)) + return; + + if (msg->icmph.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT && + !msg->icmph.icmp6_solicited) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + if (msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) + return; + + iphdr = ipv6_hdr(skb); + saddr = &iphdr->saddr; + daddr = &iphdr->daddr; + + if (ipv6_addr_any(saddr) || !ipv6_addr_cmp(saddr, daddr)) { + /* prevent flooding to neigh suppress ports */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + if (vid != 0) { + /* build neigh table lookup on the vlan device */ + vlandev = __vlan_find_dev_deep_rcu(br->dev, skb->vlan_proto, + vid); + if (!vlandev) + return; + } else { + vlandev = dev; + } + + if (br_is_local_ip6(vlandev, &msg->target)) { + /* its our own ip, so don't proxy reply + * and don't forward to arp suppress ports + */ + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + return; + } + + n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, vlandev); + if (n) { + struct net_bridge_fdb_entry *f; + + if (!(n->nud_state & NUD_VALID)) { + neigh_release(n); + return; + } + + f = br_fdb_find_rcu(br, n->ha, vid); + if (f) { + bool replied = false; + + if (f->dst && (f->dst->flags & BR_NEIGH_SUPPRESS)) { + if (vid != 0) + br_nd_send(br, p, skb, n, + skb->vlan_proto, + skb_vlan_tag_get(skb), msg); + else + br_nd_send(br, p, skb, n, 0, 0, msg); + replied = true; + } + + /* If we have replied or as long as we know the + * mac, indicate to NEIGH_SUPPRESS ports that we + * have replied + */ + if (replied || br->neigh_suppress_enabled) + BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; + } + neigh_release(n); + } +} +#endif diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 7acb77c9bd65..28bb22186fa0 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -39,6 +39,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct pcpu_sw_netstats *brstats = this_cpu_ptr(br->stats); const struct nf_br_ops *nf_ops; const unsigned char *dest; + struct ethhdr *eth; u16 vid = 0; rcu_read_lock(); @@ -57,11 +58,30 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev) BR_INPUT_SKB_CB(skb)->brdev = dev; skb_reset_mac_header(skb); + eth = eth_hdr(skb); skb_pull(skb, ETH_HLEN); if (!br_allowed_ingress(br, br_vlan_group_rcu(br), skb, &vid)) goto out; + if (IS_ENABLED(CONFIG_INET) && + (eth->h_proto == htons(ETH_P_ARP) || + eth->h_proto == htons(ETH_P_RARP)) && + br->neigh_suppress_enabled) { + br_do_proxy_suppress_arp(skb, br, vid, NULL); + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6) && + br->neigh_suppress_enabled && + pskb_may_pull(skb, sizeof(struct ipv6hdr) + + sizeof(struct nd_msg)) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg, _msg; + + msg = br_is_nd_neigh_msg(skb, &_msg); + if (msg) + br_do_suppress_nd(skb, br, vid, NULL, msg); + } + dest = eth_hdr(skb)->h_dest; if (is_broadcast_ether_addr(dest)) { br_flood(br, skb, BR_PKT_BROADCAST, false, true); diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 48fb17417fac..b4eed113d2ec 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -204,7 +204,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) continue; - if ((p->flags & BR_PROXYARP_WIFI) && + if ((p->flags & (BR_PROXYARP_WIFI | BR_NEIGH_SUPPRESS)) && BR_INPUT_SKB_CB(skb)->proxyarp_replied) continue; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 59a74a414e20..ae38547bbf91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -310,6 +310,8 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) del_nbp(p); } + br_recalculate_neigh_suppress_enabled(br); + br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); @@ -660,4 +662,7 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) if (mask & BR_AUTO_MASK) nbp_update_port_count(br); + + if (mask & BR_NEIGH_SUPPRESS) + br_recalculate_neigh_suppress_enabled(br); } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7cb613776b31..a096d3e189da 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -71,62 +71,6 @@ static int br_pass_frame_up(struct sk_buff *skb) br_netif_receive_skb); } -static void br_do_proxy_arp(struct sk_buff *skb, struct net_bridge *br, - u16 vid, struct net_bridge_port *p) -{ - struct net_device *dev = br->dev; - struct neighbour *n; - struct arphdr *parp; - u8 *arpptr, *sha; - __be32 sip, tip; - - BR_INPUT_SKB_CB(skb)->proxyarp_replied = false; - - if ((dev->flags & IFF_NOARP) || - !pskb_may_pull(skb, arp_hdr_len(dev))) - return; - - parp = arp_hdr(skb); - - if (parp->ar_pro != htons(ETH_P_IP) || - parp->ar_op != htons(ARPOP_REQUEST) || - parp->ar_hln != dev->addr_len || - parp->ar_pln != 4) - return; - - arpptr = (u8 *)parp + sizeof(struct arphdr); - sha = arpptr; - arpptr += dev->addr_len; /* sha */ - memcpy(&sip, arpptr, sizeof(sip)); - arpptr += sizeof(sip); - arpptr += dev->addr_len; /* tha */ - memcpy(&tip, arpptr, sizeof(tip)); - - if (ipv4_is_loopback(tip) || - ipv4_is_multicast(tip)) - return; - - n = neigh_lookup(&arp_tbl, &tip, dev); - if (n) { - struct net_bridge_fdb_entry *f; - - if (!(n->nud_state & NUD_VALID)) { - neigh_release(n); - return; - } - - f = br_fdb_find_rcu(br, n->ha, vid); - if (f && ((p->flags & BR_PROXYARP) || - (f->dst && (f->dst->flags & BR_PROXYARP_WIFI)))) { - arp_send(ARPOP_REPLY, ETH_P_ARP, sip, skb->dev, tip, - sha, n->ha, sha); - BR_INPUT_SKB_CB(skb)->proxyarp_replied = true; - } - - neigh_release(n); - } -} - /* note: already called with rcu_read_lock */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { @@ -171,8 +115,22 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb BR_INPUT_SKB_CB(skb)->brdev = br->dev; - if (IS_ENABLED(CONFIG_INET) && skb->protocol == htons(ETH_P_ARP)) - br_do_proxy_arp(skb, br, vid, p); + if (IS_ENABLED(CONFIG_INET) && + (skb->protocol == htons(ETH_P_ARP) || + skb->protocol == htons(ETH_P_RARP))) { + br_do_proxy_suppress_arp(skb, br, vid, p); + } else if (IS_ENABLED(CONFIG_IPV6) && + skb->protocol == htons(ETH_P_IPV6) && + br->neigh_suppress_enabled && + pskb_may_pull(skb, sizeof(struct ipv6hdr) + + sizeof(struct nd_msg)) && + ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { + struct nd_msg *msg, _msg; + + msg = br_is_nd_neigh_msg(skb, &_msg); + if (msg) + br_do_suppress_nd(skb, br, vid, p, msg); + } switch (pkt_type) { case BR_PKT_MULTICAST: diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8dc5c8d69bcd..7947e0436e18 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -859,8 +859,32 @@ out: spin_unlock(&br->multicast_lock); } +static void br_mc_router_state_change(struct net_bridge *p, + bool is_mc_router) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_BRIDGE_MROUTER, + .flags = SWITCHDEV_F_DEFER, + .u.mrouter = is_mc_router, + }; + + switchdev_port_attr_set(p->dev, &attr); +} + static void br_multicast_local_router_expired(unsigned long data) { + struct net_bridge *br = (struct net_bridge *)data; + + spin_lock(&br->multicast_lock); + if (br->multicast_router == MDB_RTR_TYPE_DISABLED || + br->multicast_router == MDB_RTR_TYPE_PERM || + timer_pending(&br->multicast_router_timer)) + goto out; + + br_mc_router_state_change(br, false); +out: + spin_unlock(&br->multicast_lock); } static void br_multicast_querier_expired(struct net_bridge *br, @@ -1364,9 +1388,12 @@ static void br_multicast_mark_router(struct net_bridge *br, unsigned long now = jiffies; if (!port) { - if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) + if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY) { + if (!timer_pending(&br->multicast_router_timer)) + br_mc_router_state_change(br, true); mod_timer(&br->multicast_router_timer, now + br->multicast_querier_interval); + } return; } @@ -1952,7 +1979,7 @@ void br_multicast_init(struct net_bridge *br) spin_lock_init(&br->multicast_lock); setup_timer(&br->multicast_router_timer, - br_multicast_local_router_expired, 0); + br_multicast_local_router_expired, (unsigned long)br); setup_timer(&br->ip4_other_query.timer, br_ip4_multicast_querier_expired, (unsigned long)br); setup_timer(&br->ip4_own_query.timer, br_ip4_multicast_query_expired, @@ -2042,9 +2069,14 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val) switch (val) { case MDB_RTR_TYPE_DISABLED: case MDB_RTR_TYPE_PERM: + br_mc_router_state_change(br, val == MDB_RTR_TYPE_PERM); del_timer(&br->multicast_router_timer); - /* fall through */ + br->multicast_router = val; + err = 0; + break; case MDB_RTR_TYPE_TEMP_QUERY: + if (br->multicast_router != MDB_RTR_TYPE_TEMP_QUERY) + br_mc_router_state_change(br, false); br->multicast_router = val; err = 0; break; @@ -2184,6 +2216,18 @@ bool br_multicast_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_multicast_enabled); +bool br_multicast_router(const struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + bool is_router; + + spin_lock_bh(&br->multicast_lock); + is_router = br_multicast_is_router(br); + spin_unlock_bh(&br->multicast_lock); + return is_router; +} +EXPORT_SYMBOL_GPL(br_multicast_router); + int br_multicast_set_querier(struct net_bridge *br, unsigned long val) { unsigned long max_delay; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dea88a255d26..f0e82682e071 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -138,6 +138,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ + + nla_total_size(1) /* IFLA_BRPORT_NEIGH_SUPPRESS */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_ROOT_ID */ + nla_total_size(sizeof(struct ifla_bridge_id)) /* IFLA_BRPORT_BRIDGE_ID */ + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_DESIGNATED_PORT */ @@ -210,7 +211,9 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u8(skb, IFLA_BRPORT_CONFIG_PENDING, p->config_pending) || nla_put_u8(skb, IFLA_BRPORT_VLAN_TUNNEL, !!(p->flags & BR_VLAN_TUNNEL)) || - nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask)) + nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || + nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, + !!(p->flags & BR_NEIGH_SUPPRESS))) return -EMSGSIZE; timerval = br_timer_value(&p->message_age_timer); @@ -785,6 +788,11 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) p->group_fwd_mask = fwd_mask; } + err = br_set_port_flag(p, tb, IFLA_BRPORT_NEIGH_SUPPRESS, + BR_NEIGH_SUPPRESS); + if (err) + return err; + br_port_flags_change(p, old_flags ^ p->flags); return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index ab4df24f7bba..fa0039f44818 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -404,6 +404,7 @@ struct net_bridge { #ifdef CONFIG_NET_SWITCHDEV int offload_fwd_mark; #endif + bool neigh_suppress_enabled; }; struct br_input_skb_cb { @@ -1139,4 +1140,11 @@ static inline void br_switchdev_frame_unmark(struct sk_buff *skb) } #endif /* CONFIG_NET_SWITCHDEV */ +/* br_arp_nd_proxy.c */ +void br_recalculate_neigh_suppress_enabled(struct net_bridge *br); +void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p); +void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br, + u16 vid, struct net_bridge_port *p, struct nd_msg *msg); +struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m); #endif diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 9110d5e56085..0a1fa9ccd8b7 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -191,6 +191,7 @@ BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); +BRPORT_ATTR_FLAG(neigh_suppress, BR_NEIGH_SUPPRESS); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -241,6 +242,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_multicast_flood, &brport_attr_broadcast_flood, &brport_attr_group_fwd_mask, + &brport_attr_neigh_suppress, NULL }; diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 2585b100ebbb..276b60262981 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -65,8 +65,8 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { - net->xt.broute_table = ebt_register_table(net, &broute_table, NULL); - return PTR_ERR_OR_ZERO(net->xt.broute_table); + return ebt_register_table(net, &broute_table, NULL, + &net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 45a00dbdbcad..c41da5fac84f 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_filter[] = { static int __net_init frame_filter_net_init(struct net *net) { - net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter); - return PTR_ERR_OR_ZERO(net->xt.frame_filter); + return ebt_register_table(net, &frame_filter, ebt_ops_filter, + &net->xt.frame_filter); } static void __net_exit frame_filter_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index 57cd5bb154e7..08df7406ecb3 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,8 +93,8 @@ static const struct nf_hook_ops ebt_ops_nat[] = { static int __net_init frame_nat_net_init(struct net *net) { - net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat); - return PTR_ERR_OR_ZERO(net->xt.frame_nat); + return ebt_register_table(net, &frame_nat, ebt_ops_nat, + &net->xt.frame_nat); } static void __net_exit frame_nat_net_exit(struct net *net) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 83951f978445..3b3dcf719e07 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1169,9 +1169,8 @@ static void __ebt_unregister_table(struct net *net, struct ebt_table *table) kfree(table); } -struct ebt_table * -ebt_register_table(struct net *net, const struct ebt_table *input_table, - const struct nf_hook_ops *ops) +int ebt_register_table(struct net *net, const struct ebt_table *input_table, + const struct nf_hook_ops *ops, struct ebt_table **res) { struct ebt_table_info *newinfo; struct ebt_table *t, *table; @@ -1183,7 +1182,7 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, repl->entries == NULL || repl->entries_size == 0 || repl->counters != NULL || input_table->private != NULL) { BUGPRINT("Bad table data for ebt_register_table!!!\n"); - return ERR_PTR(-EINVAL); + return -EINVAL; } /* Don't add one table to multiple lists. */ @@ -1252,16 +1251,18 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table, list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); mutex_unlock(&ebt_mutex); + WRITE_ONCE(*res, table); + if (!ops) - return table; + return 0; ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); if (ret) { __ebt_unregister_table(net, table); - return ERR_PTR(ret); + *res = NULL; } - return table; + return ret; free_unlock: mutex_unlock(&ebt_mutex); free_chainstack: @@ -1276,7 +1277,7 @@ free_newinfo: free_table: kfree(table); out: - return ERR_PTR(ret); + return ret; } void ebt_unregister_table(struct net *net, struct ebt_table *table, diff --git a/net/core/dst.c b/net/core/dst.c index a6c47da7d0f8..662a2d4a3d19 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -322,3 +322,19 @@ metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + +void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) +{ +#ifdef CONFIG_DST_CACHE + int cpu; + + for_each_possible_cpu(cpu) { + struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); + + if (one_md_dst->type == METADATA_IP_TUNNEL) + dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); + } +#endif + free_percpu(md_dst); +} +EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index b7e8caa1e790..140fa9f9c0f4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -43,6 +43,7 @@ #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/unaligned.h> +#include <asm/cmpxchg.h> #include <linux/filter.h> #include <linux/ratelimit.h> #include <linux/seccomp.h> @@ -2987,14 +2988,15 @@ static const struct bpf_func_proto * bpf_get_skb_set_tunnel_proto(enum bpf_func_id which) { if (!md_dst) { - /* Race is not possible, since it's called from verifier - * that is holding verifier mutex. - */ - md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, - METADATA_IP_TUNNEL, - GFP_KERNEL); - if (!md_dst) + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX, + METADATA_IP_TUNNEL, + GFP_KERNEL); + if (!tmp) return NULL; + if (cmpxchg(&md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); } switch (which) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e84d108cfee4..6a09f3d575af 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3066,21 +3066,21 @@ int ndo_dflt_fdb_add(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_add); -static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) +static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, + struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan\n"); + NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid vlan id %d\n", - vid); + NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } @@ -3105,24 +3105,24 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_NEWNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -3209,24 +3209,24 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid ifindex\n"); + NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { - pr_info("PF_BRIDGE: RTM_DELNEIGH with invalid address\n"); + NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); - err = fdb_vid_parse(tb[NDA_VLAN], &vid); + err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; @@ -3666,7 +3666,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } @@ -3741,7 +3741,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { - pr_info("PF_BRIDGE: RTM_SETLINK with unknown ifindex\n"); + NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 822a90e56aea..40717501cbdd 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1350,8 +1350,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) /* Set the tail pointer and length */ skb_put(n, skb->len); - if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)); copy_skb_header(n, skb); return n; @@ -1449,8 +1448,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, BUG_ON(nhead < 0); - if (skb_shared(skb)) - BUG(); + BUG_ON(skb_shared(skb)); size = SKB_DATA_ALIGN(size); @@ -1595,9 +1593,8 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, head_copy_off = newheadroom - head_copy_len; /* Copy the linear header and data. */ - if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, - skb->len + head_copy_len)) - BUG(); + BUG_ON(skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)); copy_skb_header(n, skb); @@ -1878,8 +1875,8 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta) return NULL; } - if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta)) - BUG(); + BUG_ON(skb_copy_bits(skb, skb_headlen(skb), + skb_tail_pointer(skb), delta)); /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 51ca2a524a27..832c659ff993 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,6 +14,7 @@ #include <linux/platform_device.h> #include <linux/slab.h> #include <linux/module.h> +#include <linux/notifier.h> #include <linux/of.h> #include <linux/of_mdio.h> #include <linux/of_platform.h> @@ -261,6 +262,28 @@ bool dsa_schedule_work(struct work_struct *work) return queue_work(dsa_owq, work); } +static ATOMIC_NOTIFIER_HEAD(dsa_notif_chain); + +int register_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(register_dsa_notifier); + +int unregister_dsa_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&dsa_notif_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_dsa_notifier); + +int call_dsa_notifiers(unsigned long val, struct net_device *dev, + struct dsa_notifier_info *info) +{ + info->dev = dev; + return atomic_notifier_call_chain(&dsa_notif_chain, val, info); +} +EXPORT_SYMBOL_GPL(call_dsa_notifiers); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 54ed054777bd..6ac9e11d385c 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -336,12 +336,6 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) if (err) return err; - if (ds->ops->set_addr) { - err = ds->ops->set_addr(ds, dst->cpu_dp->netdev->dev_addr); - if (err < 0) - return err; - } - if (!ds->slave_mii_bus && ds->ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c index 19ff6e0a21dc..b0fefbffe082 100644 --- a/net/dsa/legacy.c +++ b/net/dsa/legacy.c @@ -172,12 +172,6 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, if (ret) return ret; - if (ops->set_addr) { - ret = ops->set_addr(ds, master->dev_addr); - if (ret < 0) - return ret; - } - if (!ds->slave_mii_bus && ops->phy_read) { ds->slave_mii_bus = devm_mdiobus_alloc(ds->dev); if (!ds->slave_mii_bus) diff --git a/net/dsa/slave.c b/net/dsa/slave.c index fb2954ff198c..45f4ea845c07 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1116,6 +1116,7 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port, const char *name) { + struct dsa_notifier_register_info rinfo = { }; struct dsa_switch *ds = port->ds; struct net_device *master; struct net_device *slave_dev; @@ -1177,6 +1178,12 @@ int dsa_slave_create(struct dsa_port *port, const char *name) goto out_free; } + rinfo.info.dev = slave_dev; + rinfo.master = master; + rinfo.port_number = p->dp->index; + rinfo.switch_number = p->dp->ds->index; + call_dsa_notifiers(DSA_PORT_REGISTER, slave_dev, &rinfo.info); + ret = register_netdev(slave_dev); if (ret) { netdev_err(master, "error %d registering interface %s\n", @@ -1200,6 +1207,7 @@ out_free: void dsa_slave_destroy(struct net_device *slave_dev) { struct dsa_slave_priv *p = netdev_priv(slave_dev); + struct dsa_notifier_register_info rinfo = { }; struct device_node *port_dn; port_dn = p->dp->dn; @@ -1211,6 +1219,11 @@ void dsa_slave_destroy(struct net_device *slave_dev) if (of_phy_is_fixed_link(port_dn)) of_phy_deregister_fixed_link(port_dn); } + rinfo.info.dev = slave_dev; + rinfo.master = p->dp->cpu_dp->netdev; + rinfo.port_number = p->dp->index; + rinfo.switch_number = p->dp->ds->index; + call_dsa_notifiers(DSA_PORT_UNREGISTER, slave_dev, &rinfo.info); unregister_netdev(slave_dev); free_percpu(p->stats64); free_netdev(slave_dev); diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 8e4bdb9d9ae3..cc4f472fbd77 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -86,6 +86,12 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; brcm_tag[3] = (1 << p->dp->index) & BRCM_IG_DSTMAP1_MASK; + /* Now tell the master network device about the desired output queue + * as well + */ + skb_set_queue_mapping(skb, BRCM_TAG_SET_PORT_QUEUE(p->dp->index, + queue)); + return skb; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 416bb304a281..1859c473b21a 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -86,7 +86,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb, greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 681e33998e03..3c1570d3e22f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -782,7 +782,7 @@ static bool icmp_tag_validation(int proto) } /* - * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, ICMP_QUENCH, and + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEEDED, ICMP_QUENCH, and * ICMP_PARAMETERPROB. */ @@ -810,7 +810,8 @@ static bool icmp_unreach(struct sk_buff *skb) if (iph->ihl < 5) /* Mangled header, drop. */ goto out_err; - if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->type) { + case ICMP_DEST_UNREACH: switch (icmph->code & 15) { case ICMP_NET_UNREACH: case ICMP_HOST_UNREACH: @@ -846,8 +847,16 @@ static bool icmp_unreach(struct sk_buff *skb) } if (icmph->code > NR_ICMP_UNREACH) goto out; - } else if (icmph->type == ICMP_PARAMETERPROB) + break; + case ICMP_PARAMETERPROB: info = ntohl(icmph->un.gateway) >> 24; + break; + case ICMP_TIME_EXCEEDED: + __ICMP_INC_STATS(net, ICMP_MIB_INTIMEEXCDS); + if (icmph->code == ICMP_EXC_FRAGTIME) + goto out; + break; + } /* * Throw it at our lower layers diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index dc2317776499..c105a315b1a3 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -579,8 +579,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev, if (gre_handle_offloads(skb, false)) goto err_free_rt; - if (skb->len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } @@ -731,8 +731,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, if (skb_cow_head(skb, dev->needed_headroom)) goto free_skb; - if (skb->len - dev->hard_header_len > dev->mtu) { - pskb_trim(skb, dev->mtu); + if (skb->len > dev->mtu + dev->hard_header_len) { + pskb_trim(skb, dev->mtu + dev->hard_header_len); truncate = true; } diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 811689e523c3..f75fc6b53115 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -330,7 +330,8 @@ static unsigned int ipv4_synproxy_hook(void *priv, if (synproxy == NULL) return NF_ACCEPT; - if (nf_is_loopback_packet(skb)) + if (nf_is_loopback_packet(skb) || + ip_hdr(skb)->protocol != IPPROTO_TCP) return NF_ACCEPT; thoff = ip_hdrlen(skb); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1c7ed77968c9..4306db827374 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2513,7 +2513,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or struct rtable *ort = (struct rtable *) dst_orig; struct rtable *rt; - rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0); + rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8cf742fd4f99..3b34850d361f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; + sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); @@ -469,8 +470,10 @@ void tcp_init_transfer(struct sock *sk, int bpf_op) tcp_init_buffer_space(sk); } -static void tcp_tx_timestamp(struct sock *sk, u16 tsflags, struct sk_buff *skb) +static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { + struct sk_buff *skb = tcp_write_queue_tail(sk); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -699,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!tcp_send_head(sk)) - return; - skb = tcp_write_queue_tail(sk); + if (!skb) + return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); @@ -962,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, int copy, i; bool can_coalesce; - if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 || + if (!skb || (copy = size_goal - skb->len) <= 0 || !tcp_skb_can_collapse_to(skb)) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, - skb_queue_empty(&sk->sk_write_queue)); + tcp_rtx_and_write_queues_empty(sk)); if (!skb) goto wait_for_memory; @@ -1041,7 +1043,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sk->sk_tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sk->sk_tsflags); if (!(flags & MSG_SENDPAGE_NOTLAST)) tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } @@ -1197,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) goto out_err; } - skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL; + skb = tcp_write_queue_tail(sk); uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; @@ -1273,7 +1275,7 @@ restart: int max = size_goal; skb = tcp_write_queue_tail(sk); - if (tcp_send_head(sk)) { + if (skb) { if (skb->ip_summed == CHECKSUM_NONE) max = mss_now; copy = max - skb->len; @@ -1293,7 +1295,7 @@ new_segment: process_backlog = false; goto restart; } - first_skb = skb_queue_empty(&sk->sk_write_queue); + first_skb = tcp_rtx_and_write_queues_empty(sk); skb = sk_stream_alloc_skb(sk, select_size(sk, sg, first_skb), sk->sk_allocation, @@ -1418,7 +1420,7 @@ wait_for_memory: out: if (copied) { - tcp_tx_timestamp(sk, sockc.tsflags, tcp_write_queue_tail(sk)); + tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: @@ -1519,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); + if (err) + return err; + copied += skb->len; + } + skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) @@ -2318,6 +2327,37 @@ static inline bool tcp_need_reset(int state) TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } +static void tcp_rtx_queue_purge(struct sock *sk) +{ + struct rb_node *p = rb_first(&sk->tcp_rtx_queue); + + while (p) { + struct sk_buff *skb = rb_to_skb(p); + + p = rb_next(p); + /* Since we are deleting whole queue, no need to + * list_del(&skb->tcp_tsorted_anchor) + */ + tcp_rtx_queue_unlink(skb, sk); + sk_wmem_free_skb(sk, skb); + } +} + +void tcp_write_queue_purge(struct sock *sk) +{ + struct sk_buff *skb; + + tcp_chrono_stop(sk, TCP_CHRONO_BUSY); + while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + tcp_skb_tsorted_anchor_cleanup(skb); + sk_wmem_free_skb(sk, skb); + } + tcp_rtx_queue_purge(sk); + INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); + sk_mem_reclaim(sk); + tcp_clear_all_retrans_hints(tcp_sk(sk)); +} + int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2376,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags) * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; - tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(sk->sk_rx_dst); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 29fff14d5a53..7ee4aadcdd71 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -465,17 +465,15 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node *p; - struct sk_buff *skb; struct dst_entry *dst; + struct sk_buff *skb; if (!tp->syn_fastopen) return; if (!tp->data_segs_in) { - p = rb_first(&tp->out_of_order_queue); - if (p && !rb_next(p)) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); + if (skb && !skb_rb_next(skb)) { if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_fastopen_active_disable(sk); return; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fb0d7ed84b94..d0682ce2a5d6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1142,6 +1142,7 @@ struct tcp_sacktag_state { u64 last_sackt; struct rate_sample *rate; int flag; + unsigned int mss_now; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1191,7 +1192,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, if (pkt_len >= skb->len && !in_sack) return 0; - err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); + err = tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1288,13 +1290,13 @@ static u8 tcp_sacktag_one(struct sock *sk, /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ -static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, +static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, bool dup_sack) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *prev = tcp_write_queue_prev(sk, skb); u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ @@ -1363,8 +1365,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(TCP_SKB_CB(prev)->tx.delivered_mstamp)) TCP_SKB_CB(prev)->tx.delivered_mstamp = 0; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + tcp_rtx_queue_unlink_and_free(skb, sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_SACKMERGED); @@ -1414,9 +1415,9 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, goto fallback; /* Can only happen with delayed DSACK + discard craziness */ - if (unlikely(skb == tcp_write_queue_head(sk))) + prev = skb_rb_prev(skb); + if (!prev) goto fallback; - prev = tcp_write_queue_prev(sk, skb); if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; @@ -1495,18 +1496,17 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if (!skb_shift(prev, skb, len)) goto fallback; - if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) + if (!tcp_shifted_skb(sk, prev, skb, state, pcount, len, mss, dup_sack)) goto out; /* Hole filled allows collapsing with the next as well, this is very * useful when hole on every nth skb pattern happens */ - if (prev == tcp_write_queue_tail(sk)) + skb = skb_rb_next(prev); + if (!skb) goto out; - skb = tcp_write_queue_next(sk, prev); if (!skb_can_shift(skb) || - (skb == tcp_send_head(sk)) || ((TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) || (mss != tcp_skb_seglen(skb))) goto out; @@ -1514,7 +1514,8 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, len = skb->len; if (skb_shift(prev, skb, len)) { pcount += tcp_skb_pcount(skb); - tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0); + tcp_shifted_skb(sk, prev, skb, state, tcp_skb_pcount(skb), + len, mss, 0); } out: @@ -1538,13 +1539,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *tmp; - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { int in_sack = 0; bool dup_sack = dup_sack_in; - if (skb == tcp_send_head(sk)) - break; - /* queue is in-order => we can short-circuit the walk early */ if (!before(TCP_SKB_CB(skb)->seq, end_seq)) break; @@ -1606,23 +1604,44 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, return skb; } -/* Avoid all extra work that is being done by sacktag while walking in - * a normal way - */ +static struct sk_buff *tcp_sacktag_bsearch(struct sock *sk, + struct tcp_sacktag_state *state, + u32 seq) +{ + struct rb_node *parent, **p = &sk->tcp_rtx_queue.rb_node; + struct sk_buff *skb; + int unack_bytes; + + while (*p) { + parent = *p; + skb = rb_to_skb(parent); + if (before(seq, TCP_SKB_CB(skb)->seq)) { + p = &parent->rb_left; + continue; + } + if (!before(seq, TCP_SKB_CB(skb)->end_seq)) { + p = &parent->rb_right; + continue; + } + + state->fack_count = 0; + unack_bytes = TCP_SKB_CB(skb)->seq - tcp_sk(sk)->snd_una; + if (state->mss_now && unack_bytes > 0) + state->fack_count = unack_bytes / state->mss_now; + + return skb; + } + return NULL; +} + static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, struct tcp_sacktag_state *state, u32 skip_to_seq) { - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - if (after(TCP_SKB_CB(skb)->end_seq, skip_to_seq)) - break; + if (skb && after(TCP_SKB_CB(skb)->seq, skip_to_seq)) + return skb; - state->fack_count += tcp_skb_pcount(skb); - } - return skb; + return tcp_sacktag_bsearch(sk, state, skip_to_seq); } static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, @@ -1744,8 +1763,9 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, } } - skb = tcp_write_queue_head(sk); + state->mss_now = tcp_current_mss(sk); state->fack_count = 0; + skb = NULL; i = 0; if (!tp->sacked_out) { @@ -1969,7 +1989,7 @@ void tcp_enter_loss(struct sock *sk) if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); if (is_reneg) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); @@ -1978,10 +1998,7 @@ void tcp_enter_loss(struct sock *sk) } tcp_clear_all_retrans_hints(tp); - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - + skb_rbtree_walk_from(skb) { mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || is_reneg); if (mark_lost) @@ -2207,20 +2224,18 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); - if (tp->lost_skb_hint) { - skb = tp->lost_skb_hint; - cnt = tp->lost_cnt_hint; + skb = tp->lost_skb_hint; + if (skb) { /* Head already handled? */ - if (mark_head && skb != tcp_write_queue_head(sk)) + if (mark_head && after(TCP_SKB_CB(skb)->seq, tp->snd_una)) return; + cnt = tp->lost_cnt_hint; } else { - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); cnt = 0; } - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk_from(skb) { /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -2244,7 +2259,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) /* If needed, chop off the prefix to mark as lost. */ lost = (packets - oldcnt) * mss; if (lost < skb->len && - tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0) + tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + lost, mss, GFP_ATOMIC) < 0) break; cnt = packets; } @@ -2328,7 +2344,7 @@ static bool tcp_any_retrans_done(const struct sock *sk) if (tp->retrans_out) return true; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) return true; @@ -2369,9 +2385,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (unmark_loss) { struct sk_buff *skb; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } tp->lost_out = 0; @@ -2616,9 +2630,7 @@ void tcp_simple_retransmit(struct sock *sk) unsigned int mss = tcp_current_mss(sk); u32 prior_lost = tp->lost_out; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; + skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { if (tcp_skb_seglen(skb) > mss && !(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { @@ -2712,7 +2724,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack, * is updated in tcp_ack()). Otherwise fall back to * the conventional recovery. */ - if (tcp_send_head(sk) && + if (!tcp_write_queue_empty(sk) && after(tcp_wnd_end(tp), tp->snd_nxt)) { *rexmit = REXMIT_NEW; return; @@ -2804,9 +2816,9 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && (tcp_fackets_out(tp) > tp->reordering)); - if (WARN_ON(!tp->packets_out && tp->sacked_out)) + if (!tp->packets_out && tp->sacked_out) tp->sacked_out = 0; - if (WARN_ON(!tp->sacked_out && tp->fackets_out)) + if (!tp->sacked_out && tp->fackets_out) tp->fackets_out = 0; /* Now state machine starts. @@ -3076,11 +3088,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, struct tcp_sock *tp = tcp_sk(sk); u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; + struct sk_buff *skb, *next; bool fully_acked = true; long sack_rtt_us = -1L; long seq_rtt_us = -1L; long ca_rtt_us = -1L; - struct sk_buff *skb; u32 pkts_acked = 0; u32 last_in_flight = 0; bool rtt_update; @@ -3088,7 +3100,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, first_ackt = 0; - while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + for (skb = skb_rb_first(&sk->tcp_rtx_queue); skb; skb = next) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u8 sacked = scb->sacked; u32 acked_pcount; @@ -3106,8 +3118,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, break; fully_acked = false; } else { - /* Speedup tcp_unlink_write_queue() and next loop */ - prefetchw(skb->next); acked_pcount = tcp_skb_pcount(skb); } @@ -3159,12 +3169,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (!fully_acked) break; - tcp_unlink_write_queue(skb, sk); - sk_wmem_free_skb(sk, skb); + next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; if (unlikely(skb == tp->lost_skb_hint)) tp->lost_skb_hint = NULL; + tcp_rtx_queue_unlink_and_free(skb, sk); } if (!skb) @@ -3256,12 +3266,14 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, static void tcp_ack_probe(struct sock *sk) { - const struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *head = tcp_send_head(sk); + const struct tcp_sock *tp = tcp_sk(sk); /* Was it a usable window open? */ - - if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tcp_wnd_end(tp))) { + if (!head) + return; + if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). @@ -3381,7 +3393,7 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); - if (tcp_send_head(sk)) + if (!tcp_write_queue_empty(sk)) tcp_slow_start_after_idle_check(sk); if (nwin > tp->max_window) { @@ -3566,8 +3578,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) sack_state.first_sackt = 0; sack_state.rate = &rs; - /* We very likely will need to access write queue head. */ - prefetchw(sk->sk_write_queue.next); + /* We very likely will need to access rtx queue. */ + prefetch(sk->tcp_rtx_queue.rb_node); /* If the ack is older than previous acks * then we can probably ignore it. @@ -3681,8 +3693,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. */ - if (tcp_send_head(sk)) - tcp_ack_probe(sk); + tcp_ack_probe(sk); if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -4335,7 +4346,7 @@ static void tcp_ofo_queue(struct sock *sk) p = rb_first(&tp->out_of_order_queue); while (p) { - skb = rb_entry(p, struct sk_buff, rbnode); + skb = rb_to_skb(p); if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) break; @@ -4399,7 +4410,7 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct rb_node **p, *q, *parent; + struct rb_node **p, *parent; struct sk_buff *skb1; u32 seq, end_seq; bool fragstolen; @@ -4458,7 +4469,7 @@ coalesce_done: parent = NULL; while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(seq, TCP_SKB_CB(skb1)->seq)) { p = &parent->rb_left; continue; @@ -4503,9 +4514,7 @@ insert: merge_right: /* Remove other segments covered by skb. */ - while ((q = rb_next(&skb->rbnode)) != NULL) { - skb1 = rb_entry(q, struct sk_buff, rbnode); - + while ((skb1 = skb_rb_next(skb)) != NULL) { if (!after(end_seq, TCP_SKB_CB(skb1)->seq)) break; if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { @@ -4520,7 +4529,7 @@ merge_right: tcp_drop(sk, skb1); } /* If there is no skb after us, we are the last_skb ! */ - if (!q) + if (!skb1) tp->ooo_last_skb = skb; add_sack: @@ -4706,7 +4715,7 @@ static struct sk_buff *tcp_skb_next(struct sk_buff *skb, struct sk_buff_head *li if (list) return !skb_queue_is_last(list, skb) ? skb->next : NULL; - return rb_entry_safe(rb_next(&skb->rbnode), struct sk_buff, rbnode); + return skb_rb_next(skb); } static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, @@ -4727,7 +4736,7 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, } /* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -4735,7 +4744,7 @@ static void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) while (*p) { parent = *p; - skb1 = rb_entry(parent, struct sk_buff, rbnode); + skb1 = rb_to_skb(parent); if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) p = &parent->rb_left; else @@ -4854,26 +4863,19 @@ static void tcp_collapse_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb, *head; - struct rb_node *p; u32 start, end; - p = rb_first(&tp->out_of_order_queue); - skb = rb_entry_safe(p, struct sk_buff, rbnode); + skb = skb_rb_first(&tp->out_of_order_queue); new_range: if (!skb) { - p = rb_last(&tp->out_of_order_queue); - /* Note: This is possible p is NULL here. We do not - * use rb_entry_safe(), as ooo_last_skb is valid only - * if rbtree is not empty. - */ - tp->ooo_last_skb = rb_entry(p, struct sk_buff, rbnode); + tp->ooo_last_skb = skb_rb_last(&tp->out_of_order_queue); return; } start = TCP_SKB_CB(skb)->seq; end = TCP_SKB_CB(skb)->end_seq; for (head = skb;;) { - skb = tcp_skb_next(skb, NULL); + skb = skb_rb_next(skb); /* Range is terminated when we see a gap or when * we are at the queue end. @@ -4916,14 +4918,14 @@ static bool tcp_prune_ofo_queue(struct sock *sk) do { prev = rb_prev(node); rb_erase(node, &tp->out_of_order_queue); - tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode)); + tcp_drop(sk, rb_to_skb(node)); sk_mem_reclaim(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !tcp_under_memory_pressure(sk)) break; node = prev; } while (node); - tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode); + tp->ooo_last_skb = rb_to_skb(prev); /* Reset SACK state. A conforming SACK implementation will * do the same at a timeout based retransmit. When a connection @@ -5538,7 +5540,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, struct tcp_fastopen_cookie *cookie) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *data = tp->syn_data ? tcp_write_queue_head(sk) : NULL; + struct sk_buff *data = tp->syn_data ? tcp_rtx_queue_head(sk) : NULL; u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; @@ -5573,9 +5575,8 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, tcp_fastopen_cache_set(sk, mss, cookie, syn_drop, try_exp); if (data) { /* Retransmit unacked data in SYN */ - tcp_for_write_queue_from(data, sk) { - if (data == tcp_send_head(sk) || - __tcp_retransmit_skb(sk, data, 1)) + skb_rbtree_walk_from(data) { + if (__tcp_retransmit_skb(sk, data, 1)) break; } tcp_rearm_rto(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7460fd90884..5418ecf03b78 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) TCP_TIMEOUT_INIT; icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); BUG_ON(!skb); tcp_mstamp_refresh(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8162e2880178..696b0a168f16 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); /* Account for new data that has been sent to the network. */ -static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; - tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + __skb_unlink(skb, &sk->sk_write_queue); + tcp_rbtree_insert(&sk->tcp_rtx_queue, skb); + tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2) TCP_SKB_CB(skb)->eor = 0; } +/* Insert buff after skb on the write or rtx queue of sk. */ +static void tcp_insert_write_queue_after(struct sk_buff *skb, + struct sk_buff *buff, + struct sock *sk, + enum tcp_queue tcp_queue) +{ + if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE) + __skb_queue_after(&sk->sk_write_queue, skb, buff); + else + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, +int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); @@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor); return 0; @@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) * is caused by insufficient sender buffer: * 1) just sent some data (see tcp_write_xmit) * 2) not cwnd limited (this else condition) - * 3) no more data to send (null tcp_send_head ) + * 3) no more data to send (tcp_write_queue_empty()) * 4) application is hitting buffer limit (SOCK_NOSPACE) */ - if (!tcp_send_head(sk) && sk->sk_socket && + if (tcp_write_queue_empty(sk) && sk->sk_socket && test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) && (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED); @@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp, * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, +static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue, + struct sk_buff *skb, unsigned int len, unsigned int mss_now, gfp_t gfp) { struct sk_buff *buff; @@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* All of a TSO frame must be composed of paged data. */ if (skb->len != skb->data_len) - return tcp_fragment(sk, skb, len, mss_now, gfp); + return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp); buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) @@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ __skb_header_release(buff); - tcp_insert_write_queue_after(skb, buff, sk); + tcp_insert_write_queue_after(skb, buff, sk, tcp_queue); return 0; } @@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, goto send_now; } - head = tcp_write_queue_head(sk); - + /* TODO : use tsorted_sent_queue ? */ + head = tcp_rtx_queue_head(sk); + if (!head) + goto send_now; age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp); /* If next ACK is likely to come too late (half srtt), do not defer */ if (age < (tp->srtt_us >> 4)) @@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, limit <<= factor; if (refcount_read(&sk->sk_wmem_alloc) > limit) { - /* Always send the 1st or 2nd skb in write queue. + /* Always send skb if rtx queue is empty. * No need to wait for TX completion to call us back, * after softirq/tasklet schedule. * This helps when TX completions are delayed too much. */ - if (skb == sk->sk_write_queue.next || - skb->prev == sk->sk_write_queue.next) + if (tcp_rtx_queue_empty(sk)) return false; set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); @@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type) * it's the "most interesting" or current chrono we are * tracking and starts busy chrono if we have pending data. */ - if (tcp_write_queue_empty(sk)) + if (tcp_rtx_and_write_queues_empty(sk)) tcp_chrono_set(tp, TCP_CHRONO_UNSPEC); else if (type == tp->chrono_type) tcp_chrono_set(tp, TCP_CHRONO_BUSY); @@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, nonagle); if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) + unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, limit, mss_now, gfp))) break; if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags)) @@ -2350,7 +2368,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return !tp->packets_out && tcp_send_head(sk); + return !tp->packets_out && !tcp_write_queue_empty(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) return false; if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && - tcp_send_head(sk)) + !tcp_write_queue_empty(sk)) return false; /* Probe timeout is 2*rtt. Add minimum RTO to account @@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk) int mss = tcp_current_mss(sk); skb = tcp_send_head(sk); - if (skb) { - if (tcp_snd_wnd_test(tp, skb, mss)) { - pcount = tp->packets_out; - tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - if (tp->packets_out > pcount) - goto probe_sent; - goto rearm_timer; - } - skb = tcp_write_queue_prev(sk, skb); - } else { - skb = tcp_write_queue_tail(sk); + if (skb && tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; } + skb = skb_rb_last(&sk->tcp_rtx_queue); /* At most one outstanding TLP retransmission. */ if (tp->tlp_high_seq) @@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { - if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, + if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, + (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_next(sk, skb); + skb = skb_rb_next(skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) @@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb, static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + struct sk_buff *next_skb = skb_rb_next(skb); int skb_size, next_skb_size; skb_size = skb->len; @@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) } tcp_highest_sack_combine(sk, next_skb, skb); - tcp_unlink_write_queue(next_skb, sk); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) skb->ip_summed = CHECKSUM_PARTIAL; @@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) tcp_skb_collapse_tstamp(skb, next_skb); - sk_wmem_free_skb(sk, next_skb); + tcp_rtx_queue_unlink_and_free(next_skb, sk); return true; } @@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb) return false; if (skb_cloned(skb)) return false; - if (skb == tcp_send_head(sk)) - return false; /* Some heuristics for collapsing over SACK'd could be invented */ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) return false; @@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN) return; - tcp_for_write_queue_from_safe(skb, tmp, sk) { + skb_rbtree_walk_from_safe(skb, tmp) { if (!tcp_can_collapse(sk, skb)) break; @@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) len = cur_mss * segs; if (skb->len > len) { - if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len, + cur_mss, GFP_ATOMIC)) return -ENOMEM; /* We'll try again later. */ } else { if (skb_unclone(skb, GFP_ATOMIC)) @@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) void tcp_xmit_retransmit_queue(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb, *rtx_head = NULL, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - struct sk_buff *hole = NULL; u32 max_segs; int mib_idx; if (!tp->packets_out) return; - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - } else { - skb = tcp_write_queue_head(sk); + skb = tp->retransmit_skb_hint; + if (!skb) { + rtx_head = tcp_rtx_queue_head(sk); + skb = rtx_head; } - max_segs = tcp_tso_segs(sk, tcp_current_mss(sk)); - tcp_for_write_queue_from(skb, sk) { + skb_rbtree_walk_from(skb) { __u8 sacked; int segs; - if (skb == tcp_send_head(sk)) - break; - if (tcp_pacing_check(sk)) break; @@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (tcp_in_cwnd_reduction(sk)) tp->prr_out += tcp_skb_pcount(skb); - if (skb == tcp_write_queue_head(sk) && + if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, @@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { + if (!tskb && tcp_under_memory_pressure(sk)) + tskb = skb_rb_last(&sk->tcp_rtx_queue); + + if (tskb) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; tp->write_seq++; - if (!tcp_send_head(sk)) { + if (tcp_write_queue_empty(sk)) { /* This means tskb was already sent. * Pretend we included the FIN on previous transmit. * We need to set tp->snd_nxt to the value it would have @@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk) { struct sk_buff *skb; - skb = tcp_write_queue_head(sk); + skb = tcp_rtx_queue_head(sk); if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { - pr_debug("%s: wrong queue state\n", __func__); + pr_err("%s: wrong queue state\n", __func__); return -EFAULT; } if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) { @@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk) if (!nskb) return -ENOMEM; INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor); - tcp_unlink_write_queue(skb, sk); + tcp_rtx_queue_unlink_and_free(skb, sk); __skb_header_release(nskb); - __tcp_add_write_queue_head(sk, nskb); - sk_wmem_free_skb(sk, skb); + tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb); sk->sk_wmem_queued += nskb->truesize; sk_mem_charge(sk, nskb->truesize); skb = nskb; @@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb) tcb->end_seq += skb->len; __skb_header_release(skb); - __tcp_add_write_queue_tail(sk, skb); sk->sk_wmem_queued += skb->truesize; sk_mem_charge(sk, skb->truesize); tp->write_seq = tcb->end_seq; @@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH; if (!err) { tp->syn_data = (fo->copied > 0); + tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT); goto done; } - /* data was not sent, this is our new send_head */ - sk->sk_send_head = syn_data; + /* data was not sent, put it in write_queue */ + __skb_queue_tail(&sk->sk_write_queue, syn_data); tp->packets_out -= tcp_skb_pcount(syn_data); fallback: @@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk) tp->retrans_stamp = tcp_time_stamp(tp); tcp_connect_queue_skb(sk, buff); tcp_ecn_send_syn(sk, buff); + tcp_rbtree_insert(&sk->tcp_rtx_queue, buff); /* Send off SYN; include data in Fast Open. */ err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : @@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib) skb->len > mss) { seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; - if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC)) + if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE, + skb, seg_size, mss, GFP_ATOMIC)) return -1; } else if (!tcp_skb_pcount(skb)) tcp_set_skb_tso_segs(skb, mss); @@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk) err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 655dd8d7f064..7014cc00c74c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk, return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (unlikely(!start_ts)) - start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk)); + if (unlikely(!start_ts)) { + struct sk_buff *head = tcp_rtx_queue_head(sk); + + if (!head) + return false; + start_ts = tcp_skb_timestamp(head); + } if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); @@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data) static void tcp_probe_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); + struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; u32 start_ts; - if (tp->packets_out || !tcp_send_head(sk)) { + if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; return; } @@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(tcp_send_head(sk)); + start_ts = tcp_skb_timestamp(skb); if (!start_ts) - tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp; + skb->skb_mstamp = tp->tcp_mstamp; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > jiffies_to_msecs(icsk->icsk_user_timeout)) @@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk) if (!tp->packets_out) goto out; - WARN_ON(tcp_write_queue_empty(sk)); + WARN_ON(tcp_rtx_queue_empty(sk)); tp->tlp_high_seq = 0; @@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk) goto out; } tcp_enter_loss(sk); - tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1); + tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1); __sk_dst_reset(sk); goto out_reset_timer; } @@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); - if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) { + if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, * do not backoff. */ @@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_when(tp); /* It is alive without keepalive 8) */ - if (tp->packets_out || tcp_send_head(sk)) + if (tp->packets_out || !tcp_write_queue_empty(sk)) goto resched; elapsed = keepalive_time_elapsed(tp); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index eb0359bdaa11..7c9a6e4a7253 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2239,20 +2239,16 @@ int udp_v4_early_demux(struct sk_buff *skb) iph = ip_hdr(skb); uh = udp_hdr(skb); - if (skb->pkt_type == PACKET_BROADCAST || - skb->pkt_type == PACKET_MULTICAST) { + if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; - /* we are supposed to accept bcast packets */ - if (skb->pkt_type == PACKET_MULTICAST) { - ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, - iph->protocol); - if (!ours) - return 0; - } + ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, + iph->protocol); + if (!ours) + return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 97658bfc1b58..e360d55be555 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -120,7 +120,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, * will be using a length value equal to only one MSS sized * segment instead of the entire frame. */ - if (gso_partial) { + if (gso_partial && skb_is_gso(skb)) { uh->len = htons(skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)uh); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 837418ff2d4b..4603aa488f4f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -152,7 +152,7 @@ static void ipv6_regen_rndid(struct inet6_dev *idev); static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); -static int ipv6_count_addresses(struct inet6_dev *idev); +static int ipv6_count_addresses(const struct inet6_dev *idev); static int ipv6_generate_stable_address(struct in6_addr *addr, u8 dad_count, const struct inet6_dev *idev); @@ -616,23 +616,23 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; + struct inet6_dev *in6_dev = NULL; + struct net_device *dev = NULL; struct netconfmsg *ncm; struct sk_buff *skb; struct ipv6_devconf *devconf; - struct inet6_dev *in6_dev; - struct net_device *dev; int ifindex; int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, devconf_ipv6_policy, extack); if (err < 0) - goto errout; + return err; - err = -EINVAL; if (!tb[NETCONFA_IFINDEX]) - goto errout; + return -EINVAL; + err = -EINVAL; ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); switch (ifindex) { case NETCONFA_IFINDEX_ALL: @@ -642,10 +642,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, devconf = net->ipv6.devconf_dflt; break; default: - dev = __dev_get_by_index(net, ifindex); + dev = dev_get_by_index(net, ifindex); if (!dev) - goto errout; - in6_dev = __in6_dev_get(dev); + return -EINVAL; + in6_dev = in6_dev_get(dev); if (!in6_dev) goto errout; devconf = &in6_dev->cnf; @@ -653,7 +653,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = -ENOBUFS; - skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC); + skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); if (!skb) goto errout; @@ -669,6 +669,10 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout: + if (in6_dev) + in6_dev_put(in6_dev); + if (dev) + dev_put(dev); return err; } @@ -945,7 +949,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) break; } - list_add_tail(&ifp->if_list, p); + list_add_tail_rcu(&ifp->if_list, p); } static u32 inet6_addr_hash(const struct in6_addr *addr) @@ -1204,7 +1208,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); - list_del_init(&ifp->if_list); + list_del_rcu(&ifp->if_list); __in6_ifa_put(ifp); write_unlock_bh(&ifp->idev->lock); @@ -1558,8 +1562,7 @@ static int __ipv6_dev_get_saddr(struct net *net, { struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(score->ifa, &idev->addr_list, if_list) { int i; /* @@ -1609,11 +1612,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } break; } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - swap(hiscore, score); hiscore_idx = 1 - hiscore_idx; @@ -1625,7 +1623,6 @@ static int __ipv6_dev_get_saddr(struct net *net, } } out: - read_unlock_bh(&idev->lock); return hiscore_idx; } @@ -1662,6 +1659,7 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, int dst_type; bool use_oif_addr = false; int hiscore_idx = 0; + int ret = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1737,15 +1735,14 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, } out: - rcu_read_unlock(); - hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) - return -EADDRNOTAVAIL; + ret = -EADDRNOTAVAIL; + else + *saddr = hiscore->ifa->addr; - *saddr = hiscore->ifa->addr; - in6_ifa_put(hiscore->ifa); - return 0; + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ipv6_dev_get_saddr); @@ -1785,15 +1782,15 @@ int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, return err; } -static int ipv6_count_addresses(struct inet6_dev *idev) +static int ipv6_count_addresses(const struct inet6_dev *idev) { + const struct inet6_ifaddr *ifp; int cnt = 0; - struct inet6_ifaddr *ifp; - read_lock_bh(&idev->lock); - list_for_each_entry(ifp, &idev->addr_list, if_list) + rcu_read_lock(); + list_for_each_entry_rcu(ifp, &idev->addr_list, if_list) cnt++; - read_unlock_bh(&idev->lock); + rcu_read_unlock(); return cnt; } @@ -1859,20 +1856,18 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; bool ret = false; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { ret = ipv6_prefix_equal(addr, &ifa->addr, prefix_len); if (ret) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); @@ -1882,22 +1877,20 @@ EXPORT_SYMBOL(ipv6_chk_custom_prefix); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev) { - struct inet6_dev *idev; - struct inet6_ifaddr *ifa; + const struct inet6_ifaddr *ifa; + const struct inet6_dev *idev; int onlink; onlink = 0; rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) { - read_lock_bh(&idev->lock); - list_for_each_entry(ifa, &idev->addr_list, if_list) { + list_for_each_entry_rcu(ifa, &idev->addr_list, if_list) { onlink = ipv6_prefix_equal(addr, &ifa->addr, ifa->prefix_len); if (onlink) break; } - read_unlock_bh(&idev->lock); } rcu_read_unlock(); return onlink; @@ -2321,24 +2314,24 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0); + rcu_read_lock(); + fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true); if (!fn) goto out; - noflags |= RTF_CACHE; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != dev->ifindex) continue; if ((rt->rt6i_flags & flags) != flags) continue; if ((rt->rt6i_flags & noflags) != 0) continue; - dst_hold(&rt->dst); + if (!dst_hold_safe(&rt->dst)) + rt = NULL; break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -3562,7 +3555,6 @@ static int addrconf_ifdown(struct net_device *dev, int how) struct net *net = dev_net(dev); struct inet6_dev *idev; struct inet6_ifaddr *ifa, *tmp; - struct list_head del_list; int _keep_addr; bool keep_addr; int state, i; @@ -3654,7 +3646,6 @@ restart: */ keep_addr = (!how && _keep_addr > 0 && !idev->cnf.disable_ipv6); - INIT_LIST_HEAD(&del_list); list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) { struct rt6_info *rt = NULL; bool keep; @@ -3663,8 +3654,6 @@ restart: keep = keep_addr && (ifa->flags & IFA_F_PERMANENT) && !addr_is_local(&ifa->addr); - if (!keep) - list_move(&ifa->if_list, &del_list); write_unlock_bh(&idev->lock); spin_lock_bh(&ifa->lock); @@ -3698,19 +3687,14 @@ restart: } write_lock_bh(&idev->lock); + if (!keep) { + list_del_rcu(&ifa->if_list); + in6_ifa_put(ifa); + } } write_unlock_bh(&idev->lock); - /* now clean up addresses to be removed */ - while (!list_empty(&del_list)) { - ifa = list_first_entry(&del_list, - struct inet6_ifaddr, if_list); - list_del(&ifa->if_list); - - in6_ifa_put(ifa); - } - /* Step 5: Discard anycast and multicast list */ if (how) { ipv6_ac_destroy_dev(idev); @@ -3820,8 +3804,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp) goto out; if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) || - dev_net(dev)->ipv6.devconf_all->accept_dad < 1 || - idev->cnf.accept_dad < 1 || + (dev_net(dev)->ipv6.devconf_all->accept_dad < 1 && + idev->cnf.accept_dad < 1) || !(ifp->flags&IFA_F_TENTATIVE) || ifp->flags & IFA_F_NODAD) { bump_id = ifp->flags & IFA_F_TENTATIVE; @@ -4906,17 +4890,15 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, extack); if (err < 0) - goto errout; + return err; addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer); - if (!addr) { - err = -EINVAL; - goto errout; - } + if (!addr) + return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifa_index) - dev = __dev_get_by_index(net, ifm->ifa_index); + dev = dev_get_by_index(net, ifm->ifa_index); ifa = ipv6_get_ifaddr(net, addr, dev, 1); if (!ifa) { @@ -4942,6 +4924,8 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, errout_ifa: in6_ifa_put(ifa); errout: + if (dev) + dev_put(dev); return err; } @@ -5898,10 +5882,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) spin_lock(&ifa->lock); if (ifa->rt) { struct rt6_info *rt = ifa->rt; - struct fib6_table *table = rt->rt6i_table; int cpu; - read_lock(&table->tb6_lock); + rcu_read_lock(); addrconf_set_nopolicy(ifa->rt, val); if (rt->rt6i_pcpu) { for_each_possible_cpu(cpu) { @@ -5911,7 +5894,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) addrconf_set_nopolicy(*rtp, val); } } - read_unlock(&table->tb6_lock); + rcu_read_unlock(); } spin_unlock(&ifa->lock); } @@ -6585,13 +6568,13 @@ int __init addrconf_init(void) __rtnl_register(PF_INET6, RTM_NEWADDR, inet6_rtm_newaddr, NULL, 0); __rtnl_register(PF_INET6, RTM_DELADDR, inet6_rtm_deladdr, NULL, 0); __rtnl_register(PF_INET6, RTM_GETADDR, inet6_rtm_getaddr, - inet6_dump_ifaddr, 0); + inet6_dump_ifaddr, RTNL_FLAG_DOIT_UNLOCKED); __rtnl_register(PF_INET6, RTM_GETMULTICAST, NULL, inet6_dump_ifmcaddr, 0); __rtnl_register(PF_INET6, RTM_GETANYCAST, NULL, inet6_dump_ifacaddr, 0); __rtnl_register(PF_INET6, RTM_GETNETCONF, inet6_netconf_get_devconf, - inet6_netconf_dump_devconf, 0); + inet6_netconf_dump_devconf, RTNL_FLAG_DOIT_UNLOCKED); ipv6_addr_label_rtnl_register(); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index c6311d7108f6..2606d2fa3ac4 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -18,7 +18,6 @@ #include <linux/if_addrlabel.h> #include <linux/netlink.h> #include <linux/rtnetlink.h> -#include <linux/refcount.h> #if 0 #define ADDRLABEL(x...) printk(x) @@ -36,7 +35,6 @@ struct ip6addrlbl_entry { int addrtype; u32 label; struct hlist_node list; - refcount_t refcnt; struct rcu_head rcu; }; @@ -111,28 +109,6 @@ static const __net_initconst struct ip6addrlbl_init_table } }; -/* Object management */ -static inline void ip6addrlbl_free(struct ip6addrlbl_entry *p) -{ - kfree(p); -} - -static void ip6addrlbl_free_rcu(struct rcu_head *h) -{ - ip6addrlbl_free(container_of(h, struct ip6addrlbl_entry, rcu)); -} - -static bool ip6addrlbl_hold(struct ip6addrlbl_entry *p) -{ - return refcount_inc_not_zero(&p->refcnt); -} - -static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) -{ - if (refcount_dec_and_test(&p->refcnt)) - call_rcu(&p->rcu, ip6addrlbl_free_rcu); -} - /* Find label */ static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, const struct in6_addr *addr, @@ -219,7 +195,6 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, newp->addrtype = addrtype; newp->label = label; INIT_HLIST_NODE(&newp->list); - refcount_set(&newp->refcnt, 1); return newp; } @@ -243,7 +218,7 @@ static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, goto out; } hlist_replace_rcu(&p->list, &newp->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); goto out; } else if ((p->prefixlen == newp->prefixlen && !p->ifindex) || (p->prefixlen < newp->prefixlen)) { @@ -281,7 +256,7 @@ static int ip6addrlbl_add(struct net *net, ret = __ip6addrlbl_add(net, newp, replace); spin_unlock(&net->ipv6.ip6addrlbl_table.lock); if (ret) - ip6addrlbl_free(newp); + kfree(newp); return ret; } @@ -302,7 +277,7 @@ static int __ip6addrlbl_del(struct net *net, p->ifindex == ifindex && ipv6_addr_equal(&p->prefix, prefix)) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); ret = 0; break; } @@ -360,7 +335,7 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) spin_lock(&net->ipv6.ip6addrlbl_table.lock); hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { hlist_del_rcu(&p->list); - ip6addrlbl_put(p); + kfree_rcu(p, rcu); } spin_unlock(&net->ipv6.ip6addrlbl_table.lock); } @@ -546,38 +521,28 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, return -EINVAL; addr = nla_data(tb[IFAL_ADDRESS]); - rcu_read_lock(); - p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); - if (p && !ip6addrlbl_hold(p)) - p = NULL; - lseq = net->ipv6.ip6addrlbl_table.seq; - rcu_read_unlock(); - - if (!p) { - err = -ESRCH; - goto out; - } - skb = nlmsg_new(ip6addrlbl_msgsize(), GFP_KERNEL); - if (!skb) { - ip6addrlbl_put(p); + if (!skb) return -ENOBUFS; - } - err = ip6addrlbl_fill(skb, p, lseq, - NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - RTM_NEWADDRLABEL, 0); + err = -ESRCH; - ip6addrlbl_put(p); + rcu_read_lock(); + p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); + lseq = net->ipv6.ip6addrlbl_table.seq; + if (p) + err = ip6addrlbl_fill(skb, p, lseq, + NETLINK_CB(in_skb).portid, + nlh->nlmsg_seq, + RTM_NEWADDRLABEL, 0); + rcu_read_unlock(); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(skb); - goto out; + } else { + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } - - err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); -out: return err; } diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index aeb49b4d8c7d..4e52d52a6752 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -250,15 +250,15 @@ static bool opt_unrec(struct sk_buff *skb, __u32 offset) return (*op & 0xC0) == 0x80; } -int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, - struct icmp6hdr *thdr, int len) +void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, + struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) - goto out; + return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); @@ -286,8 +286,6 @@ int icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, tmp_csum); } ip6_push_pending_frames(sk); -out: - return 0; } struct icmpv6_msg { @@ -437,7 +435,6 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, int iif = 0; int addr_type = 0; int len; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); if ((u8 *)hdr < skb->head || @@ -574,17 +571,16 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, rcu_read_lock(); idev = __in6_dev_get(skb->dev); - err = ip6_append_data(sk, icmpv6_getfrag, &msg, - len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), - &ipc6, &fl6, (struct rt6_info *)dst, - MSG_DONTWAIT, &sockc_unused); - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + &ipc6, &fl6, (struct rt6_info *)dst, + MSG_DONTWAIT, &sockc_unused)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + len + sizeof(struct icmp6hdr)); } rcu_read_unlock(); out_dst_release: @@ -681,7 +677,6 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; - int err = 0; u32 mark = IP6_REPLY_MARK(net, skb->mark); struct sockcm_cookie sockc_unused = {0}; @@ -718,8 +713,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(net, sk, &dst, &fl6); - if (err) + if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) @@ -736,17 +730,16 @@ static void icmpv6_echo_reply(struct sk_buff *skb) ipc6.dontfrag = np->dontfrag; ipc6.opt = NULL; - err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), - sizeof(struct icmp6hdr), &ipc6, &fl6, - (struct rt6_info *)dst, MSG_DONTWAIT, - &sockc_unused); - - if (err) { + if (ip6_append_data(sk, icmpv6_getfrag, &msg, + skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), &ipc6, &fl6, + (struct rt6_info *)dst, MSG_DONTWAIT, + &sockc_unused)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, - skb->len + sizeof(struct icmp6hdr)); + icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, + skb->len + sizeof(struct icmp6hdr)); } dst_release(dst); out: diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e5308d7cbd75..c2ecd5ec638a 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -38,14 +38,6 @@ #include <net/ip6_fib.h> #include <net/ip6_route.h> -#define RT6_DEBUG 2 - -#if RT6_DEBUG >= 3 -#define RT6_TRACE(x...) pr_debug(x) -#else -#define RT6_TRACE(x...) do { ; } while (0) -#endif - static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { @@ -62,9 +54,12 @@ struct fib6_cleaner { #define FWS_INIT FWS_L #endif -static void fib6_prune_clones(struct net *net, struct fib6_node *fn); -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn); -static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn); +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); +static struct fib6_node *fib6_repair_tree(struct net *net, + struct fib6_table *table, + struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); @@ -110,6 +105,20 @@ enum { FIB6_NO_SERNUM_CHANGE = 0, }; +void fib6_update_sernum(struct rt6_info *rt) +{ + struct fib6_table *table = rt->rt6i_table; + struct net *net = dev_net(rt->dst.dev); + struct fib6_node *fn; + + spin_lock_bh(&table->tb6_lock); + fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&table->tb6_lock)); + if (fn) + fn->fn_sernum = fib6_new_sernum(net); + spin_unlock_bh(&table->tb6_lock); +} + /* * Auxiliary address test functions for the radix tree. * @@ -140,18 +149,21 @@ static __be32 addr_bit_set(const void *token, int fn_bit) addr[fn_bit >> 5]; } -static struct fib6_node *node_alloc(void) +static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); + if (fn) + net->ipv6.rt6_stats->fib_nodes++; return fn; } -static void node_free_immediate(struct fib6_node *fn) +static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); + net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) @@ -161,9 +173,10 @@ static void node_free_rcu(struct rcu_head *head) kmem_cache_free(fib6_node_kmem, fn); } -static void node_free(struct fib6_node *fn) +static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); + net->ipv6.rt6_stats->fib_nodes--; } void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) @@ -185,9 +198,6 @@ void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) *ppcpu_rt = NULL; } } - - free_percpu(non_pcpu_rt->rt6i_pcpu); - non_pcpu_rt->rt6i_pcpu = NULL; } EXPORT_SYMBOL_GPL(rt6_free_pcpu); @@ -205,8 +215,7 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb) * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ - rwlock_init(&tb->tb6_lock); - + spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* @@ -225,7 +234,8 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; - table->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(table->tb6_root.leaf, + net->ipv6.ip6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } @@ -322,11 +332,8 @@ unsigned int fib6_tables_seq_read(struct net *net) struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - read_lock_bh(&tb->tb6_lock); + hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; - read_unlock_bh(&tb->tb6_lock); - } } rcu_read_unlock(); @@ -372,7 +379,7 @@ static int fib6_node_dump(struct fib6_walker *w) { struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) + for_each_fib6_walker_rt(w) fib6_rt_dump(rt, w->args); w->leaf = NULL; return 0; @@ -382,9 +389,9 @@ static void fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { w->root = &tb->tb6_root; - read_lock_bh(&tb->tb6_lock); + spin_lock_bh(&tb->tb6_lock); fib6_walk(net, w); - read_unlock_bh(&tb->tb6_lock); + spin_unlock_bh(&tb->tb6_lock); } /* Called with rcu_read_lock() */ @@ -421,7 +428,7 @@ static int fib6_dump_node(struct fib6_walker *w) int res; struct rt6_info *rt; - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args); if (res < 0) { /* Frame is full, suspend walking */ @@ -480,9 +487,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, w->count = 0; w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = w->root->fn_sernum; @@ -497,9 +504,9 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, } else w->skip = 0; - read_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); - read_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; @@ -580,11 +587,13 @@ out: * node. */ -static struct fib6_node *fib6_add_1(struct fib6_node *root, - struct in6_addr *addr, int plen, - int offset, int allow_create, - int replace_required, int sernum, - struct netlink_ext_ack *extack) +static struct fib6_node *fib6_add_1(struct net *net, + struct fib6_table *table, + struct fib6_node *root, + struct in6_addr *addr, int plen, + int offset, int allow_create, + int replace_required, + struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; @@ -599,7 +608,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, fn = root; do { - key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match @@ -625,12 +636,10 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { - rt6_release(fn->leaf); - fn->leaf = NULL; + RCU_INIT_POINTER(fn->leaf, NULL); + rt6_release(leaf); } - fn->fn_sernum = sernum; - return fn; } @@ -639,10 +648,13 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, */ /* Try to walk down on tree. */ - fn->fn_sernum = sernum; dir = addr_bit_set(addr, fn->fn_bit); pn = fn; - fn = dir ? fn->right : fn->left; + fn = dir ? + rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)) : + rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { @@ -668,19 +680,17 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, * Create new leaf node without children. */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - - ln->parent = pn; - ln->fn_sernum = sernum; + RCU_INIT_POINTER(ln->parent, pn); if (dir) - pn->right = ln; + rcu_assign_pointer(pn->right, ln); else - pn->left = ln; + rcu_assign_pointer(pn->left, ln); return ln; @@ -694,7 +704,8 @@ insert_above: * and the current */ - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. @@ -710,14 +721,14 @@ insert_above: * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { - in = node_alloc(); - ln = node_alloc(); + in = node_alloc(net); + ln = node_alloc(net); if (!in || !ln) { if (in) - node_free_immediate(in); + node_free_immediate(net, in); if (ln) - node_free_immediate(ln); + node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } @@ -731,31 +742,28 @@ insert_above: in->fn_bit = bit; - in->parent = pn; + RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; - atomic_inc(&in->leaf->rt6i_ref); - - in->fn_sernum = sernum; + atomic_inc(&rcu_dereference_protected(in->leaf, + lockdep_is_held(&table->tb6_lock))->rt6i_ref); /* update parent pointer */ if (dir) - pn->right = in; + rcu_assign_pointer(pn->right, in); else - pn->left = in; + rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; - ln->parent = in; - fn->parent = in; - - ln->fn_sernum = sernum; + RCU_INIT_POINTER(ln->parent, in); + rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { - in->right = ln; - in->left = fn; + rcu_assign_pointer(in->right, ln); + rcu_assign_pointer(in->left, fn); } else { - in->left = ln; - in->right = fn; + rcu_assign_pointer(in->left, ln); + rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ @@ -765,28 +773,26 @@ insert_above: * (old node)[fn] NULL */ - ln = node_alloc(); + ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; - ln->parent = pn; - - ln->fn_sernum = sernum; - - if (dir) - pn->right = ln; - else - pn->left = ln; + RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) - ln->right = fn; + RCU_INIT_POINTER(ln->right, fn); else - ln->left = fn; + RCU_INIT_POINTER(ln->left, fn); + + rcu_assign_pointer(fn->parent, ln); - fn->parent = ln; + if (dir) + rcu_assign_pointer(pn->right, ln); + else + rcu_assign_pointer(pn->left, ln); } return ln; } @@ -832,6 +838,8 @@ static int fib6_commit_metrics(struct dst_entry *dst, struct mx6_config *mxc) static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, struct net *net) { + struct fib6_table *table = rt->rt6i_table; + if (atomic_read(&rt->rt6i_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, @@ -840,12 +848,17 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, * to still alive ones. */ while (fn) { - if (!(fn->fn_flags & RTN_RTINFO) && fn->leaf == rt) { - fn->leaf = fib6_find_prefix(net, fn); - atomic_inc(&fn->leaf->rt6i_ref); + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_leaf; + if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { + new_leaf = fib6_find_prefix(net, table, fn); + atomic_inc(&new_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_leaf); rt6_release(rt); } - fn = fn->parent; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); } } } @@ -857,9 +870,11 @@ static void fib6_purge_rt(struct rt6_info *rt, struct fib6_node *fn, static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc) { + struct rt6_info *leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct rt6_info *iter = NULL; - struct rt6_info **ins; - struct rt6_info **fallback_ins = NULL; + struct rt6_info __rcu **ins; + struct rt6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || @@ -874,7 +889,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, ins = &fn->leaf; - for (iter = fn->leaf; iter; iter = iter->dst.rt6_next) { + for (iter = leaf; iter; + iter = rcu_dereference_protected(iter->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock))) { /* * Search for duplicates */ @@ -936,7 +953,8 @@ next_iter: if (fallback_ins && !found) { /* No ECMP-able route found, replace first non-ECMP one */ ins = fallback_ins; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); found++; } @@ -950,7 +968,7 @@ next_iter: struct rt6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ - sibling = fn->leaf; + sibling = leaf; while (sibling) { if (sibling->rt6i_metric == rt->rt6i_metric && rt6_qualify_for_ecmp(sibling)) { @@ -958,7 +976,8 @@ next_iter: &sibling->rt6i_siblings); break; } - sibling = sibling->dst.rt6_next; + sibling = rcu_dereference_protected(sibling->dst.rt6_next, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings @@ -987,10 +1006,10 @@ add: if (err) return err; - rt->dst.rt6_next = iter; - *ins = rt; - rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(rt->dst.rt6_next, iter); atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(rt->rt6i_node, fn); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_ADD, rt); if (!info->skip_notify) @@ -1016,10 +1035,10 @@ add: if (err) return err; - *ins = rt; + atomic_inc(&rt->rt6i_ref); rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; - atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(*ins, rt); call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt); if (!info->skip_notify) @@ -1031,14 +1050,15 @@ add: nsiblings = iter->rt6i_nsiblings; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->dst.rt6_next; - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); while (iter) { if (iter->rt6i_metric > rt->rt6i_metric) break; @@ -1046,14 +1066,16 @@ add: *ins = iter->dst.rt6_next; iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); - if (fn->rr_ptr == iter) + if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; + info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->dst.rt6_next; } - iter = *ins; + iter = rcu_dereference_protected(*ins, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); } WARN_ON(nsiblings != 0); } @@ -1077,16 +1099,33 @@ void fib6_force_start_gc(struct net *net) jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } +static void fib6_update_sernum_upto_root(struct rt6_info *rt, + int sernum) +{ + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + + /* paired with smp_rmb() in rt6_get_cookie_safe() */ + smp_wmb(); + while (fn) { + fn->fn_sernum = sernum; + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); + } +} + /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees + * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info, struct mx6_config *mxc, struct netlink_ext_ack *extack) { + struct fib6_table *table = rt->rt6i_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; @@ -1095,6 +1134,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (WARN_ON_ONCE(!atomic_read(&rt->dst.__refcnt))) return -EINVAL; + if (WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) @@ -1105,9 +1146,10 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); - fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen, + fn = fib6_add_1(info->nl_net, table, root, + &rt->rt6i_dst.addr, rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst), allow_create, - replace_required, sernum, extack); + replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; @@ -1120,7 +1162,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (rt->rt6i_src.plen) { struct fib6_node *sn; - if (!fn->subtree) { + if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* @@ -1134,42 +1176,40 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, */ /* Create subtree root node */ - sfn = node_alloc(); + sfn = node_alloc(info->nl_net); if (!sfn) goto failure; - sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); + rcu_assign_pointer(sfn->leaf, + info->nl_net->ipv6.ip6_null_entry); sfn->fn_flags = RTN_ROOT; - sfn->fn_sernum = sernum; /* Now add the first leaf node to new subtree */ - sn = fib6_add_1(sfn, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, sfn, + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ - node_free_immediate(sfn); + node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ - sfn->parent = fn; - fn->subtree = sfn; + rcu_assign_pointer(sfn->parent, fn); + rcu_assign_pointer(fn->subtree, sfn); } else { - sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, - rt->rt6i_src.plen, + sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), + &rt->rt6i_src.addr, rt->rt6i_src.plen, offsetof(struct rt6_info, rt6i_src), - allow_create, replace_required, sernum, - extack); + allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); @@ -1177,9 +1217,9 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, } } - if (!fn->leaf) { - fn->leaf = rt; + if (!rcu_access_pointer(fn->leaf)) { atomic_inc(&rt->rt6i_ref); + rcu_assign_pointer(fn->leaf, rt); } fn = sn; } @@ -1187,9 +1227,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, err = fib6_add_rt2node(fn, rt, info, mxc); if (!err) { + fib6_update_sernum_upto_root(rt, sernum); fib6_start_gc(info->nl_net, rt); - if (!(rt->rt6i_flags & RTF_CACHE)) - fib6_prune_clones(info->nl_net, pn); } out: @@ -1199,19 +1238,23 @@ out: * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ - if (pn != fn && pn->leaf == rt) { - pn->leaf = NULL; + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + if (pn != fn && pn_leaf == rt) { + pn_leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); atomic_dec(&rt->rt6i_ref); } - if (pn != fn && !pn->leaf && !(pn->fn_flags & RTN_RTINFO)) { - pn->leaf = fib6_find_prefix(info->nl_net, pn); + if (pn != fn && !pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { + pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 - if (!pn->leaf) { - WARN_ON(pn->leaf == NULL); - pn->leaf = info->nl_net->ipv6.ip6_null_entry; + if (!pn_leaf) { + WARN_ON(!pn_leaf); + pn_leaf = info->nl_net->ipv6.ip6_null_entry; } #endif - atomic_inc(&pn->leaf->rt6i_ref); + atomic_inc(&pn_leaf->rt6i_ref); + rcu_assign_pointer(pn->leaf, pn_leaf); } #endif goto failure; @@ -1226,7 +1269,7 @@ failure: * fn->leaf. */ if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) - fib6_repair_tree(info->nl_net, fn); + fib6_repair_tree(info->nl_net, table, fn); /* Always release dst as dst->__refcnt is guaranteed * to be taken before entering this function */ @@ -1264,7 +1307,8 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, dir = addr_bit_set(args->addr, fn->fn_bit); - next = dir ? fn->right : fn->left; + next = dir ? rcu_dereference(fn->right) : + rcu_dereference(fn->left); if (next) { fn = next; @@ -1274,18 +1318,22 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, } while (fn) { - if (FIB6_SUBTREE(fn) || fn->fn_flags & RTN_RTINFO) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + + if (subtree || fn->fn_flags & RTN_RTINFO) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; - key = (struct rt6key *) ((u8 *) fn->leaf + - args->offset); + if (!leaf) + goto backtrack; + + key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES - if (fn->subtree) { + if (subtree) { struct fib6_node *sfn; - sfn = fib6_lookup_1(fn->subtree, - args + 1); + sfn = fib6_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; @@ -1295,18 +1343,18 @@ static struct fib6_node *fib6_lookup_1(struct fib6_node *root, return fn; } } -#ifdef CONFIG_IPV6_SUBTREES backtrack: -#endif if (fn->fn_flags & RTN_ROOT) break; - fn = fn->parent; + fn = rcu_dereference(fn->parent); } return NULL; } +/* called with rcu_read_lock() held + */ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { @@ -1337,54 +1385,84 @@ struct fib6_node *fib6_lookup(struct fib6_node *root, const struct in6_addr *dad /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) + * exact_match == true means we try to find fn with exact match of + * the passed in prefix addr + * exact_match == false means we try to find fn with longest prefix + * match of the passed in prefix addr. This is useful for finding fn + * for cached route as it will be stored in the exception table under + * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, - int plen, int offset) + int plen, int offset, + bool exact_match) { - struct fib6_node *fn; + struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { - struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + struct rt6_info *leaf = rcu_dereference(fn->leaf); + struct rt6key *key; + + /* This node is being deleted */ + if (!leaf) { + if (plen <= fn->fn_bit) + goto out; + else + goto next; + } + + key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) - return NULL; + goto out; if (plen == fn->fn_bit) return fn; + prev = fn; + +next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) - fn = fn->right; + fn = rcu_dereference(fn->right); else - fn = fn->left; + fn = rcu_dereference(fn->left); } - return NULL; +out: + if (exact_match) + return NULL; + else + return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, - const struct in6_addr *saddr, int src_len) + const struct in6_addr *saddr, int src_len, + bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, - offsetof(struct rt6_info, rt6i_dst)); + offsetof(struct rt6_info, rt6i_dst), + exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { + struct fib6_node *subtree = FIB6_SUBTREE(fn); + WARN_ON(saddr == NULL); - if (fn && fn->subtree) - fn = fib6_locate_1(fn->subtree, saddr, src_len, - offsetof(struct rt6_info, rt6i_src)); + if (fn && subtree) + fn = fib6_locate_1(subtree, saddr, src_len, + offsetof(struct rt6_info, rt6i_src), + exact_match); } #endif @@ -1400,16 +1478,26 @@ struct fib6_node *fib6_locate(struct fib6_node *root, * */ -static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) +static struct rt6_info *fib6_find_prefix(struct net *net, + struct fib6_table *table, + struct fib6_node *fn) { + struct fib6_node *child_left, *child_right; + if (fn->fn_flags & RTN_ROOT) return net->ipv6.ip6_null_entry; while (fn) { - if (fn->left) - return fn->left->leaf; - if (fn->right) - return fn->right->leaf; + child_left = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + child_right = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + if (child_left) + return rcu_dereference_protected(child_left->leaf, + lockdep_is_held(&table->tb6_lock)); + if (child_right) + return rcu_dereference_protected(child_right->leaf, + lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } @@ -1419,31 +1507,49 @@ static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn) /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. + * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, - struct fib6_node *fn) + struct fib6_table *table, + struct fib6_node *fn) { int children; int nstate; - struct fib6_node *child, *pn; + struct fib6_node *child; struct fib6_walker *w; int iter = 0; for (;;) { + struct fib6_node *fn_r = rcu_dereference_protected(fn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *fn_l = rcu_dereference_protected(fn->left, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_r = rcu_dereference_protected(pn->right, + lockdep_is_held(&table->tb6_lock)); + struct fib6_node *pn_l = rcu_dereference_protected(pn->left, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *fn_leaf = rcu_dereference_protected(fn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *pn_leaf = rcu_dereference_protected(pn->leaf, + lockdep_is_held(&table->tb6_lock)); + struct rt6_info *new_fn_leaf; + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); - WARN_ON(fn->leaf); + WARN_ON(fn_leaf); children = 0; child = NULL; - if (fn->right) - child = fn->right, children |= 1; - if (fn->left) - child = fn->left, children |= 2; + if (fn_r) + child = fn_r, children |= 1; + if (fn_l) + child = fn_l, children |= 2; if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES @@ -1451,36 +1557,36 @@ static struct fib6_node *fib6_repair_tree(struct net *net, || (children && fn->fn_flags & RTN_ROOT) #endif ) { - fn->leaf = fib6_find_prefix(net, fn); + new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 - if (!fn->leaf) { - WARN_ON(!fn->leaf); - fn->leaf = net->ipv6.ip6_null_entry; + if (!new_fn_leaf) { + WARN_ON(!new_fn_leaf); + new_fn_leaf = net->ipv6.ip6_null_entry; } #endif - atomic_inc(&fn->leaf->rt6i_ref); - return fn->parent; + atomic_inc(&new_fn_leaf->rt6i_ref); + rcu_assign_pointer(fn->leaf, new_fn_leaf); + return pn; } - pn = fn->parent; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); - FIB6_SUBTREE(pn) = NULL; + RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif - if (pn->right == fn) - pn->right = child; - else if (pn->left == fn) - pn->left = child; + if (pn_r == fn) + rcu_assign_pointer(pn->right, child); + else if (pn_l == fn) + rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) - child->parent = pn; + rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } @@ -1489,19 +1595,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { - if (w->root == fn) { - w->root = w->node = NULL; - RT6_TRACE("W %p adjusted by delroot 1\n", w); - } else if (w->node == fn) { + if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { - if (w->root == fn) { - w->root = child; - RT6_TRACE("W %p adjusted by delroot 2\n", w); - } if (w->node == fn) { w->node = child; if (children&2) { @@ -1516,33 +1615,39 @@ static struct fib6_node *fib6_repair_tree(struct net *net, } read_unlock(&net->ipv6.fib6_walker_lock); - node_free(fn); + node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; - rt6_release(pn->leaf); - pn->leaf = NULL; + RCU_INIT_POINTER(pn->leaf, NULL); + rt6_release(pn_leaf); fn = pn; } } -static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, - struct nl_info *info) +static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, + struct rt6_info __rcu **rtp, struct nl_info *info) { struct fib6_walker *w; - struct rt6_info *rt = *rtp; + struct rt6_info *rt = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; RT6_TRACE("fib6_del_route\n"); + WARN_ON_ONCE(rt->rt6i_flags & RTF_CACHE); + /* Unlink it */ *rtp = rt->dst.rt6_next; rt->rt6i_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; + /* Flush all cached dst in exception table */ + rt6_flush_exceptions(rt); + /* Reset round-robin state, if necessary */ - if (fn->rr_ptr == rt) + if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ @@ -1561,20 +1666,19 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); - w->leaf = rt->dst.rt6_next; + w->leaf = rcu_dereference_protected(rt->dst.rt6_next, + lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); - rt->dst.rt6_next = NULL; - /* If it was last route, expunge its radix tree node */ - if (!fn->leaf) { + if (!rcu_access_pointer(fn->leaf)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; - fn = fib6_repair_tree(net, fn); + fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); @@ -1585,12 +1689,15 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, rt6_release(rt); } +/* Need to own table->tb6_lock */ int fib6_del(struct rt6_info *rt, struct nl_info *info) { struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, lockdep_is_held(&rt->rt6i_table->tb6_lock)); + struct fib6_table *table = rt->rt6i_table; struct net *net = info->nl_net; - struct rt6_info **rtp; + struct rt6_info __rcu **rtp; + struct rt6_info __rcu **rtp_next; #if RT6_DEBUG >= 2 if (rt->dst.obsolete > 0) { @@ -1603,28 +1710,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) WARN_ON(!(fn->fn_flags & RTN_RTINFO)); - if (!(rt->rt6i_flags & RTF_CACHE)) { - struct fib6_node *pn = fn; -#ifdef CONFIG_IPV6_SUBTREES - /* clones of this route might be in another subtree */ - if (rt->rt6i_src.plen) { - while (!(pn->fn_flags & RTN_ROOT)) - pn = pn->parent; - pn = pn->parent; - } -#endif - fib6_prune_clones(info->nl_net, pn); - } + /* remove cached dst from exception table */ + if (rt->rt6i_flags & RTF_CACHE) + return rt6_remove_exception_rt(rt); /* * Walk the leaf entries looking for ourself */ - for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->dst.rt6_next) { - if (*rtp == rt) { - fib6_del_route(fn, rtp, info); + for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { + struct rt6_info *cur = rcu_dereference_protected(*rtp, + lockdep_is_held(&table->tb6_lock)); + if (rt == cur) { + fib6_del_route(table, fn, rtp, info); return 0; } + rtp_next = &cur->dst.rt6_next; } return -ENOENT; } @@ -1651,22 +1752,22 @@ int fib6_del(struct rt6_info *rt, struct nl_info *info) * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. + * + * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { - struct fib6_node *fn, *pn; + struct fib6_node *fn, *pn, *left, *right; + + /* w->root should always be table->tb6_root */ + WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; - if (w->prune && fn != w->root && - fn->fn_flags & RTN_RTINFO && w->state < FWS_C) { - w->state = FWS_C; - w->leaf = fn->leaf; - } switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: @@ -1677,20 +1778,22 @@ static int fib6_walk_continue(struct fib6_walker *w) w->state = FWS_L; #endif case FWS_L: - if (fn->left) { - w->node = fn->left; + left = rcu_dereference_protected(fn->left, 1); + if (left) { + w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; case FWS_R: - if (fn->right) { - w->node = fn->right; + right = rcu_dereference_protected(fn->right, 1); + if (right) { + w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; - w->leaf = fn->leaf; + w->leaf = rcu_dereference_protected(fn->leaf, 1); case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; @@ -1712,7 +1815,9 @@ skip: case FWS_U: if (fn == w->root) return 0; - pn = fn->parent; + pn = rcu_dereference_protected(fn->parent, 1); + left = rcu_dereference_protected(pn->left, 1); + right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { @@ -1721,13 +1826,13 @@ skip: continue; } #endif - if (pn->left == fn) { + if (left == fn) { w->state = FWS_R; continue; } - if (pn->right == fn) { + if (right == fn) { w->state = FWS_C; - w->leaf = w->node->leaf; + w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 @@ -1770,7 +1875,7 @@ static int fib6_clean_node(struct fib6_walker *w) return 0; } - for (rt = w->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res < 0) { w->leaf = rt; @@ -1798,20 +1903,16 @@ static int fib6_clean_node(struct fib6_walker *w) * func is called on each route. * It may return -1 -> delete this route. * 0 -> continue walking - * - * prune==1 -> only immediate children of node (certainly, - * ignoring pure split nodes) will be scanned. */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct rt6_info *, void *arg), - bool prune, int sernum, void *arg) + int sernum, void *arg) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; - c.w.prune = prune; c.w.count = 0; c.w.skip = 0; c.func = func; @@ -1834,10 +1935,10 @@ static void __fib6_clean_all(struct net *net, for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, - func, false, sernum, arg); - write_unlock_bh(&table->tb6_lock); + func, sernum, arg); + spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); @@ -1849,22 +1950,6 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *), __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg); } -static int fib6_prune_clone(struct rt6_info *rt, void *arg) -{ - if (rt->rt6i_flags & RTF_CACHE) { - RT6_TRACE("pruning clone %p\n", rt); - return -1; - } - - return 0; -} - -static void fib6_prune_clones(struct net *net, struct fib6_node *fn) -{ - fib6_clean_tree(net, fn, fib6_prune_clone, true, - FIB6_NO_SERNUM_CHANGE, NULL); -} - static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); @@ -1876,12 +1961,6 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -struct fib6_gc_args -{ - int timeout; - int more; -}; - static int fib6_age(struct rt6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; @@ -1890,9 +1969,6 @@ static int fib6_age(struct rt6_info *rt, void *arg) /* * check addrconf expiration here. * Routes are expired even if they are in use. - * - * Also age clones. Note, that clones are aged out - * only if they are not in use now. */ if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) { @@ -1901,31 +1977,14 @@ static int fib6_age(struct rt6_info *rt, void *arg) return -1; } gc_args->more++; - } else if (rt->rt6i_flags & RTF_CACHE) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) - rt->dst.obsolete = DST_OBSOLETE_KILL; - if (atomic_read(&rt->dst.__refcnt) == 1 && - rt->dst.obsolete == DST_OBSOLETE_KILL) { - RT6_TRACE("aging clone %p\n", rt); - return -1; - } else if (rt->rt6i_flags & RTF_GATEWAY) { - struct neighbour *neigh; - __u8 neigh_flags = 0; - - neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); - if (neigh) { - neigh_flags = neigh->flags; - neigh_release(neigh); - } - if (!(neigh_flags & NTF_ROUTER)) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); - return -1; - } - } - gc_args->more++; } + /* Also age clones in the exception table. + * Note, that clones are aged out + * only if they are not in use now. + */ + rt6_age_exceptions(rt, gc_args, now); + return 0; } @@ -1993,7 +2052,8 @@ static int __net_init fib6_net_init(struct net *net) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; - net->ipv6.fib6_main_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); @@ -2004,7 +2064,8 @@ static int __net_init fib6_net_init(struct net *net) if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; - net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry; + rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, + net->ipv6.ip6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); @@ -2134,7 +2195,9 @@ static int ipv6_route_yield(struct fib6_walker *w) return 1; do { - iter->w.leaf = iter->w.leaf->dst.rt6_next; + iter->w.leaf = rcu_dereference_protected( + iter->w.leaf->dst.rt6_next, + lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; @@ -2199,7 +2262,7 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!v) goto iter_table; - n = ((struct rt6_info *)v)->dst.rt6_next; + n = rcu_dereference_bh(((struct rt6_info *)v)->dst.rt6_next); if (n) { ++*pos; return n; @@ -2207,9 +2270,9 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) iter_table: ipv6_route_check_sernum(iter); - read_lock(&iter->tbl->tb6_lock); + spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); - read_unlock(&iter->tbl->tb6_lock); + spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { if (v) ++*pos; diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index cdb3728faca7..4a87f9428ca5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, for (skb = segs; skb; skb = skb->next) { ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff); - if (gso_partial) + if (gso_partial && skb_is_gso(skb)) payload_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)(ipv6h + 1); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index a5cd43d75393..437af8c95277 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -353,7 +353,7 @@ static unsigned int ipv6_synproxy_hook(void *priv, nexthdr = ipv6_hdr(skb)->nexthdr; thoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); - if (thoff < 0) + if (thoff < 0 || nexthdr != IPPROTO_TCP) return NF_ACCEPT; th = skb_header_pointer(skb, thoff, sizeof(_th), &_th); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index ac826dd338ff..d12c55dad7d1 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -154,9 +154,8 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { - err = icmpv6_push_pending_frames(sk, &fl6, - (struct icmp6hdr *) &pfh.icmph, - len); + icmpv6_push_pending_frames(sk, &fl6, + (struct icmp6hdr *)&pfh.icmph, len); } release_sock(sk); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 26cc9f483b6d..2e8842fa6450 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -44,6 +44,7 @@ #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <linux/slab.h> +#include <linux/jhash.h> #include <net/net_namespace.h> #include <net/snmp.h> #include <net/ipv6.h> @@ -104,6 +105,9 @@ static int rt6_fill_node(struct net *net, struct in6_addr *dst, struct in6_addr *src, int iif, int type, u32 portid, u32 seq, unsigned int flags); +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr); #ifdef CONFIG_IPV6_ROUTE_INFO static struct rt6_info *rt6_add_route_info(struct net *net, @@ -139,9 +143,11 @@ static void rt6_uncached_list_del(struct rt6_info *rt) { if (!list_empty(&rt->rt6i_uncached)) { struct uncached_list *ul = rt->rt6i_uncached_list; + struct net *net = dev_net(rt->dst.dev); spin_lock_bh(&ul->lock); list_del(&rt->rt6i_uncached); + atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache); spin_unlock_bh(&ul->lock); } } @@ -355,8 +361,10 @@ static struct rt6_info *__ip6_dst_alloc(struct net *net, struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, flags); - if (rt) + if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); + } return rt; } @@ -369,17 +377,7 @@ struct rt6_info *ip6_dst_alloc(struct net *net, if (rt) { rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC); - if (rt->rt6i_pcpu) { - int cpu; - - for_each_possible_cpu(cpu) { - struct rt6_info **p; - - p = per_cpu_ptr(rt->rt6i_pcpu, cpu); - /* no one shares rt */ - *p = NULL; - } - } else { + if (!rt->rt6i_pcpu) { dst_release_immediate(&rt->dst); return NULL; } @@ -392,6 +390,7 @@ EXPORT_SYMBOL(ip6_dst_alloc); static void ip6_dst_destroy(struct dst_entry *dst) { struct rt6_info *rt = (struct rt6_info *)dst; + struct rt6_exception_bucket *bucket; struct dst_entry *from = dst->from; struct inet6_dev *idev; @@ -404,6 +403,11 @@ static void ip6_dst_destroy(struct dst_entry *dst) rt->rt6i_idev = NULL; in6_dev_put(idev); } + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1); + if (bucket) { + rt->rt6i_exception_bucket = NULL; + kfree(bucket); + } dst->from = NULL; dst_release(from); @@ -478,7 +482,7 @@ static struct rt6_info *rt6_multipath_select(struct rt6_info *match, } /* - * Route lookup. Any table->tb6_lock is implied. + * Route lookup. rcu_read_lock() should be held. */ static inline struct rt6_info *rt6_device_match(struct net *net, @@ -493,7 +497,7 @@ static inline struct rt6_info *rt6_device_match(struct net *net, if (!oif && ipv6_addr_any(saddr)) goto out; - for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) { + for (sprt = rt; sprt; sprt = rcu_dereference(sprt->dst.rt6_next)) { struct net_device *dev = sprt->dst.dev; if (oif) { @@ -702,6 +706,7 @@ out: } static struct rt6_info *find_rr_leaf(struct fib6_node *fn, + struct rt6_info *leaf, struct rt6_info *rr_head, u32 metric, int oif, int strict, bool *do_rr) @@ -711,7 +716,7 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = NULL; cont = NULL; - for (rt = rr_head; rt; rt = rt->dst.rt6_next) { + for (rt = rr_head; rt; rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -720,7 +725,8 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, match = find_match(rt, oif, strict, &mpri, match, do_rr); } - for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) { + for (rt = leaf; rt && rt != rr_head; + rt = rcu_dereference(rt->dst.rt6_next)) { if (rt->rt6i_metric != metric) { cont = rt; break; @@ -732,37 +738,59 @@ static struct rt6_info *find_rr_leaf(struct fib6_node *fn, if (match || !cont) return match; - for (rt = cont; rt; rt = rt->dst.rt6_next) + for (rt = cont; rt; rt = rcu_dereference(rt->dst.rt6_next)) match = find_match(rt, oif, strict, &mpri, match, do_rr); return match; } -static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) +static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn, + int oif, int strict) { + struct rt6_info *leaf = rcu_dereference(fn->leaf); struct rt6_info *match, *rt0; - struct net *net; bool do_rr = false; + int key_plen; - rt0 = fn->rr_ptr; + if (!leaf) + return net->ipv6.ip6_null_entry; + + rt0 = rcu_dereference(fn->rr_ptr); if (!rt0) - fn->rr_ptr = rt0 = fn->leaf; + rt0 = leaf; - match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict, + /* Double check to make sure fn is not an intermediate node + * and fn->leaf does not points to its child's leaf + * (This might happen if all routes under fn are deleted from + * the tree and fib6_repair_tree() is called on the node.) + */ + key_plen = rt0->rt6i_dst.plen; +#ifdef CONFIG_IPV6_SUBTREES + if (rt0->rt6i_src.plen) + key_plen = rt0->rt6i_src.plen; +#endif + if (fn->fn_bit != key_plen) + return net->ipv6.ip6_null_entry; + + match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict, &do_rr); if (do_rr) { - struct rt6_info *next = rt0->dst.rt6_next; + struct rt6_info *next = rcu_dereference(rt0->dst.rt6_next); /* no entries matched; do round-robin */ if (!next || next->rt6i_metric != rt0->rt6i_metric) - next = fn->leaf; - - if (next != rt0) - fn->rr_ptr = next; + next = leaf; + + if (next != rt0) { + spin_lock_bh(&leaf->rt6i_table->tb6_lock); + /* make sure next is not being deleted from the tree */ + if (next->rt6i_node) + rcu_assign_pointer(fn->rr_ptr, next); + spin_unlock_bh(&leaf->rt6i_table->tb6_lock); + } } - net = dev_net(rt0->dst.dev); return match ? match : net->ipv6.ip6_null_entry; } @@ -850,13 +878,14 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, static struct fib6_node* fib6_backtrack(struct fib6_node *fn, struct in6_addr *saddr) { - struct fib6_node *pn; + struct fib6_node *pn, *sn; while (1) { if (fn->fn_flags & RTN_TL_ROOT) return NULL; - pn = fn->parent; - if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) - fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); + pn = rcu_dereference(fn->parent); + sn = FIB6_SUBTREE(pn); + if (sn && sn != fn) + fn = fib6_lookup(sn, NULL, saddr); else fn = pn; if (fn->fn_flags & RTN_RTINFO) @@ -864,27 +893,57 @@ static struct fib6_node* fib6_backtrack(struct fib6_node *fn, } } +static bool ip6_hold_safe(struct net *net, struct rt6_info **prt, + bool null_fallback) +{ + struct rt6_info *rt = *prt; + + if (dst_hold_safe(&rt->dst)) + return true; + if (null_fallback) { + rt = net->ipv6.ip6_null_entry; + dst_hold(&rt->dst); + } else { + rt = NULL; + } + *prt = rt; + return false; +} + static struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, int flags) { + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; - struct rt6_info *rt; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - rt = fn->leaf; - rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags); - if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) - rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags); + rt = rcu_dereference(fn->leaf); + if (!rt) { + rt = net->ipv6.ip6_null_entry; + } else { + rt = rt6_device_match(net, rt, &fl6->saddr, + fl6->flowi6_oif, flags); + if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0) + rt = rt6_multipath_select(rt, fl6, + fl6->flowi6_oif, flags); + } if (rt == net->ipv6.ip6_null_entry) { fn = fib6_backtrack(fn, &fl6->saddr); if (fn) goto restart; } - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + /* Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; + + if (ip6_hold_safe(net, &rt, true)) + dst_use_noref(&rt->dst, jiffies); + + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); @@ -938,9 +997,9 @@ static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info, struct fib6_table *table; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_add(&table->tb6_root, rt, info, mxc, extack); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); return err; } @@ -1038,7 +1097,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt) return pcpu_rt; } -/* It should be called with read_lock_bh(&tb6_lock) acquired */ +/* It should be called with rcu_read_lock() acquired */ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) { struct rt6_info *pcpu_rt, **p; @@ -1046,16 +1105,14 @@ static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt) p = this_cpu_ptr(rt->rt6i_pcpu); pcpu_rt = *p; - if (pcpu_rt) { - dst_hold(&pcpu_rt->dst); + if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false)) rt6_dst_from_metrics_check(pcpu_rt); - } + return pcpu_rt; } static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) { - struct fib6_table *table = rt->rt6i_table; struct rt6_info *pcpu_rt, *prev, **p; pcpu_rt = ip6_rt_pcpu_alloc(rt); @@ -1066,36 +1123,514 @@ static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt) return net->ipv6.ip6_null_entry; } - read_lock_bh(&table->tb6_lock); - if (rt->rt6i_pcpu) { - p = this_cpu_ptr(rt->rt6i_pcpu); - prev = cmpxchg(p, NULL, pcpu_rt); - if (prev) { - /* If someone did it before us, return prev instead */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = prev; - } - } else { - /* rt has been removed from the fib6 tree - * before we have a chance to acquire the read_lock. - * In this case, don't brother to create a pcpu rt - * since rt is going away anyway. The next - * dst_check() will trigger a re-lookup. - */ - dst_release_immediate(&pcpu_rt->dst); - pcpu_rt = rt; - } dst_hold(&pcpu_rt->dst); + p = this_cpu_ptr(rt->rt6i_pcpu); + prev = cmpxchg(p, NULL, pcpu_rt); + BUG_ON(prev); + rt6_dst_from_metrics_check(pcpu_rt); - read_unlock_bh(&table->tb6_lock); return pcpu_rt; } +/* exception hash table implementation + */ +static DEFINE_SPINLOCK(rt6_exception_lock); + +/* Remove rt6_ex from hash table and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_remove_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex) +{ + struct net *net; + + if (!bucket || !rt6_ex) + return; + + net = dev_net(rt6_ex->rt6i->dst.dev); + rt6_ex->rt6i->rt6i_node = NULL; + hlist_del_rcu(&rt6_ex->hlist); + rt6_release(rt6_ex->rt6i); + kfree_rcu(rt6_ex, rcu); + WARN_ON_ONCE(!bucket->depth); + bucket->depth--; + net->ipv6.rt6_stats->fib_rt_cache--; +} + +/* Remove oldest rt6_ex in bucket and free the memory + * Caller must hold rt6_exception_lock + */ +static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket) +{ + struct rt6_exception *rt6_ex, *oldest = NULL; + + if (!bucket) + return; + + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + if (!oldest || time_before(rt6_ex->stamp, oldest->stamp)) + oldest = rt6_ex; + } + rt6_remove_exception(bucket, oldest); +} + +static u32 rt6_exception_hash(const struct in6_addr *dst, + const struct in6_addr *src) +{ + static u32 seed __read_mostly; + u32 val; + + net_get_random_once(&seed, sizeof(seed)); + val = jhash(dst, sizeof(*dst), seed); + +#ifdef CONFIG_IPV6_SUBTREES + if (src) + val = jhash(src, sizeof(*src), val); +#endif + return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT); +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rt6_exception_lock + */ +static struct rt6_exception * +__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +/* Helper function to find the cached rt in the hash table + * and update bucket pointer to point to the bucket for this + * (daddr, saddr) pair + * Caller must hold rcu_read_lock() + */ +static struct rt6_exception * +__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket, + const struct in6_addr *daddr, + const struct in6_addr *saddr) +{ + struct rt6_exception *rt6_ex; + u32 hval; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!(*bucket) || !daddr) + return NULL; + + hval = rt6_exception_hash(daddr, saddr); + *bucket += hval; + + hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) { + struct rt6_info *rt6 = rt6_ex->rt6i; + bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr); + +#ifdef CONFIG_IPV6_SUBTREES + if (matched && saddr) + matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr); +#endif + if (matched) + return rt6_ex; + } + return NULL; +} + +static int rt6_insert_exception(struct rt6_info *nrt, + struct rt6_info *ort) +{ + struct net *net = dev_net(ort->dst.dev); + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err = 0; + + /* ort can't be a cache or pcpu route */ + if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)) + ort = (struct rt6_info *)ort->dst.from; + WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU)); + + spin_lock_bh(&rt6_exception_lock); + + if (ort->exception_bucket_flushed) { + err = -EINVAL; + goto out; + } + + bucket = rcu_dereference_protected(ort->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) { + bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket), + GFP_ATOMIC); + if (!bucket) { + err = -ENOMEM; + goto out; + } + rcu_assign_pointer(ort->rt6i_exception_bucket, bucket); + } + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates ort is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (ort->rt6i_src.plen) + src_key = &nrt->rt6i_src.addr; +#endif + + /* Update rt6i_prefsrc as it could be changed + * in rt6_remove_prefsrc() + */ + nrt->rt6i_prefsrc = ort->rt6i_prefsrc; + /* rt6_mtu_change() might lower mtu on ort. + * Only insert this exception route if its mtu + * is less than ort's mtu value. + */ + if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) { + err = -EINVAL; + goto out; + } + + rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_remove_exception(bucket, rt6_ex); + + rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC); + if (!rt6_ex) { + err = -ENOMEM; + goto out; + } + rt6_ex->rt6i = nrt; + rt6_ex->stamp = jiffies; + atomic_inc(&nrt->rt6i_ref); + nrt->rt6i_node = ort->rt6i_node; + hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain); + bucket->depth++; + net->ipv6.rt6_stats->fib_rt_cache++; + + if (bucket->depth > FIB6_MAX_DEPTH) + rt6_exception_remove_oldest(bucket); + +out: + spin_unlock_bh(&rt6_exception_lock); + + /* Update fn->fn_sernum to invalidate all cached dst */ + if (!err) + fib6_update_sernum(ort); + + return err; +} + +void rt6_flush_exceptions(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + spin_lock_bh(&rt6_exception_lock); + /* Prevent rt6_insert_exception() to recreate the bucket list */ + rt->exception_bucket_flushed = 1; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + if (!bucket) + goto out; + + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist) + rt6_remove_exception(bucket, rt6_ex); + WARN_ON_ONCE(bucket->depth); + bucket++; + } + +out: + spin_unlock_bh(&rt6_exception_lock); +} + +/* Find cached rt in the hash table inside passed in rt + * Caller has to hold rcu_read_lock() + */ +static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt, + struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + struct rt6_info *res = NULL; + + bucket = rcu_dereference(rt->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates rt is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (rt->rt6i_src.plen) + src_key = saddr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key); + + if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i)) + res = rt6_ex->rt6i; + + return res; +} + +/* Remove the passed in cached rt from the hash table that contains it */ +int rt6_remove_exception_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + int err; + + if (!from || + !(rt->rt6i_flags & RTF_CACHE)) + return -EINVAL; + + if (!rcu_access_pointer(from->rt6i_exception_bucket)) + return -ENOENT; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(from->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_spinlock(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) { + rt6_remove_exception(bucket, rt6_ex); + err = 0; + } else { + err = -ENOENT; + } + + spin_unlock_bh(&rt6_exception_lock); + return err; +} + +/* Find rt6_ex which contains the passed in rt cache and + * refresh its stamp + */ +static void rt6_update_exception_stamp_rt(struct rt6_info *rt) +{ + struct rt6_info *from = (struct rt6_info *)rt->dst.from; + struct rt6_exception_bucket *bucket; + struct in6_addr *src_key = NULL; + struct rt6_exception *rt6_ex; + + if (!from || + !(rt->rt6i_flags & RTF_CACHE)) + return; + + rcu_read_lock(); + bucket = rcu_dereference(from->rt6i_exception_bucket); + +#ifdef CONFIG_IPV6_SUBTREES + /* rt6i_src.plen != 0 indicates 'from' is in subtree + * and exception table is indexed by a hash of + * both rt6i_dst and rt6i_src. + * Otherwise, the exception table is indexed by + * a hash of only rt6i_dst. + */ + if (from->rt6i_src.plen) + src_key = &rt->rt6i_src.addr; +#endif + rt6_ex = __rt6_find_exception_rcu(&bucket, + &rt->rt6i_dst.addr, + src_key); + if (rt6_ex) + rt6_ex->stamp = jiffies; + + rcu_read_unlock(); +} + +static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + rt6_ex->rt6i->rt6i_prefsrc.plen = 0; + } + bucket++; + } + } +} + +static void rt6_exceptions_update_pmtu(struct rt6_info *rt, int mtu) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + int i; + + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + /* For RTF_CACHE with rt6i_pmtu == 0 + * (i.e. a redirected route), + * the metrics of its rt->dst.from has already + * been updated. + */ + if (entry->rt6i_pmtu && entry->rt6i_pmtu > mtu) + entry->rt6i_pmtu = mtu; + } + bucket++; + } + } +} + +#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) + +static void rt6_exceptions_clean_tohost(struct rt6_info *rt, + struct in6_addr *gateway) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + struct rt6_info *entry = rt6_ex->rt6i; + + if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) == + RTF_CACHE_GATEWAY && + ipv6_addr_equal(gateway, + &entry->rt6i_gateway)) { + rt6_remove_exception(bucket, rt6_ex); + } + } + bucket++; + } + } + + spin_unlock_bh(&rt6_exception_lock); +} + +static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, + struct rt6_exception *rt6_ex, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_info *rt = rt6_ex->rt6i; + + if (atomic_read(&rt->dst.__refcnt) == 1 && + time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + RT6_TRACE("aging clone %p\n", rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } else if (rt->rt6i_flags & RTF_GATEWAY) { + struct neighbour *neigh; + __u8 neigh_flags = 0; + + neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway); + if (neigh) { + neigh_flags = neigh->flags; + neigh_release(neigh); + } + if (!(neigh_flags & NTF_ROUTER)) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + rt6_remove_exception(bucket, rt6_ex); + return; + } + } + gc_args->more++; +} + +void rt6_age_exceptions(struct rt6_info *rt, + struct fib6_gc_args *gc_args, + unsigned long now) +{ + struct rt6_exception_bucket *bucket; + struct rt6_exception *rt6_ex; + struct hlist_node *tmp; + int i; + + if (!rcu_access_pointer(rt->rt6i_exception_bucket)) + return; + + spin_lock_bh(&rt6_exception_lock); + bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, + lockdep_is_held(&rt6_exception_lock)); + + if (bucket) { + for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) { + hlist_for_each_entry_safe(rt6_ex, tmp, + &bucket->chain, hlist) { + rt6_age_examine_exception(bucket, rt6_ex, + gc_args, now); + } + bucket++; + } + } + spin_unlock_bh(&rt6_exception_lock); +} + struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, int flags) { struct fib6_node *fn, *saved_fn; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; int strict = 0; strict |= flags & RT6_LOOKUP_F_IFACE; @@ -1103,7 +1638,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, if (net->ipv6.devconf_all->forwarding == 0) strict |= RT6_LOOKUP_F_REACHABLE; - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); saved_fn = fn; @@ -1112,7 +1647,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, oif = 0; redo_rt6_select: - rt = rt6_select(fn, oif, strict); + rt = rt6_select(net, fn, oif, strict); if (rt->rt6i_nsiblings) rt = rt6_multipath_select(rt, fl6, oif, strict); if (rt == net->ipv6.ip6_null_entry) { @@ -1127,13 +1662,22 @@ redo_rt6_select: } } + /*Search through exception table */ + rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr); + if (rt_cache) + rt = rt_cache; - if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) { - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); - - rt6_dst_from_metrics_check(rt); - + if (rt == net->ipv6.ip6_null_entry) { + rcu_read_unlock(); + dst_hold(&rt->dst); + trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); + return rt; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + rt6_dst_from_metrics_check(rt); + } + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) && @@ -1146,8 +1690,14 @@ redo_rt6_select: struct rt6_info *uncached_rt; - dst_use(&rt->dst, jiffies); - read_unlock_bh(&table->tb6_lock); + if (ip6_hold_safe(net, &rt, true)) { + dst_use_noref(&rt->dst, jiffies); + } else { + rcu_read_unlock(); + uncached_rt = rt; + goto uncached_rt_out; + } + rcu_read_unlock(); uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL); dst_release(&rt->dst); @@ -1157,11 +1707,13 @@ redo_rt6_select: * No need for another dst_hold() */ rt6_uncached_list_add(uncached_rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); } else { uncached_rt = net->ipv6.ip6_null_entry; dst_hold(&uncached_rt->dst); } +uncached_rt_out: trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6); return uncached_rt; @@ -1170,26 +1722,28 @@ redo_rt6_select: struct rt6_info *pcpu_rt; - rt->dst.lastuse = jiffies; - rt->dst.__use++; + dst_use_noref(&rt->dst, jiffies); + local_bh_disable(); pcpu_rt = rt6_get_pcpu_route(rt); - if (pcpu_rt) { - read_unlock_bh(&table->tb6_lock); - } else { - /* We have to do the read_unlock first - * because rt6_make_pcpu_route() may trigger - * ip6_dst_gc() which will take the write_lock. - */ - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - pcpu_rt = rt6_make_pcpu_route(rt); - dst_release(&rt->dst); + if (!pcpu_rt) { + /* atomic_inc_not_zero() is needed when using rcu */ + if (atomic_inc_not_zero(&rt->rt6i_ref)) { + /* No dst_hold() on rt is needed because grabbing + * rt->rt6i_ref makes sure rt can't be released. + */ + pcpu_rt = rt6_make_pcpu_route(rt); + rt6_release(rt); + } else { + /* rt is already removed from tree */ + pcpu_rt = net->ipv6.ip6_null_entry; + dst_hold(&pcpu_rt->dst); + } } - + local_bh_enable(); + rcu_read_unlock(); trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6); return pcpu_rt; - } } EXPORT_SYMBOL_GPL(ip6_pol_route); @@ -1325,9 +1879,10 @@ struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_ori struct dst_entry *new = NULL; rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1, - DST_OBSOLETE_NONE, 0); + DST_OBSOLETE_DEAD, 0); if (rt) { rt6_info_init(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc); new = &rt->dst; new->__use = 1; @@ -1491,23 +2046,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); + /* update rt6_ex->stamp for cache */ + if (rt6->rt6i_flags & RTF_CACHE) + rt6_update_exception_stamp_rt(rt6); } else if (daddr) { struct rt6_info *nrt6; nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); - - /* ip6_ins_rt(nrt6) will bump the - * rt6->rt6i_node->fn_sernum - * which will fail the next rt6_check() and - * invalidate the sk->sk_dst_cache. - */ - ip6_ins_rt(nrt6); - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt6->dst); + if (rt6_insert_exception(nrt6, rt6)) + dst_release_immediate(&nrt6->dst); } } } @@ -1571,7 +2120,7 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, int flags) { struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6; - struct rt6_info *rt; + struct rt6_info *rt, *rt_cache; struct fib6_node *fn; /* Get the "current" route for this destination and @@ -1584,10 +2133,10 @@ static struct rt6_info *__ip6_route_redirect(struct net *net, * routes. */ - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr); restart: - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt6_check_expired(rt)) continue; if (rt->dst.error) @@ -1596,8 +2145,23 @@ restart: continue; if (fl6->flowi6_oif != rt->dst.dev->ifindex) continue; - if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) + /* rt_cache's gateway might be different from its 'parent' + * in the case of an ip redirect. + * So we keep searching in the exception table if the gateway + * is different. + */ + if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) { + rt_cache = rt6_find_cached_rt(rt, + &fl6->daddr, + &fl6->saddr); + if (rt_cache && + ipv6_addr_equal(&rdfl->gateway, + &rt_cache->rt6i_gateway)) { + rt = rt_cache; + break; + } continue; + } break; } @@ -1615,9 +2179,9 @@ restart: } out: - dst_hold(&rt->dst); + ip6_hold_safe(net, &rt, true); - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); trace_fib6_table_lookup(net, rt, table->tb6_id, fl6); return rt; @@ -1766,6 +2330,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, * do proper release of the net_device */ rt6_uncached_list_add(rt); + atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache); dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0); @@ -2216,9 +2781,9 @@ static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info) } table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); err = fib6_del(rt, info); - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out: ip6_rt_put(rt); @@ -2244,7 +2809,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) if (rt == net->ipv6.ip6_null_entry) goto out_put; table = rt->rt6i_table; - write_lock_bh(&table->tb6_lock); + spin_lock_bh(&table->tb6_lock); if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) { struct rt6_info *sibling, *next_sibling; @@ -2274,7 +2839,7 @@ static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg) err = fib6_del(rt, info); out_unlock: - write_unlock_bh(&table->tb6_lock); + spin_unlock_bh(&table->tb6_lock); out_put: ip6_rt_put(rt); @@ -2288,9 +2853,9 @@ out_put: static int ip6_route_del(struct fib6_config *cfg, struct netlink_ext_ack *extack) { + struct rt6_info *rt, *rt_cache; struct fib6_table *table; struct fib6_node *fn; - struct rt6_info *rt; int err = -ESRCH; table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table); @@ -2299,17 +2864,22 @@ static int ip6_route_del(struct fib6_config *cfg, return err; } - read_lock_bh(&table->tb6_lock); + rcu_read_lock(); fn = fib6_locate(&table->tb6_root, &cfg->fc_dst, cfg->fc_dst_len, - &cfg->fc_src, cfg->fc_src_len); + &cfg->fc_src, cfg->fc_src_len, + !(cfg->fc_flags & RTF_CACHE)); if (fn) { - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { - if ((rt->rt6i_flags & RTF_CACHE) && - !(cfg->fc_flags & RTF_CACHE)) - continue; + for_each_fib6_node_rt_rcu(fn) { + if (cfg->fc_flags & RTF_CACHE) { + rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst, + &cfg->fc_src); + if (!rt_cache) + continue; + rt = rt_cache; + } if (cfg->fc_ifindex && (!rt->dst.dev || rt->dst.dev->ifindex != cfg->fc_ifindex)) @@ -2321,8 +2891,9 @@ static int ip6_route_del(struct fib6_config *cfg, continue; if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol) continue; - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + if (!dst_hold_safe(&rt->dst)) + break; + rcu_read_unlock(); /* if gateway was specified only delete the one hop */ if (cfg->fc_flags & RTF_GATEWAY) @@ -2331,7 +2902,7 @@ static int ip6_route_del(struct fib6_config *cfg, return __ip6_del_rt_siblings(rt, cfg); } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return err; } @@ -2435,8 +3006,14 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; - if (ip6_ins_rt(nrt)) - goto out_release; + /* No need to remove rt from the exception table if rt is + * a cached route because rt6_insert_exception() will + * takes care of it + */ + if (rt6_insert_exception(nrt, rt)) { + dst_release_immediate(&nrt->dst); + goto out; + } netevent.old = &rt->dst; netevent.new = &nrt->dst; @@ -2444,17 +3021,6 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu netevent.neigh = neigh; call_netevent_notifiers(NETEVENT_REDIRECT, &netevent); - if (rt->rt6i_flags & RTF_CACHE) { - rt = (struct rt6_info *) dst_clone(&rt->dst); - ip6_del_rt(rt); - } - -out_release: - /* Release the reference taken in - * ip6_rt_cache_alloc() - */ - dst_release(&nrt->dst); - out: neigh_release(neigh); } @@ -2511,23 +3077,23 @@ static struct rt6_info *rt6_get_route_info(struct net *net, if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0); + rcu_read_lock(); + fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true); if (!fn) goto out; - for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) { + for_each_fib6_node_rt_rcu(fn) { if (rt->dst.dev->ifindex != ifindex) continue; if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY)) continue; if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr)) continue; - dst_hold(&rt->dst); + ip6_hold_safe(NULL, &rt, false); break; } out: - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); return rt; } @@ -2573,16 +3139,16 @@ struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_dev if (!table) return NULL; - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (dev == rt->dst.dev && ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) && ipv6_addr_equal(&rt->rt6i_gateway, addr)) break; } if (rt) - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); + ip6_hold_safe(NULL, &rt, false); + rcu_read_unlock(); return rt; } @@ -2620,17 +3186,20 @@ static void __rt6_purge_dflt_routers(struct fib6_table *table) struct rt6_info *rt; restart: - read_lock_bh(&table->tb6_lock); - for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) { + rcu_read_lock(); + for_each_fib6_node_rt_rcu(&table->tb6_root) { if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) && (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) { - dst_hold(&rt->dst); - read_unlock_bh(&table->tb6_lock); - ip6_del_rt(rt); + if (dst_hold_safe(&rt->dst)) { + rcu_read_unlock(); + ip6_del_rt(rt); + } else { + rcu_read_unlock(); + } goto restart; } } - read_unlock_bh(&table->tb6_lock); + rcu_read_unlock(); table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER; } @@ -2818,8 +3387,12 @@ static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg) if (((void *)rt->dst.dev == dev || !dev) && rt != net->ipv6.ip6_null_entry && ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) { + spin_lock_bh(&rt6_exception_lock); /* remove prefsrc entry */ rt->rt6i_prefsrc.plen = 0; + /* need to update cache as well */ + rt6_exceptions_remove_prefsrc(rt); + spin_unlock_bh(&rt6_exception_lock); } return 0; } @@ -2836,18 +3409,23 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp) } #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY) -#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE) /* Remove routers and update dst entries when gateway turn into host. */ static int fib6_clean_tohost(struct rt6_info *rt, void *arg) { struct in6_addr *gateway = (struct in6_addr *)arg; - if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) || - ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) && - ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { + if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) && + ipv6_addr_equal(gateway, &rt->rt6i_gateway)) { return -1; } + + /* Further clean up cached routes in exception table. + * This is needed because cached route may have a different + * gateway than its 'parent' in the case of an ip redirect. + */ + rt6_exceptions_clean_tohost(rt, gateway); + return 0; } @@ -2926,19 +3504,14 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) if (rt->dst.dev == arg->dev && dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { - if (rt->rt6i_flags & RTF_CACHE) { - /* For RTF_CACHE with rt6i_pmtu == 0 - * (i.e. a redirected route), - * the metrics of its rt->dst.from has already - * been updated. - */ - if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu) - rt->rt6i_pmtu = arg->mtu; - } else if (dst_mtu(&rt->dst) >= arg->mtu || - (dst_mtu(&rt->dst) < arg->mtu && - dst_mtu(&rt->dst) == idev->cnf.mtu6)) { + spin_lock_bh(&rt6_exception_lock); + if (dst_mtu(&rt->dst) >= arg->mtu || + (dst_mtu(&rt->dst) < arg->mtu && + dst_mtu(&rt->dst) == idev->cnf.mtu6)) { dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu); } + rt6_exceptions_update_pmtu(rt, arg->mtu); + spin_unlock_bh(&rt6_exception_lock); } return 0; } @@ -3839,7 +4412,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", net->ipv6.rt6_stats->fib_nodes, net->ipv6.rt6_stats->fib_route_nodes, - net->ipv6.rt6_stats->fib_rt_alloc, + atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc), net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, dst_entries_get_slow(&net->ipv6.ip6_dst_ops), diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 5c467ef97311..801ea9098387 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,6 +24,7 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" + depends on NET_IP_TUNNEL || NET_IP_TUNNEL=n ---help--- Add support for forwarding of mpls packets. diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index c5b9ce41d66f..8ca9915befc8 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -16,6 +16,7 @@ #include <net/arp.h> #include <net/ip_fib.h> #include <net/netevent.h> +#include <net/ip_tunnels.h> #include <net/netns/generic.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6.h> @@ -39,6 +40,36 @@ static int one = 1; static int label_limit = (1 << 20) - 1; static int ttl_max = 255; +#if IS_ENABLED(CONFIG_NET_IP_TUNNEL) +static size_t ipgre_mpls_encap_hlen(struct ip_tunnel_encap *e) +{ + return sizeof(struct mpls_shim_hdr); +} + +static const struct ip_tunnel_encap_ops mpls_iptun_ops = { + .encap_hlen = ipgre_mpls_encap_hlen, +}; + +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return ip_tunnel_encap_add_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ + ip_tunnel_encap_del_ops(&mpls_iptun_ops, TUNNEL_ENCAP_MPLS); +} +#else +static int ipgre_tunnel_encap_add_mpls_ops(void) +{ + return 0; +} + +static void ipgre_tunnel_encap_del_mpls_ops(void) +{ +} +#endif + static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); @@ -2485,6 +2516,10 @@ static int __init mpls_init(void) 0); rtnl_register(PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, 0); + err = ipgre_tunnel_encap_add_mpls_ops(); + if (err) + pr_err("Can't add mpls over gre tunnel ops\n"); + err = 0; out: return err; @@ -2502,6 +2537,7 @@ static void __exit mpls_exit(void) dev_remove_pack(&mpls_packet_type); unregister_netdevice_notifier(&mpls_dev_notifier); unregister_pernet_subsys(&mpls_net_ops); + ipgre_tunnel_encap_del_mpls_ops(); } module_exit(mpls_exit); diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index e495b5e484b1..cf84f7b37cd9 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1191,14 +1191,17 @@ static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb, from->family == to->family)) return -IPSET_ERR_TYPE_MISMATCH; - if (from->ref_netlink || to->ref_netlink) + write_lock_bh(&ip_set_ref_lock); + + if (from->ref_netlink || to->ref_netlink) { + write_unlock_bh(&ip_set_ref_lock); return -EBUSY; + } strncpy(from_name, from->name, IPSET_MAXNAMELEN); strncpy(from->name, to->name, IPSET_MAXNAMELEN); strncpy(to->name, from_name, IPSET_MAXNAMELEN); - write_lock_bh(&ip_set_ref_lock); swap(from->ref, to->ref); ip_set(inst, from_id) = to; ip_set(inst, to_id) = from; @@ -2072,25 +2075,28 @@ static struct pernet_operations ip_set_net_ops = { static int __init ip_set_init(void) { - int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); + int ret = register_pernet_subsys(&ip_set_net_ops); + + if (ret) { + pr_err("ip_set: cannot register pernet_subsys.\n"); + return ret; + } + ret = nfnetlink_subsys_register(&ip_set_netlink_subsys); if (ret != 0) { pr_err("ip_set: cannot register with nfnetlink.\n"); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } + ret = nf_register_sockopt(&so_set); if (ret != 0) { pr_err("SO_SET registry failed: %d\n", ret); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + unregister_pernet_subsys(&ip_set_net_ops); return ret; } - ret = register_pernet_subsys(&ip_set_net_ops); - if (ret) { - pr_err("ip_set: cannot register pernet_subsys.\n"); - nf_unregister_sockopt(&so_set); - nfnetlink_subsys_unregister(&ip_set_netlink_subsys); - return ret; - } + pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL); return 0; } @@ -2098,9 +2104,10 @@ ip_set_init(void) static void __exit ip_set_fini(void) { - unregister_pernet_subsys(&ip_set_net_ops); nf_unregister_sockopt(&so_set); nfnetlink_subsys_unregister(&ip_set_netlink_subsys); + + unregister_pernet_subsys(&ip_set_net_ops); pr_debug("these are the famous last words\n"); } diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index 20bfbd315f61..613eb212cb48 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -123,13 +123,12 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], return ret; ip &= ip_set_hostmask(h->netmask); + e.ip = htonl(ip); + if (e.ip == 0) + return -IPSET_ERR_HASH_ELEM; - if (adt == IPSET_TEST) { - e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + if (adt == IPSET_TEST) return adtfn(set, &e, &ext, &ext, flags); - } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { @@ -148,17 +147,20 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); - if (retried) + if (retried) { ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip += hosts) { e.ip = htonl(ip); - if (e.ip == 0) - return -IPSET_ERR_HASH_ELEM; + } + for (; ip <= ip_to;) { ret = adtfn(set, &e, &ext, &ext, flags); - if (ret && !ip_set_eexist(ret, flags)) return ret; + ip += hosts; + e.ip = htonl(ip); + if (e.ip == 0) + return 0; + ret = 0; } return ret; diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index b64cf14e8352..f3ba8348cf9d 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -149,7 +149,7 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index f438740e6c6a..ddb8039ec1d2 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -178,7 +178,7 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index 6215fb898c50..a7f4d7a85420 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -185,7 +185,7 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 5ab1b99a53c2..a2f19b9906e9 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -271,7 +271,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - for (; !before(ip_to, ip); ip++) { + for (; ip <= ip_to; ip++) { e.ip = htonl(ip); p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; @@ -281,7 +281,7 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip == ntohl(h->next.ip) && p == ntohs(h->next.port) ? ntohl(h->next.ip2) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip2 = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &cidr); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index 5d9e895452e7..1c67a1761e45 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -193,7 +193,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], } if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 44cf11939c91..d417074f1c1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -255,7 +255,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index db614e13b193..7f9ae2e9645b 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -250,13 +250,13 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); ip2 = (retried && ip == ntohl(h->next.ip[0])) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); last2 = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index 54b64b6cd0cd..e6ef382febe4 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -241,7 +241,7 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip = htonl(ip); last = ip_set_range_to_cidr(ip, ip_to, &cidr); e.cidr = cidr - 1; diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index aff846960ac4..8602f2595a1a 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -291,7 +291,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], if (retried) ip = ntohl(h->next.ip[0]); - while (!after(ip, ip_to)) { + while (ip <= ip_to) { e.ip[0] = htonl(ip); ip_last = ip_set_range_to_cidr(ip, ip_to, &e.cidr[0]); p = retried && ip == ntohl(h->next.ip[0]) ? ntohs(h->next.port) @@ -301,7 +301,7 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], ip2 = (retried && ip == ntohl(h->next.ip[0]) && p == ntohs(h->next.port)) ? ntohl(h->next.ip[1]) : ip2_from; - while (!after(ip2, ip2_to)) { + while (ip2 <= ip2_to) { e.ip[1] = htonl(ip2); ip2_last = ip_set_range_to_cidr(ip2, ip2_to, &e.cidr[1]); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 90d396814798..4527921b1c3a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -921,6 +921,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, { struct sk_buff *new_skb = NULL; struct iphdr *old_iph = NULL; + __u8 old_dsfield; #ifdef CONFIG_IP_VS_IPV6 struct ipv6hdr *old_ipv6h = NULL; #endif @@ -945,7 +946,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, *payload_len = ntohs(old_ipv6h->payload_len) + sizeof(*old_ipv6h); - *dsfield = ipv6_get_dsfield(old_ipv6h); + old_dsfield = ipv6_get_dsfield(old_ipv6h); *ttl = old_ipv6h->hop_limit; if (df) *df = 0; @@ -960,12 +961,15 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, /* fix old IP header checksum */ ip_send_check(old_iph); - *dsfield = ipv4_get_dsfield(old_iph); + old_dsfield = ipv4_get_dsfield(old_iph); *ttl = old_iph->ttl; if (payload_len) *payload_len = ntohs(old_iph->tot_len); } + /* Implement full-functionality option for ECN encapsulation */ + *dsfield = INET_ECN_encapsulate(old_dsfield, old_dsfield); + return skb; error: kfree_skb(skb); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 929927171426..64e1ee091225 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1048,7 +1048,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TYPE, basechain->type->name)) goto nla_put_failure; - if (nft_dump_stats(skb, nft_base_chain(chain)->stats)) + if (basechain->stats && nft_dump_stats(skb, basechain->stats)) goto nla_put_failure; } @@ -1487,8 +1487,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask); - if (IS_ERR(chain2)) - return PTR_ERR(chain2); + if (!IS_ERR(chain2)) + return -EEXIST; } if (nla[NFTA_CHAIN_COUNTERS]) { @@ -2741,8 +2741,10 @@ cont: list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; - if (!strcmp(set->name, i->name)) + if (!strcmp(set->name, i->name)) { + kfree(set->name); return -ENFILE; + } } return 0; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index c83a3b5e1c6c..d8571f414208 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -892,7 +892,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(&compat_tmp, user, sizeof(compat_tmp)) != 0) return ERR_PTR(-EFAULT); - strlcpy(info->name, compat_tmp.name, sizeof(info->name)); + memcpy(info->name, compat_tmp.name, sizeof(info->name) - 1); info->num_counters = compat_tmp.num_counters; user += sizeof(compat_tmp); } else @@ -905,9 +905,9 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len, if (copy_from_user(info, user, sizeof(*info)) != 0) return ERR_PTR(-EFAULT); - info->name[sizeof(info->name) - 1] = '\0'; user += sizeof(*info); } + info->name[sizeof(info->name) - 1] = '\0'; size = sizeof(struct xt_counters); size *= info->num_counters; diff --git a/net/netfilter/xt_bpf.c b/net/netfilter/xt_bpf.c index 38986a95216c..29123934887b 100644 --- a/net/netfilter/xt_bpf.c +++ b/net/netfilter/xt_bpf.c @@ -8,6 +8,7 @@ */ #include <linux/module.h> +#include <linux/syscalls.h> #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> @@ -49,6 +50,22 @@ static int __bpf_mt_check_fd(int fd, struct bpf_prog **ret) return 0; } +static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret) +{ + mm_segment_t oldfs = get_fs(); + int retval, fd; + + set_fs(KERNEL_DS); + fd = bpf_obj_get_user(path); + set_fs(oldfs); + if (fd < 0) + return fd; + + retval = __bpf_mt_check_fd(fd, ret); + sys_close(fd); + return retval; +} + static int bpf_mt_check(const struct xt_mtchk_param *par) { struct xt_bpf_info *info = par->matchinfo; @@ -66,9 +83,10 @@ static int bpf_mt_check_v1(const struct xt_mtchk_param *par) return __bpf_mt_check_bytecode(info->bpf_program, info->bpf_program_num_elem, &info->filter); - else if (info->mode == XT_BPF_MODE_FD_PINNED || - info->mode == XT_BPF_MODE_FD_ELF) + else if (info->mode == XT_BPF_MODE_FD_ELF) return __bpf_mt_check_fd(info->fd, &info->filter); + else if (info->mode == XT_BPF_MODE_PATH_PINNED) + return __bpf_mt_check_path(info->path, &info->filter); else return -EINVAL; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index e75ef39669c5..575d2153e3b8 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -76,7 +76,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) @@ -133,7 +133,7 @@ socket_mt6_v1_v2_v3(const struct sk_buff *skb, struct xt_action_param *par) transparent = nf_sk_is_transparent(sk); if (info->flags & XT_SOCKET_RESTORESKMARK && !wildcard && - transparent) + transparent && sk_fullsock(sk)) pskb->mark = sk->sk_mark; if (sk != skb->sk) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 94c11cf0459d..f34750691c5c 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2266,16 +2266,17 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; + if (cb->start) { + ret = cb->start(cb); + if (ret) + goto error_unlock; + } + nlk->cb_running = true; mutex_unlock(nlk->cb_mutex); - ret = 0; - if (cb->start) - ret = cb->start(cb); - - if (!ret) - ret = netlink_dump(sk); + ret = netlink_dump(sk); sock_put(sk); diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index a54a556fcdb5..a551232daf61 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1203,6 +1203,10 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return err == -EINPROGRESS ? 0 : err; break; + case OVS_ACTION_ATTR_CT_CLEAR: + err = ovs_ct_clear(skb, key); + break; + case OVS_ACTION_ATTR_PUSH_ETH: err = push_eth(skb, key, nla_data(a)); break; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index d558e882ca0c..fe861e2f0deb 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1129,6 +1129,17 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, return err; } +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) +{ + if (skb_nfct(skb)) { + nf_conntrack_put(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + ovs_ct_fill_key(skb, key); + } + + return 0; +} + static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, const struct sw_flow_key *key, bool log) { diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index bc7efd1867ab..399dfdd2c4f9 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -30,6 +30,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, const struct ovs_conntrack_info *); +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); int ovs_ct_put_key(const struct sw_flow_key *swkey, @@ -73,6 +74,12 @@ static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, return -ENOTSUPP; } +static inline int ovs_ct_clear(struct sk_buff *skb, + struct sw_flow_key *key) +{ + return -ENOTSUPP; +} + static inline void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e8eb427ce6d1..dc0d79092e74 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ #include <net/ndisc.h> #include <net/mpls.h> #include <net/vxlan.h> +#include <net/erspan.h> #include "flow_netlink.h" @@ -75,6 +76,7 @@ static bool actions_may_change_flow(const struct nlattr *actions) break; case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_CT_CLEAR: case OVS_ACTION_ATTR_HASH: case OVS_ACTION_ATTR_POP_ETH: case OVS_ACTION_ATTR_POP_MPLS: @@ -319,7 +321,8 @@ size_t ovs_tun_key_attr_size(void) * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. */ + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ - + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_DST */ + + nla_total_size(4); /* OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS */ } size_t ovs_key_attr_size(void) @@ -371,6 +374,7 @@ static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] .next = ovs_vxlan_ext_key_lens }, [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = sizeof(u32) }, }; /* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ @@ -593,6 +597,33 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, return 0; } +static int erspan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + struct erspan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + opts.index = nla_get_be32(attr); + + /* Index has only 20-bit */ + if (ntohl(opts.index) & ~INDEX_MASK) { + OVS_NLERR(log, "ERSPAN index number %x too large.", + ntohl(opts.index)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), is_mask); + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + + return 0; +} + static int ip_tun_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool log) @@ -700,6 +731,19 @@ static int ip_tun_from_nlattr(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_PAD: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + tun_flags |= TUNNEL_ERSPAN_OPT; + opts_type = type; + break; default: OVS_NLERR(log, "Unknown IP tunnel attribute %d", type); @@ -824,6 +868,10 @@ static int __ip_tun_to_nlattr(struct sk_buff *skb, else if (output->tun_flags & TUNNEL_VXLAN_OPT && vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) return -EMSGSIZE; + else if (output->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + ((struct erspan_metadata *)tun_opts)->index)) + return -EMSGSIZE; } return 0; @@ -2195,6 +2243,8 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, break; case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + break; } }; @@ -2479,6 +2529,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_CT_CLEAR] = 0, [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), [OVS_ACTION_ATTR_POP_ETH] = 0, @@ -2620,6 +2671,9 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, skip_copy = true; break; + case OVS_ACTION_ATTR_CT_CLEAR: + break; + case OVS_ACTION_ATTR_PUSH_ETH: /* Disallow pushing an Ethernet header if one * is already present */ diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index f925753668a7..3b0ef691f5b1 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -35,11 +35,11 @@ #include <net/phonet/pn_dev.h> /* Transport protocol registration */ -static struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; -static struct phonet_protocol *phonet_proto_get(unsigned int protocol) +static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { - struct phonet_protocol *pp; + const struct phonet_protocol *pp; if (protocol >= PHONET_NPROTO) return NULL; @@ -53,7 +53,7 @@ static struct phonet_protocol *phonet_proto_get(unsigned int protocol) return pp; } -static inline void phonet_proto_put(struct phonet_protocol *pp) +static inline void phonet_proto_put(const struct phonet_protocol *pp) { module_put(pp->prot->owner); } @@ -65,7 +65,7 @@ static int pn_socket_create(struct net *net, struct socket *sock, int protocol, { struct sock *sk; struct pn_sock *pn; - struct phonet_protocol *pnp; + const struct phonet_protocol *pnp; int err; if (!capable(CAP_SYS_ADMIN)) @@ -149,7 +149,7 @@ static int pn_header_parse(const struct sk_buff *skb, unsigned char *haddr) return 1; } -struct header_ops phonet_header_ops = { +const struct header_ops phonet_header_ops = { .create = pn_header_create, .parse = pn_header_parse, }; @@ -470,7 +470,7 @@ static struct packet_type phonet_packet_type __read_mostly = { static DEFINE_MUTEX(proto_tab_lock); int __init_or_module phonet_proto_register(unsigned int protocol, - struct phonet_protocol *pp) + const struct phonet_protocol *pp) { int err = 0; @@ -492,7 +492,8 @@ int __init_or_module phonet_proto_register(unsigned int protocol, } EXPORT_SYMBOL(phonet_proto_register); -void phonet_proto_unregister(unsigned int protocol, struct phonet_protocol *pp) +void phonet_proto_unregister(unsigned int protocol, + const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); BUG_ON(proto_tab[protocol] != pp); diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 5e710435ffa9..b44fb9018fb8 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -195,7 +195,7 @@ static struct proto pn_proto = { .name = "PHONET", }; -static struct phonet_protocol pn_dgram_proto = { +static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, diff --git a/net/phonet/pep.c b/net/phonet/pep.c index e81537991ddf..9fc76b19cd3c 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1351,7 +1351,7 @@ static struct proto pep_proto = { .name = "PNPIPE", }; -static struct phonet_protocol pep_pn_proto = { +static const struct phonet_protocol pep_pn_proto = { .ops = &phonet_stream_ops, .prot = &pep_proto, .sock_type = SOCK_SEQPACKET, diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index c2f5c13550c0..e458ece96d3d 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -20,26 +20,15 @@ #include "qrtr.h" -#define QRTR_PROTO_VER 1 +#define QRTR_PROTO_VER_1 1 +#define QRTR_PROTO_VER_2 3 /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff -enum qrtr_pkt_type { - QRTR_TYPE_DATA = 1, - QRTR_TYPE_HELLO = 2, - QRTR_TYPE_BYE = 3, - QRTR_TYPE_NEW_SERVER = 4, - QRTR_TYPE_DEL_SERVER = 5, - QRTR_TYPE_DEL_CLIENT = 6, - QRTR_TYPE_RESUME_TX = 7, - QRTR_TYPE_EXIT = 8, - QRTR_TYPE_PING = 9, -}; - /** - * struct qrtr_hdr - (I|R)PCrouter packet header + * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version * @type: packet type; one of QRTR_TYPE_* * @src_node_id: source node @@ -49,7 +38,7 @@ enum qrtr_pkt_type { * @dst_node_id: destination node * @dst_port_id: destination port */ -struct qrtr_hdr { +struct qrtr_hdr_v1 { __le32 version; __le32 type; __le32 src_node_id; @@ -60,9 +49,44 @@ struct qrtr_hdr { __le32 dst_port_id; } __packed; -#define QRTR_HDR_SIZE sizeof(struct qrtr_hdr) -#define QRTR_NODE_BCAST ((unsigned int)-1) -#define QRTR_PORT_CTRL ((unsigned int)-2) +/** + * struct qrtr_hdr_v2 - (I|R)PCrouter packet header later versions + * @version: protocol version + * @type: packet type; one of QRTR_TYPE_* + * @flags: bitmask of QRTR_FLAGS_* + * @optlen: length of optional header data + * @size: length of packet, excluding this header and optlen + * @src_node_id: source node + * @src_port_id: source port + * @dst_node_id: destination node + * @dst_port_id: destination port + */ +struct qrtr_hdr_v2 { + u8 version; + u8 type; + u8 flags; + u8 optlen; + __le32 size; + __le16 src_node_id; + __le16 src_port_id; + __le16 dst_node_id; + __le16 dst_port_id; +}; + +#define QRTR_FLAGS_CONFIRM_RX BIT(0) + +struct qrtr_cb { + u32 src_node; + u32 src_port; + u32 dst_node; + u32 dst_port; + + u8 type; + u8 confirm_rx; +}; + +#define QRTR_HDR_MAX_SIZE max_t(size_t, sizeof(struct qrtr_hdr_v1), \ + sizeof(struct qrtr_hdr_v2)) struct qrtr_sock { /* WARNING: sk must be the first member */ @@ -111,8 +135,12 @@ struct qrtr_node { struct list_head item; }; -static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb); -static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb); +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to); +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to); /* Release node resources and free the node. * @@ -150,10 +178,27 @@ static void qrtr_node_release(struct qrtr_node *node) } /* Pass an outgoing packet socket buffer to the endpoint driver. */ -static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { + struct qrtr_hdr_v1 *hdr; + size_t len = skb->len; int rc = -ENODEV; + hdr = skb_push(skb, sizeof(*hdr)); + hdr->version = cpu_to_le32(QRTR_PROTO_VER_1); + hdr->type = cpu_to_le32(type); + hdr->src_node_id = cpu_to_le32(from->sq_node); + hdr->src_port_id = cpu_to_le32(from->sq_port); + hdr->dst_node_id = cpu_to_le32(to->sq_node); + hdr->dst_port_id = cpu_to_le32(to->sq_port); + + hdr->size = cpu_to_le32(len); + hdr->confirm_rx = 0; + + skb_put_padto(skb, ALIGN(len, 4)); + mutex_lock(&node->ep_lock); if (node->ep) rc = node->ep->xmit(node->ep, skb); @@ -207,125 +252,103 @@ static void qrtr_node_assign(struct qrtr_node *node, unsigned int nid) int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) { struct qrtr_node *node = ep->node; - const struct qrtr_hdr *phdr = data; + const struct qrtr_hdr_v1 *v1; + const struct qrtr_hdr_v2 *v2; struct sk_buff *skb; - unsigned int psize; + struct qrtr_cb *cb; unsigned int size; - unsigned int type; unsigned int ver; - unsigned int dst; - - if (len < QRTR_HDR_SIZE || len & 3) - return -EINVAL; - - ver = le32_to_cpu(phdr->version); - size = le32_to_cpu(phdr->size); - type = le32_to_cpu(phdr->type); - dst = le32_to_cpu(phdr->dst_port_id); - - psize = (size + 3) & ~3; + size_t hdrlen; - if (ver != QRTR_PROTO_VER) - return -EINVAL; - - if (len != psize + QRTR_HDR_SIZE) - return -EINVAL; - - if (dst != QRTR_PORT_CTRL && type != QRTR_TYPE_DATA) + if (len & 3) return -EINVAL; skb = netdev_alloc_skb(NULL, len); if (!skb) return -ENOMEM; - skb_reset_transport_header(skb); - skb_put_data(skb, data, len); - - skb_queue_tail(&node->rx_queue, skb); - schedule_work(&node->work); - - return 0; -} -EXPORT_SYMBOL_GPL(qrtr_endpoint_post); + cb = (struct qrtr_cb *)skb->cb; -static struct sk_buff *qrtr_alloc_ctrl_packet(u32 type, size_t pkt_len, - u32 src_node, u32 dst_node) -{ - struct qrtr_hdr *hdr; - struct sk_buff *skb; - - skb = alloc_skb(QRTR_HDR_SIZE + pkt_len, GFP_KERNEL); - if (!skb) - return NULL; - skb_reset_transport_header(skb); + /* Version field in v1 is little endian, so this works for both cases */ + ver = *(u8*)data; - hdr = skb_put(skb, QRTR_HDR_SIZE); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); - hdr->type = cpu_to_le32(type); - hdr->src_node_id = cpu_to_le32(src_node); - hdr->src_port_id = cpu_to_le32(QRTR_PORT_CTRL); - hdr->confirm_rx = cpu_to_le32(0); - hdr->size = cpu_to_le32(pkt_len); - hdr->dst_node_id = cpu_to_le32(dst_node); - hdr->dst_port_id = cpu_to_le32(QRTR_PORT_CTRL); + switch (ver) { + case QRTR_PROTO_VER_1: + v1 = data; + hdrlen = sizeof(*v1); - return skb; -} + cb->type = le32_to_cpu(v1->type); + cb->src_node = le32_to_cpu(v1->src_node_id); + cb->src_port = le32_to_cpu(v1->src_port_id); + cb->confirm_rx = !!v1->confirm_rx; + cb->dst_node = le32_to_cpu(v1->dst_node_id); + cb->dst_port = le32_to_cpu(v1->dst_port_id); -/* Allocate and construct a resume-tx packet. */ -static struct sk_buff *qrtr_alloc_resume_tx(u32 src_node, - u32 dst_node, u32 port) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; + size = le32_to_cpu(v1->size); + break; + case QRTR_PROTO_VER_2: + v2 = data; + hdrlen = sizeof(*v2) + v2->optlen; + + cb->type = v2->type; + cb->confirm_rx = !!(v2->flags & QRTR_FLAGS_CONFIRM_RX); + cb->src_node = le16_to_cpu(v2->src_node_id); + cb->src_port = le16_to_cpu(v2->src_port_id); + cb->dst_node = le16_to_cpu(v2->dst_node_id); + cb->dst_port = le16_to_cpu(v2->dst_port_id); + + if (cb->src_port == (u16)QRTR_PORT_CTRL) + cb->src_port = QRTR_PORT_CTRL; + if (cb->dst_port == (u16)QRTR_PORT_CTRL) + cb->dst_port = QRTR_PORT_CTRL; + + size = le32_to_cpu(v2->size); + break; + default: + pr_err("qrtr: Invalid version %d\n", ver); + goto err; + } - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_RESUME_TX, pkt_len, - src_node, dst_node); - if (!skb) - return NULL; + if (len != ALIGN(size, 4) + hdrlen) + goto err; - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_RESUME_TX); - buf[1] = cpu_to_le32(src_node); - buf[2] = cpu_to_le32(port); + if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA) + goto err; - return skb; -} + skb_put_data(skb, data + hdrlen, size); -/* Allocate and construct a BYE message to signal remote termination */ -static struct sk_buff *qrtr_alloc_local_bye(u32 src_node) -{ - const int pkt_len = 20; - struct sk_buff *skb; - __le32 *buf; + skb_queue_tail(&node->rx_queue, skb); + schedule_work(&node->work); - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_BYE, pkt_len, - src_node, qrtr_local_nid); - if (!skb) - return NULL; + return 0; - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_BYE); +err: + kfree_skb(skb); + return -EINVAL; - return skb; } +EXPORT_SYMBOL_GPL(qrtr_endpoint_post); -static struct sk_buff *qrtr_alloc_del_client(struct sockaddr_qrtr *sq) +/** + * qrtr_alloc_ctrl_packet() - allocate control packet skb + * @pkt: reference to qrtr_ctrl_pkt pointer + * + * Returns newly allocated sk_buff, or NULL on failure + * + * This function allocates a sk_buff large enough to carry a qrtr_ctrl_pkt and + * on success returns a reference to the control packet in @pkt. + */ +static struct sk_buff *qrtr_alloc_ctrl_packet(struct qrtr_ctrl_pkt **pkt) { - const int pkt_len = 20; + const int pkt_len = sizeof(struct qrtr_ctrl_pkt); struct sk_buff *skb; - __le32 *buf; - skb = qrtr_alloc_ctrl_packet(QRTR_TYPE_DEL_CLIENT, pkt_len, - sq->sq_node, QRTR_NODE_BCAST); + skb = alloc_skb(QRTR_HDR_MAX_SIZE + pkt_len, GFP_KERNEL); if (!skb) return NULL; - buf = skb_put_zero(skb, pkt_len); - buf[0] = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); - buf[1] = cpu_to_le32(sq->sq_node); - buf[2] = cpu_to_le32(sq->sq_port); + skb_reserve(skb, QRTR_HDR_MAX_SIZE); + *pkt = skb_put_zero(skb, pkt_len); return skb; } @@ -340,24 +363,26 @@ static void qrtr_port_put(struct qrtr_sock *ipc); static void qrtr_node_rx_work(struct work_struct *work) { struct qrtr_node *node = container_of(work, struct qrtr_node, work); + struct qrtr_ctrl_pkt *pkt; + struct sockaddr_qrtr dst; + struct sockaddr_qrtr src; struct sk_buff *skb; while ((skb = skb_dequeue(&node->rx_queue)) != NULL) { - const struct qrtr_hdr *phdr; - u32 dst_node, dst_port; struct qrtr_sock *ipc; - u32 src_node; + struct qrtr_cb *cb; int confirm; - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - src_node = le32_to_cpu(phdr->src_node_id); - dst_node = le32_to_cpu(phdr->dst_node_id); - dst_port = le32_to_cpu(phdr->dst_port_id); - confirm = !!phdr->confirm_rx; + cb = (struct qrtr_cb *)skb->cb; + src.sq_node = cb->src_node; + src.sq_port = cb->src_port; + dst.sq_node = cb->dst_node; + dst.sq_port = cb->dst_port; + confirm = !!cb->confirm_rx; - qrtr_node_assign(node, src_node); + qrtr_node_assign(node, cb->src_node); - ipc = qrtr_port_lookup(dst_port); + ipc = qrtr_port_lookup(cb->dst_port); if (!ipc) { kfree_skb(skb); } else { @@ -368,10 +393,16 @@ static void qrtr_node_rx_work(struct work_struct *work) } if (confirm) { - skb = qrtr_alloc_resume_tx(dst_node, node->nid, dst_port); + skb = qrtr_alloc_ctrl_packet(&pkt); if (!skb) break; - if (qrtr_node_enqueue(node, skb)) + + pkt->cmd = cpu_to_le32(QRTR_TYPE_RESUME_TX); + pkt->client.node = cpu_to_le32(dst.sq_node); + pkt->client.port = cpu_to_le32(dst.sq_port); + + if (qrtr_node_enqueue(node, skb, QRTR_TYPE_RESUME_TX, + &dst, &src)) break; } } @@ -421,6 +452,9 @@ EXPORT_SYMBOL_GPL(qrtr_endpoint_register); void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) { struct qrtr_node *node = ep->node; + struct sockaddr_qrtr src = {AF_QIPCRTR, node->nid, QRTR_PORT_CTRL}; + struct sockaddr_qrtr dst = {AF_QIPCRTR, qrtr_local_nid, QRTR_PORT_CTRL}; + struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; mutex_lock(&node->ep_lock); @@ -428,9 +462,11 @@ void qrtr_endpoint_unregister(struct qrtr_endpoint *ep) mutex_unlock(&node->ep_lock); /* Notify the local controller about the event */ - skb = qrtr_alloc_local_bye(node->nid); - if (skb) - qrtr_local_enqueue(NULL, skb); + skb = qrtr_alloc_ctrl_packet(&pkt); + if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_BYE); + qrtr_local_enqueue(NULL, skb, QRTR_TYPE_BYE, &src, &dst); + } qrtr_node_release(node); ep->node = NULL; @@ -466,13 +502,24 @@ static void qrtr_port_put(struct qrtr_sock *ipc) /* Remove port assignment. */ static void qrtr_port_remove(struct qrtr_sock *ipc) { + struct qrtr_ctrl_pkt *pkt; struct sk_buff *skb; int port = ipc->us.sq_port; + struct sockaddr_qrtr to; - skb = qrtr_alloc_del_client(&ipc->us); + to.sq_family = AF_QIPCRTR; + to.sq_node = QRTR_NODE_BCAST; + to.sq_port = QRTR_PORT_CTRL; + + skb = qrtr_alloc_ctrl_packet(&pkt); if (skb) { + pkt->cmd = cpu_to_le32(QRTR_TYPE_DEL_CLIENT); + pkt->client.node = cpu_to_le32(ipc->us.sq_node); + pkt->client.port = cpu_to_le32(ipc->us.sq_port); + skb_set_owner_w(skb, &ipc->sk); - qrtr_bcast_enqueue(NULL, skb); + qrtr_bcast_enqueue(NULL, skb, QRTR_TYPE_DEL_CLIENT, &ipc->us, + &to); } if (port == QRTR_PORT_CTRL) @@ -541,7 +588,7 @@ static void qrtr_reset_ports(void) sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; - wake_up_interruptible(sk_sleep(&ipc->sk)); + ipc->sk.sk_error_report(&ipc->sk); sock_put(&ipc->sk); } mutex_unlock(&qrtr_port_lock); @@ -620,19 +667,23 @@ static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) } /* Queue packet to local peer socket. */ -static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { - const struct qrtr_hdr *phdr; struct qrtr_sock *ipc; + struct qrtr_cb *cb; - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - - ipc = qrtr_port_lookup(le32_to_cpu(phdr->dst_port_id)); + ipc = qrtr_port_lookup(to->sq_port); if (!ipc || &ipc->sk == skb->sk) { /* do not send to self */ kfree_skb(skb); return -ENODEV; } + cb = (struct qrtr_cb *)skb->cb; + cb->src_node = from->sq_node; + cb->src_port = from->sq_port; + if (sock_queue_rcv_skb(&ipc->sk, skb)) { qrtr_port_put(ipc); kfree_skb(skb); @@ -645,7 +696,9 @@ static int qrtr_local_enqueue(struct qrtr_node *node, struct sk_buff *skb) } /* Queue packet for broadcast. */ -static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) +static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb, + int type, struct sockaddr_qrtr *from, + struct sockaddr_qrtr *to) { struct sk_buff *skbn; @@ -655,11 +708,11 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) if (!skbn) break; skb_set_owner_w(skbn, skb->sk); - qrtr_node_enqueue(node, skbn); + qrtr_node_enqueue(node, skbn, type, from, to); } mutex_unlock(&qrtr_node_lock); - qrtr_local_enqueue(node, skb); + qrtr_local_enqueue(node, skb, type, from, to); return 0; } @@ -667,13 +720,14 @@ static int qrtr_bcast_enqueue(struct qrtr_node *node, struct sk_buff *skb) static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); - int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *); + int (*enqueue_fn)(struct qrtr_node *, struct sk_buff *, int, + struct sockaddr_qrtr *, struct sockaddr_qrtr *); struct qrtr_sock *ipc = qrtr_sk(sock->sk); struct sock *sk = sock->sk; struct qrtr_node *node; - struct qrtr_hdr *hdr; struct sk_buff *skb; size_t plen; + u32 type = QRTR_TYPE_DATA; int rc; if (msg->msg_flags & ~(MSG_DONTWAIT)) @@ -722,37 +776,19 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } plen = (len + 3) & ~3; - skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_SIZE, + skb = sock_alloc_send_skb(sk, plen + QRTR_HDR_MAX_SIZE, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) goto out_node; - skb_reset_transport_header(skb); - skb_put(skb, len + QRTR_HDR_SIZE); - - hdr = (struct qrtr_hdr *)skb_transport_header(skb); - hdr->version = cpu_to_le32(QRTR_PROTO_VER); - hdr->src_node_id = cpu_to_le32(ipc->us.sq_node); - hdr->src_port_id = cpu_to_le32(ipc->us.sq_port); - hdr->confirm_rx = cpu_to_le32(0); - hdr->size = cpu_to_le32(len); - hdr->dst_node_id = cpu_to_le32(addr->sq_node); - hdr->dst_port_id = cpu_to_le32(addr->sq_port); + skb_reserve(skb, QRTR_HDR_MAX_SIZE); - rc = skb_copy_datagram_from_iter(skb, QRTR_HDR_SIZE, - &msg->msg_iter, len); + rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc) { kfree_skb(skb); goto out_node; } - if (plen != len) { - rc = skb_pad(skb, plen - len); - if (rc) - goto out_node; - skb_put(skb, plen - len); - } - if (ipc->us.sq_port == QRTR_PORT_CTRL) { if (len < 4) { rc = -EINVAL; @@ -761,12 +797,11 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } /* control messages already require the type as 'command' */ - skb_copy_bits(skb, QRTR_HDR_SIZE, &hdr->type, 4); - } else { - hdr->type = cpu_to_le32(QRTR_TYPE_DATA); + skb_copy_bits(skb, 0, &type, 4); + type = le32_to_cpu(type); } - rc = enqueue_fn(node, skb); + rc = enqueue_fn(node, skb, type, &ipc->us, addr); if (rc >= 0) rc = len; @@ -781,9 +816,9 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, msg->msg_name); - const struct qrtr_hdr *phdr; struct sock *sk = sock->sk; struct sk_buff *skb; + struct qrtr_cb *cb; int copied, rc; lock_sock(sk); @@ -800,22 +835,22 @@ static int qrtr_recvmsg(struct socket *sock, struct msghdr *msg, return rc; } - phdr = (const struct qrtr_hdr *)skb_transport_header(skb); - copied = le32_to_cpu(phdr->size); + copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_msg(skb, QRTR_HDR_SIZE, msg, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc < 0) goto out; rc = copied; if (addr) { + cb = (struct qrtr_cb *)skb->cb; addr->sq_family = AF_QIPCRTR; - addr->sq_node = le32_to_cpu(phdr->src_node_id); - addr->sq_port = le32_to_cpu(phdr->src_port_id); + addr->sq_node = cb->src_node; + addr->sq_port = cb->src_port; msg->msg_namelen = sizeof(*addr); } @@ -908,7 +943,7 @@ static int qrtr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: skb = skb_peek(&sk->sk_receive_queue); if (skb) - len = skb->len - QRTR_HDR_SIZE; + len = skb->len; rc = put_user(len, (int __user *)argp); break; case SIOCGIFADDR: diff --git a/net/sched/act_api.c b/net/sched/act_api.c index da6fa82c98a8..ac97db92ab68 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -21,6 +21,8 @@ #include <linux/kmod.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/rhashtable.h> +#include <linux/list.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/sch_generic.h> @@ -1249,8 +1251,226 @@ out_module_put: return skb->len; } +struct tcf_action_net { + struct rhashtable egdev_ht; +}; + +static unsigned int tcf_action_net_id; + +struct tcf_action_egdev_cb { + struct list_head list; + tc_setup_cb_t *cb; + void *cb_priv; +}; + +struct tcf_action_egdev { + struct rhash_head ht_node; + const struct net_device *dev; + unsigned int refcnt; + struct list_head cb_list; +}; + +static const struct rhashtable_params tcf_action_egdev_ht_params = { + .key_offset = offsetof(struct tcf_action_egdev, dev), + .head_offset = offsetof(struct tcf_action_egdev, ht_node), + .key_len = sizeof(const struct net_device *), +}; + +static struct tcf_action_egdev * +tcf_action_egdev_lookup(const struct net_device *dev) +{ + struct net *net = dev_net(dev); + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_lookup_fast(&tan->egdev_ht, &dev, + tcf_action_egdev_ht_params); +} + +static struct tcf_action_egdev * +tcf_action_egdev_get(const struct net_device *dev) +{ + struct tcf_action_egdev *egdev; + struct tcf_action_net *tan; + + egdev = tcf_action_egdev_lookup(dev); + if (egdev) + goto inc_ref; + + egdev = kzalloc(sizeof(*egdev), GFP_KERNEL); + if (!egdev) + return NULL; + INIT_LIST_HEAD(&egdev->cb_list); + tan = net_generic(dev_net(dev), tcf_action_net_id); + rhashtable_insert_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + +inc_ref: + egdev->refcnt++; + return egdev; +} + +static void tcf_action_egdev_put(struct tcf_action_egdev *egdev) +{ + struct tcf_action_net *tan; + + if (--egdev->refcnt) + return; + tan = net_generic(dev_net(egdev->dev), tcf_action_net_id); + rhashtable_remove_fast(&tan->egdev_ht, &egdev->ht_node, + tcf_action_egdev_ht_params); + kfree(egdev); +} + +static struct tcf_action_egdev_cb * +tcf_action_egdev_cb_lookup(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) + if (egdev_cb->cb == cb && egdev_cb->cb_priv == cb_priv) + return egdev_cb; + return NULL; +} + +static int tcf_action_egdev_cb_call(struct tcf_action_egdev *egdev, + enum tc_setup_type type, + void *type_data, bool err_stop) +{ + struct tcf_action_egdev_cb *egdev_cb; + int ok_count = 0; + int err; + + list_for_each_entry(egdev_cb, &egdev->cb_list, list) { + err = egdev_cb->cb(type, type_data, egdev_cb->cb_priv); + if (err) { + if (err_stop) + return err; + } else { + ok_count++; + } + } + return ok_count; +} + +static int tcf_action_egdev_cb_add(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(egdev_cb)) + return -EEXIST; + egdev_cb = kzalloc(sizeof(*egdev_cb), GFP_KERNEL); + if (!egdev_cb) + return -ENOMEM; + egdev_cb->cb = cb; + egdev_cb->cb_priv = cb_priv; + list_add(&egdev_cb->list, &egdev->cb_list); + return 0; +} + +static void tcf_action_egdev_cb_del(struct tcf_action_egdev *egdev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev_cb *egdev_cb; + + egdev_cb = tcf_action_egdev_cb_lookup(egdev, cb, cb_priv); + if (WARN_ON(!egdev_cb)) + return; + list_del(&egdev_cb->list); + kfree(egdev_cb); +} + +static int __tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_get(dev); + int err; + + if (!egdev) + return -ENOMEM; + err = tcf_action_egdev_cb_add(egdev, cb, cb_priv); + if (err) + goto err_cb_add; + return 0; + +err_cb_add: + tcf_action_egdev_put(egdev); + return err; +} +int tc_setup_cb_egdev_register(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + int err; + + rtnl_lock(); + err = __tc_setup_cb_egdev_register(dev, cb, cb_priv); + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_register); + +static void __tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (WARN_ON(!egdev)) + return; + tcf_action_egdev_cb_del(egdev, cb, cb_priv); + tcf_action_egdev_put(egdev); +} +void tc_setup_cb_egdev_unregister(const struct net_device *dev, + tc_setup_cb_t *cb, void *cb_priv) +{ + rtnl_lock(); + __tc_setup_cb_egdev_unregister(dev, cb, cb_priv); + rtnl_unlock(); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_unregister); + +int tc_setup_cb_egdev_call(const struct net_device *dev, + enum tc_setup_type type, void *type_data, + bool err_stop) +{ + struct tcf_action_egdev *egdev = tcf_action_egdev_lookup(dev); + + if (!egdev) + return 0; + return tcf_action_egdev_cb_call(egdev, type, type_data, err_stop); +} +EXPORT_SYMBOL_GPL(tc_setup_cb_egdev_call); + +static __net_init int tcf_action_net_init(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + return rhashtable_init(&tan->egdev_ht, &tcf_action_egdev_ht_params); +} + +static void __net_exit tcf_action_net_exit(struct net *net) +{ + struct tcf_action_net *tan = net_generic(net, tcf_action_net_id); + + rhashtable_destroy(&tan->egdev_ht); +} + +static struct pernet_operations tcf_action_net_ops = { + .init = tcf_action_net_init, + .exit = tcf_action_net_exit, + .id = &tcf_action_net_id, + .size = sizeof(struct tcf_action_net), +}; + static int __init tc_action_init(void) { + int err; + + err = register_pernet_subsys(&tcf_action_net_ops); + if (err) + return err; + rtnl_register(PF_UNSPEC, RTM_NEWACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELACTION, tc_ctl_action, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETACTION, tc_ctl_action, tc_dump_action, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 8ccd35825b6b..252ee7d8c731 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -248,6 +248,20 @@ static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len) return ret; } +static const char *ife_meta_id2name(u32 metaid) +{ + switch (metaid) { + case IFE_META_SKBMARK: + return "skbmark"; + case IFE_META_PRIO: + return "skbprio"; + case IFE_META_TCINDEX: + return "tcindex"; + default: + return "unknown"; + } +} + /* called when adding new meta information * under ife->tcf_lock for existing action */ @@ -263,7 +277,7 @@ static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid, if (exists) spin_unlock_bh(&ife->tcf_lock); rtnl_unlock(); - request_module("ifemeta%u", metaid); + request_module("ife-meta-%s", ife_meta_id2name(metaid)); rtnl_lock(); if (exists) spin_lock_bh(&ife->tcf_lock); @@ -392,10 +406,14 @@ static void _tcf_ife_cleanup(struct tc_action *a, int bind) static void tcf_ife_cleanup(struct tc_action *a, int bind) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; spin_lock_bh(&ife->tcf_lock); _tcf_ife_cleanup(a, bind); spin_unlock_bh(&ife->tcf_lock); + + p = rcu_dereference_protected(ife->params, 1); + kfree_rcu(p, rcu); } /* under ife->tcf_lock for existing action */ @@ -432,6 +450,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tc_action_net *tn = net_generic(net, ife_net_id); struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; + struct tcf_ife_params *p, *p_old; struct tcf_ife_info *ife; u16 ife_type = ETH_P_IFE; struct tc_ife *parm; @@ -450,24 +469,41 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_IFE_PARMS]); + /* IFE_DECODE is 0 and indicates the opposite of IFE_ENCODE because + * they cannot run as the same time. Check on all other values which + * are not supported right now. + */ + if (parm->flags & ~IFE_ENCODE) + return -EINVAL; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + exists = tcf_idr_check(tn, parm->index, a, bind); - if (exists && bind) + if (exists && bind) { + kfree(p); return 0; + } if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_ife_ops, - bind, false); - if (ret) + bind, true); + if (ret) { + kfree(p); return ret; + } ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); - if (!ovr) + if (!ovr) { + kfree(p); return -EEXIST; + } } ife = to_ife(*a); - ife->flags = parm->flags; + p->flags = parm->flags; if (parm->flags & IFE_ENCODE) { if (tb[TCA_IFE_TYPE]) @@ -478,24 +514,25 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, saddr = nla_data(tb[TCA_IFE_SMAC]); } - if (exists) - spin_lock_bh(&ife->tcf_lock); ife->tcf_action = parm->action; if (parm->flags & IFE_ENCODE) { if (daddr) - ether_addr_copy(ife->eth_dst, daddr); + ether_addr_copy(p->eth_dst, daddr); else - eth_zero_addr(ife->eth_dst); + eth_zero_addr(p->eth_dst); if (saddr) - ether_addr_copy(ife->eth_src, saddr); + ether_addr_copy(p->eth_src, saddr); else - eth_zero_addr(ife->eth_src); + eth_zero_addr(p->eth_src); - ife->eth_type = ife_type; + p->eth_type = ife_type; } + if (exists) + spin_lock_bh(&ife->tcf_lock); + if (ret == ACT_P_CREATED) INIT_LIST_HEAD(&ife->metalist); @@ -511,6 +548,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } @@ -531,6 +569,7 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + kfree(p); return err; } } @@ -538,6 +577,11 @@ metadata_parse_err: if (exists) spin_unlock_bh(&ife->tcf_lock); + p_old = rtnl_dereference(ife->params); + rcu_assign_pointer(ife->params, p); + if (p_old) + kfree_rcu(p_old, rcu); + if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); @@ -549,12 +593,13 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, { unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p = rtnl_dereference(ife->params); struct tc_ife opt = { .index = ife->tcf_index, .refcnt = ife->tcf_refcnt - ref, .bindcnt = ife->tcf_bindcnt - bind, .action = ife->tcf_action, - .flags = ife->flags, + .flags = p->flags, }; struct tcf_t t; @@ -565,17 +610,17 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, if (nla_put_64bit(skb, TCA_IFE_TM, sizeof(t), &t, TCA_IFE_PAD)) goto nla_put_failure; - if (!is_zero_ether_addr(ife->eth_dst)) { - if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst)) + if (!is_zero_ether_addr(p->eth_dst)) { + if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, p->eth_dst)) goto nla_put_failure; } - if (!is_zero_ether_addr(ife->eth_src)) { - if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src)) + if (!is_zero_ether_addr(p->eth_src)) { + if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, p->eth_src)) goto nla_put_failure; } - if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type)) + if (nla_put(skb, TCA_IFE_TYPE, 2, &p->eth_type)) goto nla_put_failure; if (dump_metalist(skb, ife)) { @@ -617,19 +662,15 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, u8 *tlv_data; u16 metalen; - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); - spin_unlock(&ife->tcf_lock); if (skb_at_tc_ingress(skb)) skb_push(skb, skb->dev->hard_header_len); tlv_data = ife_decode(skb, &metalen); if (unlikely(!tlv_data)) { - spin_lock(&ife->tcf_lock); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -647,14 +688,12 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, */ pr_info_ratelimited("Unknown metaid %d dlen %d\n", mtype, dlen); - ife->tcf_qstats.overlimits++; + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); } } if (WARN_ON(tlv_data != ifehdr_end)) { - spin_lock(&ife->tcf_lock); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -683,7 +722,7 @@ static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife) } static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, - struct tcf_result *res) + struct tcf_result *res, struct tcf_ife_params *p) { struct tcf_ife_info *ife = to_ife(a); int action = ife->tcf_action; @@ -706,23 +745,20 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, exceed_mtu = true; } - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); + bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); tcf_lastuse_update(&ife->tcf_tm); if (!metalen) { /* no metadata to send */ /* abuse overlimits to count when we allow packet * with no metadata */ - ife->tcf_qstats.overlimits++; - spin_unlock(&ife->tcf_lock); + qstats_overlimit_inc(this_cpu_ptr(ife->common.cpu_qstats)); return action; } /* could be stupid policy setup or mtu config * so lets be conservative.. */ if ((action == TC_ACT_SHOT) || exceed_mtu) { - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } @@ -731,6 +767,8 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, ife_meta = ife_encode(skb, metalen); + spin_lock(&ife->tcf_lock); + /* XXX: we dont have a clever way of telling encode to * not repeat some of the computations that are done by * ops->presence_check... @@ -742,25 +780,24 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, } if (err < 0) { /* too corrupt to keep around if overwritten */ - ife->tcf_qstats.drops++; spin_unlock(&ife->tcf_lock); + qstats_drop_inc(this_cpu_ptr(ife->common.cpu_qstats)); return TC_ACT_SHOT; } skboff += err; } + spin_unlock(&ife->tcf_lock); oethh = (struct ethhdr *)skb->data; - if (!is_zero_ether_addr(ife->eth_src)) - ether_addr_copy(oethh->h_source, ife->eth_src); - if (!is_zero_ether_addr(ife->eth_dst)) - ether_addr_copy(oethh->h_dest, ife->eth_dst); - oethh->h_proto = htons(ife->eth_type); + if (!is_zero_ether_addr(p->eth_src)) + ether_addr_copy(oethh->h_source, p->eth_src); + if (!is_zero_ether_addr(p->eth_dst)) + ether_addr_copy(oethh->h_dest, p->eth_dst); + oethh->h_proto = htons(p->eth_type); if (skb_at_tc_ingress(skb)) skb_pull(skb, skb->dev->hard_header_len); - spin_unlock(&ife->tcf_lock); - return action; } @@ -768,21 +805,19 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_ife_info *ife = to_ife(a); + struct tcf_ife_params *p; + int ret; + + rcu_read_lock(); + p = rcu_dereference(ife->params); + if (p->flags & IFE_ENCODE) { + ret = tcf_ife_encode(skb, a, res, p); + rcu_read_unlock(); + return ret; + } + rcu_read_unlock(); - if (ife->flags & IFE_ENCODE) - return tcf_ife_encode(skb, a, res); - - if (!(ife->flags & IFE_ENCODE)) - return tcf_ife_decode(skb, a, res); - - pr_info_ratelimited("unknown failure(policy neither de/encode\n"); - spin_lock(&ife->tcf_lock); - bstats_update(&ife->tcf_bstats, skb); - tcf_lastuse_update(&ife->tcf_tm); - ife->tcf_qstats.drops++; - spin_unlock(&ife->tcf_lock); - - return TC_ACT_SHOT; + return tcf_ife_decode(skb, a, res); } static int tcf_ife_walker(struct net *net, struct sk_buff *skb, diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c index 82892170ce4f..1e3f10e5da99 100644 --- a/net/sched/act_meta_mark.c +++ b/net/sched/act_meta_mark.c @@ -76,4 +76,4 @@ module_exit(ifemark_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb mark metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBMARK); +MODULE_ALIAS_IFE_META("skbmark"); diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c index 26bf4d86030b..4033f9fc4d4a 100644 --- a/net/sched/act_meta_skbprio.c +++ b/net/sched/act_meta_skbprio.c @@ -73,4 +73,4 @@ module_exit(ifeprio_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2015)"); MODULE_DESCRIPTION("Inter-FE skb prio metadata action"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_PRIO); +MODULE_ALIAS_IFE_META("skbprio"); diff --git a/net/sched/act_meta_skbtcindex.c b/net/sched/act_meta_skbtcindex.c index 3b35774ce890..2ea1f26c9e96 100644 --- a/net/sched/act_meta_skbtcindex.c +++ b/net/sched/act_meta_skbtcindex.c @@ -76,4 +76,4 @@ module_exit(ifetc_index_cleanup_module); MODULE_AUTHOR("Jamal Hadi Salim(2016)"); MODULE_DESCRIPTION("Inter-FE skb tc_index metadata module"); MODULE_LICENSE("GPL"); -MODULE_ALIAS_IFE_META(IFE_META_SKBTCINDEX); +MODULE_ALIAS_IFE_META("tcindex"); diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 416627c66f08..8b3e59388480 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -140,6 +140,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; + m->net = net; if (ret != ACT_P_CREATED) dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); @@ -313,15 +314,11 @@ static struct notifier_block mirred_device_notifier = { .notifier_call = mirred_device_event, }; -static int tcf_mirred_device(const struct tc_action *a, struct net *net, - struct net_device **mirred_dev) +static struct net_device *tcf_mirred_get_dev(const struct tc_action *a) { - int ifindex = tcf_mirred_ifindex(a); + struct tcf_mirred *m = to_mirred(a); - *mirred_dev = __dev_get_by_index(net, ifindex); - if (!*mirred_dev) - return -EINVAL; - return 0; + return __dev_get_by_index(m->net, m->tcfm_ifindex); } static struct tc_action_ops act_mirred_ops = { @@ -336,7 +333,7 @@ static struct tc_action_ops act_mirred_ops = { .walk = tcf_mirred_walker, .lookup = tcf_mirred_search, .size = sizeof(struct tcf_mirred), - .get_dev = tcf_mirred_device, + .get_dev = tcf_mirred_get_dev, }; static __net_init int mirred_init_net(struct net *net) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0b2219adf520..2977b8a90851 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1004,29 +1004,42 @@ int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) } EXPORT_SYMBOL(tcf_exts_dump_stats); -int tcf_exts_get_dev(struct net_device *dev, struct tcf_exts *exts, - struct net_device **hw_dev) +static int tc_exts_setup_cb_egdev_call(struct tcf_exts *exts, + enum tc_setup_type type, + void *type_data, bool err_stop) { + int ok_count = 0; #ifdef CONFIG_NET_CLS_ACT const struct tc_action *a; + struct net_device *dev; LIST_HEAD(actions); + int ret; if (!tcf_exts_has_actions(exts)) - return -EINVAL; + return 0; tcf_exts_to_list(exts, &actions); list_for_each_entry(a, &actions, list) { - if (a->ops->get_dev) { - a->ops->get_dev(a, dev_net(dev), hw_dev); - break; - } + if (!a->ops->get_dev) + continue; + dev = a->ops->get_dev(a); + if (!dev || !tc_can_offload(dev)) + continue; + ret = tc_setup_cb_egdev_call(dev, type, type_data, err_stop); + if (ret < 0) + return ret; + ok_count += ret; } - if (*hw_dev) - return 0; #endif - return -EOPNOTSUPP; + return ok_count; +} + +int tc_setup_cb_call(struct tcf_exts *exts, enum tc_setup_type type, + void *type_data, bool err_stop) +{ + return tc_exts_setup_cb_egdev_call(exts, type, type_data, err_stop); } -EXPORT_SYMBOL(tcf_exts_get_dev); +EXPORT_SYMBOL(tc_setup_cb_call); static int __init tc_filter_init(void) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index db831ac708f6..5b7bb968d1d4 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -88,7 +88,6 @@ struct cls_fl_filter { u32 handle; u32 flags; struct rcu_head rcu; - struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) @@ -201,16 +200,17 @@ static void fl_destroy_filter(struct rcu_head *head) static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static int fl_hw_replace_filter(struct tcf_proto *tp, @@ -220,20 +220,9 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; + bool skip_sw = tc_skip_sw(f->flags); int err; - if (!tc_can_offload(dev)) { - if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || - (f->hw_dev && !tc_can_offload(f->hw_dev))) { - f->hw_dev = dev; - return tc_skip_sw(f->flags) ? -EINVAL : 0; - } - dev = f->hw_dev; - cls_flower.egress_dev = true; - } else { - f->hw_dev = dev; - } - tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; @@ -242,31 +231,47 @@ static int fl_hw_replace_filter(struct tcf_proto *tp, cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); - if (!err) - f->flags |= TCA_CLS_FLAGS_IN_HW; + if (tc_can_offload(dev)) { + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + if (err) { + if (skip_sw) + return err; + } else { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + } - if (tc_skip_sw(f->flags)) + err = tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, skip_sw); + if (err < 0) { + fl_hw_destroy_filter(tp, f); return err; + } else if (err > 0) { + f->flags |= TCA_CLS_FLAGS_IN_HW; + } + + if (skip_sw && !(f->flags & TCA_CLS_FLAGS_IN_HW)) + return -EINVAL; + return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; - struct net_device *dev = f->hw_dev; - - if (!tc_can_offload(dev)) - return; + struct net_device *dev = tp->q->dev_queue->dev; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; - dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, - &cls_flower); + if (tc_can_offload(dev)) + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, + &cls_flower); + tc_setup_cb_call(&f->exts, TC_SETUP_CLSFLOWER, + &cls_flower, false); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 5a4f10080290..db0228a65e8c 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -148,12 +148,6 @@ struct netem_skb_cb { psched_time_t time_to_send; }; - -static struct sk_buff *netem_rb_to_skb(struct rb_node *rb) -{ - return rb_entry(rb, struct sk_buff, rbnode); -} - static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ @@ -364,7 +358,7 @@ static void tfifo_reset(struct Qdisc *sch) struct rb_node *p = rb_first(&q->t_root); while (p) { - struct sk_buff *skb = netem_rb_to_skb(p); + struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); @@ -382,7 +376,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) struct sk_buff *skb; parent = *p; - skb = netem_rb_to_skb(parent); + skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else @@ -538,7 +532,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *t_skb; struct netem_skb_cb *t_last; - t_skb = netem_rb_to_skb(rb_last(&q->t_root)); + t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) { @@ -617,7 +611,7 @@ deliver: if (p) { psched_time_t time_to_send; - skb = netem_rb_to_skb(p); + skb = rb_to_skb(p); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 03513a9fa110..0b83ec51e43b 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -124,7 +124,7 @@ static struct sctp_sched_ops sctp_sched_fcfs = { extern struct sctp_sched_ops sctp_sched_prio; extern struct sctp_sched_ops sctp_sched_rr; -struct sctp_sched_ops *sctp_sched_ops[] = { +static struct sctp_sched_ops *sctp_sched_ops[] = { &sctp_sched_fcfs, &sctp_sched_prio, &sctp_sched_rr, diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 20b66e79c5d6..5f6a20084157 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -380,10 +380,14 @@ static int smc_link_determine_gid(struct smc_link_group *lgr) if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, &gattr)) continue; - if (gattr.ndev && - (vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id)) { - lnk->gid = gid; - return 0; + if (gattr.ndev) { + if (is_vlan_dev(gattr.ndev) && + vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) { + lnk->gid = gid; + dev_put(gattr.ndev); + return 0; + } + dev_put(gattr.ndev); } } return -ENODEV; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 0b5852299158..468e1d725d97 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -369,26 +369,17 @@ void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport) { - struct net_device *ndev; + struct ib_gid_attr gattr; int rc; rc = ib_query_gid(smcibdev->ibdev, ibport, 0, - &smcibdev->gid[ibport - 1], NULL); - /* the SMC protocol requires specification of the roce MAC address; - * if net_device cannot be determined, it can be derived from gid 0 - */ - ndev = smcibdev->ibdev->get_netdev(smcibdev->ibdev, ibport); - if (ndev) { - memcpy(&smcibdev->mac, ndev->dev_addr, ETH_ALEN); - dev_put(ndev); - } else if (!rc) { - memcpy(&smcibdev->mac[ibport - 1][0], - &smcibdev->gid[ibport - 1].raw[8], 3); - memcpy(&smcibdev->mac[ibport - 1][3], - &smcibdev->gid[ibport - 1].raw[13], 3); - smcibdev->mac[ibport - 1][0] &= ~0x02; - } - return rc; + &smcibdev->gid[ibport - 1], &gattr); + if (rc || !gattr.ndev) + return -ENODEV; + + memcpy(smcibdev->mac[ibport - 1], gattr.ndev->dev_addr, ETH_ALEN); + dev_put(gattr.ndev); + return 0; } /* Create an identifier unique for this instance of SMC-R. @@ -419,6 +410,7 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport) &smcibdev->pattr[ibport - 1]); if (rc) goto out; + /* the SMC protocol requires specification of the RoCE MAC address */ rc = smc_ib_fill_gid_and_mac(smcibdev, ibport); if (rc) goto out; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 9b5de31aa429..c1841f234a71 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2203,7 +2203,7 @@ static void xs_udp_setup_socket(struct work_struct *work) struct sock_xprt *transport = container_of(work, struct sock_xprt, connect_worker.work); struct rpc_xprt *xprt = &transport->xprt; - struct socket *sock = transport->sock; + struct socket *sock; int status = -EIO; sock = xs_create_sock(xprt, transport, diff --git a/net/tipc/Makefile b/net/tipc/Makefile index 31b9f9c52974..a3af73ec0b78 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -8,7 +8,7 @@ tipc-y += addr.o bcast.o bearer.o \ core.o link.o discover.o msg.o \ name_distr.o subscr.o monitor.o name_table.o net.o \ netlink.o netlink_compat.o node.o socket.o eth_media.o \ - server.o socket.o + server.o socket.o group.o tipc-$(CONFIG_TIPC_MEDIA_UDP) += udp_media.o tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 7d99029df342..329325bd553e 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -233,7 +233,7 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, struct sk_buff_head xmitq; int rc = 0; - __skb_queue_head_init(&xmitq); + skb_queue_head_init(&xmitq); tipc_bcast_lock(net); if (tipc_link_bc_peers(l)) rc = tipc_link_xmit(l, pkts, &xmitq); @@ -258,20 +258,20 @@ static int tipc_bcast_xmit(struct net *net, struct sk_buff_head *pkts, static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_nlist *dests, u16 *cong_link_cnt) { + struct tipc_dest *dst, *tmp; struct sk_buff_head _pkts; - struct u32_item *n, *tmp; - u32 dst, selector; + u32 dnode, selector; selector = msg_link_selector(buf_msg(skb_peek(pkts))); - __skb_queue_head_init(&_pkts); + skb_queue_head_init(&_pkts); - list_for_each_entry_safe(n, tmp, &dests->list, list) { - dst = n->value; - if (!tipc_msg_pskb_copy(dst, pkts, &_pkts)) + list_for_each_entry_safe(dst, tmp, &dests->list, list) { + dnode = dst->node; + if (!tipc_msg_pskb_copy(dnode, pkts, &_pkts)) return -ENOMEM; /* Any other return value than -ELINKCONG is ignored */ - if (tipc_node_xmit(net, &_pkts, dst, selector) == -ELINKCONG) + if (tipc_node_xmit(net, &_pkts, dnode, selector) == -ELINKCONG) (*cong_link_cnt)++; } return 0; @@ -554,7 +554,7 @@ void tipc_nlist_add(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = true; - else if (u32_push(&nl->list, node)) + else if (tipc_dest_push(&nl->list, node, 0)) nl->remote++; } @@ -562,13 +562,13 @@ void tipc_nlist_del(struct tipc_nlist *nl, u32 node) { if (node == nl->self) nl->local = false; - else if (u32_del(&nl->list, node)) + else if (tipc_dest_del(&nl->list, node, 0)) nl->remote--; } void tipc_nlist_purge(struct tipc_nlist *nl) { - u32_list_purge(&nl->list); + tipc_dest_list_purge(&nl->list); nl->remote = 0; nl->local = 0; } diff --git a/net/tipc/core.h b/net/tipc/core.h index 5cc5398be722..964342689f2c 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -132,6 +132,11 @@ static inline struct list_head *tipc_nodes(struct net *net) return &tipc_net(net)->node_list; } +static inline struct tipc_server *tipc_topsrv(struct net *net) +{ + return tipc_net(net)->topsrv; +} + static inline unsigned int tipc_hashfn(u32 addr) { return addr & (NODE_HTABLE_SIZE - 1); diff --git a/net/tipc/group.c b/net/tipc/group.c new file mode 100644 index 000000000000..7821085a7dd8 --- /dev/null +++ b/net/tipc/group.c @@ -0,0 +1,871 @@ +/* + * net/tipc/group.c: TIPC group messaging code + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "core.h" +#include "addr.h" +#include "group.h" +#include "bcast.h" +#include "server.h" +#include "msg.h" +#include "socket.h" +#include "node.h" +#include "name_table.h" +#include "subscr.h" + +#define ADV_UNIT (((MAX_MSG_SIZE + MAX_H_SIZE) / FLOWCTL_BLK_SZ) + 1) +#define ADV_IDLE ADV_UNIT +#define ADV_ACTIVE (ADV_UNIT * 12) + +enum mbr_state { + MBR_QUARANTINED, + MBR_DISCOVERED, + MBR_JOINING, + MBR_PUBLISHED, + MBR_JOINED, + MBR_PENDING, + MBR_ACTIVE, + MBR_RECLAIMING, + MBR_REMITTED, + MBR_LEAVING +}; + +struct tipc_member { + struct rb_node tree_node; + struct list_head list; + struct list_head congested; + struct sk_buff *event_msg; + struct sk_buff_head deferredq; + struct tipc_group *group; + u32 node; + u32 port; + u32 instance; + enum mbr_state state; + u16 advertised; + u16 window; + u16 bc_rcv_nxt; + u16 bc_syncpt; + u16 bc_acked; + bool usr_pending; +}; + +struct tipc_group { + struct rb_root members; + struct list_head congested; + struct list_head pending; + struct list_head active; + struct list_head reclaiming; + struct tipc_nlist dests; + struct net *net; + int subid; + u32 type; + u32 instance; + u32 domain; + u32 scope; + u32 portid; + u16 member_cnt; + u16 active_cnt; + u16 max_active; + u16 bc_snd_nxt; + u16 bc_ackers; + bool loopback; + bool events; +}; + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq); + +static void tipc_group_decr_active(struct tipc_group *grp, + struct tipc_member *m) +{ + if (m->state == MBR_ACTIVE || m->state == MBR_RECLAIMING) + grp->active_cnt--; +} + +static int tipc_group_rcvbuf_limit(struct tipc_group *grp) +{ + int max_active, active_pool, idle_pool; + int mcnt = grp->member_cnt + 1; + + /* Limit simultaneous reception from other members */ + max_active = min(mcnt / 8, 64); + max_active = max(max_active, 16); + grp->max_active = max_active; + + /* Reserve blocks for active and idle members */ + active_pool = max_active * ADV_ACTIVE; + idle_pool = (mcnt - max_active) * ADV_IDLE; + + /* Scale to bytes, considering worst-case truesize/msgsize ratio */ + return (active_pool + idle_pool) * FLOWCTL_BLK_SZ * 4; +} + +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp) +{ + return grp->bc_snd_nxt; +} + +static bool tipc_group_is_enabled(struct tipc_member *m) +{ + return m->state != MBR_QUARANTINED && m->state != MBR_LEAVING; +} + +static bool tipc_group_is_receiver(struct tipc_member *m) +{ + return m && m->state >= MBR_JOINED; +} + +u32 tipc_group_exclude(struct tipc_group *grp) +{ + if (!grp->loopback) + return grp->portid; + return 0; +} + +int tipc_group_size(struct tipc_group *grp) +{ + return grp->member_cnt; +} + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq) +{ + struct tipc_group *grp; + u32 type = mreq->type; + + grp = kzalloc(sizeof(*grp), GFP_ATOMIC); + if (!grp) + return NULL; + tipc_nlist_init(&grp->dests, tipc_own_addr(net)); + INIT_LIST_HEAD(&grp->congested); + INIT_LIST_HEAD(&grp->active); + INIT_LIST_HEAD(&grp->pending); + INIT_LIST_HEAD(&grp->reclaiming); + grp->members = RB_ROOT; + grp->net = net; + grp->portid = portid; + grp->domain = addr_domain(net, mreq->scope); + grp->type = type; + grp->instance = mreq->instance; + grp->scope = mreq->scope; + grp->loopback = mreq->flags & TIPC_GROUP_LOOPBACK; + grp->events = mreq->flags & TIPC_GROUP_MEMBER_EVTS; + if (tipc_topsrv_kern_subscr(net, portid, type, 0, ~0, &grp->subid)) + return grp; + kfree(grp); + return NULL; +} + +void tipc_group_delete(struct net *net, struct tipc_group *grp) +{ + struct rb_root *tree = &grp->members; + struct tipc_member *m, *tmp; + struct sk_buff_head xmitq; + + __skb_queue_head_init(&xmitq); + + rbtree_postorder_for_each_entry_safe(m, tmp, tree, tree_node) { + tipc_group_proto_xmit(grp, m, GRP_LEAVE_MSG, &xmitq); + list_del(&m->list); + kfree(m); + } + tipc_node_distr_xmit(net, &xmitq); + tipc_nlist_purge(&grp->dests); + tipc_topsrv_kern_unsubscr(net, grp->subid); + kfree(grp); +} + +struct tipc_member *tipc_group_find_member(struct tipc_group *grp, + u32 node, u32 port) +{ + struct rb_node *n = grp->members.rb_node; + u64 nkey, key = (u64)node << 32 | port; + struct tipc_member *m; + + while (n) { + m = container_of(n, struct tipc_member, tree_node); + nkey = (u64)m->node << 32 | m->port; + if (key < nkey) + n = n->rb_left; + else if (key > nkey) + n = n->rb_right; + else + return m; + } + return NULL; +} + +static struct tipc_member *tipc_group_find_dest(struct tipc_group *grp, + u32 node, u32 port) +{ + struct tipc_member *m; + + m = tipc_group_find_member(grp, node, port); + if (m && tipc_group_is_enabled(m)) + return m; + return NULL; +} + +static struct tipc_member *tipc_group_find_node(struct tipc_group *grp, + u32 node) +{ + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct tipc_member, tree_node); + if (m->node == node) + return m; + } + return NULL; +} + +static void tipc_group_add_to_tree(struct tipc_group *grp, + struct tipc_member *m) +{ + u64 nkey, key = (u64)m->node << 32 | m->port; + struct rb_node **n, *parent = NULL; + struct tipc_member *tmp; + + n = &grp->members.rb_node; + while (*n) { + tmp = container_of(*n, struct tipc_member, tree_node); + parent = *n; + tmp = container_of(parent, struct tipc_member, tree_node); + nkey = (u64)tmp->node << 32 | tmp->port; + if (key < nkey) + n = &(*n)->rb_left; + else if (key > nkey) + n = &(*n)->rb_right; + else + return; + } + rb_link_node(&m->tree_node, parent, n); + rb_insert_color(&m->tree_node, &grp->members); +} + +static struct tipc_member *tipc_group_create_member(struct tipc_group *grp, + u32 node, u32 port, + int state) +{ + struct tipc_member *m; + + m = kzalloc(sizeof(*m), GFP_ATOMIC); + if (!m) + return NULL; + INIT_LIST_HEAD(&m->list); + INIT_LIST_HEAD(&m->congested); + __skb_queue_head_init(&m->deferredq); + m->group = grp; + m->node = node; + m->port = port; + m->bc_acked = grp->bc_snd_nxt - 1; + grp->member_cnt++; + tipc_group_add_to_tree(grp, m); + tipc_nlist_add(&grp->dests, m->node); + m->state = state; + return m; +} + +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port) +{ + tipc_group_create_member(grp, node, port, MBR_DISCOVERED); +} + +static void tipc_group_delete_member(struct tipc_group *grp, + struct tipc_member *m) +{ + rb_erase(&m->tree_node, &grp->members); + grp->member_cnt--; + + /* Check if we were waiting for replicast ack from this member */ + if (grp->bc_ackers && less(m->bc_acked, grp->bc_snd_nxt - 1)) + grp->bc_ackers--; + + list_del_init(&m->list); + list_del_init(&m->congested); + tipc_group_decr_active(grp, m); + + /* If last member on a node, remove node from dest list */ + if (!tipc_group_find_node(grp, m->node)) + tipc_nlist_del(&grp->dests, m->node); + + kfree(m); +} + +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp) +{ + return &grp->dests; +} + +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope) +{ + seq->type = grp->type; + seq->lower = grp->instance; + seq->upper = grp->instance; + *scope = grp->scope; +} + +void tipc_group_update_member(struct tipc_member *m, int len) +{ + struct tipc_group *grp = m->group; + struct tipc_member *_m, *tmp; + + if (!tipc_group_is_enabled(m)) + return; + + m->window -= len; + + if (m->window >= ADV_IDLE) + return; + + if (!list_empty(&m->congested)) + return; + + /* Sort member into congested members' list */ + list_for_each_entry_safe(_m, tmp, &grp->congested, congested) { + if (m->window > _m->window) + continue; + list_add_tail(&m->congested, &_m->congested); + return; + } + list_add_tail(&m->congested, &grp->congested); +} + +void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack) +{ + u16 prev = grp->bc_snd_nxt - 1; + struct tipc_member *m; + struct rb_node *n; + + for (n = rb_first(&grp->members); n; n = rb_next(n)) { + m = container_of(n, struct tipc_member, tree_node); + if (tipc_group_is_enabled(m)) { + tipc_group_update_member(m, len); + m->bc_acked = prev; + } + } + + /* Mark number of acknowledges to expect, if any */ + if (ack) + grp->bc_ackers = grp->member_cnt; + grp->bc_snd_nxt++; +} + +bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, + int len, struct tipc_member **mbr) +{ + struct sk_buff_head xmitq; + struct tipc_member *m; + int adv, state; + + m = tipc_group_find_dest(grp, dnode, dport); + *mbr = m; + if (!m) + return false; + if (m->usr_pending) + return true; + if (m->window >= len) + return false; + m->usr_pending = true; + + /* If not fully advertised, do it now to prevent mutual blocking */ + adv = m->advertised; + state = m->state; + if (state < MBR_JOINED) + return true; + if (state == MBR_JOINED && adv == ADV_IDLE) + return true; + if (state == MBR_ACTIVE && adv == ADV_ACTIVE) + return true; + if (state == MBR_PENDING && adv == ADV_IDLE) + return true; + skb_queue_head_init(&xmitq); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, &xmitq); + tipc_node_distr_xmit(grp->net, &xmitq); + return true; +} + +bool tipc_group_bc_cong(struct tipc_group *grp, int len) +{ + struct tipc_member *m = NULL; + + /* If prev bcast was replicast, reject until all receivers have acked */ + if (grp->bc_ackers) + return true; + + if (list_empty(&grp->congested)) + return false; + + m = list_first_entry(&grp->congested, struct tipc_member, congested); + if (m->window >= len) + return false; + + return tipc_group_cong(grp, m->node, m->port, len, &m); +} + +/* tipc_group_sort_msg() - sort msg into queue by bcast sequence number + */ +static void tipc_group_sort_msg(struct sk_buff *skb, struct sk_buff_head *defq) +{ + struct tipc_msg *_hdr, *hdr = buf_msg(skb); + u16 bc_seqno = msg_grp_bc_seqno(hdr); + struct sk_buff *_skb, *tmp; + int mtyp = msg_type(hdr); + + /* Bcast/mcast may be bypassed by ucast or other bcast, - sort it in */ + if (mtyp == TIPC_GRP_BCAST_MSG || mtyp == TIPC_GRP_MCAST_MSG) { + skb_queue_walk_safe(defq, _skb, tmp) { + _hdr = buf_msg(_skb); + if (!less(bc_seqno, msg_grp_bc_seqno(_hdr))) + continue; + __skb_queue_before(defq, _skb, skb); + return; + } + /* Bcast was not bypassed, - add to tail */ + } + /* Unicasts are never bypassed, - always add to tail */ + __skb_queue_tail(defq, skb); +} + +/* tipc_group_filter_msg() - determine if we should accept arriving message + */ +void tipc_group_filter_msg(struct tipc_group *grp, struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + bool ack, deliver, update, leave = false; + struct sk_buff_head *defq; + struct tipc_member *m; + struct tipc_msg *hdr; + u32 node, port; + int mtyp, blks; + + if (!skb) + return; + + hdr = buf_msg(skb); + node = msg_orignode(hdr); + port = msg_origport(hdr); + + if (!msg_in_group(hdr)) + goto drop; + + m = tipc_group_find_member(grp, node, port); + if (!tipc_group_is_receiver(m)) + goto drop; + + if (less(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) + goto drop; + + TIPC_SKB_CB(skb)->orig_member = m->instance; + defq = &m->deferredq; + tipc_group_sort_msg(skb, defq); + + while ((skb = skb_peek(defq))) { + hdr = buf_msg(skb); + mtyp = msg_type(hdr); + deliver = true; + ack = false; + update = false; + + if (more(msg_grp_bc_seqno(hdr), m->bc_rcv_nxt)) + break; + + /* Decide what to do with message */ + switch (mtyp) { + case TIPC_GRP_MCAST_MSG: + if (msg_nameinst(hdr) != grp->instance) { + update = true; + deliver = false; + } + /* Fall thru */ + case TIPC_GRP_BCAST_MSG: + m->bc_rcv_nxt++; + ack = msg_grp_bc_ack_req(hdr); + break; + case TIPC_GRP_UCAST_MSG: + break; + case TIPC_GRP_MEMBER_EVT: + if (m->state == MBR_LEAVING) + leave = true; + if (!grp->events) + deliver = false; + break; + default: + break; + } + + /* Execute decisions */ + __skb_dequeue(defq); + if (deliver) + __skb_queue_tail(inputq, skb); + else + kfree_skb(skb); + + if (ack) + tipc_group_proto_xmit(grp, m, GRP_ACK_MSG, xmitq); + + if (leave) { + tipc_group_delete_member(grp, m); + __skb_queue_purge(defq); + break; + } + if (!update) + continue; + + blks = msg_blocks(hdr); + tipc_group_update_rcv_win(grp, blks, node, port, xmitq); + } + return; +drop: + kfree_skb(skb); +} + +void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, + u32 port, struct sk_buff_head *xmitq) +{ + struct list_head *active = &grp->active; + int max_active = grp->max_active; + int reclaim_limit = max_active * 3 / 4; + int active_cnt = grp->active_cnt; + struct tipc_member *m, *rm; + + m = tipc_group_find_member(grp, node, port); + if (!m) + return; + + m->advertised -= blks; + + switch (m->state) { + case MBR_JOINED: + /* Reclaim advertised space from least active member */ + if (!list_empty(active) && active_cnt >= reclaim_limit) { + rm = list_first_entry(active, struct tipc_member, list); + rm->state = MBR_RECLAIMING; + list_move_tail(&rm->list, &grp->reclaiming); + tipc_group_proto_xmit(grp, rm, GRP_RECLAIM_MSG, xmitq); + } + /* If max active, become pending and wait for reclaimed space */ + if (active_cnt >= max_active) { + m->state = MBR_PENDING; + list_add_tail(&m->list, &grp->pending); + break; + } + /* Otherwise become active */ + m->state = MBR_ACTIVE; + list_add_tail(&m->list, &grp->active); + grp->active_cnt++; + /* Fall through */ + case MBR_ACTIVE: + if (!list_is_last(&m->list, &grp->active)) + list_move_tail(&m->list, &grp->active); + if (m->advertised > (ADV_ACTIVE * 3 / 4)) + break; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + break; + case MBR_REMITTED: + if (m->advertised > ADV_IDLE) + break; + m->state = MBR_JOINED; + if (m->advertised < ADV_IDLE) { + pr_warn_ratelimited("Rcv unexpected msg after REMIT\n"); + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + } + break; + case MBR_RECLAIMING: + case MBR_DISCOVERED: + case MBR_JOINING: + case MBR_LEAVING: + default: + break; + } +} + +static void tipc_group_proto_xmit(struct tipc_group *grp, struct tipc_member *m, + int mtyp, struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr; + struct sk_buff *skb; + int adv = 0; + + skb = tipc_msg_create(GROUP_PROTOCOL, mtyp, INT_H_SIZE, 0, + m->node, tipc_own_addr(grp->net), + m->port, grp->portid, 0); + if (!skb) + return; + + if (m->state == MBR_ACTIVE) + adv = ADV_ACTIVE - m->advertised; + else if (m->state == MBR_JOINED || m->state == MBR_PENDING) + adv = ADV_IDLE - m->advertised; + + hdr = buf_msg(skb); + + if (mtyp == GRP_JOIN_MSG) { + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + msg_set_adv_win(hdr, adv); + m->advertised += adv; + } else if (mtyp == GRP_LEAVE_MSG) { + msg_set_grp_bc_syncpt(hdr, grp->bc_snd_nxt); + } else if (mtyp == GRP_ADV_MSG) { + msg_set_adv_win(hdr, adv); + m->advertised += adv; + } else if (mtyp == GRP_ACK_MSG) { + msg_set_grp_bc_acked(hdr, m->bc_rcv_nxt); + } else if (mtyp == GRP_REMIT_MSG) { + msg_set_grp_remitted(hdr, m->window); + } + __skb_queue_tail(xmitq, skb); +} + +void tipc_group_proto_rcv(struct tipc_group *grp, bool *usr_wakeup, + struct tipc_msg *hdr, struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + u32 node = msg_orignode(hdr); + u32 port = msg_origport(hdr); + struct tipc_member *m, *pm; + struct tipc_msg *ehdr; + u16 remitted, in_flight; + + if (!grp) + return; + + m = tipc_group_find_member(grp, node, port); + + switch (msg_type(hdr)) { + case GRP_JOIN_MSG: + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_QUARANTINED); + if (!m) + return; + m->bc_syncpt = msg_grp_bc_syncpt(hdr); + m->bc_rcv_nxt = m->bc_syncpt; + m->window += msg_adv_win(hdr); + + /* Wait until PUBLISH event is received */ + if (m->state == MBR_DISCOVERED) { + m->state = MBR_JOINING; + } else if (m->state == MBR_PUBLISHED) { + m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + ehdr = buf_msg(m->event_msg); + msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); + __skb_queue_tail(inputq, m->event_msg); + } + if (m->window < ADV_IDLE) + tipc_group_update_member(m, 0); + else + list_del_init(&m->congested); + return; + case GRP_LEAVE_MSG: + if (!m) + return; + m->bc_syncpt = msg_grp_bc_syncpt(hdr); + + /* Wait until WITHDRAW event is received */ + if (m->state != MBR_LEAVING) { + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + return; + } + /* Otherwise deliver already received WITHDRAW event */ + ehdr = buf_msg(m->event_msg); + msg_set_grp_bc_seqno(ehdr, m->bc_syncpt); + __skb_queue_tail(inputq, m->event_msg); + *usr_wakeup = true; + list_del_init(&m->congested); + return; + case GRP_ADV_MSG: + if (!m) + return; + m->window += msg_adv_win(hdr); + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + list_del_init(&m->congested); + return; + case GRP_ACK_MSG: + if (!m) + return; + m->bc_acked = msg_grp_bc_acked(hdr); + if (--grp->bc_ackers) + break; + *usr_wakeup = true; + m->usr_pending = false; + return; + case GRP_RECLAIM_MSG: + if (!m) + return; + *usr_wakeup = m->usr_pending; + m->usr_pending = false; + tipc_group_proto_xmit(grp, m, GRP_REMIT_MSG, xmitq); + m->window = ADV_IDLE; + return; + case GRP_REMIT_MSG: + if (!m || m->state != MBR_RECLAIMING) + return; + + list_del_init(&m->list); + grp->active_cnt--; + remitted = msg_grp_remitted(hdr); + + /* Messages preceding the REMIT still in receive queue */ + if (m->advertised > remitted) { + m->state = MBR_REMITTED; + in_flight = m->advertised - remitted; + } + /* All messages preceding the REMIT have been read */ + if (m->advertised <= remitted) { + m->state = MBR_JOINED; + in_flight = 0; + } + /* ..and the REMIT overtaken by more messages => re-advertise */ + if (m->advertised < remitted) + tipc_group_proto_xmit(grp, m, GRP_ADV_MSG, xmitq); + + m->advertised = ADV_IDLE + in_flight; + + /* Set oldest pending member to active and advertise */ + if (list_empty(&grp->pending)) + return; + pm = list_first_entry(&grp->pending, struct tipc_member, list); + pm->state = MBR_ACTIVE; + list_move_tail(&pm->list, &grp->active); + grp->active_cnt++; + if (pm->advertised <= (ADV_ACTIVE * 3 / 4)) + tipc_group_proto_xmit(grp, pm, GRP_ADV_MSG, xmitq); + return; + default: + pr_warn("Received unknown GROUP_PROTO message\n"); + } +} + +/* tipc_group_member_evt() - receive and handle a member up/down event + */ +void tipc_group_member_evt(struct tipc_group *grp, + bool *usr_wakeup, + int *sk_rcvbuf, + struct sk_buff *skb, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_event *evt = (void *)msg_data(hdr); + u32 instance = evt->found_lower; + u32 node = evt->port.node; + u32 port = evt->port.ref; + int event = evt->event; + struct tipc_member *m; + struct net *net; + bool node_up; + u32 self; + + if (!grp) + goto drop; + + net = grp->net; + self = tipc_own_addr(net); + if (!grp->loopback && node == self && port == grp->portid) + goto drop; + + /* Convert message before delivery to user */ + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_user(hdr, TIPC_CRITICAL_IMPORTANCE); + msg_set_type(hdr, TIPC_GRP_MEMBER_EVT); + msg_set_origport(hdr, port); + msg_set_orignode(hdr, node); + msg_set_nametype(hdr, grp->type); + msg_set_grp_evt(hdr, event); + + m = tipc_group_find_member(grp, node, port); + + if (event == TIPC_PUBLISHED) { + if (!m) + m = tipc_group_create_member(grp, node, port, + MBR_DISCOVERED); + if (!m) + goto drop; + + /* Hold back event if JOIN message not yet received */ + if (m->state == MBR_DISCOVERED) { + m->event_msg = skb; + m->state = MBR_PUBLISHED; + } else { + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + __skb_queue_tail(inputq, skb); + m->state = MBR_JOINED; + *usr_wakeup = true; + m->usr_pending = false; + } + m->instance = instance; + TIPC_SKB_CB(skb)->orig_member = m->instance; + tipc_group_proto_xmit(grp, m, GRP_JOIN_MSG, xmitq); + if (m->window < ADV_IDLE) + tipc_group_update_member(m, 0); + else + list_del_init(&m->congested); + } else if (event == TIPC_WITHDRAWN) { + if (!m) + goto drop; + + TIPC_SKB_CB(skb)->orig_member = m->instance; + + *usr_wakeup = true; + m->usr_pending = false; + node_up = tipc_node_is_up(net, node); + + /* Hold back event if more messages might be expected */ + if (m->state != MBR_LEAVING && node_up) { + m->event_msg = skb; + tipc_group_decr_active(grp, m); + m->state = MBR_LEAVING; + } else { + if (node_up) + msg_set_grp_bc_seqno(hdr, m->bc_syncpt); + else + msg_set_grp_bc_seqno(hdr, m->bc_rcv_nxt); + __skb_queue_tail(inputq, skb); + } + list_del_init(&m->congested); + } + *sk_rcvbuf = tipc_group_rcvbuf_limit(grp); + return; +drop: + kfree_skb(skb); +} diff --git a/net/tipc/group.h b/net/tipc/group.h new file mode 100644 index 000000000000..d525e1cd7de5 --- /dev/null +++ b/net/tipc/group.h @@ -0,0 +1,73 @@ +/* + * net/tipc/group.h: Include file for TIPC group unicast/multicast functions + * + * Copyright (c) 2017, Ericsson AB + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _TIPC_GROUP_H +#define _TIPC_GROUP_H + +#include "core.h" + +struct tipc_group; +struct tipc_member; +struct tipc_msg; + +struct tipc_group *tipc_group_create(struct net *net, u32 portid, + struct tipc_group_req *mreq); +void tipc_group_delete(struct net *net, struct tipc_group *grp); +void tipc_group_add_member(struct tipc_group *grp, u32 node, u32 port); +struct tipc_nlist *tipc_group_dests(struct tipc_group *grp); +void tipc_group_self(struct tipc_group *grp, struct tipc_name_seq *seq, + int *scope); +u32 tipc_group_exclude(struct tipc_group *grp); +void tipc_group_filter_msg(struct tipc_group *grp, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_member_evt(struct tipc_group *grp, bool *wakeup, + int *sk_rcvbuf, struct sk_buff *skb, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_proto_rcv(struct tipc_group *grp, bool *wakeup, + struct tipc_msg *hdr, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq); +void tipc_group_update_bc_members(struct tipc_group *grp, int len, bool ack); +bool tipc_group_cong(struct tipc_group *grp, u32 dnode, u32 dport, + int len, struct tipc_member **m); +bool tipc_group_bc_cong(struct tipc_group *grp, int len); +void tipc_group_update_rcv_win(struct tipc_group *grp, int blks, u32 node, + u32 port, struct sk_buff_head *xmitq); +u16 tipc_group_bc_snd_nxt(struct tipc_group *grp); +void tipc_group_update_member(struct tipc_member *m, int len); +int tipc_group_size(struct tipc_group *grp); +#endif diff --git a/net/tipc/link.c b/net/tipc/link.c index ac0144f532aa..723dd6998684 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1039,6 +1039,7 @@ int tipc_link_retrans(struct tipc_link *l, struct tipc_link *nacker, static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { + struct sk_buff_head *mc_inputq = l->bc_rcvlink->inputq; struct tipc_msg *hdr = buf_msg(skb); switch (msg_user(hdr)) { @@ -1046,12 +1047,14 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: - if (unlikely(msg_type(hdr) == TIPC_MCAST_MSG)) { - skb_queue_tail(l->bc_rcvlink->inputq, skb); + if (unlikely(msg_in_group(hdr) || msg_mcast(hdr))) { + skb_queue_tail(mc_inputq, skb); return true; } case CONN_MANAGER: - skb_queue_tail(inputq, skb); + return true; + case GROUP_PROTOCOL: + skb_queue_tail(mc_inputq, skb); return true; case NAME_DISTRIBUTOR: l->bc_rcvlink->state = LINK_ESTABLISHED; diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 121e59a1d0e7..1649d456e22d 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -568,6 +568,14 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; + + if (!skb_cloned(skb)) + return true; + + /* Unclone buffer in case it was bundled */ + if (pskb_expand_head(skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC)) + return false; + return true; } @@ -658,3 +666,10 @@ void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, } kfree_skb(skb); } + +void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + if (tipc_msg_reverse(tipc_own_addr(net), &skb, err)) + __skb_queue_tail(xmitq, skb); +} diff --git a/net/tipc/msg.h b/net/tipc/msg.h index c843fd2bc48d..cedf811317fb 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -1,7 +1,7 @@ /* * net/tipc/msg.h: Include file for TIPC message header routines * - * Copyright (c) 2000-2007, 2014-2015 Ericsson AB + * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * @@ -61,10 +61,14 @@ struct plist; /* * Payload message types */ -#define TIPC_CONN_MSG 0 -#define TIPC_MCAST_MSG 1 -#define TIPC_NAMED_MSG 2 -#define TIPC_DIRECT_MSG 3 +#define TIPC_CONN_MSG 0 +#define TIPC_MCAST_MSG 1 +#define TIPC_NAMED_MSG 2 +#define TIPC_DIRECT_MSG 3 +#define TIPC_GRP_MEMBER_EVT 4 +#define TIPC_GRP_BCAST_MSG 5 +#define TIPC_GRP_MCAST_MSG 6 +#define TIPC_GRP_UCAST_MSG 7 /* * Internal message users @@ -73,11 +77,13 @@ struct plist; #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 +#define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define SOCK_WAKEUP 14 /* pseudo user */ +#define TOP_SRV 15 /* pseudo user */ /* * Message header sizes @@ -86,6 +92,7 @@ struct plist; #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ +#define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ @@ -96,6 +103,7 @@ struct plist; struct tipc_skb_cb { u32 bytes_read; + u32 orig_member; struct sk_buff *tail; bool validated; u16 chain_imp; @@ -188,6 +196,11 @@ static inline u32 msg_size(struct tipc_msg *m) return msg_bits(m, 0, 0, 0x1ffff); } +static inline u32 msg_blocks(struct tipc_msg *m) +{ + return (msg_size(m) / 1024) + 1; +} + static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); @@ -251,6 +264,18 @@ static inline void msg_set_type(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 29, 0x7, n); } +static inline int msg_in_group(struct tipc_msg *m) +{ + int mtyp = msg_type(m); + + return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; +} + +static inline bool msg_is_grp_evt(struct tipc_msg *m) +{ + return msg_type(m) == TIPC_GRP_MEMBER_EVT; +} + static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; @@ -258,7 +283,10 @@ static inline u32 msg_named(struct tipc_msg *m) static inline u32 msg_mcast(struct tipc_msg *m) { - return msg_type(m) == TIPC_MCAST_MSG; + int mtyp = msg_type(m); + + return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || + (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) @@ -514,6 +542,16 @@ static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) #define DSC_RESP_MSG 1 /* + * Group protocol message types + */ +#define GRP_JOIN_MSG 0 +#define GRP_LEAVE_MSG 1 +#define GRP_ADV_MSG 2 +#define GRP_ACK_MSG 3 +#define GRP_RECLAIM_MSG 4 +#define GRP_REMIT_MSG 5 + +/* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) @@ -764,12 +802,12 @@ static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 16, 0xffff, n); } -static inline u32 msg_adv_win(struct tipc_msg *m) +static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } -static inline void msg_set_adv_win(struct tipc_msg *m, u32 n) +static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } @@ -794,6 +832,68 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +static inline u16 msg_grp_bc_acked(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +static inline u16 msg_grp_remitted(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + +/* Word 10 + */ +static inline u16 msg_grp_evt(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x3); +} + +static inline void msg_set_grp_evt(struct tipc_msg *m, int n) +{ + msg_set_bits(m, 10, 0, 0x3, n); +} + +static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) +{ + return msg_bits(m, 10, 0, 0x1); +} + +static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) +{ + msg_set_bits(m, 10, 0, 0x1, n); +} + +static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) +{ + return msg_bits(m, 10, 16, 0xffff); +} + +static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 10, 16, 0xffff, n); +} + static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) @@ -818,6 +918,8 @@ static inline bool msg_is_reset(struct tipc_msg *hdr) struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); +void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, + struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index bd0aac87b41a..2856e19e036e 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -43,6 +43,7 @@ #include "bcast.h" #include "addr.h" #include "node.h" +#include "group.h" #include <net/genetlink.h> #define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */ @@ -596,18 +597,47 @@ not_found: return ref; } -/** - * tipc_nametbl_mc_translate - find multicast destinations - * - * Creates list of all local ports that overlap the given multicast address; - * also determines if any off-node ports overlap. - * - * Note: Publications with a scope narrower than 'limit' are ignored. - * (i.e. local node-scope publications mustn't receive messages arriving - * from another node, even if the multcast link brought it here) - * - * Returns non-zero if any off-node ports overlap - */ +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, + struct list_head *dsts, int *dstcnt, u32 exclude, + bool all) +{ + u32 self = tipc_own_addr(net); + struct publication *publ; + struct name_info *info; + struct name_seq *seq; + struct sub_seq *sseq; + + if (!tipc_in_scope(domain, self)) + return false; + + *dstcnt = 0; + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (unlikely(!seq)) + goto exit; + spin_lock_bh(&seq->lock); + sseq = nameseq_find_subseq(seq, instance); + if (likely(sseq)) { + info = sseq->info; + list_for_each_entry(publ, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, publ->node)) + continue; + if (publ->ref == exclude && publ->node == self) + continue; + tipc_dest_push(dsts, publ->node, publ->ref); + (*dstcnt)++; + if (all) + continue; + list_move_tail(&publ->zone_list, &info->zone_list); + break; + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); + return !list_empty(dsts); +} + int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports) { @@ -634,7 +664,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, info = sseq->info; list_for_each_entry(publ, &info->node_list, node_list) { if (publ->scope <= limit) - u32_push(dports, publ->ref); + tipc_dest_push(dports, 0, publ->ref); } if (info->cluster_list_size != info->node_list_size) @@ -679,6 +709,37 @@ exit: rcu_read_unlock(); } +/* tipc_nametbl_build_group - build list of communication group members + */ +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain) +{ + struct sub_seq *sseq, *stop; + struct name_info *info; + struct publication *p; + struct name_seq *seq; + + rcu_read_lock(); + seq = nametbl_find_seq(net, type); + if (!seq) + goto exit; + + spin_lock_bh(&seq->lock); + sseq = seq->sseqs; + stop = seq->sseqs + seq->first_free; + for (; sseq != stop; sseq++) { + info = sseq->info; + list_for_each_entry(p, &info->zone_list, zone_list) { + if (!tipc_in_scope(domain, p->node)) + continue; + tipc_group_add_member(grp, p->node, p->ref); + } + } + spin_unlock_bh(&seq->lock); +exit: + rcu_read_unlock(); +} + /* * tipc_nametbl_publish - add name publication to network name tables */ @@ -1057,78 +1118,79 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -bool u32_find(struct list_head *l, u32 value) +struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port) { - struct u32_item *item; + u64 value = (u64)node << 32 | port; + struct tipc_dest *dst; - list_for_each_entry(item, l, list) { - if (item->value == value) - return true; + list_for_each_entry(dst, l, list) { + if (dst->value != value) + continue; + return dst; } - return false; + return NULL; } -bool u32_push(struct list_head *l, u32 value) +bool tipc_dest_push(struct list_head *l, u32 node, u32 port) { - struct u32_item *item; + u64 value = (u64)node << 32 | port; + struct tipc_dest *dst; - list_for_each_entry(item, l, list) { - if (item->value == value) - return false; - } - item = kmalloc(sizeof(*item), GFP_ATOMIC); - if (unlikely(!item)) + if (tipc_dest_find(l, node, port)) return false; - item->value = value; - list_add(&item->list, l); + dst = kmalloc(sizeof(*dst), GFP_ATOMIC); + if (unlikely(!dst)) + return false; + dst->value = value; + list_add(&dst->list, l); return true; } -u32 u32_pop(struct list_head *l) +bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port) { - struct u32_item *item; - u32 value = 0; + struct tipc_dest *dst; if (list_empty(l)) - return 0; - item = list_first_entry(l, typeof(*item), list); - value = item->value; - list_del(&item->list); - kfree(item); - return value; + return false; + dst = list_first_entry(l, typeof(*dst), list); + if (port) + *port = dst->port; + if (node) + *node = dst->node; + list_del(&dst->list); + kfree(dst); + return true; } -bool u32_del(struct list_head *l, u32 value) +bool tipc_dest_del(struct list_head *l, u32 node, u32 port) { - struct u32_item *item, *tmp; + struct tipc_dest *dst; - list_for_each_entry_safe(item, tmp, l, list) { - if (item->value != value) - continue; - list_del(&item->list); - kfree(item); - return true; - } - return false; + dst = tipc_dest_find(l, node, port); + if (!dst) + return false; + list_del(&dst->list); + kfree(dst); + return true; } -void u32_list_purge(struct list_head *l) +void tipc_dest_list_purge(struct list_head *l) { - struct u32_item *item, *tmp; + struct tipc_dest *dst, *tmp; - list_for_each_entry_safe(item, tmp, l, list) { - list_del(&item->list); - kfree(item); + list_for_each_entry_safe(dst, tmp, l, list) { + list_del(&dst->list); + kfree(dst); } } -int u32_list_len(struct list_head *l) +int tipc_dest_list_len(struct list_head *l) { - struct u32_item *item; + struct tipc_dest *dst; int i = 0; - list_for_each_entry(item, l, list) { + list_for_each_entry(dst, l, list) { i++; } return i; diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index 6ebdeb1d84a5..71926e429446 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -40,6 +40,7 @@ struct tipc_subscription; struct tipc_plist; struct tipc_nlist; +struct tipc_group; /* * TIPC name types reserved for internal TIPC use (both current and planned) @@ -101,9 +102,14 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb); u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node); int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper, u32 limit, struct list_head *dports); +void tipc_nametbl_build_group(struct net *net, struct tipc_group *grp, + u32 type, u32 domain); void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, u32 upper, u32 domain, struct tipc_nlist *nodes); +bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 domain, + struct list_head *dsts, int *dstcnt, u32 exclude, + bool all); struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, u32 upper, u32 scope, u32 port_ref, u32 key); @@ -120,16 +126,22 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s); int tipc_nametbl_init(struct net *net); void tipc_nametbl_stop(struct net *net); -struct u32_item { +struct tipc_dest { struct list_head list; - u32 value; + union { + struct { + u32 port; + u32 node; + }; + u64 value; + }; }; -bool u32_push(struct list_head *l, u32 value); -u32 u32_pop(struct list_head *l); -bool u32_find(struct list_head *l, u32 value); -bool u32_del(struct list_head *l, u32 value); -void u32_list_purge(struct list_head *l); -int u32_list_len(struct list_head *l); +struct tipc_dest *tipc_dest_find(struct list_head *l, u32 node, u32 port); +bool tipc_dest_push(struct list_head *l, u32 node, u32 port); +bool tipc_dest_pop(struct list_head *l, u32 *node, u32 *port); +bool tipc_dest_del(struct list_head *l, u32 node, u32 port); +void tipc_dest_list_purge(struct list_head *l); +int tipc_dest_list_len(struct list_head *l); #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 198dbc7adbe1..89f8ac73bf65 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -157,7 +157,7 @@ static void tipc_node_timeout(unsigned long data); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static void tipc_node_put(struct tipc_node *node); -static bool tipc_node_is_up(struct tipc_node *n); +static bool node_is_up(struct tipc_node *n); struct tipc_sock_conn { u32 port; @@ -657,7 +657,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, *slot1 = i; } - if (!tipc_node_is_up(n)) { + if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); @@ -717,11 +717,27 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) tipc_sk_rcv(n->net, &le->inputq); } -static bool tipc_node_is_up(struct tipc_node *n) +static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } +bool tipc_node_is_up(struct net *net, u32 addr) +{ + struct tipc_node *n; + bool retval = false; + + if (in_own_node(net, addr)) + return true; + + n = tipc_node_find(net, addr); + if (!n) + return false; + retval = node_is_up(n); + tipc_node_put(n); + return retval; +} + void tipc_node_check_dest(struct net *net, u32 onode, struct tipc_bearer *b, u16 capabilities, u32 signature, @@ -1149,7 +1165,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; - if (tipc_node_is_up(node)) + if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; @@ -1238,6 +1254,22 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } +/* tipc_node_distr_xmit(): send single buffer msgs to individual destinations + * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected + */ +int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) +{ + struct sk_buff *skb; + u32 selector, dnode; + + while ((skb = __skb_dequeue(xmitq))) { + selector = msg_origport(buf_msg(skb)); + dnode = msg_destnode(buf_msg(skb)); + tipc_node_xmit_skb(net, skb, dnode, selector); + } + return 0; +} + void tipc_node_broadcast(struct net *net, struct sk_buff *skb) { struct sk_buff *txskb; @@ -1249,7 +1281,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb) dst = n->addr; if (in_own_node(net, dst)) continue; - if (!tipc_node_is_up(n)) + if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) diff --git a/net/tipc/node.h b/net/tipc/node.h index 898c22916984..acd58d23a70e 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -48,7 +48,8 @@ enum { TIPC_BCAST_SYNCH = (1 << 1), TIPC_BCAST_STATE_NACK = (1 << 2), TIPC_BLOCK_FLOWCTL = (1 << 3), - TIPC_BCAST_RCAST = (1 << 4) + TIPC_BCAST_RCAST = (1 << 4), + TIPC_MCAST_GROUPS = (1 << 5) }; #define TIPC_NODE_CAPABILITIES (TIPC_BCAST_SYNCH | \ @@ -68,6 +69,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector); +int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *list); int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr); @@ -76,6 +78,7 @@ void tipc_node_broadcast(struct net *net, struct sk_buff *skb); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel); +bool tipc_node_is_up(struct net *net, u32 addr); u16 tipc_node_get_capabilities(struct net *net, u32 addr); int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb); diff --git a/net/tipc/server.c b/net/tipc/server.c index 3cd6402e812c..713077536d0c 100644 --- a/net/tipc/server.c +++ b/net/tipc/server.c @@ -36,6 +36,8 @@ #include "server.h" #include "core.h" #include "socket.h" +#include "addr.h" +#include "msg.h" #include <net/sock.h> #include <linux/module.h> @@ -105,13 +107,11 @@ static void tipc_conn_kref_release(struct kref *kref) kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr)); sock_release(sock); con->sock = NULL; - - spin_lock_bh(&s->idr_lock); - idr_remove(&s->conn_idr, con->conid); - s->idr_in_use--; - spin_unlock_bh(&s->idr_lock); } - + spin_lock_bh(&s->idr_lock); + idr_remove(&s->conn_idr, con->conid); + s->idr_in_use--; + spin_unlock_bh(&s->idr_lock); tipc_clean_outqueues(con); kfree(con); } @@ -197,7 +197,8 @@ static void tipc_close_conn(struct tipc_conn *con) struct tipc_server *s = con->server; if (test_and_clear_bit(CF_CONNECTED, &con->flags)) { - tipc_unregister_callbacks(con); + if (con->sock) + tipc_unregister_callbacks(con); if (con->conid) s->tipc_conn_release(con->conid, con->usr_data); @@ -207,8 +208,8 @@ static void tipc_close_conn(struct tipc_conn *con) * are harmless for us here as we have already deleted this * connection from server connection list. */ - kernel_sock_shutdown(con->sock, SHUT_RDWR); - + if (con->sock) + kernel_sock_shutdown(con->sock, SHUT_RDWR); conn_put(con); } } @@ -487,38 +488,104 @@ void tipc_conn_terminate(struct tipc_server *s, int conid) } } +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, + u32 lower, u32 upper, int *conid) +{ + struct tipc_subscriber *scbr; + struct tipc_subscr sub; + struct tipc_server *s; + struct tipc_conn *con; + + sub.seq.type = type; + sub.seq.lower = lower; + sub.seq.upper = upper; + sub.timeout = TIPC_WAIT_FOREVER; + sub.filter = TIPC_SUB_PORTS; + *(u32 *)&sub.usr_handle = port; + + con = tipc_alloc_conn(tipc_topsrv(net)); + if (!con) + return false; + + *conid = con->conid; + s = con->server; + scbr = s->tipc_conn_new(*conid); + if (!scbr) { + tipc_close_conn(con); + return false; + } + + con->usr_data = scbr; + con->sock = NULL; + s->tipc_conn_recvmsg(net, *conid, NULL, scbr, &sub, sizeof(sub)); + return true; +} + +void tipc_topsrv_kern_unsubscr(struct net *net, int conid) +{ + struct tipc_conn *con; + + con = tipc_conn_lookup(tipc_topsrv(net), conid); + if (!con) + return; + tipc_close_conn(con); + conn_put(con); +} + +static void tipc_send_kern_top_evt(struct net *net, struct tipc_event *evt) +{ + u32 port = *(u32 *)&evt->s.usr_handle; + u32 self = tipc_own_addr(net); + struct sk_buff_head evtq; + struct sk_buff *skb; + + skb = tipc_msg_create(TOP_SRV, 0, INT_H_SIZE, sizeof(*evt), + self, self, port, port, 0); + if (!skb) + return; + msg_set_dest_droppable(buf_msg(skb), true); + memcpy(msg_data(buf_msg(skb)), evt, sizeof(*evt)); + skb_queue_head_init(&evtq); + __skb_queue_tail(&evtq, skb); + tipc_sk_rcv(net, &evtq); +} + static void tipc_send_to_sock(struct tipc_conn *con) { - int count = 0; struct tipc_server *s = con->server; struct outqueue_entry *e; + struct tipc_event *evt; struct msghdr msg; + int count = 0; int ret; spin_lock_bh(&con->outqueue_lock); while (test_bit(CF_CONNECTED, &con->flags)) { - e = list_entry(con->outqueue.next, struct outqueue_entry, - list); + e = list_entry(con->outqueue.next, struct outqueue_entry, list); if ((struct list_head *) e == &con->outqueue) break; - spin_unlock_bh(&con->outqueue_lock); - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT; + spin_unlock_bh(&con->outqueue_lock); - if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { - msg.msg_name = &e->dest; - msg.msg_namelen = sizeof(struct sockaddr_tipc); - } - ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, - e->iov.iov_len); - if (ret == -EWOULDBLOCK || ret == 0) { - cond_resched(); - goto out; - } else if (ret < 0) { - goto send_err; + if (con->sock) { + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT; + if (s->type == SOCK_DGRAM || s->type == SOCK_RDM) { + msg.msg_name = &e->dest; + msg.msg_namelen = sizeof(struct sockaddr_tipc); + } + ret = kernel_sendmsg(con->sock, &msg, &e->iov, 1, + e->iov.iov_len); + if (ret == -EWOULDBLOCK || ret == 0) { + cond_resched(); + goto out; + } else if (ret < 0) { + goto send_err; + } + } else { + evt = e->iov.iov_base; + tipc_send_kern_top_evt(s->net, evt); } - /* Don't starve users filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { cond_resched(); diff --git a/net/tipc/server.h b/net/tipc/server.h index 34f8055afa3b..2113c9192633 100644 --- a/net/tipc/server.h +++ b/net/tipc/server.h @@ -83,13 +83,16 @@ struct tipc_server { int tipc_conn_sendmsg(struct tipc_server *s, int conid, struct sockaddr_tipc *addr, void *data, size_t len); +bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, + u32 lower, u32 upper, int *conid); +void tipc_topsrv_kern_unsubscr(struct net *net, int conid); + /** * tipc_conn_terminate - terminate connection with server * * Note: Must call it in process context since it might sleep */ void tipc_conn_terminate(struct tipc_server *s, int conid); - int tipc_server_start(struct tipc_server *s); void tipc_server_stop(struct tipc_server *s); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d50edd6e0019..2bbab4fe2f53 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1,7 +1,7 @@ /* * net/tipc/socket.c: TIPC socket API * - * Copyright (c) 2001-2007, 2012-2016, Ericsson AB + * Copyright (c) 2001-2007, 2012-2017, Ericsson AB * Copyright (c) 2004-2008, 2010-2013, Wind River Systems * All rights reserved. * @@ -45,6 +45,7 @@ #include "socket.h" #include "bcast.h" #include "netlink.h" +#include "group.h" #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTERVAL msecs_to_jiffies(3600000) /* [ms] => 1 h */ @@ -61,6 +62,11 @@ enum { TIPC_CONNECTING = TCP_SYN_SENT, }; +struct sockaddr_pair { + struct sockaddr_tipc sock; + struct sockaddr_tipc member; +}; + /** * struct tipc_sock - TIPC socket structure * @sk: socket - interacts with 'port' and with user via the socket API @@ -78,7 +84,7 @@ enum { * @conn_timeout: the time we can wait for an unresponded setup request * @dupl_rcvcnt: number of bytes counted twice, in both backlog and rcv queue * @cong_link_cnt: number of congested links - * @sent_unacked: # messages sent by socket, and not yet acked by peer + * @snt_unacked: # messages sent by socket, and not yet acked by peer * @rcv_unacked: # messages read by user, but not yet acked back to peer * @peer: 'connected' peer for dgram/rdm * @node: hash table node @@ -109,9 +115,10 @@ struct tipc_sock { struct rhash_head node; struct tipc_mc_method mc_method; struct rcu_head rcu; + struct tipc_group *group; }; -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb); +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); static void tipc_data_ready(struct sock *sk); static void tipc_write_space(struct sock *sk); static void tipc_sock_destruct(struct sock *sk); @@ -123,6 +130,7 @@ static int tipc_sk_publish(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); static int tipc_sk_withdraw(struct tipc_sock *tsk, uint scope, struct tipc_name_seq const *seq); +static int tipc_sk_leave(struct tipc_sock *tsk); static struct tipc_sock *tipc_sk_lookup(struct net *net, u32 portid); static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); @@ -193,6 +201,11 @@ static bool tsk_conn_cong(struct tipc_sock *tsk) return tsk->snt_unacked > tsk->snd_win; } +static u16 tsk_blocks(int len) +{ + return ((len / FLOWCTL_BLK_SZ) + 1); +} + /* tsk_blocks(): translate a buffer size in bytes to number of * advertisable blocks, taking into account the ratio truesize(len)/len * We can trust that this ratio is always < 4 for len >= FLOWCTL_BLK_SZ @@ -453,7 +466,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, msg_set_origport(msg, tsk->portid); setup_timer(&sk->sk_timer, tipc_sk_timeout, (unsigned long)tsk); sk->sk_shutdown = 0; - sk->sk_backlog_rcv = tipc_backlog_rcv; + sk->sk_backlog_rcv = tipc_sk_backlog_rcv; sk->sk_rcvbuf = sysctl_tipc_rmem[1]; sk->sk_data_ready = tipc_data_ready; sk->sk_write_space = tipc_write_space; @@ -559,13 +572,14 @@ static int tipc_release(struct socket *sock) __tipc_shutdown(sock, TIPC_ERR_NO_PORT); sk->sk_shutdown = SHUTDOWN_MASK; + tipc_sk_leave(tsk); tipc_sk_withdraw(tsk, 0, NULL); sk_stop_timer(sk, &sk->sk_timer); tipc_sk_remove(tsk); /* Reject any messages that accumulated in backlog queue */ release_sock(sk); - u32_list_purge(&tsk->cong_links); + tipc_dest_list_purge(&tsk->cong_links); tsk->cong_link_cnt = 0; call_rcu(&tsk->rcu, tipc_sk_callback); sock->sk = NULL; @@ -601,7 +615,10 @@ static int tipc_bind(struct socket *sock, struct sockaddr *uaddr, res = tipc_sk_withdraw(tsk, 0, NULL); goto exit; } - + if (tsk->group) { + res = -EACCES; + goto exit; + } if (uaddr_len < sizeof(struct sockaddr_tipc)) { res = -EINVAL; goto exit; @@ -697,39 +714,43 @@ static unsigned int tipc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tipc_sock *tsk = tipc_sk(sk); - u32 mask = 0; + struct tipc_group *grp = tsk->group; + u32 revents = 0; sock_poll_wait(file, sk_sleep(sk), wait); if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP | POLLIN | POLLRDNORM; + revents |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) - mask |= POLLHUP; + revents |= POLLHUP; switch (sk->sk_state) { case TIPC_ESTABLISHED: if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk)) - mask |= POLLOUT; + revents |= POLLOUT; /* fall thru' */ case TIPC_LISTEN: case TIPC_CONNECTING: - if (!skb_queue_empty(&sk->sk_receive_queue)) - mask |= (POLLIN | POLLRDNORM); + if (skb) + revents |= POLLIN | POLLRDNORM; break; case TIPC_OPEN: - if (!tsk->cong_link_cnt) - mask |= POLLOUT; - if (tipc_sk_type_connectionless(sk) && - (!skb_queue_empty(&sk->sk_receive_queue))) - mask |= (POLLIN | POLLRDNORM); + if (!grp || tipc_group_size(grp)) + if (!tsk->cong_link_cnt) + revents |= POLLOUT; + if (!tipc_sk_type_connectionless(sk)) + break; + if (!skb) + break; + revents |= POLLIN | POLLRDNORM; break; case TIPC_DISCONNECTING: - mask = (POLLIN | POLLRDNORM | POLLHUP); + revents = POLLIN | POLLRDNORM | POLLHUP; break; } - - return mask; + return revents; } /** @@ -757,6 +778,9 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, struct tipc_nlist dsts; int rc; + if (tsk->group) + return -EACCES; + /* Block or return if any destination link is congested */ rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt); if (unlikely(rc)) @@ -794,6 +818,296 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, } /** + * tipc_send_group_msg - send a message to a member in the group + * @net: network namespace + * @m: message to send + * @mb: group member + * @dnode: destination node + * @dport: destination port + * @dlen: total length of message data + */ +static int tipc_send_group_msg(struct net *net, struct tipc_sock *tsk, + struct msghdr *m, struct tipc_member *mb, + u32 dnode, u32 dport, int dlen) +{ + u16 bc_snd_nxt = tipc_group_bc_snd_nxt(tsk->group); + struct tipc_mc_method *method = &tsk->mc_method; + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_msg *hdr = &tsk->phdr; + struct sk_buff_head pkts; + int mtu, rc; + + /* Complete message header */ + msg_set_type(hdr, TIPC_GRP_UCAST_MSG); + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_destport(hdr, dport); + msg_set_destnode(hdr, dnode); + msg_set_grp_bc_seqno(hdr, bc_snd_nxt); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + mtu = tipc_node_get_mtu(net, dnode, tsk->portid); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); + if (unlikely(rc == -ELINKCONG)) { + tipc_dest_push(&tsk->cong_links, dnode, 0); + tsk->cong_link_cnt++; + } + + /* Update send window */ + tipc_group_update_member(mb, blks); + + /* A broadcast sent within next EXPIRE period must follow same path */ + method->rcast = true; + method->mandatory = true; + return dlen; +} + +/** + * tipc_send_group_unicast - send message to a member in the group + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_unicast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + struct tipc_member *mb = NULL; + u32 node, port; + int rc; + + node = dest->addr.id.node; + port = dest->addr.id.ref; + if (!port && !node) + return -EHOSTUNREACH; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(&tsk->cong_links, node, 0) && + !tipc_group_cong(grp, node, port, blks, &mb)); + if (unlikely(rc)) + return rc; + + if (unlikely(!mb)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mb, node, port, dlen); + + return rc ? rc : dlen; +} + +/** + * tipc_send_group_anycast - send message to any member with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_anycast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct sock *sk = sock->sk; + struct tipc_sock *tsk = tipc_sk(sk); + struct list_head *cong_links = &tsk->cong_links; + int blks = tsk_blocks(GROUP_H_SIZE + dlen); + struct tipc_group *grp = tsk->group; + struct tipc_member *first = NULL; + struct tipc_member *mbr = NULL; + struct net *net = sock_net(sk); + u32 node, port, exclude; + u32 type, inst, domain; + struct list_head dsts; + int lookups = 0; + int dstcnt, rc; + bool cong; + + INIT_LIST_HEAD(&dsts); + + type = dest->addr.name.name.type; + inst = dest->addr.name.name.instance; + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + + while (++lookups < 4) { + first = NULL; + + /* Look for a non-congested destination member, if any */ + while (1) { + if (!tipc_nametbl_lookup(net, type, inst, domain, &dsts, + &dstcnt, exclude, false)) + return -EHOSTUNREACH; + tipc_dest_pop(&dsts, &node, &port); + cong = tipc_group_cong(grp, node, port, blks, &mbr); + if (!cong) + break; + if (mbr == first) + break; + if (!first) + first = mbr; + } + + /* Start over if destination was not in member list */ + if (unlikely(!mbr)) + continue; + + if (likely(!cong && !tipc_dest_find(cong_links, node, 0))) + break; + + /* Block or return if destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(cong_links, node, 0) && + !tipc_group_cong(grp, node, port, + blks, &mbr)); + if (unlikely(rc)) + return rc; + + /* Send, unless destination disappeared while waiting */ + if (likely(mbr)) + break; + } + + if (unlikely(lookups >= 4)) + return -EHOSTUNREACH; + + rc = tipc_send_group_msg(net, tsk, m, mbr, node, port, dlen); + + return rc ? rc : dlen; +} + +/** + * tipc_send_group_bcast - send message to all members in communication group + * @sk: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_bcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct sock *sk = sock->sk; + struct net *net = sock_net(sk); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct tipc_nlist *dsts = tipc_group_dests(grp); + struct tipc_mc_method *method = &tsk->mc_method; + bool ack = method->mandatory && method->rcast; + int blks = tsk_blocks(MCAST_H_SIZE + dlen); + struct tipc_msg *hdr = &tsk->phdr; + int mtu = tipc_bcast_get_mtu(net); + struct sk_buff_head pkts; + int rc = -EHOSTUNREACH; + + if (!dsts->local && !dsts->remote) + return -EHOSTUNREACH; + + /* Block or return if any destination link or member is congested */ + rc = tipc_wait_for_cond(sock, &timeout, !tsk->cong_link_cnt && + !tipc_group_bc_cong(grp, blks)); + if (unlikely(rc)) + return rc; + + /* Complete message header */ + if (dest) { + msg_set_type(hdr, TIPC_GRP_MCAST_MSG); + msg_set_nameinst(hdr, dest->addr.name.name.instance); + } else { + msg_set_type(hdr, TIPC_GRP_BCAST_MSG); + msg_set_nameinst(hdr, 0); + } + msg_set_hdr_sz(hdr, GROUP_H_SIZE); + msg_set_destport(hdr, 0); + msg_set_destnode(hdr, 0); + msg_set_grp_bc_seqno(hdr, tipc_group_bc_snd_nxt(grp)); + + /* Avoid getting stuck with repeated forced replicasts */ + msg_set_grp_bc_ack_req(hdr, ack); + + /* Build message as chain of buffers */ + skb_queue_head_init(&pkts); + rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); + if (unlikely(rc != dlen)) + return rc; + + /* Send message */ + rc = tipc_mcast_xmit(net, &pkts, method, dsts, &tsk->cong_link_cnt); + if (unlikely(rc)) + return rc; + + /* Update broadcast sequence number and send windows */ + tipc_group_update_bc_members(tsk->group, blks, ack); + + /* Broadcast link is now free to choose method for next broadcast */ + method->mandatory = false; + method->expires = jiffies; + + return dlen; +} + +/** + * tipc_send_group_mcast - send message to all members with given identity + * @sock: socket structure + * @m: message to send + * @dlen: total length of message data + * @timeout: timeout to wait for wakeup + * + * Called from function tipc_sendmsg(), which has done all sanity checks + * Returns the number of bytes sent on success, or errno + */ +static int tipc_send_group_mcast(struct socket *sock, struct msghdr *m, + int dlen, long timeout) +{ + struct sock *sk = sock->sk; + DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name); + struct tipc_name_seq *seq = &dest->addr.nameseq; + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; + struct net *net = sock_net(sk); + u32 domain, exclude, dstcnt; + struct list_head dsts; + + INIT_LIST_HEAD(&dsts); + + if (seq->lower != seq->upper) + return -ENOTSUPP; + + domain = addr_domain(net, dest->scope); + exclude = tipc_group_exclude(grp); + if (!tipc_nametbl_lookup(net, seq->type, seq->lower, domain, + &dsts, &dstcnt, exclude, true)) + return -EHOSTUNREACH; + + if (dstcnt == 1) { + tipc_dest_pop(&dsts, &dest->addr.id.node, &dest->addr.id.ref); + return tipc_send_group_unicast(sock, m, dlen, timeout); + } + + tipc_dest_list_purge(&dsts); + return tipc_send_group_bcast(sock, m, dlen, timeout); +} + +/** * tipc_sk_mcast_rcv - Deliver multicast messages to all destination sockets * @arrvq: queue with arriving messages, to be cloned after destination lookup * @inputq: queue with cloned messages, delivered to socket after dest lookup @@ -803,13 +1117,15 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq, void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq) { - struct tipc_msg *msg; - struct list_head dports; - u32 portid; u32 scope = TIPC_CLUSTER_SCOPE; - struct sk_buff_head tmpq; - uint hsz; + u32 self = tipc_own_addr(net); struct sk_buff *skb, *_skb; + u32 lower = 0, upper = ~0; + struct sk_buff_head tmpq; + u32 portid, oport, onode; + struct list_head dports; + struct tipc_msg *msg; + int user, mtyp, hsz; __skb_queue_head_init(&tmpq); INIT_LIST_HEAD(&dports); @@ -817,17 +1133,32 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, skb = tipc_skb_peek(arrvq, &inputq->lock); for (; skb; skb = tipc_skb_peek(arrvq, &inputq->lock)) { msg = buf_msg(skb); + user = msg_user(msg); + mtyp = msg_type(msg); + if (mtyp == TIPC_GRP_UCAST_MSG || user == GROUP_PROTOCOL) { + spin_lock_bh(&inputq->lock); + if (skb_peek(arrvq) == skb) { + __skb_dequeue(arrvq); + __skb_queue_tail(inputq, skb); + } + refcount_dec(&skb->users); + spin_unlock_bh(&inputq->lock); + continue; + } hsz = skb_headroom(skb) + msg_hdr_sz(msg); - - if (in_own_node(net, msg_orignode(msg))) + oport = msg_origport(msg); + onode = msg_orignode(msg); + if (onode == self) scope = TIPC_NODE_SCOPE; /* Create destination port list and message clones: */ - tipc_nametbl_mc_translate(net, - msg_nametype(msg), msg_namelower(msg), - msg_nameupper(msg), scope, &dports); - portid = u32_pop(&dports); - for (; portid; portid = u32_pop(&dports)) { + if (!msg_in_group(msg)) { + lower = msg_namelower(msg); + upper = msg_nameupper(msg); + } + tipc_nametbl_mc_translate(net, msg_nametype(msg), lower, upper, + scope, &dports); + while (tipc_dest_pop(&dports, NULL, &portid)) { _skb = __pskb_copy(skb, hsz, GFP_ATOMIC); if (_skb) { msg_set_destport(buf_msg(_skb), portid); @@ -850,16 +1181,16 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, } /** - * tipc_sk_proto_rcv - receive a connection mng protocol message + * tipc_sk_conn_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket * @skb: pointer to message buffer. */ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { - struct sock *sk = &tsk->sk; - u32 onode = tsk_own_node(tsk); struct tipc_msg *hdr = buf_msg(skb); + u32 onode = tsk_own_node(tsk); + struct sock *sk = &tsk->sk; int mtyp = msg_type(hdr); bool conn_cong; @@ -931,6 +1262,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) long timeout = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT); struct list_head *clinks = &tsk->cong_links; bool syn = !tipc_sk_type_connectionless(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; @@ -941,18 +1273,31 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) return -EMSGSIZE; + if (likely(dest)) { + if (unlikely(m->msg_namelen < sizeof(*dest))) + return -EINVAL; + if (unlikely(dest->family != AF_TIPC)) + return -EINVAL; + } + + if (grp) { + if (!dest) + return tipc_send_group_bcast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_NAME) + return tipc_send_group_anycast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_ID) + return tipc_send_group_unicast(sock, m, dlen, timeout); + if (dest->addrtype == TIPC_ADDR_MCAST) + return tipc_send_group_mcast(sock, m, dlen, timeout); + return -EINVAL; + } + if (unlikely(!dest)) { dest = &tsk->peer; if (!syn || dest->family != AF_TIPC) return -EDESTADDRREQ; } - if (unlikely(m->msg_namelen < sizeof(*dest))) - return -EINVAL; - - if (unlikely(dest->family != AF_TIPC)) - return -EINVAL; - if (unlikely(syn)) { if (sk->sk_state == TIPC_LISTEN) return -EPIPE; @@ -985,7 +1330,6 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; - } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; msg_set_type(hdr, TIPC_DIRECT_MSG); @@ -996,7 +1340,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) } /* Block or return if destination link is congested */ - rc = tipc_wait_for_cond(sock, &timeout, !u32_find(clinks, dnode)); + rc = tipc_wait_for_cond(sock, &timeout, + !tipc_dest_find(clinks, dnode, 0)); if (unlikely(rc)) return rc; @@ -1008,7 +1353,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) rc = tipc_node_xmit(net, &pkts, dnode, tsk->portid); if (unlikely(rc == -ELINKCONG)) { - u32_push(clinks, dnode); + tipc_dest_push(clinks, dnode, 0); tsk->cong_link_cnt++; rc = 0; } @@ -1142,26 +1487,38 @@ static void tipc_sk_finish_conn(struct tipc_sock *tsk, u32 peer_port, } /** - * set_orig_addr - capture sender's address for received message + * tipc_sk_set_orig_addr - capture sender's address for received message * @m: descriptor for message info - * @msg: received message header + * @hdr: received message header * * Note: Address is not captured if not requested by receiver. */ -static void set_orig_addr(struct msghdr *m, struct tipc_msg *msg) +static void tipc_sk_set_orig_addr(struct msghdr *m, struct sk_buff *skb) { - DECLARE_SOCKADDR(struct sockaddr_tipc *, addr, m->msg_name); + DECLARE_SOCKADDR(struct sockaddr_pair *, srcaddr, m->msg_name); + struct tipc_msg *hdr = buf_msg(skb); - if (addr) { - addr->family = AF_TIPC; - addr->addrtype = TIPC_ADDR_ID; - memset(&addr->addr, 0, sizeof(addr->addr)); - addr->addr.id.ref = msg_origport(msg); - addr->addr.id.node = msg_orignode(msg); - addr->addr.name.domain = 0; /* could leave uninitialized */ - addr->scope = 0; /* could leave uninitialized */ - m->msg_namelen = sizeof(struct sockaddr_tipc); - } + if (!srcaddr) + return; + + srcaddr->sock.family = AF_TIPC; + srcaddr->sock.addrtype = TIPC_ADDR_ID; + srcaddr->sock.addr.id.ref = msg_origport(hdr); + srcaddr->sock.addr.id.node = msg_orignode(hdr); + srcaddr->sock.addr.name.domain = 0; + srcaddr->sock.scope = 0; + m->msg_namelen = sizeof(struct sockaddr_tipc); + + if (!msg_in_group(hdr)) + return; + + /* Group message users may also want to know sending member's id */ + srcaddr->member.family = AF_TIPC; + srcaddr->member.addrtype = TIPC_ADDR_NAME; + srcaddr->member.addr.name.name.type = msg_nametype(hdr); + srcaddr->member.addr.name.name.instance = TIPC_SKB_CB(skb)->orig_member; + srcaddr->member.addr.name.domain = 0; + m->msg_namelen = sizeof(*srcaddr); } /** @@ -1318,11 +1675,13 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buflen, int flags) { struct sock *sk = sock->sk; - struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *skb; - struct tipc_msg *hdr; bool connected = !tipc_sk_type_connectionless(sk); + struct tipc_sock *tsk = tipc_sk(sk); int rc, err, hlen, dlen, copy; + struct sk_buff_head xmitq; + struct tipc_msg *hdr; + struct sk_buff *skb; + bool grp_evt; long timeout; /* Catch invalid receive requests */ @@ -1336,8 +1695,8 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, } timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + /* Step rcv queue to first msg with data or error; wait if necessary */ do { - /* Look at first msg in receive queue; wait if necessary */ rc = tipc_wait_for_rcvmsg(sock, &timeout); if (unlikely(rc)) goto exit; @@ -1346,13 +1705,14 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, dlen = msg_data_sz(hdr); hlen = msg_hdr_sz(hdr); err = msg_errcode(hdr); + grp_evt = msg_is_grp_evt(hdr); if (likely(dlen || err)) break; tsk_advance_rx_queue(sk); } while (1); /* Collect msg meta data, including error code and rejected data */ - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (unlikely(rc)) goto exit; @@ -1372,15 +1732,33 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, if (unlikely(rc)) goto exit; + /* Mark message as group event if applicable */ + if (unlikely(grp_evt)) { + if (msg_grp_evt(hdr) == TIPC_WITHDRAWN) + m->msg_flags |= MSG_EOR; + m->msg_flags |= MSG_OOB; + copy = 0; + } + /* Caption of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; + /* Send group flow control advertisement when applicable */ + if (tsk->group && msg_in_group(hdr) && !grp_evt) { + skb_queue_head_init(&xmitq); + tipc_group_update_rcv_win(tsk->group, tsk_blocks(hlen + dlen), + msg_orignode(hdr), msg_origport(hdr), + &xmitq); + tipc_node_distr_xmit(sock_net(sk), &xmitq); + } + tsk_advance_rx_queue(sk); + if (likely(!connected)) goto exit; - /* Send connection flow control ack when applicable */ + /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); @@ -1446,7 +1824,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Collect msg meta data, incl. error code and rejected data */ if (!copied) { - set_orig_addr(m, hdr); + tipc_sk_set_orig_addr(m, skb); rc = tipc_sk_anc_data_recv(m, hdr, tsk); if (rc) break; @@ -1532,14 +1910,51 @@ static void tipc_sock_destruct(struct sock *sk) __skb_queue_purge(&sk->sk_receive_queue); } +static void tipc_sk_proto_rcv(struct sock *sk, + struct sk_buff_head *inputq, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = __skb_dequeue(inputq); + struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_msg *hdr = buf_msg(skb); + struct tipc_group *grp = tsk->group; + bool wakeup = false; + + switch (msg_user(hdr)) { + case CONN_MANAGER: + tipc_sk_conn_proto_rcv(tsk, skb, xmitq); + return; + case SOCK_WAKEUP: + tipc_dest_del(&tsk->cong_links, msg_orignode(hdr), 0); + tsk->cong_link_cnt--; + wakeup = true; + break; + case GROUP_PROTOCOL: + tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); + break; + case TOP_SRV: + tipc_group_member_evt(tsk->group, &wakeup, &sk->sk_rcvbuf, + skb, inputq, xmitq); + skb = NULL; + break; + default: + break; + } + + if (wakeup) + sk->sk_write_space(sk); + + kfree_skb(skb); +} + /** - * filter_connect - Handle all incoming messages for a connection-based socket + * tipc_filter_connect - Handle incoming message for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. Set to NULL if buffer is consumed * * Returns true if everything ok, false otherwise */ -static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) +static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -1643,6 +2058,9 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = buf_msg(skb); + if (unlikely(msg_in_group(hdr))) + return sk->sk_rcvbuf; + if (unlikely(!msg_connected(hdr))) return sk->sk_rcvbuf << msg_importance(hdr); @@ -1653,7 +2071,7 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) } /** - * filter_rcv - validate incoming message + * tipc_sk_filter_rcv - validate incoming message * @sk: socket * @skb: pointer to message. * @@ -1662,99 +2080,71 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *skb) * * Called with socket lock already taken * - * Returns true if message was added to socket receive queue, otherwise false */ -static bool filter_rcv(struct sock *sk, struct sk_buff *skb, - struct sk_buff_head *xmitq) +static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { + bool sk_conn = !tipc_sk_type_connectionless(sk); struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group *grp = tsk->group; struct tipc_msg *hdr = buf_msg(skb); - unsigned int limit = rcvbuf_limit(sk, skb); - int err = TIPC_OK; - int usr = msg_user(hdr); - u32 onode; + struct net *net = sock_net(sk); + struct sk_buff_head inputq; + int limit, err = TIPC_OK; - if (unlikely(msg_user(hdr) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb, xmitq); - return false; - } + TIPC_SKB_CB(skb)->bytes_read = 0; + __skb_queue_head_init(&inputq); + __skb_queue_tail(&inputq, skb); - if (unlikely(usr == SOCK_WAKEUP)) { - onode = msg_orignode(hdr); - kfree_skb(skb); - u32_del(&tsk->cong_links, onode); - tsk->cong_link_cnt--; - sk->sk_write_space(sk); - return false; - } + if (unlikely(!msg_isdata(hdr))) + tipc_sk_proto_rcv(sk, &inputq, xmitq); - /* Drop if illegal message type */ - if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) { - kfree_skb(skb); - return false; - } + if (unlikely(grp)) + tipc_group_filter_msg(grp, &inputq, xmitq); - /* Reject if wrong message type for current socket state */ - if (tipc_sk_type_connectionless(sk)) { - if (msg_connected(hdr)) { + /* Validate and add to receive buffer if there is space */ + while ((skb = __skb_dequeue(&inputq))) { + hdr = buf_msg(skb); + limit = rcvbuf_limit(sk, skb); + if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || + (!sk_conn && msg_connected(hdr)) || + (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; - goto reject; - } - } else if (unlikely(!filter_connect(tsk, skb))) { - err = TIPC_ERR_NO_PORT; - goto reject; - } + else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) + err = TIPC_ERR_OVERLOAD; - /* Reject message if there isn't room to queue it */ - if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) { - err = TIPC_ERR_OVERLOAD; - goto reject; + if (unlikely(err)) { + tipc_skb_reject(net, err, skb, xmitq); + err = TIPC_OK; + continue; + } + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); + sk->sk_data_ready(sk); } - - /* Enqueue message */ - TIPC_SKB_CB(skb)->bytes_read = 0; - __skb_queue_tail(&sk->sk_receive_queue, skb); - skb_set_owner_r(skb, sk); - - sk->sk_data_ready(sk); - return true; - -reject: - if (tipc_msg_reverse(tsk_own_node(tsk), &skb, err)) - __skb_queue_tail(xmitq, skb); - return false; } /** - * tipc_backlog_rcv - handle incoming message from backlog queue + * tipc_sk_backlog_rcv - handle incoming message from backlog queue * @sk: socket * @skb: message * * Caller must hold socket lock - * - * Returns 0 */ -static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) +static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - unsigned int truesize = skb->truesize; + unsigned int before = sk_rmem_alloc_get(sk); struct sk_buff_head xmitq; - u32 dnode, selector; + unsigned int added; __skb_queue_head_init(&xmitq); - if (likely(filter_rcv(sk, skb, &xmitq))) { - atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); - return 0; - } - - if (skb_queue_empty(&xmitq)) - return 0; + tipc_sk_filter_rcv(sk, skb, &xmitq); + added = sk_rmem_alloc_get(sk) - before; + atomic_add(added, &tipc_sk(sk)->dupl_rcvcnt); - /* Send response/rejected message */ - skb = __skb_dequeue(&xmitq); - dnode = msg_destnode(buf_msg(skb)); - selector = msg_origport(buf_msg(skb)); - tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); + /* Send pending response/rejected messages, if any */ + tipc_node_distr_xmit(sock_net(sk), &xmitq); return 0; } @@ -1786,7 +2176,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - filter_rcv(sk, skb, xmitq); + tipc_sk_filter_rcv(sk, skb, xmitq); continue; } @@ -1833,14 +2223,10 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) spin_unlock_bh(&sk->sk_lock.slock); } /* Send pending response/rejected messages, if any */ - while ((skb = __skb_dequeue(&xmitq))) { - dnode = msg_destnode(buf_msg(skb)); - tipc_node_xmit_skb(net, skb, dnode, dport); - } + tipc_node_distr_xmit(sock_net(sk), &xmitq); sock_put(sk); continue; } - /* No destination socket => dequeue skb if still there */ skb = tipc_skb_dequeue(inputq, dport); if (!skb) @@ -1903,28 +2289,32 @@ static int tipc_connect(struct socket *sock, struct sockaddr *dest, int previous; int res = 0; + if (destlen != sizeof(struct sockaddr_tipc)) + return -EINVAL; + lock_sock(sk); - /* DGRAM/RDM connect(), just save the destaddr */ - if (tipc_sk_type_connectionless(sk)) { - if (dst->family == AF_UNSPEC) { - memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); - } else if (destlen != sizeof(struct sockaddr_tipc)) { - res = -EINVAL; - } else { - memcpy(&tsk->peer, dest, destlen); - } + if (tsk->group) { + res = -EINVAL; goto exit; } - /* - * Reject connection attempt using multicast address - * - * Note: send_msg() validates the rest of the address fields, - * so there's no need to do it here - */ - if (dst->addrtype == TIPC_ADDR_MCAST) { + if (dst->family == AF_UNSPEC) { + memset(&tsk->peer, 0, sizeof(struct sockaddr_tipc)); + if (!tipc_sk_type_connectionless(sk)) + res = -EINVAL; + goto exit; + } else if (dst->family != AF_TIPC) { res = -EINVAL; + } + if (dst->addrtype != TIPC_ADDR_ID && dst->addrtype != TIPC_ADDR_NAME) + res = -EINVAL; + if (res) + goto exit; + + /* DGRAM/RDM connect(), just save the destaddr */ + if (tipc_sk_type_connectionless(sk)) { + memcpy(&tsk->peer, dest, destlen); goto exit; } @@ -2345,6 +2735,56 @@ void tipc_sk_rht_destroy(struct net *net) rhashtable_destroy(&tn->sk_rht); } +static int tipc_sk_join(struct tipc_sock *tsk, struct tipc_group_req *mreq) +{ + struct net *net = sock_net(&tsk->sk); + u32 domain = addr_domain(net, mreq->scope); + struct tipc_group *grp = tsk->group; + struct tipc_msg *hdr = &tsk->phdr; + struct tipc_name_seq seq; + int rc; + + if (mreq->type < TIPC_RESERVED_TYPES) + return -EACCES; + if (grp) + return -EACCES; + grp = tipc_group_create(net, tsk->portid, mreq); + if (!grp) + return -ENOMEM; + tsk->group = grp; + msg_set_lookup_scope(hdr, mreq->scope); + msg_set_nametype(hdr, mreq->type); + msg_set_dest_droppable(hdr, true); + seq.type = mreq->type; + seq.lower = mreq->instance; + seq.upper = seq.lower; + tipc_nametbl_build_group(net, grp, mreq->type, domain); + rc = tipc_sk_publish(tsk, mreq->scope, &seq); + if (rc) + tipc_group_delete(net, grp); + + /* Eliminate any risk that a broadcast overtakes the sent JOIN */ + tsk->mc_method.rcast = true; + tsk->mc_method.mandatory = true; + return rc; +} + +static int tipc_sk_leave(struct tipc_sock *tsk) +{ + struct net *net = sock_net(&tsk->sk); + struct tipc_group *grp = tsk->group; + struct tipc_name_seq seq; + int scope; + + if (!grp) + return -EINVAL; + tipc_group_self(grp, &seq, &scope); + tipc_group_delete(net, grp); + tsk->group = NULL; + tipc_sk_withdraw(tsk, scope, &seq); + return 0; +} + /** * tipc_setsockopt - set socket option * @sock: socket structure @@ -2363,6 +2803,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); + struct tipc_group_req mreq; u32 value = 0; int res = 0; @@ -2378,9 +2819,14 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, case TIPC_CONN_TIMEOUT: if (ol < sizeof(value)) return -EINVAL; - res = get_user(value, (u32 __user *)ov); - if (res) - return res; + if (get_user(value, (u32 __user *)ov)) + return -EFAULT; + break; + case TIPC_GROUP_JOIN: + if (ol < sizeof(mreq)) + return -EINVAL; + if (copy_from_user(&mreq, ov, sizeof(mreq))) + return -EFAULT; break; default: if (ov || ol) @@ -2413,6 +2859,12 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, tsk->mc_method.rcast = true; tsk->mc_method.mandatory = true; break; + case TIPC_GROUP_JOIN: + res = tipc_sk_join(tsk, &mreq); + break; + case TIPC_GROUP_LEAVE: + res = tipc_sk_leave(tsk); + break; default: res = -EINVAL; } @@ -2440,7 +2892,8 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - int len; + struct tipc_name_seq seq; + int len, scope; u32 value; int res; @@ -2474,6 +2927,12 @@ static int tipc_getsockopt(struct socket *sock, int lvl, int opt, case TIPC_SOCK_RECVQ_DEPTH: value = skb_queue_len(&sk->sk_receive_queue); break; + case TIPC_GROUP_JOIN: + seq.type = 0; + if (tsk->group) + tipc_group_self(tsk->group, &seq, &scope); + value = seq.type; + break; default: res = -EINVAL; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 67a03f2885a4..fce2cbe6a193 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -549,6 +549,14 @@ nl80211_nan_srf_policy[NL80211_NAN_SRF_ATTR_MAX + 1] = { [NL80211_NAN_SRF_MAC_ADDRS] = { .type = NLA_NESTED }, }; +/* policy for packet pattern attributes */ +static const struct nla_policy +nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { + [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -10571,7 +10579,8 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) u8 *mask_pat; nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - NULL, info->extack); + nl80211_packet_pattern_policy, + info->extack); err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -10820,7 +10829,8 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL); + nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, NULL); if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index acf00104ef31..30e5746085b8 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -91,6 +91,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + xso->dev = NULL; dev_put(dev); return 0; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 2515cd2bc5db..8ac9d32fb79d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -429,7 +429,8 @@ resume: nf_reset(skb); if (decaps) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; @@ -440,7 +441,8 @@ resume: err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { - skb->sp->olen = 0; + if (skb->sp) + skb->sp->olen = 0; skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 0dab1cd79ce4..12213477cd3a 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -732,12 +732,12 @@ restart: } } } +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) { err = 0; xfrm_policy_cache_flush(); } -out: - spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_flush); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2bfbd9121e3b..b997f1395357 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -657,6 +657,7 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) { x->km.state = XFRM_STATE_DEAD; + xfrm_dev_state_delete(x); __xfrm_state_put(x); goto out; } diff --git a/samples/bpf/trace_event_kern.c b/samples/bpf/trace_event_kern.c index 41b6115a32eb..a77a583d94d4 100644 --- a/samples/bpf/trace_event_kern.c +++ b/samples/bpf/trace_event_kern.c @@ -37,10 +37,14 @@ struct bpf_map_def SEC("maps") stackmap = { SEC("perf_event") int bpf_prog1(struct bpf_perf_event_data *ctx) { + char time_fmt1[] = "Time Enabled: %llu, Time Running: %llu"; + char time_fmt2[] = "Get Time Failed, ErrCode: %d"; char fmt[] = "CPU-%d period %lld ip %llx"; u32 cpu = bpf_get_smp_processor_id(); + struct bpf_perf_event_value value_buf; struct key_t key; u64 *val, one = 1; + int ret; if (ctx->sample_period < 10000) /* ignore warmup */ @@ -54,6 +58,12 @@ int bpf_prog1(struct bpf_perf_event_data *ctx) return 0; } + ret = bpf_perf_prog_read_value(ctx, (void *)&value_buf, sizeof(struct bpf_perf_event_value)); + if (!ret) + bpf_trace_printk(time_fmt1, sizeof(time_fmt1), value_buf.enabled, value_buf.running); + else + bpf_trace_printk(time_fmt2, sizeof(time_fmt2), ret); + val = bpf_map_lookup_elem(&counts, &key); if (val) (*val)++; diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c index 7bd827b84a67..bf4f1b6d9a52 100644 --- a/samples/bpf/trace_event_user.c +++ b/samples/bpf/trace_event_user.c @@ -127,6 +127,9 @@ static void test_perf_event_all_cpu(struct perf_event_attr *attr) int *pmu_fd = malloc(nr_cpus * sizeof(int)); int i, error = 0; + /* system wide perf event, no need to inherit */ + attr->inherit = 0; + /* open perf_event on all cpus */ for (i = 0; i < nr_cpus; i++) { pmu_fd[i] = sys_perf_event_open(attr, -1, i, -1, 0); @@ -154,6 +157,11 @@ static void test_perf_event_task(struct perf_event_attr *attr) { int pmu_fd; + /* per task perf event, enable inherit so the "dd ..." command can be traced properly. + * Enabling inherit will cause bpf_perf_prog_read_time helper failure. + */ + attr->inherit = 1; + /* open task bound event */ pmu_fd = sys_perf_event_open(attr, 0, -1, -1, 0); if (pmu_fd < 0) { @@ -175,14 +183,12 @@ static void test_bpf_perf_event(void) .freq = 1, .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES, - .inherit = 1, }; struct perf_event_attr attr_type_sw = { .sample_freq = SAMPLE_FREQ, .freq = 1, .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_CLOCK, - .inherit = 1, }; struct perf_event_attr attr_hw_cache_l1d = { .sample_freq = SAMPLE_FREQ, @@ -192,7 +198,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_L1D | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16), - .inherit = 1, }; struct perf_event_attr attr_hw_cache_branch_miss = { .sample_freq = SAMPLE_FREQ, @@ -202,7 +207,6 @@ static void test_bpf_perf_event(void) PERF_COUNT_HW_CACHE_BPU | (PERF_COUNT_HW_CACHE_OP_READ << 8) | (PERF_COUNT_HW_CACHE_RESULT_MISS << 16), - .inherit = 1, }; struct perf_event_attr attr_type_raw = { .sample_freq = SAMPLE_FREQ, @@ -210,7 +214,6 @@ static void test_bpf_perf_event(void) .type = PERF_TYPE_RAW, /* Intel Instruction Retired */ .config = 0xc0, - .inherit = 1, }; printf("Test HW_CPU_CYCLES\n"); diff --git a/samples/bpf/tracex6_kern.c b/samples/bpf/tracex6_kern.c index e7d180305974..46c557afac73 100644 --- a/samples/bpf/tracex6_kern.c +++ b/samples/bpf/tracex6_kern.c @@ -15,6 +15,12 @@ struct bpf_map_def SEC("maps") values = { .value_size = sizeof(u64), .max_entries = 64, }; +struct bpf_map_def SEC("maps") values2 = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(int), + .value_size = sizeof(struct bpf_perf_event_value), + .max_entries = 64, +}; SEC("kprobe/htab_map_get_next_key") int bpf_prog1(struct pt_regs *ctx) @@ -37,5 +43,25 @@ int bpf_prog1(struct pt_regs *ctx) return 0; } +SEC("kprobe/htab_map_lookup_elem") +int bpf_prog2(struct pt_regs *ctx) +{ + u32 key = bpf_get_smp_processor_id(); + struct bpf_perf_event_value *val, buf; + int error; + + error = bpf_perf_event_read_value(&counters, key, &buf, sizeof(buf)); + if (error) + return 0; + + val = bpf_map_lookup_elem(&values2, &key); + if (val) + *val = buf; + else + bpf_map_update_elem(&values2, &key, &buf, BPF_NOEXIST); + + return 0; +} + char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/tracex6_user.c b/samples/bpf/tracex6_user.c index a05a99a0752f..3341a96fc046 100644 --- a/samples/bpf/tracex6_user.c +++ b/samples/bpf/tracex6_user.c @@ -22,6 +22,7 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) { + struct bpf_perf_event_value value2; int pmu_fd, error = 0; cpu_set_t set; __u64 value; @@ -46,8 +47,18 @@ static void check_on_cpu(int cpu, struct perf_event_attr *attr) fprintf(stderr, "Value missing for CPU %d\n", cpu); error = 1; goto on_exit; + } else { + fprintf(stderr, "CPU %d: %llu\n", cpu, value); + } + /* The above bpf_map_lookup_elem should trigger the second kprobe */ + if (bpf_map_lookup_elem(map_fd[2], &cpu, &value2)) { + fprintf(stderr, "Value2 missing for CPU %d\n", cpu); + error = 1; + goto on_exit; + } else { + fprintf(stderr, "CPU %d: counter: %llu, enabled: %llu, running: %llu\n", cpu, + value2.counter, value2.enabled, value2.running); } - fprintf(stderr, "CPU %d: %llu\n", cpu, value); on_exit: assert(bpf_map_delete_elem(map_fd[0], &cpu) == 0 || error); diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c index 74f3fd8ed729..2fe2f761a0d0 100644 --- a/samples/bpf/xdp_monitor_kern.c +++ b/samples/bpf/xdp_monitor_kern.c @@ -13,23 +13,27 @@ struct bpf_map_def SEC("maps") redirect_err_cnt = { /* TODO: have entries for all possible errno's */ }; +#define XDP_UNKNOWN XDP_REDIRECT + 1 +struct bpf_map_def SEC("maps") exception_cnt = { + .type = BPF_MAP_TYPE_PERCPU_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(u64), + .max_entries = XDP_UNKNOWN + 1, +}; + /* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_redirect/format * Code in: kernel/include/trace/events/xdp.h */ struct xdp_redirect_ctx { - unsigned short common_type; // offset:0; size:2; signed:0; - unsigned char common_flags; // offset:2; size:1; signed:0; - unsigned char common_preempt_count;// offset:3; size:1; signed:0; - int common_pid; // offset:4; size:4; signed:1; - - int prog_id; // offset:8; size:4; signed:1; - u32 act; // offset:12 size:4; signed:0; - int ifindex; // offset:16 size:4; signed:1; - int err; // offset:20 size:4; signed:1; - int to_ifindex; // offset:24 size:4; signed:1; - u32 map_id; // offset:28 size:4; signed:0; - int map_index; // offset:32 size:4; signed:1; -}; // offset:36 + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12 size:4; signed:0; + int ifindex; // offset:16 size:4; signed:1; + int err; // offset:20 size:4; signed:1; + int to_ifindex; // offset:24 size:4; signed:1; + u32 map_id; // offset:28 size:4; signed:0; + int map_index; // offset:32 size:4; signed:1; +}; // offset:36 enum { XDP_REDIRECT_SUCCESS = 0, @@ -48,7 +52,7 @@ int xdp_redirect_collect_stat(struct xdp_redirect_ctx *ctx) cnt = bpf_map_lookup_elem(&redirect_err_cnt, &key); if (!cnt) - return 0; + return 1; *cnt += 1; return 0; /* Indicate event was filtered (no further processing)*/ @@ -86,3 +90,31 @@ int trace_xdp_redirect_map(struct xdp_redirect_ctx *ctx) { return xdp_redirect_collect_stat(ctx); } + +/* Tracepoint format: /sys/kernel/debug/tracing/events/xdp/xdp_exception/format + * Code in: kernel/include/trace/events/xdp.h + */ +struct xdp_exception_ctx { + u64 __pad; // First 8 bytes are not accessible by bpf code + int prog_id; // offset:8; size:4; signed:1; + u32 act; // offset:12; size:4; signed:0; + int ifindex; // offset:16; size:4; signed:1; +}; + +SEC("tracepoint/xdp/xdp_exception") +int trace_xdp_exception(struct xdp_exception_ctx *ctx) +{ + u64 *cnt;; + u32 key; + + key = ctx->act; + if (key > XDP_REDIRECT) + key = XDP_UNKNOWN; + + cnt = bpf_map_lookup_elem(&exception_cnt, &key); + if (!cnt) + return 1; + *cnt += 1; + + return 0; +} diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c index c5ab8b776973..eaba165b3549 100644 --- a/samples/bpf/xdp_monitor_user.c +++ b/samples/bpf/xdp_monitor_user.c @@ -20,6 +20,7 @@ static const char *__doc_err_only__= #include <unistd.h> #include <locale.h> +#include <sys/resource.h> #include <getopt.h> #include <net/if.h> #include <time.h> @@ -89,6 +90,23 @@ static const char *err2str(int err) return redir_names[err]; return NULL; } +/* enum xdp_action */ +#define XDP_UNKNOWN XDP_REDIRECT + 1 +#define XDP_ACTION_MAX (XDP_UNKNOWN + 1) +static const char *xdp_action_names[XDP_ACTION_MAX] = { + [XDP_ABORTED] = "XDP_ABORTED", + [XDP_DROP] = "XDP_DROP", + [XDP_PASS] = "XDP_PASS", + [XDP_TX] = "XDP_TX", + [XDP_REDIRECT] = "XDP_REDIRECT", + [XDP_UNKNOWN] = "XDP_UNKNOWN", +}; +static const char *action2str(int action) +{ + if (action < XDP_ACTION_MAX) + return xdp_action_names[action]; + return NULL; +} struct record { __u64 counter; @@ -97,6 +115,7 @@ struct record { struct stats_record { struct record xdp_redir[REDIR_RES_MAX]; + struct record xdp_exception[XDP_ACTION_MAX]; }; static void stats_print_headers(bool err_only) @@ -104,39 +123,72 @@ static void stats_print_headers(bool err_only) if (err_only) printf("\n%s\n", __doc_err_only__); - printf("%-14s %-10s %-18s %-9s\n", - "XDP_REDIRECT", "pps ", "pps-human-readable", "measure-period"); + printf("%-14s %-11s %-10s %-18s %-9s\n", + "ACTION", "result", "pps ", "pps-human-readable", "measure-period"); +} + +static double calc_period(struct record *r, struct record *p) +{ + double period_ = 0; + __u64 period = 0; + + period = r->timestamp - p->timestamp; + if (period > 0) + period_ = ((double) period / NANOSEC_PER_SEC); + + return period_; +} + +static double calc_pps(struct record *r, struct record *p, double period) +{ + __u64 packets = 0; + double pps = 0; + + if (period > 0) { + packets = r->counter - p->counter; + pps = packets / period; + } + return pps; } static void stats_print(struct stats_record *rec, struct stats_record *prev, bool err_only) { + double period = 0, pps = 0; + struct record *r, *p; int i = 0; + char *fmt = "%-14s %-11s %-10.0f %'-18.0f %f\n"; + + /* tracepoint: xdp:xdp_redirect_* */ if (err_only) i = REDIR_ERROR; for (; i < REDIR_RES_MAX; i++) { - struct record *r = &rec->xdp_redir[i]; - struct record *p = &prev->xdp_redir[i]; - __u64 period = 0; - __u64 packets = 0; - double pps = 0; - double period_ = 0; + r = &rec->xdp_redir[i]; + p = &prev->xdp_redir[i]; if (p->timestamp) { - packets = r->counter - p->counter; - period = r->timestamp - p->timestamp; - if (period > 0) { - period_ = ((double) period / NANOSEC_PER_SEC); - pps = packets / period_; - } + period = calc_period(r, p); + pps = calc_pps(r, p, period); } + printf(fmt, "XDP_REDIRECT", err2str(i), pps, pps, period); + } - printf("%-14s %-10.0f %'-18.0f %f\n", - err2str(i), pps, pps, period_); + /* tracepoint: xdp:xdp_exception */ + for (i = 0; i < XDP_ACTION_MAX; i++) { + r = &rec->xdp_exception[i]; + p = &prev->xdp_exception[i]; + if (p->timestamp) { + period = calc_period(r, p); + pps = calc_pps(r, p, period); + } + if (pps > 0) + printf(fmt, action2str(i), "Exception", + pps, pps, period); } + printf("\n"); } static __u64 get_key32_value64_percpu(int fd, __u32 key) @@ -160,25 +212,33 @@ static __u64 get_key32_value64_percpu(int fd, __u32 key) return sum; } -static bool stats_collect(int fd, struct stats_record *rec) +static bool stats_collect(struct stats_record *rec) { + int fd; int i; /* TODO: Detect if someone unloaded the perf event_fd's, as * this can happen by someone running perf-record -e */ + fd = map_data[0].fd; /* map0: redirect_err_cnt */ for (i = 0; i < REDIR_RES_MAX; i++) { rec->xdp_redir[i].timestamp = gettime(); rec->xdp_redir[i].counter = get_key32_value64_percpu(fd, i); } + + fd = map_data[1].fd; /* map1: exception_cnt */ + for (i = 0; i < XDP_ACTION_MAX; i++) { + rec->xdp_exception[i].timestamp = gettime(); + rec->xdp_exception[i].counter = get_key32_value64_percpu(fd, i); + } + return true; } static void stats_poll(int interval, bool err_only) { struct stats_record rec, prev; - int map_fd; memset(&rec, 0, sizeof(rec)); @@ -190,16 +250,17 @@ static void stats_poll(int interval, bool err_only) printf("\n%s", __doc__); /* TODO Need more advanced stats on error types */ - if (verbose) - printf(" - Stats map: %s\n", map_data[0].name); - map_fd = map_data[0].fd; - - stats_print_headers(err_only); + if (verbose) { + printf(" - Stats map0: %s\n", map_data[0].name); + printf(" - Stats map1: %s\n", map_data[1].name); + printf("\n"); + } fflush(stdout); while (1) { memcpy(&prev, &rec, sizeof(rec)); - stats_collect(map_fd, &rec); + stats_collect(&rec); + stats_print_headers(err_only); stats_print(&rec, &prev, err_only); fflush(stdout); sleep(interval); @@ -235,6 +296,7 @@ static void print_bpf_prog_info(void) int main(int argc, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; int longindex = 0, opt; int ret = EXIT_SUCCESS; char bpf_obj_file[256]; @@ -265,13 +327,18 @@ int main(int argc, char **argv) } } + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return EXIT_FAILURE; + } + if (load_bpf_file(bpf_obj_file)) { printf("ERROR - bpf_log_buf: %s", bpf_log_buf); - return 1; + return EXIT_FAILURE; } if (!prog_fd[0]) { printf("ERROR - load_bpf_file: %s\n", strerror(errno)); - return 1; + return EXIT_FAILURE; } if (debug) { diff --git a/tools/bpf/bpftool/Documentation/bpftool-prog.rst b/tools/bpf/bpftool/Documentation/bpftool-prog.rst index 57fc4b9924ea..04d12f768f06 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-prog.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-prog.rst @@ -11,7 +11,7 @@ SYNOPSIS ======== | **bpftool** prog show [*PROG*] -| **bpftool** prog dump xlated *PROG* file *FILE* +| **bpftool** prog dump xlated *PROG* [file *FILE*] [opcodes] | **bpftool** prog dump jited *PROG* [file *FILE*] [opcodes] | **bpftool** prog pin *PROG* *FILE* | **bpftool** prog help @@ -28,9 +28,12 @@ DESCRIPTION Output will start with program ID followed by program type and zero or more named attributes (depending on kernel version). - **bpftool prog dump xlated** *PROG* **file** *FILE* - Dump eBPF instructions of the program from the kernel to a - file. + **bpftool prog dump xlated** *PROG* [**file** *FILE*] [**opcodes**] + Dump eBPF instructions of the program from the kernel. + If *FILE* is specified image will be written to a file, + otherwise it will be disassembled and printed to stdout. + + **opcodes** controls if raw opcodes will be printed. **bpftool prog dump jited** *PROG* [**file** *FILE*] [**opcodes**] Dump jited image (host machine code) of the program. diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile index 8705ee44664d..4f339824ca57 100644 --- a/tools/bpf/bpftool/Makefile +++ b/tools/bpf/bpftool/Makefile @@ -51,7 +51,7 @@ CC = gcc CFLAGS += -O2 CFLAGS += -W -Wall -Wextra -Wno-unused-parameter -Wshadow -CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf +CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi -I$(srctree)/tools/include -I$(srctree)/tools/lib/bpf -I$(srctree)/kernel/bpf/ LIBS = -lelf -lbfd -lopcodes $(LIBBPF) include $(wildcard *.d) @@ -59,7 +59,10 @@ include $(wildcard *.d) all: $(OUTPUT)bpftool SRCS=$(wildcard *.c) -OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) +OBJS=$(patsubst %.c,$(OUTPUT)%.o,$(SRCS)) $(OUTPUT)disasm.o + +$(OUTPUT)disasm.o: $(srctree)/kernel/bpf/disasm.c + $(QUIET_CC)$(COMPILE.c) -MMD -o $@ $< $(OUTPUT)bpftool: $(OBJS) $(LIBBPF) $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $^ $(LIBS) diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h index 85d2d7870a58..8e809b2bb311 100644 --- a/tools/bpf/bpftool/main.h +++ b/tools/bpf/bpftool/main.h @@ -36,11 +36,12 @@ #ifndef __BPF_TOOL_H #define __BPF_TOOL_H +/* BFD and kernel.h both define GCC_VERSION, differently */ +#undef GCC_VERSION #include <stdbool.h> #include <stdio.h> #include <linux/bpf.h> - -#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) +#include <linux/kernel.h> #define err(msg...) fprintf(stderr, "Error: " msg) #define warn(msg...) fprintf(stderr, "Warning: " msg) @@ -48,11 +49,6 @@ #define ptr_to_u64(ptr) ((__u64)(unsigned long)(ptr)) -#define min(a, b) \ - ({ typeof(a) _a = (a); typeof(b) _b = (b); _a > _b ? _b : _a; }) -#define max(a, b) \ - ({ typeof(a) _a = (a); typeof(b) _b = (b); _a < _b ? _b : _a; }) - #define NEXT_ARG() ({ argc--; argv++; if (argc < 0) usage(); }) #define NEXT_ARGP() ({ (*argc)--; (*argv)++; if (*argc < 0) usage(); }) #define BAD_ARG() ({ err("what is '%s'?\n", *argv); -1; }) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 421ba89ce86a..9e2681c83717 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -35,6 +35,7 @@ #include <errno.h> #include <fcntl.h> +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -46,6 +47,7 @@ #include <bpf.h> #include "main.h" +#include "disasm.h" static const char * const prog_type_name[] = { [BPF_PROG_TYPE_UNSPEC] = "unspec", @@ -297,11 +299,39 @@ static int do_show(int argc, char **argv) return 0; } +static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vprintf(fmt, args); + va_end(args); +} + +static void dump_xlated(void *buf, unsigned int len, bool opcodes) +{ + struct bpf_insn *insn = buf; + unsigned int i; + + for (i = 0; i < len / sizeof(*insn); i++) { + printf("% 4d: ", i); + print_bpf_insn(print_insn, NULL, insn + i, true); + + if (opcodes) { + printf(" "); + print_hex(insn + i, 8, " "); + printf("\n"); + } + + if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW)) + i++; + } +} + static int do_dump(int argc, char **argv) { struct bpf_prog_info info = {}; __u32 len = sizeof(info); - bool can_disasm = false; unsigned int buf_size; char *filepath = NULL; bool opcodes = false; @@ -315,7 +345,6 @@ static int do_dump(int argc, char **argv) if (is_prefix(*argv, "jited")) { member_len = &info.jited_prog_len; member_ptr = &info.jited_prog_insns; - can_disasm = true; } else if (is_prefix(*argv, "xlated")) { member_len = &info.xlated_prog_len; member_ptr = &info.xlated_prog_insns; @@ -346,10 +375,6 @@ static int do_dump(int argc, char **argv) NEXT_ARG(); } - if (!filepath && !can_disasm) { - err("expected 'file' got %s\n", *argv); - return -1; - } if (argc) { usage(); return -1; @@ -409,7 +434,10 @@ static int do_dump(int argc, char **argv) goto err_free; } } else { - disasm_print_insn(buf, *member_len, opcodes); + if (member_len == &info.jited_prog_len) + disasm_print_insn(buf, *member_len, opcodes); + else + dump_xlated(buf, *member_len, opcodes); } free(buf); @@ -430,7 +458,7 @@ static int do_help(int argc, char **argv) { fprintf(stderr, "Usage: %s %s show [PROG]\n" - " %s %s dump xlated PROG file FILE\n" + " %s %s dump xlated PROG [file FILE] [opcodes]\n" " %s %s dump jited PROG [file FILE] [opcodes]\n" " %s %s pin PROG FILE\n" " %s %s help\n" diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index cb2b9f95160a..fb4fb81ce5b0 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -230,7 +230,7 @@ union bpf_attr { __u32 numa_node; /* numa node (effective only if * BPF_F_NUMA_NODE is set). */ - __u8 map_name[BPF_OBJ_NAME_LEN]; + char map_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -253,7 +253,7 @@ union bpf_attr { __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ __u32 prog_flags; - __u8 prog_name[BPF_OBJ_NAME_LEN]; + char prog_name[BPF_OBJ_NAME_LEN]; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -697,7 +697,9 @@ union bpf_attr { FN(redirect_map), \ FN(sk_redirect_map), \ FN(sock_map_update), \ - FN(xdp_adjust_meta), + FN(xdp_adjust_meta), \ + FN(perf_event_read_value), \ + FN(perf_prog_read_value), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -869,7 +871,7 @@ struct bpf_prog_info { __u32 created_by_uid; __u32 nr_map_ids; __aligned_u64 map_ids; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); struct bpf_map_info { @@ -879,7 +881,7 @@ struct bpf_map_info { __u32 value_size; __u32 max_entries; __u32 map_flags; - __u8 name[BPF_OBJ_NAME_LEN]; + char name[BPF_OBJ_NAME_LEN]; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index 924af8d79bde..2e7880ea0add 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -12,7 +12,7 @@ CFLAGS += -Wall -O2 -I$(APIDIR) -I$(LIBDIR) -I$(GENDIR) $(GENFLAGS) -I../../../i LDLIBS += -lcap -lelf TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_align + test_align test_verifier_log TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index a56053db26f5..e25dbf6038cf 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -72,6 +72,12 @@ static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = static int (*bpf_sock_map_update)(void *map, void *key, void *value, unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; +static int (*bpf_perf_event_read_value)(void *map, unsigned long long flags, + void *buf, unsigned int buf_size) = + (void *) BPF_FUNC_perf_event_read_value; +static int (*bpf_perf_prog_read_value)(void *ctx, void *buf, + unsigned int buf_size) = + (void *) BPF_FUNC_perf_prog_read_value; /* llvm builtin functions that eBPF C program may use to diff --git a/tools/testing/selftests/bpf/test_verifier_log.c b/tools/testing/selftests/bpf/test_verifier_log.c new file mode 100644 index 000000000000..3cc0b561489e --- /dev/null +++ b/tools/testing/selftests/bpf/test_verifier_log.c @@ -0,0 +1,171 @@ +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> + +#include <linux/bpf.h> +#include <linux/filter.h> +#include <linux/unistd.h> + +#include <bpf/bpf.h> + +#define LOG_SIZE (1 << 20) + +#define err(str...) printf("ERROR: " str) + +static const struct bpf_insn code_sample[] = { + /* We need a few instructions to pass the min log length */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_map_lookup_elem), + BPF_EXIT_INSN(), +}; + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64) (unsigned long) ptr; +} + +static int load(char *log, size_t log_len, int log_level) +{ + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.insn_cnt = (__u32)(sizeof(code_sample) / sizeof(struct bpf_insn)); + attr.insns = ptr_to_u64(code_sample); + attr.license = ptr_to_u64("GPL"); + attr.log_buf = ptr_to_u64(log); + attr.log_size = log_len; + attr.log_level = log_level; + + return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); +} + +static void check_ret(int ret, int exp_errno) +{ + if (ret > 0) { + close(ret); + err("broken sample loaded successfully!?\n"); + exit(1); + } + + if (!ret || errno != exp_errno) { + err("Program load returned: ret:%d/errno:%d, expected ret:%d/errno:%d\n", + ret, errno, -1, exp_errno); + exit(1); + } +} + +static void check_ones(const char *buf, size_t len, const char *msg) +{ + while (len--) + if (buf[len] != 1) { + err("%s", msg); + exit(1); + } +} + +static void test_log_good(char *log, size_t buf_len, size_t log_len, + size_t exp_len, int exp_errno, const char *full_log) +{ + size_t len; + int ret; + + memset(log, 1, buf_len); + + ret = load(log, log_len, 1); + check_ret(ret, exp_errno); + + len = strnlen(log, buf_len); + if (len == buf_len) { + err("verifier did not NULL terminate the log\n"); + exit(1); + } + if (exp_len && len != exp_len) { + err("incorrect log length expected:%zd have:%zd\n", + exp_len, len); + exit(1); + } + + if (strchr(log, 1)) { + err("verifier leaked a byte through\n"); + exit(1); + } + + check_ones(log + len + 1, buf_len - len - 1, + "verifier wrote bytes past NULL termination\n"); + + if (memcmp(full_log, log, LOG_SIZE)) { + err("log did not match expected output\n"); + exit(1); + } +} + +static void test_log_bad(char *log, size_t log_len, int log_level) +{ + int ret; + + ret = load(log, log_len, log_level); + check_ret(ret, EINVAL); + if (log) + check_ones(log, LOG_SIZE, + "verifier touched log with bad parameters\n"); +} + +int main(int argc, char **argv) +{ + char full_log[LOG_SIZE]; + char log[LOG_SIZE]; + size_t want_len; + int i; + + memset(log, 1, LOG_SIZE); + + /* Test incorrect attr */ + printf("Test log_level 0...\n"); + test_log_bad(log, LOG_SIZE, 0); + + printf("Test log_size < 128...\n"); + test_log_bad(log, 15, 1); + + printf("Test log_buff = NULL...\n"); + test_log_bad(NULL, LOG_SIZE, 1); + + /* Test with log big enough */ + printf("Test oversized buffer...\n"); + test_log_good(full_log, LOG_SIZE, LOG_SIZE, 0, EACCES, full_log); + + want_len = strlen(full_log); + + printf("Test exact buffer...\n"); + test_log_good(log, LOG_SIZE, want_len + 2, want_len, EACCES, full_log); + + printf("Test undersized buffers...\n"); + for (i = 0; i < 64; i++) { + full_log[want_len - i + 1] = 1; + full_log[want_len - i] = 0; + + test_log_good(log, LOG_SIZE, want_len + 1 - i, want_len - i, + ENOSPC, full_log); + } + + printf("test_verifier_log: OK\n"); + return 0; +} diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index e8c86c416ed0..5215493166c9 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -37,6 +37,26 @@ kci_del_dummy() check_err $? } +kci_test_netconf() +{ + dev="$1" + r=$ret + + ip netconf show dev "$dev" > /dev/null + check_err $? + + for f in 4 6; do + ip -$f netconf show dev "$dev" > /dev/null + check_err $? + done + + if [ $ret -ne 0 ] ;then + echo "FAIL: ip netconf show $dev" + test $r -eq 0 && ret=0 + return 1 + fi +} + # add a bridge with vlans on top kci_test_bridge() { @@ -63,6 +83,11 @@ kci_test_bridge() check_err $? ip r s t all > /dev/null check_err $? + + for name in "$devbr" "$vlandev" "$devdummy" ; do + kci_test_netconf "$name" + done + ip -6 addr del dev "$vlandev" dead:42::1234/64 check_err $? @@ -100,6 +125,9 @@ kci_test_gre() check_err $? ip addr > /dev/null check_err $? + + kci_test_netconf "$gredev" + ip addr del dev "$devdummy" 10.23.7.11/24 check_err $? @@ -433,6 +461,47 @@ kci_test_encap() ip netns del "$testns" } +kci_test_macsec() +{ + msname="test_macsec0" + ret=0 + + ip macsec help 2>&1 | grep -q "^Usage: ip macsec" + if [ $? -ne 0 ]; then + echo "SKIP: macsec: iproute2 too old" + return 0 + fi + + ip link add link "$devdummy" "$msname" type macsec port 42 encrypt on + check_err $? + if [ $ret -ne 0 ];then + echo "FAIL: can't add macsec interface, skipping test" + return 1 + fi + + ip macsec add "$msname" tx sa 0 pn 1024 on key 01 12345678901234567890123456789012 + check_err $? + + ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef" + check_err $? + + ip macsec add "$msname" rx port 1234 address "1c:ed:de:ad:be:ef" sa 0 pn 1 on key 00 0123456789abcdef0123456789abcdef + check_err $? + + ip macsec show > /dev/null + check_err $? + + ip link del dev "$msname" + check_err $? + + if [ $ret -ne 0 ];then + echo "FAIL: macsec" + return 1 + fi + + echo "PASS: macsec" +} + kci_test_rtnl() { kci_add_dummy @@ -450,6 +519,7 @@ kci_test_rtnl() kci_test_ifalias kci_test_vrf kci_test_encap + kci_test_macsec kci_del_dummy } diff --git a/tools/testing/selftests/networking/timestamping/rxtimestamp.c b/tools/testing/selftests/networking/timestamping/rxtimestamp.c index 00f286661dcd..dd4162fc0419 100644 --- a/tools/testing/selftests/networking/timestamping/rxtimestamp.c +++ b/tools/testing/selftests/networking/timestamping/rxtimestamp.c @@ -341,7 +341,7 @@ int main(int argc, char **argv) return 0; case 'n': t = atoi(optarg); - if (t > ARRAY_SIZE(test_cases)) + if (t >= ARRAY_SIZE(test_cases)) error(1, 0, "Invalid test case: %d", t); all_tests = false; test_cases[t].enabled = true; |