Diffstat (limited to 'arch')
123 files changed, 3181 insertions, 892 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 19483aea4bbc..99839c23d453 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -5,6 +5,9 @@
 config KEXEC_CORE
 	bool
 
+config HAVE_IMA_KEXEC
+	bool
+
 config OPROFILE
 	tristate "OProfile system profiling"
 	depends on PROFILING
diff --git a/arch/arm/boot/dts/hisi-x5hd2.dtsi b/arch/arm/boot/dts/hisi-x5hd2.dtsi
index c02e092fad8b..6c712a97e1fe 100644
--- a/arch/arm/boot/dts/hisi-x5hd2.dtsi
+++ b/arch/arm/boot/dts/hisi-x5hd2.dtsi
@@ -438,7 +438,7 @@
 		};
 
 		gmac0: ethernet@1840000 {
-			compatible = "hisilicon,hix5hd2-gemac", "hisilicon,hisi-gemac-v1";
+			compatible = "hisilicon,hix5hd2-gmac", "hisilicon,hisi-gmac-v1";
 			reg = <0x1840000 0x1000>,<0x184300c 0x4>;
 			interrupts = <0 71 4>;
 			clocks = <&clock HIX5HD2_MAC0_CLK>;
@@ -447,7 +447,7 @@
 		};
 
 		gmac1: ethernet@1841000 {
-			compatible = "hisilicon,hix5hd2-gemac", "hisilicon,hisi-gemac-v1";
+			compatible = "hisilicon,hix5hd2-gmac", "hisilicon,hisi-gmac-v1";
 			reg = <0x1841000 0x1000>,<0x1843010 0x4>;
 			interrupts = <0 72 4>;
 			clocks = <&clock HIX5HD2_MAC1_CLK>;
diff --git a/arch/arm/mach-s3c24xx/common-smdk.c b/arch/arm/mach-s3c24xx/common-smdk.c
index e9fbcc91c5c0..9e0bc46e90ec 100644
--- a/arch/arm/mach-s3c24xx/common-smdk.c
+++ b/arch/arm/mach-s3c24xx/common-smdk.c
@@ -171,6 +171,7 @@ static struct s3c2410_platform_nand smdk_nand_info = {
 	.twrph1		= 20,
 	.nr_sets	= ARRAY_SIZE(smdk_nand_sets),
 	.sets		= smdk_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* devices we initialise */
diff --git a/arch/arm/mach-s3c24xx/mach-anubis.c b/arch/arm/mach-s3c24xx/mach-anubis.c
index d03df0df01fa..029ef1b58925 100644
--- a/arch/arm/mach-s3c24xx/mach-anubis.c
+++ b/arch/arm/mach-s3c24xx/mach-anubis.c
@@ -223,6 +223,7 @@ static struct s3c2410_platform_nand __initdata anubis_nand_info = {
 	.nr_sets	= ARRAY_SIZE(anubis_nand_sets),
 	.sets		= anubis_nand_sets,
 	.select_chip	= anubis_nand_select,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* IDE channels */
diff --git a/arch/arm/mach-s3c24xx/mach-at2440evb.c b/arch/arm/mach-s3c24xx/mach-at2440evb.c
index 9ae170fef2a7..7b28eb623fc1 100644
--- a/arch/arm/mach-s3c24xx/mach-at2440evb.c
+++ b/arch/arm/mach-s3c24xx/mach-at2440evb.c
@@ -114,6 +114,7 @@ static struct s3c2410_platform_nand __initdata at2440evb_nand_info = {
 	.twrph1		= 40,
 	.nr_sets	= ARRAY_SIZE(at2440evb_nand_sets),
 	.sets		= at2440evb_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* DM9000AEP 10/100 ethernet controller */
diff --git a/arch/arm/mach-s3c24xx/mach-bast.c b/arch/arm/mach-s3c24xx/mach-bast.c
index ed07cf392d4b..5185036765db 100644
--- a/arch/arm/mach-s3c24xx/mach-bast.c
+++ b/arch/arm/mach-s3c24xx/mach-bast.c
@@ -299,6 +299,7 @@ static struct s3c2410_platform_nand __initdata bast_nand_info = {
 	.nr_sets	= ARRAY_SIZE(bast_nand_sets),
 	.sets		= bast_nand_sets,
 	.select_chip	= bast_nand_select,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* DM9000 */
diff --git a/arch/arm/mach-s3c24xx/mach-gta02.c b/arch/arm/mach-s3c24xx/mach-gta02.c
index 27ae6877550f..b0ed401da3a3 100644
--- a/arch/arm/mach-s3c24xx/mach-gta02.c
+++ b/arch/arm/mach-s3c24xx/mach-gta02.c
@@ -443,6 +443,7 @@ static struct s3c2410_platform_nand __initdata gta02_nand_info = {
 	.twrph1		= 15,
 	.nr_sets	= ARRAY_SIZE(gta02_nand_sets),
 	.sets		= gta02_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
diff --git a/arch/arm/mach-s3c24xx/mach-jive.c b/arch/arm/mach-s3c24xx/mach-jive.c
index 7d99fe8f6157..895aca225952 100644
--- a/arch/arm/mach-s3c24xx/mach-jive.c
+++ b/arch/arm/mach-s3c24xx/mach-jive.c
@@ -232,6 +232,7 @@ static struct s3c2410_platform_nand __initdata jive_nand_info = {
 	.twrph1		= 40,
 	.sets		= jive_nand_sets,
 	.nr_sets	= ARRAY_SIZE(jive_nand_sets),
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static int __init jive_mtdset(char *options)
diff --git a/arch/arm/mach-s3c24xx/mach-mini2440.c b/arch/arm/mach-s3c24xx/mach-mini2440.c
index ec60bd4a1646..71af8d2fd320 100644
--- a/arch/arm/mach-s3c24xx/mach-mini2440.c
+++ b/arch/arm/mach-s3c24xx/mach-mini2440.c
@@ -287,6 +287,7 @@ static struct s3c2410_platform_nand mini2440_nand_info __initdata = {
 	.nr_sets	= ARRAY_SIZE(mini2440_nand_sets),
 	.sets		= mini2440_nand_sets,
 	.ignore_unset_ecc = 1,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* DM9000AEP 10/100 ethernet controller */
diff --git a/arch/arm/mach-s3c24xx/mach-osiris.c b/arch/arm/mach-s3c24xx/mach-osiris.c
index 2f6fdc326835..70b0eb7d3134 100644
--- a/arch/arm/mach-s3c24xx/mach-osiris.c
+++ b/arch/arm/mach-s3c24xx/mach-osiris.c
@@ -238,6 +238,7 @@ static struct s3c2410_platform_nand __initdata osiris_nand_info = {
 	.nr_sets	= ARRAY_SIZE(osiris_nand_sets),
 	.sets		= osiris_nand_sets,
 	.select_chip	= osiris_nand_select,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* PCMCIA control and configuration */
diff --git a/arch/arm/mach-s3c24xx/mach-qt2410.c b/arch/arm/mach-s3c24xx/mach-qt2410.c
index 984516e8307a..868c82087403 100644
--- a/arch/arm/mach-s3c24xx/mach-qt2410.c
+++ b/arch/arm/mach-s3c24xx/mach-qt2410.c
@@ -284,6 +284,7 @@ static struct s3c2410_platform_nand __initdata qt2410_nand_info = {
 	.twrph1		= 20,
 	.nr_sets	= ARRAY_SIZE(qt2410_nand_sets),
 	.sets		= qt2410_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 /* UDC */
diff --git a/arch/arm/mach-s3c24xx/mach-rx1950.c b/arch/arm/mach-s3c24xx/mach-rx1950.c
index 25a139bb9826..e86ad6a68a0b 100644
--- a/arch/arm/mach-s3c24xx/mach-rx1950.c
+++ b/arch/arm/mach-s3c24xx/mach-rx1950.c
@@ -611,6 +611,7 @@ static struct s3c2410_platform_nand rx1950_nand_info = {
 	.twrph1		= 15,
 	.nr_sets	= ARRAY_SIZE(rx1950_nand_sets),
 	.sets		= rx1950_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static struct s3c2410_udc_mach_info rx1950_udc_cfg __initdata = {
diff --git a/arch/arm/mach-s3c24xx/mach-rx3715.c b/arch/arm/mach-s3c24xx/mach-rx3715.c
index cf55196f89ca..a39fb9780dd3 100644
--- a/arch/arm/mach-s3c24xx/mach-rx3715.c
+++ b/arch/arm/mach-s3c24xx/mach-rx3715.c
@@ -164,6 +164,7 @@ static struct s3c2410_platform_nand __initdata rx3715_nand_info = {
 	.twrph1		= 15,
 	.nr_sets	= ARRAY_SIZE(rx3715_nand_sets),
 	.sets		= rx3715_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static struct platform_device *rx3715_devices[] __initdata = {
diff --git a/arch/arm/mach-s3c24xx/mach-vstms.c b/arch/arm/mach-s3c24xx/mach-vstms.c
index b4460d5f7011..f5e6322145fa 100644
--- a/arch/arm/mach-s3c24xx/mach-vstms.c
+++ b/arch/arm/mach-s3c24xx/mach-vstms.c
@@ -117,6 +117,7 @@ static struct s3c2410_platform_nand __initdata vstms_nand_info = {
 	.twrph1		= 20,
 	.nr_sets	= ARRAY_SIZE(vstms_nand_sets),
 	.sets		= vstms_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static struct platform_device *vstms_devices[] __initdata = {
diff --git a/arch/arm/mach-s3c64xx/mach-hmt.c b/arch/arm/mach-s3c64xx/mach-hmt.c
index bc7dc1fcbf7d..59b5531f1987 100644
--- a/arch/arm/mach-s3c64xx/mach-hmt.c
+++ b/arch/arm/mach-s3c64xx/mach-hmt.c
@@ -204,6 +204,7 @@ static struct s3c2410_platform_nand hmt_nand_info = {
 	.twrph1		= 40,
 	.nr_sets	= ARRAY_SIZE(hmt_nand_sets),
 	.sets		= hmt_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static struct gpio_led hmt_leds[] = {
diff --git a/arch/arm/mach-s3c64xx/mach-mini6410.c b/arch/arm/mach-s3c64xx/mach-mini6410.c
index ae999fb3fe6d..a3e3e25728b4 100644
--- a/arch/arm/mach-s3c64xx/mach-mini6410.c
+++ b/arch/arm/mach-s3c64xx/mach-mini6410.c
@@ -142,6 +142,7 @@ static struct s3c2410_platform_nand mini6410_nand_info = {
 	.twrph1		= 40,
 	.nr_sets	= ARRAY_SIZE(mini6410_nand_sets),
 	.sets		= mini6410_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static struct s3c_fb_pd_win mini6410_lcd_type0_fb_win = {
diff --git a/arch/arm/mach-s3c64xx/mach-real6410.c b/arch/arm/mach-s3c64xx/mach-real6410.c
index 4e240ffa7ac7..d6b3ffd7704b 100644
--- a/arch/arm/mach-s3c64xx/mach-real6410.c
+++ b/arch/arm/mach-s3c64xx/mach-real6410.c
@@ -194,6 +194,7 @@ static struct s3c2410_platform_nand real6410_nand_info = {
 	.twrph1		= 40,
 	.nr_sets	= ARRAY_SIZE(real6410_nand_sets),
 	.sets		= real6410_nand_sets,
+	.ecc_mode	= NAND_ECC_SOFT,
 };
 
 static struct platform_device *real6410_devices[] __initdata = {
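Every S3C board hunk above makes the same one-line change: boards that previously relied on the NAND driver defaulting to software ECC now request it explicitly. A minimal sketch of the pattern, with hypothetical "myboard" names (header path and struct fields follow the s3c2410 NAND platform code):

#include <linux/mtd/nand.h>
#include <linux/platform_data/mtd-nand-s3c2410.h>

/* "myboard" is hypothetical; the shape mirrors the board files above. */
static struct s3c2410_nand_set myboard_nand_sets[] __initdata = {
	[0] = {
		.name		= "flash",
		.nr_chips	= 1,
	},
};

static struct s3c2410_platform_nand myboard_nand_info __initdata = {
	.tacls		= 20,
	.twrph0		= 60,
	.twrph1		= 20,
	.nr_sets	= ARRAY_SIZE(myboard_nand_sets),
	.sets		= myboard_nand_sets,
	.ecc_mode	= NAND_ECC_SOFT,	/* state the default explicitly */
};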
diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index d0de0e032bc2..c1976c0adca7 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -29,7 +29,7 @@
 /* Basic configuration for ACPI */
 #ifdef	CONFIG_ACPI
 
-/* ACPI table mapping after acpi_gbl_permanent_mmap is set */
+/* ACPI table mapping after acpi_permanent_mmap is set */
 static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
 					    acpi_size size)
 {
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index b71086d25195..bfe632808d77 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -165,6 +165,11 @@ extern u64			kimage_vaddr;
 /* the offset between the kernel virtual and physical mappings */
 extern u64			kimage_voffset;
 
+static inline unsigned long kaslr_offset(void)
+{
+	return kimage_vaddr - KIMAGE_VADDR;
+}
+
 /*
  * Allow all memory at the discovery stage. We will clip it later.
  */
diff --git a/arch/arm64/include/asm/numa.h b/arch/arm64/include/asm/numa.h
index 600887e491fd..bf466d1876e3 100644
--- a/arch/arm64/include/asm/numa.h
+++ b/arch/arm64/include/asm/numa.h
@@ -15,6 +15,8 @@ int __node_distance(int from, int to);
 
 extern nodemask_t numa_nodes_parsed __initdata;
 
+extern bool numa_off;
+
 /* Mappings between node number and cpus on that node. */
 extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 void numa_clear_node(unsigned int cpu);
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 252a6d9c1da5..64d9cbd61678 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -132,14 +132,13 @@ static int __init acpi_fadt_sanity_check(void)
 	struct acpi_table_header *table;
 	struct acpi_table_fadt *fadt;
 	acpi_status status;
-	acpi_size tbl_size;
 	int ret = 0;
 
 	/*
 	 * FADT is required on arm64; retrieve it to check its presence
 	 * and carry out revision and ACPI HW reduced compliancy tests
 	 */
-	status = acpi_get_table_with_size(ACPI_SIG_FADT, 0, &table, &tbl_size);
+	status = acpi_get_table(ACPI_SIG_FADT, 0, &table);
 	if (ACPI_FAILURE(status)) {
 		const char *msg = acpi_format_exception(status);
@@ -170,10 +169,10 @@ static int __init acpi_fadt_sanity_check(void)
 
 out:
 	/*
-	 * acpi_get_table_with_size() creates FADT table mapping that
+	 * acpi_get_table() creates FADT table mapping that
 	 * should be released after parsing and before resuming boot
 	 */
-	early_acpi_os_unmap_memory(table, tbl_size);
+	acpi_put_table(table);
 	return ret;
 }
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index a53f52ac81c6..b051367e2149 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -338,11 +338,11 @@ subsys_initcall(topology_init);
 static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
 			      void *p)
 {
-	u64 const kaslr_offset = kimage_vaddr - KIMAGE_VADDR;
+	const unsigned long offset = kaslr_offset();
 
-	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset > 0) {
-		pr_emerg("Kernel Offset: 0x%llx from 0x%lx\n",
-			 kaslr_offset, KIMAGE_VADDR);
+	if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && offset > 0) {
+		pr_emerg("Kernel Offset: 0x%lx from 0x%lx\n",
+			 offset, KIMAGE_VADDR);
 	} else {
 		pr_emerg("Kernel Offset: disabled\n");
 	}
diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 4b32168cf91a..b388a99fea7b 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -35,7 +35,7 @@ static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
 
 static int numa_distance_cnt;
 static u8 *numa_distance;
-static bool numa_off;
+bool numa_off;
 
 static __init int numa_parse_early_param(char *opt)
 {
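The arm64 ACPI hunks above migrate from acpi_get_table_with_size()/early_acpi_os_unmap_memory() to the paired acpi_get_table()/acpi_put_table() API. A minimal sketch of that pairing in an early-boot check; the revision threshold is illustrative only:

#include <linux/acpi.h>
#include <linux/init.h>

static int __init check_fadt_revision(void)
{
	struct acpi_table_header *table;
	acpi_status status;
	int ret = 0;

	status = acpi_get_table(ACPI_SIG_FADT, 0, &table);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	if (table->revision < 5)	/* illustrative check */
		ret = -EINVAL;

	acpi_put_table(table);		/* release the mapping created above */
	return ret;
}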
diff --git a/arch/ia64/include/asm/numa.h b/arch/ia64/include/asm/numa.h
index 2db0a6c6daa5..ebef7f40aabb 100644
--- a/arch/ia64/include/asm/numa.h
+++ b/arch/ia64/include/asm/numa.h
@@ -65,6 +65,8 @@ extern int paddr_to_nid(unsigned long paddr);
 
 #define local_nodeid (cpu_to_node_map[smp_processor_id()])
 
+#define numa_off	0
+
 extern void map_cpu_to_node(int cpu, int nid);
 extern void unmap_cpu_from_node(int cpu, int nid);
 extern void numa_clear_node(int cpu);
diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h
index 805ae5d712e8..032fed71223f 100644
--- a/arch/microblaze/include/asm/unistd.h
+++ b/arch/microblaze/include/asm/unistd.h
@@ -38,6 +38,6 @@
 
 #endif /* __ASSEMBLY__ */
 
-#define __NR_syscalls		392
+#define __NR_syscalls		398
 
 #endif /* _ASM_MICROBLAZE_UNISTD_H */
diff --git a/arch/microblaze/include/uapi/asm/unistd.h b/arch/microblaze/include/uapi/asm/unistd.h
index a8bd3fa28bc7..d8086159d996 100644
--- a/arch/microblaze/include/uapi/asm/unistd.h
+++ b/arch/microblaze/include/uapi/asm/unistd.h
@@ -407,5 +407,11 @@
 #define __NR_userfaultfd	389
 #define __NR_membarrier		390
 #define __NR_mlock2		391
+#define __NR_copy_file_range	392
+#define __NR_preadv2		393
+#define __NR_pwritev2		394
+#define __NR_pkey_mprotect	395
+#define __NR_pkey_alloc		396
+#define __NR_pkey_free		397
 
 #endif /* _UAPI_ASM_MICROBLAZE_UNISTD_H */
diff --git a/arch/microblaze/kernel/cpu/cpuinfo.c b/arch/microblaze/kernel/cpu/cpuinfo.c
index b70bb538f001..96b3f26d16be 100644
--- a/arch/microblaze/kernel/cpu/cpuinfo.c
+++ b/arch/microblaze/kernel/cpu/cpuinfo.c
@@ -49,6 +49,8 @@ const struct cpu_ver_key cpu_ver_lookup[] = {
 	{"9.3", 0x20},
 	{"9.4", 0x21},
 	{"9.5", 0x22},
+	{"9.6", 0x23},
+	{"10.0", 0x24},
 	{NULL, 0},
 };
 
@@ -75,6 +77,10 @@ const struct family_string_key family_string_lookup[] = {
 	{"zynq7000", 0x12},
 	{"UltraScale Virtex", 0x13},
 	{"UltraScale Kintex", 0x14},
+	{"UltraScale+ Zynq", 0x15},
+	{"UltraScale+ Virtex", 0x16},
+	{"UltraScale+ Kintex", 0x17},
+	{"Spartan7", 0x18},
 	{NULL, 0},
 };
diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S
index 6b3dd99126d7..6841c2df14d9 100644
--- a/arch/microblaze/kernel/syscall_table.S
+++ b/arch/microblaze/kernel/syscall_table.S
@@ -392,3 +392,9 @@ ENTRY(sys_call_table)
 	.long sys_userfaultfd
 	.long sys_membarrier		/* 390 */
 	.long sys_mlock2
+	.long sys_copy_file_range
+	.long sys_preadv2
+	.long sys_pwritev2
+	.long sys_pkey_mprotect		/* 395 */
+	.long sys_pkey_alloc
+	.long sys_pkey_free
diff --git a/arch/microblaze/kernel/timer.c b/arch/microblaze/kernel/timer.c
index 5bbf38b916ef..9e954959f605 100644
--- a/arch/microblaze/kernel/timer.c
+++ b/arch/microblaze/kernel/timer.c
@@ -259,7 +259,7 @@ static int __init xilinx_timer_init(struct device_node *timer)
 	int ret;
 
 	if (initialized)
-		return;
+		return -EINVAL;
 
 	initialized = 1;
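Until libc grows wrappers, the syscalls wired up above are reachable from userspace through syscall(2). A small sketch, assuming a microblaze toolchain with the updated uapi headers:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int in = open("src", O_RDONLY);
	int out = open("dst", O_WRONLY | O_CREAT, 0644);
	long n;

	if (in < 0 || out < 0)
		return 1;

	/* __NR_copy_file_range is 392 on microblaze after this change */
	n = syscall(__NR_copy_file_range, in, NULL, out, NULL, 4096, 0);
	if (n < 0)
		perror("copy_file_range");
	else
		printf("copied %ld bytes\n", n);

	return 0;
}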
diff --git a/arch/mips/boot/dts/ingenic/jz4740.dtsi b/arch/mips/boot/dts/ingenic/jz4740.dtsi
index f6ae6ed9c4b1..3e1587f1f77a 100644
--- a/arch/mips/boot/dts/ingenic/jz4740.dtsi
+++ b/arch/mips/boot/dts/ingenic/jz4740.dtsi
@@ -44,6 +44,17 @@
 		#clock-cells = <1>;
 	};
 
+	rtc_dev: rtc@10003000 {
+		compatible = "ingenic,jz4740-rtc";
+		reg = <0x10003000 0x40>;
+
+		interrupt-parent = <&intc>;
+		interrupts = <15>;
+
+		clocks = <&cgu JZ4740_CLK_RTC>;
+		clock-names = "rtc";
+	};
+
 	uart0: serial@10030000 {
 		compatible = "ingenic,jz4740-uart";
 		reg = <0x10030000 0x100>;
diff --git a/arch/mips/boot/dts/ingenic/qi_lb60.dts b/arch/mips/boot/dts/ingenic/qi_lb60.dts
index 2414d63ae818..be1a7d3a3e1b 100644
--- a/arch/mips/boot/dts/ingenic/qi_lb60.dts
+++ b/arch/mips/boot/dts/ingenic/qi_lb60.dts
@@ -13,3 +13,7 @@
 &ext {
 	clock-frequency = <12000000>;
 };
+
+&rtc_dev {
+	system-power-controller;
+};
diff --git a/arch/mips/include/asm/mach-jz4740/platform.h b/arch/mips/include/asm/mach-jz4740/platform.h
index 073b8bfbb3b3..3645974b7f65 100644
--- a/arch/mips/include/asm/mach-jz4740/platform.h
+++ b/arch/mips/include/asm/mach-jz4740/platform.h
@@ -22,7 +22,6 @@
 extern struct platform_device jz4740_udc_device;
 extern struct platform_device jz4740_udc_xceiv_device;
 extern struct platform_device jz4740_mmc_device;
-extern struct platform_device jz4740_rtc_device;
 extern struct platform_device jz4740_i2c_device;
 extern struct platform_device jz4740_nand_device;
 extern struct platform_device jz4740_framebuffer_device;
diff --git a/arch/mips/jz4740/board-qi_lb60.c b/arch/mips/jz4740/board-qi_lb60.c
index 258fd03c9ef5..a5bd94b95263 100644
--- a/arch/mips/jz4740/board-qi_lb60.c
+++ b/arch/mips/jz4740/board-qi_lb60.c
@@ -438,7 +438,6 @@ static struct platform_device *jz_platform_devices[] __initdata = {
 	&jz4740_pcm_device,
 	&jz4740_i2s_device,
 	&jz4740_codec_device,
-	&jz4740_rtc_device,
 	&jz4740_adc_device,
 	&jz4740_pwm_device,
 	&jz4740_dma_device,
diff --git a/arch/mips/jz4740/platform.c b/arch/mips/jz4740/platform.c
index 2f1dab35c061..5b7cdd67a9d9 100644
--- a/arch/mips/jz4740/platform.c
+++ b/arch/mips/jz4740/platform.c
@@ -88,27 +88,6 @@ struct platform_device jz4740_mmc_device = {
 	.resource	= jz4740_mmc_resources,
 };
 
-/* RTC controller */
-static struct resource jz4740_rtc_resources[] = {
-	{
-		.start	= JZ4740_RTC_BASE_ADDR,
-		.end	= JZ4740_RTC_BASE_ADDR + 0x38 - 1,
-		.flags	= IORESOURCE_MEM,
-	},
-	{
-		.start	= JZ4740_IRQ_RTC,
-		.end	= JZ4740_IRQ_RTC,
-		.flags	= IORESOURCE_IRQ,
-	},
-};
-
-struct platform_device jz4740_rtc_device = {
-	.name		= "jz4740-rtc",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(jz4740_rtc_resources),
-	.resource	= jz4740_rtc_resources,
-};
-
 /* I2C controller */
 static struct resource jz4740_i2c_resources[] = {
 	{
diff --git a/arch/mips/jz4740/reset.c b/arch/mips/jz4740/reset.c
index 954e669c9e6b..67780c4b6573 100644
--- a/arch/mips/jz4740/reset.c
+++ b/arch/mips/jz4740/reset.c
@@ -57,71 +57,8 @@ static void jz4740_restart(char *command)
 	jz4740_halt();
 }
 
-#define JZ_REG_RTC_CTRL			0x00
-#define JZ_REG_RTC_HIBERNATE		0x20
-#define JZ_REG_RTC_WAKEUP_FILTER	0x24
-#define JZ_REG_RTC_RESET_COUNTER	0x28
-
-#define JZ_RTC_CTRL_WRDY		BIT(7)
-#define JZ_RTC_WAKEUP_FILTER_MASK	0x0000FFE0
-#define JZ_RTC_RESET_COUNTER_MASK	0x00000FE0
-
-static inline void jz4740_rtc_wait_ready(void __iomem *rtc_base)
-{
-	uint32_t ctrl;
-
-	do {
-		ctrl = readl(rtc_base + JZ_REG_RTC_CTRL);
-	} while (!(ctrl & JZ_RTC_CTRL_WRDY));
-}
-
-static void jz4740_power_off(void)
-{
-	void __iomem *rtc_base = ioremap(JZ4740_RTC_BASE_ADDR, 0x38);
-	unsigned long wakeup_filter_ticks;
-	unsigned long reset_counter_ticks;
-	struct clk *rtc_clk;
-	unsigned long rtc_rate;
-
-	rtc_clk = clk_get(NULL, "rtc");
-	if (IS_ERR(rtc_clk))
-		panic("unable to get RTC clock");
-	rtc_rate = clk_get_rate(rtc_clk);
-	clk_put(rtc_clk);
-
-	/*
-	 * Set minimum wakeup pin assertion time: 100 ms.
-	 * Range is 0 to 2 sec if RTC is clocked at 32 kHz.
-	 */
-	wakeup_filter_ticks = (100 * rtc_rate) / 1000;
-	if (wakeup_filter_ticks < JZ_RTC_WAKEUP_FILTER_MASK)
-		wakeup_filter_ticks &= JZ_RTC_WAKEUP_FILTER_MASK;
-	else
-		wakeup_filter_ticks = JZ_RTC_WAKEUP_FILTER_MASK;
-	jz4740_rtc_wait_ready(rtc_base);
-	writel(wakeup_filter_ticks, rtc_base + JZ_REG_RTC_WAKEUP_FILTER);
-
-	/*
-	 * Set reset pin low-level assertion time after wakeup: 60 ms.
-	 * Range is 0 to 125 ms if RTC is clocked at 32 kHz.
-	 */
-	reset_counter_ticks = (60 * rtc_rate) / 1000;
-	if (reset_counter_ticks < JZ_RTC_RESET_COUNTER_MASK)
-		reset_counter_ticks &= JZ_RTC_RESET_COUNTER_MASK;
-	else
-		reset_counter_ticks = JZ_RTC_RESET_COUNTER_MASK;
-	jz4740_rtc_wait_ready(rtc_base);
-	writel(reset_counter_ticks, rtc_base + JZ_REG_RTC_RESET_COUNTER);
-
-	jz4740_rtc_wait_ready(rtc_base);
-	writel(1, rtc_base + JZ_REG_RTC_HIBERNATE);
-
-	jz4740_halt();
-}
-
 void jz4740_reset_init(void)
 {
 	_machine_restart = jz4740_restart;
 	_machine_halt = jz4740_halt;
-	pm_power_off = jz4740_power_off;
 }
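The open-coded power-off sequence deleted above moves into the rtc-jz4740 driver, which can claim pm_power_off when its node carries the system-power-controller property added to qi_lb60.dts. A sketch of how a driver typically detects that property; the callback and function names are illustrative:

#include <linux/of.h>
#include <linux/pm.h>

/* Illustrative callback; a real driver would program its hardware to
 * cut power here, then halt. */
static void my_poweroff(void)
{
}

/* Sketch: during probe, claim pm_power_off if the DT marks this
 * device as the system power controller. */
static void my_probe_poweroff(struct device_node *np)
{
	if (of_property_read_bool(np, "system-power-controller"))
		pm_power_off = my_poweroff;
}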
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index a14b86587013..3a71f38cdc05 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -7,6 +7,7 @@ config PARISC
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_SYSCALL_TRACEPOINTS
 	select ARCH_WANT_FRAME_POINTERS
+	select ARCH_HAS_ELF_RANDOMIZE
 	select RTC_CLASS
 	select RTC_DRV_GENERIC
 	select INIT_ALL_POSSIBLE
diff --git a/arch/parisc/include/asm/elf.h b/arch/parisc/include/asm/elf.h
index 78c9fd32c554..a6b2a421571e 100644
--- a/arch/parisc/include/asm/elf.h
+++ b/arch/parisc/include/asm/elf.h
@@ -348,9 +348,10 @@ struct pt_regs;	/* forward declaration... */
 
 #define ELF_HWCAP	0
 
-#define STACK_RND_MASK	(is_32bit_task() ? \
-				0x7ff >> (PAGE_SHIFT - 12) : \
-				0x3ffff >> (PAGE_SHIFT - 12))
+/* Masks for stack and mmap randomization */
+#define BRK_RND_MASK	(is_32bit_task() ? 0x07ffUL : 0x3ffffUL)
+#define MMAP_RND_MASK	(is_32bit_task() ? 0x1fffUL : 0x3ffffUL)
+#define STACK_RND_MASK	MMAP_RND_MASK
 
 struct mm_struct;
 extern unsigned long arch_randomize_brk(struct mm_struct *);
diff --git a/arch/parisc/include/asm/pdcpat.h b/arch/parisc/include/asm/pdcpat.h
index 47539f117958..e1d289092705 100644
--- a/arch/parisc/include/asm/pdcpat.h
+++ b/arch/parisc/include/asm/pdcpat.h
@@ -289,7 +289,7 @@ extern int pdc_pat_cell_get_number(struct pdc_pat_cell_num *cell_info);
 extern int pdc_pat_cell_module(unsigned long *actcnt, unsigned long ploc, unsigned long mod, unsigned long view_type, void *mem_addr);
 extern int pdc_pat_cell_num_to_loc(void *, unsigned long);
 
-extern int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, void *hpa);
+extern int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, unsigned long hpa);
 
 extern int pdc_pat_pd_get_addr_map(unsigned long *actual_len, void *mem_addr, unsigned long count, unsigned long offset);
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index ca40741378be..a3661ee6b060 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -93,9 +93,7 @@ struct system_cpuinfo_parisc {
 /* Per CPU data structure - ie varies per CPU.  */
 struct cpuinfo_parisc {
 	unsigned long it_value;     /* Interval Timer at last timer Intr */
-	unsigned long it_delta;     /* Interval delta (tic_10ms / HZ * 100) */
 	unsigned long irq_count;    /* number of IRQ's since boot */
-	unsigned long irq_max_cr16; /* longest time to handle a single IRQ */
 	unsigned long cpuid;        /* aka slot_number or set to NO_PROC_ID */
 	unsigned long hpa;          /* Host Physical address */
 	unsigned long txn_addr;     /* MMIO addr of EIR or id_eid */
@@ -103,8 +101,6 @@ struct cpuinfo_parisc {
 	unsigned long pending_ipi;  /* bitmap of type ipi_message_type */
 #endif
 	unsigned long bh_count;     /* number of times bh was invoked */
-	unsigned long prof_counter; /* per CPU profiling support */
-	unsigned long prof_multiplier;	/* per CPU profiling support */
 	unsigned long fp_rev;
 	unsigned long fp_model;
 	unsigned int state;
diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S
index 4fcff2dcc9c3..ad4cb1613c57 100644
--- a/arch/parisc/kernel/entry.S
+++ b/arch/parisc/kernel/entry.S
@@ -878,6 +878,9 @@ ENTRY_CFI(syscall_exit_rfi)
 	STREG   %r19,PT_SR7(%r16)
 
 intr_return:
+	/* NOTE: Need to enable interrupts incase we schedule. */
+	ssm     PSW_SM_I, %r0
+
 	/* check for reschedule */
 	mfctl   %cr30,%r1
 	LDREG   TI_FLAGS(%r1),%r19	/* sched.h: TIF_NEED_RESCHED */
@@ -904,11 +907,6 @@ intr_check_sig:
 	LDREG	PT_IASQ1(%r16), %r20
 	cmpib,COND(=),n 0,%r20,intr_restore /* backward */
 
-	/* NOTE: We need to enable interrupts if we have to deliver
-	 * signals. We used to do this earlier but it caused kernel
-	 * stack overflows. */
-	ssm     PSW_SM_I, %r0
-
 	copy	%r0, %r25			/* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29			/* Reference param save area */
@@ -960,10 +958,6 @@ intr_do_resched:
 	cmpib,COND(=)	0, %r20, intr_do_preempt
 	nop
 
-	/* NOTE: We need to enable interrupts if we schedule.  We used
-	 * to do this earlier but it caused kernel stack overflows. */
-	ssm     PSW_SM_I, %r0
-
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29	/* Reference param save area */
 #endif
diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c
index e5d71905cad5..9d797ae4fa22 100644
--- a/arch/parisc/kernel/firmware.c
+++ b/arch/parisc/kernel/firmware.c
@@ -1258,7 +1258,7 @@ int pdc_pat_cell_module(unsigned long *actcnt, unsigned long ploc, unsigned long
 *
 * Retrieve the cpu number for the cpu at the specified HPA.
 */
-int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, void *hpa)
+int pdc_pat_cpu_get_number(struct pdc_pat_cpu_num *cpu_info, unsigned long hpa)
 {
 	int retval;
 	unsigned long flags;
diff --git a/arch/parisc/kernel/inventory.c b/arch/parisc/kernel/inventory.c
index c05d1876d27c..c9789d9c73b4 100644
--- a/arch/parisc/kernel/inventory.c
+++ b/arch/parisc/kernel/inventory.c
@@ -216,9 +216,9 @@ pat_query_module(ulong pcell_loc, ulong mod_index)
 	register_parisc_device(dev);	/* advertise device */
 
 #ifdef DEBUG_PAT
-	pdc_pat_cell_mod_maddr_block_t io_pdc_cell;
 	/* dump what we see so far... */
 	switch (PAT_GET_ENTITY(dev->mod_info)) {
+		pdc_pat_cell_mod_maddr_block_t io_pdc_cell;
 		unsigned long i;
 
 	case PAT_ENTITY_PROC:
@@ -259,9 +259,9 @@ pat_query_module(ulong pcell_loc, ulong mod_index)
 				pa_pdc_cell->mod[4 + i * 3]);	/* finish (ie end) */
 			printk(KERN_DEBUG
 				"  IO_VIEW %ld: 0x%016lx 0x%016lx 0x%016lx\n",
-				i, io_pdc_cell->mod[2 + i * 3],	/* type */
-				io_pdc_cell->mod[3 + i * 3],	/* start */
-				io_pdc_cell->mod[4 + i * 3]);	/* finish (ie end) */
+				i, io_pdc_cell.mod[2 + i * 3],	/* type */
+				io_pdc_cell.mod[3 + i * 3],	/* start */
+				io_pdc_cell.mod[4 + i * 3]);	/* finish (ie end) */
 		}
 		printk(KERN_DEBUG "\n");
 		break;
diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c
index 518f4f5f1f43..6eabce62463b 100644
--- a/arch/parisc/kernel/perf.c
+++ b/arch/parisc/kernel/perf.c
@@ -301,7 +301,6 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t
 static ssize_t perf_write(struct file *file, const char __user *buf,
 	size_t count, loff_t *ppos)
 {
-	int err;
 	size_t image_size;
 	uint32_t image_type;
 	uint32_t interface_type;
@@ -320,8 +319,8 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun
 	if (count != sizeof(uint32_t))
 		return -EIO;
 
-	if ((err = copy_from_user(&image_type, buf, sizeof(uint32_t))) != 0)
-		return err;
+	if (copy_from_user(&image_type, buf, sizeof(uint32_t)))
+		return -EFAULT;
 
 	/* Get the interface type and test type */
 	interface_type = (image_type >> 16) & 0xffff;
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 40639439d8b3..ea6603ee8d24 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -276,11 +276,7 @@ void *dereference_function_descriptor(void *ptr)
 
 static inline unsigned long brk_rnd(void)
 {
-	/* 8MB for 32bit, 1GB for 64bit */
-	if (is_32bit_task())
-		return (get_random_int() & 0x7ffUL) << PAGE_SHIFT;
-	else
-		return (get_random_int() & 0x3ffffUL) << PAGE_SHIFT;
+	return (get_random_int() & BRK_RND_MASK) << PAGE_SHIFT;
 }
 
 unsigned long arch_randomize_brk(struct mm_struct *mm)
diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c
index 0c2a94a0f751..85de47f4eb59 100644
--- a/arch/parisc/kernel/processor.c
+++ b/arch/parisc/kernel/processor.c
@@ -78,11 +78,6 @@ DEFINE_PER_CPU(struct cpuinfo_parisc, cpu_data);
 static void
 init_percpu_prof(unsigned long cpunum)
 {
-	struct cpuinfo_parisc *p;
-
-	p = &per_cpu(cpu_data, cpunum);
-	p->prof_counter = 1;
-	p->prof_multiplier = 1;
 }
 
 
@@ -99,6 +94,7 @@ static int processor_probe(struct parisc_device *dev)
 	unsigned long txn_addr;
 	unsigned long cpuid;
 	struct cpuinfo_parisc *p;
+	struct pdc_pat_cpu_num cpu_info __maybe_unused;
 
 #ifdef CONFIG_SMP
 	if (num_online_cpus() >= nr_cpu_ids) {
@@ -123,10 +119,6 @@ static int processor_probe(struct parisc_device *dev)
 		ulong status;
 		unsigned long bytecnt;
 		pdc_pat_cell_mod_maddr_block_t *pa_pdc_cell;
-#undef USE_PAT_CPUID
-#ifdef USE_PAT_CPUID
-		struct pdc_pat_cpu_num cpu_info;
-#endif
 
 		pa_pdc_cell = kmalloc(sizeof (*pa_pdc_cell), GFP_KERNEL);
 		if (!pa_pdc_cell)
@@ -145,22 +137,27 @@ static int processor_probe(struct parisc_device *dev)
 
 		kfree(pa_pdc_cell);
 
+		/* get the cpu number */
+		status = pdc_pat_cpu_get_number(&cpu_info, dev->hpa.start);
+		BUG_ON(PDC_OK != status);
+
+		pr_info("Logical CPU #%lu is physical cpu #%lu at location "
+			"0x%lx with hpa %pa\n",
+			cpuid, cpu_info.cpu_num, cpu_info.cpu_loc,
+			&dev->hpa.start);
+
+#undef USE_PAT_CPUID
 #ifdef USE_PAT_CPUID
 /* We need contiguous numbers for cpuid. Firmware's notion
 * of cpuid is for physical CPUs and we just don't care yet.
 * We'll care when we need to query PAT PDC about a CPU *after*
 * boot time (ie shutdown a CPU from an OS perspective).
 */
-		/* get the cpu number */
-		status = pdc_pat_cpu_get_number(&cpu_info, dev->hpa.start);
-
-		BUG_ON(PDC_OK != status);
-
 		if (cpu_info.cpu_num >= NR_CPUS) {
-			printk(KERN_WARNING "IGNORING CPU at 0x%x,"
+			printk(KERN_WARNING "IGNORING CPU at %pa,"
 				" cpu_slot_id > NR_CPUS"
 				" (%ld > %d)\n",
-				dev->hpa.start, cpu_info.cpu_num, NR_CPUS);
+				&dev->hpa.start, cpu_info.cpu_num, NR_CPUS);
 			/* Ignore CPU since it will only crash */
 			boot_cpu_data.cpu_count--;
 			return 1;
diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c
index 0a393a04e891..a81e177cac7b 100644
--- a/arch/parisc/kernel/sys_parisc.c
+++ b/arch/parisc/kernel/sys_parisc.c
@@ -225,19 +225,17 @@ static unsigned long mmap_rnd(void)
 {
 	unsigned long rnd = 0;
 
-	/*
-	*  8 bits of randomness in 32bit mmaps, 20 address space bits
-	* 28 bits of randomness in 64bit mmaps, 40 address space bits
-	*/
-	if (current->flags & PF_RANDOMIZE) {
-		if (is_32bit_task())
-			rnd = get_random_int() % (1<<8);
-		else
-			rnd = get_random_int() % (1<<28);
-	}
+	if (current->flags & PF_RANDOMIZE)
+		rnd = get_random_int() & MMAP_RND_MASK;
+
 	return rnd << PAGE_SHIFT;
 }
 
+unsigned long arch_mmap_rnd(void)
+{
+	return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+}
+
 static unsigned long mmap_legacy_base(void)
 {
 	return TASK_UNMAPPED_BASE + mmap_rnd();
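Worked out at the usual parisc 4 KiB page size, the new masks give a 32-bit task 13 bits of mmap randomization and a 64-bit task 18 bits, matching the "8MB for 32bit, 1GB for 64bit" brk figures in the comment deleted from process.c. The arithmetic as a standalone sketch:

#include <stdio.h>

int main(void)
{
	/* assuming PAGE_SHIFT == 12 (4 KiB pages) */
	unsigned long mmap32 = 0x1fffUL << 12;	/* MMAP_RND_MASK, 32-bit */
	unsigned long mmap64 = 0x3ffffUL << 12;	/* MMAP_RND_MASK, 64-bit */
	unsigned long brk32  = 0x07ffUL << 12;	/* BRK_RND_MASK, 32-bit */

	printf("32-bit mmap span: %lu MiB\n", mmap32 >> 20);	/* ~32 */
	printf("64-bit mmap span: %lu MiB\n", mmap64 >> 20);	/* ~1024 */
	printf("32-bit brk span:  %lu MiB\n", brk32 >> 20);	/* ~8 */
	return 0;
}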
"ingenic,jz4740-uart"; reg = <0x10030000 0x100>; diff --git a/arch/mips/boot/dts/ingenic/qi_lb60.dts b/arch/mips/boot/dts/ingenic/qi_lb60.dts index 2414d63ae818..be1a7d3a3e1b 100644 --- a/arch/mips/boot/dts/ingenic/qi_lb60.dts +++ b/arch/mips/boot/dts/ingenic/qi_lb60.dts @@ -13,3 +13,7 @@ &ext { clock-frequency = <12000000>; }; + +&rtc_dev { + system-power-controller; +}; diff --git a/arch/mips/include/asm/mach-jz4740/platform.h b/arch/mips/include/asm/mach-jz4740/platform.h index 073b8bfbb3b3..3645974b7f65 100644 --- a/arch/mips/include/asm/mach-jz4740/platform.h +++ b/arch/mips/include/asm/mach-jz4740/platform.h @@ -22,7 +22,6 @@ extern struct platform_device jz4740_udc_device; extern struct platform_device jz4740_udc_xceiv_device; extern struct platform_device jz4740_mmc_device; -extern struct platform_device jz4740_rtc_device; extern struct platform_device jz4740_i2c_device; extern struct platform_device jz4740_nand_device; extern struct platform_device jz4740_framebuffer_device; diff --git a/arch/mips/jz4740/board-qi_lb60.c b/arch/mips/jz4740/board-qi_lb60.c index 258fd03c9ef5..a5bd94b95263 100644 --- a/arch/mips/jz4740/board-qi_lb60.c +++ b/arch/mips/jz4740/board-qi_lb60.c @@ -438,7 +438,6 @@ static struct platform_device *jz_platform_devices[] __initdata = { &jz4740_pcm_device, &jz4740_i2s_device, &jz4740_codec_device, - &jz4740_rtc_device, &jz4740_adc_device, &jz4740_pwm_device, &jz4740_dma_device, diff --git a/arch/mips/jz4740/platform.c b/arch/mips/jz4740/platform.c index 2f1dab35c061..5b7cdd67a9d9 100644 --- a/arch/mips/jz4740/platform.c +++ b/arch/mips/jz4740/platform.c @@ -88,27 +88,6 @@ struct platform_device jz4740_mmc_device = { .resource = jz4740_mmc_resources, }; -/* RTC controller */ -static struct resource jz4740_rtc_resources[] = { - { - .start = JZ4740_RTC_BASE_ADDR, - .end = JZ4740_RTC_BASE_ADDR + 0x38 - 1, - .flags = IORESOURCE_MEM, - }, - { - .start = JZ4740_IRQ_RTC, - .end = JZ4740_IRQ_RTC, - .flags = IORESOURCE_IRQ, - }, -}; - -struct platform_device jz4740_rtc_device = { - .name = "jz4740-rtc", - .id = -1, - .num_resources = ARRAY_SIZE(jz4740_rtc_resources), - .resource = jz4740_rtc_resources, -}; - /* I2C controller */ static struct resource jz4740_i2c_resources[] = { { diff --git a/arch/mips/jz4740/reset.c b/arch/mips/jz4740/reset.c index 954e669c9e6b..67780c4b6573 100644 --- a/arch/mips/jz4740/reset.c +++ b/arch/mips/jz4740/reset.c @@ -57,71 +57,8 @@ static void jz4740_restart(char *command) jz4740_halt(); } -#define JZ_REG_RTC_CTRL 0x00 -#define JZ_REG_RTC_HIBERNATE 0x20 -#define JZ_REG_RTC_WAKEUP_FILTER 0x24 -#define JZ_REG_RTC_RESET_COUNTER 0x28 - -#define JZ_RTC_CTRL_WRDY BIT(7) -#define JZ_RTC_WAKEUP_FILTER_MASK 0x0000FFE0 -#define JZ_RTC_RESET_COUNTER_MASK 0x00000FE0 - -static inline void jz4740_rtc_wait_ready(void __iomem *rtc_base) -{ - uint32_t ctrl; - - do { - ctrl = readl(rtc_base + JZ_REG_RTC_CTRL); - } while (!(ctrl & JZ_RTC_CTRL_WRDY)); -} - -static void jz4740_power_off(void) -{ - void __iomem *rtc_base = ioremap(JZ4740_RTC_BASE_ADDR, 0x38); - unsigned long wakeup_filter_ticks; - unsigned long reset_counter_ticks; - struct clk *rtc_clk; - unsigned long rtc_rate; - - rtc_clk = clk_get(NULL, "rtc"); - if (IS_ERR(rtc_clk)) - panic("unable to get RTC clock"); - rtc_rate = clk_get_rate(rtc_clk); - clk_put(rtc_clk); - - /* - * Set minimum wakeup pin assertion time: 100 ms. - * Range is 0 to 2 sec if RTC is clocked at 32 kHz. 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3da87e198878..a8ee573fe610 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -469,6 +469,7 @@ config KEXEC
 config KEXEC_FILE
 	bool "kexec file based system call"
 	select KEXEC_CORE
+	select HAVE_IMA_KEXEC
 	select BUILD_BIN2C
 	depends on PPC64
 	depends on CRYPTO=y
diff --git a/arch/powerpc/include/asm/ima.h b/arch/powerpc/include/asm/ima.h
new file mode 100644
index 000000000000..2313bdface34
--- /dev/null
+++ b/arch/powerpc/include/asm/ima.h
@@ -0,0 +1,29 @@
+#ifndef _ASM_POWERPC_IMA_H
+#define _ASM_POWERPC_IMA_H
+
+struct kimage;
+
+int ima_get_kexec_buffer(void **addr, size_t *size);
+int ima_free_kexec_buffer(void);
+
+#ifdef CONFIG_IMA
+void remove_ima_buffer(void *fdt, int chosen_node);
+#else
+static inline void remove_ima_buffer(void *fdt, int chosen_node) {}
+#endif
+
+#ifdef CONFIG_IMA_KEXEC
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+			      size_t size);
+
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node);
+#else
+static inline int setup_ima_buffer(const struct kimage *image, void *fdt,
+				   int chosen_node)
+{
+	remove_ima_buffer(fdt, chosen_node);
+	return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
+
+#endif /* _ASM_POWERPC_IMA_H */
diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 6c3b71502fbc..25668bc8cb2a 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -94,11 +94,22 @@ static inline bool kdump_in_progress(void)
 #ifdef CONFIG_KEXEC_FILE
 extern struct kexec_file_ops kexec_elf64_ops;
 
+#ifdef CONFIG_IMA_KEXEC
+#define ARCH_HAS_KIMAGE_ARCH
+
+struct kimage_arch {
+	phys_addr_t ima_buffer_addr;
+	size_t ima_buffer_size;
+};
+#endif
+
 int setup_purgatory(struct kimage *image, const void *slave_code,
 		    const void *fdt, unsigned long kernel_load_addr,
 		    unsigned long fdt_load_addr);
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
-		  unsigned long initrd_len, const char *cmdline);
+int setup_new_fdt(const struct kimage *image, void *fdt,
+		  unsigned long initrd_load_addr, unsigned long initrd_len,
+		  const char *cmdline);
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size);
 #endif /* CONFIG_KEXEC_FILE */
 
 #else /* !CONFIG_KEXEC_CORE */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index a3a6047fd395..23f8082d7bfa 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -112,6 +112,10 @@ obj-$(CONFIG_PCI_MSI)		+= msi.o
 obj-$(CONFIG_KEXEC_CORE)	+= machine_kexec.o crash.o \
 					machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC_FILE)	+= machine_kexec_file_$(BITS).o kexec_elf_$(BITS).o
+ifeq ($(CONFIG_HAVE_IMA_KEXEC)$(CONFIG_IMA),yy)
+obj-y				+= ima_kexec.o
+endif
+
 obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
diff --git a/arch/powerpc/kernel/ima_kexec.c b/arch/powerpc/kernel/ima_kexec.c
new file mode 100644
index 000000000000..5ea42c937ca9
--- /dev/null
+++ b/arch/powerpc/kernel/ima_kexec.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (C) 2016 IBM Corporation
+ *
+ * Authors:
+ * Thiago Jung Bauermann <bauerman@linux.vnet.ibm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/slab.h>
+#include <linux/kexec.h>
+#include <linux/of.h>
+#include <linux/memblock.h>
+#include <linux/libfdt.h>
+
+static int get_addr_size_cells(int *addr_cells, int *size_cells)
+{
+	struct device_node *root;
+
+	root = of_find_node_by_path("/");
+	if (!root)
+		return -EINVAL;
+
+	*addr_cells = of_n_addr_cells(root);
+	*size_cells = of_n_size_cells(root);
+
+	of_node_put(root);
+
+	return 0;
+}
+
+static int do_get_kexec_buffer(const void *prop, int len, unsigned long *addr,
+			       size_t *size)
+{
+	int ret, addr_cells, size_cells;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	if (len < 4 * (addr_cells + size_cells))
+		return -ENOENT;
+
+	*addr = of_read_number(prop, addr_cells);
+	*size = of_read_number(prop + 4 * addr_cells, size_cells);
+
+	return 0;
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr:	On successful return, set to point to the buffer contents.
+ * @size:	On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+	int ret, len;
+	unsigned long tmp_addr;
+	size_t tmp_size;
+	const void *prop;
+
+	prop = of_get_property(of_chosen, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop, len, &tmp_addr, &tmp_size);
+	if (ret)
+		return ret;
+
+	*addr = __va(tmp_addr);
+	*size = tmp_size;
+
+	return 0;
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ */
+int ima_free_kexec_buffer(void)
+{
+	int ret;
+	unsigned long addr;
+	size_t size;
+	struct property *prop;
+
+	prop = of_find_property(of_chosen, "linux,ima-kexec-buffer", NULL);
+	if (!prop)
+		return -ENOENT;
+
+	ret = do_get_kexec_buffer(prop->value, prop->length, &addr, &size);
+	if (ret)
+		return ret;
+
+	ret = of_remove_property(of_chosen, prop);
+	if (ret)
+		return ret;
+
+	return memblock_free(addr, size);
+
+}
+
+/**
+ * remove_ima_buffer - remove the IMA buffer property and reservation from @fdt
+ *
+ * The IMA measurement buffer is of no use to a subsequent kernel, so we always
+ * remove it from the device tree.
+ */
+void remove_ima_buffer(void *fdt, int chosen_node)
+{
+	int ret, len;
+	unsigned long addr;
+	size_t size;
+	const void *prop;
+
+	prop = fdt_getprop(fdt, chosen_node, "linux,ima-kexec-buffer", &len);
+	if (!prop)
+		return;
+
+	ret = do_get_kexec_buffer(prop, len, &addr, &size);
+	fdt_delprop(fdt, chosen_node, "linux,ima-kexec-buffer");
+	if (ret)
+		return;
+
+	ret = delete_fdt_mem_rsv(fdt, addr, size);
+	if (!ret)
+		pr_debug("Removed old IMA buffer reservation.\n");
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA buffer
+ *
+ * Architectures should use this function to pass on the IMA buffer
+ * information to the next kernel.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+			      size_t size)
+{
+	image->arch.ima_buffer_addr = load_addr;
+	image->arch.ima_buffer_size = size;
+
+	return 0;
+}
+
+static int write_number(void *p, u64 value, int cells)
+{
+	if (cells == 1) {
+		u32 tmp;
+
+		if (value > U32_MAX)
+			return -EINVAL;
+
+		tmp = cpu_to_be32(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else if (cells == 2) {
+		u64 tmp;
+
+		tmp = cpu_to_be64(value);
+		memcpy(p, &tmp, sizeof(tmp));
+	} else
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * setup_ima_buffer - add IMA buffer information to the fdt
+ * @image:		kexec image being loaded.
+ * @fdt:		Flattened device tree for the next kernel.
+ * @chosen_node:	Offset to the chosen node.
+ *
+ * Return: 0 on success, or negative errno on error.
+ */
+int setup_ima_buffer(const struct kimage *image, void *fdt, int chosen_node)
+{
+	int ret, addr_cells, size_cells, entry_size;
+	u8 value[16];
+
+	remove_ima_buffer(fdt, chosen_node);
+	if (!image->arch.ima_buffer_size)
+		return 0;
+
+	ret = get_addr_size_cells(&addr_cells, &size_cells);
+	if (ret)
+		return ret;
+
+	entry_size = 4 * (addr_cells + size_cells);
+
+	if (entry_size > sizeof(value))
+		return -EINVAL;
+
+	ret = write_number(value, image->arch.ima_buffer_addr, addr_cells);
+	if (ret)
+		return ret;
+
+	ret = write_number(value + 4 * addr_cells, image->arch.ima_buffer_size,
+			   size_cells);
+	if (ret)
+		return ret;
+
+	ret = fdt_setprop(fdt, chosen_node, "linux,ima-kexec-buffer", value,
+			  entry_size);
+	if (ret < 0)
+		return -EINVAL;
+
+	ret = fdt_add_mem_rsv(fdt, image->arch.ima_buffer_addr,
+			      image->arch.ima_buffer_size);
+	if (ret)
+		return -EINVAL;
+
+	pr_debug("IMA buffer at 0x%llx, size = 0x%zx\n",
+		 image->arch.ima_buffer_addr, image->arch.ima_buffer_size);
+
+	return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
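On the next kernel's side, the generic IMA code is expected to consume the two helpers declared in asm/ima.h during early boot. A hedged sketch of that consumption; parse_measurements() is a hypothetical stand-in for the IMA core's parser:

#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/ima.h>		/* the header added above */

/* hypothetical; the real parser lives in the IMA core */
static int parse_measurements(const void *buf, size_t size);

/* Sketch: import measurements carried over from the previous kernel,
 * then release the reserved memory. */
static int __init restore_kexec_measurements(void)
{
	void *buf;
	size_t size;
	int ret;

	ret = ima_get_kexec_buffer(&buf, &size);
	if (ret)		/* -ENOENT: nothing was handed over */
		return 0;

	ret = parse_measurements(buf, size);

	ima_free_kexec_buffer();
	return ret;
}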
diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c
index 6acffd34a70f..9a42309b091a 100644
--- a/arch/powerpc/kernel/kexec_elf_64.c
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -627,7 +627,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf,
 		goto out;
 	}
 
-	ret = setup_new_fdt(fdt, initrd_load_addr, initrd_len, cmdline);
+	ret = setup_new_fdt(image, fdt, initrd_load_addr, initrd_len, cmdline);
 	if (ret)
 		goto out;
diff --git a/arch/powerpc/kernel/machine_kexec_file_64.c b/arch/powerpc/kernel/machine_kexec_file_64.c
index 7abc8a75ee48..992c0d258e5d 100644
--- a/arch/powerpc/kernel/machine_kexec_file_64.c
+++ b/arch/powerpc/kernel/machine_kexec_file_64.c
@@ -27,6 +27,7 @@
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
 #include <linux/libfdt.h>
+#include <asm/ima.h>
 
 #define SLAVE_CODE_SIZE		256
 
@@ -180,7 +181,7 @@ int setup_purgatory(struct kimage *image, const void *slave_code,
 *
 * Return: 0 on success, or negative errno on error.
 */
-static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
+int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size)
 {
 	int i, ret, num_rsvs = fdt_num_mem_rsv(fdt);
 
@@ -209,6 +210,7 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
 
 /*
 * setup_new_fdt - modify /chosen and memory reservation for the next kernel
+ * @image:		kexec image being loaded.
 * @fdt:		Flattened device tree for the next kernel.
 * @initrd_load_addr:	Address where the next initrd will be loaded.
 * @initrd_len:		Size of the next initrd, or 0 if there will be none.
@@ -217,8 +219,9 @@ static int delete_fdt_mem_rsv(void *fdt, unsigned long start, unsigned long size
 *
 * Return: 0 on success, or negative errno on error.
 */
-int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
-		  unsigned long initrd_len, const char *cmdline)
+int setup_new_fdt(const struct kimage *image, void *fdt,
+		  unsigned long initrd_load_addr, unsigned long initrd_len,
+		  const char *cmdline)
 {
 	int ret, chosen_node;
 	const void *prop;
@@ -328,6 +331,12 @@ int setup_new_fdt(void *fdt, unsigned long initrd_load_addr,
 		}
 	}
 
+	ret = setup_ima_buffer(image, fdt, chosen_node);
+	if (ret) {
+		pr_err("Error setting up the new device tree.\n");
+		return ret;
+	}
+
 	ret = fdt_setprop(fdt, chosen_node, "linux,booted-from-kexec", NULL, 0);
 	if (ret) {
 		pr_err("Error setting up the new device tree.\n");
diff --git a/arch/powerpc/oprofile/cell/spu_task_sync.c b/arch/powerpc/oprofile/cell/spu_task_sync.c
index 83d2b4ef7f0d..44d67b167e0b 100644
--- a/arch/powerpc/oprofile/cell/spu_task_sync.c
+++ b/arch/powerpc/oprofile/cell/spu_task_sync.c
@@ -295,7 +295,7 @@ out:
 * dcookie user still being registered (namely, the reader
 * of the event buffer).
 */
-static inline unsigned long fast_get_dcookie(struct path *path)
+static inline unsigned long fast_get_dcookie(const struct path *path)
 {
 	unsigned long cookie;
diff --git a/arch/powerpc/platforms/85xx/corenet_generic.c b/arch/powerpc/platforms/85xx/corenet_generic.c
index 3803b0addf65..6c0ba75fb256 100644
--- a/arch/powerpc/platforms/85xx/corenet_generic.c
+++ b/arch/powerpc/platforms/85xx/corenet_generic.c
@@ -117,9 +117,6 @@ static const struct of_device_id of_device_ids[] = {
 	{
 		.compatible	= "fsl,qe",
 	},
-	{
-		.compatible	= "fsl,fman",
-	},
 	/* The following two are for the Freescale hypervisor */
 	{
 		.name		= "hypervisor",
diff --git a/arch/tile/include/asm/cache.h b/arch/tile/include/asm/cache.h
index 4810e48dbbbf..7d6aaa128e8b 100644
--- a/arch/tile/include/asm/cache.h
+++ b/arch/tile/include/asm/cache.h
@@ -50,18 +50,15 @@
 /*
  * Originally we used small TLB pages for kernel data and grouped some
- * things together as "write once", enforcing the property at the end
+ * things together as ro-after-init, enforcing the property at the end
  * of initialization by making those pages read-only and non-coherent.
  * This allowed better cache utilization since cache inclusion did not
  * need to be maintained.  However, to do this requires an extra TLB
  * entry, which on balance is more of a performance hit than the
  * non-coherence is a performance gain, so we now just make "read
- * mostly" and "write once" be synonyms.  We keep the attribute
+ * mostly" and "ro-after-init" be synonyms.  We keep the attribute
  * separate in case we change our minds at a future date.
  */
-#define __write_once __read_mostly
-
-/* __ro_after_init is the generic name for the tile arch __write_once. */
 #define __ro_after_init __read_mostly
 
 #endif /* _ASM_TILE_CACHE_H */
diff --git a/arch/tile/include/asm/sections.h b/arch/tile/include/asm/sections.h
index 86a746243dc8..50343bfe7936 100644
--- a/arch/tile/include/asm/sections.h
+++ b/arch/tile/include/asm/sections.h
@@ -19,9 +19,6 @@
 
 #include <asm-generic/sections.h>
 
-/* Write-once data is writable only till the end of initialization. */
-extern char __w1data_begin[], __w1data_end[];
-
 extern char vdso_start[], vdso_end[];
 #ifdef CONFIG_COMPAT
 extern char vdso32_start[], vdso32_end[];
diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c
index 2305084c9b93..09233fbe7801 100644
--- a/arch/tile/kernel/module.c
+++ b/arch/tile/kernel/module.c
@@ -43,29 +43,28 @@ void *module_alloc(unsigned long size)
 	int npages;
 
 	npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
-	pages = kmalloc(npages * sizeof(struct page *), GFP_KERNEL);
+	pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
 	if (pages == NULL)
 		return NULL;
 	for (; i < npages; ++i) {
 		pages[i] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
 		if (!pages[i])
-			goto error;
+			goto free_pages;
 	}
 
 	area = __get_vm_area(size, VM_ALLOC, MEM_MODULE_START, MEM_MODULE_END);
 	if (!area)
-		goto error;
+		goto free_pages;
 	area->nr_pages = npages;
 	area->pages = pages;
 
 	if (map_vm_area(area, prot_rwx, pages)) {
 		vunmap(area->addr);
-		goto error;
+		goto free_pages;
 	}
 
 	return area->addr;
-
-error:
+ free_pages:
 	while (--i >= 0)
 		__free_page(pages[i]);
 	kfree(pages);
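The module_alloc() hunk above swaps an open-coded kmalloc(n * size) for kmalloc_array(), which returns NULL instead of silently wrapping when the multiplication overflows. A minimal sketch of the idiom:

#include <linux/slab.h>
#include <linux/mm_types.h>

/* Sketch: allocate an array of n page pointers; kmalloc_array()
 * fails cleanly if n * sizeof(struct page *) would overflow. */
static struct page **alloc_page_array(unsigned int n)
{
	return kmalloc_array(n, sizeof(struct page *), GFP_KERNEL);
}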
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index 9475a74cd53a..bc6656b5708b 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -57,7 +57,7 @@ static int pci_probe = 1;
 * This flag tells if the platform is TILEmpower that needs
 * special configuration for the PLX switch chip.
 */
-int __write_once tile_plx_gen1;
+int __ro_after_init tile_plx_gen1;
 
 static struct pci_controller controllers[TILE_NUM_PCIE];
 static int num_controllers;
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 0e7a5d09e023..b554a68eea1b 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -131,7 +131,7 @@ static int tile_irq_cpu(int irq)
 
 	count = cpumask_weight(&intr_cpus_map);
 	if (unlikely(count == 0)) {
-		pr_warn("intr_cpus_map empty, interrupts will be delievered to dataplane tiles\n");
+		pr_warn("intr_cpus_map empty, interrupts will be delivered to dataplane tiles\n");
 		return irq % (smp_height * smp_width);
 	}
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 153020abd2f5..443a70bccc1c 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -49,7 +49,7 @@ static inline int ABS(int x) { return x >= 0 ? x : -x; }
 
 /* Chip information */
-char chip_model[64] __write_once;
+char chip_model[64] __ro_after_init;
 
 #ifdef CONFIG_VT
 struct screen_info screen_info;
@@ -97,17 +97,17 @@ int node_controller[MAX_NUMNODES] = { [0 ... MAX_NUMNODES-1] = -1 };
 #ifdef CONFIG_HIGHMEM
 /* Map information from VAs to PAs */
 unsigned long pbase_map[1 << (32 - HPAGE_SHIFT)]
-  __write_once __attribute__((aligned(L2_CACHE_BYTES)));
+  __ro_after_init __attribute__((aligned(L2_CACHE_BYTES)));
 EXPORT_SYMBOL(pbase_map);
 
 /* Map information from PAs to VAs */
 void *vbase_map[NR_PA_HIGHBIT_VALUES]
-  __write_once __attribute__((aligned(L2_CACHE_BYTES)));
+  __ro_after_init __attribute__((aligned(L2_CACHE_BYTES)));
 EXPORT_SYMBOL(vbase_map);
 #endif
 
 /* Node number as a function of the high PA bits */
-int highbits_to_node[NR_PA_HIGHBIT_VALUES] __write_once;
+int highbits_to_node[NR_PA_HIGHBIT_VALUES] __ro_after_init;
 EXPORT_SYMBOL(highbits_to_node);
 
 static unsigned int __initdata maxmem_pfn = -1U;
@@ -844,11 +844,11 @@ static void __init zone_sizes_init(void)
 
 #ifdef CONFIG_NUMA
 /* which logical CPUs are on which nodes */
-struct cpumask node_2_cpu_mask[MAX_NUMNODES] __write_once;
+struct cpumask node_2_cpu_mask[MAX_NUMNODES] __ro_after_init;
 EXPORT_SYMBOL(node_2_cpu_mask);
 
 /* which node each logical CPU is on */
-char cpu_2_node[NR_CPUS] __write_once __attribute__((aligned(L2_CACHE_BYTES)));
+char cpu_2_node[NR_CPUS] __ro_after_init __attribute__((aligned(L2_CACHE_BYTES)));
 EXPORT_SYMBOL(cpu_2_node);
 
 /* Return cpu_to_node() except for cpus not yet assigned, which return -1 */
@@ -1269,7 +1269,7 @@ static void __init validate_va(void)
 * cpus plus any other cpus that are willing to share their cache.
 * It is set by hv_inquire_tiles(HV_INQ_TILES_LOTAR).
 */
-struct cpumask __write_once cpu_lotar_map;
+struct cpumask __ro_after_init cpu_lotar_map;
 EXPORT_SYMBOL(cpu_lotar_map);
 
 /*
@@ -1291,7 +1291,7 @@ EXPORT_SYMBOL(hash_for_home_map);
 * cache, those tiles will only appear in cpu_lotar_map, NOT in
 * cpu_cacheable_map, as they are a special case.
 */
-struct cpumask __write_once cpu_cacheable_map;
+struct cpumask __ro_after_init cpu_cacheable_map;
 EXPORT_SYMBOL(cpu_cacheable_map);
 
 static __initdata struct cpumask disabled_map;
@@ -1506,7 +1506,7 @@ void __init setup_arch(char **cmdline_p)
 * Set up per-cpu memory.
 */
 
-unsigned long __per_cpu_offset[NR_CPUS] __write_once;
+unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init;
 EXPORT_SYMBOL(__per_cpu_offset);
 
 static size_t __initdata pfn_offset[MAX_NUMNODES] = { 0 };
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 07e3ff5cc740..94a62e1197ce 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -27,7 +27,7 @@
 * We write to width and height with a single store in head_NN.S,
 * so make the variable aligned to "long".
 */
-HV_Topology smp_topology __write_once __aligned(sizeof(long));
+HV_Topology smp_topology __ro_after_init __aligned(sizeof(long));
 EXPORT_SYMBOL(smp_topology);
 
 #if CHIP_HAS_IPI()
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index ea960d660917..c9357012b1c8 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -37,7 +37,7 @@
 */
 
 /* How many cycles per second we are running at. */
-static cycles_t cycles_per_sec __write_once;
+static cycles_t cycles_per_sec __ro_after_init;
 
 cycles_t get_clock_rate(void)
 {
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(get_cycles);
 */
 #define SCHED_CLOCK_SHIFT 10
 
-static unsigned long sched_clock_mult __write_once;
+static unsigned long sched_clock_mult __ro_after_init;
 
 static cycles_t clocksource_get_cycles(struct clocksource *cs)
 {
diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c
index 9772a3554282..4fe78c5b8394 100644
--- a/arch/tile/kernel/unaligned.c
+++ b/arch/tile/kernel/unaligned.c
@@ -22,7 +22,7 @@
 #include <linux/mman.h>
 #include <linux/types.h>
 #include <linux/err.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/compat.h>
 #include <linux/prctl.h>
 #include <asm/cacheflush.h>
diff --git a/arch/tile/lib/cacheflush.c b/arch/tile/lib/cacheflush.c
index 9c0ec22009a5..c1ebc1065fc1 100644
--- a/arch/tile/lib/cacheflush.c
+++ b/arch/tile/lib/cacheflush.c
@@ -138,19 +138,13 @@ finv_buffer_remote(void *buffer, size_t size, int hfh)
 	if ((unsigned long)base < (unsigned long)buffer)
 		base = buffer;
 
-	/*
-	 * Fire all the loads we need.  The MAF only has eight entries
-	 * so we can have at most eight outstanding loads, so we
-	 * unroll by that amount.
-	 */
-#pragma unroll 8
+	/* Fire all the loads we need. */
 	for (; p >= base; p -= step_size)
 		force_load(p);
 
 	/*
 	 * Repeat, but with finv's instead of loads, to get rid of the
 	 * data we just loaded into our own cache and the old home L3.
-	 * No need to unroll since finv's don't target a register.
 	 * The finv's are guaranteed not to actually flush the data in
 	 * the buffer back to their home, since we just read it, so the
 	 * lines are clean in cache; we will only invalidate those lines.
diff --git a/arch/tile/mm/extable.c b/arch/tile/mm/extable.c
index 4fb0acb9d154..aeaf20c7aaa4 100644
--- a/arch/tile/mm/extable.c
+++ b/arch/tile/mm/extable.c
@@ -12,7 +12,7 @@
 * more details.
 */
 
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/spinlock.h>
 #include <linux/uaccess.h>
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index beba986589e5..709f8e9ba3e9 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -29,7 +29,7 @@
 #include <linux/tty.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/highmem.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/kprobes.h>
 #include <linux/hugetlb.h>
 #include <linux/syscalls.h>
diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c
index 40ca30a9fee3..b51cc28acd0a 100644
--- a/arch/tile/mm/homecache.c
+++ b/arch/tile/mm/homecache.c
@@ -47,7 +47,7 @@
 * The noallocl2 option suppresses all use of the L2 cache to cache
 * locally from a remote home.
 */
-static int __write_once noallocl2;
+static int __ro_after_init noallocl2;
 static int __init set_noallocl2(char *str)
 {
 	noallocl2 = 1;
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index adce25462b0d..3a97e4d7205c 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -190,9 +190,9 @@ static void __init page_table_range_init(unsigned long start,
 
 static int __initdata ktext_hash = 1;  /* .text pages */
 static int __initdata kdata_hash = 1;  /* .data and .bss pages */
-int __write_once hash_default = 1;     /* kernel allocator pages */
+int __ro_after_init hash_default = 1;  /* kernel allocator pages */
 EXPORT_SYMBOL(hash_default);
-int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
+int __ro_after_init kstack_hash = 1;   /* if no homecaching, use h4h */
 
 /*
 * CPUs to use to for striping the pages of kernel data.  If hash-for-home
@@ -203,7 +203,7 @@ int __write_once kstack_hash = 1;      /* if no homecaching, use h4h */
 static __initdata struct cpumask kdata_mask;
 static __initdata int kdata_arg_seen;
 
-int __write_once kdata_huge;       /* if no homecaching, small pages */
+int __ro_after_init kdata_huge;    /* if no homecaching, small pages */
 
 /* Combine a generic pgprot_t with cache home to get a cache-aware pgprot. */
 
@@ -896,8 +896,8 @@ void __init pgtable_cache_init(void)
 		panic("pgtable_cache_init(): Cannot create pgd cache");
 }
 
-static long __write_once initfree = 1;
-static bool __write_once set_initfree_done;
+static long __ro_after_init initfree = 1;
+static bool __ro_after_init set_initfree_done;
 
 /* Select whether to free (1) or mark unusable (0) the __init pages. */
 static int __init set_initfree(char *str)
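All of the tile conversions above move to the generic __ro_after_init attribute, which marks data written only during boot; on tile it simply aliases __read_mostly, as the cache.h hunk notes. A minimal usage sketch with hypothetical names:

#include <linux/cache.h>
#include <linux/init.h>

/* Sketch: written once during boot, treated as read-only afterwards. */
static unsigned long clock_rate __ro_after_init;

static int __init record_clock_rate(char *str)
{
	clock_rate = 100000000UL;	/* last legal write: init phase */
	return 0;
}
early_param("myclock", record_clock_rate);	/* "myclock" is hypothetical */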
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index dd47e60aabf5..e487493bbd47 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -412,6 +412,19 @@ config GOLDFISH
 	def_bool y
 	depends on X86_GOLDFISH
 
+config INTEL_RDT_A
+	bool "Intel Resource Director Technology Allocation support"
+	default n
+	depends on X86 && CPU_SUP_INTEL
+	select KERNFS
+	help
+	  Select to enable resource allocation which is a sub-feature of
+	  Intel Resource Director Technology(RDT). More information about
+	  RDT can be found in the Intel x86 Architecture Software
+	  Developer Manual.
+
+	  Say N if unsure.
+
 if X86_32
 config X86_EXTENDED_PLATFORM
 	bool "Support for extended (non-PC) x86 platforms"
@@ -555,18 +568,6 @@ config X86_INTEL_QUARK
 	  Say Y here if you have a Quark based system such as the Arduino
 	  compatible Intel Galileo.
 
-config MLX_PLATFORM
-	tristate "Mellanox Technologies platform support"
-	depends on X86_64
-	depends on X86_EXTENDED_PLATFORM
-	---help---
-	  This option enables system support for the Mellanox Technologies
-	  platform.
-
-	  Say Y here if you are building a kernel for Mellanox system.
-
-	  Otherwise, say N.
-
 config X86_INTEL_LPSS
 	bool "Intel Low Power Subsystem Support"
 	depends on X86 && ACPI
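The allocation support enabled by CONFIG_INTEL_RDT_A is exposed through a kernfs-based filesystem; a hedged userspace sketch of mounting it and creating one resource group, assuming the resctrl interface this series introduces:

#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>

int main(void)
{
	/* "resctrl" is the filesystem type the RDT allocation support
	 * registers; /sys/fs/resctrl is its conventional mount point. */
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, NULL)) {
		perror("mount");
		return 1;
	}

	/* Each directory under the mount becomes one resource group
	 * (an rdtgroup, per the new intel_rdt.h below). */
	if (mkdir("/sys/fs/resctrl/grp0", 0755)) {
		perror("mkdir");
		return 1;
	}

	return 0;
}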
diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index 8f82b02934fa..0c45cc8e64ba 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -7,9 +7,9 @@
 #include <linux/perf_event.h>
 #include <linux/slab.h>
 #include <asm/cpu_device_id.h>
+#include <asm/intel_rdt_common.h>
 #include "../perf_event.h"
 
-#define MSR_IA32_PQR_ASSOC	0x0c8f
 #define MSR_IA32_QM_CTR		0x0c8e
 #define MSR_IA32_QM_EVTSEL	0x0c8d
 
@@ -24,32 +24,13 @@ static unsigned int cqm_l3_scale; /* supposedly cacheline size */
 static bool cqm_enabled, mbm_enabled;
 unsigned int mbm_socket_max;
 
-/**
- * struct intel_pqr_state - State cache for the PQR MSR
- * @rmid:		The cached Resource Monitoring ID
- * @closid:		The cached Class Of Service ID
- * @rmid_usecnt:	The usage counter for rmid
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them.
- *
- * The cache also helps to avoid pointless updates if the value does
- * not change.
- */
-struct intel_pqr_state {
-	u32			rmid;
-	u32			closid;
-	int			rmid_usecnt;
-};
-
 /*
 * The cached intel_pqr_state is strictly per CPU and can never be
 * updated from a remote CPU. Both functions which modify the state
 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
 * interrupts disabled, which is sufficient for the protection.
 */
-static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 static struct hrtimer *mbm_timers;
 /**
 * struct sample - mbm event's (local or total) data
diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h
new file mode 100644
index 000000000000..44b8762fa0c7
--- /dev/null
+++ b/arch/x86/include/asm/asm-prototypes.h
@@ -0,0 +1,16 @@
+#include <asm/ftrace.h>
+#include <asm/uaccess.h>
+#include <asm/string.h>
+#include <asm/page.h>
+#include <asm/checksum.h>
+
+#include <asm-generic/asm-prototypes.h>
+
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/special_insns.h>
+#include <asm/preempt.h>
+
+#ifndef CONFIG_X86_CMPXCHG64
+extern void cmpxchg8b_emu(void);
+#endif
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 59ac427960d4..eafee3161d1c 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -105,6 +105,7 @@
 #define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
 #define X86_FEATURE_APERFMPERF  ( 3*32+28) /* APERFMPERF */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
@@ -188,6 +189,9 @@
 
 #define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3	( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2	( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3	( 7*32+ 6) /* Code and Data Prioritization L3 */
 
 #define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
@@ -221,6 +225,7 @@
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
 #define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A	( 9*32+15) /* Resource Director Technology Allocation */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_AVX512DQ	( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
index 1c7eefe32502..7ec59edde154 100644
--- a/arch/x86/include/asm/floppy.h
+++ b/arch/x86/include/asm/floppy.h
@@ -229,18 +229,18 @@ static struct fd_routine_l {
 	int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
 } fd_routine[] = {
 	{
-		request_dma,
-		free_dma,
-		get_dma_residue,
-		dma_mem_alloc,
-		hard_dma_setup
+		._request_dma		= request_dma,
+		._free_dma		= free_dma,
+		._get_dma_residue	= get_dma_residue,
+		._dma_mem_alloc		= dma_mem_alloc,
+		._dma_setup		= hard_dma_setup
 	},
 	{
-		vdma_request_dma,
-		vdma_nop,
-		vdma_get_dma_residue,
-		vdma_mem_alloc,
-		vdma_dma_setup
+		._request_dma		= vdma_request_dma,
+		._free_dma		= vdma_nop,
+		._get_dma_residue	= vdma_get_dma_residue,
+		._dma_mem_alloc		= vdma_mem_alloc,
+		._dma_setup		= vdma_dma_setup
 	}
 };
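The floppy.h hunk is a pure conversion to designated initializers, which bind values to struct members by name rather than position. A tiny sketch of why the idiom is safer:

/* Positional initializers pair values with members by order, so a
 * struct reshuffle silently re-pairs them; designated initializers
 * bind by name. */
struct ops {
	int (*open)(void);
	int (*close)(void);
};

static int do_open(void)  { return 0; }
static int do_close(void) { return 0; }

static struct ops default_ops = {
	.open	= do_open,	/* still correct if members are reordered */
	.close	= do_close,
};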
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
new file mode 100644
index 000000000000..95ce5c85b009
--- /dev/null
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -0,0 +1,224 @@
+#ifndef _ASM_X86_INTEL_RDT_H
+#define _ASM_X86_INTEL_RDT_H
+
+#ifdef CONFIG_INTEL_RDT_A
+
+#include <linux/kernfs.h>
+#include <linux/jump_label.h>
+
+#include <asm/intel_rdt_common.h>
+
+#define IA32_L3_QOS_CFG		0xc81
+#define IA32_L3_CBM_BASE	0xc90
+#define IA32_L2_CBM_BASE	0xd10
+
+#define L3_QOS_CDP_ENABLE	0x01ULL
+
+/**
+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
+ * @kn:				kernfs node
+ * @rdtgroup_list:		linked list for all rdtgroups
+ * @closid:			closid for this rdtgroup
+ * @cpu_mask:			CPUs assigned to this rdtgroup
+ * @flags:			status bits
+ * @waitcount:			how many cpus expect to find this
+ *				group when they acquire rdtgroup_mutex
+ */
+struct rdtgroup {
+	struct kernfs_node	*kn;
+	struct list_head	rdtgroup_list;
+	int			closid;
+	struct cpumask		cpu_mask;
+	int			flags;
+	atomic_t		waitcount;
+};
+
+/* rdtgroup.flags */
+#define	RDT_DELETED		1
+
+/* List of all resource groups */
+extern struct list_head rdt_all_groups;
+
+int __init rdtgroup_init(void);
+
+/**
+ * struct rftype - describe each file in the resctrl file system
+ * @name:	file name
+ * @mode:	access mode
+ * @kf_ops:	operations
+ * @seq_show:	show content of the file
+ * @write:	write to the file
+ */
+struct rftype {
+	char			*name;
+	umode_t			mode;
+	struct kernfs_ops	*kf_ops;
+
+	int (*seq_show)(struct kernfs_open_file *of,
+			struct seq_file *sf, void *v);
+	/*
+	 * write() is the generic write callback which maps directly to
+	 * kernfs write operation and overrides all other operations.
+	 * Maximum write size is determined by ->max_write_len.
+ */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct rdt_resource - attributes of an RDT resource + * @enabled: Is this feature enabled on this machine + * @capable: Is this feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @max_cbm: Largest Cache Bit Mask allowed + * @min_cbm_bits: Minimum number of consecutive bits to be set + * in a cache bit mask + * @domains: All domains for this resource + * @num_domains: Number of domains active + * @msr_base: Base MSR address for CBMs + * @tmp_cbms: Scratch space when updating schemata + * @num_tmp_cbms: Number of CBMs in tmp_cbms + * @cache_level: Which cache level defines scope of this domain + * @cbm_idx_multi: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + */ +struct rdt_resource { + bool enabled; + bool capable; + char *name; + int num_closid; + int cbm_len; + int min_cbm_bits; + u32 max_cbm; + struct list_head domains; + int num_domains; + int msr_base; + u32 *tmp_cbms; + int num_tmp_cbms; + int cache_level; + int cbm_idx_multi; + int cbm_idx_offset; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @cbm: array of cache bit masks (indexed by CLOSID) + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + u32 *cbm; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->capable) + +#define for_each_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=1).EDX */ +union cpuid_0x10_1_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +void rdt_cbm_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +/* + * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control and we enable by mounting the + * resctrl file system. 
+ * - Caches the per cpu CLOSid values and does the MSR write only + * when a task with a different CLOSid is scheduled in. + * + * Must be called with preemption disabled. + */ +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) { + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + int closid; + + /* + * If this task has a closid assigned, use it. + * Else use the closid assigned to this cpu. + */ + closid = current->closid; + if (closid == 0) + closid = this_cpu_read(cpu_closid); + + if (closid != state->closid) { + state->closid = closid; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); + } + } +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT_A */ +#endif /* _ASM_X86_INTEL_RDT_H */ diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h new file mode 100644 index 000000000000..b31081b89407 --- /dev/null +++ b/arch/x86/include/asm/intel_rdt_common.h @@ -0,0 +1,27 @@ +#ifndef _ASM_X86_INTEL_RDT_COMMON_H +#define _ASM_X86_INTEL_RDT_COMMON_H + +#define MSR_IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +#endif /* _ASM_X86_INTEL_RDT_COMMON_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7892530cbacf..2e25038dbd93 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -704,6 +704,7 @@ struct kvm_apic_map { /* Hyper-V emulation context */ struct kvm_hv { + struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 72198c64e646..f9813b6d8b80 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -31,6 +31,10 @@ typedef struct { u16 pkey_allocation_map; s16 execute_only_pkey; #endif +#ifdef CONFIG_X86_INTEL_MPX + /* address of the bounds directory */ + void __user *bd_addr; +#endif } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h index 7a35495275a9..0b416d4cf73b 100644 --- a/arch/x86/include/asm/mpx.h +++ b/arch/x86/include/asm/mpx.h @@ -59,7 +59,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs); int mpx_handle_bd_fault(void); static inline int kernel_managing_mpx_tables(struct mm_struct *mm) { - return (mm->bd_addr != MPX_INVALID_BOUNDS_DIR); + return (mm->context.bd_addr != MPX_INVALID_BOUNDS_DIR); } static inline void mpx_mm_init(struct mm_struct *mm) { @@ -67,7 +67,7 @@ static inline void mpx_mm_init(struct mm_struct *mm) * NULL is theoretically a valid place to put the bounds * directory, so point this at an invalid address. 
*/ - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; } void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1cc82ece9ac1..62b775926045 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -116,8 +116,7 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end, - int removed); +extern void sync_global_pgds(unsigned long start, unsigned long end); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 33b6365c22fe..abb1fdcc545a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -45,8 +45,17 @@ extern int tsc_clocksource_reliable; * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ +#ifdef CONFIG_X86_TSC +extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); +extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#else +static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } +static inline void tsc_verify_tsc_adjust(bool resume) { } +static inline void check_tsc_sync_source(int cpu) { } +static inline void check_tsc_sync_target(void) { } +#endif extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 05110c1097ae..581386c7e429 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -75,7 +75,7 @@ apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o -obj-$(CONFIG_SMP) += tsc_sync.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4764fa56924d..6f65b0eed384 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -715,7 +715,7 @@ int acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); - if (nid != -1) { + if (nid != NUMA_NO_NODE) { set_apicid_to_node(physid, nid); numa_set_node(cpu, nid); } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index bb47e5eacd44..5b7e43eff139 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2160,21 +2160,6 @@ int __generic_processor_info(int apicid, int version, bool enabled) } /* - * This can happen on physical hotplug. The sanity check at boot time - * is done from native_smp_prepare_cpus() after num_possible_cpus() is - * established. - */ - if (topology_update_package_map(apicid, cpu) < 0) { - int thiscpu = max + disabled_cpus; - - pr_warning("APIC: Package limit reached. 
Processor %d/0x%x ignored.\n", - thiscpu, apicid); - - disabled_cpus++; - return -ENOSPC; - } - - /* * Validate version */ if (version == 0x0) { diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 33b63670bf09..52000010c62e 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -32,6 +32,8 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o +obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_MICROCODE) += microcode/ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 729f92ba8224..1f6b50a449ab 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -979,29 +979,21 @@ static void x86_init_cache_qos(struct cpuinfo_x86 *c) } /* - * The physical to logical package id mapping is initialized from the - * acpi/mptables information. Make sure that CPUID actually agrees with - * that. + * Validate that ACPI/mptables have the same information about the + * effective APIC id and update the package map. */ -static void sanitize_package_id(struct cpuinfo_x86 *c) +static void validate_apic_and_package_id(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP - unsigned int pkg, apicid, cpu = smp_processor_id(); + unsigned int apicid, cpu = smp_processor_id(); apicid = apic->cpu_present_to_apicid(cpu); - pkg = apicid >> boot_cpu_data.x86_coreid_bits; - if (apicid != c->initial_apicid) { - pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x CPUID: %x\n", + if (apicid != c->apicid) { + pr_err(FW_BUG "CPU%u: APIC id mismatch. Firmware: %x APIC: %x\n", cpu, apicid, c->initial_apicid); - c->initial_apicid = apicid; } - if (pkg != c->phys_proc_id) { - pr_err(FW_BUG "CPU%u: Using firmware package id %u instead of %u\n", - cpu, pkg, c->phys_proc_id); - c->phys_proc_id = pkg; - } - c->logical_proc_id = topology_phys_to_logical_pkg(pkg); + BUG_ON(topology_update_package_map(c->phys_proc_id, cpu)); #else c->logical_proc_id = 0; #endif @@ -1132,7 +1124,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA numa_add_cpu(smp_processor_id()); #endif - sanitize_package_id(c); } /* @@ -1187,6 +1178,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) enable_sep_cpu(); #endif mtrr_ap_init(); + validate_apic_and_package_id(c); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index be6337156502..0282b0df004a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -153,6 +153,7 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; + unsigned int id; unsigned long size; struct amd_northbridge *nb; }; @@ -894,6 +895,8 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, static void ci_leaf_init(struct cacheinfo *this_leaf, struct _cpuid4_info_regs *base) { + this_leaf->id = base->id; + this_leaf->attributes = CACHE_ID; this_leaf->level = base->eax.split.level; this_leaf->type = cache_type_map[base->eax.split.type]; this_leaf->coherency_line_size = @@ -920,6 +923,22 @@ static int __init_cache_level(unsigned int cpu) return 0; } +/* + * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * ECX as cache index. Then right shift apicid by the number's order to get + * cache id for this cache node. 
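+ * + * For example (illustrative numbers): with 16 threads sharing a cache, CPUID + * reports num_threads_sharing = 15, so index_msb is 4 and the cache id is + * apicid >> 4.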
+ */ +static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned long num_threads_sharing; + int index_msb; + + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + id4_regs->id = c->apicid >> index_msb; +} + static int __populate_cache_leaves(unsigned int cpu) { unsigned int idx, ret; @@ -931,6 +950,7 @@ static int __populate_cache_leaves(unsigned int cpu) ret = cpuid4_cache_lookup_regs(idx, &id4_regs); if (ret) return ret; + get_cache_id(cpu, &id4_regs); ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c new file mode 100644 index 000000000000..5a533fefefa0 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -0,0 +1,403 @@ +/* + * Resource Director Technology (RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT can be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/cacheinfo.h> +#include <linux/cpuhotplug.h> + +#include <asm/intel-family.h> +#include <asm/intel_rdt.h> + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); + +#define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) + +struct rdt_resource rdt_resources_all[] = { + { + .name = "L3", + .domains = domain_init(RDT_RESOURCE_L3), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, + { + .name = "L3DATA", + .domains = domain_init(RDT_RESOURCE_L3DATA), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 0 + }, + { + .name = "L3CODE", + .domains = domain_init(RDT_RESOURCE_L3CODE), + .msr_base = IA32_L3_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 3, + .cbm_idx_multi = 2, + .cbm_idx_offset = 1 + }, + { + .name = "L2", + .domains = domain_init(RDT_RESOURCE_L2), + .msr_base = IA32_L2_CBM_BASE, + .min_cbm_bits = 1, + .cache_level = 2, + .cbm_idx_multi = 1, + .cbm_idx_offset = 0 + }, +}; + +static int cbm_idx(struct rdt_resource *r, int closid) +{ + return closid * r->cbm_idx_multi + r->cbm_idx_offset; +} + +/* + * cache_alloc_hsw_probe() - Have to probe for Intel Haswell server CPUs + * as they do not have CPUID enumeration support for Cache allocation.
+ * The check for Vendor/Family/Model is not enough to guarantee that + * the MSRs won't #GP fault because only the following SKUs support + * CAT: + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz + * + * Probe by trying to write the first of the L3 cache mask registers + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length + * is always 20 on hsw server parts. The minimum cache bitmask length + * allowed for HSW server is always 2 bits. Hardcode all of them. + */ +static inline bool cache_alloc_hsw_probe(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; + + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return false; + rdmsr(IA32_L3_CBM_BASE, l, h); + + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return false; + + r->num_closid = 4; + r->cbm_len = 20; + r->max_cbm = max_cbm; + r->min_cbm_bits = 2; + r->capable = true; + r->enabled = true; + + return true; + } + + return false; +} + +static void rdt_get_config(int idx, struct rdt_resource *r) +{ + union cpuid_0x10_1_eax eax; + union cpuid_0x10_1_edx edx; + u32 ebx, ecx; + + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); + r->num_closid = edx.split.cos_max + 1; + r->cbm_len = eax.split.cbm_len + 1; + r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->capable = true; + r->enabled = true; +} + +static void rdt_get_cdp_l3_config(int type) +{ + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + struct rdt_resource *r = &rdt_resources_all[type]; + + r->num_closid = r_l3->num_closid / 2; + r->cbm_len = r_l3->cbm_len; + r->max_cbm = r_l3->max_cbm; + r->capable = true; + /* + * By default, CDP is disabled. CDP can be enabled by mount parameter + * "cdp" during resctrl file system mount time.
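+ * For example (illustrative): "mount -t resctrl -o cdp resctrl /sys/fs/resctrl".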
+ */ + r->enabled = false; +} + +static inline bool get_rdt_resources(void) +{ + bool ret = false; + + if (cache_alloc_hsw_probe()) + return true; + + if (!boot_cpu_has(X86_FEATURE_RDT_A)) + return false; + + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); + } + ret = true; + } + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + /* CPUID 0x10.2 fields are the same format as 0x10.1 */ + rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); + ret = true; + } + + return ret; +} + +static int get_cache_id(int cpu, int level) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) + return ci->info_list[i].id; + } + + return -1; +} + +void rdt_cbm_update(void *arg) +{ + struct msr_param *m = (struct msr_param *)arg; + struct rdt_resource *r = m->res; + int i, cpu = smp_processor_id(); + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + goto found; + } + pr_info_once("cpu %d not found in any domain for resource %s\n", + cpu, r->name); + + return; + +found: + for (i = m->low; i < m->high; i++) { + int idx = cbm_idx(r, i); + + wrmsrl(r->msr_base + idx, d->cbm[i]); + } +} + +/* + * rdt_find_domain - Find a domain in a resource that matches input resource id + * + * Search resource r's domain list to find the resource id. If the resource + * id is found in a domain, return the domain. Otherwise, if requested by + * caller, return the first domain whose id is bigger than the input id. + * The domain list is sorted by id in ascending order. + */ +static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) +{ + struct rdt_domain *d; + struct list_head *l; + + if (id < 0) + return ERR_PTR(id); + + list_for_each(l, &r->domains) { + d = list_entry(l, struct rdt_domain, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +/* + * domain_add_cpu - Add a cpu to a resource's domain list. + * + * If an existing domain in the resource r's domain list matches the cpu's + * resource id, add the cpu in the domain. + * + * Otherwise, a new domain is allocated and inserted into the right position + * in the domain list sorted by id in ascending order. + * + * The order in the domain list is visible to users when we print entries + * in the schemata file and schemata input is validated to have the same order + * as this list.
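+ * + * For example, on a hypothetical two-socket system with one L3 cache per + * socket, the list holds domains 0 and 1 and a schemata line takes the + * form "L3:0=<mask>;1=<mask>".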
+ */ +static void domain_add_cpu(int cpu, struct rdt_resource *r) +{ + int i, id = get_cache_id(cpu, r->cache_level); + struct list_head *add_pos = NULL; + struct rdt_domain *d; + + d = rdt_find_domain(r, id, &add_pos); + if (IS_ERR(d)) { + pr_warn("Couldn't find cache id for cpu %d\n", cpu); + return; + } + + if (d) { + cpumask_set_cpu(cpu, &d->cpu_mask); + return; + } + + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); + if (!d) + return; + + d->id = id; + + d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); + if (!d->cbm) { + kfree(d); + return; + } + + for (i = 0; i < r->num_closid; i++) { + int idx = cbm_idx(r, i); + + d->cbm[i] = r->max_cbm; + wrmsrl(r->msr_base + idx, d->cbm[i]); + } + + cpumask_set_cpu(cpu, &d->cpu_mask); + list_add_tail(&d->list, add_pos); + r->num_domains++; +} + +static void domain_remove_cpu(int cpu, struct rdt_resource *r) +{ + int id = get_cache_id(cpu, r->cache_level); + struct rdt_domain *d; + + d = rdt_find_domain(r, id, NULL); + if (IS_ERR_OR_NULL(d)) { + pr_warn("Couldn't find cache id for cpu %d\n", cpu); + return; + } + + cpumask_clear_cpu(cpu, &d->cpu_mask); + if (cpumask_empty(&d->cpu_mask)) { + r->num_domains--; + kfree(d->cbm); + list_del(&d->list); + kfree(d); + } +} + +static void clear_closid(int cpu) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + + per_cpu(cpu_closid, cpu) = 0; + state->closid = 0; + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); +} + +static int intel_rdt_online_cpu(unsigned int cpu) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_add_cpu(cpu, r); + /* The cpu is set in default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + clear_closid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int intel_rdt_offline_cpu(unsigned int cpu) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + for_each_capable_rdt_resource(r) + domain_remove_cpu(cpu, r); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + break; + } + clear_closid(cpu); + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int __init intel_rdt_late_init(void) +{ + struct rdt_resource *r; + int state, ret; + + if (!get_rdt_resources()) + return -ENODEV; + + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "x86/rdt/cat:online:", + intel_rdt_online_cpu, intel_rdt_offline_cpu); + if (state < 0) + return state; + + ret = rdtgroup_init(); + if (ret) { + cpuhp_remove_state(state); + return ret; + } + + for_each_capable_rdt_resource(r) + pr_info("Intel RDT %s allocation detected\n", r->name); + + return 0; +} + +late_initcall(intel_rdt_late_init); diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c new file mode 100644 index 000000000000..8af04afdfcb9 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -0,0 +1,1115 @@ +/* + * User interface for Resource Allocation in Resource Director Technology (RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu <fenghua.yu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation.
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT can be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/task_work.h> + +#include <uapi/linux/magic.h> + +#include <asm/intel_rdt.h> +#include <asm/intel_rdt_common.h> + +DEFINE_STATIC_KEY_FALSE(rdt_enable_key); +struct kernfs_root *rdt_root; +struct rdtgroup rdtgroup_default; +LIST_HEAD(rdt_all_groups); + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, + * we can keep a bitmap of free CLOSIDs in a single integer. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set "current->closid" to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. + */ +static int closid_free_map; + +static void closid_init(void) +{ + struct rdt_resource *r; + int rdt_min_closid = 32; + + /* Compute rdt_min_closid across all resources */ + for_each_enabled_rdt_resource(r) + rdt_min_closid = min(rdt_min_closid, r->num_closid); + + closid_free_map = BIT_MASK(rdt_min_closid) - 1; + + /* CLOSID 0 is always reserved for the default group */ + closid_free_map &= ~1; +} + +int closid_alloc(void) +{ + int closid = ffs(closid_free_map); + + if (closid == 0) + return -ENOSPC; + closid--; + closid_free_map &= ~(1 << closid); + + return closid; +} + +static void closid_free(int closid) +{ + closid_free_map |= 1 << closid; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, + int len) +{ + struct rftype *rft; + int ret; + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) + kernfs_remove_by_name(kn, rft->name); + return ret; +} + +static int rdtgroup_seqfile_show(struct seq_file *m,
void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * This is safe against intel_rdt_sched_in() called from __switch_to() + * because __switch_to() is executed with interrupts disabled. A local call + * from rdt_update_closid() is protected against __switch_to() because + * preemption is disabled. + */ +static void rdt_update_cpu_closid(void *closid) +{ + if (closid) + this_cpu_write(cpu_closid, *(int *)closid); + /* + * We cannot unconditionally write the MSR because the current + * executing task might have its own closid selected. Just reuse + * the context switch code. + */ + intel_rdt_sched_in(); +} + +/* + * Update the PQR_ASSOC MSR on all cpus in @cpu_mask. + * + * Per task closids must have been set up before calling this function. + * + * The per cpu closids are updated with the smp function call, when @closid + * is not NULL. If @closid is NULL then all affected percpu closids must + * have been set up before calling this function.
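+ * + * E.g. rdtgroup_rmdir() first updates the per cpu closids of the removed + * group's CPUs and then calls this function with @closid == NULL.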
+ */ +static void +rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +{ + int cpu = get_cpu(); + + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_update_cpu_closid(closid); + smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + put_cpu(); +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask; + struct rdtgroup *rdtgrp, *r; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + ret = cpumask_parse(buf, newmask); + if (ret) + goto unlock; + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (cpumask_weight(tmpmask)) { + ret = -EINVAL; + goto unlock; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + ret = -EINVAL; + goto unlock; + } + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + rdt_update_closid(tmpmask, &rdtgroup_default.closid); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu closid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); + } + rdt_update_closid(tmpmask, &rdtgrp->closid); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + + return ret ?: nbytes; +} + +struct task_move_callback { + struct callback_head work; + struct rdtgroup *rdtgrp; +}; + +static void move_myself(struct callback_head *head) +{ + struct task_move_callback *callback; + struct rdtgroup *rdtgrp; + + callback = container_of(head, struct task_move_callback, work); + rdtgrp = callback->rdtgrp; + + /* + * If resource group was deleted before this task work callback + * was invoked, then assign the task to root group and free the + * resource group. + */ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + current->closid = 0; + kfree(rdtgrp); + } + + preempt_disable(); + /* update PQR_ASSOC MSR to make resource group go into effect */ + intel_rdt_sched_in(); + preempt_enable(); + + kfree(callback); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + struct task_move_callback *callback; + int ret; + + callback = kzalloc(sizeof(*callback), GFP_KERNEL); + if (!callback) + return -ENOMEM; + callback->work.func = move_myself; + callback->rdtgrp = rdtgrp; + + /* + * Take a refcount, so rdtgrp cannot be freed before the + * callback has been invoked. + */ + atomic_inc(&rdtgrp->waitcount); + ret = task_work_add(tsk, &callback->work, true); + if (ret) { + /* + * Task is exiting. Drop the refcount and free the callback. + * No need to check the refcount as the group cannot be + * deleted before the write function unlocks rdtgroup_mutex. 
+ */ + atomic_dec(&rdtgrp->waitcount); + kfree(callback); + } else { + tsk->closid = rdtgrp->closid; + } + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. + */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + ret = -EPERM; + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + pid_t pid; + + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) + return -EINVAL; + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) + ret = rdtgroup_move_task(pid, rdtgrp, of); + else + ret = -ENOENT; + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (t->closid == r->closid) + seq_printf(s, "%d\n", t->pid); + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* Files in each rdtgroup */ +static struct rftype rdtgroup_base_files[] = { + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + }, +}; + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_closid); + + return 0; +} + +static int rdt_cbm_mask_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->max_cbm); + + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->min_cbm_bits); + + return 0; +} + +/* rdtgroup information files for one cache resource. 
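These are created under the info/<resource>/ directories by rdtgroup_create_info_dir() below.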
*/ +static struct rftype res_info_files[] = { + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_cbm_mask_show, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + }, +}; + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct kernfs_node *kn_subdir; + struct rdt_resource *r; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + kernfs_get(kn_info); + + for_each_enabled_rdt_resource(r) { + kn_subdir = kernfs_create_dir(kn_info, r->name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) { + ret = PTR_ERR(kn_subdir); + goto out_destroy; + } + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + goto out_destroy; + ret = rdtgroup_add_files(kn_subdir, res_info_files, + ARRAY_SIZE(res_info_files)); + if (ret) + goto out_destroy; + kernfs_activate(kn_subdir); + } + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. + */ + kernfs_get(kn_info); + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static void l3_qos_cfg_update(void *arg) +{ + bool *enable = arg; + + wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); +} + +static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) +{ + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + list_for_each_entry(d, &r->domains, list) { + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + } + cpu = get_cpu(); + /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + l3_qos_cfg_update(&enable); + /* Update QOS_CFG MSR on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +static int cdp_enable(void) +{ + struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; + struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; + int ret; + + if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + return -EINVAL; + + ret = set_l3_qos_cfg(r_l3, true); + if (!ret) { + r_l3->enabled = false; + r_l3data->enabled = true; + r_l3code->enabled = true; + } + return ret; +} + +static void cdp_disable(void) +{ + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + + r->enabled = r->capable; + + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + set_l3_qos_cfg(r, false); + } +} + +static int parse_rdtgroupfs_options(char *data) +{ + char *token, *o = data; + int ret = 0; + + while ((token = strsep(&o, ",")) != NULL) { + if (!*token) + return -EINVAL; + + if (!strcmp(token, "cdp")) + ret = cdp_enable(); + } + + return ret; +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. 
Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || kn->parent == kn_info) + return NULL; + else + return kn->priv; + } else { + return kn->parent->priv; + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); + + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? */ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + kernfs_unbreak_active_protection(kn); + kernfs_put(kn); + kfree(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +static struct dentry *rdt_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct dentry *dentry; + int ret; + + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. + */ + if (static_branch_unlikely(&rdt_enable_key)) { + dentry = ERR_PTR(-EBUSY); + goto out; + } + + ret = parse_rdtgroupfs_options(data); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + closid_init(); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret) { + dentry = ERR_PTR(ret); + goto out_cdp; + } + + dentry = kernfs_mount(fs_type, flags, rdt_root, + RDTGROUP_SUPER_MAGIC, NULL); + if (IS_ERR(dentry)) + goto out_cdp; + + static_branch_enable(&rdt_enable_key); + goto out; + +out_cdp: + cdp_disable(); +out: + mutex_unlock(&rdtgroup_mutex); + + return dentry; +} + +static int reset_all_cbms(struct rdt_resource *r) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int i, cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.res = r; + msr_param.low = 0; + msr_param.high = r->num_closid; + + /* + * Disable resource control for this resource by setting all + * CBMs in all domains to the maximum mask value. Pick one CPU + * from each domain to update the MSRs below. + */ + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + + for (i = 0; i < r->num_closid; i++) + d->cbm[i] = r->max_cbm; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on all other cpus in cpu_mask. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). 
+ * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || t->closid == from->closid) { + t->closid = to->closid; +#ifdef CONFIG_SMP + /* + * This is safe on x86 w/o barriers as the ordering + * of writing to task_cpu() and t->on_cpu is + * reverse to the reading here. The detection is + * inaccurate as tasks might move or schedule + * before the smp function call takes place. In + * such a case the function call is pointless, but + * there is no other side effect. + */ + if (mask && t->on_cpu) + cpumask_set_cpu(task_cpu(t), mask); +#endif + } + } + read_unlock(&tasklist_lock); +} + +/* + * Forcibly remove all subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + get_online_cpus(); + rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + put_online_cpus(); + + kernfs_remove(kn_info); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + mutex_lock(&rdtgroup_mutex); + + /* Put everything back to default values. */ + for_each_enabled_rdt_resource(r) + reset_all_cbms(r); + cdp_disable(); + rmdir_all_sub(); + static_branch_disable(&rdt_enable_key); + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .mount = rdt_mount, + .kill_sb = rdt_kill_sb, +}; + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct rdtgroup *parent, *rdtgrp; + struct kernfs_node *kn; + int ret, closid; + + /* Only allow mkdir in the root directory */ + if (parent_kn != rdtgroup_default.kn) + return -EPERM; + + /* Do not accept '\n' to avoid an unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = rdtgroup_kn_lock_live(parent_kn); + if (!parent) { + ret = -ENODEV; + goto out_unlock; + } + + ret = closid_alloc(); + if (ret < 0) + goto out_unlock; + closid = ret; + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + goto out_closid_free; + } + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_cancel_ref; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn) call below.
Take one extra reference + * here, which will be dropped inside rdtgroup_kn_unlock(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = rdtgroup_add_files(kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + ret = 0; + goto out_unlock; + +out_destroy: + kernfs_remove(rdtgrp->kn); +out_cancel_ref: + list_del(&rdtgrp->rdtgroup_list); + kfree(rdtgrp); +out_closid_free: + closid_free(closid); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + int ret, cpu, closid = rdtgroup_default.closid; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(cpu_closid, cpu) = closid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + rdt_update_closid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + closid_free(rdtgrp->closid); + list_del(&rdtgrp->rdtgroup_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + ret = 0; +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + seq_puts(seq, ",cdp"); + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .show_options = rdtgroup_show_options, +}; + +static int __init rdtgroup_setup_root(void) +{ + int ret; + + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = 0; + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, + ARRAY_SIZE(rdtgroup_base_files)); + if (ret) { + kernfs_destroy_root(rdt_root); + goto out; + } + + rdtgroup_default.kn = rdt_root->kn; + kernfs_activate(rdtgroup_default.kn); + +out: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} + +/* + * rdtgroup_init - rdtgroup initialization + * + * Setup resctrl file system including set up root, create mount point, + * register rdtgroup filesystem, and initialize files under root directory. 
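+ * + * Typical usage once the filesystem is registered (illustrative; the + * group name and mask below are made up): + * # mount -t resctrl resctrl /sys/fs/resctrl + * # mkdir /sys/fs/resctrl/grp0 + * # echo "L3:0=ffff" > /sys/fs/resctrl/grp0/schemata + * # echo $$ > /sys/fs/resctrl/grp0/tasks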
+static int __init rdtgroup_setup_root(void)
+{
+	int ret;
+
+	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
+				      KERNFS_ROOT_CREATE_DEACTIVATED,
+				      &rdtgroup_default);
+	if (IS_ERR(rdt_root))
+		return PTR_ERR(rdt_root);
+
+	mutex_lock(&rdtgroup_mutex);
+
+	rdtgroup_default.closid = 0;
+	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
+
+	ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
+				 ARRAY_SIZE(rdtgroup_base_files));
+	if (ret) {
+		kernfs_destroy_root(rdt_root);
+		goto out;
+	}
+
+	rdtgroup_default.kn = rdt_root->kn;
+	kernfs_activate(rdtgroup_default.kn);
+
+out:
+	mutex_unlock(&rdtgroup_mutex);
+
+	return ret;
+}
+
+/*
+ * rdtgroup_init - rdtgroup initialization
+ *
+ * Set up the resctrl filesystem: set up the root, create the mount point,
+ * register the rdtgroup filesystem and initialize the files under the
+ * root directory.
+ *
+ * Return: 0 on success or -errno
+ */
+int __init rdtgroup_init(void)
+{
+	int ret = 0;
+
+	ret = rdtgroup_setup_root();
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_mount_point(fs_kobj, "resctrl");
+	if (ret)
+		goto cleanup_root;
+
+	ret = register_filesystem(&rdt_fs_type);
+	if (ret)
+		goto cleanup_mountpoint;
+
+	return 0;
+
+cleanup_mountpoint:
+	sysfs_remove_mount_point(fs_kobj, "resctrl");
+cleanup_root:
+	kernfs_destroy_root(rdt_root);
+
+	return ret;
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_schemata.c b/arch/x86/kernel/cpu/intel_rdt_schemata.c
new file mode 100644
index 000000000000..f369cb8db0d5
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_schemata.c
@@ -0,0 +1,245 @@
+/*
+ * Resource Director Technology (RDT)
+ * - Cache Allocation code.
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Authors:
+ *    Fenghua Yu <fenghua.yu@intel.com>
+ *    Tony Luck <tony.luck@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual June 2016, volume 3, section 17.17.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/kernfs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * Check whether a cache bit mask is valid. The SDM says:
+ *	Please note that all (and only) contiguous '1' combinations
+ *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
+ * Additionally Haswell requires at least two bits set.
+ */
+static bool cbm_validate(unsigned long var, struct rdt_resource *r)
+{
+	unsigned long first_bit, zero_bit;
+
+	if (var == 0 || var > r->max_cbm)
+		return false;
+
+	first_bit = find_first_bit(&var, r->cbm_len);
+	zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit);
+
+	if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len)
+		return false;
+
+	if ((zero_bit - first_bit) < r->min_cbm_bits)
+		return false;
+	return true;
+}
+
+/*
+ * Read one cache bit mask (hex). Check that it is valid for the current
+ * resource type.
+ */
+static int parse_cbm(char *buf, struct rdt_resource *r)
+{
+	unsigned long data;
+	int ret;
+
+	ret = kstrtoul(buf, 16, &data);
+	if (ret)
+		return ret;
+	if (!cbm_validate(data, r))
+		return -EINVAL;
+	r->tmp_cbms[r->num_tmp_cbms++] = data;
+
+	return 0;
+}
+
+/*
+ * For each domain in this resource we expect to find a series of:
+ *	id=mask
+ * separated by ";". The "id" is in decimal, and must appear in the
+ * right order.
+ */
+static int parse_line(char *line, struct rdt_resource *r)
+{
+	char *dom = NULL, *id;
+	struct rdt_domain *d;
+	unsigned long dom_id;
+
+	list_for_each_entry(d, &r->domains, list) {
+		dom = strsep(&line, ";");
+		if (!dom)
+			return -EINVAL;
+		id = strsep(&dom, "=");
+		if (kstrtoul(id, 10, &dom_id) || dom_id != d->id)
+			return -EINVAL;
+		if (parse_cbm(dom, r))
+			return -EINVAL;
+	}
+
+	/* Any garbage at the end of the line?
*/ + if (line && line[0]) + return -EINVAL; + return 0; +} + +static int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu, idx = 0; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + list_for_each_entry(d, &r->domains, list) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->cbm[msr_param.low] = r->tmp_cbms[idx++]; + } + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_cbm_update(&msr_param); + /* Update CBM on other cpus. */ + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); + put_cpu(); + + free_cpumask_var(cpu_mask); + + return 0; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int closid, ret = 0; + u32 *l3_cbms = NULL; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + closid = rdtgrp->closid; + + /* get scratch space to save all the masks while we validate input */ + for_each_enabled_rdt_resource(r) { + r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), + GFP_KERNEL); + if (!r->tmp_cbms) { + ret = -ENOMEM; + goto out; + } + r->num_tmp_cbms = 0; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strsep(&tok, ":"); + if (!tok) { + ret = -EINVAL; + goto out; + } + for_each_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && + closid < r->num_closid) { + ret = parse_line(tok, r); + if (ret) + goto out; + break; + } + } + if (!r->name) { + ret = -EINVAL; + goto out; + } + } + + /* Did the parser find all the masks we need? */ + for_each_enabled_rdt_resource(r) { + if (r->num_tmp_cbms != r->num_domains) { + ret = -EINVAL; + goto out; + } + } + + for_each_enabled_rdt_resource(r) { + ret = update_domains(r, closid); + if (ret) + goto out; + } + +out: + rdtgroup_kn_unlock(of->kn); + for_each_enabled_rdt_resource(r) { + kfree(r->tmp_cbms); + r->tmp_cbms = NULL; + } + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + + seq_printf(s, "%s:", r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int closid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + closid = rdtgrp->closid; + for_each_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index d1316f9c8329..d9794060fe22 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -20,12 +20,15 @@ struct cpuid_bit { /* Please keep the leaf sorted by cpuid_bit.level for faster search. 
*/ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 90de28841242..b467b14b03eb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -298,12 +298,13 @@ ENTRY(start_cpu) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - call 1f # put return address on stack for unwinder -1: xorq %rbp, %rbp # clear frame pointer + pushq $.Lafter_lret # put return address on stack for unwinder + xorq %rbp, %rbp # clear frame pointer movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +.Lafter_lret: ENDPROC(start_cpu) #include "verify_cpu.S" diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 43c36d8a6ae2..37363e46b1f0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -235,6 +235,7 @@ static inline void play_dead(void) void arch_cpu_idle_enter(void) { + tsc_verify_tsc_adjust(false); local_touch_nmi(); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d0d744108594..a0ac3e81518a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -53,6 +53,7 @@ #include <asm/debugreg.h> #include <asm/switch_to.h> #include <asm/vm86.h> +#include <asm/intel_rdt.h> void __show_regs(struct pt_regs *regs, int all) { @@ -296,5 +297,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) this_cpu_write(current_task, next_p); + /* Load the Intel cache allocation PQR MSR. */ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a76b65e3e615..a61e141b6891 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -49,6 +49,7 @@ #include <asm/switch_to.h> #include <asm/xen/hypervisor.h> #include <asm/vdso.h> +#include <asm/intel_rdt.h> __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -476,6 +477,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) loadsegment(ss, __KERNEL_DS); } + /* Load the Intel cache allocation PQR MSR. 
*/ + intel_rdt_sched_in(); + return prev_p; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0c37d4fd01b2..46732dc3b73c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -103,7 +103,6 @@ static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); static unsigned int logical_packages __read_mostly; -static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; @@ -273,9 +272,14 @@ static void notrace start_secondary(void *unused) cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); } -int topology_update_package_map(unsigned int apicid, unsigned int cpu) +/** + * topology_update_package_map - Update the physical to logical package map + * @pkg: The physical package id as retrieved via CPUID + * @cpu: The cpu for which this is updated + */ +int topology_update_package_map(unsigned int pkg, unsigned int cpu) { - unsigned int new, pkg = apicid >> boot_cpu_data.x86_coreid_bits; + unsigned int new; /* Called from early boot ? */ if (!physical_package_map) @@ -288,16 +292,17 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - if (logical_packages_frozen) { - physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package max\n", - apicid, pkg); + if (logical_packages >= __max_logical_packages) { + pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n", + logical_packages, cpu, __max_logical_packages); return -ENOSPC; } new = logical_packages++; - pr_info("APIC(%x) Converting physical %u to logical package %u\n", - apicid, pkg, new); + if (new != pkg) { + pr_info("CPU %u Converting physical %u to logical package %u\n", + cpu, pkg, new); + } physical_to_logical_pkg[pkg] = new; found: @@ -318,9 +323,9 @@ int topology_phys_to_logical_pkg(unsigned int phys_pkg) } EXPORT_SYMBOL(topology_phys_to_logical_pkg); -static void __init smp_init_package_map(void) +static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu) { - unsigned int ncpus, cpu; + unsigned int ncpus; size_t size; /* @@ -365,27 +370,9 @@ static void __init smp_init_package_map(void) size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - for_each_present_cpu(cpu) { - unsigned int apicid = apic->cpu_present_to_apicid(cpu); - - if (apicid == BAD_APICID || !apic->apic_id_valid(apicid)) - continue; - if (!topology_update_package_map(apicid, cpu)) - continue; - pr_warn("CPU %u APICId %x disabled\n", cpu, apicid); - per_cpu(x86_bios_cpu_apicid, cpu) = BAD_APICID; - set_cpu_possible(cpu, false); - set_cpu_present(cpu, false); - } - - if (logical_packages > __max_logical_packages) { - pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", - logical_packages, __max_logical_packages); - logical_packages_frozen = true; - __max_logical_packages = logical_packages; - } - pr_info("Max logical packages: %u\n", __max_logical_packages); + + topology_update_package_map(c->phys_proc_id, cpu); } void __init smp_store_boot_cpu_info(void) @@ -395,7 +382,7 @@ void __init smp_store_boot_cpu_info(void) *c = boot_cpu_data; c->cpu_index = id; - smp_init_package_map(); + smp_init_package_map(c, id); } /* @@ -1476,15 +1463,15 @@ __init void prefill_possible_map(void) possible = i; } + nr_cpu_ids = possible; + pr_info("Allowing %d CPUs, %d hotplug 
CPUs\n", possible, max_t(int, possible - num_processors, 0)); + reset_cpu_possible_mask(); + for (i = 0; i < possible; i++) set_cpu_possible(i, true); - for (; i < NR_CPUS; i++) - set_cpu_possible(i, false); - - nr_cpu_ids = possible; } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46b2f41f8b05..0aed75a1e31b 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -702,6 +702,20 @@ unsigned long native_calibrate_tsc(void) } } + /* + * TSC frequency determined by CPUID is a "hardware reported" + * frequency and is the most accurate one so far we have. This + * is considered a known frequency. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * For Atom SoCs TSC is the only reliable clocksource. + * Mark TSC reliable so no watchdog on it. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return crystal_khz * ebx_numerator / eax_denominator; } @@ -1043,18 +1057,20 @@ static void detect_art(void) if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; - cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, - &art_to_tsc_numerator, unused, unused+1); - - /* Don't enable ART in a VM, non-stop TSC required */ + /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - art_to_tsc_denominator < ART_MIN_DENOMINATOR) + !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return; - if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, + &art_to_tsc_numerator, unused, unused+1); + + if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; + rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); + /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); } @@ -1064,6 +1080,11 @@ static void detect_art(void) static struct clocksource clocksource_tsc; +static void tsc_resume(struct clocksource *cs) +{ + tsc_verify_tsc_adjust(true); +} + /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. This can be observed in a @@ -1096,6 +1117,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, }; void mark_tsc_unstable(char *reason) @@ -1283,10 +1305,10 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* - * Trust the results of the earlier calibration on systems - * exporting a reliable TSC. + * When TSC frequency is known (retrieved via MSR or CPUID), we skip + * the refined calibration and directly register it as a clocksource. */ - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } @@ -1363,6 +1385,8 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); + else + tsc_store_and_check_tsc_adjust(true); check_system_tsc_reliable(); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 0fe720d64fef..19afdbd7d0a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void) #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; #endif + + /* + * TSC frequency determined by MSR is always considered "known" + * because it is reported by HW. 
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 78083bf23ed1..d0db011051a5 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -14,18 +14,166 @@
  * ( The serial nature of the boot logic and the CPU hotplug lock
  *   protects against more than 2 CPUs entering this code. )
  */
+#include <linux/topology.h>
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/smp.h>
 #include <linux/nmi.h>
 #include <asm/tsc.h>
 
+struct tsc_adjust {
+	s64		bootval;
+	s64		adjusted;
+	unsigned long	nextcheck;
+	bool		warned;
+};
+
+static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
+
+void tsc_verify_tsc_adjust(bool resume)
+{
+	struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
+	s64 curval;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		return;
+
+	/* Rate limit the MSR check */
+	if (!resume && time_before(jiffies, adj->nextcheck))
+		return;
+
+	adj->nextcheck = jiffies + HZ;
+
+	rdmsrl(MSR_IA32_TSC_ADJUST, curval);
+	if (adj->adjusted == curval)
+		return;
+
+	/* Restore the original value */
+	wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);
+
+	if (!adj->warned || resume) {
+		pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
+			smp_processor_id(), adj->adjusted, curval);
+		adj->warned = true;
+	}
+}
+
+static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
+				   unsigned int cpu, bool bootcpu)
+{
+	/*
+	 * The first online CPU in a package stores the boot value in the
+	 * adjustment value. This value might change later via the sync
+	 * mechanism. If that fails we can still yell about boot values not
+	 * being consistent.
+	 *
+	 * On the boot cpu we just force set the ADJUST value to 0 if it's
+	 * non zero. We don't do that on non boot cpus because physical
+	 * hotplug should have set the ADJUST register to a value > 0 so
+	 * the TSC is in sync with the already running cpus.
+	 *
+	 * But we always force positive ADJUST values. Otherwise the TSC
+	 * deadline timer creates an interrupt storm. We also have to
+	 * prevent values > 0x7FFFFFFF as those wreck the timer as well.
+ */ + if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || + (bootval > 0x7FFFFFFF)) { + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, + bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } + cur->adjusted = bootval; +} + +#ifndef CONFIG_SMP +bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); + return false; +} + +#else /* !CONFIG_SMP */ + +/* + * Store and check the TSC ADJUST MSR if available + */ +bool tsc_store_and_check_tsc_adjust(bool bootcpu) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + unsigned int refcpu, cpu = smp_processor_id(); + struct cpumask *mask; + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return false; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + cur->warned = false; + + /* + * Check whether this CPU is the first in a package to come up. In + * this case do not check the boot value against another package + * because the new package might have been physically hotplugged, + * where TSC_ADJUST is expected to be different. When called on the + * boot CPU topology_core_cpumask() might not be available yet. + */ + mask = topology_core_cpumask(cpu); + refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids; + + if (refcpu >= nr_cpu_ids) { + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), + bootcpu); + return false; + } + + ref = per_cpu_ptr(&tsc_adjust, refcpu); + /* + * Compare the boot value and complain if it differs in the + * package. + */ + if (bootval != ref->bootval) { + pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->bootval, cpu, bootval); + } + /* + * The TSC_ADJUST values in a package must be the same. If the boot + * value on this newly upcoming CPU differs from the adjustment + * value of the already online CPU in this package, set it to that + * adjusted value. + */ + if (bootval != ref->adjusted) { + pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->adjusted, cpu, bootval); + cur->adjusted = ref->adjusted; + wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + } + /* + * We have the TSCs forced to be in sync on this package. Skip sync + * test: + */ + return true; +} + /* * Entry/exit counters that make sure that both CPUs * run the measurement code at once: */ static atomic_t start_count; static atomic_t stop_count; +static atomic_t skip_test; +static atomic_t test_runs; /* * We use a raw spinlock in this exceptional case, because @@ -37,15 +185,16 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static cycles_t last_tsc; static cycles_t max_warp; static int nr_warps; +static int random_warps; /* * TSC-warp measurement loop running on both CPUs. This is not called * if there is no TSC. 
 */
-static void check_tsc_warp(unsigned int timeout)
+static cycles_t check_tsc_warp(unsigned int timeout)
 {
-	cycles_t start, now, prev, end;
-	int i;
+	cycles_t start, now, prev, end, cur_max_warp = 0;
+	int i, cur_warps = 0;
 
 	start = rdtsc_ordered();
 	/*
@@ -85,13 +234,22 @@ static void check_tsc_warp(unsigned int timeout)
 		if (unlikely(prev > now)) {
 			arch_spin_lock(&sync_lock);
 			max_warp = max(max_warp, prev - now);
+			cur_max_warp = max_warp;
+			/*
+			 * Check whether this bounces back and forth. Only
+			 * one CPU should observe time going backwards.
+			 */
+			if (cur_warps != nr_warps)
+				random_warps++;
 			nr_warps++;
+			cur_warps = nr_warps;
 			arch_spin_unlock(&sync_lock);
 		}
 	}
 	WARN(!(now-start),
 		"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
 			now-start, end-start);
+	return cur_max_warp;
 }
 
 /*
@@ -136,15 +294,26 @@ void check_tsc_sync_source(int cpu)
 	}
 
 	/*
-	 * Reset it - in case this is a second bootup:
+	 * Set the maximum number of test runs to
+	 *  1 if the CPU does not provide the TSC_ADJUST MSR
+	 *  3 if the MSR is available, so the target can try to adjust
 	 */
-	atomic_set(&stop_count, 0);
-
+	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
+		atomic_set(&test_runs, 1);
+	else
+		atomic_set(&test_runs, 3);
+retry:
 	/*
-	 * Wait for the target to arrive:
+	 * Wait for the target to start or to skip the test:
 	 */
-	while (atomic_read(&start_count) != cpus-1)
+	while (atomic_read(&start_count) != cpus - 1) {
+		if (atomic_read(&skip_test) > 0) {
+			atomic_set(&skip_test, 0);
+			return;
+		}
 		cpu_relax();
+	}
+
 	/*
 	 * Trigger the target to continue into the measurement too:
 	 */
@@ -155,21 +324,35 @@ void check_tsc_sync_source(int cpu)
 	while (atomic_read(&stop_count) != cpus-1)
 		cpu_relax();
 
-	if (nr_warps) {
+	/*
+	 * If the test was successful set the number of runs to zero and
+	 * stop. If not, decrement the number of runs and check whether we
+	 * can retry. In case of random warps no retry is attempted.
+	 */
+	if (!nr_warps) {
+		atomic_set(&test_runs, 0);
+
+		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
+			smp_processor_id(), cpu);
+
+	} else if (atomic_dec_and_test(&test_runs) || random_warps) {
+		/* Force it to 0 if random warps brought us here */
+		atomic_set(&test_runs, 0);
+
 		pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
 			smp_processor_id(), cpu);
 		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
 			   "turning off TSC clock.\n", max_warp);
+		if (random_warps)
+			pr_warning("TSC warped randomly between CPUs\n");
 		mark_tsc_unstable("check_tsc_sync_source failed");
-	} else {
-		pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
-			smp_processor_id(), cpu);
 	}
 
 	/*
 	 * Reset it - just in case we boot another CPU later:
 	 */
 	atomic_set(&start_count, 0);
+	random_warps = 0;
 	nr_warps = 0;
 	max_warp = 0;
 	last_tsc = 0;
@@ -178,6 +361,12 @@ void check_tsc_sync_source(int cpu)
 	 * Let the target continue with the bootup:
 	 */
 	atomic_inc(&stop_count);
+
+	/*
+	 * Retry, if there is a chance to do so.
+	 */
+	if (atomic_read(&test_runs) > 0)
+		goto retry;
 }
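[Editor's aside, not part of the patch: the retry protocol above pairs with the target-side compensation added below, which folds the observed warp into the TSC_ADJUST value and clamps it. A distilled stand-alone model of that arithmetic, with hypothetical numbers:

    #include <stdint.h>
    #include <stdio.h>

    /* Model of the target-side step in check_tsc_sync_target() below. */
    static int64_t compensate(int64_t adjusted, int64_t cur_max_warp,
                              int64_t gbl_max_warp)
    {
            /*
             * A zero local warp means the *other* CPU saw time jump
             * backwards, so this TSC is ahead and must move backwards.
             */
            if (!cur_max_warp)
                    cur_max_warp = -gbl_max_warp;

            adjusted += cur_max_warp;

            /* Keep the value in the range the deadline timer tolerates. */
            if (adjusted < 0)
                    adjusted = 0;
            if (adjusted > 0x7FFFFFFF)
                    adjusted = 0x7FFFFFFF;
            return adjusted;
    }

    int main(void)
    {
            /* Target saw no warp; source measured a 220 cycle warp,
               so the target's adjust value moves from 1000 to 780. */
            printf("new adjust: %lld\n", (long long)compensate(1000, 0, 220));
            return 0;
    }
]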
 
 /*
@@ -185,6 +374,9 @@
  */
 void check_tsc_sync_target(void)
 {
+	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
+	unsigned int cpu = smp_processor_id();
+	cycles_t cur_max_warp, gbl_max_warp;
 	int cpus = 2;
 
 	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc())
 		return;
 
 	/*
+	 * Store, verify and sanitize the TSC adjust register. If
+	 * successful, skip the test.
+	 */
+	if (tsc_store_and_check_tsc_adjust(false)) {
+		atomic_inc(&skip_test);
+		return;
+	}
+
+retry:
+	/*
 	 * Register this CPU's participation and wait for the
 	 * source CPU to start the measurement:
 	 */
@@ -199,7 +401,12 @@
 	while (atomic_read(&start_count) != cpus)
 		cpu_relax();
 
-	check_tsc_warp(loop_timeout(smp_processor_id()));
+	cur_max_warp = check_tsc_warp(loop_timeout(cpu));
+
+	/*
+	 * Store the maximum observed warp value for a potential retry:
+	 */
+	gbl_max_warp = max_warp;
 
 	/*
 	 * Ok, we are done:
@@ -211,4 +418,61 @@
 	 */
 	while (atomic_read(&stop_count) != cpus)
 		cpu_relax();
+
+	/*
+	 * Reset it for the next sync test:
+	 */
+	atomic_set(&stop_count, 0);
+
+	/*
+	 * Check the number of remaining test runs. If not zero, the test
+	 * failed and a retry with adjusted TSC is possible. If zero the
+	 * test was either successful or failed terminally.
+	 */
+	if (!atomic_read(&test_runs))
+		return;
+
+	/*
+	 * If the warp value of this CPU is 0, then the other CPU
+	 * observed time going backwards so this TSC was ahead and
+	 * needs to move backwards.
+	 */
+	if (!cur_max_warp)
+		cur_max_warp = -gbl_max_warp;
+
+	/*
+	 * Add the result to the previous adjustment value.
+	 *
+	 * The adjustment value is slightly off by the overhead of the
+	 * sync mechanism (observed values are ~200 TSC cycles), but this
+	 * really depends on CPU, node distance and frequency. So
+	 * compensating for this is hard to get right. Experiments show
+	 * that the warp is no longer detectable when the observed warp
+	 * value is used. In the worst case the adjustment needs to go
+	 * through a 3rd run for fine tuning.
+	 */
+	cur->adjusted += cur_max_warp;
+
+	/*
+	 * The TSC deadline timer stops working or creates an interrupt
+	 * storm with adjust values < 0 and > 0x7FFFFFFF.
+	 *
+	 * To allow adjust values > 0x7FFFFFFF we need to disable the
+	 * deadline timer and use the local APIC timer, but that requires
+	 * more intrusive changes and we do not have any useful information
+	 * from Intel about the underlying HW wreckage yet.
+	 */
+	if (cur->adjusted < 0)
+		cur->adjusted = 0;
+	if (cur->adjusted > 0x7FFFFFFF)
+		cur->adjusted = 0x7FFFFFFF;
+
+	pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp.
Adjust: %lld\n", + cpu, cur_max_warp, cur->adjusted); + + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + goto retry; + } + +#endif /* CONFIG_SMP */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b2d3cf1ef54a..e85f6bd7b9d5 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -373,16 +373,17 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_cpuid_7_0_ebx_x86_features = F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | - F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(AVX512BW) | F(AVX512VL); + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + const u32 kvm_cpuid_7_0_ecx_x86_features = + F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/; /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 99cde5220e07..1572c35b4f1a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -852,6 +852,10 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; + mutex_lock(&kvm->arch.hyperv.hv_lock); + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) + goto out_unlock; + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; /* * Because the TSC parameters only vary when there is a @@ -859,7 +863,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, */ if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq)))) - return; + goto out_unlock; /* * While we're computing and writing the parameters, force the @@ -868,15 +872,15 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = 0; if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence))) - return; + goto out_unlock; if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref)) - return; + goto out_unlock; /* Ensure sequence is zero before writing the rest of the struct. */ smp_wmb(); if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref))) - return; + goto out_unlock; /* * Now switch to the TSC page mechanism by writing the sequence. 
@@ -891,6 +895,8 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, hv->tsc_ref.tsc_sequence = tsc_seq; kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)); +out_unlock: + mutex_unlock(&kvm->arch.hyperv.hv_lock); } static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, @@ -1142,9 +1148,9 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); r = kvm_hv_set_msr_pw(vcpu, msr, data, host); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else return kvm_hv_set_msr(vcpu, msr, data, host); @@ -1155,9 +1161,9 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) if (kvm_hv_msr_partition_wide(msr)) { int r; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); r = kvm_hv_get_msr_pw(vcpu, msr, pdata); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else return kvm_hv_get_msr(vcpu, msr, pdata); @@ -1165,7 +1171,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) bool kvm_hv_hypercall_enabled(struct kvm *kvm) { - return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE; + return READ_ONCE(kvm->arch.hyperv.hv_hypercall) & HV_X64_MSR_HYPERCALL_ENABLE; } static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index aae43c6f2472..24db5fb6f575 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1389,10 +1389,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12) return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR; } -static inline bool is_exception(u32 intr_info) +static inline bool is_nmi(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)) - == (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK); + == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK); } static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, @@ -5728,7 +5728,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (is_machine_check(intr_info)) return handle_machine_check(vcpu); - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) + if (is_nmi(intr_info)) return 1; /* already handled by vmx_vcpu_run() */ if (is_no_device(intr_info)) { @@ -7122,7 +7122,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); + VMXERR_VMPTRLD_VMXON_POINTER); return kvm_skip_emulated_instruction(vcpu); } break; @@ -8170,7 +8170,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: - if (!is_exception(intr_info)) + if (is_nmi(intr_info)) return false; else if (is_page_fault(intr_info)) return enable_ept; @@ -8765,8 +8765,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) { + if (is_nmi(exit_intr_info)) { kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); kvm_after_handle_nmi(&vmx->vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1f0d2383f5ee..445c51b6cf6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2844,7 
+2844,24 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + int idx; + /* + * Disable page faults because we're in atomic context here. + * kvm_write_guest_offset_cached() would call might_fault() + * that relies on pagefault_disable() to tell if there's a + * bug. NOTE: the write to guest memory may not go through if + * during postcopy live migration or if there's heavy guest + * paging. + */ + pagefault_disable(); + /* + * kvm_memslots() will be called by + * kvm_write_guest_offset_cached() so take the srcu lock. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -7881,6 +7898,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) raw_spin_lock_init(&kvm->arch.tsc_write_lock); mutex_init(&kvm->arch.apic_map_lock); + mutex_init(&kvm->arch.hyperv.hv_lock); spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); kvm->arch.kvmclock_offset = -ktime_get_boot_ns(); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 17c55a536fdd..e3254ca0eec4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -413,7 +413,7 @@ out: void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); } /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 14b9dd71d9e8..963895f9af7f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -89,10 +89,10 @@ static int __init nonx32_setup(char *str) __setup("noexec32=", nonx32_setup); /* - * When memory was added/removed make sure all the processes MM have + * When memory was added make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end, int removed) +void sync_global_pgds(unsigned long start, unsigned long end) { unsigned long address; @@ -100,12 +100,7 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - /* - * When it is called after memory hot remove, pgd_none() - * returns true. In this case (removed == 1), we must clear - * the PGD entries in the local PGD level page. 
- */ - if (pgd_none(*pgd_ref) && !removed) + if (pgd_none(*pgd_ref)) continue; spin_lock(&pgd_lock); @@ -122,13 +117,8 @@ void sync_global_pgds(unsigned long start, unsigned long end, int removed) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); - if (removed) { - if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) - pgd_clear(pgd); - } else { - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - } + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); spin_unlock(pgt_lock); } @@ -596,7 +586,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, } if (pgd_changed) - sync_global_pgds(vaddr_start, vaddr_end - 1, 0); + sync_global_pgds(vaddr_start, vaddr_end - 1); __flush_tlb_all(); @@ -1239,7 +1229,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) } else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1, 0); + sync_global_pgds(start, end - 1); return err; } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index e4f800999b32..324e5713d386 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -350,12 +350,12 @@ int mpx_enable_management(void) * The copy_xregs_to_kernel() beneath get_xsave_field_ptr() is * expected to be relatively expensive. Storing the bounds * directory here means that we do not have to do xsave in the - * unmap path; we can just use mm->bd_addr instead. + * unmap path; we can just use mm->context.bd_addr instead. */ bd_base = mpx_get_bounds_dir(); down_write(&mm->mmap_sem); - mm->bd_addr = bd_base; - if (mm->bd_addr == MPX_INVALID_BOUNDS_DIR) + mm->context.bd_addr = bd_base; + if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) ret = -ENXIO; up_write(&mm->mmap_sem); @@ -370,7 +370,7 @@ int mpx_disable_management(void) return -ENXIO; down_write(&mm->mmap_sem); - mm->bd_addr = MPX_INVALID_BOUNDS_DIR; + mm->context.bd_addr = MPX_INVALID_BOUNDS_DIR; up_write(&mm->mmap_sem); return 0; } @@ -947,7 +947,7 @@ static int try_unmap_single_bt(struct mm_struct *mm, end = bta_end_vaddr; } - bde_vaddr = mm->bd_addr + mpx_get_bd_entry_offset(mm, start); + bde_vaddr = mm->context.bd_addr + mpx_get_bd_entry_offset(mm, start); ret = get_bt_addr(mm, bde_vaddr, &bt_addr); /* * No bounds table there, so nothing to unmap. diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 3f35b48d1d9d..12dcad7297a5 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -19,7 +19,7 @@ #include "numa_internal.h" -int __initdata numa_off; +int numa_off; nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 3c3c19ea94df..184842ef332e 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -8,7 +8,6 @@ obj-y += iris/ obj-y += intel/ obj-y += intel-mid/ obj-y += intel-quark/ -obj-y += mellanox/ obj-y += olpc/ obj-y += scx200/ obj-y += sfi/ diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 1eb47b6298c2..e793fe509971 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void) fast_calibrate = ratio * fsb; pr_debug("read penwell tsc %lu khz\n", fast_calibrate); lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. 
+ */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index 59253db41bbc..e0607c77a1bd 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void) pr_debug("Setting lapic_timer_frequency = %d\n", lapic_timer_frequency); - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/mellanox/Makefile b/arch/x86/platform/mellanox/Makefile deleted file mode 100644 index f43c93188a1d..000000000000 --- a/arch/x86/platform/mellanox/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-$(CONFIG_MLX_PLATFORM) += mlx-platform.o diff --git a/arch/x86/platform/mellanox/mlx-platform.c b/arch/x86/platform/mellanox/mlx-platform.c deleted file mode 100644 index 7dcfcca97399..000000000000 --- a/arch/x86/platform/mellanox/mlx-platform.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * arch/x86/platform/mellanox/mlx-platform.c - * Copyright (c) 2016 Mellanox Technologies. All rights reserved. - * Copyright (c) 2016 Vadim Pasternak <vadimp@mellanox.com> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the names of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include <linux/device.h> -#include <linux/dmi.h> -#include <linux/i2c.h> -#include <linux/i2c-mux.h> -#include <linux/module.h> -#include <linux/platform_device.h> -#include <linux/platform_data/i2c-mux-reg.h> - -#define MLX_PLAT_DEVICE_NAME "mlxplat" - -/* LPC bus IO offsets */ -#define MLXPLAT_CPLD_LPC_I2C_BASE_ADRR 0x2000 -#define MLXPLAT_CPLD_LPC_REG_BASE_ADRR 0x2500 -#define MLXPLAT_CPLD_LPC_IO_RANGE 0x100 -#define MLXPLAT_CPLD_LPC_I2C_CH1_OFF 0xdb -#define MLXPLAT_CPLD_LPC_I2C_CH2_OFF 0xda -#define MLXPLAT_CPLD_LPC_PIO_OFFSET 0x10000UL -#define MLXPLAT_CPLD_LPC_REG1 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ - MLXPLAT_CPLD_LPC_I2C_CH1_OFF) | \ - MLXPLAT_CPLD_LPC_PIO_OFFSET) -#define MLXPLAT_CPLD_LPC_REG2 ((MLXPLAT_CPLD_LPC_REG_BASE_ADRR + \ - MLXPLAT_CPLD_LPC_I2C_CH2_OFF) | \ - MLXPLAT_CPLD_LPC_PIO_OFFSET) - -/* Start channel numbers */ -#define MLXPLAT_CPLD_CH1 2 -#define MLXPLAT_CPLD_CH2 10 - -/* Number of LPC attached MUX platform devices */ -#define MLXPLAT_CPLD_LPC_MUX_DEVS 2 - -/* mlxplat_priv - platform private data - * @pdev_i2c - i2c controller platform device - * @pdev_mux - array of mux platform devices - */ -struct mlxplat_priv { - struct platform_device *pdev_i2c; - struct platform_device *pdev_mux[MLXPLAT_CPLD_LPC_MUX_DEVS]; -}; - -/* Regions for LPC I2C controller and LPC base register space */ -static const struct resource mlxplat_lpc_resources[] = { - [0] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_I2C_BASE_ADRR, - MLXPLAT_CPLD_LPC_IO_RANGE, - "mlxplat_cpld_lpc_i2c_ctrl", IORESOURCE_IO), - [1] = DEFINE_RES_NAMED(MLXPLAT_CPLD_LPC_REG_BASE_ADRR, - MLXPLAT_CPLD_LPC_IO_RANGE, - "mlxplat_cpld_lpc_regs", - IORESOURCE_IO), -}; - -/* Platform default channels */ -static const int mlxplat_default_channels[][8] = { - { - MLXPLAT_CPLD_CH1, MLXPLAT_CPLD_CH1 + 1, MLXPLAT_CPLD_CH1 + 2, - MLXPLAT_CPLD_CH1 + 3, MLXPLAT_CPLD_CH1 + 4, MLXPLAT_CPLD_CH1 + - 5, MLXPLAT_CPLD_CH1 + 6, MLXPLAT_CPLD_CH1 + 7 - }, - { - MLXPLAT_CPLD_CH2, MLXPLAT_CPLD_CH2 + 1, MLXPLAT_CPLD_CH2 + 2, - MLXPLAT_CPLD_CH2 + 3, MLXPLAT_CPLD_CH2 + 4, MLXPLAT_CPLD_CH2 + - 5, MLXPLAT_CPLD_CH2 + 6, MLXPLAT_CPLD_CH2 + 7 - }, -}; - -/* Platform channels for MSN21xx system family */ -static const int mlxplat_msn21xx_channels[] = { 1, 2, 3, 4, 5, 6, 7, 8 }; - -/* Platform mux data */ -static struct i2c_mux_reg_platform_data mlxplat_mux_data[] = { - { - .parent = 1, - .base_nr = MLXPLAT_CPLD_CH1, - .write_only = 1, - .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG1, - .reg_size = 1, - .idle_in_use = 1, - }, - { - .parent = 1, - .base_nr = MLXPLAT_CPLD_CH2, - .write_only = 1, - .reg = (void __iomem *)MLXPLAT_CPLD_LPC_REG2, - .reg_size = 1, - .idle_in_use = 1, - }, - -}; - -static struct platform_device *mlxplat_dev; - -static int __init mlxplat_dmi_default_matched(const struct dmi_system_id *dmi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - mlxplat_mux_data[i].values = mlxplat_default_channels[i]; - mlxplat_mux_data[i].n_values = - ARRAY_SIZE(mlxplat_default_channels[i]); - } - - return 1; -}; - -static int __init mlxplat_dmi_msn21xx_matched(const struct dmi_system_id *dmi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - mlxplat_mux_data[i].values = mlxplat_msn21xx_channels; - mlxplat_mux_data[i].n_values = - ARRAY_SIZE(mlxplat_msn21xx_channels); - } - - return 1; -}; - -static struct dmi_system_id mlxplat_dmi_table[] __initdata = { - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN24"), 
- }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN27"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSB"), - }, - }, - { - .callback = mlxplat_dmi_default_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSX"), - }, - }, - { - .callback = mlxplat_dmi_msn21xx_matched, - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "Mellanox Technologies"), - DMI_MATCH(DMI_PRODUCT_NAME, "MSN21"), - }, - }, - { } -}; - -static int __init mlxplat_init(void) -{ - struct mlxplat_priv *priv; - int i, err; - - if (!dmi_check_system(mlxplat_dmi_table)) - return -ENODEV; - - mlxplat_dev = platform_device_register_simple(MLX_PLAT_DEVICE_NAME, -1, - mlxplat_lpc_resources, - ARRAY_SIZE(mlxplat_lpc_resources)); - - if (IS_ERR(mlxplat_dev)) - return PTR_ERR(mlxplat_dev); - - priv = devm_kzalloc(&mlxplat_dev->dev, sizeof(struct mlxplat_priv), - GFP_KERNEL); - if (!priv) { - err = -ENOMEM; - goto fail_alloc; - } - platform_set_drvdata(mlxplat_dev, priv); - - priv->pdev_i2c = platform_device_register_simple("i2c_mlxcpld", -1, - NULL, 0); - if (IS_ERR(priv->pdev_i2c)) { - err = PTR_ERR(priv->pdev_i2c); - goto fail_alloc; - }; - - for (i = 0; i < ARRAY_SIZE(mlxplat_mux_data); i++) { - priv->pdev_mux[i] = platform_device_register_resndata( - &mlxplat_dev->dev, - "i2c-mux-reg", i, NULL, - 0, &mlxplat_mux_data[i], - sizeof(mlxplat_mux_data[i])); - if (IS_ERR(priv->pdev_mux[i])) { - err = PTR_ERR(priv->pdev_mux[i]); - goto fail_platform_mux_register; - } - } - - return 0; - -fail_platform_mux_register: - for (i--; i > 0 ; i--) - platform_device_unregister(priv->pdev_mux[i]); - platform_device_unregister(priv->pdev_i2c); -fail_alloc: - platform_device_unregister(mlxplat_dev); - - return err; -} -module_init(mlxplat_init); - -static void __exit mlxplat_exit(void) -{ - struct mlxplat_priv *priv = platform_get_drvdata(mlxplat_dev); - int i; - - for (i = ARRAY_SIZE(mlxplat_mux_data) - 1; i >= 0 ; i--) - platform_device_unregister(priv->pdev_mux[i]); - - platform_device_unregister(priv->pdev_i2c); - platform_device_unregister(mlxplat_dev); -} -module_exit(mlxplat_exit); - -MODULE_AUTHOR("Vadim Pasternak (vadimp@mellanox.com)"); -MODULE_DESCRIPTION("Mellanox platform driver"); -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN24*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN27*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSB*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSX*:"); -MODULE_ALIAS("dmi:*:*Mellanox*:MSN21*:"); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 53cace2ec0e2..66ade16c7693 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); + tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 9fa27ceeecfd..311acad7dad2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -87,12 +87,6 @@ static void cpu_bringup(void) cpu_data(cpu).x86_max_cores = 1; set_cpu_sibling_map(cpu); - /* - * identify_cpu() may have set logical_pkg_id to -1 due - * to incorrect phys_proc_id. Let's re-comupte it. 
- */ - topology_update_package_map(apic->cpu_present_to_apicid(cpu), cpu); - xen_setup_cpu_clockevents(); notify_cpu_starting(cpu); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index f61058617ada..f4126cf997a4 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -15,6 +15,7 @@ config XTENSA select GENERIC_SCHED_CLOCK select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG + select HAVE_DMA_CONTIGUOUS select HAVE_EXIT_THREAD select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if !MMU diff --git a/arch/xtensa/boot/dts/kc705.dts b/arch/xtensa/boot/dts/kc705.dts index b1f4ee8c9a22..6106bdc097ad 100644 --- a/arch/xtensa/boot/dts/kc705.dts +++ b/arch/xtensa/boot/dts/kc705.dts @@ -11,4 +11,20 @@ device_type = "memory"; reg = <0x00000000 0x38000000>; }; + + reserved-memory { + #address-cells = <1>; + #size-cells = <1>; + ranges; + + /* global autoconfigured region for contiguous allocations */ + linux,cma { + compatible = "shared-dma-pool"; + reusable; + size = <0x04000000>; + alignment = <0x2000>; + alloc-ranges = <0x00000000 0x20000000>; + linux,cma-default; + }; + }; }; diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 28cf4c5d65ef..b7fbaa56b51a 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -3,6 +3,7 @@ generic-y += bug.h generic-y += clkdev.h generic-y += cputime.h generic-y += div64.h +generic-y += dma-contiguous.h generic-y += emergency-restart.h generic-y += errno.h generic-y += exec.h diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile index c31f5d5afc7d..264fb89c444e 100644 --- a/arch/xtensa/kernel/Makefile +++ b/arch/xtensa/kernel/Makefile @@ -14,6 +14,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += mcount.o obj-$(CONFIG_SMP) += smp.o mxhead.o obj-$(CONFIG_XTENSA_VARIANT_HAVE_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_S32C1I_SELFTEST) += s32c1i_selftest.o AFLAGS_head.o += -mtext-section-literals AFLAGS_mxhead.o += -mtext-section-literals diff --git a/arch/xtensa/kernel/pci-dma.c b/arch/xtensa/kernel/pci-dma.c index 6a16decf278f..70e362e6038e 100644 --- a/arch/xtensa/kernel/pci-dma.c +++ b/arch/xtensa/kernel/pci-dma.c @@ -15,6 +15,7 @@ * Joe Taylor <joe@tensilica.com, joetylr@yahoo.com> */ +#include <linux/dma-contiguous.h> #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/mm.h> @@ -146,6 +147,8 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, { unsigned long ret; unsigned long uncached = 0; + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; + struct page *page = NULL; /* ignore region speicifiers */ @@ -153,11 +156,18 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) flag |= GFP_DMA; - ret = (unsigned long)__get_free_pages(flag, get_order(size)); - if (ret == 0) + if (gfpflags_allow_blocking(flag)) + page = dma_alloc_from_contiguous(dev, count, get_order(size)); + + if (!page) + page = alloc_pages(flag, get_order(size)); + + if (!page) return NULL; + ret = (unsigned long)page_address(page); + /* We currently don't support coherent memory outside KSEG */ BUG_ON(ret < XCHAL_KSEG_CACHED_VADDR || @@ -170,16 +180,19 @@ static void *xtensa_dma_alloc(struct device *dev, size_t size, return (void *)uncached; } -static void xtensa_dma_free(struct device *hwdev, size_t size, void *vaddr, +static void xtensa_dma_free(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs) { unsigned long addr = (unsigned 
long)vaddr + XCHAL_KSEG_CACHED_VADDR - XCHAL_KSEG_BYPASS_VADDR; + struct page *page = virt_to_page(addr); + unsigned long count = PAGE_ALIGN(size) >> PAGE_SHIFT; BUG_ON(addr < XCHAL_KSEG_CACHED_VADDR || addr > XCHAL_KSEG_CACHED_VADDR + XCHAL_KSEG_SIZE - 1); - free_pages(addr, get_order(size)); + if (!dma_release_from_contiguous(dev, page, count)) + __free_pages(page, get_order(size)); } static dma_addr_t xtensa_map_page(struct device *dev, struct page *page, diff --git a/arch/xtensa/kernel/s32c1i_selftest.c b/arch/xtensa/kernel/s32c1i_selftest.c new file mode 100644 index 000000000000..07e56e3a9a8b --- /dev/null +++ b/arch/xtensa/kernel/s32c1i_selftest.c @@ -0,0 +1,128 @@ +/* + * S32C1I selftest. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2016 Cadence Design Systems Inc. + */ + +#include <linux/init.h> +#include <linux/kernel.h> + +#include <asm/traps.h> + +#if XCHAL_HAVE_S32C1I + +static int __initdata rcw_word, rcw_probe_pc, rcw_exc; + +/* + * Basic atomic compare-and-swap, that records PC of S32C1I for probing. + * + * If *v == cmp, set *v = set. Return previous *v. + */ +static inline int probed_compare_swap(int *v, int cmp, int set) +{ + int tmp; + + __asm__ __volatile__( + " movi %1, 1f\n" + " s32i %1, %4, 0\n" + " wsr %2, scompare1\n" + "1: s32c1i %0, %3, 0\n" + : "=a" (set), "=&a" (tmp) + : "a" (cmp), "a" (v), "a" (&rcw_probe_pc), "0" (set) + : "memory" + ); + return set; +} + +/* Handle probed exception */ + +static void __init do_probed_exception(struct pt_regs *regs, + unsigned long exccause) +{ + if (regs->pc == rcw_probe_pc) { /* exception on s32c1i ? */ + regs->pc += 3; /* skip the s32c1i instruction */ + rcw_exc = exccause; + } else { + do_unhandled(regs, exccause); + } +} + +/* Simple test of S32C1I (soc bringup assist) */ + +static int __init check_s32c1i(void) +{ + int n, cause1, cause2; + void *handbus, *handdata, *handaddr; /* temporarily saved handlers */ + + rcw_probe_pc = 0; + handbus = trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR, + do_probed_exception); + handdata = trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR, + do_probed_exception); + handaddr = trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR, + do_probed_exception); + + /* First try an S32C1I that does not store: */ + rcw_exc = 0; + rcw_word = 1; + n = probed_compare_swap(&rcw_word, 0, 2); + cause1 = rcw_exc; + + /* took exception? */ + if (cause1 != 0) { + /* unclean exception? */ + if (n != 2 || rcw_word != 1) + panic("S32C1I exception error"); + } else if (rcw_word != 1 || n != 1) { + panic("S32C1I compare error"); + } + + /* Then an S32C1I that stores: */ + rcw_exc = 0; + rcw_word = 0x1234567; + n = probed_compare_swap(&rcw_word, 0x1234567, 0xabcde); + cause2 = rcw_exc; + + if (cause2 != 0) { + /* unclean exception? 
diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c
index 88a044af7504..848e8568fb3c 100644
--- a/arch/xtensa/kernel/setup.c
+++ b/arch/xtensa/kernel/setup.c
@@ -31,10 +31,6 @@
 # include <linux/console.h>
 #endif

-#ifdef CONFIG_RTC
-# include <linux/timex.h>
-#endif
-
 #ifdef CONFIG_PROC_FS
 # include <linux/seq_file.h>
 #endif
@@ -48,24 +44,22 @@
 #include <asm/page.h>
 #include <asm/setup.h>
 #include <asm/param.h>
-#include <asm/traps.h>
 #include <asm/smp.h>
 #include <asm/sysmem.h>

 #include <platform/hardware.h>

 #if defined(CONFIG_VGA_CONSOLE) || defined(CONFIG_DUMMY_CONSOLE)
-struct screen_info screen_info = { 0, 24, 0, 0, 0, 80, 0, 0, 0, 24, 1, 16};
-#endif
-
-#ifdef CONFIG_BLK_DEV_FD
-extern struct fd_ops no_fd_ops;
-struct fd_ops *fd_ops;
+struct screen_info screen_info = {
+	.orig_x = 0,
+	.orig_y = 24,
+	.orig_video_cols = 80,
+	.orig_video_lines = 24,
+	.orig_video_isVGA = 1,
+	.orig_video_points = 16,
+};
 #endif

-extern struct rtc_ops no_rtc_ops;
-struct rtc_ops *rtc_ops;
-
 #ifdef CONFIG_BLK_DEV_INITRD
 extern unsigned long initrd_start;
 extern unsigned long initrd_end;
@@ -77,7 +71,6 @@ extern int initrd_below_start_ok;
 void *dtb_start = __dtb_start;
 #endif

-unsigned char aux_device_present;
 extern unsigned long loops_per_jiffy;

 /* Command line specified as configuration option. */
@@ -317,120 +310,6 @@ extern char _SecondaryResetVector_text_start;
 extern char _SecondaryResetVector_text_end;
 #endif

-
-#ifdef CONFIG_S32C1I_SELFTEST
-#if XCHAL_HAVE_S32C1I
-
-static int __initdata rcw_word, rcw_probe_pc, rcw_exc;
-
-/*
- * Basic atomic compare-and-swap, that records PC of S32C1I for probing.
- *
- * If *v == cmp, set *v = set.  Return previous *v.
- */
-static inline int probed_compare_swap(int *v, int cmp, int set)
-{
-	int tmp;
-
-	__asm__ __volatile__(
-			"	movi	%1, 1f\n"
-			"	s32i	%1, %4, 0\n"
-			"	wsr	%2, scompare1\n"
-			"1:	s32c1i	%0, %3, 0\n"
-			: "=a" (set), "=&a" (tmp)
-			: "a" (cmp), "a" (v), "a" (&rcw_probe_pc), "0" (set)
-			: "memory"
-			);
-	return set;
-}
-
-/* Handle probed exception */
-
-static void __init do_probed_exception(struct pt_regs *regs,
-		unsigned long exccause)
-{
-	if (regs->pc == rcw_probe_pc) {	/* exception on s32c1i ? */
-		regs->pc += 3;		/* skip the s32c1i instruction */
-		rcw_exc = exccause;
-	} else {
-		do_unhandled(regs, exccause);
-	}
-}
-
-/* Simple test of S32C1I (soc bringup assist) */
-
-static int __init check_s32c1i(void)
-{
-	int n, cause1, cause2;
-	void *handbus, *handdata, *handaddr; /* temporarily saved handlers */
-
-	rcw_probe_pc = 0;
-	handbus  = trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR,
-			do_probed_exception);
-	handdata = trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR,
-			do_probed_exception);
-	handaddr = trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR,
-			do_probed_exception);
-
-	/* First try an S32C1I that does not store: */
-	rcw_exc = 0;
-	rcw_word = 1;
-	n = probed_compare_swap(&rcw_word, 0, 2);
-	cause1 = rcw_exc;
-
-	/* took exception? */
-	if (cause1 != 0) {
-		/* unclean exception? */
-		if (n != 2 || rcw_word != 1)
-			panic("S32C1I exception error");
-	} else if (rcw_word != 1 || n != 1) {
-		panic("S32C1I compare error");
-	}
-
-	/* Then an S32C1I that stores: */
-	rcw_exc = 0;
-	rcw_word = 0x1234567;
-	n = probed_compare_swap(&rcw_word, 0x1234567, 0xabcde);
-	cause2 = rcw_exc;
-
-	if (cause2 != 0) {
-		/* unclean exception? */
-		if (n != 0xabcde || rcw_word != 0x1234567)
-			panic("S32C1I exception error (b)");
-	} else if (rcw_word != 0xabcde || n != 0x1234567) {
-		panic("S32C1I store error");
-	}
-
-	/* Verify consistency of exceptions: */
-	if (cause1 || cause2) {
-		pr_warn("S32C1I took exception %d, %d\n", cause1, cause2);
-		/* If emulation of S32C1I upon bus error gets implemented,
-		   we can get rid of this panic for single core (not SMP) */
-		panic("S32C1I exceptions not currently supported");
-	}
-	if (cause1 != cause2)
-		panic("inconsistent S32C1I exceptions");
-
-	trap_set_handler(EXCCAUSE_LOAD_STORE_ERROR, handbus);
-	trap_set_handler(EXCCAUSE_LOAD_STORE_DATA_ERROR, handdata);
-	trap_set_handler(EXCCAUSE_LOAD_STORE_ADDR_ERROR, handaddr);
-	return 0;
-}
-
-#else /* XCHAL_HAVE_S32C1I */
-
-/* This condition should not occur with a commercially deployed processor.
-   Display reminder for early engr test or demo chips / FPGA bitstreams */
-static int __init check_s32c1i(void)
-{
-	pr_warn("Processor configuration lacks atomic compare-and-swap support!\n");
-	return 0;
-}
-
-#endif /* XCHAL_HAVE_S32C1I */
-early_initcall(check_s32c1i);
-#endif /* CONFIG_S32C1I_SELFTEST */
-
 static inline int mem_reserve(unsigned long start, unsigned long end)
 {
 	return memblock_reserve(start, end - start);
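Besides removing the now-relocated selftest, the setup.c hunk above also swaps a positional initializer for screen_info for designated initializers. As a small aside on that style (struct demo_info is invented for this illustration), the designated form ties each value to a field by name, so it survives struct reordering and needs no padding zeros:

struct demo_info {
	unsigned char orig_x;
	unsigned char orig_y;
	unsigned short orig_video_cols;
};

/* fragile: meaning depends entirely on declaration order of the fields */
static struct demo_info a = { 0, 24, 80 };

/* robust: each value is bound to its field by name */
static struct demo_info b = {
	.orig_x = 0,
	.orig_y = 24,
	.orig_video_cols = 80,
};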
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 80e4cfb2471a..720fe4e8b497 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -26,6 +26,7 @@
 #include <linux/nodemask.h>
 #include <linux/mm.h>
 #include <linux/of_fdt.h>
+#include <linux/dma-contiguous.h>

 #include <asm/bootparam.h>
 #include <asm/page.h>
@@ -60,6 +61,7 @@ void __init bootmem_init(void)
 	max_low_pfn = min(max_pfn, MAX_LOW_PFN);
 	memblock_set_current_limit(PFN_PHYS(max_low_pfn));
+	dma_contiguous_reserve(PFN_PHYS(max_low_pfn));

 	memblock_dump_all();
 }
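A condensed model (not from the patch) of the ordering that the one-line bootmem_init() hook above relies on. The function calls are real; the comments are the point. The default region's size comes from the linux,cma device-tree node when present, otherwise from a cma= command-line override or CONFIG_CMA_SIZE_MBYTES.

#include <linux/dma-contiguous.h>
#include <linux/memblock.h>

void __init model_bootmem_init(void)	/* illustrative skeleton only */
{
	/* ... memory regions already registered with memblock ... */

	memblock_set_current_limit(PFN_PHYS(max_low_pfn));

	/*
	 * Carve the default CMA area out of memblock only after the
	 * lowmem limit is in place: dma_contiguous_reserve() takes that
	 * limit so the region stays where the kernel can map it.
	 */
	dma_contiguous_reserve(PFN_PHYS(max_low_pfn));

	memblock_dump_all();
}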