summaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild5
-rw-r--r--arch/x86/Kconfig225
-rw-r--r--arch/x86/Kconfig.debug11
-rw-r--r--arch/x86/Makefile14
-rw-r--r--arch/x86/entry/Makefile10
-rw-r--r--arch/x86/entry/calling.h (renamed from arch/x86/include/asm/calling.h)98
-rw-r--r--arch/x86/entry/entry_32.S1248
-rw-r--r--arch/x86/entry/entry_64.S (renamed from arch/x86/kernel/entry_64.S)1074
-rw-r--r--arch/x86/entry/entry_64_compat.S556
-rw-r--r--arch/x86/entry/syscall_32.c (renamed from arch/x86/kernel/syscall_32.c)6
-rw-r--r--arch/x86/entry/syscall_64.c (renamed from arch/x86/kernel/syscall_64.c)0
-rw-r--r--arch/x86/entry/syscalls/Makefile (renamed from arch/x86/syscalls/Makefile)4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl (renamed from arch/x86/syscalls/syscall_32.tbl)0
-rw-r--r--arch/x86/entry/syscalls/syscall_64.tbl (renamed from arch/x86/syscalls/syscall_64.tbl)0
-rw-r--r--arch/x86/entry/syscalls/syscallhdr.sh (renamed from arch/x86/syscalls/syscallhdr.sh)0
-rw-r--r--arch/x86/entry/syscalls/syscalltbl.sh (renamed from arch/x86/syscalls/syscalltbl.sh)0
-rw-r--r--arch/x86/entry/thunk_32.S (renamed from arch/x86/lib/thunk_32.S)15
-rw-r--r--arch/x86/entry/thunk_64.S (renamed from arch/x86/lib/thunk_64.S)46
-rw-r--r--arch/x86/entry/vdso/.gitignore (renamed from arch/x86/vdso/.gitignore)0
-rw-r--r--arch/x86/entry/vdso/Makefile (renamed from arch/x86/vdso/Makefile)0
-rwxr-xr-xarch/x86/entry/vdso/checkundef.sh (renamed from arch/x86/vdso/checkundef.sh)0
-rw-r--r--arch/x86/entry/vdso/vclock_gettime.c (renamed from arch/x86/vdso/vclock_gettime.c)0
-rw-r--r--arch/x86/entry/vdso/vdso-layout.lds.S (renamed from arch/x86/vdso/vdso-layout.lds.S)0
-rw-r--r--arch/x86/entry/vdso/vdso-note.S (renamed from arch/x86/vdso/vdso-note.S)0
-rw-r--r--arch/x86/entry/vdso/vdso.lds.S (renamed from arch/x86/vdso/vdso.lds.S)0
-rw-r--r--arch/x86/entry/vdso/vdso2c.c (renamed from arch/x86/vdso/vdso2c.c)0
-rw-r--r--arch/x86/entry/vdso/vdso2c.h (renamed from arch/x86/vdso/vdso2c.h)0
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c (renamed from arch/x86/vdso/vdso32-setup.c)0
-rw-r--r--arch/x86/entry/vdso/vdso32/.gitignore (renamed from arch/x86/vdso/vdso32/.gitignore)0
-rw-r--r--arch/x86/entry/vdso/vdso32/int80.S (renamed from arch/x86/vdso/vdso32/int80.S)0
-rw-r--r--arch/x86/entry/vdso/vdso32/note.S (renamed from arch/x86/vdso/vdso32/note.S)0
-rw-r--r--arch/x86/entry/vdso/vdso32/sigreturn.S (renamed from arch/x86/vdso/vdso32/sigreturn.S)0
-rw-r--r--arch/x86/entry/vdso/vdso32/syscall.S (renamed from arch/x86/vdso/vdso32/syscall.S)0
-rw-r--r--arch/x86/entry/vdso/vdso32/sysenter.S (renamed from arch/x86/vdso/vdso32/sysenter.S)0
-rw-r--r--arch/x86/entry/vdso/vdso32/vclock_gettime.c (renamed from arch/x86/vdso/vdso32/vclock_gettime.c)0
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso-fakesections.c (renamed from arch/x86/vdso/vdso32/vdso-fakesections.c)0
-rw-r--r--arch/x86/entry/vdso/vdso32/vdso32.lds.S (renamed from arch/x86/vdso/vdso32/vdso32.lds.S)0
-rw-r--r--arch/x86/entry/vdso/vdsox32.lds.S (renamed from arch/x86/vdso/vdsox32.lds.S)0
-rw-r--r--arch/x86/entry/vdso/vgetcpu.c (renamed from arch/x86/vdso/vgetcpu.c)0
-rw-r--r--arch/x86/entry/vdso/vma.c (renamed from arch/x86/vdso/vma.c)0
-rw-r--r--arch/x86/entry/vsyscall/Makefile7
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c (renamed from arch/x86/kernel/vsyscall_64.c)0
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_emu_64.S (renamed from arch/x86/kernel/vsyscall_emu_64.S)0
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_gtod.c (renamed from arch/x86/kernel/vsyscall_gtod.c)0
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_trace.h (renamed from arch/x86/kernel/vsyscall_trace.h)2
-rw-r--r--arch/x86/ia32/Makefile2
-rw-r--r--arch/x86/ia32/ia32entry.S591
-rw-r--r--arch/x86/include/asm/barrier.h4
-rw-r--r--arch/x86/include/asm/cacheflush.h6
-rw-r--r--arch/x86/include/asm/cmpxchg.h2
-rw-r--r--arch/x86/include/asm/dwarf2.h170
-rw-r--r--arch/x86/include/asm/entry_arch.h3
-rw-r--r--arch/x86/include/asm/frame.h7
-rw-r--r--arch/x86/include/asm/hardirq.h3
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/io.h9
-rw-r--r--arch/x86/include/asm/irq_vectors.h11
-rw-r--r--arch/x86/include/asm/kvm_host.h3
-rw-r--r--arch/x86/include/asm/mce.h28
-rw-r--r--arch/x86/include/asm/msr-index.h (renamed from arch/x86/include/uapi/asm/msr-index.h)3
-rw-r--r--arch/x86/include/asm/msr.h12
-rw-r--r--arch/x86/include/asm/mtrr.h15
-rw-r--r--arch/x86/include/asm/paravirt.h29
-rw-r--r--arch/x86/include/asm/paravirt_types.h10
-rw-r--r--arch/x86/include/asm/pat.h9
-rw-r--r--arch/x86/include/asm/pgtable.h8
-rw-r--r--arch/x86/include/asm/pgtable_types.h3
-rw-r--r--arch/x86/include/asm/proto.h10
-rw-r--r--arch/x86/include/asm/qspinlock.h57
-rw-r--r--arch/x86/include/asm/qspinlock_paravirt.h6
-rw-r--r--arch/x86/include/asm/segment.h14
-rw-r--r--arch/x86/include/asm/special_insns.h38
-rw-r--r--arch/x86/include/asm/spinlock.h5
-rw-r--r--arch/x86/include/asm/spinlock_types.h4
-rw-r--r--arch/x86/include/asm/topology.h2
-rw-r--r--arch/x86/include/asm/trace/irq_vectors.h6
-rw-r--r--arch/x86/include/asm/traps.h3
-rw-r--r--arch/x86/include/uapi/asm/msr.h2
-rw-r--r--arch/x86/include/uapi/asm/mtrr.h8
-rw-r--r--arch/x86/kernel/Makefile5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c2
-rw-r--r--arch/x86/kernel/cpu/amd.c6
-rw-r--r--arch/x86/kernel/cpu/common.c12
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c8
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c57
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c141
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c44
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c209
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c48
-rw-r--r--arch/x86/kernel/cpu/mtrr/mtrr.h2
-rw-r--r--arch/x86/kernel/crash.c1
-rw-r--r--arch/x86/kernel/early-quirks.c8
-rw-r--r--arch/x86/kernel/entry_32.S1401
-rw-r--r--arch/x86/kernel/head64.c2
-rw-r--r--arch/x86/kernel/head_32.S33
-rw-r--r--arch/x86/kernel/head_64.S20
-rw-r--r--arch/x86/kernel/i387.c15
-rw-r--r--arch/x86/kernel/irq.c6
-rw-r--r--arch/x86/kernel/irqinit.c4
-rw-r--r--arch/x86/kernel/kvm.c43
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c24
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c22
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c22
-rw-r--r--arch/x86/kernel/traps.c19
-rw-r--r--arch/x86/kvm/cpuid.c4
-rw-r--r--arch/x86/kvm/cpuid.h8
-rw-r--r--arch/x86/kvm/mmu.c16
-rw-r--r--arch/x86/kvm/mmu.h4
-rw-r--r--arch/x86/kvm/paging_tmpl.h7
-rw-r--r--arch/x86/kvm/svm.c1
-rw-r--r--arch/x86/kvm/vmx.c1
-rw-r--r--arch/x86/kvm/x86.c26
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S7
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S61
-rw-r--r--arch/x86/lib/checksum_32.S52
-rw-r--r--arch/x86/lib/clear_page_64.S7
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S12
-rw-r--r--arch/x86/lib/cmpxchg8b_emu.S11
-rw-r--r--arch/x86/lib/copy_page_64.S11
-rw-r--r--arch/x86/lib/copy_user_64.S15
-rw-r--r--arch/x86/lib/csum-copy_64.S17
-rw-r--r--arch/x86/lib/getuser.S13
-rw-r--r--arch/x86/lib/iomap_copy_64.S3
-rw-r--r--arch/x86/lib/memcpy_64.S3
-rw-r--r--arch/x86/lib/memmove_64.S3
-rw-r--r--arch/x86/lib/memset_64.S5
-rw-r--r--arch/x86/lib/msr-reg.S44
-rw-r--r--arch/x86/lib/putuser.S8
-rw-r--r--arch/x86/lib/rwsem.S49
-rw-r--r--arch/x86/mm/init.c6
-rw-r--r--arch/x86/mm/iomap_32.c12
-rw-r--r--arch/x86/mm/ioremap.c71
-rw-r--r--arch/x86/mm/pageattr-test.c1
-rw-r--r--arch/x86/mm/pageattr.c84
-rw-r--r--arch/x86/mm/pat.c337
-rw-r--r--arch/x86/mm/pat_internal.h2
-rw-r--r--arch/x86/mm/pat_rbtree.c6
-rw-r--r--arch/x86/mm/pgtable.c60
-rw-r--r--arch/x86/net/bpf_jit.S1
-rw-r--r--arch/x86/net/bpf_jit_comp.c7
-rw-r--r--arch/x86/pci/acpi.c13
-rw-r--r--arch/x86/pci/i386.c6
-rw-r--r--arch/x86/platform/Makefile1
-rw-r--r--arch/x86/platform/atom/Makefile1
-rw-r--r--arch/x86/platform/atom/punit_atom_debug.c183
-rw-r--r--arch/x86/um/Makefile2
-rw-r--r--arch/x86/um/asm/barrier.h3
-rw-r--r--arch/x86/xen/enlighten.c5
-rw-r--r--arch/x86/xen/p2m.c1
-rw-r--r--arch/x86/xen/spinlock.c64
-rw-r--r--arch/x86/xen/xen-asm_64.S6
154 files changed, 4126 insertions, 3699 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 3942f74c92d7..1538562cc720 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -1,3 +1,6 @@
+
+obj-y += entry/
+
obj-$(CONFIG_KVM) += kvm/
# Xen paravirtualization support
@@ -11,7 +14,7 @@ obj-y += kernel/
obj-y += mm/
obj-y += crypto/
-obj-y += vdso/
+
obj-$(CONFIG_IA32_EMULATION) += ia32/
obj-y += platform/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6bbb991d0f3c..7e39f9b22705 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -9,140 +9,141 @@ config 64BIT
config X86_32
def_bool y
depends on !64BIT
- select CLKSRC_I8253
- select HAVE_UID16
config X86_64
def_bool y
depends on 64BIT
- select X86_DEV_DMA_OPS
- select ARCH_USE_CMPXCHG_LOCKREF
- select HAVE_LIVEPATCH
### Arch settings
config X86
def_bool y
- select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
- select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
+ select ACPI_LEGACY_TABLES_LOOKUP if ACPI
+ select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
+ select ANON_INODES
+ select ARCH_CLOCKSOURCE_DATA
+ select ARCH_DISCARD_MEMBLOCK
+ select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+ select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_FAST_MULTIPLIER
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_SG_CHAIN
+ select ARCH_HAVE_NMI_SAFE_CMPXCHG
+ select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
- select HAVE_AOUT if X86_32
- select HAVE_UNSTABLE_SCHED_CLOCK
- select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
- select ARCH_SUPPORTS_INT128 if X86_64
- select HAVE_IDE
- select HAVE_OPROFILE
- select HAVE_PCSPKR_PLATFORM
- select HAVE_PERF_EVENTS
- select HAVE_IOREMAP_PROT
- select HAVE_KPROBES
- select HAVE_MEMBLOCK
- select HAVE_MEMBLOCK_NODE_MAP
- select ARCH_DISCARD_MEMBLOCK
- select ARCH_WANT_OPTIONAL_GPIOLIB
+ select ARCH_SUPPORTS_ATOMIC_RMW
+ select ARCH_SUPPORTS_INT128 if X86_64
+ select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
+ select ARCH_USE_BUILTIN_BSWAP
+ select ARCH_USE_CMPXCHG_LOCKREF if X86_64
+ select ARCH_USE_QUEUED_RWLOCKS
+ select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_FRAME_POINTERS
- select HAVE_DMA_ATTRS
- select HAVE_DMA_CONTIGUOUS
- select HAVE_KRETPROBES
+ select ARCH_WANT_IPC_PARSE_VERSION if X86_32
+ select ARCH_WANT_OPTIONAL_GPIOLIB
+ select BUILDTIME_EXTABLE_SORT
+ select CLKEVT_I8253
+ select CLKSRC_I8253 if X86_32
+ select CLOCKSOURCE_VALIDATE_LAST_CYCLE
+ select CLOCKSOURCE_WATCHDOG
+ select CLONE_BACKWARDS if X86_32
+ select COMPAT_OLD_SIGACTION if IA32_EMULATION
+ select DCACHE_WORD_ACCESS
+ select GENERIC_CLOCKEVENTS
+ select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
+ select GENERIC_CLOCKEVENTS_MIN_ADJUST
+ select GENERIC_CMOS_UPDATE
+ select GENERIC_CPU_AUTOPROBE
select GENERIC_EARLY_IOREMAP
- select HAVE_OPTPROBES
- select HAVE_KPROBES_ON_FTRACE
- select HAVE_FTRACE_MCOUNT_RECORD
- select HAVE_FENTRY if X86_64
+ select GENERIC_FIND_FIRST_BIT
+ select GENERIC_IOMAP
+ select GENERIC_IRQ_PROBE
+ select GENERIC_IRQ_SHOW
+ select GENERIC_PENDING_IRQ if SMP
+ select GENERIC_SMP_IDLE_THREAD
+ select GENERIC_STRNCPY_FROM_USER
+ select GENERIC_STRNLEN_USER
+ select GENERIC_TIME_VSYSCALL
+ select HAVE_ACPI_APEI if ACPI
+ select HAVE_ACPI_APEI_NMI if ACPI
+ select HAVE_ALIGNED_STRUCT_PAGE if SLUB
+ select HAVE_AOUT if X86_32
+ select HAVE_ARCH_AUDITSYSCALL
+ select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
+ select HAVE_ARCH_JUMP_LABEL
+ select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+ select HAVE_ARCH_KGDB
+ select HAVE_ARCH_KMEMCHECK
+ select HAVE_ARCH_SECCOMP_FILTER
+ select HAVE_ARCH_SOFT_DIRTY if X86_64
+ select HAVE_ARCH_TRACEHOOK
+ select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ select HAVE_BPF_JIT if X86_64
+ select HAVE_CC_STACKPROTECTOR
+ select HAVE_CMPXCHG_DOUBLE
+ select HAVE_CMPXCHG_LOCAL
+ select HAVE_CONTEXT_TRACKING if X86_64
select HAVE_C_RECORDMCOUNT
+ select HAVE_DEBUG_KMEMLEAK
+ select HAVE_DEBUG_STACKOVERFLOW
+ select HAVE_DMA_API_DEBUG
+ select HAVE_DMA_ATTRS
+ select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS
- select HAVE_FUNCTION_TRACER
- select HAVE_FUNCTION_GRAPH_TRACER
- select HAVE_FUNCTION_GRAPH_FP_TEST
- select HAVE_SYSCALL_TRACEPOINTS
- select SYSCTL_EXCEPTION_TRACE
- select HAVE_KVM
- select HAVE_ARCH_KGDB
- select HAVE_ARCH_TRACEHOOK
- select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_EFFICIENT_UNALIGNED_ACCESS
- select USER_STACKTRACE_SUPPORT
- select HAVE_REGS_AND_STACK_ACCESS_API
- select HAVE_DMA_API_DEBUG
- select HAVE_KERNEL_GZIP
+ select HAVE_FENTRY if X86_64
+ select HAVE_FTRACE_MCOUNT_RECORD
+ select HAVE_FUNCTION_GRAPH_FP_TEST
+ select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_FUNCTION_TRACER
+ select HAVE_GENERIC_DMA_COHERENT if X86_32
+ select HAVE_HW_BREAKPOINT
+ select HAVE_IDE
+ select HAVE_IOREMAP_PROT
+ select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
+ select HAVE_IRQ_TIME_ACCOUNTING
select HAVE_KERNEL_BZIP2
+ select HAVE_KERNEL_GZIP
+ select HAVE_KERNEL_LZ4
select HAVE_KERNEL_LZMA
- select HAVE_KERNEL_XZ
select HAVE_KERNEL_LZO
- select HAVE_KERNEL_LZ4
- select HAVE_HW_BREAKPOINT
+ select HAVE_KERNEL_XZ
+ select HAVE_KPROBES
+ select HAVE_KPROBES_ON_FTRACE
+ select HAVE_KRETPROBES
+ select HAVE_KVM
+ select HAVE_LIVEPATCH if X86_64
+ select HAVE_MEMBLOCK
+ select HAVE_MEMBLOCK_NODE_MAP
select HAVE_MIXED_BREAKPOINTS_REGS
- select PERF_EVENTS
+ select HAVE_OPROFILE
+ select HAVE_OPTPROBES
+ select HAVE_PCSPKR_PLATFORM
+ select HAVE_PERF_EVENTS
select HAVE_PERF_EVENTS_NMI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select HAVE_DEBUG_KMEMLEAK
- select ANON_INODES
- select HAVE_ALIGNED_STRUCT_PAGE if SLUB
- select HAVE_CMPXCHG_LOCAL
- select HAVE_CMPXCHG_DOUBLE
- select HAVE_ARCH_KMEMCHECK
- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+ select HAVE_REGS_AND_STACK_ACCESS_API
+ select HAVE_SYSCALL_TRACEPOINTS
+ select HAVE_UID16 if X86_32
+ select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_USER_RETURN_NOTIFIER
- select ARCH_HAS_ELF_RANDOMIZE
- select HAVE_ARCH_JUMP_LABEL
- select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
- select SPARSE_IRQ
- select GENERIC_FIND_FIRST_BIT
- select GENERIC_IRQ_PROBE
- select GENERIC_PENDING_IRQ if SMP
- select GENERIC_IRQ_SHOW
- select GENERIC_CLOCKEVENTS_MIN_ADJUST
select IRQ_FORCED_THREADING
- select HAVE_BPF_JIT if X86_64
- select HAVE_ARCH_TRANSPARENT_HUGEPAGE
- select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
- select ARCH_HAS_SG_CHAIN
- select CLKEVT_I8253
- select ARCH_HAVE_NMI_SAFE_CMPXCHG
- select GENERIC_IOMAP
- select DCACHE_WORD_ACCESS
- select GENERIC_SMP_IDLE_THREAD
- select ARCH_WANT_IPC_PARSE_VERSION if X86_32
- select HAVE_ARCH_SECCOMP_FILTER
- select BUILDTIME_EXTABLE_SORT
- select GENERIC_CMOS_UPDATE
- select HAVE_ARCH_SOFT_DIRTY if X86_64
- select CLOCKSOURCE_WATCHDOG
- select GENERIC_CLOCKEVENTS
- select ARCH_CLOCKSOURCE_DATA
- select CLOCKSOURCE_VALIDATE_LAST_CYCLE
- select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC)
- select GENERIC_TIME_VSYSCALL
- select GENERIC_STRNCPY_FROM_USER
- select GENERIC_STRNLEN_USER
- select HAVE_CONTEXT_TRACKING if X86_64
- select HAVE_IRQ_TIME_ACCOUNTING
- select VIRT_TO_BUS
- select MODULES_USE_ELF_REL if X86_32
- select MODULES_USE_ELF_RELA if X86_64
- select CLONE_BACKWARDS if X86_32
- select ARCH_USE_BUILTIN_BSWAP
- select ARCH_USE_QUEUE_RWLOCK
- select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
- select OLD_SIGACTION if X86_32
- select COMPAT_OLD_SIGACTION if IA32_EMULATION
+ select MODULES_USE_ELF_RELA if X86_64
+ select MODULES_USE_ELF_REL if X86_32
+ select OLD_SIGACTION if X86_32
+ select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION
+ select PERF_EVENTS
select RTC_LIB
- select HAVE_DEBUG_STACKOVERFLOW
- select HAVE_IRQ_EXIT_ON_IRQ_STACK if X86_64
- select HAVE_CC_STACKPROTECTOR
- select GENERIC_CPU_AUTOPROBE
- select HAVE_ARCH_AUDITSYSCALL
- select ARCH_SUPPORTS_ATOMIC_RMW
- select HAVE_ACPI_APEI if ACPI
- select HAVE_ACPI_APEI_NMI if ACPI
- select ACPI_LEGACY_TABLES_LOOKUP if ACPI
- select X86_FEATURE_NAMES if PROC_FS
+ select SPARSE_IRQ
select SRCU
+ select SYSCTL_EXCEPTION_TRACE
+ select USER_STACKTRACE_SUPPORT
+ select VIRT_TO_BUS
+ select X86_DEV_DMA_OPS if X86_64
+ select X86_FEATURE_NAMES if PROC_FS
config INSTRUCTION_DECODER
def_bool y
@@ -260,10 +261,6 @@ config X86_64_SMP
def_bool y
depends on X86_64 && SMP
-config X86_HT
- def_bool y
- depends on SMP
-
config X86_32_LAZY_GS
def_bool y
depends on X86_32 && !CC_STACKPROTECTOR
@@ -441,6 +438,7 @@ config X86_UV
depends on X86_EXTENDED_PLATFORM
depends on NUMA
depends on X86_X2APIC
+ depends on PCI
---help---
This option is needed in order to support SGI Ultraviolet systems.
If you don't have one of these, you should say N here.
@@ -665,7 +663,7 @@ config PARAVIRT_DEBUG
config PARAVIRT_SPINLOCKS
bool "Paravirtualization layer for spinlocks"
depends on PARAVIRT && SMP
- select UNINLINE_SPIN_UNLOCK
+ select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS
---help---
Paravirtualized spinlocks allow a pvops backend to replace the
spinlock implementation with something virtualization-friendly
@@ -850,11 +848,12 @@ config NR_CPUS
default "1" if !SMP
default "8192" if MAXSMP
default "32" if SMP && X86_BIGSMP
- default "8" if SMP
+ default "8" if SMP && X86_32
+ default "64" if SMP
---help---
This allows you to specify the maximum number of CPUs which this
kernel will support. If CPUMASK_OFFSTACK is enabled, the maximum
- supported value is 4096, otherwise the maximum value is 512. The
+ supported value is 8192, otherwise the maximum value is 512. The
minimum value which makes sense is 2.
This is purely to save memory - each supported CPU adds
@@ -862,7 +861,7 @@ config NR_CPUS
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
- depends on X86_HT
+ depends on SMP
---help---
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -872,7 +871,7 @@ config SCHED_SMT
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
- depends on X86_HT
+ depends on SMP
---help---
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 72484a645f05..a5973f851750 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -332,4 +332,15 @@ config X86_DEBUG_STATIC_CPU_HAS
If unsure, say N.
+config PUNIT_ATOM_DEBUG
+ tristate "ATOM Punit debug driver"
+ select DEBUG_FS
+ select IOSF_MBI
+ ---help---
+ This is a debug driver, which gets the power states
+ of all Punit North Complex devices. The power states of
+ each device is exposed as part of the debugfs interface.
+ The current power state can be read from
+ /sys/kernel/debug/punit_atom/dev_power_state
+
endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 57996ee840dd..118e6debc483 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -149,12 +149,6 @@ endif
sp-$(CONFIG_X86_32) := esp
sp-$(CONFIG_X86_64) := rsp
-# do binutils support CFI?
-cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1)
-# is .cfi_signal_frame supported too?
-cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
-cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
-
# does binutils support specific instructions?
asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1)
@@ -162,8 +156,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1)
avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1)
avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1)
-KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
-KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
+KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr)
LDFLAGS := -m elf_$(UTS_MACHINE)
@@ -187,7 +181,7 @@ archscripts: scripts_basic
# Syscall table generation
archheaders:
- $(Q)$(MAKE) $(build)=arch/x86/syscalls all
+ $(Q)$(MAKE) $(build)=arch/x86/entry/syscalls all
archprepare:
ifeq ($(CONFIG_KEXEC_FILE),y)
@@ -250,7 +244,7 @@ install:
PHONY += vdso_install
vdso_install:
- $(Q)$(MAKE) $(build)=arch/x86/vdso $@
+ $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@
archclean:
$(Q)rm -rf $(objtree)/arch/i386
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
new file mode 100644
index 000000000000..7a144971db79
--- /dev/null
+++ b/arch/x86/entry/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the x86 low level entry code
+#
+obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+
+obj-y += vdso/
+obj-y += vsyscall/
+
+obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o
+
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/entry/calling.h
index 1c8b50edb2db..f4e6308c4200 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/entry/calling.h
@@ -46,8 +46,6 @@ For 32-bit we have the following conventions - kernel is built with
*/
-#include <asm/dwarf2.h>
-
#ifdef CONFIG_X86_64
/*
@@ -91,28 +89,27 @@ For 32-bit we have the following conventions - kernel is built with
#define SIZEOF_PTREGS 21*8
.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
- subq $15*8+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+ addq $-(15*8+\addskip), %rsp
.endm
.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
.if \r11
- movq_cfi r11, 6*8+\offset
+ movq %r11, 6*8+\offset(%rsp)
.endif
.if \r8910
- movq_cfi r10, 7*8+\offset
- movq_cfi r9, 8*8+\offset
- movq_cfi r8, 9*8+\offset
+ movq %r10, 7*8+\offset(%rsp)
+ movq %r9, 8*8+\offset(%rsp)
+ movq %r8, 9*8+\offset(%rsp)
.endif
.if \rax
- movq_cfi rax, 10*8+\offset
+ movq %rax, 10*8+\offset(%rsp)
.endif
.if \rcx
- movq_cfi rcx, 11*8+\offset
+ movq %rcx, 11*8+\offset(%rsp)
.endif
- movq_cfi rdx, 12*8+\offset
- movq_cfi rsi, 13*8+\offset
- movq_cfi rdi, 14*8+\offset
+ movq %rdx, 12*8+\offset(%rsp)
+ movq %rsi, 13*8+\offset(%rsp)
+ movq %rdi, 14*8+\offset(%rsp)
.endm
.macro SAVE_C_REGS offset=0
SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
@@ -131,24 +128,24 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro SAVE_EXTRA_REGS offset=0
- movq_cfi r15, 0*8+\offset
- movq_cfi r14, 1*8+\offset
- movq_cfi r13, 2*8+\offset
- movq_cfi r12, 3*8+\offset
- movq_cfi rbp, 4*8+\offset
- movq_cfi rbx, 5*8+\offset
+ movq %r15, 0*8+\offset(%rsp)
+ movq %r14, 1*8+\offset(%rsp)
+ movq %r13, 2*8+\offset(%rsp)
+ movq %r12, 3*8+\offset(%rsp)
+ movq %rbp, 4*8+\offset(%rsp)
+ movq %rbx, 5*8+\offset(%rsp)
.endm
.macro SAVE_EXTRA_REGS_RBP offset=0
- movq_cfi rbp, 4*8+\offset
+ movq %rbp, 4*8+\offset(%rsp)
.endm
.macro RESTORE_EXTRA_REGS offset=0
- movq_cfi_restore 0*8+\offset, r15
- movq_cfi_restore 1*8+\offset, r14
- movq_cfi_restore 2*8+\offset, r13
- movq_cfi_restore 3*8+\offset, r12
- movq_cfi_restore 4*8+\offset, rbp
- movq_cfi_restore 5*8+\offset, rbx
+ movq 0*8+\offset(%rsp), %r15
+ movq 1*8+\offset(%rsp), %r14
+ movq 2*8+\offset(%rsp), %r13
+ movq 3*8+\offset(%rsp), %r12
+ movq 4*8+\offset(%rsp), %rbp
+ movq 5*8+\offset(%rsp), %rbx
.endm
.macro ZERO_EXTRA_REGS
@@ -162,24 +159,24 @@ For 32-bit we have the following conventions - kernel is built with
.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
- movq_cfi_restore 6*8, r11
+ movq 6*8(%rsp), %r11
.endif
.if \rstor_r8910
- movq_cfi_restore 7*8, r10
- movq_cfi_restore 8*8, r9
- movq_cfi_restore 9*8, r8
+ movq 7*8(%rsp), %r10
+ movq 8*8(%rsp), %r9
+ movq 9*8(%rsp), %r8
.endif
.if \rstor_rax
- movq_cfi_restore 10*8, rax
+ movq 10*8(%rsp), %rax
.endif
.if \rstor_rcx
- movq_cfi_restore 11*8, rcx
+ movq 11*8(%rsp), %rcx
.endif
.if \rstor_rdx
- movq_cfi_restore 12*8, rdx
+ movq 12*8(%rsp), %rdx
.endif
- movq_cfi_restore 13*8, rsi
- movq_cfi_restore 14*8, rdi
+ movq 13*8(%rsp), %rsi
+ movq 14*8(%rsp), %rdi
.endm
.macro RESTORE_C_REGS
RESTORE_C_REGS_HELPER 1,1,1,1,1
@@ -204,8 +201,7 @@ For 32-bit we have the following conventions - kernel is built with
.endm
.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
- addq $15*8+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
+ subq $-(15*8+\addskip), %rsp
.endm
.macro icebp
@@ -224,23 +220,23 @@ For 32-bit we have the following conventions - kernel is built with
*/
.macro SAVE_ALL
- pushl_cfi_reg eax
- pushl_cfi_reg ebp
- pushl_cfi_reg edi
- pushl_cfi_reg esi
- pushl_cfi_reg edx
- pushl_cfi_reg ecx
- pushl_cfi_reg ebx
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
.endm
.macro RESTORE_ALL
- popl_cfi_reg ebx
- popl_cfi_reg ecx
- popl_cfi_reg edx
- popl_cfi_reg esi
- popl_cfi_reg edi
- popl_cfi_reg ebp
- popl_cfi_reg eax
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
.endm
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
new file mode 100644
index 000000000000..21dc60a60b5f
--- /dev/null
+++ b/arch/x86/entry/entry_32.S
@@ -0,0 +1,1248 @@
+/*
+ * Copyright (C) 1991,1992 Linus Torvalds
+ *
+ * entry_32.S contains the system-call and low-level fault and trap handling routines.
+ *
+ * Stack layout in 'syscall_exit':
+ * ptrace needs to have all registers on the stack.
+ * If the order here is changed, it needs to be
+ * updated in fork.c:copy_process(), signal.c:do_signal(),
+ * ptrace.c and ptrace.h
+ *
+ * 0(%esp) - %ebx
+ * 4(%esp) - %ecx
+ * 8(%esp) - %edx
+ * C(%esp) - %esi
+ * 10(%esp) - %edi
+ * 14(%esp) - %ebp
+ * 18(%esp) - %eax
+ * 1C(%esp) - %ds
+ * 20(%esp) - %es
+ * 24(%esp) - %fs
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
+ */
+
+#include <linux/linkage.h>
+#include <linux/err.h>
+#include <asm/thread_info.h>
+#include <asm/irqflags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/ftrace.h>
+#include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+# define sysenter_audit syscall_trace_entry
+# define sysexit_audit syscall_exit_work
+#endif
+
+ .section .entry.text, "ax"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
+#ifdef CONFIG_PREEMPT
+# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
+#else
+# define preempt_stop(clobbers)
+# define resume_kernel restore_all
+#endif
+
+.macro TRACE_IRQS_IRET
+#ifdef CONFIG_TRACE_IRQFLAGS
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
+ jz 1f
+ TRACE_IRQS_ON
+1:
+#endif
+.endm
+
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ .if \pop <> 0
+ add $\pop, %esp
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.popsection
+ _ASM_EXTABLE(98b, 99b)
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.popsection
+ _ASM_EXTABLE(98b, 99b)
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+.endm
+.macro SET_KERNEL_GS reg
+ movl $(__KERNEL_STACK_CANARY), \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
+.macro SAVE_ALL
+ cld
+ PUSH_GS
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ movl $(__USER_DS), %edx
+ movl %edx, %ds
+ movl %edx, %es
+ movl $(__KERNEL_PERCPU), %edx
+ movl %edx, %fs
+ SET_KERNEL_GS %edx
+.endm
+
+.macro RESTORE_INT_REGS
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+.endm
+
+.macro RESTORE_REGS pop=0
+ RESTORE_INT_REGS
+1: popl %ds
+2: popl %es
+3: popl %fs
+ POP_GS \pop
+.pushsection .fixup, "ax"
+4: movl $0, (%esp)
+ jmp 1b
+5: movl $0, (%esp)
+ jmp 2b
+6: movl $0, (%esp)
+ jmp 3b
+.popsection
+ _ASM_EXTABLE(1b, 4b)
+ _ASM_EXTABLE(2b, 5b)
+ _ASM_EXTABLE(3b, 6b)
+ POP_GS_EX
+.endm
+
+ENTRY(ret_from_fork)
+ pushl %eax
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ pushl $0x0202 # Reset kernel eflags
+ popfl
+ jmp syscall_exit
+END(ret_from_fork)
+
+ENTRY(ret_from_kernel_thread)
+ pushl %eax
+ call schedule_tail
+ GET_THREAD_INFO(%ebp)
+ popl %eax
+ pushl $0x0202 # Reset kernel eflags
+ popfl
+ movl PT_EBP(%esp), %eax
+ call *PT_EBX(%esp)
+ movl $0, PT_EAX(%esp)
+ jmp syscall_exit
+ENDPROC(ret_from_kernel_thread)
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+ # userspace resumption stub bypassing syscall exit tracing
+ ALIGN
+ret_from_exception:
+ preempt_stop(CLBR_ANY)
+ret_from_intr:
+ GET_THREAD_INFO(%ebp)
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
+ /*
+ * We can be coming here from child spawned by kernel_thread().
+ */
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax
+ jb resume_kernel # not returning to v8086 or userspace
+
+ENTRY(resume_userspace)
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
+ # int/exception return?
+ jne work_pending
+ jmp restore_all
+END(ret_from_exception)
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+ DISABLE_INTERRUPTS(CLBR_ANY)
+need_resched:
+ cmpl $0, PER_CPU_VAR(__preempt_count)
+ jnz restore_all
+ testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ jz restore_all
+ call preempt_schedule_irq
+ jmp need_resched
+END(resume_kernel)
+#endif
+
+/*
+ * SYSENTER_RETURN points to after the SYSENTER instruction
+ * in the vsyscall page. See vsyscall-sysentry.S, which defines
+ * the symbol.
+ */
+
+ # SYSENTER call handler stub
+ENTRY(entry_SYSENTER_32)
+ movl TSS_sysenter_sp0(%esp), %esp
+sysenter_past_esp:
+ /*
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
+ */
+ pushl $__USER_DS
+ pushl %ebp
+ pushfl
+ orl $X86_EFLAGS_IF, (%esp)
+ pushl $__USER_CS
+ /*
+ * Push current_thread_info()->sysenter_return to the stack.
+ * A tiny bit of offset fixup is necessary: TI_sysenter_return
+ * is relative to thread_info, which is at the bottom of the
+ * kernel stack page. 4*4 means the 4 words pushed above;
+ * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+ * and THREAD_SIZE takes us to the bottom.
+ */
+ pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
+
+ pushl %eax
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+ cmpl $__PAGE_OFFSET-3, %ebp
+ jae syscall_fault
+ ASM_STAC
+1: movl (%ebp), %ebp
+ ASM_CLAC
+ movl %ebp, PT_EBP(%esp)
+ _ASM_EXTABLE(1b, syscall_fault)
+
+ GET_THREAD_INFO(%ebp)
+
+ testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+ jnz sysenter_audit
+sysenter_do_call:
+ cmpl $(NR_syscalls), %eax
+ jae sysenter_badsys
+ call *sys_call_table(, %eax, 4)
+sysenter_after_call:
+ movl %eax, PT_EAX(%esp)
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testl $_TIF_ALLWORK_MASK, %ecx
+ jnz sysexit_audit
+sysenter_exit:
+/* if something modifies registers it must also disable sysexit */
+ movl PT_EIP(%esp), %edx
+ movl PT_OLDESP(%esp), %ecx
+ xorl %ebp, %ebp
+ TRACE_IRQS_ON
+1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
+ ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
+ jnz syscall_trace_entry
+ /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
+ movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
+ /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
+ pushl PT_ESI(%esp) /* a3: 5th arg */
+ pushl PT_EDX+4(%esp) /* a2: 4th arg */
+ call __audit_syscall_entry
+ popl %ecx /* get that remapped edx off the stack */
+ popl %ecx /* get that remapped esi off the stack */
+ movl PT_EAX(%esp), %eax /* reload syscall number */
+ jmp sysenter_do_call
+
+sysexit_audit:
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+ jnz syscall_exit_work
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY)
+ movl %eax, %edx /* second arg, syscall return value */
+ cmpl $-MAX_ERRNO, %eax /* is it an error ? */
+ setbe %al /* 1 if so, 0 if not */
+ movzbl %al, %eax /* zero-extend that */
+ call __audit_syscall_exit
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
+ jnz syscall_exit_work
+ movl PT_EAX(%esp), %eax /* reload syscall return value */
+ jmp sysenter_exit
+#endif
+
+.pushsection .fixup, "ax"
+2: movl $0, PT_FS(%esp)
+ jmp 1b
+.popsection
+ _ASM_EXTABLE(1b, 2b)
+ PTGS_TO_GS_EX
+ENDPROC(entry_SYSENTER_32)
+
+ # system call handler stub
+ENTRY(entry_INT80_32)
+ ASM_CLAC
+ pushl %eax # save orig_eax
+ SAVE_ALL
+ GET_THREAD_INFO(%ebp)
+ # system call tracing in operation / emulation
+ testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
+ jnz syscall_trace_entry
+ cmpl $(NR_syscalls), %eax
+ jae syscall_badsys
+syscall_call:
+ call *sys_call_table(, %eax, 4)
+syscall_after_call:
+ movl %eax, PT_EAX(%esp) # store the return value
+syscall_exit:
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ testl $_TIF_ALLWORK_MASK, %ecx # current->work
+ jnz syscall_exit_work
+
+restore_all:
+ TRACE_IRQS_IRET
+restore_all_notrace:
+#ifdef CONFIG_X86_ESPFIX32
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
+ /*
+ * Warning: PT_OLDSS(%esp) contains the wrong/random values if we
+ * are returning to the kernel.
+ * See comments in process.c:copy_thread() for details.
+ */
+ movb PT_OLDSS(%esp), %ah
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
+ je ldt_ss # returning to user-space with LDT SS
+#endif
+restore_nocheck:
+ RESTORE_REGS 4 # skip orig_eax/error_code
+irq_return:
+ INTERRUPT_RETURN
+.section .fixup, "ax"
+ENTRY(iret_exc )
+ pushl $0 # no error code
+ pushl $do_iret_error
+ jmp error_code
+.previous
+ _ASM_EXTABLE(irq_return, iret_exc)
+
+#ifdef CONFIG_X86_ESPFIX32
+ldt_ss:
+#ifdef CONFIG_PARAVIRT
+ /*
+ * The kernel can't run on a non-flat stack if paravirt mode
+ * is active. Rather than try to fixup the high bits of
+ * ESP, bypass this code entirely. This may break DOSemu
+ * and/or Wine support in a paravirt VM, although the option
+ * is still available to implement the setting of the high
+ * 16-bits in the INTERRUPT_RETURN paravirt-op.
+ */
+ cmpl $0, pv_info+PARAVIRT_enabled
+ jne restore_nocheck
+#endif
+
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ shr $16, %edx
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
+ pushl $__ESPFIX_SS
+ pushl %eax /* new kernel esp */
+ /*
+ * Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the IRET:
+ */
+ DISABLE_INTERRUPTS(CLBR_EAX)
+ lss (%esp), %esp /* switch to espfix segment */
+ jmp restore_nocheck
+#endif
+ENDPROC(entry_INT80_32)
+
+ # perform work that needs to be done immediately before resumption
+ ALIGN
+work_pending:
+ testb $_TIF_NEED_RESCHED, %cl
+ jz work_notifysig
+work_resched:
+ call schedule
+ LOCKDEP_SYS_EXIT
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
+ # setting need_resched or sigpending
+ # between sampling and the iret
+ TRACE_IRQS_OFF
+ movl TI_flags(%ebp), %ecx
+ andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
+ # than syscall tracing?
+ jz restore_all
+ testb $_TIF_NEED_RESCHED, %cl
+ jnz work_resched
+
+work_notifysig: # deal with pending signals and
+ # notify-resume requests
+#ifdef CONFIG_VM86
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
+ movl %esp, %eax
+ jnz work_notifysig_v86 # returning to kernel-space or
+ # vm86-space
+1:
+#else
+ movl %esp, %eax
+#endif
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ movb PT_CS(%esp), %bl
+ andb $SEGMENT_RPL_MASK, %bl
+ cmpb $USER_RPL, %bl
+ jb resume_kernel
+ xorl %edx, %edx
+ call do_notify_resume
+ jmp resume_userspace
+
+#ifdef CONFIG_VM86
+ ALIGN
+work_notifysig_v86:
+ pushl %ecx # save ti_flags for do_notify_resume
+ call save_v86_state # %eax contains pt_regs pointer
+ popl %ecx
+ movl %eax, %esp
+ jmp 1b
+#endif
+END(work_pending)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_trace_entry:
+ movl $-ENOSYS, PT_EAX(%esp)
+ movl %esp, %eax
+ call syscall_trace_enter
+ /* What it returned is what we'll actually use. */
+ cmpl $(NR_syscalls), %eax
+ jnae syscall_call
+ jmp syscall_exit
+END(syscall_trace_entry)
+
+ # perform syscall exit tracing
+ ALIGN
+syscall_exit_work:
+ testl $_TIF_WORK_SYSCALL_EXIT, %ecx
+ jz work_pending
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
+ # schedule() instead
+ movl %esp, %eax
+ call syscall_trace_leave
+ jmp resume_userspace
+END(syscall_exit_work)
+
+syscall_fault:
+ ASM_CLAC
+ GET_THREAD_INFO(%ebp)
+ movl $-EFAULT, PT_EAX(%esp)
+ jmp resume_userspace
+END(syscall_fault)
+
+syscall_badsys:
+ movl $-ENOSYS, %eax
+ jmp syscall_after_call
+END(syscall_badsys)
+
+sysenter_badsys:
+ movl $-ENOSYS, %eax
+ jmp sysenter_after_call
+END(sysenter_badsys)
+
+.macro FIXUP_ESPFIX_STACK
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+#ifdef CONFIG_X86_ESPFIX32
+ /* fixup the stack */
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
+ pushl $__KERNEL_DS
+ pushl %eax
+ lss (%esp), %esp /* switch to the normal stack segment */
+#endif
+.endm
+.macro UNWIND_ESPFIX_STACK
+#ifdef CONFIG_X86_ESPFIX32
+ movl %ss, %eax
+ /* see if on espfix stack */
+ cmpw $__ESPFIX_SS, %ax
+ jne 27f
+ movl $__KERNEL_DS, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ /* switch to normal stack */
+ FIXUP_ESPFIX_STACK
+27:
+#endif
+.endm
+
+/*
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
+ */
+ .align 8
+ENTRY(irq_entries_start)
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushl $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
+ .align 8
+ .endr
+END(irq_entries_start)
+
+/*
+ * the CPU automatically disables interrupts when executing an IRQ vector,
+ * so IRQ-flags tracing has to follow that:
+ */
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+common_interrupt:
+ ASM_CLAC
+ addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ movl %esp, %eax
+ call do_IRQ
+ jmp ret_from_intr
+ENDPROC(common_interrupt)
+
+#define BUILD_INTERRUPT3(name, nr, fn) \
+ENTRY(name) \
+ ASM_CLAC; \
+ pushl $~(nr); \
+ SAVE_ALL; \
+ TRACE_IRQS_OFF \
+ movl %esp, %eax; \
+ call fn; \
+ jmp ret_from_intr; \
+ENDPROC(name)
+
+
+#ifdef CONFIG_TRACING
+# define TRACE_BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
+#else
+# define TRACE_BUILD_INTERRUPT(name, nr)
+#endif
+
+#define BUILD_INTERRUPT(name, nr) \
+ BUILD_INTERRUPT3(name, nr, smp_##name); \
+ TRACE_BUILD_INTERRUPT(name, nr)
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include <asm/entry_arch.h>
+
+ENTRY(coprocessor_error)
+ ASM_CLAC
+ pushl $0
+ pushl $do_coprocessor_error
+ jmp error_code
+END(coprocessor_error)
+
+ENTRY(simd_coprocessor_error)
+ ASM_CLAC
+ pushl $0
+#ifdef CONFIG_X86_INVD_BUG
+ /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+ ALTERNATIVE "pushl $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
+#else
+ pushl $do_simd_coprocessor_error
+#endif
+ jmp error_code
+END(simd_coprocessor_error)
+
+ENTRY(device_not_available)
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+ pushl $do_device_not_available
+ jmp error_code
+END(device_not_available)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+ iret
+ _ASM_EXTABLE(native_iret, iret_exc)
+END(native_iret)
+
+ENTRY(native_irq_enable_sysexit)
+ sti
+ sysexit
+END(native_irq_enable_sysexit)
+#endif
+
+ENTRY(overflow)
+ ASM_CLAC
+ pushl $0
+ pushl $do_overflow
+ jmp error_code
+END(overflow)
+
+ENTRY(bounds)
+ ASM_CLAC
+ pushl $0
+ pushl $do_bounds
+ jmp error_code
+END(bounds)
+
+ENTRY(invalid_op)
+ ASM_CLAC
+ pushl $0
+ pushl $do_invalid_op
+ jmp error_code
+END(invalid_op)
+
+ENTRY(coprocessor_segment_overrun)
+ ASM_CLAC
+ pushl $0
+ pushl $do_coprocessor_segment_overrun
+ jmp error_code
+END(coprocessor_segment_overrun)
+
+ENTRY(invalid_TSS)
+ ASM_CLAC
+ pushl $do_invalid_TSS
+ jmp error_code
+END(invalid_TSS)
+
+ENTRY(segment_not_present)
+ ASM_CLAC
+ pushl $do_segment_not_present
+ jmp error_code
+END(segment_not_present)
+
+ENTRY(stack_segment)
+ ASM_CLAC
+ pushl $do_stack_segment
+ jmp error_code
+END(stack_segment)
+
+ENTRY(alignment_check)
+ ASM_CLAC
+ pushl $do_alignment_check
+ jmp error_code
+END(alignment_check)
+
+ENTRY(divide_error)
+ ASM_CLAC
+ pushl $0 # no error code
+ pushl $do_divide_error
+ jmp error_code
+END(divide_error)
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+ ASM_CLAC
+ pushl $0
+ pushl machine_check_vector
+ jmp error_code
+END(machine_check)
+#endif
+
+ENTRY(spurious_interrupt_bug)
+ ASM_CLAC
+ pushl $0
+ pushl $do_spurious_interrupt_bug
+ jmp error_code
+END(spurious_interrupt_bug)
+
+#ifdef CONFIG_XEN
+/*
+ * Xen doesn't set %esp to be precisely what the normal SYSENTER
+ * entry point expects, so fix it up before using the normal path.
+ */
+ENTRY(xen_sysenter_target)
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp sysenter_past_esp
+
+ENTRY(xen_hypervisor_callback)
+ pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ TRACE_IRQS_OFF
+
+ /*
+ * Check to see if we got the event in the critical
+ * region in xen_iret_direct, after we've reenabled
+ * events and checked for pending events. This simulates
+ * iret instruction's behaviour where it delivers a
+ * pending interrupt when enabling interrupts:
+ */
+ movl PT_EIP(%esp), %eax
+ cmpl $xen_iret_start_crit, %eax
+ jb 1f
+ cmpl $xen_iret_end_crit, %eax
+ jae 1f
+
+ jmp xen_iret_crit_fixup
+
+ENTRY(xen_do_upcall)
+1: mov %esp, %eax
+ call xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+ call xen_maybe_preempt_hcall
+#endif
+ jmp ret_from_intr
+ENDPROC(xen_hypervisor_callback)
+
+/*
+ * Hypervisor uses this for application faults while it executes.
+ * We get here for two reasons:
+ * 1. Fault while reloading DS, ES, FS or GS
+ * 2. Fault while executing IRET
+ * Category 1 we fix up by reattempting the load, and zeroing the segment
+ * register if the load fails.
+ * Category 2 we fix up by jumping to do_iret_error. We cannot use the
+ * normal Linux return path in this case because if we use the IRET hypercall
+ * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+ * We distinguish between categories by maintaining a status value in EAX.
+ */
+ENTRY(xen_failsafe_callback)
+ pushl %eax
+ movl $1, %eax
+1: mov 4(%esp), %ds
+2: mov 8(%esp), %es
+3: mov 12(%esp), %fs
+4: mov 16(%esp), %gs
+ /* EAX == 0 => Category 1 (Bad segment)
+ EAX != 0 => Category 2 (Bad IRET) */
+ testl %eax, %eax
+ popl %eax
+ lea 16(%esp), %esp
+ jz 5f
+ jmp iret_exc
+5: pushl $-1 /* orig_ax = -1 => not a system call */
+ SAVE_ALL
+ jmp ret_from_exception
+
+.section .fixup, "ax"
+6: xorl %eax, %eax
+ movl %eax, 4(%esp)
+ jmp 1b
+7: xorl %eax, %eax
+ movl %eax, 8(%esp)
+ jmp 2b
+8: xorl %eax, %eax
+ movl %eax, 12(%esp)
+ jmp 3b
+9: xorl %eax, %eax
+ movl %eax, 16(%esp)
+ jmp 4b
+.previous
+ _ASM_EXTABLE(1b, 6b)
+ _ASM_EXTABLE(2b, 7b)
+ _ASM_EXTABLE(3b, 8b)
+ _ASM_EXTABLE(4b, 9b)
+ENDPROC(xen_failsafe_callback)
+
+BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ xen_evtchn_do_upcall)
+
+#endif /* CONFIG_XEN */
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
+ hyperv_vector_handler)
+
+#endif /* CONFIG_HYPERV */
+
+#ifdef CONFIG_FUNCTION_TRACER
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+ENTRY(mcount)
+ ret
+END(mcount)
+
+ENTRY(ftrace_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ pushl $0 /* Pass NULL as regs pointer */
+ movl 4*4(%esp), %eax
+ movl 0x4(%ebp), %edx
+ movl function_trace_op, %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+.globl ftrace_call
+ftrace_call:
+ call ftrace_stub
+
+ addl $4, %esp /* skip NULL pointer */
+ popl %edx
+ popl %ecx
+ popl %eax
+ftrace_ret:
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ jmp ftrace_stub
+#endif
+
+.globl ftrace_stub
+ftrace_stub:
+ ret
+END(ftrace_caller)
+
+ENTRY(ftrace_regs_caller)
+ pushf /* push flags before compare (in cs location) */
+
+ /*
+ * i386 does not save SS and ESP when coming from kernel.
+ * Instead, to get sp, &regs->sp is used (see ptrace.h).
+ * Unfortunately, that means eflags must be at the same location
+ * as the current return ip is. We move the return ip into the
+ * ip location, and move flags into the return ip location.
+ */
+ pushl 4(%esp) /* save return ip into ip slot */
+
+ pushl $0 /* Load 0 into orig_ax */
+ pushl %gs
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+
+ movl 13*4(%esp), %eax /* Get the saved flags */
+ movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
+ /* clobbering return ip */
+ movl $__KERNEL_CS, 13*4(%esp)
+
+ movl 12*4(%esp), %eax /* Load ip (1st parameter) */
+ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
+ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
+ movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
+ pushl %esp /* Save pt_regs as 4th parameter */
+
+GLOBAL(ftrace_regs_call)
+ call ftrace_stub
+
+ addl $4, %esp /* Skip pt_regs */
+ movl 14*4(%esp), %eax /* Move flags back into cs */
+ movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
+ movl 12*4(%esp), %eax /* Get return ip from regs->ip */
+ movl %eax, 14*4(%esp) /* Put return ip back for ret */
+
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %eax
+ popl %ds
+ popl %es
+ popl %fs
+ popl %gs
+ addl $8, %esp /* Skip orig_ax and ip */
+ popf /* Pop flags at end (no addl to corrupt flags) */
+ jmp ftrace_ret
+
+ popf
+ jmp ftrace_stub
+#else /* ! CONFIG_DYNAMIC_FTRACE */
+
+ENTRY(mcount)
+ cmpl $__PAGE_OFFSET, %esp
+ jb ftrace_stub /* Paging not enabled yet? */
+
+ cmpl $ftrace_stub, ftrace_trace_function
+ jnz trace
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ cmpl $ftrace_stub, ftrace_graph_return
+ jnz ftrace_graph_caller
+
+ cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
+ jnz ftrace_graph_caller
+#endif
+.globl ftrace_stub
+ftrace_stub:
+ ret
+
+ /* taken from glibc */
+trace:
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ movl 0x4(%ebp), %edx
+ subl $MCOUNT_INSN_SIZE, %eax
+
+ call *ftrace_trace_function
+
+ popl %edx
+ popl %ecx
+ popl %eax
+ jmp ftrace_stub
+END(mcount)
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ENTRY(ftrace_graph_caller)
+ pushl %eax
+ pushl %ecx
+ pushl %edx
+ movl 0xc(%esp), %eax
+ lea 0x4(%ebp), %edx
+ movl (%ebp), %ecx
+ subl $MCOUNT_INSN_SIZE, %eax
+ call prepare_ftrace_return
+ popl %edx
+ popl %ecx
+ popl %eax
+ ret
+END(ftrace_graph_caller)
+
+.globl return_to_handler
+return_to_handler:
+ pushl %eax
+ pushl %edx
+ movl %ebp, %eax
+ call ftrace_return_to_handler
+ movl %eax, %ecx
+ popl %edx
+ popl %eax
+ jmp *%ecx
+#endif
+
+#ifdef CONFIG_TRACING
+ENTRY(trace_page_fault)
+ ASM_CLAC
+ pushl $trace_do_page_fault
+ jmp error_code
+END(trace_page_fault)
+#endif
+
+ENTRY(page_fault)
+ ASM_CLAC
+ pushl $do_page_fault
+ ALIGN
+error_code:
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ pushl %es
+ pushl %ds
+ pushl %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ pushl %ecx
+ pushl %ebx
+ cld
+ movl $(__KERNEL_PERCPU), %ecx
+ movl %ecx, %fs
+ UNWIND_ESPFIX_STACK
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
+ movl PT_ORIG_EAX(%esp), %edx # get the error code
+ movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
+ movl $(__USER_DS), %ecx
+ movl %ecx, %ds
+ movl %ecx, %es
+ TRACE_IRQS_OFF
+ movl %esp, %eax # pt_regs pointer
+ call *%edi
+ jmp ret_from_exception
+END(page_fault)
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+.macro FIX_STACK offset ok label
+ cmpw $__KERNEL_CS, 4(%esp)
+ jne \ok
+\label:
+ movl TSS_sysenter_sp0 + \offset(%esp), %esp
+ pushfl
+ pushl $__KERNEL_CS
+ pushl $sysenter_past_esp
+.endm
+
+ENTRY(debug)
+ ASM_CLAC
+ cmpl $entry_SYSENTER_32, (%esp)
+ jne debug_stack_correct
+ FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
+debug_stack_correct:
+ pushl $-1 # mark this as an int
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx, %edx # error code 0
+ movl %esp, %eax # pt_regs pointer
+ call do_debug
+ jmp ret_from_exception
+END(debug)
+
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+ ASM_CLAC
+#ifdef CONFIG_X86_ESPFIX32
+ pushl %eax
+ movl %ss, %eax
+ cmpw $__ESPFIX_SS, %ax
+ popl %eax
+ je nmi_espfix_stack
+#endif
+ cmpl $entry_SYSENTER_32, (%esp)
+ je nmi_stack_fixup
+ pushl %eax
+ movl %esp, %eax
+ /*
+ * Do not access memory above the end of our stack page,
+ * it might not exist.
+ */
+ andl $(THREAD_SIZE-1), %eax
+ cmpl $(THREAD_SIZE-20), %eax
+ popl %eax
+ jae nmi_stack_correct
+ cmpl $entry_SYSENTER_32, 12(%esp)
+ je nmi_debug_stack_check
+nmi_stack_correct:
+ pushl %eax
+ SAVE_ALL
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ call do_nmi
+ jmp restore_all_notrace
+
+nmi_stack_fixup:
+ FIX_STACK 12, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+nmi_debug_stack_check:
+ cmpw $__KERNEL_CS, 16(%esp)
+ jne nmi_stack_correct
+ cmpl $debug, (%esp)
+ jb nmi_stack_correct
+ cmpl $debug_esp_fix_insn, (%esp)
+ ja nmi_stack_correct
+ FIX_STACK 24, nmi_stack_correct, 1
+ jmp nmi_stack_correct
+
+#ifdef CONFIG_X86_ESPFIX32
+nmi_espfix_stack:
+ /*
+ * create the pointer to lss back
+ */
+ pushl %ss
+ pushl %esp
+ addl $4, (%esp)
+ /* copy the iret frame of 12 bytes */
+ .rept 3
+ pushl 16(%esp)
+ .endr
+ pushl %eax
+ SAVE_ALL
+ FIXUP_ESPFIX_STACK # %eax == %esp
+ xorl %edx, %edx # zero error code
+ call do_nmi
+ RESTORE_REGS
+ lss 12+4(%esp), %esp # back to espfix stack
+ jmp irq_return
+#endif
+END(nmi)
+
+ENTRY(int3)
+ ASM_CLAC
+ pushl $-1 # mark this as an int
+ SAVE_ALL
+ TRACE_IRQS_OFF
+ xorl %edx, %edx # zero error code
+ movl %esp, %eax # pt_regs pointer
+ call do_int3
+ jmp ret_from_exception
+END(int3)
+
+ENTRY(general_protection)
+ pushl $do_general_protection
+ jmp error_code
+END(general_protection)
+
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+ ASM_CLAC
+ pushl $do_async_page_fault
+ jmp error_code
+END(async_page_fault)
+#endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/entry/entry_64.S
index 22aadc917868..3bb2c4302df1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -4,34 +4,25 @@
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 2000, 2001, 2002 Andi Kleen SuSE Labs
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
- */
-
-/*
+ *
* entry.S contains the system-call and fault low-level handling routines.
*
* Some of this is documented in Documentation/x86/entry_64.txt
*
- * NOTE: This code handles signal-recognition, which happens every time
- * after an interrupt and after each system call.
- *
* A note on terminology:
- * - iret frame: Architecture defined interrupt frame from SS to RIP
- * at the top of the kernel process stack.
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
+ * at the top of the kernel process stack.
*
* Some macro usage:
- * - CFI macros are used to generate dwarf2 unwind information for better
- * backtraces. They don't change any code.
- * - ENTRY/END Define functions in the symbol table.
- * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
- * - idtentry - Define exception entry points.
+ * - ENTRY/END: Define functions in the symbol table.
+ * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
+ * - idtentry: Define exception entry points.
*/
-
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/cache.h>
#include <asm/errno.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+#include "calling.h"
#include <asm/asm-offsets.h>
#include <asm/msr.h>
#include <asm/unistd.h>
@@ -49,13 +40,12 @@
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
-#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_64BIT 0x80000000
-#define __AUDIT_ARCH_LE 0x40000000
-
- .code64
- .section .entry.text, "ax"
+#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE 0x40000000
+.code64
+.section .entry.text, "ax"
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
@@ -64,11 +54,10 @@ ENTRY(native_usergs_sysret64)
ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-
.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
+ bt $9, EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
TRACE_IRQS_ON
1:
#endif
@@ -88,89 +77,34 @@ ENDPROC(native_usergs_sysret64)
#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
.macro TRACE_IRQS_OFF_DEBUG
- call debug_stack_set_zero
+ call debug_stack_set_zero
TRACE_IRQS_OFF
- call debug_stack_reset
+ call debug_stack_reset
.endm
.macro TRACE_IRQS_ON_DEBUG
- call debug_stack_set_zero
+ call debug_stack_set_zero
TRACE_IRQS_ON
- call debug_stack_reset
+ call debug_stack_reset
.endm
.macro TRACE_IRQS_IRETQ_DEBUG
- bt $9,EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
+ bt $9, EFLAGS(%rsp) /* interrupts off? */
+ jnc 1f
TRACE_IRQS_ON_DEBUG
1:
.endm
#else
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
+# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
+# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
+# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
#endif
/*
- * empty frame
- */
- .macro EMPTY_FRAME start=1 offset=0
- .if \start
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,8+\offset
- .else
- CFI_DEF_CFA_OFFSET 8+\offset
- .endif
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
- */
- .macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, 5*8+\offset
- /*CFI_REL_OFFSET ss, 4*8+\offset*/
- CFI_REL_OFFSET rsp, 3*8+\offset
- /*CFI_REL_OFFSET rflags, 2*8+\offset*/
- /*CFI_REL_OFFSET cs, 1*8+\offset*/
- CFI_REL_OFFSET rip, 0*8+\offset
- .endm
-
-/*
- * initial frame state for exceptions with error code (and interrupts
- * with vector already pushed)
- */
- .macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, 1*8+\offset
- .endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
- .macro DEFAULT_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset
- CFI_REL_OFFSET rdi, RDI+\offset
- CFI_REL_OFFSET rsi, RSI+\offset
- CFI_REL_OFFSET rdx, RDX+\offset
- CFI_REL_OFFSET rcx, RCX+\offset
- CFI_REL_OFFSET rax, RAX+\offset
- CFI_REL_OFFSET r8, R8+\offset
- CFI_REL_OFFSET r9, R9+\offset
- CFI_REL_OFFSET r10, R10+\offset
- CFI_REL_OFFSET r11, R11+\offset
- CFI_REL_OFFSET rbx, RBX+\offset
- CFI_REL_OFFSET rbp, RBP+\offset
- CFI_REL_OFFSET r12, R12+\offset
- CFI_REL_OFFSET r13, R13+\offset
- CFI_REL_OFFSET r14, R14+\offset
- CFI_REL_OFFSET r15, R15+\offset
- .endm
-
-/*
- * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
+ * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
- * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
@@ -186,7 +120,7 @@ ENDPROC(native_usergs_sysret64)
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
- * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
+ * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
@@ -195,13 +129,7 @@ ENDPROC(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
-ENTRY(system_call)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,0
- CFI_REGISTER rip,rcx
- /*CFI_REGISTER rflags,r11*/
-
+ENTRY(entry_SYSCALL_64)
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
@@ -213,14 +141,14 @@ ENTRY(system_call)
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
-GLOBAL(system_call_after_swapgs)
+GLOBAL(entry_SYSCALL_64_after_swapgs)
- movq %rsp,PER_CPU_VAR(rsp_scratch)
- movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
- pushq_cfi $__USER_DS /* pt_regs->ss */
- pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
+ pushq $__USER_DS /* pt_regs->ss */
+ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
/*
* Re-enable interrupts.
* We use 'rsp_scratch' as a scratch space, hence irq-off block above
@@ -229,36 +157,34 @@ GLOBAL(system_call_after_swapgs)
* with using rsp_scratch:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %r11 /* pt_regs->flags */
- pushq_cfi $__USER_CS /* pt_regs->cs */
- pushq_cfi %rcx /* pt_regs->ip */
- CFI_REL_OFFSET rip,0
- pushq_cfi_reg rax /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi $-ENOSYS /* pt_regs->ax */
- pushq_cfi_reg r8 /* pt_regs->r8 */
- pushq_cfi_reg r9 /* pt_regs->r9 */
- pushq_cfi_reg r10 /* pt_regs->r10 */
- pushq_cfi_reg r11 /* pt_regs->r11 */
- sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 6*8
-
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz tracesys
-system_call_fastpath:
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 */
+ pushq %r9 /* pt_regs->r9 */
+ pushq %r10 /* pt_regs->r10 */
+ pushq %r11 /* pt_regs->r11 */
+ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz tracesys
+entry_SYSCALL_64_fastpath:
#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max,%rax
+ cmpq $__NR_syscall_max, %rax
#else
- andl $__SYSCALL_MASK,%eax
- cmpl $__NR_syscall_max,%eax
+ andl $__SYSCALL_MASK, %eax
+ cmpl $__NR_syscall_max, %eax
#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10,%rcx
- call *sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
+ movq %r10, %rcx
+ call *sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
1:
/*
* Syscall return path ending with SYSRET (fast path).
@@ -279,19 +205,15 @@ system_call_fastpath:
* flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
* very bad.
*/
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
-
- CFI_REMEMBER_STATE
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RIP(%rsp),%rcx
- CFI_REGISTER rip,rcx
- movq EFLAGS(%rsp),%r11
- /*CFI_REGISTER rflags,r11*/
- movq RSP(%rsp),%rsp
+ movq RIP(%rsp), %rcx
+ movq EFLAGS(%rsp), %r11
+ movq RSP(%rsp), %rsp
/*
- * 64bit SYSRET restores rip from rcx,
+ * 64-bit SYSRET restores rip from rcx,
* rflags from r11 (but RF and VM bits are forced to 0),
* cs and ss are loaded from MSRs.
* Restoration of rflags re-enables interrupts.
@@ -307,25 +229,23 @@ system_call_fastpath:
*/
USERGS_SYSRET64
- CFI_RESTORE_STATE
-
/* Do syscall entry tracing */
tracesys:
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- call syscall_trace_enter_phase1
- test %rax, %rax
- jnz tracesys_phase2 /* if needed, run the slow path */
- RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
- movq ORIG_RAX(%rsp), %rax
- jmp system_call_fastpath /* and return to the fast path */
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
+ call syscall_trace_enter_phase1
+ test %rax, %rax
+ jnz tracesys_phase2 /* if needed, run the slow path */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX(%rsp), %rax
+ jmp entry_SYSCALL_64_fastpath /* and return to the fast path */
tracesys_phase2:
SAVE_EXTRA_REGS
- movq %rsp, %rdi
- movl $AUDIT_ARCH_X86_64, %esi
- movq %rax,%rdx
- call syscall_trace_enter_phase2
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
+ movq %rax, %rdx
+ call syscall_trace_enter_phase2
/*
* Reload registers from stack in case ptrace changed them.
@@ -335,15 +255,15 @@ tracesys_phase2:
RESTORE_C_REGS_EXCEPT_RAX
RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
- cmpq $__NR_syscall_max,%rax
+ cmpq $__NR_syscall_max, %rax
#else
- andl $__SYSCALL_MASK,%eax
- cmpl $__NR_syscall_max,%eax
+ andl $__SYSCALL_MASK, %eax
+ cmpl $__NR_syscall_max, %eax
#endif
- ja 1f /* return -ENOSYS (already in pt_regs->ax) */
- movq %r10,%rcx /* fixup for C */
- call *sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
+ movq %r10, %rcx /* fixup for C */
+ call *sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
1:
/* Use IRET because user could have changed pt_regs->foo */
@@ -355,31 +275,33 @@ GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
TRACE_IRQS_OFF
- movl $_TIF_ALLWORK_MASK,%edi
+ movl $_TIF_ALLWORK_MASK, %edi
/* edi: mask to check */
GLOBAL(int_with_check)
LOCKDEP_SYS_EXIT_IRQ
GET_THREAD_INFO(%rcx)
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
+ movl TI_flags(%rcx), %edx
+ andl %edi, %edx
+ jnz int_careful
+ andl $~TS_COMPAT, TI_status(%rcx)
jmp syscall_return
- /* Either reschedule or signal or syscall exit tracking needed. */
- /* First do a reschedule test. */
- /* edx: work, edi: workmask */
+ /*
+ * Either reschedule or signal or syscall exit tracking needed.
+ * First do a reschedule test.
+ * edx: work, edi: workmask
+ */
int_careful:
- bt $TIF_NEED_RESCHED,%edx
- jnc int_very_careful
+ bt $TIF_NEED_RESCHED, %edx
+ jnc int_very_careful
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
+ pushq %rdi
SCHEDULE_USER
- popq_cfi %rdi
+ popq %rdi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- jmp int_with_check
+ jmp int_with_check
/* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
@@ -387,27 +309,27 @@ int_very_careful:
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
/* Check for syscall exit trace */
- testl $_TIF_WORK_SYSCALL_EXIT,%edx
- jz int_signal
- pushq_cfi %rdi
- leaq 8(%rsp),%rdi # &ptregs -> arg1
- call syscall_trace_leave
- popq_cfi %rdi
- andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
- jmp int_restore_rest
+ testl $_TIF_WORK_SYSCALL_EXIT, %edx
+ jz int_signal
+ pushq %rdi
+ leaq 8(%rsp), %rdi /* &ptregs -> arg1 */
+ call syscall_trace_leave
+ popq %rdi
+ andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
+ jmp int_restore_rest
int_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz 1f
- movq %rsp,%rdi # &ptregs -> arg1
- xorl %esi,%esi # oldset -> arg2
- call do_notify_resume
-1: movl $_TIF_WORK_MASK,%edi
+ testl $_TIF_DO_NOTIFY_MASK, %edx
+ jz 1f
+ movq %rsp, %rdi /* &ptregs -> arg1 */
+ xorl %esi, %esi /* oldset -> arg2 */
+ call do_notify_resume
+1: movl $_TIF_WORK_MASK, %edi
int_restore_rest:
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- jmp int_with_check
+ jmp int_with_check
syscall_return:
/* The IRETQ could re-enable interrupts: */
@@ -418,10 +340,10 @@ syscall_return:
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context.
*/
- movq RCX(%rsp),%rcx
- movq RIP(%rsp),%r11
- cmpq %rcx,%r11 /* RCX == RIP */
- jne opportunistic_sysret_failed
+ movq RCX(%rsp), %rcx
+ movq RIP(%rsp), %r11
+ cmpq %rcx, %r11 /* RCX == RIP */
+ jne opportunistic_sysret_failed
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -434,19 +356,21 @@ syscall_return:
.ifne __VIRTUAL_MASK_SHIFT - 47
.error "virtual address width changed -- SYSRET checks need update"
.endif
+
/* Change top 16 bits to be the sign-extension of 47th bit */
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
+
/* If this changed %rcx, it was not canonical */
cmpq %rcx, %r11
jne opportunistic_sysret_failed
- cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
+ cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
- movq R11(%rsp),%r11
- cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
+ movq R11(%rsp), %r11
+ cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
/*
* SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
@@ -455,47 +379,41 @@ syscall_return:
* with register state that satisfies the opportunistic SYSRET
* conditions. For example, single-stepping this user code:
*
- * movq $stuck_here,%rcx
+ * movq $stuck_here, %rcx
* pushfq
* popq %r11
* stuck_here:
*
* would never get past 'stuck_here'.
*/
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
/* nothing to check for RSP */
- cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
+ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
/*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
- CFI_REMEMBER_STATE
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
- movq RSP(%rsp),%rsp
+ movq RSP(%rsp), %rsp
USERGS_SYSRET64
- CFI_RESTORE_STATE
opportunistic_sysret_failed:
SWAPGS
jmp restore_c_regs_and_iret
- CFI_ENDPROC
-END(system_call)
+END(entry_SYSCALL_64)
.macro FORK_LIKE func
ENTRY(stub_\func)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8 /* offset 8: return address */
SAVE_EXTRA_REGS 8
- jmp sys_\func
- CFI_ENDPROC
+ jmp sys_\func
END(stub_\func)
.endm
@@ -504,8 +422,6 @@ END(stub_\func)
FORK_LIKE vfork
ENTRY(stub_execve)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
call sys_execve
return_from_execve:
testl %eax, %eax
@@ -515,11 +431,9 @@ return_from_execve:
1:
/* must use IRET code path (pt_regs->cs may have changed) */
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
ZERO_EXTRA_REGS
- movq %rax,RAX(%rsp)
+ movq %rax, RAX(%rsp)
jmp int_ret_from_sys_call
- CFI_ENDPROC
END(stub_execve)
/*
* Remaining execve stubs are only 7 bytes long.
@@ -527,32 +441,23 @@ END(stub_execve)
*/
.align 8
GLOBAL(stub_execveat)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
call sys_execveat
jmp return_from_execve
- CFI_ENDPROC
END(stub_execveat)
#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
.align 8
GLOBAL(stub_x32_execve)
GLOBAL(stub32_execve)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
call compat_sys_execve
jmp return_from_execve
- CFI_ENDPROC
END(stub32_execve)
END(stub_x32_execve)
.align 8
GLOBAL(stub_x32_execveat)
GLOBAL(stub32_execveat)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
call compat_sys_execveat
jmp return_from_execve
- CFI_ENDPROC
END(stub32_execveat)
END(stub_x32_execveat)
#endif
@@ -562,8 +467,6 @@ END(stub_x32_execveat)
* This cannot be done with SYSRET, so use the IRET return path instead.
*/
ENTRY(stub_rt_sigreturn)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
/*
* SAVE_EXTRA_REGS result is not normally needed:
* sigreturn overwrites all pt_regs->GPREGS.
@@ -572,24 +475,19 @@ ENTRY(stub_rt_sigreturn)
* we SAVE_EXTRA_REGS here.
*/
SAVE_EXTRA_REGS 8
- call sys_rt_sigreturn
+ call sys_rt_sigreturn
return_from_stub:
addq $8, %rsp
- CFI_ADJUST_CFA_OFFSET -8
RESTORE_EXTRA_REGS
- movq %rax,RAX(%rsp)
- jmp int_ret_from_sys_call
- CFI_ENDPROC
+ movq %rax, RAX(%rsp)
+ jmp int_ret_from_sys_call
END(stub_rt_sigreturn)
#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
- CFI_STARTPROC
- DEFAULT_FRAME 0, 8
SAVE_EXTRA_REGS 8
- call sys32_x32_rt_sigreturn
- jmp return_from_stub
- CFI_ENDPROC
+ call sys32_x32_rt_sigreturn
+ jmp return_from_stub
END(stub_x32_rt_sigreturn)
#endif
@@ -599,36 +497,36 @@ END(stub_x32_rt_sigreturn)
* rdi: prev task we switched from
*/
ENTRY(ret_from_fork)
- DEFAULT_FRAME
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
+ LOCK ; btr $TIF_FORK, TI_flags(%r8)
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
+ pushq $0x0002
+ popfq /* reset kernel eflags */
- call schedule_tail # rdi: 'prev' task parameter
+ call schedule_tail /* rdi: 'prev' task parameter */
RESTORE_EXTRA_REGS
- testb $3, CS(%rsp) # from kernel_thread?
+ testb $3, CS(%rsp) /* from kernel_thread? */
/*
* By the time we get here, we have no idea whether our pt_regs,
* ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the ia32entry paths.
+ * the slow path, or one of the 32-bit compat paths.
* Use IRET code path to return, since it can safely handle
* all of the above.
*/
jnz int_ret_from_sys_call
- /* We came from kernel_thread */
- /* nb: we depend on RESTORE_EXTRA_REGS above */
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
+ /*
+ * We came from kernel_thread
+ * nb: we depend on RESTORE_EXTRA_REGS above
+ */
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
RESTORE_EXTRA_REGS
- jmp int_ret_from_sys_call
- CFI_ENDPROC
+ jmp int_ret_from_sys_call
END(ret_from_fork)
/*
@@ -637,16 +535,13 @@ END(ret_from_fork)
*/
.align 8
ENTRY(irq_entries_start)
- INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
+ pushq $(~vector+0x80) /* Note: always in signed byte range */
vector=vector+1
jmp common_interrupt
- CFI_ADJUST_CFA_OFFSET -8
.align 8
.endr
- CFI_ENDPROC
END(irq_entries_start)
/*
@@ -672,7 +567,7 @@ END(irq_entries_start)
/* this goes to 0(%rsp) for unwinder, not for saving the value: */
SAVE_EXTRA_REGS_RBP -RBP
- leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
+ leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */
testb $3, CS-RBP(%rsp)
jz 1f
@@ -685,24 +580,14 @@ END(irq_entries_start)
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
- movq %rsp, %rsi
- incl PER_CPU_VAR(irq_count)
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- CFI_DEF_CFA_REGISTER rsi
- pushq %rsi
- /*
- * For debugger:
- * "CFA (Current Frame Address) is the value on stack + offset"
- */
- CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 (rsp) */, 0, \
- 0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
- 0x22 /* DW_OP_plus */
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
+ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+ pushq %rsi
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
- call \func
+ call \func
.endm
/*
@@ -711,42 +596,36 @@ END(irq_entries_start)
*/
.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
- XCPT_FRAME
ASM_CLAC
- addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
+ addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
interrupt do_IRQ
/* 0(%rsp): old RSP */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- decl PER_CPU_VAR(irq_count)
+ decl PER_CPU_VAR(irq_count)
/* Restore saved previous stack */
- popq %rsi
- CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+ popq %rsi
/* return code expects complete pt_regs - adjust rsp accordingly: */
- leaq -RBP(%rsi),%rsp
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET RBP
+ leaq -RBP(%rsi), %rsp
testb $3, CS(%rsp)
jz retint_kernel
/* Interrupt came from user space */
-
+retint_user:
GET_THREAD_INFO(%rcx)
- /*
- * %rcx: thread info. Interrupts off.
- */
+
+ /* %rcx: thread info. Interrupts are off. */
retint_with_reschedule:
- movl $_TIF_WORK_MASK,%edi
+ movl $_TIF_WORK_MASK, %edi
retint_check:
LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- andl %edi,%edx
- CFI_REMEMBER_STATE
- jnz retint_careful
+ movl TI_flags(%rcx), %edx
+ andl %edi, %edx
+ jnz retint_careful
-retint_swapgs: /* return to user-space */
+retint_swapgs: /* return to user-space */
/*
* The iretq could re-enable interrupts:
*/
@@ -761,9 +640,9 @@ retint_kernel:
#ifdef CONFIG_PREEMPT
/* Interrupts are off */
/* Check if we need preemption */
- bt $9,EFLAGS(%rsp) /* interrupts were off? */
+ bt $9, EFLAGS(%rsp) /* were interrupts off? */
jnc 1f
-0: cmpl $0,PER_CPU_VAR(__preempt_count)
+0: cmpl $0, PER_CPU_VAR(__preempt_count)
jnz 1f
call preempt_schedule_irq
jmp 0b
@@ -781,8 +660,6 @@ retint_kernel:
restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
-
-irq_return:
INTERRUPT_RETURN
ENTRY(native_iret)
@@ -791,8 +668,8 @@ ENTRY(native_iret)
* 64-bit mode SS:RSP on the exception stack is always valid.
*/
#ifdef CONFIG_X86_ESPFIX64
- testb $4,(SS-RIP)(%rsp)
- jnz native_irq_return_ldt
+ testb $4, (SS-RIP)(%rsp)
+ jnz native_irq_return_ldt
#endif
.global native_irq_return_iret
@@ -807,62 +684,60 @@ native_irq_return_iret:
#ifdef CONFIG_X86_ESPFIX64
native_irq_return_ldt:
- pushq_cfi %rax
- pushq_cfi %rdi
+ pushq %rax
+ pushq %rdi
SWAPGS
- movq PER_CPU_VAR(espfix_waddr),%rdi
- movq %rax,(0*8)(%rdi) /* RAX */
- movq (2*8)(%rsp),%rax /* RIP */
- movq %rax,(1*8)(%rdi)
- movq (3*8)(%rsp),%rax /* CS */
- movq %rax,(2*8)(%rdi)
- movq (4*8)(%rsp),%rax /* RFLAGS */
- movq %rax,(3*8)(%rdi)
- movq (6*8)(%rsp),%rax /* SS */
- movq %rax,(5*8)(%rdi)
- movq (5*8)(%rsp),%rax /* RSP */
- movq %rax,(4*8)(%rdi)
- andl $0xffff0000,%eax
- popq_cfi %rdi
- orq PER_CPU_VAR(espfix_stack),%rax
+ movq PER_CPU_VAR(espfix_waddr), %rdi
+ movq %rax, (0*8)(%rdi) /* RAX */
+ movq (2*8)(%rsp), %rax /* RIP */
+ movq %rax, (1*8)(%rdi)
+ movq (3*8)(%rsp), %rax /* CS */
+ movq %rax, (2*8)(%rdi)
+ movq (4*8)(%rsp), %rax /* RFLAGS */
+ movq %rax, (3*8)(%rdi)
+ movq (6*8)(%rsp), %rax /* SS */
+ movq %rax, (5*8)(%rdi)
+ movq (5*8)(%rsp), %rax /* RSP */
+ movq %rax, (4*8)(%rdi)
+ andl $0xffff0000, %eax
+ popq %rdi
+ orq PER_CPU_VAR(espfix_stack), %rax
SWAPGS
- movq %rax,%rsp
- popq_cfi %rax
- jmp native_irq_return_iret
+ movq %rax, %rsp
+ popq %rax
+ jmp native_irq_return_iret
#endif
/* edi: workmask, edx: work */
retint_careful:
- CFI_RESTORE_STATE
- bt $TIF_NEED_RESCHED,%edx
- jnc retint_signal
+ bt $TIF_NEED_RESCHED, %edx
+ jnc retint_signal
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- pushq_cfi %rdi
+ pushq %rdi
SCHEDULE_USER
- popq_cfi %rdi
+ popq %rdi
GET_THREAD_INFO(%rcx)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- jmp retint_check
+ jmp retint_check
retint_signal:
- testl $_TIF_DO_NOTIFY_MASK,%edx
- jz retint_swapgs
+ testl $_TIF_DO_NOTIFY_MASK, %edx
+ jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_EXTRA_REGS
- movq $-1,ORIG_RAX(%rsp)
- xorl %esi,%esi # oldset
- movq %rsp,%rdi # &pt_regs
- call do_notify_resume
+ movq $-1, ORIG_RAX(%rsp)
+ xorl %esi, %esi /* oldset */
+ movq %rsp, %rdi /* &pt_regs */
+ call do_notify_resume
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
- jmp retint_with_reschedule
+ jmp retint_with_reschedule
- CFI_ENDPROC
END(common_interrupt)
/*
@@ -870,13 +745,11 @@ END(common_interrupt)
*/
.macro apicinterrupt3 num sym do_sym
ENTRY(\sym)
- INTR_FRAME
ASM_CLAC
- pushq_cfi $~(\num)
+ pushq $~(\num)
.Lcommon_\sym:
interrupt \do_sym
- jmp ret_from_intr
- CFI_ENDPROC
+ jmp ret_from_intr
END(\sym)
.endm
@@ -898,55 +771,45 @@ trace_apicinterrupt \num \sym
.endm
#ifdef CONFIG_SMP
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \
- irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt3 REBOOT_VECTOR \
- reboot_interrupt smp_reboot_interrupt
+apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
+apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
#endif
#ifdef CONFIG_X86_UV
-apicinterrupt3 UV_BAU_MESSAGE \
- uv_bau_message_intr1 uv_bau_message_interrupt
+apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
#endif
-apicinterrupt LOCAL_TIMER_VECTOR \
- apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR \
- x86_platform_ipi smp_x86_platform_ipi
+
+apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
+apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
#ifdef CONFIG_HAVE_KVM
-apicinterrupt3 POSTED_INTR_VECTOR \
- kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
-apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR \
- kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
+apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
+apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
-apicinterrupt THRESHOLD_APIC_VECTOR \
- threshold_interrupt smp_threshold_interrupt
+apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
+#endif
+
+#ifdef CONFIG_X86_MCE_AMD
+apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
-apicinterrupt THERMAL_APIC_VECTOR \
- thermal_interrupt smp_thermal_interrupt
+apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
#endif
#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
- call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR \
- call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR \
- reschedule_interrupt smp_reschedule_interrupt
+apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
+apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
+apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
#endif
-apicinterrupt ERROR_APIC_VECTOR \
- error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR \
- spurious_interrupt smp_spurious_interrupt
+apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
+apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
#ifdef CONFIG_IRQ_WORK
-apicinterrupt IRQ_WORK_VECTOR \
- irq_work_interrupt smp_irq_work_interrupt
+apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
#endif
/*
@@ -961,100 +824,87 @@ ENTRY(\sym)
.error "using shift_ist requires paranoid=1"
.endif
- .if \has_error_code
- XCPT_FRAME
- .else
- INTR_FRAME
- .endif
-
ASM_CLAC
PARAVIRT_ADJUST_EXCEPTION_FRAME
.ifeq \has_error_code
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
- CFI_REMEMBER_STATE
- testb $3, CS(%rsp) /* If coming from userspace, switch */
- jnz 1f /* stacks. */
+ testb $3, CS(%rsp) /* If coming from userspace, switch stacks */
+ jnz 1f
.endif
- call paranoid_entry
+ call paranoid_entry
.else
- call error_entry
+ call error_entry
.endif
/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
- DEFAULT_FRAME 0
-
.if \paranoid
.if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
+ TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
.else
TRACE_IRQS_OFF
.endif
.endif
- movq %rsp,%rdi /* pt_regs pointer */
+ movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.else
- xorl %esi,%esi /* no error code */
+ xorl %esi, %esi /* no error code */
.endif
.if \shift_ist != -1
- subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
- call \do_sym
+ call \do_sym
.if \shift_ist != -1
- addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
/* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
- jmp paranoid_exit
+ jmp paranoid_exit
.else
- jmp error_exit
+ jmp error_exit
.endif
.if \paranoid == 1
- CFI_RESTORE_STATE
/*
* Paranoid entry from userspace. Switch stacks and treat it
* as a normal entry. This means that paranoid handlers
* run in real process context if user_mode(regs).
*/
1:
- call error_entry
+ call error_entry
- DEFAULT_FRAME 0
- movq %rsp,%rdi /* pt_regs pointer */
- call sync_regs
- movq %rax,%rsp /* switch stack */
+ movq %rsp, %rdi /* pt_regs pointer */
+ call sync_regs
+ movq %rax, %rsp /* switch stack */
- movq %rsp,%rdi /* pt_regs pointer */
+ movq %rsp, %rdi /* pt_regs pointer */
.if \has_error_code
- movq ORIG_RAX(%rsp),%rsi /* get error code */
- movq $-1,ORIG_RAX(%rsp) /* no syscall to restart */
+ movq ORIG_RAX(%rsp), %rsi /* get error code */
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
.else
- xorl %esi,%esi /* no error code */
+ xorl %esi, %esi /* no error code */
.endif
- call \do_sym
+ call \do_sym
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit /* %ebx: no swapgs flag */
.endif
-
- CFI_ENDPROC
END(\sym)
.endm
@@ -1069,65 +919,58 @@ idtentry \sym \do_sym has_error_code=\has_error_code
.endm
#endif
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
-
- /* Reload gs selector with exception handling */
- /* edi: new selector */
+idtentry divide_error do_divide_error has_error_code=0
+idtentry overflow do_overflow has_error_code=0
+idtentry bounds do_bounds has_error_code=0
+idtentry invalid_op do_invalid_op has_error_code=0
+idtentry device_not_available do_device_not_available has_error_code=0
+idtentry double_fault do_double_fault has_error_code=1 paranoid=2
+idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
+idtentry invalid_TSS do_invalid_TSS has_error_code=1
+idtentry segment_not_present do_segment_not_present has_error_code=1
+idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
+idtentry coprocessor_error do_coprocessor_error has_error_code=0
+idtentry alignment_check do_alignment_check has_error_code=1
+idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
+
+
+ /*
+ * Reload gs selector with exception handling
+ * edi: new selector
+ */
ENTRY(native_load_gs_index)
- CFI_STARTPROC
- pushfq_cfi
+ pushfq
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
SWAPGS
gs_change:
- movl %edi,%gs
-2: mfence /* workaround */
+ movl %edi, %gs
+2: mfence /* workaround */
SWAPGS
- popfq_cfi
+ popfq
ret
- CFI_ENDPROC
END(native_load_gs_index)
- _ASM_EXTABLE(gs_change,bad_gs)
- .section .fixup,"ax"
+ _ASM_EXTABLE(gs_change, bad_gs)
+ .section .fixup, "ax"
/* running with kernelgs */
bad_gs:
- SWAPGS /* switch back to user gs */
- xorl %eax,%eax
- movl %eax,%gs
- jmp 2b
+ SWAPGS /* switch back to user gs */
+ xorl %eax, %eax
+ movl %eax, %gs
+ jmp 2b
.previous
/* Call softirq on interrupt stack. Interrupts are off. */
ENTRY(do_softirq_own_stack)
- CFI_STARTPROC
- pushq_cfi %rbp
- CFI_REL_OFFSET rbp,0
- mov %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- incl PER_CPU_VAR(irq_count)
- cmove PER_CPU_VAR(irq_stack_ptr),%rsp
- push %rbp # backlink for old unwinder
- call __do_softirq
+ pushq %rbp
+ mov %rsp, %rbp
+ incl PER_CPU_VAR(irq_count)
+ cmove PER_CPU_VAR(irq_stack_ptr), %rsp
+ push %rbp /* frame pointer backlink */
+ call __do_softirq
leaveq
- CFI_RESTORE rbp
- CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET -8
- decl PER_CPU_VAR(irq_count)
+ decl PER_CPU_VAR(irq_count)
ret
- CFI_ENDPROC
END(do_softirq_own_stack)
#ifdef CONFIG_XEN
@@ -1146,29 +989,24 @@ idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
* existing activation in its critical region -- if so, we pop the current
* activation and restart the handler using the previous one.
*/
-ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
- CFI_STARTPROC
+ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
+
/*
* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
* see the correct pointer to the pt_regs
*/
- movq %rdi, %rsp # we don't return, adjust the stack frame
- CFI_ENDPROC
- DEFAULT_FRAME
-11: incl PER_CPU_VAR(irq_count)
- movq %rsp,%rbp
- CFI_DEF_CFA_REGISTER rbp
- cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
- pushq %rbp # backlink for old unwinder
- call xen_evtchn_do_upcall
- popq %rsp
- CFI_DEF_CFA_REGISTER rsp
- decl PER_CPU_VAR(irq_count)
+ movq %rdi, %rsp /* we don't return, adjust the stack frame */
+11: incl PER_CPU_VAR(irq_count)
+ movq %rsp, %rbp
+ cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp
+ pushq %rbp /* frame pointer backlink */
+ call xen_evtchn_do_upcall
+ popq %rsp
+ decl PER_CPU_VAR(irq_count)
#ifndef CONFIG_PREEMPT
- call xen_maybe_preempt_hcall
+ call xen_maybe_preempt_hcall
#endif
- jmp error_exit
- CFI_ENDPROC
+ jmp error_exit
END(xen_do_hypervisor_callback)
/*
@@ -1185,51 +1023,35 @@ END(xen_do_hypervisor_callback)
* with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
- INTR_FRAME 1 (6*8)
- /*CFI_REL_OFFSET gs,GS*/
- /*CFI_REL_OFFSET fs,FS*/
- /*CFI_REL_OFFSET es,ES*/
- /*CFI_REL_OFFSET ds,DS*/
- CFI_REL_OFFSET r11,8
- CFI_REL_OFFSET rcx,0
- movl %ds,%ecx
- cmpw %cx,0x10(%rsp)
- CFI_REMEMBER_STATE
- jne 1f
- movl %es,%ecx
- cmpw %cx,0x18(%rsp)
- jne 1f
- movl %fs,%ecx
- cmpw %cx,0x20(%rsp)
- jne 1f
- movl %gs,%ecx
- cmpw %cx,0x28(%rsp)
- jne 1f
+ movl %ds, %ecx
+ cmpw %cx, 0x10(%rsp)
+ jne 1f
+ movl %es, %ecx
+ cmpw %cx, 0x18(%rsp)
+ jne 1f
+ movl %fs, %ecx
+ cmpw %cx, 0x20(%rsp)
+ jne 1f
+ movl %gs, %ecx
+ cmpw %cx, 0x28(%rsp)
+ jne 1f
/* All segments match their saved values => Category 2 (Bad IRET). */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $0 /* RIP */
- pushq_cfi %r11
- pushq_cfi %rcx
- jmp general_protection
- CFI_RESTORE_STATE
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ pushq $0 /* RIP */
+ pushq %r11
+ pushq %rcx
+ jmp general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
- movq (%rsp),%rcx
- CFI_RESTORE rcx
- movq 8(%rsp),%r11
- CFI_RESTORE r11
- addq $0x30,%rsp
- CFI_ADJUST_CFA_OFFSET -0x30
- pushq_cfi $-1 /* orig_ax = -1 => not a system call */
+ movq (%rsp), %rcx
+ movq 8(%rsp), %r11
+ addq $0x30, %rsp
+ pushq $-1 /* orig_ax = -1 => not a system call */
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
- jmp error_exit
- CFI_ENDPROC
+ jmp error_exit
END(xen_failsafe_callback)
apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
@@ -1242,21 +1064,25 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
hyperv_callback_vector hyperv_vector_handler
#endif /* CONFIG_HYPERV */
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
-idtentry stack_segment do_stack_segment has_error_code=1
+idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
+idtentry stack_segment do_stack_segment has_error_code=1
+
#ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
+idtentry xen_debug do_debug has_error_code=0
+idtentry xen_int3 do_int3 has_error_code=0
+idtentry xen_stack_segment do_stack_segment has_error_code=1
#endif
-idtentry general_protection do_general_protection has_error_code=1
-trace_idtentry page_fault do_page_fault has_error_code=1
+
+idtentry general_protection do_general_protection has_error_code=1
+trace_idtentry page_fault do_page_fault has_error_code=1
+
#ifdef CONFIG_KVM_GUEST
-idtentry async_page_fault do_async_page_fault has_error_code=1
+idtentry async_page_fault do_async_page_fault has_error_code=1
#endif
+
#ifdef CONFIG_X86_MCE
-idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
+idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
/*
@@ -1265,19 +1091,17 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(
* Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
- XCPT_FRAME 1 15*8
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
SWAPGS
- xorl %ebx,%ebx
+ xorl %ebx, %ebx
1: ret
- CFI_ENDPROC
END(paranoid_entry)
/*
@@ -1289,17 +1113,17 @@ END(paranoid_entry)
* in syscall entry), so checking for preemption here would
* be complicated. Fortunately, we there's no good reason
* to try to handle preemption here.
+ *
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
- DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
- testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_exit_no_swapgs
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz paranoid_exit_no_swapgs
TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
+ jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
paranoid_exit_restore:
@@ -1307,24 +1131,24 @@ paranoid_exit_restore:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
- CFI_ENDPROC
END(paranoid_exit)
/*
* Save all registers in pt_regs, and switch gs if needed.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Return: EBX=0: came from user mode; EBX=1: otherwise
*/
ENTRY(error_entry)
- XCPT_FRAME 1 15*8
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
- xorl %ebx,%ebx
+ xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz error_kernelspace
-error_swapgs:
+
+ /* We entered from user mode */
SWAPGS
-error_sti:
+
+error_entry_done:
TRACE_IRQS_OFF
ret
@@ -1335,56 +1159,66 @@ error_sti:
* for these here too.
*/
error_kernelspace:
- CFI_REL_OFFSET rcx, RCX+8
- incl %ebx
- leaq native_irq_return_iret(%rip),%rcx
- cmpq %rcx,RIP+8(%rsp)
- je error_bad_iret
- movl %ecx,%eax /* zero extend */
- cmpq %rax,RIP+8(%rsp)
- je bstep_iret
- cmpq $gs_change,RIP+8(%rsp)
- je error_swapgs
- jmp error_sti
+ incl %ebx
+ leaq native_irq_return_iret(%rip), %rcx
+ cmpq %rcx, RIP+8(%rsp)
+ je error_bad_iret
+ movl %ecx, %eax /* zero extend */
+ cmpq %rax, RIP+8(%rsp)
+ je bstep_iret
+ cmpq $gs_change, RIP+8(%rsp)
+ jne error_entry_done
+
+ /*
+ * hack: gs_change can fail with user gsbase. If this happens, fix up
+ * gsbase and proceed. We'll fix up the exception and land in
+ * gs_change's error handler with kernel gsbase.
+ */
+ SWAPGS
+ jmp error_entry_done
bstep_iret:
/* Fix truncated RIP */
- movq %rcx,RIP+8(%rsp)
+ movq %rcx, RIP+8(%rsp)
/* fall through */
error_bad_iret:
+ /*
+ * We came from an IRET to user mode, so we have user gsbase.
+ * Switch to kernel gsbase:
+ */
SWAPGS
- mov %rsp,%rdi
- call fixup_bad_iret
- mov %rax,%rsp
- decl %ebx /* Return to usergs */
- jmp error_sti
- CFI_ENDPROC
+
+ /*
+ * Pretend that the exception came from user mode: set up pt_regs
+ * as if we faulted immediately after IRET and clear EBX so that
+ * error_exit knows that we will be returning to user mode.
+ */
+ mov %rsp, %rdi
+ call fixup_bad_iret
+ mov %rax, %rsp
+ decl %ebx
+ jmp error_entry_done
END(error_entry)
-/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
+/*
+ * On entry, EBS is a "return to kernel mode" flag:
+ * 1: already in kernel mode, don't need SWAPGS
+ * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode
+ */
ENTRY(error_exit)
- DEFAULT_FRAME
- movl %ebx,%eax
+ movl %ebx, %eax
RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- GET_THREAD_INFO(%rcx)
- testl %eax,%eax
- jnz retint_kernel
- LOCKDEP_SYS_EXIT_IRQ
- movl TI_flags(%rcx),%edx
- movl $_TIF_WORK_MASK,%edi
- andl %edi,%edx
- jnz retint_careful
- jmp retint_swapgs
- CFI_ENDPROC
+ testl %eax, %eax
+ jnz retint_kernel
+ jmp retint_user
END(error_exit)
/* Runs on exception stack */
ENTRY(nmi)
- INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
/*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
@@ -1419,22 +1253,21 @@ ENTRY(nmi)
*/
/* Use %rdx as our temp variable throughout */
- pushq_cfi %rdx
- CFI_REL_OFFSET rdx, 0
+ pushq %rdx
/*
* If %cs was not the kernel segment, then the NMI triggered in user
* space, which means it is definitely not nested.
*/
- cmpl $__KERNEL_CS, 16(%rsp)
- jne first_nmi
+ cmpl $__KERNEL_CS, 16(%rsp)
+ jne first_nmi
/*
* Check the special variable on the stack to see if NMIs are
* executing.
*/
- cmpl $1, -8(%rsp)
- je nested_nmi
+ cmpl $1, -8(%rsp)
+ je nested_nmi
/*
* Now test if the previous stack was an NMI stack.
@@ -1448,51 +1281,46 @@ ENTRY(nmi)
cmpq %rdx, 4*8(%rsp)
/* If the stack pointer is above the NMI stack, this is a normal NMI */
ja first_nmi
+
subq $EXCEPTION_STKSZ, %rdx
cmpq %rdx, 4*8(%rsp)
/* If it is below the NMI stack, it is a normal NMI */
jb first_nmi
/* Ah, it is within the NMI stack, treat it as nested */
- CFI_REMEMBER_STATE
-
nested_nmi:
/*
* Do nothing if we interrupted the fixup in repeat_nmi.
* It's about to repeat the NMI handler, so we are fine
* with ignoring this one.
*/
- movq $repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja 1f
- movq $end_repeat_nmi, %rdx
- cmpq 8(%rsp), %rdx
- ja nested_nmi_out
+ movq $repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja 1f
+ movq $end_repeat_nmi, %rdx
+ cmpq 8(%rsp), %rdx
+ ja nested_nmi_out
1:
/* Set up the interrupted NMIs stack to jump to repeat_nmi */
- leaq -1*8(%rsp), %rdx
- movq %rdx, %rsp
- CFI_ADJUST_CFA_OFFSET 1*8
- leaq -10*8(%rsp), %rdx
- pushq_cfi $__KERNEL_DS
- pushq_cfi %rdx
- pushfq_cfi
- pushq_cfi $__KERNEL_CS
- pushq_cfi $repeat_nmi
+ leaq -1*8(%rsp), %rdx
+ movq %rdx, %rsp
+ leaq -10*8(%rsp), %rdx
+ pushq $__KERNEL_DS
+ pushq %rdx
+ pushfq
+ pushq $__KERNEL_CS
+ pushq $repeat_nmi
/* Put stack back */
- addq $(6*8), %rsp
- CFI_ADJUST_CFA_OFFSET -6*8
+ addq $(6*8), %rsp
nested_nmi_out:
- popq_cfi %rdx
- CFI_RESTORE rdx
+ popq %rdx
/* No need to check faults here */
INTERRUPT_RETURN
- CFI_RESTORE_STATE
first_nmi:
/*
* Because nested NMIs will use the pushed location that we
@@ -1530,23 +1358,18 @@ first_nmi:
* is also used by nested NMIs and can not be trusted on exit.
*/
/* Do not pop rdx, nested NMIs will corrupt that part of the stack */
- movq (%rsp), %rdx
- CFI_RESTORE rdx
+ movq (%rsp), %rdx
/* Set the NMI executing variable on the stack. */
- pushq_cfi $1
+ pushq $1
- /*
- * Leave room for the "copied" frame
- */
- subq $(5*8), %rsp
- CFI_ADJUST_CFA_OFFSET 5*8
+ /* Leave room for the "copied" frame */
+ subq $(5*8), %rsp
/* Copy the stack frame to the Saved frame */
.rept 5
- pushq_cfi 11*8(%rsp)
+ pushq 11*8(%rsp)
.endr
- CFI_DEF_CFA_OFFSET 5*8
/* Everything up to here is safe from nested NMIs */
@@ -1565,16 +1388,14 @@ repeat_nmi:
* is benign for the non-repeat case, where 1 was pushed just above
* to this very stack slot).
*/
- movq $1, 10*8(%rsp)
+ movq $1, 10*8(%rsp)
/* Make another copy, this one may be modified by nested NMIs */
- addq $(10*8), %rsp
- CFI_ADJUST_CFA_OFFSET -10*8
+ addq $(10*8), %rsp
.rept 5
- pushq_cfi -6*8(%rsp)
+ pushq -6*8(%rsp)
.endr
- subq $(5*8), %rsp
- CFI_DEF_CFA_OFFSET 5*8
+ subq $(5*8), %rsp
end_repeat_nmi:
/*
@@ -1582,7 +1403,7 @@ end_repeat_nmi:
* NMI if the first NMI took an exception and reset our iret stack
* so that we repeat another NMI.
*/
- pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK
/*
@@ -1592,8 +1413,7 @@ end_repeat_nmi:
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
*/
- call paranoid_entry
- DEFAULT_FRAME 0
+ call paranoid_entry
/*
* Save off the CR2 register. If we take a page fault in the NMI then
@@ -1604,21 +1424,21 @@ end_repeat_nmi:
* origin fault. Save it off and restore it if it changes.
* Use the r12 callee-saved register.
*/
- movq %cr2, %r12
+ movq %cr2, %r12
/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
- movq %rsp,%rdi
- movq $-1,%rsi
- call do_nmi
+ movq %rsp, %rdi
+ movq $-1, %rsi
+ call do_nmi
/* Did the NMI take a page fault? Restore cr2 if it did */
- movq %cr2, %rcx
- cmpq %rcx, %r12
- je 1f
- movq %r12, %cr2
+ movq %cr2, %rcx
+ cmpq %rcx, %r12
+ je 1f
+ movq %r12, %cr2
1:
- testl %ebx,%ebx /* swapgs needed? */
- jnz nmi_restore
+ testl %ebx, %ebx /* swapgs needed? */
+ jnz nmi_restore
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
@@ -1628,15 +1448,11 @@ nmi_restore:
REMOVE_PT_GPREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */
- movq $0, 5*8(%rsp)
- jmp irq_return
- CFI_ENDPROC
+ movq $0, 5*8(%rsp)
+ INTERRUPT_RETURN
END(nmi)
ENTRY(ignore_sysret)
- CFI_STARTPROC
- mov $-ENOSYS,%eax
+ mov $-ENOSYS, %eax
sysret
- CFI_ENDPROC
END(ignore_sysret)
-
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
new file mode 100644
index 000000000000..bb187a6a877c
--- /dev/null
+++ b/arch/x86/entry/entry_64_compat.S
@@ -0,0 +1,556 @@
+/*
+ * Compatibility mode system call entry point for x86-64.
+ *
+ * Copyright 2000-2002 Andi Kleen, SuSE Labs.
+ */
+#include "calling.h"
+#include <asm/asm-offsets.h>
+#include <asm/current.h>
+#include <asm/errno.h>
+#include <asm/ia32_unistd.h>
+#include <asm/thread_info.h>
+#include <asm/segment.h>
+#include <asm/irqflags.h>
+#include <asm/asm.h>
+#include <asm/smap.h>
+#include <linux/linkage.h>
+#include <linux/err.h>
+
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE 0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+# define sysexit_audit ia32_ret_from_sys_call
+# define sysretl_audit ia32_ret_from_sys_call
+#endif
+
+ .section .entry.text, "ax"
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_usergs_sysret32)
+ swapgs
+ sysretl
+ENDPROC(native_usergs_sysret32)
+#endif
+
+/*
+ * 32-bit SYSENTER instruction entry.
+ *
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(entry_SYSENTER_compat)
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
+ SWAPGS_UNSAFE_STACK
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
+ /* Zero-extending 32-bit regs, do not remove */
+ movl %ebp, %ebp
+ movl %eax, %eax
+
+ movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER32_DS /* pt_regs->ss */
+ pushq %rbp /* pt_regs->sp */
+ pushfq /* pt_regs->flags */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ cld
+ sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
+
+ /*
+ * no need to do an access_ok check here because rbp has been
+ * 32-bit zero extended
+ */
+ ASM_STAC
+1: movl (%rbp), %ebp
+ _ASM_EXTABLE(1b, ia32_badarg)
+ ASM_CLAC
+
+ /*
+ * Sysenter doesn't filter flags, so we need to clear NT
+ * ourselves. To save a few cycles, we can check whether
+ * NT was set instead of doing an unconditional popfq.
+ */
+ testl $X86_EFLAGS_NT, EFLAGS(%rsp)
+ jnz sysenter_fix_flags
+sysenter_flags_fixed:
+
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz sysenter_tracesys
+
+sysenter_do_call:
+ /* 32-bit syscall -> 64-bit C ABI argument conversion */
+ movl %edi, %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
+ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx, %edi /* arg1 */
+ movl %edx, %edx /* arg3 (zero extension) */
+sysenter_dispatch:
+ cmpq $(IA32_NR_syscalls-1), %rax
+ ja 1f
+ call *ia32_sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
+1:
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz sysexit_audit
+sysexit_from_sys_call:
+ /*
+ * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
+ * NMI between STI and SYSEXIT has poorly specified behavior,
+ * and and NMI followed by an IRQ with usergs is fatal. So
+ * we just pretend we're using SYSEXIT but we really use
+ * SYSRETL instead.
+ *
+ * This code path is still called 'sysexit' because it pairs
+ * with 'sysenter' and it uses the SYSENTER calling convention.
+ */
+ andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ movl RIP(%rsp), %ecx /* User %eip */
+ RESTORE_RSI_RDI
+ xorl %edx, %edx /* Do not leak kernel information */
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
+ movl EFLAGS(%rsp), %r11d /* User eflags */
+ TRACE_IRQS_ON
+
+ /*
+ * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
+ * since it avoids a dicey window with interrupts enabled.
+ */
+ movl RSP(%rsp), %esp
+
+ /*
+ * USERGS_SYSRET32 does:
+ * gsbase = user's gs base
+ * eip = ecx
+ * rflags = r11
+ * cs = __USER32_CS
+ * ss = __USER_DS
+ *
+ * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
+ *
+ * pop %ebp
+ * pop %edx
+ * pop %ecx
+ *
+ * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
+ * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
+ * address (already known to user code), and R12-R15 are
+ * callee-saved and therefore don't contain any interesting
+ * kernel data.
+ */
+ USERGS_SYSRET32
+
+#ifdef CONFIG_AUDITSYSCALL
+ .macro auditsys_entry_common
+ /*
+ * At this point, registers hold syscall args in the 32-bit syscall ABI:
+ * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
+ *
+ * We want to pass them to __audit_syscall_entry(), which is a 64-bit
+ * C function with 5 parameters, so shuffle them to match what
+ * the function expects: RDI,RSI,RDX,RCX,R8.
+ */
+ movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */
+ xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */
+ /* arg3 (RDX) <= 2nd syscall arg (ECX) */
+ movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */
+ movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */
+ call __audit_syscall_entry
+
+ /*
+ * We are going to jump back to the syscall dispatch code.
+ * Prepare syscall args as required by the 64-bit C ABI.
+ * Registers clobbered by __audit_syscall_entry() are
+ * loaded from pt_regs on stack:
+ */
+ movl ORIG_RAX(%rsp), %eax /* syscall number */
+ movl %ebx, %edi /* arg1 */
+ movl RCX(%rsp), %esi /* arg2 */
+ movl RDX(%rsp), %edx /* arg3 */
+ movl RSI(%rsp), %ecx /* arg4 */
+ movl RDI(%rsp), %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
+ .endm
+
+ .macro auditsys_exit exit
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz ia32_ret_from_sys_call
+ TRACE_IRQS_ON
+ ENABLE_INTERRUPTS(CLBR_NONE)
+ movl %eax, %esi /* second arg, syscall return value */
+ cmpl $-MAX_ERRNO, %eax /* is it an error ? */
+ jbe 1f
+ movslq %eax, %rsi /* if error sign extend to 64 bits */
+1: setbe %al /* 1 if error, 0 if not */
+ movzbl %al, %edi /* zero-extend that into %edi */
+ call __audit_syscall_exit
+ movq RAX(%rsp), %rax /* reload syscall return value */
+ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jz \exit
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ jmp int_with_check
+ .endm
+
+sysenter_auditsys:
+ auditsys_entry_common
+ jmp sysenter_dispatch
+
+sysexit_audit:
+ auditsys_exit sysexit_from_sys_call
+#endif
+
+sysenter_fix_flags:
+ pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+ popfq
+ jmp sysenter_flags_fixed
+
+sysenter_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jz sysenter_auditsys
+#endif
+ SAVE_EXTRA_REGS
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ movq %rsp, %rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+
+ /* Reload arg registers from stack. (see sysenter_tracesys) */
+ movl RCX(%rsp), %ecx
+ movl RDX(%rsp), %edx
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
+ movl %eax, %eax /* zero extension */
+
+ RESTORE_EXTRA_REGS
+ jmp sysenter_do_call
+ENDPROC(entry_SYSENTER_compat)
+
+/*
+ * 32-bit SYSCALL instruction entry.
+ *
+ * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
+ * Arguments:
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ *
+ * This is purely a fast path. For anything complicated we use the int 0x80
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
+ENTRY(entry_SYSCALL_compat)
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
+ SWAPGS_UNSAFE_STACK
+ movl %esp, %r8d
+ movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
+ /* Zero-extending 32-bit regs, do not remove */
+ movl %eax, %eax
+
+ /* Construct struct pt_regs on stack */
+ pushq $__USER32_DS /* pt_regs->ss */
+ pushq %r8 /* pt_regs->sp */
+ pushq %r11 /* pt_regs->flags */
+ pushq $__USER32_CS /* pt_regs->cs */
+ pushq %rcx /* pt_regs->ip */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rbp /* pt_regs->cx */
+ movl %ebp, %ecx
+ pushq $-ENOSYS /* pt_regs->ax */
+ sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
+
+ /*
+ * No need to do an access_ok check here because r8 has been
+ * 32-bit zero extended:
+ */
+ ASM_STAC
+1: movl (%r8), %ebp
+ _ASM_EXTABLE(1b, ia32_badarg)
+ ASM_CLAC
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz cstar_tracesys
+
+cstar_do_call:
+ /* 32-bit syscall -> 64-bit C ABI argument conversion */
+ movl %edi, %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
+ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx, %edi /* arg1 */
+ movl %edx, %edx /* arg3 (zero extension) */
+
+cstar_dispatch:
+ cmpq $(IA32_NR_syscalls-1), %rax
+ ja 1f
+
+ call *ia32_sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
+1:
+ movl RCX(%rsp), %ebp
+ DISABLE_INTERRUPTS(CLBR_NONE)
+ TRACE_IRQS_OFF
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz sysretl_audit
+
+sysretl_from_sys_call:
+ andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ RESTORE_RSI_RDI_RDX
+ movl RIP(%rsp), %ecx
+ movl EFLAGS(%rsp), %r11d
+ xorq %r10, %r10
+ xorq %r9, %r9
+ xorq %r8, %r8
+ TRACE_IRQS_ON
+ movl RSP(%rsp), %esp
+ /*
+ * 64-bit->32-bit SYSRET restores eip from ecx,
+ * eflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * (Note: 32-bit->32-bit SYSRET is different: since r11
+ * does not exist, it merely sets eflags.IF=1).
+ *
+ * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
+ * descriptor is not reinitialized. This means that we must
+ * avoid SYSRET with SS == NULL, which could happen if we schedule,
+ * exit the kernel, and re-enter using an interrupt vector. (All
+ * interrupt entries on x86_64 set SS to NULL.) We prevent that
+ * from happening by reloading SS in __switch_to.
+ */
+ USERGS_SYSRET32
+
+#ifdef CONFIG_AUDITSYSCALL
+cstar_auditsys:
+ auditsys_entry_common
+ jmp cstar_dispatch
+
+sysretl_audit:
+ auditsys_exit sysretl_from_sys_call
+#endif
+
+cstar_tracesys:
+#ifdef CONFIG_AUDITSYSCALL
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jz cstar_auditsys
+#endif
+ SAVE_EXTRA_REGS
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ movq %rsp, %rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+
+ /* Reload arg registers from stack. (see sysenter_tracesys) */
+ movl RCX(%rsp), %ecx
+ movl RDX(%rsp), %edx
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
+ movl %eax, %eax /* zero extension */
+
+ RESTORE_EXTRA_REGS
+ jmp cstar_do_call
+END(entry_SYSCALL_compat)
+
+ia32_badarg:
+ ASM_CLAC
+ movq $-EFAULT, RAX(%rsp)
+ia32_ret_from_sys_call:
+ xorl %eax, %eax /* Do not leak kernel information */
+ movq %rax, R11(%rsp)
+ movq %rax, R10(%rsp)
+ movq %rax, R9(%rsp)
+ movq %rax, R8(%rsp)
+ jmp int_ret_from_sys_call
+
+/*
+ * Emulated IA32 system calls via int 0x80.
+ *
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6 (note: not saved in the stack frame, should not be touched)
+ *
+ * Notes:
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
+ * Arguments are zero extended. For system calls that want sign extension and
+ * take long arguments a wrapper is needed. Most calls can just be called
+ * directly.
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
+
+ENTRY(entry_INT80_compat)
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
+ SWAPGS
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
+ /* Zero-extending 32-bit regs, do not remove */
+ movl %eax, %eax
+
+ /* Construct struct pt_regs on stack (iret frame is already on stack) */
+ pushq %rax /* pt_regs->orig_ax */
+ pushq %rdi /* pt_regs->di */
+ pushq %rsi /* pt_regs->si */
+ pushq %rdx /* pt_regs->dx */
+ pushq %rcx /* pt_regs->cx */
+ pushq $-ENOSYS /* pt_regs->ax */
+ pushq $0 /* pt_regs->r8 */
+ pushq $0 /* pt_regs->r9 */
+ pushq $0 /* pt_regs->r10 */
+ pushq $0 /* pt_regs->r11 */
+ cld
+ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
+
+ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz ia32_tracesys
+
+ia32_do_call:
+ /* 32-bit syscall -> 64-bit C ABI argument conversion */
+ movl %edi, %r8d /* arg5 */
+ movl %ebp, %r9d /* arg6 */
+ xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx, %edi /* arg1 */
+ movl %edx, %edx /* arg3 (zero extension) */
+ cmpq $(IA32_NR_syscalls-1), %rax
+ ja 1f
+
+ call *ia32_sys_call_table(, %rax, 8)
+ movq %rax, RAX(%rsp)
+1:
+ jmp int_ret_from_sys_call
+
+ia32_tracesys:
+ SAVE_EXTRA_REGS
+ movq %rsp, %rdi /* &pt_regs -> arg1 */
+ call syscall_trace_enter
+ /*
+ * Reload arg registers from stack in case ptrace changed them.
+ * Don't reload %eax because syscall_trace_enter() returned
+ * the %rax value we should see. But do truncate it to 32 bits.
+ * If it's -1 to make us punt the syscall, then (u32)-1 is still
+ * an appropriately invalid value.
+ */
+ movl RCX(%rsp), %ecx
+ movl RDX(%rsp), %edx
+ movl RSI(%rsp), %esi
+ movl RDI(%rsp), %edi
+ movl %eax, %eax /* zero extension */
+ RESTORE_EXTRA_REGS
+ jmp ia32_do_call
+END(entry_INT80_compat)
+
+ .macro PTREGSCALL label, func
+ ALIGN
+GLOBAL(\label)
+ leaq \func(%rip), %rax
+ jmp ia32_ptregs_common
+ .endm
+
+ PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
+ PTREGSCALL stub32_sigreturn, sys32_sigreturn
+ PTREGSCALL stub32_fork, sys_fork
+ PTREGSCALL stub32_vfork, sys_vfork
+
+ ALIGN
+GLOBAL(stub32_clone)
+ leaq sys_clone(%rip), %rax
+ /*
+ * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
+ * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
+ *
+ * The native 64-bit kernel's sys_clone() implements the latter,
+ * so we need to swap arguments here before calling it:
+ */
+ xchg %r8, %rcx
+ jmp ia32_ptregs_common
+
+ ALIGN
+ia32_ptregs_common:
+ SAVE_EXTRA_REGS 8
+ call *%rax
+ RESTORE_EXTRA_REGS 8
+ ret
+END(ia32_ptregs_common)
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/entry/syscall_32.c
index 3777189c4a19..8ea34f94e973 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -10,7 +10,7 @@
#else
#define SYM(sym, compat) sym
#define ia32_sys_call_table sys_call_table
-#define __NR_ia32_syscall_max __NR_syscall_max
+#define __NR_syscall_compat_max __NR_syscall_max
#endif
#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
@@ -23,11 +23,11 @@ typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
-__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
- [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
+ [0 ... __NR_syscall_compat_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/entry/syscall_64.c
index 4ac730b37f0b..4ac730b37f0b 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
diff --git a/arch/x86/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index a55abb9f6c5e..57aa59fd140c 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -1,5 +1,5 @@
-out := $(obj)/../include/generated/asm
-uapi := $(obj)/../include/generated/uapi/asm
+out := $(obj)/../../include/generated/asm
+uapi := $(obj)/../../include/generated/uapi/asm
# Create output directory if not already present
_dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f9d28d..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 9ef32d5f1b19..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
diff --git a/arch/x86/syscalls/syscallhdr.sh b/arch/x86/entry/syscalls/syscallhdr.sh
index 31fd5f1f38f7..31fd5f1f38f7 100644
--- a/arch/x86/syscalls/syscallhdr.sh
+++ b/arch/x86/entry/syscalls/syscallhdr.sh
diff --git a/arch/x86/syscalls/syscalltbl.sh b/arch/x86/entry/syscalls/syscalltbl.sh
index 0e7f8ec071e7..0e7f8ec071e7 100644
--- a/arch/x86/syscalls/syscalltbl.sh
+++ b/arch/x86/entry/syscalls/syscalltbl.sh
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/entry/thunk_32.S
index 5eb715087b80..e9acf5f4fc92 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -6,16 +6,14 @@
*/
#include <linux/linkage.h>
#include <asm/asm.h>
- #include <asm/dwarf2.h>
/* put return address in eax (arg1) */
.macro THUNK name, func, put_ret_addr_in_eax=0
.globl \name
\name:
- CFI_STARTPROC
- pushl_cfi_reg eax
- pushl_cfi_reg ecx
- pushl_cfi_reg edx
+ pushl %eax
+ pushl %ecx
+ pushl %edx
.if \put_ret_addr_in_eax
/* Place EIP in the arg1 */
@@ -23,11 +21,10 @@
.endif
call \func
- popl_cfi_reg edx
- popl_cfi_reg ecx
- popl_cfi_reg eax
+ popl %edx
+ popl %ecx
+ popl %eax
ret
- CFI_ENDPROC
_ASM_NOKPROBE(\name)
.endm
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/entry/thunk_64.S
index f89ba4e93025..3e95681b4e2d 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -6,35 +6,32 @@
* Subject to the GNU public license, v.2. No warranty of any kind.
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
+#include "calling.h"
#include <asm/asm.h>
/* rdi: arg1 ... normal C conventions. rax is saved/restored. */
.macro THUNK name, func, put_ret_addr_in_rdi=0
.globl \name
\name:
- CFI_STARTPROC
/* this one pushes 9 elems, the next one would be %rIP */
- pushq_cfi_reg rdi
- pushq_cfi_reg rsi
- pushq_cfi_reg rdx
- pushq_cfi_reg rcx
- pushq_cfi_reg rax
- pushq_cfi_reg r8
- pushq_cfi_reg r9
- pushq_cfi_reg r10
- pushq_cfi_reg r11
+ pushq %rdi
+ pushq %rsi
+ pushq %rdx
+ pushq %rcx
+ pushq %rax
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
.if \put_ret_addr_in_rdi
/* 9*8(%rsp) is return addr on stack */
- movq_cfi_restore 9*8, rdi
+ movq 9*8(%rsp), %rdi
.endif
call \func
jmp restore
- CFI_ENDPROC
_ASM_NOKPROBE(\name)
.endm
@@ -57,19 +54,16 @@
#if defined(CONFIG_TRACE_IRQFLAGS) \
|| defined(CONFIG_DEBUG_LOCK_ALLOC) \
|| defined(CONFIG_PREEMPT)
- CFI_STARTPROC
- CFI_ADJUST_CFA_OFFSET 9*8
restore:
- popq_cfi_reg r11
- popq_cfi_reg r10
- popq_cfi_reg r9
- popq_cfi_reg r8
- popq_cfi_reg rax
- popq_cfi_reg rcx
- popq_cfi_reg rdx
- popq_cfi_reg rsi
- popq_cfi_reg rdi
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+ popq %rax
+ popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
ret
- CFI_ENDPROC
_ASM_NOKPROBE(restore)
#endif
diff --git a/arch/x86/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore
index aae8ffdd5880..aae8ffdd5880 100644
--- a/arch/x86/vdso/.gitignore
+++ b/arch/x86/entry/vdso/.gitignore
diff --git a/arch/x86/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index e97032069f88..e97032069f88 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh
index 7ee90a9b549d..7ee90a9b549d 100755
--- a/arch/x86/vdso/checkundef.sh
+++ b/arch/x86/entry/vdso/checkundef.sh
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9793322751e0..9793322751e0 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S
index de2c921025f5..de2c921025f5 100644
--- a/arch/x86/vdso/vdso-layout.lds.S
+++ b/arch/x86/entry/vdso/vdso-layout.lds.S
diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S
index 79a071e4357e..79a071e4357e 100644
--- a/arch/x86/vdso/vdso-note.S
+++ b/arch/x86/entry/vdso/vdso-note.S
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index 6807932643c2..6807932643c2 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 8627db24a7f6..8627db24a7f6 100644
--- a/arch/x86/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h
index 0224987556ce..0224987556ce 100644
--- a/arch/x86/vdso/vdso2c.h
+++ b/arch/x86/entry/vdso/vdso2c.h
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index e904c270573b..e904c270573b 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore
index e45fba9d0ced..e45fba9d0ced 100644
--- a/arch/x86/vdso/vdso32/.gitignore
+++ b/arch/x86/entry/vdso/vdso32/.gitignore
diff --git a/arch/x86/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S
index b15b7c01aedb..b15b7c01aedb 100644
--- a/arch/x86/vdso/vdso32/int80.S
+++ b/arch/x86/entry/vdso/vdso32/int80.S
diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S
index c83f25734696..c83f25734696 100644
--- a/arch/x86/vdso/vdso32/note.S
+++ b/arch/x86/entry/vdso/vdso32/note.S
diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S
index d7ec4e251c0a..d7ec4e251c0a 100644
--- a/arch/x86/vdso/vdso32/sigreturn.S
+++ b/arch/x86/entry/vdso/vdso32/sigreturn.S
diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S
index 6b286bb5251c..6b286bb5251c 100644
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/entry/vdso/vdso32/syscall.S
diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S
index e354bceee0e0..e354bceee0e0 100644
--- a/arch/x86/vdso/vdso32/sysenter.S
+++ b/arch/x86/entry/vdso/vdso32/sysenter.S
diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
index 175cc72c0f68..175cc72c0f68 100644
--- a/arch/x86/vdso/vdso32/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c
diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
index 541468e25265..541468e25265 100644
--- a/arch/x86/vdso/vdso32/vdso-fakesections.c
+++ b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c
diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 31056cf294bf..31056cf294bf 100644
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 697c11ece90c..697c11ece90c 100644
--- a/arch/x86/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f4ce9a..8ec3d1f4ce9a 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
diff --git a/arch/x86/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1c9f750c3859..1c9f750c3859 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile
new file mode 100644
index 000000000000..a9f4856f622a
--- /dev/null
+++ b/arch/x86/entry/vsyscall/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for the x86 low level vsyscall code
+#
+obj-y := vsyscall_gtod.o
+
+obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
+
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 2dcc6ff6fdcc..2dcc6ff6fdcc 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
index c9596a9af159..c9596a9af159 100644
--- a/arch/x86/kernel/vsyscall_emu_64.S
+++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c
index 51e330416995..51e330416995 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c
diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h
index a8b2edec54fe..9dd7359a38a8 100644
--- a/arch/x86/kernel/vsyscall_trace.h
+++ b/arch/x86/entry/vsyscall/vsyscall_trace.h
@@ -24,6 +24,6 @@ TRACE_EVENT(emulate_vsyscall,
#endif
#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH ../../arch/x86/kernel
+#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/
#define TRACE_INCLUDE_FILE vsyscall_trace
#include <trace/define_trace.h>
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index bb635c641869..cd4339bae066 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -2,7 +2,7 @@
# Makefile for the ia32 kernel emulation subsystem.
#
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
+obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o
obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
deleted file mode 100644
index 63450a596800..000000000000
--- a/arch/x86/ia32/ia32entry.S
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Compatibility mode system call entry point for x86-64.
- *
- * Copyright 2000-2002 Andi Kleen, SuSE Labs.
- */
-
-#include <asm/dwarf2.h>
-#include <asm/calling.h>
-#include <asm/asm-offsets.h>
-#include <asm/current.h>
-#include <asm/errno.h>
-#include <asm/ia32_unistd.h>
-#include <asm/thread_info.h>
-#include <asm/segment.h>
-#include <asm/irqflags.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-#include <linux/linkage.h>
-#include <linux/err.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysexit_audit ia32_ret_from_sys_call
-#define sysretl_audit ia32_ret_from_sys_call
-#endif
-
- .section .entry.text, "ax"
-
- /* clobbers %rax */
- .macro CLEAR_RREGS _r9=rax
- xorl %eax,%eax
- movq %rax,R11(%rsp)
- movq %rax,R10(%rsp)
- movq %\_r9,R9(%rsp)
- movq %rax,R8(%rsp)
- .endm
-
- /*
- * Reload arg registers from stack in case ptrace changed them.
- * We don't reload %eax because syscall_trace_enter() returned
- * the %rax value we should see. Instead, we just truncate that
- * value to 32 bits again as we did on entry from user mode.
- * If it's a new value set by user_regset during entry tracing,
- * this matches the normal truncation of the user-mode value.
- * If it's -1 to make us punt the syscall, then (u32)-1 is still
- * an appropriately invalid value.
- */
- .macro LOAD_ARGS32 _r9=0
- .if \_r9
- movl R9(%rsp),%r9d
- .endif
- movl RCX(%rsp),%ecx
- movl RDX(%rsp),%edx
- movl RSI(%rsp),%esi
- movl RDI(%rsp),%edi
- movl %eax,%eax /* zero extension */
- .endm
-
- .macro CFI_STARTPROC32 simple
- CFI_STARTPROC \simple
- CFI_UNDEFINED r8
- CFI_UNDEFINED r9
- CFI_UNDEFINED r10
- CFI_UNDEFINED r11
- CFI_UNDEFINED r12
- CFI_UNDEFINED r13
- CFI_UNDEFINED r14
- CFI_UNDEFINED r15
- .endm
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_usergs_sysret32)
- swapgs
- sysretl
-ENDPROC(native_usergs_sysret32)
-#endif
-
-/*
- * 32bit SYSENTER instruction entry.
- *
- * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
- * IF and VM in rflags are cleared (IOW: interrupts are off).
- * SYSENTER does not save anything on the stack,
- * and does not save old rip (!!!) and rflags.
- *
- * Arguments:
- * eax system call number
- * ebx arg1
- * ecx arg2
- * edx arg3
- * esi arg4
- * edi arg5
- * ebp user stack
- * 0(%ebp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,0
- CFI_REGISTER rsp,rbp
-
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
- SWAPGS_UNSAFE_STACK
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- ENABLE_INTERRUPTS(CLBR_NONE)
-
- /* Zero-extending 32-bit regs, do not remove */
- movl %ebp, %ebp
- movl %eax, %eax
-
- movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
- CFI_REGISTER rip,r10
-
- /* Construct struct pt_regs on stack */
- pushq_cfi $__USER32_DS /* pt_regs->ss */
- pushq_cfi %rbp /* pt_regs->sp */
- CFI_REL_OFFSET rsp,0
- pushfq_cfi /* pt_regs->flags */
- pushq_cfi $__USER32_CS /* pt_regs->cs */
- pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */
- CFI_REL_OFFSET rip,0
- pushq_cfi_reg rax /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi $-ENOSYS /* pt_regs->ax */
- cld
- sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 10*8
-
- /*
- * no need to do an access_ok check here because rbp has been
- * 32bit zero extended
- */
- ASM_STAC
-1: movl (%rbp),%ebp
- _ASM_EXTABLE(1b,ia32_badarg)
- ASM_CLAC
-
- /*
- * Sysenter doesn't filter flags, so we need to clear NT
- * ourselves. To save a few cycles, we can check whether
- * NT was set instead of doing an unconditional popfq.
- */
- testl $X86_EFLAGS_NT,EFLAGS(%rsp)
- jnz sysenter_fix_flags
-sysenter_flags_fixed:
-
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- CFI_REMEMBER_STATE
- jnz sysenter_tracesys
-sysenter_do_call:
- /* 32bit syscall -> 64bit C ABI argument conversion */
- movl %edi,%r8d /* arg5 */
- movl %ebp,%r9d /* arg6 */
- xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
- movl %ebx,%edi /* arg1 */
- movl %edx,%edx /* arg3 (zero extension) */
-sysenter_dispatch:
- cmpq $(IA32_NR_syscalls-1),%rax
- ja 1f
- call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
-1:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysexit_audit
-sysexit_from_sys_call:
- /*
- * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
- * NMI between STI and SYSEXIT has poorly specified behavior,
- * and and NMI followed by an IRQ with usergs is fatal. So
- * we just pretend we're using SYSEXIT but we really use
- * SYSRETL instead.
- *
- * This code path is still called 'sysexit' because it pairs
- * with 'sysenter' and it uses the SYSENTER calling convention.
- */
- andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RIP(%rsp),%ecx /* User %eip */
- CFI_REGISTER rip,rcx
- RESTORE_RSI_RDI
- xorl %edx,%edx /* avoid info leaks */
- xorq %r8,%r8
- xorq %r9,%r9
- xorq %r10,%r10
- movl EFLAGS(%rsp),%r11d /* User eflags */
- /*CFI_RESTORE rflags*/
- TRACE_IRQS_ON
-
- /*
- * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
- * since it avoids a dicey window with interrupts enabled.
- */
- movl RSP(%rsp),%esp
-
- /*
- * USERGS_SYSRET32 does:
- * gsbase = user's gs base
- * eip = ecx
- * rflags = r11
- * cs = __USER32_CS
- * ss = __USER_DS
- *
- * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
- *
- * pop %ebp
- * pop %edx
- * pop %ecx
- *
- * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
- * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
- * address (already known to user code), and R12-R15 are
- * callee-saved and therefore don't contain any interesting
- * kernel data.
- */
- USERGS_SYSRET32
-
- CFI_RESTORE_STATE
-
-#ifdef CONFIG_AUDITSYSCALL
- .macro auditsys_entry_common
- movl %esi,%r8d /* 5th arg: 4th syscall arg */
- movl %ecx,%r9d /*swap with edx*/
- movl %edx,%ecx /* 4th arg: 3rd syscall arg */
- movl %r9d,%edx /* 3rd arg: 2nd syscall arg */
- movl %ebx,%esi /* 2nd arg: 1st syscall arg */
- movl %eax,%edi /* 1st arg: syscall number */
- call __audit_syscall_entry
- movl ORIG_RAX(%rsp),%eax /* reload syscall number */
- movl %ebx,%edi /* reload 1st syscall arg */
- movl RCX(%rsp),%esi /* reload 2nd syscall arg */
- movl RDX(%rsp),%edx /* reload 3rd syscall arg */
- movl RSI(%rsp),%ecx /* reload 4th syscall arg */
- movl RDI(%rsp),%r8d /* reload 5th syscall arg */
- .endm
-
- .macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_ret_from_sys_call
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- movl %eax,%esi /* second arg, syscall return value */
- cmpl $-MAX_ERRNO,%eax /* is it an error ? */
- jbe 1f
- movslq %eax, %rsi /* if error sign extend to 64 bits */
-1: setbe %al /* 1 if error, 0 if not */
- movzbl %al,%edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movq RAX(%rsp),%rax /* reload syscall return value */
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz \exit
- CLEAR_RREGS
- jmp int_with_check
- .endm
-
-sysenter_auditsys:
- auditsys_entry_common
- movl %ebp,%r9d /* reload 6th syscall arg */
- jmp sysenter_dispatch
-
-sysexit_audit:
- auditsys_exit sysexit_from_sys_call
-#endif
-
-sysenter_fix_flags:
- pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
- popfq_cfi
- jmp sysenter_flags_fixed
-
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz sysenter_auditsys
-#endif
- SAVE_EXTRA_REGS
- CLEAR_RREGS
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
- RESTORE_EXTRA_REGS
- jmp sysenter_do_call
- CFI_ENDPROC
-ENDPROC(ia32_sysenter_target)
-
-/*
- * 32bit SYSCALL instruction entry.
- *
- * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
- * then loads new ss, cs, and rip from previously programmed MSRs.
- * rflags gets masked by a value from another MSR (so CLD and CLAC
- * are not needed). SYSCALL does not save anything on the stack
- * and does not change rsp.
- *
- * Note: rflags saving+masking-with-MSR happens only in Long mode
- * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
- * Don't get confused: rflags saving+masking depends on Long Mode Active bit
- * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
- * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
- *
- * Arguments:
- * eax system call number
- * ecx return address
- * ebx arg1
- * ebp arg2 (note: not saved in the stack frame, should not be touched)
- * edx arg3
- * esi arg4
- * edi arg5
- * esp user stack
- * 0(%esp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
-ENTRY(ia32_cstar_target)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,0
- CFI_REGISTER rip,rcx
- /*CFI_REGISTER rflags,r11*/
-
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
- SWAPGS_UNSAFE_STACK
- movl %esp,%r8d
- CFI_REGISTER rsp,r8
- movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp
- ENABLE_INTERRUPTS(CLBR_NONE)
-
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax,%eax
-
- /* Construct struct pt_regs on stack */
- pushq_cfi $__USER32_DS /* pt_regs->ss */
- pushq_cfi %r8 /* pt_regs->sp */
- CFI_REL_OFFSET rsp,0
- pushq_cfi %r11 /* pt_regs->flags */
- pushq_cfi $__USER32_CS /* pt_regs->cs */
- pushq_cfi %rcx /* pt_regs->ip */
- CFI_REL_OFFSET rip,0
- pushq_cfi_reg rax /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rbp /* pt_regs->cx */
- movl %ebp,%ecx
- pushq_cfi $-ENOSYS /* pt_regs->ax */
- sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 10*8
-
- /*
- * no need to do an access_ok check here because r8 has been
- * 32bit zero extended
- */
- ASM_STAC
-1: movl (%r8),%r9d
- _ASM_EXTABLE(1b,ia32_badarg)
- ASM_CLAC
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- CFI_REMEMBER_STATE
- jnz cstar_tracesys
-cstar_do_call:
- /* 32bit syscall -> 64bit C ABI argument conversion */
- movl %edi,%r8d /* arg5 */
- /* r9 already loaded */ /* arg6 */
- xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
- movl %ebx,%edi /* arg1 */
- movl %edx,%edx /* arg3 (zero extension) */
-cstar_dispatch:
- cmpq $(IA32_NR_syscalls-1),%rax
- ja 1f
- call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX(%rsp)
-1:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysretl_audit
-sysretl_from_sys_call:
- andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- RESTORE_RSI_RDI_RDX
- movl RIP(%rsp),%ecx
- CFI_REGISTER rip,rcx
- movl EFLAGS(%rsp),%r11d
- /*CFI_REGISTER rflags,r11*/
- xorq %r10,%r10
- xorq %r9,%r9
- xorq %r8,%r8
- TRACE_IRQS_ON
- movl RSP(%rsp),%esp
- CFI_RESTORE rsp
- /*
- * 64bit->32bit SYSRET restores eip from ecx,
- * eflags from r11 (but RF and VM bits are forced to 0),
- * cs and ss are loaded from MSRs.
- * (Note: 32bit->32bit SYSRET is different: since r11
- * does not exist, it merely sets eflags.IF=1).
- *
- * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
- * descriptor is not reinitialized. This means that we must
- * avoid SYSRET with SS == NULL, which could happen if we schedule,
- * exit the kernel, and re-enter using an interrupt vector. (All
- * interrupt entries on x86_64 set SS to NULL.) We prevent that
- * from happening by reloading SS in __switch_to.
- */
- USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
-cstar_auditsys:
- CFI_RESTORE_STATE
- movl %r9d,R9(%rsp) /* register to be clobbered by call */
- auditsys_entry_common
- movl R9(%rsp),%r9d /* reload 6th syscall arg */
- jmp cstar_dispatch
-
-sysretl_audit:
- auditsys_exit sysretl_from_sys_call
-#endif
-
-cstar_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz cstar_auditsys
-#endif
- xchgl %r9d,%ebp
- SAVE_EXTRA_REGS
- CLEAR_RREGS r9
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
- RESTORE_EXTRA_REGS
- xchgl %ebp,%r9d
- jmp cstar_do_call
-END(ia32_cstar_target)
-
-ia32_badarg:
- ASM_CLAC
- movq $-EFAULT,%rax
- jmp ia32_sysret
- CFI_ENDPROC
-
-/*
- * Emulated IA32 system calls via int 0x80.
- *
- * Arguments:
- * eax system call number
- * ebx arg1
- * ecx arg2
- * edx arg3
- * esi arg4
- * edi arg5
- * ebp arg6 (note: not saved in the stack frame, should not be touched)
- *
- * Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except eax must be saved (but ptrace may violate that).
- * Arguments are zero extended. For system calls that want sign extension and
- * take long arguments a wrapper is needed. Most calls can just be called
- * directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
-
-ENTRY(ia32_syscall)
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,5*8
- /*CFI_REL_OFFSET ss,4*8 */
- CFI_REL_OFFSET rsp,3*8
- /*CFI_REL_OFFSET rflags,2*8 */
- /*CFI_REL_OFFSET cs,1*8 */
- CFI_REL_OFFSET rip,0*8
-
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
- PARAVIRT_ADJUST_EXCEPTION_FRAME
- SWAPGS
- ENABLE_INTERRUPTS(CLBR_NONE)
-
- /* Zero-extending 32-bit regs, do not remove */
- movl %eax,%eax
-
- /* Construct struct pt_regs on stack (iret frame is already on stack) */
- pushq_cfi_reg rax /* pt_regs->orig_ax */
- pushq_cfi_reg rdi /* pt_regs->di */
- pushq_cfi_reg rsi /* pt_regs->si */
- pushq_cfi_reg rdx /* pt_regs->dx */
- pushq_cfi_reg rcx /* pt_regs->cx */
- pushq_cfi $-ENOSYS /* pt_regs->ax */
- cld
- sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
- CFI_ADJUST_CFA_OFFSET 10*8
-
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_tracesys
-ia32_do_call:
- /* 32bit syscall -> 64bit C ABI argument conversion */
- movl %edi,%r8d /* arg5 */
- movl %ebp,%r9d /* arg6 */
- xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
- movl %ebx,%edi /* arg1 */
- movl %edx,%edx /* arg3 (zero extension) */
- cmpq $(IA32_NR_syscalls-1),%rax
- ja 1f
- call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
-ia32_sysret:
- movq %rax,RAX(%rsp)
-1:
-ia32_ret_from_sys_call:
- CLEAR_RREGS
- jmp int_ret_from_sys_call
-
-ia32_tracesys:
- SAVE_EXTRA_REGS
- CLEAR_RREGS
- movq %rsp,%rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
- RESTORE_EXTRA_REGS
- jmp ia32_do_call
- CFI_ENDPROC
-END(ia32_syscall)
-
- .macro PTREGSCALL label, func
- ALIGN
-GLOBAL(\label)
- leaq \func(%rip),%rax
- jmp ia32_ptregs_common
- .endm
-
- CFI_STARTPROC32
-
- PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
- PTREGSCALL stub32_sigreturn, sys32_sigreturn
- PTREGSCALL stub32_fork, sys_fork
- PTREGSCALL stub32_vfork, sys_vfork
-
- ALIGN
-GLOBAL(stub32_clone)
- leaq sys_clone(%rip),%rax
- mov %r8, %rcx
- jmp ia32_ptregs_common
-
- ALIGN
-ia32_ptregs_common:
- CFI_ENDPROC
- CFI_STARTPROC32 simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SIZEOF_PTREGS
- CFI_REL_OFFSET rax,RAX
- CFI_REL_OFFSET rcx,RCX
- CFI_REL_OFFSET rdx,RDX
- CFI_REL_OFFSET rsi,RSI
- CFI_REL_OFFSET rdi,RDI
- CFI_REL_OFFSET rip,RIP
-/* CFI_REL_OFFSET cs,CS*/
-/* CFI_REL_OFFSET rflags,EFLAGS*/
- CFI_REL_OFFSET rsp,RSP
-/* CFI_REL_OFFSET ss,SS*/
- SAVE_EXTRA_REGS 8
- call *%rax
- RESTORE_EXTRA_REGS 8
- ret
- CFI_ENDPROC
-END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 959e45b81fe2..e51a8f803f55 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -35,12 +35,12 @@
#define smp_mb() mb()
#define smp_rmb() dma_rmb()
#define smp_wmb() barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
+#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0)
#else /* !SMP */
#define smp_mb() barrier()
#define smp_rmb() barrier()
#define smp_wmb() barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
#endif /* SMP */
#define read_barrier_depends() do { } while (0)
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 47c8e32f621a..b6f7457d12e4 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -8,7 +8,7 @@
/*
* The set_memory_* API can be used to change various attributes of a virtual
* address range. The attributes include:
- * Cachability : UnCached, WriteCombining, WriteBack
+ * Cachability : UnCached, WriteCombining, WriteThrough, WriteBack
* Executability : eXeutable, NoteXecutable
* Read/Write : ReadOnly, ReadWrite
* Presence : NotPresent
@@ -35,9 +35,11 @@
int _set_memory_uc(unsigned long addr, int numpages);
int _set_memory_wc(unsigned long addr, int numpages);
+int _set_memory_wt(unsigned long addr, int numpages);
int _set_memory_wb(unsigned long addr, int numpages);
int set_memory_uc(unsigned long addr, int numpages);
int set_memory_wc(unsigned long addr, int numpages);
+int set_memory_wt(unsigned long addr, int numpages);
int set_memory_wb(unsigned long addr, int numpages);
int set_memory_x(unsigned long addr, int numpages);
int set_memory_nx(unsigned long addr, int numpages);
@@ -48,10 +50,12 @@ int set_memory_4k(unsigned long addr, int numpages);
int set_memory_array_uc(unsigned long *addr, int addrinarray);
int set_memory_array_wc(unsigned long *addr, int addrinarray);
+int set_memory_array_wt(unsigned long *addr, int addrinarray);
int set_memory_array_wb(unsigned long *addr, int addrinarray);
int set_pages_array_uc(struct page **pages, int addrinarray);
int set_pages_array_wc(struct page **pages, int addrinarray);
+int set_pages_array_wt(struct page **pages, int addrinarray);
int set_pages_array_wb(struct page **pages, int addrinarray);
/*
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
index 99c105d78b7e..ad19841eddfe 100644
--- a/arch/x86/include/asm/cmpxchg.h
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -4,8 +4,6 @@
#include <linux/compiler.h>
#include <asm/alternative.h> /* Provides LOCK_PREFIX */
-#define __HAVE_ARCH_CMPXCHG 1
-
/*
* Non-existant functions to indicate usage errors at link time
* (or compile-time if the compiler implements __compiletime_error().
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
deleted file mode 100644
index de1cdaf4d743..000000000000
--- a/arch/x86/include/asm/dwarf2.h
+++ /dev/null
@@ -1,170 +0,0 @@
-#ifndef _ASM_X86_DWARF2_H
-#define _ASM_X86_DWARF2_H
-
-#ifndef __ASSEMBLY__
-#warning "asm/dwarf2.h should be only included in pure assembly files"
-#endif
-
-/*
- * Macros for dwarf2 CFI unwind table entries.
- * See "as.info" for details on these pseudo ops. Unfortunately
- * they are only supported in very new binutils, so define them
- * away for older version.
- */
-
-#ifdef CONFIG_AS_CFI
-
-#define CFI_STARTPROC .cfi_startproc
-#define CFI_ENDPROC .cfi_endproc
-#define CFI_DEF_CFA .cfi_def_cfa
-#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
-#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
-#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
-#define CFI_OFFSET .cfi_offset
-#define CFI_REL_OFFSET .cfi_rel_offset
-#define CFI_REGISTER .cfi_register
-#define CFI_RESTORE .cfi_restore
-#define CFI_REMEMBER_STATE .cfi_remember_state
-#define CFI_RESTORE_STATE .cfi_restore_state
-#define CFI_UNDEFINED .cfi_undefined
-#define CFI_ESCAPE .cfi_escape
-
-#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
-#define CFI_SIGNAL_FRAME .cfi_signal_frame
-#else
-#define CFI_SIGNAL_FRAME
-#endif
-
-#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__)
- /*
- * Emit CFI data in .debug_frame sections, not .eh_frame sections.
- * The latter we currently just discard since we don't do DWARF
- * unwinding at runtime. So only the offline DWARF information is
- * useful to anyone. Note we should not use this directive if this
- * file is used in the vDSO assembly, or if vmlinux.lds.S gets
- * changed so it doesn't discard .eh_frame.
- */
- .cfi_sections .debug_frame
-#endif
-
-#else
-
-/*
- * Due to the structure of pre-exisiting code, don't use assembler line
- * comment character # to ignore the arguments. Instead, use a dummy macro.
- */
-.macro cfi_ignore a=0, b=0, c=0, d=0
-.endm
-
-#define CFI_STARTPROC cfi_ignore
-#define CFI_ENDPROC cfi_ignore
-#define CFI_DEF_CFA cfi_ignore
-#define CFI_DEF_CFA_REGISTER cfi_ignore
-#define CFI_DEF_CFA_OFFSET cfi_ignore
-#define CFI_ADJUST_CFA_OFFSET cfi_ignore
-#define CFI_OFFSET cfi_ignore
-#define CFI_REL_OFFSET cfi_ignore
-#define CFI_REGISTER cfi_ignore
-#define CFI_RESTORE cfi_ignore
-#define CFI_REMEMBER_STATE cfi_ignore
-#define CFI_RESTORE_STATE cfi_ignore
-#define CFI_UNDEFINED cfi_ignore
-#define CFI_ESCAPE cfi_ignore
-#define CFI_SIGNAL_FRAME cfi_ignore
-
-#endif
-
-/*
- * An attempt to make CFI annotations more or less
- * correct and shorter. It is implied that you know
- * what you're doing if you use them.
- */
-#ifdef __ASSEMBLY__
-#ifdef CONFIG_X86_64
- .macro pushq_cfi reg
- pushq \reg
- CFI_ADJUST_CFA_OFFSET 8
- .endm
-
- .macro pushq_cfi_reg reg
- pushq %\reg
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET \reg, 0
- .endm
-
- .macro popq_cfi reg
- popq \reg
- CFI_ADJUST_CFA_OFFSET -8
- .endm
-
- .macro popq_cfi_reg reg
- popq %\reg
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE \reg
- .endm
-
- .macro pushfq_cfi
- pushfq
- CFI_ADJUST_CFA_OFFSET 8
- .endm
-
- .macro popfq_cfi
- popfq
- CFI_ADJUST_CFA_OFFSET -8
- .endm
-
- .macro movq_cfi reg offset=0
- movq %\reg, \offset(%rsp)
- CFI_REL_OFFSET \reg, \offset
- .endm
-
- .macro movq_cfi_restore offset reg
- movq \offset(%rsp), %\reg
- CFI_RESTORE \reg
- .endm
-#else /*!CONFIG_X86_64*/
- .macro pushl_cfi reg
- pushl \reg
- CFI_ADJUST_CFA_OFFSET 4
- .endm
-
- .macro pushl_cfi_reg reg
- pushl %\reg
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET \reg, 0
- .endm
-
- .macro popl_cfi reg
- popl \reg
- CFI_ADJUST_CFA_OFFSET -4
- .endm
-
- .macro popl_cfi_reg reg
- popl %\reg
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE \reg
- .endm
-
- .macro pushfl_cfi
- pushfl
- CFI_ADJUST_CFA_OFFSET 4
- .endm
-
- .macro popfl_cfi
- popfl
- CFI_ADJUST_CFA_OFFSET -4
- .endm
-
- .macro movl_cfi reg offset=0
- movl %\reg, \offset(%esp)
- CFI_REL_OFFSET \reg, \offset
- .endm
-
- .macro movl_cfi_restore offset reg
- movl \offset(%esp), %\reg
- CFI_RESTORE \reg
- .endm
-#endif /*!CONFIG_X86_64*/
-#endif /*__ASSEMBLY__*/
-
-#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 27ca0afcccd7..df002992d8fd 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -52,4 +52,7 @@ BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
BUILD_INTERRUPT(threshold_interrupt,THRESHOLD_APIC_VECTOR)
#endif
+#ifdef CONFIG_X86_MCE_AMD
+BUILD_INTERRUPT(deferred_error_interrupt, DEFERRED_ERROR_VECTOR)
+#endif
#endif
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 3b629f47eb65..793179cf8e21 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -1,20 +1,17 @@
#ifdef __ASSEMBLY__
#include <asm/asm.h>
-#include <asm/dwarf2.h>
/* The annotation hides the frame from the unwinder and makes it look
like a ordinary ebp save/restore. This avoids some special cases for
frame pointer later */
#ifdef CONFIG_FRAME_POINTER
.macro FRAME
- __ASM_SIZE(push,_cfi) %__ASM_REG(bp)
- CFI_REL_OFFSET __ASM_REG(bp), 0
+ __ASM_SIZE(push,) %__ASM_REG(bp)
__ASM_SIZE(mov) %__ASM_REG(sp), %__ASM_REG(bp)
.endm
.macro ENDFRAME
- __ASM_SIZE(pop,_cfi) %__ASM_REG(bp)
- CFI_RESTORE __ASM_REG(bp)
+ __ASM_SIZE(pop,) %__ASM_REG(bp)
.endm
#else
.macro FRAME
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 986606539395..7178043b0e1d 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -34,6 +34,9 @@ typedef struct {
#ifdef CONFIG_X86_MCE_THRESHOLD
unsigned int irq_threshold_count;
#endif
+#ifdef CONFIG_X86_MCE_AMD
+ unsigned int irq_deferred_error_count;
+#endif
#if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
unsigned int irq_hv_callback_count;
#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 10c80d4f8386..6615032e19c8 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -40,6 +40,7 @@ extern asmlinkage void reschedule_interrupt(void);
extern asmlinkage void irq_move_cleanup_interrupt(void);
extern asmlinkage void reboot_interrupt(void);
extern asmlinkage void threshold_interrupt(void);
+extern asmlinkage void deferred_error_interrupt(void);
extern asmlinkage void call_function_interrupt(void);
extern asmlinkage void call_function_single_interrupt(void);
@@ -54,6 +55,7 @@ extern void trace_spurious_interrupt(void);
extern void trace_thermal_interrupt(void);
extern void trace_reschedule_interrupt(void);
extern void trace_threshold_interrupt(void);
+extern void trace_deferred_error_interrupt(void);
extern void trace_call_function_interrupt(void);
extern void trace_call_function_single_interrupt(void);
#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 34a5b93704d3..83ec9b1d77cc 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -35,11 +35,13 @@
*/
#define ARCH_HAS_IOREMAP_WC
+#define ARCH_HAS_IOREMAP_WT
#include <linux/string.h>
#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/early_ioremap.h>
+#include <asm/pgtable_types.h>
#define build_mmio_read(name, size, type, reg, barrier) \
static inline type name(const volatile void __iomem *addr) \
@@ -177,6 +179,7 @@ static inline unsigned int isa_virt_to_bus(volatile void *address)
* look at pci_iomap().
*/
extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
unsigned long prot_val);
@@ -197,8 +200,6 @@ extern void set_iounmap_nonlazy(void);
#include <asm-generic/iomap.h>
-#include <linux/vmalloc.h>
-
/*
* Convert a virtual cached pointer to an uncached pointer
*/
@@ -320,6 +321,7 @@ extern void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr);
extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
enum page_cache_mode pcm);
extern void __iomem *ioremap_wc(resource_size_t offset, unsigned long size);
+extern void __iomem *ioremap_wt(resource_size_t offset, unsigned long size);
extern bool is_early_ioremap_ptep(pte_t *ptep);
@@ -338,6 +340,9 @@ extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
#define IO_SPACE_LIMIT 0xffff
#ifdef CONFIG_MTRR
+extern int __must_check arch_phys_wc_index(int handle);
+#define arch_phys_wc_index arch_phys_wc_index
+
extern int __must_check arch_phys_wc_add(unsigned long base,
unsigned long size);
extern void arch_phys_wc_del(int handle);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0ed29ac13a9d..4c2d2eb2060a 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -83,22 +83,23 @@
*/
#define X86_PLATFORM_IPI_VECTOR 0xf7
-/* Vector for KVM to deliver posted interrupt IPI */
-#ifdef CONFIG_HAVE_KVM
-#define POSTED_INTR_VECTOR 0xf2
#define POSTED_INTR_WAKEUP_VECTOR 0xf1
-#endif
-
/*
* IRQ work vector:
*/
#define IRQ_WORK_VECTOR 0xf6
#define UV_BAU_MESSAGE 0xf5
+#define DEFERRED_ERROR_VECTOR 0xf4
/* Vector on which hypervisor callbacks will be delivered */
#define HYPERVISOR_CALLBACK_VECTOR 0xf3
+/* Vector for KVM to deliver posted interrupt IPI */
+#ifdef CONFIG_HAVE_KVM
+#define POSTED_INTR_VECTOR 0xf2
+#endif
+
/*
* Local APIC timer IRQ vector is on a different priority level,
* to work around the 'lost local interrupt if more than 2 IRQ
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dea2e7e962e3..f4a555beef19 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -207,6 +207,7 @@ union kvm_mmu_page_role {
unsigned nxe:1;
unsigned cr0_wp:1;
unsigned smep_andnot_wp:1;
+ unsigned smap_andnot_wp:1;
};
};
@@ -400,6 +401,7 @@ struct kvm_vcpu_arch {
struct kvm_mmu_memory_cache mmu_page_header_cache;
struct fpu guest_fpu;
+ bool eager_fpu;
u64 xcr0;
u64 guest_supported_xcr0;
u32 guest_xstate_size;
@@ -743,6 +745,7 @@ struct kvm_x86_ops {
void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+ void (*fpu_activate)(struct kvm_vcpu *vcpu);
void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 1f5a86d518db..982dfc3679ad 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -17,11 +17,16 @@
#define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT)
#define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */
#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */
+#define MCG_LMCE_P (1ULL<<27) /* Local machine check supported */
/* MCG_STATUS register defines */
#define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */
#define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */
#define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */
+#define MCG_STATUS_LMCES (1ULL<<3) /* LMCE signaled */
+
+/* MCG_EXT_CTL register defines */
+#define MCG_EXT_CTL_LMCE_EN (1ULL<<0) /* Enable LMCE */
/* MCi_STATUS register defines */
#define MCI_STATUS_VAL (1ULL<<63) /* valid error */
@@ -104,6 +109,7 @@ struct mce_log {
struct mca_config {
bool dont_log_ce;
bool cmci_disabled;
+ bool lmce_disabled;
bool ignore_ce;
bool disabled;
bool ser;
@@ -117,8 +123,19 @@ struct mca_config {
};
struct mce_vendor_flags {
- __u64 overflow_recov : 1, /* cpuid_ebx(80000007) */
- __reserved_0 : 63;
+ /*
+ * overflow recovery cpuid bit indicates that overflow
+ * conditions are not fatal
+ */
+ __u64 overflow_recov : 1,
+
+ /*
+ * SUCCOR stands for S/W UnCorrectable error COntainment
+ * and Recovery. It indicates support for data poisoning
+ * in HW and deferred error interrupts.
+ */
+ succor : 1,
+ __reserved_0 : 62;
};
extern struct mce_vendor_flags mce_flags;
@@ -168,12 +185,16 @@ void cmci_clear(void);
void cmci_reenable(void);
void cmci_rediscover(void);
void cmci_recheck(void);
+void lmce_clear(void);
+void lmce_enable(void);
#else
static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
static inline void cmci_clear(void) {}
static inline void cmci_reenable(void) {}
static inline void cmci_rediscover(void) {}
static inline void cmci_recheck(void) {}
+static inline void lmce_clear(void) {}
+static inline void lmce_enable(void) {}
#endif
#ifdef CONFIG_X86_MCE_AMD
@@ -223,6 +244,9 @@ void do_machine_check(struct pt_regs *, long);
extern void (*mce_threshold_vector)(void);
extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
+/* Deferred error interrupt handler */
+extern void (*deferred_error_int_vector)(void);
+
/*
* Thermal handler
*/
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index c469490db4a8..9ebc3d009373 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -56,6 +56,7 @@
#define MSR_IA32_MCG_CAP 0x00000179
#define MSR_IA32_MCG_STATUS 0x0000017a
#define MSR_IA32_MCG_CTL 0x0000017b
+#define MSR_IA32_MCG_EXT_CTL 0x000004d0
#define MSR_OFFCORE_RSP_0 0x000001a6
#define MSR_OFFCORE_RSP_1 0x000001a7
@@ -140,6 +141,7 @@
#define MSR_CORE_C3_RESIDENCY 0x000003fc
#define MSR_CORE_C6_RESIDENCY 0x000003fd
#define MSR_CORE_C7_RESIDENCY 0x000003fe
+#define MSR_KNL_CORE_C6_RESIDENCY 0x000003ff
#define MSR_PKG_C2_RESIDENCY 0x0000060d
#define MSR_PKG_C8_RESIDENCY 0x00000630
#define MSR_PKG_C9_RESIDENCY 0x00000631
@@ -379,6 +381,7 @@
#define FEATURE_CONTROL_LOCKED (1<<0)
#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)
#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2)
+#define FEATURE_CONTROL_LMCE (1<<20)
#define MSR_IA32_APICBASE 0x0000001b
#define MSR_IA32_APICBASE_BSP (1<<8)
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index de36f22eb0b9..e6a707eb5081 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -1,13 +1,14 @@
#ifndef _ASM_X86_MSR_H
#define _ASM_X86_MSR_H
-#include <uapi/asm/msr.h>
+#include "msr-index.h"
#ifndef __ASSEMBLY__
#include <asm/asm.h>
#include <asm/errno.h>
#include <asm/cpumask.h>
+#include <uapi/asm/msr.h>
struct msr {
union {
@@ -205,8 +206,13 @@ do { \
#endif /* !CONFIG_PARAVIRT */
-#define wrmsrl_safe(msr, val) wrmsr_safe((msr), (u32)(val), \
- (u32)((val) >> 32))
+/*
+ * 64-bit version of wrmsr_safe():
+ */
+static inline int wrmsrl_safe(u32 msr, u64 val)
+{
+ return wrmsr_safe(msr, (u32)val, (u32)(val >> 32));
+}
#define write_tsc(low, high) wrmsr(MSR_IA32_TSC, (low), (high))
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index f768f6298419..b94f6f64e23d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -31,7 +31,7 @@
* arch_phys_wc_add and arch_phys_wc_del.
*/
# ifdef CONFIG_MTRR
-extern u8 mtrr_type_lookup(u64 addr, u64 end);
+extern u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform);
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);
extern int mtrr_add(unsigned long base, unsigned long size,
@@ -48,14 +48,13 @@ extern void mtrr_aps_init(void);
extern void mtrr_bp_restore(void);
extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
extern int amd_special_default_mtrr(void);
-extern int phys_wc_to_mtrr_index(int handle);
# else
-static inline u8 mtrr_type_lookup(u64 addr, u64 end)
+static inline u8 mtrr_type_lookup(u64 addr, u64 end, u8 *uniform)
{
/*
* Return no-MTRRs:
*/
- return 0xff;
+ return MTRR_TYPE_INVALID;
}
#define mtrr_save_fixed_ranges(arg) do {} while (0)
#define mtrr_save_state() do {} while (0)
@@ -84,10 +83,6 @@ static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
{
}
-static inline int phys_wc_to_mtrr_index(int handle)
-{
- return -1;
-}
#define mtrr_ap_init() do {} while (0)
#define mtrr_bp_init() do {} while (0)
@@ -127,4 +122,8 @@ struct mtrr_gentry32 {
_IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
#endif /* CONFIG_COMPAT */
+/* Bit fields for enabled in struct mtrr_state_type */
+#define MTRR_STATE_MTRR_FIXED_ENABLED 0x01
+#define MTRR_STATE_MTRR_ENABLED 0x02
+
#endif /* _ASM_X86_MTRR_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8957810ad7d1..d143bfad45d7 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock,
+ u32 val)
+{
+ PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val);
+}
+
+static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock)
+{
+ PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock);
+}
+
+static __always_inline void pv_wait(u8 *ptr, u8 val)
+{
+ PVOP_VCALL2(pv_lock_ops.wait, ptr, val);
+}
+
+static __always_inline void pv_kick(int cpu)
+{
+ PVOP_VCALL1(pv_lock_ops.kick, cpu);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock,
__ticket_t ticket)
{
@@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket);
}
-#endif
+#endif /* CONFIG_QUEUED_SPINLOCKS */
+
+#endif /* SMP && PARAVIRT_SPINLOCKS */
#ifdef CONFIG_X86_32
#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 344c646e7f06..a6b8f9fadb06 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -334,9 +334,19 @@ struct arch_spinlock;
typedef u16 __ticket_t;
#endif
+struct qspinlock;
+
struct pv_lock_ops {
+#ifdef CONFIG_QUEUED_SPINLOCKS
+ void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val);
+ struct paravirt_callee_save queued_spin_unlock;
+
+ void (*wait)(u8 *ptr, u8 val);
+ void (*kick)(int cpu);
+#else /* !CONFIG_QUEUED_SPINLOCKS */
struct paravirt_callee_save lock_spinning;
void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket);
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
};
/* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
index 91bc4ba95f91..ca6c228d5e62 100644
--- a/arch/x86/include/asm/pat.h
+++ b/arch/x86/include/asm/pat.h
@@ -4,14 +4,9 @@
#include <linux/types.h>
#include <asm/pgtable_types.h>
-#ifdef CONFIG_X86_PAT
-extern int pat_enabled;
-#else
-static const int pat_enabled;
-#endif
-
+bool pat_enabled(void);
extern void pat_init(void);
-void pat_init_cache_modes(void);
+void pat_init_cache_modes(u64);
extern int reserve_memtype(u64 start, u64 end,
enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index fe57e7a98839..2562e303405b 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -398,11 +398,17 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
* requested memtype:
* - request is uncached, return cannot be write-back
* - request is write-combine, return cannot be write-back
+ * - request is write-through, return cannot be write-back
+ * - request is write-through, return cannot be write-combine
*/
if ((pcm == _PAGE_CACHE_MODE_UC_MINUS &&
new_pcm == _PAGE_CACHE_MODE_WB) ||
(pcm == _PAGE_CACHE_MODE_WC &&
- new_pcm == _PAGE_CACHE_MODE_WB)) {
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WB) ||
+ (pcm == _PAGE_CACHE_MODE_WT &&
+ new_pcm == _PAGE_CACHE_MODE_WC)) {
return 0;
}
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 78f0c8cbe316..13f310bfc09a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -367,6 +367,9 @@ extern int nx_enabled;
#define pgprot_writecombine pgprot_writecombine
extern pgprot_t pgprot_writecombine(pgprot_t prot);
+#define pgprot_writethrough pgprot_writethrough
+extern pgprot_t pgprot_writethrough(pgprot_t prot);
+
/* Indicate that x86 has its own track and untrack pfn vma functions */
#define __HAVE_PFNMAP_TRACKING
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index a90f8972dad5..a4a77286cb1d 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,12 +5,14 @@
/* misc architecture specific prototypes */
-void system_call(void);
void syscall_init(void);
-void ia32_syscall(void);
-void ia32_cstar_target(void);
-void ia32_sysenter_target(void);
+void entry_SYSCALL_64(void);
+void entry_SYSCALL_compat(void);
+void entry_INT80_32(void);
+void entry_INT80_compat(void);
+void entry_SYSENTER_32(void);
+void entry_SYSENTER_compat(void);
void x86_configure_nx(void);
void x86_report_nx(void);
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
new file mode 100644
index 000000000000..9d51fae1cba3
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,57 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#include <asm/cpufeature.h>
+#include <asm-generic/qspinlock_types.h>
+#include <asm/paravirt.h>
+
+#define queued_spin_unlock queued_spin_unlock
+/**
+ * queued_spin_unlock - release a queued spinlock
+ * @lock : Pointer to queued spinlock structure
+ *
+ * A smp_store_release() on the least-significant byte.
+ */
+static inline void native_queued_spin_unlock(struct qspinlock *lock)
+{
+ smp_store_release((u8 *)lock, 0);
+}
+
+#ifdef CONFIG_PARAVIRT_SPINLOCKS
+extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __pv_init_lock_hash(void);
+extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock);
+
+static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
+{
+ pv_queued_spin_lock_slowpath(lock, val);
+}
+
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+ pv_queued_spin_unlock(lock);
+}
+#else
+static inline void queued_spin_unlock(struct qspinlock *lock)
+{
+ native_queued_spin_unlock(lock);
+}
+#endif
+
+#define virt_queued_spin_lock virt_queued_spin_lock
+
+static inline bool virt_queued_spin_lock(struct qspinlock *lock)
+{
+ if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0)
+ cpu_relax();
+
+ return true;
+}
+
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_X86_QSPINLOCK_H */
diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h
new file mode 100644
index 000000000000..b002e711ba88
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock_paravirt.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_QSPINLOCK_PARAVIRT_H
+#define __ASM_QSPINLOCK_PARAVIRT_H
+
+PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock);
+
+#endif
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 5a9856eb12ba..7d5a1929d76b 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -231,11 +231,21 @@
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8)
#ifdef __KERNEL__
+
+/*
+ * early_idt_handler_array is an array of entry points referenced in the
+ * early IDT. For simplicity, it's a real array with one entry point
+ * every nine bytes. That leaves room for an optional 'push $0' if the
+ * vector has no error code (two bytes), a 'push $vector_number' (two
+ * bytes), and a jump to the common entry code (up to five bytes).
+ */
+#define EARLY_IDT_HANDLER_SIZE 9
+
#ifndef __ASSEMBLY__
-extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
+extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE];
#ifdef CONFIG_TRACING
-# define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handler_array early_idt_handler_array
#endif
/*
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index aeb4666e0c0a..2270e41b32fd 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -215,6 +215,44 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
}
+/**
+ * pcommit_sfence() - persistent commit and fence
+ *
+ * The PCOMMIT instruction ensures that data that has been flushed from the
+ * processor's cache hierarchy with CLWB, CLFLUSHOPT or CLFLUSH is accepted to
+ * memory and is durable on the DIMM. The primary use case for this is
+ * persistent memory.
+ *
+ * This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT
+ * with appropriate fencing.
+ *
+ * Example:
+ * void flush_and_commit_buffer(void *vaddr, unsigned int size)
+ * {
+ * unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ * void *vend = vaddr + size;
+ * void *p;
+ *
+ * for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ * p < vend; p += boot_cpu_data.x86_clflush_size)
+ * clwb(p);
+ *
+ * // SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes
+ * // MFENCE via mb() also works
+ * wmb();
+ *
+ * // PCOMMIT and the required SFENCE for ordering
+ * pcommit_sfence();
+ * }
+ *
+ * After this function completes the data pointed to by 'vaddr' has been
+ * accepted to memory and will be durable if the 'vaddr' points to persistent
+ * memory.
+ *
+ * PCOMMIT must always be ordered by an MFENCE or SFENCE, so to help simplify
+ * things we include both the PCOMMIT and the required SFENCE in the
+ * alternatives generated by pcommit_sfence().
+ */
static inline void pcommit_sfence(void)
{
alternative(ASM_NOP7,
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 64b611782ef0..be0a05913b91 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -42,6 +42,10 @@
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm/qspinlock.h>
+#else
+
#ifdef CONFIG_PARAVIRT_SPINLOCKS
static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
cpu_relax();
}
}
+#endif /* CONFIG_QUEUED_SPINLOCKS */
/*
* Read-write spinlocks, allowing multiple readers
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 5f9d7572d82b..65c3e37f879a 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -23,6 +23,9 @@ typedef u32 __ticketpair_t;
#define TICKET_SHIFT (sizeof(__ticket_t) * 8)
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
typedef struct arch_spinlock {
union {
__ticketpair_t head_tail;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
#include <asm-generic/qrwlock_types.h>
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 0e8f04f2c26f..8d717faeed22 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -26,7 +26,7 @@
#define _ASM_X86_TOPOLOGY_H
#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
# define ENABLE_TOPO_DEFINES
# endif
#else
diff --git a/arch/x86/include/asm/trace/irq_vectors.h b/arch/x86/include/asm/trace/irq_vectors.h
index 4cab890007a7..38a09a13a9bc 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -101,6 +101,12 @@ DEFINE_IRQ_VECTOR_EVENT(call_function_single);
DEFINE_IRQ_VECTOR_EVENT(threshold_apic);
/*
+ * deferred_error_apic - called when entering/exiting a deferred apic interrupt
+ * vector handler
+ */
+DEFINE_IRQ_VECTOR_EVENT(deferred_error_apic);
+
+/*
* thermal_apic - called when entering/exiting a thermal apic interrupt
* vector handler
*/
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 4e49d7dff78e..c5380bea2a36 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -108,7 +108,8 @@ extern int panic_on_unrecovered_nmi;
void math_emulate(struct math_emu_info *);
#ifndef CONFIG_X86_32
asmlinkage void smp_thermal_interrupt(void);
-asmlinkage void mce_threshold_interrupt(void);
+asmlinkage void smp_threshold_interrupt(void);
+asmlinkage void smp_deferred_error_interrupt(void);
#endif
extern enum ctx_state ist_enter(struct pt_regs *regs);
diff --git a/arch/x86/include/uapi/asm/msr.h b/arch/x86/include/uapi/asm/msr.h
index 155e51048fa4..c41f4fe25483 100644
--- a/arch/x86/include/uapi/asm/msr.h
+++ b/arch/x86/include/uapi/asm/msr.h
@@ -1,8 +1,6 @@
#ifndef _UAPI_ASM_X86_MSR_H
#define _UAPI_ASM_X86_MSR_H
-#include <asm/msr-index.h>
-
#ifndef __ASSEMBLY__
#include <linux/types.h>
diff --git a/arch/x86/include/uapi/asm/mtrr.h b/arch/x86/include/uapi/asm/mtrr.h
index d0acb658c8f4..7528dcf59691 100644
--- a/arch/x86/include/uapi/asm/mtrr.h
+++ b/arch/x86/include/uapi/asm/mtrr.h
@@ -103,7 +103,7 @@ struct mtrr_state_type {
#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
-/* These are the region types */
+/* MTRR memory types, which are defined in SDM */
#define MTRR_TYPE_UNCACHABLE 0
#define MTRR_TYPE_WRCOMB 1
/*#define MTRR_TYPE_ 2*/
@@ -113,5 +113,11 @@ struct mtrr_state_type {
#define MTRR_TYPE_WRBACK 6
#define MTRR_NUM_TYPES 7
+/*
+ * Invalid MTRR memory type. mtrr_type_lookup() returns this value when
+ * MTRRs are disabled. Note, this value is allocated from the reserved
+ * values (0x7-0xff) of the MTRR memory types.
+ */
+#define MTRR_TYPE_INVALID 0xff
#endif /* _UAPI_ASM_X86_MTRR_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 9bcd0b56ca17..01663ee5f1b7 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,7 +22,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
CFLAGS_irq.o := -I$(src)/../include/asm/trace
-obj-y := process_$(BITS).o signal.o entry_$(BITS).o
+obj-y := process_$(BITS).o signal.o
obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o
obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
@@ -31,9 +31,6 @@ obj-y += probe_roms.o
obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
-obj-y += syscall_$(BITS).o vsyscall_gtod.o
-obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
-obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-y += bootflag.o e820.o
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index dcaab87da629..d8f42f902a0f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -66,7 +66,7 @@ int main(void)
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
DEFINE(NR_syscalls, sizeof(syscalls_64));
- DEFINE(__NR_ia32_syscall_max, sizeof(syscalls_ia32) - 1);
+ DEFINE(__NR_syscall_compat_max, sizeof(syscalls_ia32) - 1);
DEFINE(IA32_NR_syscalls, sizeof(syscalls_ia32));
return 0;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e4cf63301ff4..eb4f01269b5d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -288,7 +288,7 @@ static int nearby_node(int apicid)
* Assumption: Number of cores in each internal node is the same.
* (2) AMD processors supporting compute units
*/
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
static void amd_get_topology(struct cpuinfo_x86 *c)
{
u32 nodes, cores_per_cu = 1;
@@ -341,7 +341,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c)
*/
static void amd_detect_cmp(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned bits;
int cpu = smp_processor_id();
@@ -420,7 +420,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
static void early_init_amd_mc(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned bits, ecx;
/* Multi core CPU? */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 6bec0b55863e..cc7f753e571d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -508,7 +508,7 @@ static void cpu_detect_tlb(struct cpuinfo_x86 *c)
void detect_ht(struct cpuinfo_x86 *c)
{
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
int index_msb, core_bits;
static bool printed;
@@ -844,7 +844,7 @@ static void generic_identify(struct cpuinfo_x86 *c)
if (c->cpuid_level >= 0x00000001) {
c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
#ifdef CONFIG_X86_32
-# ifdef CONFIG_X86_HT
+# ifdef CONFIG_SMP
c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
# else
c->apicid = c->initial_apicid;
@@ -1026,7 +1026,7 @@ void enable_sep_cpu(void)
(unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);
out:
put_cpu();
@@ -1204,10 +1204,10 @@ void syscall_init(void)
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
+ wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1216,7 +1216,7 @@ void syscall_init(void)
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index edcb0e28c336..be4febc58b94 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -654,7 +654,7 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
unsigned int cpu = c->cpu_index;
#endif
@@ -773,19 +773,19 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l2) {
l2 = new_l2;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l2_id;
#endif
}
if (new_l3) {
l3 = new_l3;
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
per_cpu(cpu_llc_id, cpu) = l3_id;
#endif
}
-#ifdef CONFIG_X86_HT
+#ifdef CONFIG_SMP
/*
* If cpu_llc_id is not yet set, this means cpuid_level < 4 which in
* turns means that the only possibility is SMT (as indicated in
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index e535533d5ab8..5b974c97e31e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -708,6 +708,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
struct pt_regs *regs)
{
int i, ret = 0;
+ char *tmp;
for (i = 0; i < mca_cfg.banks; i++) {
m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i));
@@ -716,9 +717,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp,
if (quirk_no_way_out)
quirk_no_way_out(i, m, regs);
}
- if (mce_severity(m, mca_cfg.tolerant, msg, true) >=
- MCE_PANIC_SEVERITY)
+
+ if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) {
+ *msg = tmp;
ret = 1;
+ }
}
return ret;
}
@@ -1047,6 +1050,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
char *msg = "Unknown";
u64 recover_paddr = ~0ull;
int flags = MF_ACTION_REQUIRED;
+ int lmce = 0;
prev_state = ist_enter(regs);
@@ -1074,11 +1078,20 @@ void do_machine_check(struct pt_regs *regs, long error_code)
kill_it = 1;
/*
- * Go through all the banks in exclusion of the other CPUs.
- * This way we don't report duplicated events on shared banks
- * because the first one to see it will clear it.
+ * Check if this MCE is signaled to only this logical processor
*/
- order = mce_start(&no_way_out);
+ if (m.mcgstatus & MCG_STATUS_LMCES)
+ lmce = 1;
+ else {
+ /*
+ * Go through all the banks in exclusion of the other CPUs.
+ * This way we don't report duplicated events on shared banks
+ * because the first one to see it will clear it.
+ * If this is a Local MCE, then no need to perform rendezvous.
+ */
+ order = mce_start(&no_way_out);
+ }
+
for (i = 0; i < cfg->banks; i++) {
__clear_bit(i, toclear);
if (!test_bit(i, valid_banks))
@@ -1155,8 +1168,18 @@ void do_machine_check(struct pt_regs *regs, long error_code)
* Do most of the synchronization with other CPUs.
* When there's any problem use only local no_way_out state.
*/
- if (mce_end(order) < 0)
- no_way_out = worst >= MCE_PANIC_SEVERITY;
+ if (!lmce) {
+ if (mce_end(order) < 0)
+ no_way_out = worst >= MCE_PANIC_SEVERITY;
+ } else {
+ /*
+ * Local MCE skipped calling mce_reign()
+ * If we found a fatal error, we need to panic here.
+ */
+ if (worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
+ mce_panic("Machine check from unknown source",
+ NULL, NULL);
+ }
/*
* At insane "tolerant" levels we take no action. Otherwise
@@ -1637,10 +1660,16 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
mce_intel_feature_init(c);
mce_adjust_timer = cmci_intel_adjust_timer;
break;
- case X86_VENDOR_AMD:
+
+ case X86_VENDOR_AMD: {
+ u32 ebx = cpuid_ebx(0x80000007);
+
mce_amd_feature_init(c);
- mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
+ mce_flags.overflow_recov = !!(ebx & BIT(0));
+ mce_flags.succor = !!(ebx & BIT(1));
break;
+ }
+
default:
break;
}
@@ -1976,6 +2005,7 @@ void mce_disable_bank(int bank)
/*
* mce=off Disables machine check
* mce=no_cmci Disables CMCI
+ * mce=no_lmce Disables LMCE
* mce=dont_log_ce Clears corrected events silently, no log created for CEs.
* mce=ignore_ce Disables polling and CMCI, corrected events are not cleared.
* mce=TOLERANCELEVEL[,monarchtimeout] (number, see above)
@@ -1999,6 +2029,8 @@ static int __init mcheck_enable(char *str)
cfg->disabled = true;
else if (!strcmp(str, "no_cmci"))
cfg->cmci_disabled = true;
+ else if (!strcmp(str, "no_lmce"))
+ cfg->lmce_disabled = true;
else if (!strcmp(str, "dont_log_ce"))
cfg->dont_log_ce = true;
else if (!strcmp(str, "ignore_ce"))
@@ -2008,11 +2040,8 @@ static int __init mcheck_enable(char *str)
else if (!strcmp(str, "bios_cmci_threshold"))
cfg->bios_cmci_threshold = true;
else if (isdigit(str[0])) {
- get_option(&str, &(cfg->tolerant));
- if (*str == ',') {
- ++str;
+ if (get_option(&str, &cfg->tolerant) == 2)
get_option(&str, &(cfg->monarch_timeout));
- }
} else {
pr_info("mce argument %s ignored. Please use /sys\n", str);
return 0;
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 55ad9b37cae8..e99b15077e94 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,19 +1,13 @@
/*
- * (c) 2005-2012 Advanced Micro Devices, Inc.
+ * (c) 2005-2015 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
*
* Written by Jacob Shin - AMD, Inc.
- *
* Maintained by: Borislav Petkov <bp@alien8.de>
*
- * April 2006
- * - added support for AMD Family 0x10 processors
- * May 2012
- * - major scrubbing
- *
- * All MC4_MISCi registers are shared between multi-cores
+ * All MC4_MISCi registers are shared between cores on a node.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
@@ -32,6 +26,7 @@
#include <asm/idle.h>
#include <asm/mce.h>
#include <asm/msr.h>
+#include <asm/trace/irq_vectors.h>
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
@@ -47,6 +42,13 @@
#define MASK_BLKPTR_LO 0xFF000000
#define MCG_XBLK_ADDR 0xC0000400
+/* Deferred error settings */
+#define MSR_CU_DEF_ERR 0xC0000410
+#define MASK_DEF_LVTOFF 0x000000F0
+#define MASK_DEF_INT_TYPE 0x00000006
+#define DEF_LVT_OFF 0x2
+#define DEF_INT_TYPE_APIC 0x2
+
static const char * const th_names[] = {
"load_store",
"insn_fetch",
@@ -60,6 +62,13 @@ static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */
static void amd_threshold_interrupt(void);
+static void amd_deferred_error_interrupt(void);
+
+static void default_deferred_error_interrupt(void)
+{
+ pr_err("Unexpected deferred interrupt at vector %x\n", DEFERRED_ERROR_VECTOR);
+}
+void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
/*
* CPU Initialization
@@ -196,7 +205,7 @@ static void mce_threshold_block_init(struct threshold_block *b, int offset)
threshold_restart_bank(&tr);
};
-static int setup_APIC_mce(int reserved, int new)
+static int setup_APIC_mce_threshold(int reserved, int new)
{
if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
APIC_EILVT_MSG_FIX, 0))
@@ -205,6 +214,39 @@ static int setup_APIC_mce(int reserved, int new)
return reserved;
}
+static int setup_APIC_deferred_error(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, DEFERRED_ERROR_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
+}
+
+static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
+{
+ u32 low = 0, high = 0;
+ int def_offset = -1, def_new;
+
+ if (rdmsr_safe(MSR_CU_DEF_ERR, &low, &high))
+ return;
+
+ def_new = (low & MASK_DEF_LVTOFF) >> 4;
+ if (!(low & MASK_DEF_LVTOFF)) {
+ pr_err(FW_BUG "Your BIOS is not setting up LVT offset 0x2 for deferred error IRQs correctly.\n");
+ def_new = DEF_LVT_OFF;
+ low = (low & ~MASK_DEF_LVTOFF) | (DEF_LVT_OFF << 4);
+ }
+
+ def_offset = setup_APIC_deferred_error(def_offset, def_new);
+ if ((def_offset == def_new) &&
+ (deferred_error_int_vector != amd_deferred_error_interrupt))
+ deferred_error_int_vector = amd_deferred_error_interrupt;
+
+ low = (low & ~MASK_DEF_INT_TYPE) | DEF_INT_TYPE_APIC;
+ wrmsr(MSR_CU_DEF_ERR, low, high);
+}
+
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
@@ -252,7 +294,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
- offset = setup_APIC_mce(offset, new);
+ offset = setup_APIC_mce_threshold(offset, new);
if ((offset == new) &&
(mce_threshold_vector != amd_threshold_interrupt))
@@ -262,6 +304,73 @@ init:
mce_threshold_block_init(&b, offset);
}
}
+
+ if (mce_flags.succor)
+ deferred_error_interrupt_enable(c);
+}
+
+static void __log_error(unsigned int bank, bool threshold_err, u64 misc)
+{
+ struct mce m;
+ u64 status;
+
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+ if (!(status & MCI_STATUS_VAL))
+ return;
+
+ mce_setup(&m);
+
+ m.status = status;
+ m.bank = bank;
+
+ if (threshold_err)
+ m.misc = misc;
+
+ if (m.status & MCI_STATUS_ADDRV)
+ rdmsrl(MSR_IA32_MCx_ADDR(bank), m.addr);
+
+ mce_log(&m);
+ wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+}
+
+static inline void __smp_deferred_error_interrupt(void)
+{
+ inc_irq_stat(irq_deferred_error_count);
+ deferred_error_int_vector();
+}
+
+asmlinkage __visible void smp_deferred_error_interrupt(void)
+{
+ entering_irq();
+ __smp_deferred_error_interrupt();
+ exiting_ack_irq();
+}
+
+asmlinkage __visible void smp_trace_deferred_error_interrupt(void)
+{
+ entering_irq();
+ trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
+ __smp_deferred_error_interrupt();
+ trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
+ exiting_ack_irq();
+}
+
+/* APIC interrupt handler for deferred errors */
+static void amd_deferred_error_interrupt(void)
+{
+ u64 status;
+ unsigned int bank;
+
+ for (bank = 0; bank < mca_cfg.banks; ++bank) {
+ rdmsrl(MSR_IA32_MCx_STATUS(bank), status);
+
+ if (!(status & MCI_STATUS_VAL) ||
+ !(status & MCI_STATUS_DEFERRED))
+ continue;
+
+ __log_error(bank, false, 0);
+ break;
+ }
}
/*
@@ -273,12 +382,12 @@ init:
* the interrupt goes off when error_count reaches threshold_limit.
* the handler will simply log mcelog w/ software defined bank number.
*/
+
static void amd_threshold_interrupt(void)
{
u32 low = 0, high = 0, address = 0;
int cpu = smp_processor_id();
unsigned int bank, block;
- struct mce m;
/* assume first bank caused it */
for (bank = 0; bank < mca_cfg.banks; ++bank) {
@@ -321,15 +430,7 @@ static void amd_threshold_interrupt(void)
return;
log:
- mce_setup(&m);
- rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
- if (!(m.status & MCI_STATUS_VAL))
- return;
- m.misc = ((u64)high << 32) | low;
- m.bank = bank;
- mce_log(&m);
-
- wrmsrl(MSR_IA32_MCx_STATUS(bank), 0);
+ __log_error(bank, true, ((u64)high << 32) | low);
}
/*
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b4a41cf030ed..844f56c5616d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -91,6 +91,36 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+static bool lmce_supported(void)
+{
+ u64 tmp;
+
+ if (mca_cfg.lmce_disabled)
+ return false;
+
+ rdmsrl(MSR_IA32_MCG_CAP, tmp);
+
+ /*
+ * LMCE depends on recovery support in the processor. Hence both
+ * MCG_SER_P and MCG_LMCE_P should be present in MCG_CAP.
+ */
+ if ((tmp & (MCG_SER_P | MCG_LMCE_P)) !=
+ (MCG_SER_P | MCG_LMCE_P))
+ return false;
+
+ /*
+ * BIOS should indicate support for LMCE by setting bit 20 in
+ * IA32_FEATURE_CONTROL without which touching MCG_EXT_CTL will
+ * generate a #GP fault.
+ */
+ rdmsrl(MSR_IA32_FEATURE_CONTROL, tmp);
+ if ((tmp & (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE)) ==
+ (FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_LMCE))
+ return true;
+
+ return false;
+}
+
bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
@@ -405,8 +435,22 @@ static void intel_init_cmci(void)
cmci_recheck();
}
+void intel_init_lmce(void)
+{
+ u64 val;
+
+ if (!lmce_supported())
+ return;
+
+ rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+
+ if (!(val & MCG_EXT_CTL_LMCE_EN))
+ wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
+}
+
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
+ intel_init_lmce();
}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 5f90b85ff22e..70d7c93f4550 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -98,7 +98,8 @@ x86_get_mtrr_mem_range(struct range *range, int nr_range,
continue;
base = range_state[i].base_pfn;
if (base < (1<<(20-PAGE_SHIFT)) && mtrr_state.have_fixed &&
- (mtrr_state.enabled & 1)) {
+ (mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
/* Var MTRR contains UC entry below 1M? Skip it: */
printk(BIOS_BUG_MSG, i);
if (base + size <= (1<<(20-PAGE_SHIFT)))
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d74f7b3c6ba..3b533cf37c74 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -102,59 +102,76 @@ static int check_type_overlap(u8 *prev, u8 *curr)
return 0;
}
-/*
- * Error/Semi-error returns:
- * 0xFF - when MTRR is not enabled
- * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
- * corresponds only to [start:*partial_end].
- * Caller has to lookup again for [*partial_end:end].
+/**
+ * mtrr_type_lookup_fixed - look up memory type in MTRR fixed entries
+ *
+ * Return the MTRR fixed memory type of 'start'.
+ *
+ * MTRR fixed entries are divided into the following ways:
+ * 0x00000 - 0x7FFFF : This range is divided into eight 64KB sub-ranges
+ * 0x80000 - 0xBFFFF : This range is divided into sixteen 16KB sub-ranges
+ * 0xC0000 - 0xFFFFF : This range is divided into sixty-four 4KB sub-ranges
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - Matched memory type
+ * MTRR_TYPE_INVALID - Unmatched
+ */
+static u8 mtrr_type_lookup_fixed(u64 start, u64 end)
+{
+ int idx;
+
+ if (start >= 0x100000)
+ return MTRR_TYPE_INVALID;
+
+ /* 0x0 - 0x7FFFF */
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state.fixed_ranges[idx];
+ /* 0x80000 - 0xBFFFF */
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state.fixed_ranges[idx];
+ }
+
+ /* 0xC0000 - 0xFFFFF */
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state.fixed_ranges[idx];
+}
+
+/**
+ * mtrr_type_lookup_variable - look up memory type in MTRR variable entries
+ *
+ * Return Value:
+ * MTRR_TYPE_(type) - Matched memory type or default memory type (unmatched)
+ *
+ * Output Arguments:
+ * repeat - Set to 1 when [start:end] spanned across MTRR range and type
+ * returned corresponds only to [start:*partial_end]. Caller has
+ * to lookup again for [*partial_end:end].
+ *
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
+static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
+ int *repeat, u8 *uniform)
{
int i;
u64 base, mask;
u8 prev_match, curr_match;
*repeat = 0;
- if (!mtrr_state_set)
- return 0xFF;
-
- if (!mtrr_state.enabled)
- return 0xFF;
+ *uniform = 1;
- /* Make end inclusive end, instead of exclusive */
+ /* Make end inclusive instead of exclusive */
end--;
- /* Look in fixed ranges. Just return the type as per start */
- if (mtrr_state.have_fixed && (start < 0x100000)) {
- int idx;
-
- if (start < 0x80000) {
- idx = 0;
- idx += (start >> 16);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0xC0000) {
- idx = 1 * 8;
- idx += ((start - 0x80000) >> 14);
- return mtrr_state.fixed_ranges[idx];
- } else if (start < 0x1000000) {
- idx = 3 * 8;
- idx += ((start - 0xC0000) >> 12);
- return mtrr_state.fixed_ranges[idx];
- }
- }
-
- /*
- * Look in variable ranges
- * Look of multiple ranges matching this address and pick type
- * as per MTRR precedence
- */
- if (!(mtrr_state.enabled & 2))
- return mtrr_state.def_type;
-
- prev_match = 0xFF;
+ prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
- unsigned short start_state, end_state;
+ unsigned short start_state, end_state, inclusive;
if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
continue;
@@ -166,20 +183,29 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
start_state = ((start & mask) == (base & mask));
end_state = ((end & mask) == (base & mask));
+ inclusive = ((start < base) && (end > base));
- if (start_state != end_state) {
+ if ((start_state != end_state) || inclusive) {
/*
* We have start:end spanning across an MTRR.
- * We split the region into
- * either
- * (start:mtrr_end) (mtrr_end:end)
- * or
- * (start:mtrr_start) (mtrr_start:end)
+ * We split the region into either
+ *
+ * - start_state:1
+ * (start:mtrr_end)(mtrr_end:end)
+ * - end_state:1
+ * (start:mtrr_start)(mtrr_start:end)
+ * - inclusive:1
+ * (start:mtrr_start)(mtrr_start:mtrr_end)(mtrr_end:end)
+ *
* depending on kind of overlap.
- * Return the type for first region and a pointer to
- * the start of second region so that caller will
- * lookup again on the second region.
- * Note: This way we handle multiple overlaps as well.
+ *
+ * Return the type of the first region and a pointer
+ * to the start of next region so that caller will be
+ * advised to lookup again after having adjusted start
+ * and end.
+ *
+ * Note: This way we handle overlaps with multiple
+ * entries and the default type properly.
*/
if (start_state)
*partial_end = base + get_mtrr_size(mask);
@@ -193,59 +219,94 @@ static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
end = *partial_end - 1; /* end is inclusive */
*repeat = 1;
+ *uniform = 0;
}
if ((start & mask) != (base & mask))
continue;
curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
- if (prev_match == 0xFF) {
+ if (prev_match == MTRR_TYPE_INVALID) {
prev_match = curr_match;
continue;
}
+ *uniform = 0;
if (check_type_overlap(&prev_match, &curr_match))
return curr_match;
}
- if (mtrr_tom2) {
- if (start >= (1ULL<<32) && (end < mtrr_tom2))
- return MTRR_TYPE_WRBACK;
- }
-
- if (prev_match != 0xFF)
+ if (prev_match != MTRR_TYPE_INVALID)
return prev_match;
return mtrr_state.def_type;
}
-/*
- * Returns the effective MTRR type for the region
- * Error return:
- * 0xFF - when MTRR is not enabled
+/**
+ * mtrr_type_lookup - look up memory type in MTRR
+ *
+ * Return Values:
+ * MTRR_TYPE_(type) - The effective MTRR type for the region
+ * MTRR_TYPE_INVALID - MTRR is disabled
+ *
+ * Output Argument:
+ * uniform - Set to 1 when an MTRR covers the region uniformly, i.e. the
+ * region is fully covered by a single MTRR entry or the default
+ * type.
*/
-u8 mtrr_type_lookup(u64 start, u64 end)
+u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
{
- u8 type, prev_type;
+ u8 type, prev_type, is_uniform = 1, dummy;
int repeat;
u64 partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ if (!mtrr_state_set)
+ return MTRR_TYPE_INVALID;
+
+ if (!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED))
+ return MTRR_TYPE_INVALID;
+
+ /*
+ * Look up the fixed ranges first, which take priority over
+ * the variable ranges.
+ */
+ if ((start < 0x100000) &&
+ (mtrr_state.have_fixed) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) {
+ is_uniform = 0;
+ type = mtrr_type_lookup_fixed(start, end);
+ goto out;
+ }
+
+ /*
+ * Look up the variable ranges. Look of multiple ranges matching
+ * this address and pick type as per MTRR precedence.
+ */
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &is_uniform);
/*
* Common path is with repeat = 0.
* However, we can have cases where [start:end] spans across some
- * MTRR range. Do repeated lookups for that case here.
+ * MTRR ranges and/or the default type. Do repeated lookups for
+ * that case here.
*/
while (repeat) {
prev_type = type;
start = partial_end;
- type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
+ is_uniform = 0;
+ type = mtrr_type_lookup_variable(start, end, &partial_end,
+ &repeat, &dummy);
if (check_type_overlap(&prev_type, &type))
- return type;
+ goto out;
}
+ if (mtrr_tom2 && (start >= (1ULL<<32)) && (end < mtrr_tom2))
+ type = MTRR_TYPE_WRBACK;
+
+out:
+ *uniform = is_uniform;
return type;
}
@@ -347,7 +408,9 @@ static void __init print_mtrr_state(void)
mtrr_attrib_to_str(mtrr_state.def_type));
if (mtrr_state.have_fixed) {
pr_debug("MTRR fixed ranges %sabled:\n",
- mtrr_state.enabled & 1 ? "en" : "dis");
+ ((mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED) &&
+ (mtrr_state.enabled & MTRR_STATE_MTRR_FIXED_ENABLED)) ?
+ "en" : "dis");
print_fixed(0x00000, 0x10000, mtrr_state.fixed_ranges + 0);
for (i = 0; i < 2; ++i)
print_fixed(0x80000 + i * 0x20000, 0x04000,
@@ -360,7 +423,7 @@ static void __init print_mtrr_state(void)
print_fixed_last();
}
pr_debug("MTRR variable ranges %sabled:\n",
- mtrr_state.enabled & 2 ? "en" : "dis");
+ mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED ? "en" : "dis");
high_width = (__ffs64(size_or_mask) - (32 - PAGE_SHIFT) + 3) / 4;
for (i = 0; i < num_var_ranges; ++i) {
@@ -382,7 +445,7 @@ static void __init print_mtrr_state(void)
}
/* Grab all of the MTRR state for this CPU into *state */
-void __init get_mtrr_state(void)
+bool __init get_mtrr_state(void)
{
struct mtrr_var_range *vrs;
unsigned long flags;
@@ -426,6 +489,8 @@ void __init get_mtrr_state(void)
post_set();
local_irq_restore(flags);
+
+ return !!(mtrr_state.enabled & MTRR_STATE_MTRR_ENABLED);
}
/* Some BIOS's are messed up and don't set all MTRRs the same! */
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index ea5f363a1948..e7ed0d8ebacb 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -59,6 +59,12 @@
#define MTRR_TO_PHYS_WC_OFFSET 1000
u32 num_var_ranges;
+static bool __mtrr_enabled;
+
+static bool mtrr_enabled(void)
+{
+ return __mtrr_enabled;
+}
unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
static DEFINE_MUTEX(mtrr_mutex);
@@ -286,7 +292,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
int i, replace, error;
mtrr_type ltype;
- if (!mtrr_if)
+ if (!mtrr_enabled())
return -ENXIO;
error = mtrr_if->validate_add_page(base, size, type);
@@ -435,6 +441,8 @@ static int mtrr_check(unsigned long base, unsigned long size)
int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
bool increment)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -463,8 +471,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
unsigned long lbase, lsize;
int error = -EINVAL;
- if (!mtrr_if)
- return -ENXIO;
+ if (!mtrr_enabled())
+ return -ENODEV;
max = num_var_ranges;
/* No CPU hotplug when we change MTRR entries */
@@ -523,6 +531,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
*/
int mtrr_del(int reg, unsigned long base, unsigned long size)
{
+ if (!mtrr_enabled())
+ return -ENODEV;
if (mtrr_check(base, size))
return -EINVAL;
return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -538,6 +548,9 @@ EXPORT_SYMBOL(mtrr_del);
* attempts to add a WC MTRR covering size bytes starting at base and
* logs an error if this fails.
*
+ * The called should provide a power of two size on an equivalent
+ * power of two boundary.
+ *
* Drivers must store the return value to pass to mtrr_del_wc_if_needed,
* but drivers should not try to interpret that return value.
*/
@@ -545,7 +558,7 @@ int arch_phys_wc_add(unsigned long base, unsigned long size)
{
int ret;
- if (pat_enabled)
+ if (pat_enabled() || !mtrr_enabled())
return 0; /* Success! (We don't need to do anything.) */
ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true);
@@ -577,7 +590,7 @@ void arch_phys_wc_del(int handle)
EXPORT_SYMBOL(arch_phys_wc_del);
/*
- * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value
+ * arch_phys_wc_index - translates arch_phys_wc_add's return value
* @handle: Return value from arch_phys_wc_add
*
* This will turn the return value from arch_phys_wc_add into an mtrr
@@ -587,14 +600,14 @@ EXPORT_SYMBOL(arch_phys_wc_del);
* in printk line. Alas there is an illegitimate use in some ancient
* drm ioctls.
*/
-int phys_wc_to_mtrr_index(int handle)
+int arch_phys_wc_index(int handle)
{
if (handle < MTRR_TO_PHYS_WC_OFFSET)
return -1;
else
return handle - MTRR_TO_PHYS_WC_OFFSET;
}
-EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index);
+EXPORT_SYMBOL_GPL(arch_phys_wc_index);
/*
* HACK ALERT!
@@ -734,10 +747,12 @@ void __init mtrr_bp_init(void)
}
if (mtrr_if) {
+ __mtrr_enabled = true;
set_num_var_ranges();
init_table();
if (use_intel()) {
- get_mtrr_state();
+ /* BIOS may override */
+ __mtrr_enabled = get_mtrr_state();
if (mtrr_cleanup(phys_addr)) {
changed_by_mtrr_cleanup = 1;
@@ -745,10 +760,16 @@ void __init mtrr_bp_init(void)
}
}
}
+
+ if (!mtrr_enabled())
+ pr_info("MTRR: Disabled\n");
}
void mtrr_ap_init(void)
{
+ if (!mtrr_enabled())
+ return;
+
if (!use_intel() || mtrr_aps_delayed_init)
return;
/*
@@ -774,6 +795,9 @@ void mtrr_save_state(void)
{
int first_cpu;
+ if (!mtrr_enabled())
+ return;
+
get_online_cpus();
first_cpu = cpumask_first(cpu_online_mask);
smp_call_function_single(first_cpu, mtrr_save_fixed_ranges, NULL, 1);
@@ -782,6 +806,8 @@ void mtrr_save_state(void)
void set_mtrr_aps_delayed_init(void)
{
+ if (!mtrr_enabled())
+ return;
if (!use_intel())
return;
@@ -793,7 +819,7 @@ void set_mtrr_aps_delayed_init(void)
*/
void mtrr_aps_init(void)
{
- if (!use_intel())
+ if (!use_intel() || !mtrr_enabled())
return;
/*
@@ -810,7 +836,7 @@ void mtrr_aps_init(void)
void mtrr_bp_restore(void)
{
- if (!use_intel())
+ if (!use_intel() || !mtrr_enabled())
return;
mtrr_if->set_all();
@@ -818,7 +844,7 @@ void mtrr_bp_restore(void)
static int __init mtrr_init_finialize(void)
{
- if (!mtrr_if)
+ if (!mtrr_enabled())
return 0;
if (use_intel()) {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index df5e41f31a27..951884dcc433 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -51,7 +51,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
void fill_mtrr_var_range(unsigned int index,
u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
-void get_mtrr_state(void);
+bool get_mtrr_state(void);
extern void set_mtrr_ops(const struct mtrr_ops *ops);
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index c76d3e37c6e1..e068d6683dba 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -22,6 +22,7 @@
#include <linux/elfcore.h>
#include <linux/module.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index fe9f0b79a18b..5cb9a4d6f623 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -627,8 +627,12 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
QFLAG_APPLY_ONCE, intel_graphics_stolen },
/*
- * HPET on current version of Baytrail platform has accuracy
- * problems, disable it for now:
+ * HPET on the current version of the Baytrail platform has accuracy
+ * problems: it will halt in deep idle state - so we disable it.
+ *
+ * More details can be found in section 18.10.1.3 of the datasheet:
+ *
+ * http://www.intel.com/content/dam/www/public/us/en/documents/datasheets/atom-z8000-datasheet-vol-1.pdf
*/
{ PCI_VENDOR_ID_INTEL, 0x0f00,
PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, force_disable_hpet},
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
deleted file mode 100644
index 1c309763e321..000000000000
--- a/arch/x86/kernel/entry_32.S
+++ /dev/null
@@ -1,1401 +0,0 @@
-/*
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- */
-
-/*
- * entry.S contains the system-call and fault low-level handling routines.
- * This also contains the timer-interrupt handler, as well as all interrupts
- * and faults that can result in a task-switch.
- *
- * NOTE: This code handles signal-recognition, which happens every time
- * after a timer-interrupt and after each system call.
- *
- * I changed all the .align's to 4 (16 byte alignment), as that's faster
- * on a 486.
- *
- * Stack layout in 'syscall_exit':
- * ptrace needs to have all regs on the stack.
- * if the order here is changed, it needs to be
- * updated in fork.c:copy_process, signal.c:do_signal,
- * ptrace.c and ptrace.h
- *
- * 0(%esp) - %ebx
- * 4(%esp) - %ecx
- * 8(%esp) - %edx
- * C(%esp) - %esi
- * 10(%esp) - %edi
- * 14(%esp) - %ebp
- * 18(%esp) - %eax
- * 1C(%esp) - %ds
- * 20(%esp) - %es
- * 24(%esp) - %fs
- * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
- * 2C(%esp) - orig_eax
- * 30(%esp) - %eip
- * 34(%esp) - %cs
- * 38(%esp) - %eflags
- * 3C(%esp) - %oldesp
- * 40(%esp) - %oldss
- *
- * "current" is in register %ebx during any slow entries.
- */
-
-#include <linux/linkage.h>
-#include <linux/err.h>
-#include <asm/thread_info.h>
-#include <asm/irqflags.h>
-#include <asm/errno.h>
-#include <asm/segment.h>
-#include <asm/smp.h>
-#include <asm/page_types.h>
-#include <asm/percpu.h>
-#include <asm/dwarf2.h>
-#include <asm/processor-flags.h>
-#include <asm/ftrace.h>
-#include <asm/irq_vectors.h>
-#include <asm/cpufeature.h>
-#include <asm/alternative-asm.h>
-#include <asm/asm.h>
-#include <asm/smap.h>
-
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-#define sysenter_audit syscall_trace_entry
-#define sysexit_audit syscall_exit_work
-#endif
-
- .section .entry.text, "ax"
-
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPT
-#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-#define preempt_stop(clobbers)
-#define resume_kernel restore_all
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-/*
- * User gs save/restore
- *
- * %gs is used for userland TLS and kernel only uses it for stack
- * canary which is required to be at %gs:20 by gcc. Read the comment
- * at the top of stackprotector.h for more info.
- *
- * Local labels 98 and 99 are used.
- */
-#ifdef CONFIG_X86_32_LAZY_GS
-
- /* unfortunately push/pop can't be no-op */
-.macro PUSH_GS
- pushl_cfi $0
-.endm
-.macro POP_GS pop=0
- addl $(4 + \pop), %esp
- CFI_ADJUST_CFA_OFFSET -(4 + \pop)
-.endm
-.macro POP_GS_EX
-.endm
-
- /* all the rest are no-op */
-.macro PTGS_TO_GS
-.endm
-.macro PTGS_TO_GS_EX
-.endm
-.macro GS_TO_REG reg
-.endm
-.macro REG_TO_PTGS reg
-.endm
-.macro SET_KERNEL_GS reg
-.endm
-
-#else /* CONFIG_X86_32_LAZY_GS */
-
-.macro PUSH_GS
- pushl_cfi %gs
- /*CFI_REL_OFFSET gs, 0*/
-.endm
-
-.macro POP_GS pop=0
-98: popl_cfi %gs
- /*CFI_RESTORE gs*/
- .if \pop <> 0
- add $\pop, %esp
- CFI_ADJUST_CFA_OFFSET -\pop
- .endif
-.endm
-.macro POP_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, (%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b,99b)
-.endm
-
-.macro PTGS_TO_GS
-98: mov PT_GS(%esp), %gs
-.endm
-.macro PTGS_TO_GS_EX
-.pushsection .fixup, "ax"
-99: movl $0, PT_GS(%esp)
- jmp 98b
-.popsection
- _ASM_EXTABLE(98b,99b)
-.endm
-
-.macro GS_TO_REG reg
- movl %gs, \reg
- /*CFI_REGISTER gs, \reg*/
-.endm
-.macro REG_TO_PTGS reg
- movl \reg, PT_GS(%esp)
- /*CFI_REL_OFFSET gs, PT_GS*/
-.endm
-.macro SET_KERNEL_GS reg
- movl $(__KERNEL_STACK_CANARY), \reg
- movl \reg, %gs
-.endm
-
-#endif /* CONFIG_X86_32_LAZY_GS */
-
-.macro SAVE_ALL
- cld
- PUSH_GS
- pushl_cfi %fs
- /*CFI_REL_OFFSET fs, 0;*/
- pushl_cfi %es
- /*CFI_REL_OFFSET es, 0;*/
- pushl_cfi %ds
- /*CFI_REL_OFFSET ds, 0;*/
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
- movl $(__USER_DS), %edx
- movl %edx, %ds
- movl %edx, %es
- movl $(__KERNEL_PERCPU), %edx
- movl %edx, %fs
- SET_KERNEL_GS %edx
-.endm
-
-.macro RESTORE_INT_REGS
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebp
- CFI_RESTORE ebp
- popl_cfi %eax
- CFI_RESTORE eax
-.endm
-
-.macro RESTORE_REGS pop=0
- RESTORE_INT_REGS
-1: popl_cfi %ds
- /*CFI_RESTORE ds;*/
-2: popl_cfi %es
- /*CFI_RESTORE es;*/
-3: popl_cfi %fs
- /*CFI_RESTORE fs;*/
- POP_GS \pop
-.pushsection .fixup, "ax"
-4: movl $0, (%esp)
- jmp 1b
-5: movl $0, (%esp)
- jmp 2b
-6: movl $0, (%esp)
- jmp 3b
-.popsection
- _ASM_EXTABLE(1b,4b)
- _ASM_EXTABLE(2b,5b)
- _ASM_EXTABLE(3b,6b)
- POP_GS_EX
-.endm
-
-.macro RING0_INT_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 3*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_EC_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 4*4
- /*CFI_OFFSET cs, -2*4;*/
- CFI_OFFSET eip, -3*4
-.endm
-
-.macro RING0_PTREGS_FRAME
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, PT_OLDESP-PT_EBX
- /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/
- CFI_OFFSET eip, PT_EIP-PT_OLDESP
- /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/
- /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/
- CFI_OFFSET eax, PT_EAX-PT_OLDESP
- CFI_OFFSET ebp, PT_EBP-PT_OLDESP
- CFI_OFFSET edi, PT_EDI-PT_OLDESP
- CFI_OFFSET esi, PT_ESI-PT_OLDESP
- CFI_OFFSET edx, PT_EDX-PT_OLDESP
- CFI_OFFSET ecx, PT_ECX-PT_OLDESP
- CFI_OFFSET ebx, PT_EBX-PT_OLDESP
-.endm
-
-ENTRY(ret_from_fork)
- CFI_STARTPROC
- pushl_cfi %eax
- call schedule_tail
- GET_THREAD_INFO(%ebp)
- popl_cfi %eax
- pushl_cfi $0x0202 # Reset kernel eflags
- popfl_cfi
- jmp syscall_exit
- CFI_ENDPROC
-END(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
- CFI_STARTPROC
- pushl_cfi %eax
- call schedule_tail
- GET_THREAD_INFO(%ebp)
- popl_cfi %eax
- pushl_cfi $0x0202 # Reset kernel eflags
- popfl_cfi
- movl PT_EBP(%esp),%eax
- call *PT_EBX(%esp)
- movl $0,PT_EAX(%esp)
- jmp syscall_exit
- CFI_ENDPROC
-ENDPROC(ret_from_kernel_thread)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
- ALIGN
- RING0_PTREGS_FRAME
-ret_from_exception:
- preempt_stop(CLBR_ANY)
-ret_from_intr:
- GET_THREAD_INFO(%ebp)
-#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
- /*
- * We can be coming here from child spawned by kernel_thread().
- */
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
-#endif
- cmpl $USER_RPL, %eax
- jb resume_kernel # not returning to v8086 or userspace
-
-ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done on
- # int/exception return?
- jne work_pending
- jmp restore_all
-END(ret_from_exception)
-
-#ifdef CONFIG_PREEMPT
-ENTRY(resume_kernel)
- DISABLE_INTERRUPTS(CLBR_ANY)
-need_resched:
- cmpl $0,PER_CPU_VAR(__preempt_count)
- jnz restore_all
- testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
- call preempt_schedule_irq
- jmp need_resched
-END(resume_kernel)
-#endif
- CFI_ENDPROC
-
-/* SYSENTER_RETURN points to after the "sysenter" instruction in
- the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
-
- # sysenter call handler stub
-ENTRY(ia32_sysenter_target)
- CFI_STARTPROC simple
- CFI_SIGNAL_FRAME
- CFI_DEF_CFA esp, 0
- CFI_REGISTER esp, ebp
- movl TSS_sysenter_sp0(%esp),%esp
-sysenter_past_esp:
- /*
- * Interrupts are disabled here, but we can't trace it until
- * enough kernel state to call TRACE_IRQS_OFF can be called - but
- * we immediately enable interrupts at that point anyway.
- */
- pushl_cfi $__USER_DS
- /*CFI_REL_OFFSET ss, 0*/
- pushl_cfi %ebp
- CFI_REL_OFFSET esp, 0
- pushfl_cfi
- orl $X86_EFLAGS_IF, (%esp)
- pushl_cfi $__USER_CS
- /*CFI_REL_OFFSET cs, 0*/
- /*
- * Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary: TI_sysenter_return
- * is relative to thread_info, which is at the bottom of the
- * kernel stack page. 4*4 means the 4 words pushed above;
- * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
- * and THREAD_SIZE takes us to the bottom.
- */
- pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
- CFI_REL_OFFSET eip, 0
-
- pushl_cfi %eax
- SAVE_ALL
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
- cmpl $__PAGE_OFFSET-3,%ebp
- jae syscall_fault
- ASM_STAC
-1: movl (%ebp),%ebp
- ASM_CLAC
- movl %ebp,PT_EBP(%esp)
- _ASM_EXTABLE(1b,syscall_fault)
-
- GET_THREAD_INFO(%ebp)
-
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz sysenter_audit
-sysenter_do_call:
- cmpl $(NR_syscalls), %eax
- jae sysenter_badsys
- call *sys_call_table(,%eax,4)
-sysenter_after_call:
- movl %eax,PT_EAX(%esp)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jnz sysexit_audit
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp,%ebp
- TRACE_IRQS_ON
-1: mov PT_FS(%esp), %fs
- PTGS_TO_GS
- ENABLE_INTERRUPTS_SYSEXIT
-
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
- jnz syscall_trace_entry
- /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
- movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */
- /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
- pushl_cfi PT_ESI(%esp) /* a3: 5th arg */
- pushl_cfi PT_EDX+4(%esp) /* a2: 4th arg */
- call __audit_syscall_entry
- popl_cfi %ecx /* get that remapped edx off the stack */
- popl_cfi %ecx /* get that remapped esi off the stack */
- movl PT_EAX(%esp),%eax /* reload syscall number */
- jmp sysenter_do_call
-
-sysexit_audit:
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
- movl %eax,%edx /* second arg, syscall return value */
- cmpl $-MAX_ERRNO,%eax /* is it an error ? */
- setbe %al /* 1 if so, 0 if not */
- movzbl %al,%eax /* zero-extend that */
- call __audit_syscall_exit
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jnz syscall_exit_work
- movl PT_EAX(%esp),%eax /* reload syscall return value */
- jmp sysenter_exit
-#endif
-
- CFI_ENDPROC
-.pushsection .fixup,"ax"
-2: movl $0,PT_FS(%esp)
- jmp 1b
-.popsection
- _ASM_EXTABLE(1b,2b)
- PTGS_TO_GS_EX
-ENDPROC(ia32_sysenter_target)
-
- # system call handler stub
-ENTRY(system_call)
- RING0_INT_FRAME # can't unwind into user space anyway
- ASM_CLAC
- pushl_cfi %eax # save orig_eax
- SAVE_ALL
- GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(NR_syscalls), %eax
- jae syscall_badsys
-syscall_call:
- call *sys_call_table(,%eax,4)
-syscall_after_call:
- movl %eax,PT_EAX(%esp) # store the return value
-syscall_exit:
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jnz syscall_exit_work
-
-restore_all:
- TRACE_IRQS_IRET
-restore_all_notrace:
-#ifdef CONFIG_X86_ESPFIX32
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb PT_OLDSS(%esp), %ah
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
- cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
- CFI_REMEMBER_STATE
- je ldt_ss # returning to user-space with LDT SS
-#endif
-restore_nocheck:
- RESTORE_REGS 4 # skip orig_eax/error_code
-irq_return:
- INTERRUPT_RETURN
-.section .fixup,"ax"
-ENTRY(iret_exc)
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
-.previous
- _ASM_EXTABLE(irq_return,iret_exc)
-
-#ifdef CONFIG_X86_ESPFIX32
- CFI_RESTORE_STATE
-ldt_ss:
-#ifdef CONFIG_PARAVIRT
- /*
- * The kernel can't run on a non-flat stack if paravirt mode
- * is active. Rather than try to fixup the high bits of
- * ESP, bypass this code entirely. This may break DOSemu
- * and/or Wine support in a paravirt VM, although the option
- * is still available to implement the setting of the high
- * 16-bits in the INTERRUPT_RETURN paravirt-op.
- */
- cmpl $0, pv_info+PARAVIRT_enabled
- jne restore_nocheck
-#endif
-
-/*
- * Setup and switch to ESPFIX stack
- *
- * We're returning to userspace with a 16 bit stack. The CPU will not
- * restore the high word of ESP for us on executing iret... This is an
- * "official" bug of all the x86-compatible CPUs, which we can work
- * around to make dosemu and wine happy. We do this by preloading the
- * high word of ESP with the high word of the userspace ESP while
- * compensating for the offset by changing to the ESPFIX segment with
- * a base address that matches for the difference.
- */
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
- mov %esp, %edx /* load kernel esp */
- mov PT_OLDESP(%esp), %eax /* load userspace esp */
- mov %dx, %ax /* eax: new kernel esp */
- sub %eax, %edx /* offset (low word is 0) */
- shr $16, %edx
- mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
- mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
- pushl_cfi $__ESPFIX_SS
- pushl_cfi %eax /* new kernel esp */
- /* Disable interrupts, but do not irqtrace this section: we
- * will soon execute iret and the tracer was already set to
- * the irqstate after the iret */
- DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp /* switch to espfix segment */
- CFI_ADJUST_CFA_OFFSET -8
- jmp restore_nocheck
-#endif
- CFI_ENDPROC
-ENDPROC(system_call)
-
- # perform work that needs to be done immediately before resumption
- ALIGN
- RING0_PTREGS_FRAME # can't unwind into user space anyway
-work_pending:
- testb $_TIF_NEED_RESCHED, %cl
- jz work_notifysig
-work_resched:
- call schedule
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
- # setting need_resched or sigpending
- # between sampling and the iret
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- andl $_TIF_WORK_MASK, %ecx # is there any work to be done other
- # than syscall tracing?
- jz restore_all
- testb $_TIF_NEED_RESCHED, %cl
- jnz work_resched
-
-work_notifysig: # deal with pending signals and
- # notify-resume requests
-#ifdef CONFIG_VM86
- testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
- movl %esp, %eax
- jnz work_notifysig_v86 # returning to kernel-space or
- # vm86-space
-1:
-#else
- movl %esp, %eax
-#endif
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- movb PT_CS(%esp), %bl
- andb $SEGMENT_RPL_MASK, %bl
- cmpb $USER_RPL, %bl
- jb resume_kernel
- xorl %edx, %edx
- call do_notify_resume
- jmp resume_userspace
-
-#ifdef CONFIG_VM86
- ALIGN
-work_notifysig_v86:
- pushl_cfi %ecx # save ti_flags for do_notify_resume
- call save_v86_state # %eax contains pt_regs pointer
- popl_cfi %ecx
- movl %eax, %esp
- jmp 1b
-#endif
-END(work_pending)
-
- # perform syscall exit tracing
- ALIGN
-syscall_trace_entry:
- movl $-ENOSYS,PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
- /* What it returned is what we'll actually use. */
- cmpl $(NR_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
-END(syscall_trace_entry)
-
- # perform syscall exit tracing
- ALIGN
-syscall_exit_work:
- testl $_TIF_WORK_SYSCALL_EXIT, %ecx
- jz work_pending
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
- # schedule() instead
- movl %esp, %eax
- call syscall_trace_leave
- jmp resume_userspace
-END(syscall_exit_work)
- CFI_ENDPROC
-
- RING0_INT_FRAME # can't unwind into user space anyway
-syscall_fault:
- ASM_CLAC
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT,PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
- movl $-ENOSYS,%eax
- jmp syscall_after_call
-END(syscall_badsys)
-
-sysenter_badsys:
- movl $-ENOSYS,%eax
- jmp sysenter_after_call
-END(sysenter_badsys)
- CFI_ENDPROC
-
-.macro FIXUP_ESPFIX_STACK
-/*
- * Switch back for ESPFIX stack to the normal zerobased stack
- *
- * We can't call C functions using the ESPFIX stack. This code reads
- * the high word of the segment base from the GDT and swiches to the
- * normal stack and adjusts ESP with the matching offset.
- */
-#ifdef CONFIG_X86_ESPFIX32
- /* fixup the stack */
- mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
- mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
- shl $16, %eax
- addl %esp, %eax /* the adjusted stack pointer */
- pushl_cfi $__KERNEL_DS
- pushl_cfi %eax
- lss (%esp), %esp /* switch to the normal stack segment */
- CFI_ADJUST_CFA_OFFSET -8
-#endif
-.endm
-.macro UNWIND_ESPFIX_STACK
-#ifdef CONFIG_X86_ESPFIX32
- movl %ss, %eax
- /* see if on espfix stack */
- cmpw $__ESPFIX_SS, %ax
- jne 27f
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
- movl %eax, %es
- /* switch to normal stack */
- FIXUP_ESPFIX_STACK
-27:
-#endif
-.endm
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-ENTRY(irq_entries_start)
- RING0_INT_FRAME
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- CFI_ADJUST_CFA_OFFSET -4
- .align 8
- .endr
-END(irq_entries_start)
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-common_interrupt:
- ASM_CLAC
- addl $-0x80,(%esp) /* Adjust vector into the [-256,-1] range */
- SAVE_ALL
- TRACE_IRQS_OFF
- movl %esp,%eax
- call do_IRQ
- jmp ret_from_intr
-ENDPROC(common_interrupt)
- CFI_ENDPROC
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-ENTRY(name) \
- RING0_INT_FRAME; \
- ASM_CLAC; \
- pushl_cfi $~(nr); \
- SAVE_ALL; \
- TRACE_IRQS_OFF \
- movl %esp,%eax; \
- call fn; \
- jmp ret_from_intr; \
- CFI_ENDPROC; \
-ENDPROC(name)
-
-
-#ifdef CONFIG_TRACING
-#define TRACE_BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name)
-#else
-#define TRACE_BUILD_INTERRUPT(name, nr)
-#endif
-
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
- TRACE_BUILD_INTERRUPT(name, nr)
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-ENTRY(coprocessor_error)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_coprocessor_error
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_error)
-
-ENTRY(simd_coprocessor_error)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl_cfi $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
- X86_FEATURE_XMM
-#else
- pushl_cfi $do_simd_coprocessor_error
-#endif
- jmp error_code
- CFI_ENDPROC
-END(simd_coprocessor_error)
-
-ENTRY(device_not_available)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $-1 # mark this as an int
- pushl_cfi $do_device_not_available
- jmp error_code
- CFI_ENDPROC
-END(device_not_available)
-
-#ifdef CONFIG_PARAVIRT
-ENTRY(native_iret)
- iret
- _ASM_EXTABLE(native_iret, iret_exc)
-END(native_iret)
-
-ENTRY(native_irq_enable_sysexit)
- sti
- sysexit
-END(native_irq_enable_sysexit)
-#endif
-
-ENTRY(overflow)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_overflow
- jmp error_code
- CFI_ENDPROC
-END(overflow)
-
-ENTRY(bounds)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_bounds
- jmp error_code
- CFI_ENDPROC
-END(bounds)
-
-ENTRY(invalid_op)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_invalid_op
- jmp error_code
- CFI_ENDPROC
-END(invalid_op)
-
-ENTRY(coprocessor_segment_overrun)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_coprocessor_segment_overrun
- jmp error_code
- CFI_ENDPROC
-END(coprocessor_segment_overrun)
-
-ENTRY(invalid_TSS)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_invalid_TSS
- jmp error_code
- CFI_ENDPROC
-END(invalid_TSS)
-
-ENTRY(segment_not_present)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_segment_not_present
- jmp error_code
- CFI_ENDPROC
-END(segment_not_present)
-
-ENTRY(stack_segment)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_stack_segment
- jmp error_code
- CFI_ENDPROC
-END(stack_segment)
-
-ENTRY(alignment_check)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_alignment_check
- jmp error_code
- CFI_ENDPROC
-END(alignment_check)
-
-ENTRY(divide_error)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0 # no error code
- pushl_cfi $do_divide_error
- jmp error_code
- CFI_ENDPROC
-END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-ENTRY(machine_check)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi machine_check_vector
- jmp error_code
- CFI_ENDPROC
-END(machine_check)
-#endif
-
-ENTRY(spurious_interrupt_bug)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $0
- pushl_cfi $do_spurious_interrupt_bug
- jmp error_code
- CFI_ENDPROC
-END(spurious_interrupt_bug)
-
-#ifdef CONFIG_XEN
-/* Xen doesn't set %esp to be precisely what the normal sysenter
- entrypoint expects, so fix it up before using the normal path. */
-ENTRY(xen_sysenter_target)
- RING0_INT_FRAME
- addl $5*4, %esp /* remove xen-provided frame */
- CFI_ADJUST_CFA_OFFSET -5*4
- jmp sysenter_past_esp
- CFI_ENDPROC
-
-ENTRY(xen_hypervisor_callback)
- CFI_STARTPROC
- pushl_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- TRACE_IRQS_OFF
-
- /* Check to see if we got the event in the critical
- region in xen_iret_direct, after we've reenabled
- events and checked for pending events. This simulates
- iret instruction's behaviour where it delivers a
- pending interrupt when enabling interrupts. */
- movl PT_EIP(%esp),%eax
- cmpl $xen_iret_start_crit,%eax
- jb 1f
- cmpl $xen_iret_end_crit,%eax
- jae 1f
-
- jmp xen_iret_crit_fixup
-
-ENTRY(xen_do_upcall)
-1: mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPT
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
- CFI_ENDPROC
-ENDPROC(xen_hypervisor_callback)
-
-# Hypervisor uses this for application faults while it executes.
-# We get here for two reasons:
-# 1. Fault while reloading DS, ES, FS or GS
-# 2. Fault while executing IRET
-# Category 1 we fix up by reattempting the load, and zeroing the segment
-# register if the load fails.
-# Category 2 we fix up by jumping to do_iret_error. We cannot use the
-# normal Linux return path in this case because if we use the IRET hypercall
-# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
-# We distinguish between categories by maintaining a status value in EAX.
-ENTRY(xen_failsafe_callback)
- CFI_STARTPROC
- pushl_cfi %eax
- movl $1,%eax
-1: mov 4(%esp),%ds
-2: mov 8(%esp),%es
-3: mov 12(%esp),%fs
-4: mov 16(%esp),%gs
- /* EAX == 0 => Category 1 (Bad segment)
- EAX != 0 => Category 2 (Bad IRET) */
- testl %eax,%eax
- popl_cfi %eax
- lea 16(%esp),%esp
- CFI_ADJUST_CFA_OFFSET -16
- jz 5f
- jmp iret_exc
-5: pushl_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
- jmp ret_from_exception
- CFI_ENDPROC
-
-.section .fixup,"ax"
-6: xorl %eax,%eax
- movl %eax,4(%esp)
- jmp 1b
-7: xorl %eax,%eax
- movl %eax,8(%esp)
- jmp 2b
-8: xorl %eax,%eax
- movl %eax,12(%esp)
- jmp 3b
-9: xorl %eax,%eax
- movl %eax,16(%esp)
- jmp 4b
-.previous
- _ASM_EXTABLE(1b,6b)
- _ASM_EXTABLE(2b,7b)
- _ASM_EXTABLE(3b,8b)
- _ASM_EXTABLE(4b,9b)
-ENDPROC(xen_failsafe_callback)
-
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
-
-#endif /* CONFIG_XEN */
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-ENTRY(mcount)
- ret
-END(mcount)
-
-ENTRY(ftrace_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- pushl $0 /* Pass NULL as regs pointer */
- movl 4*4(%esp), %eax
- movl 0x4(%ebp), %edx
- movl function_trace_op, %ecx
- subl $MCOUNT_INSN_SIZE, %eax
-
-.globl ftrace_call
-ftrace_call:
- call ftrace_stub
-
- addl $4,%esp /* skip NULL pointer */
- popl %edx
- popl %ecx
- popl %eax
-ftrace_ret:
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- jmp ftrace_stub
-#endif
-
-.globl ftrace_stub
-ftrace_stub:
- ret
-END(ftrace_caller)
-
-ENTRY(ftrace_regs_caller)
- pushf /* push flags before compare (in cs location) */
-
- /*
- * i386 does not save SS and ESP when coming from kernel.
- * Instead, to get sp, &regs->sp is used (see ptrace.h).
- * Unfortunately, that means eflags must be at the same location
- * as the current return ip is. We move the return ip into the
- * ip location, and move flags into the return ip location.
- */
- pushl 4(%esp) /* save return ip into ip slot */
-
- pushl $0 /* Load 0 into orig_ax */
- pushl %gs
- pushl %fs
- pushl %es
- pushl %ds
- pushl %eax
- pushl %ebp
- pushl %edi
- pushl %esi
- pushl %edx
- pushl %ecx
- pushl %ebx
-
- movl 13*4(%esp), %eax /* Get the saved flags */
- movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */
- /* clobbering return ip */
- movl $__KERNEL_CS,13*4(%esp)
-
- movl 12*4(%esp), %eax /* Load ip (1st parameter) */
- subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */
- movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */
- movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */
- pushl %esp /* Save pt_regs as 4th parameter */
-
-GLOBAL(ftrace_regs_call)
- call ftrace_stub
-
- addl $4, %esp /* Skip pt_regs */
- movl 14*4(%esp), %eax /* Move flags back into cs */
- movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */
- movl 12*4(%esp), %eax /* Get return ip from regs->ip */
- movl %eax, 14*4(%esp) /* Put return ip back for ret */
-
- popl %ebx
- popl %ecx
- popl %edx
- popl %esi
- popl %edi
- popl %ebp
- popl %eax
- popl %ds
- popl %es
- popl %fs
- popl %gs
- addl $8, %esp /* Skip orig_ax and ip */
- popf /* Pop flags at end (no addl to corrupt flags) */
- jmp ftrace_ret
-
- popf
- jmp ftrace_stub
-#else /* ! CONFIG_DYNAMIC_FTRACE */
-
-ENTRY(mcount)
- cmpl $__PAGE_OFFSET, %esp
- jb ftrace_stub /* Paging not enabled yet? */
-
- cmpl $ftrace_stub, ftrace_trace_function
- jnz trace
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- cmpl $ftrace_stub, ftrace_graph_return
- jnz ftrace_graph_caller
-
- cmpl $ftrace_graph_entry_stub, ftrace_graph_entry
- jnz ftrace_graph_caller
-#endif
-.globl ftrace_stub
-ftrace_stub:
- ret
-
- /* taken from glibc */
-trace:
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- movl 0x4(%ebp), %edx
- subl $MCOUNT_INSN_SIZE, %eax
-
- call *ftrace_trace_function
-
- popl %edx
- popl %ecx
- popl %eax
- jmp ftrace_stub
-END(mcount)
-#endif /* CONFIG_DYNAMIC_FTRACE */
-#endif /* CONFIG_FUNCTION_TRACER */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-ENTRY(ftrace_graph_caller)
- pushl %eax
- pushl %ecx
- pushl %edx
- movl 0xc(%esp), %eax
- lea 0x4(%ebp), %edx
- movl (%ebp), %ecx
- subl $MCOUNT_INSN_SIZE, %eax
- call prepare_ftrace_return
- popl %edx
- popl %ecx
- popl %eax
- ret
-END(ftrace_graph_caller)
-
-.globl return_to_handler
-return_to_handler:
- pushl %eax
- pushl %edx
- movl %ebp, %eax
- call ftrace_return_to_handler
- movl %eax, %ecx
- popl %edx
- popl %eax
- jmp *%ecx
-#endif
-
-#ifdef CONFIG_TRACING
-ENTRY(trace_page_fault)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $trace_do_page_fault
- jmp error_code
- CFI_ENDPROC
-END(trace_page_fault)
-#endif
-
-ENTRY(page_fault)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_page_fault
- ALIGN
-error_code:
- /* the function address is in %gs's slot on the stack */
- pushl_cfi %fs
- /*CFI_REL_OFFSET fs, 0*/
- pushl_cfi %es
- /*CFI_REL_OFFSET es, 0*/
- pushl_cfi %ds
- /*CFI_REL_OFFSET ds, 0*/
- pushl_cfi_reg eax
- pushl_cfi_reg ebp
- pushl_cfi_reg edi
- pushl_cfi_reg esi
- pushl_cfi_reg edx
- pushl_cfi_reg ecx
- pushl_cfi_reg ebx
- cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
- UNWIND_ESPFIX_STACK
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi # get the function address
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
- movl $(__USER_DS), %ecx
- movl %ecx, %ds
- movl %ecx, %es
- TRACE_IRQS_OFF
- movl %esp,%eax # pt_regs pointer
- call *%edi
- jmp ret_from_exception
- CFI_ENDPROC
-END(page_fault)
-
-/*
- * Debug traps and NMI can happen at the one SYSENTER instruction
- * that sets up the real kernel stack. Check here, since we can't
- * allow the wrong stack to be used.
- *
- * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have
- * already pushed 3 words if it hits on the sysenter instruction:
- * eflags, cs and eip.
- *
- * We just load the right stack, and push the three (known) values
- * by hand onto the new stack - while updating the return eip past
- * the instruction that would have done it for sysenter.
- */
-.macro FIX_STACK offset ok label
- cmpw $__KERNEL_CS, 4(%esp)
- jne \ok
-\label:
- movl TSS_sysenter_sp0 + \offset(%esp), %esp
- CFI_DEF_CFA esp, 0
- CFI_UNDEFINED eip
- pushfl_cfi
- pushl_cfi $__KERNEL_CS
- pushl_cfi $sysenter_past_esp
- CFI_REL_OFFSET eip, 0
-.endm
-
-ENTRY(debug)
- RING0_INT_FRAME
- ASM_CLAC
- cmpl $ia32_sysenter_target,(%esp)
- jne debug_stack_correct
- FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
-debug_stack_correct:
- pushl_cfi $-1 # mark this as an int
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # error code 0
- movl %esp,%eax # pt_regs pointer
- call do_debug
- jmp ret_from_exception
- CFI_ENDPROC
-END(debug)
-
-/*
- * NMI is doubly nasty. It can happen _while_ we're handling
- * a debug fault, and the debug fault hasn't yet been able to
- * clear up the stack. So we first check whether we got an
- * NMI on the sysenter entry path, but after that we need to
- * check whether we got an NMI on the debug path where the debug
- * fault happened on the sysenter path.
- */
-ENTRY(nmi)
- RING0_INT_FRAME
- ASM_CLAC
-#ifdef CONFIG_X86_ESPFIX32
- pushl_cfi %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl_cfi %eax
- je nmi_espfix_stack
-#endif
- cmpl $ia32_sysenter_target,(%esp)
- je nmi_stack_fixup
- pushl_cfi %eax
- movl %esp,%eax
- /* Do not access memory above the end of our stack page,
- * it might not exist.
- */
- andl $(THREAD_SIZE-1),%eax
- cmpl $(THREAD_SIZE-20),%eax
- popl_cfi %eax
- jae nmi_stack_correct
- cmpl $ia32_sysenter_target,12(%esp)
- je nmi_debug_stack_check
-nmi_stack_correct:
- /* We have a RING0_INT_FRAME here */
- pushl_cfi %eax
- SAVE_ALL
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_nmi
- jmp restore_all_notrace
- CFI_ENDPROC
-
-nmi_stack_fixup:
- RING0_INT_FRAME
- FIX_STACK 12, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-nmi_debug_stack_check:
- /* We have a RING0_INT_FRAME here */
- cmpw $__KERNEL_CS,16(%esp)
- jne nmi_stack_correct
- cmpl $debug,(%esp)
- jb nmi_stack_correct
- cmpl $debug_esp_fix_insn,(%esp)
- ja nmi_stack_correct
- FIX_STACK 24, nmi_stack_correct, 1
- jmp nmi_stack_correct
-
-#ifdef CONFIG_X86_ESPFIX32
-nmi_espfix_stack:
- /* We have a RING0_INT_FRAME here.
- *
- * create the pointer to lss back
- */
- pushl_cfi %ss
- pushl_cfi %esp
- addl $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl_cfi 16(%esp)
- .endr
- pushl_cfi %eax
- SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to espfix stack
- CFI_ADJUST_CFA_OFFSET -24
- jmp irq_return
-#endif
- CFI_ENDPROC
-END(nmi)
-
-ENTRY(int3)
- RING0_INT_FRAME
- ASM_CLAC
- pushl_cfi $-1 # mark this as an int
- SAVE_ALL
- TRACE_IRQS_OFF
- xorl %edx,%edx # zero error code
- movl %esp,%eax # pt_regs pointer
- call do_int3
- jmp ret_from_exception
- CFI_ENDPROC
-END(int3)
-
-ENTRY(general_protection)
- RING0_EC_FRAME
- pushl_cfi $do_general_protection
- jmp error_code
- CFI_ENDPROC
-END(general_protection)
-
-#ifdef CONFIG_KVM_GUEST
-ENTRY(async_page_fault)
- RING0_EC_FRAME
- ASM_CLAC
- pushl_cfi $do_async_page_fault
- jmp error_code
- CFI_ENDPROC
-END(async_page_fault)
-#endif
-
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 2b55ee6db053..5a4668136e98 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -167,7 +167,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
clear_bss();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
- set_intr_gate(i, early_idt_handlers[i]);
+ set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
copy_bootdata(__va(real_mode_data));
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 02d257256200..544dec4cc605 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -478,21 +478,22 @@ is486:
__INIT
setup_once:
/*
- * Set up a idt with 256 entries pointing to ignore_int,
- * interrupt gates. It doesn't actually load idt - that needs
- * to be done on each CPU. Interrupts are enabled elsewhere,
- * when we can be relatively sure everything is ok.
+ * Set up a idt with 256 interrupt gates that push zero if there
+ * is no error code and then jump to early_idt_handler_common.
+ * It doesn't actually load the idt - that needs to be done on
+ * each CPU. Interrupts are enabled elsewhere, when we can be
+ * relatively sure everything is ok.
*/
movl $idt_table,%edi
- movl $early_idt_handlers,%eax
+ movl $early_idt_handler_array,%eax
movl $NUM_EXCEPTION_VECTORS,%ecx
1:
movl %eax,(%edi)
movl %eax,4(%edi)
/* interrupt gate, dpl=0, present */
movl $(0x8E000000 + __KERNEL_CS),2(%edi)
- addl $9,%eax
+ addl $EARLY_IDT_HANDLER_SIZE,%eax
addl $8,%edi
loop 1b
@@ -524,26 +525,28 @@ setup_once:
andl $0,setup_once_ref /* Once is enough, thanks */
ret
-ENTRY(early_idt_handlers)
+ENTRY(early_idt_handler_array)
# 36(%esp) %eflags
# 32(%esp) %cs
# 28(%esp) %eip
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .if (EXCEPTION_ERRCODE_MASK >> i) & 1
- ASM_NOP2
- .else
+ .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
- jmp early_idt_handler
+ jmp early_idt_handler_common
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handlers)
+ENDPROC(early_idt_handler_array)
- /* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
cld
cmpl $2,(%esp) # X86_TRAP_NMI
@@ -603,7 +606,7 @@ ex_entry:
.Lis_nmi:
addl $8,%esp /* drop vector number and error code */
iret
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
/* This is the default interrupt "handler" :-) */
ALIGN
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 43eafc8afb69..e5c27f729a38 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -321,26 +321,28 @@ bad_address:
jmp bad_address
__INIT
- .globl early_idt_handlers
-early_idt_handlers:
+ENTRY(early_idt_handler_array)
# 104(%rsp) %rflags
# 96(%rsp) %cs
# 88(%rsp) %rip
# 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .if (EXCEPTION_ERRCODE_MASK >> i) & 1
- ASM_NOP2
- .else
+ .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
pushq $0 # Dummy error code, to make stack frame uniform
.endif
pushq $i # 72(%rsp) Vector number
- jmp early_idt_handler
+ jmp early_idt_handler_common
i = i + 1
+ .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
+ENDPROC(early_idt_handler_array)
-/* This is global to keep gas from relaxing the jumps */
-ENTRY(early_idt_handler)
+early_idt_handler_common:
+ /*
+ * The stack is the hardware frame, an error code or zero, and the
+ * vector number.
+ */
cld
cmpl $2,(%rsp) # X86_TRAP_NMI
@@ -412,7 +414,7 @@ ENTRY(early_idt_handler)
.Lis_nmi:
addq $16,%rsp # drop vector number and error code
INTERRUPT_RETURN
-ENDPROC(early_idt_handler)
+ENDPROC(early_idt_handler_common)
__INITDATA
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 009183276bb7..6185d3141219 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -173,6 +173,21 @@ static void init_thread_xstate(void)
xstate_size = sizeof(struct i387_fxsave_struct);
else
xstate_size = sizeof(struct i387_fsave_struct);
+
+ /*
+ * Quirk: we don't yet handle the XSAVES* instructions
+ * correctly, as we don't correctly convert between
+ * standard and compacted format when interfacing
+ * with user-space - so disable it for now.
+ *
+ * The difference is small: with recent CPUs the
+ * compacted format is only marginally smaller than
+ * the standard FPU state format.
+ *
+ * ( This is easy to backport while we are fixing
+ * XSAVES* support. )
+ */
+ setup_clear_cpu_cap(X86_FEATURE_XSAVES);
}
/*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 7e10c8b4b318..88b366487b0e 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -122,6 +122,12 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
seq_puts(p, " Threshold APIC interrupts\n");
#endif
+#ifdef CONFIG_X86_MCE_AMD
+ seq_printf(p, "%*s: ", prec, "DFR");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", irq_stats(j)->irq_deferred_error_count);
+ seq_puts(p, " Deferred Error APIC interrupts\n");
+#endif
#ifdef CONFIG_X86_MCE
seq_printf(p, "%*s: ", prec, "MCE");
for_each_online_cpu(j)
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 680723a8e4b6..a3a5e158ed69 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -135,6 +135,10 @@ static void __init apic_intr_init(void)
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
+#ifdef CONFIG_X86_MCE_AMD
+ alloc_intr_gate(DEFERRED_ERROR_VECTOR, deferred_error_interrupt);
+#endif
+
#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 9435620062df..1681504e44a4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu)
kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void kvm_wait(u8 *ptr, u8 val)
+{
+ unsigned long flags;
+
+ if (in_nmi())
+ return;
+
+ local_irq_save(flags);
+
+ if (READ_ONCE(*ptr) != val)
+ goto out;
+
+ /*
+ * halt until it's our turn and kicked. Note that we do safe halt
+ * for irq enabled case to avoid hang when lock info is overwritten
+ * in irq spinlock slowpath and no spurious interrupt occur to save us.
+ */
+ if (arch_irqs_disabled_flags(flags))
+ halt();
+ else
+ safe_halt();
+
+out:
+ local_irq_restore(flags);
+}
+
+#else /* !CONFIG_QUEUED_SPINLOCKS */
+
enum kvm_contention_stat {
TAKEN_SLOW,
TAKEN_SLOW_PICKUP,
@@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
}
}
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void)
if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
return;
+#ifdef CONFIG_QUEUED_SPINLOCKS
+ __pv_init_lock_hash();
+ pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_lock_ops.wait = kvm_wait;
+ pv_lock_ops.kick = kvm_kick_cpu;
+#else /* !CONFIG_QUEUED_SPINLOCKS */
pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
}
static __init int kvm_spinlock_init_jump(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 415480d3ea84..11546b462fa6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -17,6 +17,7 @@
#include <linux/ftrace.h>
#include <linux/io.h>
#include <linux/suspend.h>
+#include <linux/vmalloc.h>
#include <asm/init.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index bbb6c7316341..33ee3e0efd65 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,11 +8,33 @@
#include <asm/paravirt.h>
+#ifdef CONFIG_QUEUED_SPINLOCKS
+__visible void __native_queued_spin_unlock(struct qspinlock *lock)
+{
+ native_queued_spin_unlock(lock);
+}
+
+PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock);
+
+bool pv_is_native_spin_unlock(void)
+{
+ return pv_lock_ops.queued_spin_unlock.func ==
+ __raw_callee_save___native_queued_spin_unlock;
+}
+#endif
+
struct pv_lock_ops pv_lock_ops = {
#ifdef CONFIG_SMP
+#ifdef CONFIG_QUEUED_SPINLOCKS
+ .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath,
+ .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock),
+ .wait = paravirt_nop,
+ .kick = paravirt_nop,
+#else /* !CONFIG_QUEUED_SPINLOCKS */
.lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop),
.unlock_kick = paravirt_nop,
-#endif
+#endif /* !CONFIG_QUEUED_SPINLOCKS */
+#endif /* SMP */
};
EXPORT_SYMBOL(pv_lock_ops);
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6d6ab6..e1b013696dde 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
+#endif
+
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
{
/* arg in %eax, return in %eax */
@@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
return 0;
}
+extern bool pv_is_native_spin_unlock(void);
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
@@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_cpu_ops, read_tsc);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+#endif
default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
+
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
}
#undef PATCH_SITE
return ret;
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 0de21c62c348..8aa05583bc42 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)");
+#endif
+
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
{
return paravirt_patch_insns(insnbuf, len,
@@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
start__mov64, end__mov64);
}
+extern bool pv_is_native_spin_unlock(void);
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
@@ -58,14 +64,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_cpu_ops, clts);
PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
-
- patch_site:
- ret = paravirt_patch_insns(ibuf, len, start, end);
- break;
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
+ case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
+ if (pv_is_native_spin_unlock()) {
+ start = start_pv_lock_ops_queued_spin_unlock;
+ end = end_pv_lock_ops_queued_spin_unlock;
+ goto patch_site;
+ }
+#endif
default:
ret = paravirt_patch_default(type, clobbers, ibuf, addr, len);
break;
+
+patch_site:
+ ret = paravirt_patch_insns(ibuf, len, start, end);
+ break;
}
#undef PATCH_SITE
return ret;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5e0791f9d3dc..de379366f6d1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -72,8 +72,7 @@ gate_desc debug_idt_table[NR_VECTORS] __page_aligned_bss;
#else
#include <asm/processor-flags.h>
#include <asm/setup.h>
-
-asmlinkage int system_call(void);
+#include <asm/proto.h>
#endif
/* Must be page-aligned because the real IDT is used in a fixmap. */
@@ -813,18 +812,6 @@ dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
conditional_sti(regs);
-#if 0
- /* No need to warn about this any longer. */
- pr_info("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage __visible void __attribute__((weak)) smp_threshold_interrupt(void)
-{
}
/*
@@ -992,12 +979,12 @@ void __init trap_init(void)
set_bit(i, used_vectors);
#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
#ifdef CONFIG_X86_32
- set_system_trap_gate(IA32_SYSCALL_VECTOR, &system_call);
+ set_system_trap_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59b69f6a2844..1d08ad3582d0 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -16,6 +16,8 @@
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <asm/i387.h> /* For use_eager_fpu. Ugh! */
+#include <asm/fpu-internal.h> /* For use_eager_fpu. Ugh! */
#include <asm/user.h>
#include <asm/xsave.h>
#include "cpuid.h"
@@ -95,6 +97,8 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
+ vcpu->arch.eager_fpu = guest_cpuid_has_mpx(vcpu);
+
/*
* The existing code assumes virtual address is 48-bit in the canonical
* address checks; exit if it is ever changed.
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index c3b1ad9fca81..496b3695d3d3 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -117,4 +117,12 @@ static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu)
best = kvm_find_cpuid_entry(vcpu, 7, 0);
return best && (best->ebx & bit(X86_FEATURE_RTM));
}
+
+static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *best;
+
+ best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ return best && (best->ebx & bit(X86_FEATURE_MPX));
+}
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d43867c33bc4..44a7d2515497 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3736,8 +3736,8 @@ static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
}
}
-void update_permission_bitmask(struct kvm_vcpu *vcpu,
- struct kvm_mmu *mmu, bool ept)
+static void update_permission_bitmask(struct kvm_vcpu *vcpu,
+ struct kvm_mmu *mmu, bool ept)
{
unsigned bit, byte, pfec;
u8 map;
@@ -3918,6 +3918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
{
bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
+ bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
struct kvm_mmu *context = &vcpu->arch.mmu;
MMU_WARN_ON(VALID_PAGE(context->root_hpa));
@@ -3936,6 +3937,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
context->base_role.cr0_wp = is_write_protection(vcpu);
context->base_role.smep_andnot_wp
= smep && !is_write_protection(vcpu);
+ context->base_role.smap_andnot_wp
+ = smap && !is_write_protection(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
@@ -4207,12 +4210,18 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes)
{
gfn_t gfn = gpa >> PAGE_SHIFT;
- union kvm_mmu_page_role mask = { .word = 0 };
struct kvm_mmu_page *sp;
LIST_HEAD(invalid_list);
u64 entry, gentry, *spte;
int npte;
bool remote_flush, local_flush, zap_page;
+ union kvm_mmu_page_role mask = (union kvm_mmu_page_role) {
+ .cr0_wp = 1,
+ .cr4_pae = 1,
+ .nxe = 1,
+ .smep_andnot_wp = 1,
+ .smap_andnot_wp = 1,
+ };
/*
* If we don't have indirect shadow pages, it means no page is
@@ -4238,7 +4247,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
++vcpu->kvm->stat.mmu_pte_write;
kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
- mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c7d65637c851..0ada65ecddcf 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -71,8 +71,6 @@ enum {
int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
-void update_permission_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- bool ept);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
@@ -166,6 +164,8 @@ static inline bool permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
int index = (pfec >> 1) +
(smap >> (X86_EFLAGS_AC_BIT - PFERR_RSVD_BIT + 1));
+ WARN_ON(pfec & PFERR_RSVD_MASK);
+
return (mmu->permissions[index] >> pte_access) & 1;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index fd49c867b25a..6e6d115fe9b5 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -718,6 +718,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
mmu_is_nested(vcpu));
if (likely(r != RET_MMIO_PF_INVALID))
return r;
+
+ /*
+ * page fault with PFEC.RSVD = 1 is caused by shadow
+ * page fault, should not be used to walk guest page
+ * table.
+ */
+ error_code &= ~PFERR_RSVD_MASK;
};
r = mmu_topup_memory_caches(vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ce741b8650f6..9afa233b5482 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4381,6 +4381,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.cache_reg = svm_cache_reg,
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
+ .fpu_activate = svm_fpu_activate,
.fpu_deactivate = svm_fpu_deactivate,
.tlb_flush = svm_flush_tlb,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b61687bd79..2d73807f0d31 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -10185,6 +10185,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.cache_reg = vmx_cache_reg,
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
+ .fpu_activate = vmx_fpu_activate,
.fpu_deactivate = vmx_fpu_deactivate,
.tlb_flush = vmx_flush_tlb,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c73efcd03e29..ea306adbbc13 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -702,8 +702,9 @@ EXPORT_SYMBOL_GPL(kvm_set_xcr);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
- X86_CR4_PAE | X86_CR4_SMEP;
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP;
+
if (cr4 & CR4_RESERVED_BITS)
return 1;
@@ -744,9 +745,6 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
- if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
- update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
-
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
kvm_update_cpuid(vcpu);
@@ -6197,6 +6195,8 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
return;
page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
/*
@@ -7060,7 +7060,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
fpu_save_init(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ if (!vcpu->arch.eager_fpu)
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+
trace_kvm_fpu(0);
}
@@ -7076,11 +7078,21 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
+ struct kvm_vcpu *vcpu;
+
if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
printk_once(KERN_WARNING
"kvm: SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
- return kvm_x86_ops->vcpu_create(kvm, id);
+
+ vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+ /*
+ * Activate fpu unconditionally in case the guest needs eager FPU. It will be
+ * deactivated soon if it doesn't.
+ */
+ kvm_x86_ops->fpu_activate(vcpu);
+ return vcpu;
}
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 982989d282ff..f2587888d987 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,7 +17,6 @@ clean-files := inat-tables.c
obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
lib-y := delay.o misc.o cmdline.o
-lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
lib-y += memcpy_$(BITS).o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 00933d5e992f..9b0ca8fe80fc 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -11,26 +11,23 @@
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
/* if you want SMP support, implement these with real spinlocks */
.macro LOCK reg
- pushfl_cfi
+ pushfl
cli
.endm
.macro UNLOCK reg
- popfl_cfi
+ popfl
.endm
#define BEGIN(op) \
.macro endp; \
- CFI_ENDPROC; \
ENDPROC(atomic64_##op##_386); \
.purgem endp; \
.endm; \
ENTRY(atomic64_##op##_386); \
- CFI_STARTPROC; \
LOCK v;
#define ENDP endp
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 082a85167a5b..db3ae85440ff 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -11,7 +11,6 @@
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
.macro read64 reg
movl %ebx, %eax
@@ -22,16 +21,11 @@
.endm
ENTRY(atomic64_read_cx8)
- CFI_STARTPROC
-
read64 %ecx
ret
- CFI_ENDPROC
ENDPROC(atomic64_read_cx8)
ENTRY(atomic64_set_cx8)
- CFI_STARTPROC
-
1:
/* we don't need LOCK_PREFIX since aligned 64-bit writes
* are atomic on 586 and newer */
@@ -39,28 +33,23 @@ ENTRY(atomic64_set_cx8)
jne 1b
ret
- CFI_ENDPROC
ENDPROC(atomic64_set_cx8)
ENTRY(atomic64_xchg_cx8)
- CFI_STARTPROC
-
1:
LOCK_PREFIX
cmpxchg8b (%esi)
jne 1b
ret
- CFI_ENDPROC
ENDPROC(atomic64_xchg_cx8)
.macro addsub_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
- CFI_STARTPROC
- pushl_cfi_reg ebp
- pushl_cfi_reg ebx
- pushl_cfi_reg esi
- pushl_cfi_reg edi
+ pushl %ebp
+ pushl %ebx
+ pushl %esi
+ pushl %edi
movl %eax, %esi
movl %edx, %edi
@@ -79,12 +68,11 @@ ENTRY(atomic64_\func\()_return_cx8)
10:
movl %ebx, %eax
movl %ecx, %edx
- popl_cfi_reg edi
- popl_cfi_reg esi
- popl_cfi_reg ebx
- popl_cfi_reg ebp
+ popl %edi
+ popl %esi
+ popl %ebx
+ popl %ebp
ret
- CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
.endm
@@ -93,8 +81,7 @@ addsub_return sub sub sbb
.macro incdec_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
- CFI_STARTPROC
- pushl_cfi_reg ebx
+ pushl %ebx
read64 %esi
1:
@@ -109,9 +96,8 @@ ENTRY(atomic64_\func\()_return_cx8)
10:
movl %ebx, %eax
movl %ecx, %edx
- popl_cfi_reg ebx
+ popl %ebx
ret
- CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
.endm
@@ -119,8 +105,7 @@ incdec_return inc add adc
incdec_return dec sub sbb
ENTRY(atomic64_dec_if_positive_cx8)
- CFI_STARTPROC
- pushl_cfi_reg ebx
+ pushl %ebx
read64 %esi
1:
@@ -136,18 +121,16 @@ ENTRY(atomic64_dec_if_positive_cx8)
2:
movl %ebx, %eax
movl %ecx, %edx
- popl_cfi_reg ebx
+ popl %ebx
ret
- CFI_ENDPROC
ENDPROC(atomic64_dec_if_positive_cx8)
ENTRY(atomic64_add_unless_cx8)
- CFI_STARTPROC
- pushl_cfi_reg ebp
- pushl_cfi_reg ebx
+ pushl %ebp
+ pushl %ebx
/* these just push these two parameters on the stack */
- pushl_cfi_reg edi
- pushl_cfi_reg ecx
+ pushl %edi
+ pushl %ecx
movl %eax, %ebp
movl %edx, %edi
@@ -168,21 +151,18 @@ ENTRY(atomic64_add_unless_cx8)
movl $1, %eax
3:
addl $8, %esp
- CFI_ADJUST_CFA_OFFSET -8
- popl_cfi_reg ebx
- popl_cfi_reg ebp
+ popl %ebx
+ popl %ebp
ret
4:
cmpl %edx, 4(%esp)
jne 2b
xorl %eax, %eax
jmp 3b
- CFI_ENDPROC
ENDPROC(atomic64_add_unless_cx8)
ENTRY(atomic64_inc_not_zero_cx8)
- CFI_STARTPROC
- pushl_cfi_reg ebx
+ pushl %ebx
read64 %esi
1:
@@ -199,7 +179,6 @@ ENTRY(atomic64_inc_not_zero_cx8)
movl $1, %eax
3:
- popl_cfi_reg ebx
+ popl %ebx
ret
- CFI_ENDPROC
ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index 9bc944a91274..c1e623209853 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -26,7 +26,6 @@
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/errno.h>
#include <asm/asm.h>
@@ -50,9 +49,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
* alignment for the unrolled loop.
*/
ENTRY(csum_partial)
- CFI_STARTPROC
- pushl_cfi_reg esi
- pushl_cfi_reg ebx
+ pushl %esi
+ pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -129,10 +127,9 @@ ENTRY(csum_partial)
jz 8f
roll $8, %eax
8:
- popl_cfi_reg ebx
- popl_cfi_reg esi
+ popl %ebx
+ popl %esi
ret
- CFI_ENDPROC
ENDPROC(csum_partial)
#else
@@ -140,9 +137,8 @@ ENDPROC(csum_partial)
/* Version for PentiumII/PPro */
ENTRY(csum_partial)
- CFI_STARTPROC
- pushl_cfi_reg esi
- pushl_cfi_reg ebx
+ pushl %esi
+ pushl %ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -249,10 +245,9 @@ ENTRY(csum_partial)
jz 90f
roll $8, %eax
90:
- popl_cfi_reg ebx
- popl_cfi_reg esi
+ popl %ebx
+ popl %esi
ret
- CFI_ENDPROC
ENDPROC(csum_partial)
#endif
@@ -287,12 +282,10 @@ unsigned int csum_partial_copy_generic (const char *src, char *dst,
#define FP 12
ENTRY(csum_partial_copy_generic)
- CFI_STARTPROC
subl $4,%esp
- CFI_ADJUST_CFA_OFFSET 4
- pushl_cfi_reg edi
- pushl_cfi_reg esi
- pushl_cfi_reg ebx
+ pushl %edi
+ pushl %esi
+ pushl %ebx
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
@@ -401,12 +394,11 @@ DST( movb %cl, (%edi) )
.previous
- popl_cfi_reg ebx
- popl_cfi_reg esi
- popl_cfi_reg edi
- popl_cfi %ecx # equivalent to addl $4,%esp
+ popl %ebx
+ popl %esi
+ popl %edi
+ popl %ecx # equivalent to addl $4,%esp
ret
- CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
#else
@@ -426,10 +418,9 @@ ENDPROC(csum_partial_copy_generic)
#define ARGBASE 12
ENTRY(csum_partial_copy_generic)
- CFI_STARTPROC
- pushl_cfi_reg ebx
- pushl_cfi_reg edi
- pushl_cfi_reg esi
+ pushl %ebx
+ pushl %edi
+ pushl %esi
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
@@ -489,11 +480,10 @@ DST( movb %dl, (%edi) )
jmp 7b
.previous
- popl_cfi_reg esi
- popl_cfi_reg edi
- popl_cfi_reg ebx
+ popl %esi
+ popl %edi
+ popl %ebx
ret
- CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
#undef ROUND
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index e67e579c93bd..a2fe51b00cce 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,4 @@
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
@@ -15,7 +14,6 @@
* %rdi - page
*/
ENTRY(clear_page)
- CFI_STARTPROC
ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
"jmp clear_page_c_e", X86_FEATURE_ERMS
@@ -24,11 +22,9 @@ ENTRY(clear_page)
xorl %eax,%eax
rep stosq
ret
- CFI_ENDPROC
ENDPROC(clear_page)
ENTRY(clear_page_orig)
- CFI_STARTPROC
xorl %eax,%eax
movl $4096/64,%ecx
@@ -48,14 +44,11 @@ ENTRY(clear_page_orig)
jnz .Lloop
nop
ret
- CFI_ENDPROC
ENDPROC(clear_page_orig)
ENTRY(clear_page_c_e)
- CFI_STARTPROC
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
- CFI_ENDPROC
ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 40a172541ee2..9b330242e740 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -6,7 +6,6 @@
*
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/percpu.h>
.text
@@ -21,7 +20,6 @@
* %al : Operation successful
*/
ENTRY(this_cpu_cmpxchg16b_emu)
-CFI_STARTPROC
#
# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
@@ -32,7 +30,7 @@ CFI_STARTPROC
# *atomic* on a single cpu (as provided by the this_cpu_xx class of
# macros).
#
- pushfq_cfi
+ pushfq
cli
cmpq PER_CPU_VAR((%rsi)), %rax
@@ -43,17 +41,13 @@ CFI_STARTPROC
movq %rbx, PER_CPU_VAR((%rsi))
movq %rcx, PER_CPU_VAR(8(%rsi))
- CFI_REMEMBER_STATE
- popfq_cfi
+ popfq
mov $1, %al
ret
- CFI_RESTORE_STATE
.Lnot_same:
- popfq_cfi
+ popfq
xor %al,%al
ret
-CFI_ENDPROC
-
ENDPROC(this_cpu_cmpxchg16b_emu)
diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S
index b4807fce5177..ad5349778490 100644
--- a/arch/x86/lib/cmpxchg8b_emu.S
+++ b/arch/x86/lib/cmpxchg8b_emu.S
@@ -7,7 +7,6 @@
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
.text
@@ -20,14 +19,13 @@
* %ecx : high 32 bits of new value
*/
ENTRY(cmpxchg8b_emu)
-CFI_STARTPROC
#
# Emulate 'cmpxchg8b (%esi)' on UP except we don't
# set the whole ZF thing (caller will just compare
# eax:edx with the expected value)
#
- pushfl_cfi
+ pushfl
cli
cmpl (%esi), %eax
@@ -38,18 +36,15 @@ CFI_STARTPROC
movl %ebx, (%esi)
movl %ecx, 4(%esi)
- CFI_REMEMBER_STATE
- popfl_cfi
+ popfl
ret
- CFI_RESTORE_STATE
.Lnot_same:
movl (%esi), %eax
.Lhalf_same:
movl 4(%esi), %edx
- popfl_cfi
+ popfl
ret
-CFI_ENDPROC
ENDPROC(cmpxchg8b_emu)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 8239dbcbf984..009f98216b7e 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -1,7 +1,6 @@
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
@@ -13,22 +12,16 @@
*/
ALIGN
ENTRY(copy_page)
- CFI_STARTPROC
ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
ret
- CFI_ENDPROC
ENDPROC(copy_page)
ENTRY(copy_page_regs)
- CFI_STARTPROC
subq $2*8, %rsp
- CFI_ADJUST_CFA_OFFSET 2*8
movq %rbx, (%rsp)
- CFI_REL_OFFSET rbx, 0
movq %r12, 1*8(%rsp)
- CFI_REL_OFFSET r12, 1*8
movl $(4096/64)-5, %ecx
.p2align 4
@@ -87,11 +80,7 @@ ENTRY(copy_page_regs)
jnz .Loop2
movq (%rsp), %rbx
- CFI_RESTORE rbx
movq 1*8(%rsp), %r12
- CFI_RESTORE r12
addq $2*8, %rsp
- CFI_ADJUST_CFA_OFFSET -2*8
ret
- CFI_ENDPROC
ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index e4b3beee83bd..982ce34f4a9b 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -7,7 +7,6 @@
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
@@ -18,7 +17,6 @@
/* Standard copy_to_user with segment limit checking */
ENTRY(_copy_to_user)
- CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rdi,%rcx
addq %rdx,%rcx
@@ -30,12 +28,10 @@ ENTRY(_copy_to_user)
X86_FEATURE_REP_GOOD, \
"jmp copy_user_enhanced_fast_string", \
X86_FEATURE_ERMS
- CFI_ENDPROC
ENDPROC(_copy_to_user)
/* Standard copy_from_user with segment limit checking */
ENTRY(_copy_from_user)
- CFI_STARTPROC
GET_THREAD_INFO(%rax)
movq %rsi,%rcx
addq %rdx,%rcx
@@ -47,14 +43,12 @@ ENTRY(_copy_from_user)
X86_FEATURE_REP_GOOD, \
"jmp copy_user_enhanced_fast_string", \
X86_FEATURE_ERMS
- CFI_ENDPROC
ENDPROC(_copy_from_user)
.section .fixup,"ax"
/* must zero dest */
ENTRY(bad_from_user)
bad_from_user:
- CFI_STARTPROC
movl %edx,%ecx
xorl %eax,%eax
rep
@@ -62,7 +56,6 @@ bad_from_user:
bad_to_user:
movl %edx,%eax
ret
- CFI_ENDPROC
ENDPROC(bad_from_user)
.previous
@@ -80,7 +73,6 @@ ENDPROC(bad_from_user)
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_unrolled)
- CFI_STARTPROC
ASM_STAC
cmpl $8,%edx
jb 20f /* less then 8 bytes, go to byte copy loop */
@@ -162,7 +154,6 @@ ENTRY(copy_user_generic_unrolled)
_ASM_EXTABLE(19b,40b)
_ASM_EXTABLE(21b,50b)
_ASM_EXTABLE(22b,50b)
- CFI_ENDPROC
ENDPROC(copy_user_generic_unrolled)
/* Some CPUs run faster using the string copy instructions.
@@ -184,7 +175,6 @@ ENDPROC(copy_user_generic_unrolled)
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_generic_string)
- CFI_STARTPROC
ASM_STAC
cmpl $8,%edx
jb 2f /* less than 8 bytes, go to byte copy loop */
@@ -209,7 +199,6 @@ ENTRY(copy_user_generic_string)
_ASM_EXTABLE(1b,11b)
_ASM_EXTABLE(3b,12b)
- CFI_ENDPROC
ENDPROC(copy_user_generic_string)
/*
@@ -225,7 +214,6 @@ ENDPROC(copy_user_generic_string)
* eax uncopied bytes or 0 if successful.
*/
ENTRY(copy_user_enhanced_fast_string)
- CFI_STARTPROC
ASM_STAC
movl %edx,%ecx
1: rep
@@ -240,7 +228,6 @@ ENTRY(copy_user_enhanced_fast_string)
.previous
_ASM_EXTABLE(1b,12b)
- CFI_ENDPROC
ENDPROC(copy_user_enhanced_fast_string)
/*
@@ -248,7 +235,6 @@ ENDPROC(copy_user_enhanced_fast_string)
* This will force destination/source out of cache for more performance.
*/
ENTRY(__copy_user_nocache)
- CFI_STARTPROC
ASM_STAC
cmpl $8,%edx
jb 20f /* less then 8 bytes, go to byte copy loop */
@@ -332,5 +318,4 @@ ENTRY(__copy_user_nocache)
_ASM_EXTABLE(19b,40b)
_ASM_EXTABLE(21b,50b)
_ASM_EXTABLE(22b,50b)
- CFI_ENDPROC
ENDPROC(__copy_user_nocache)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 9734182966f3..7e48807b2fa1 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -6,7 +6,6 @@
* for more details. No warranty for anything given at all.
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/errno.h>
#include <asm/asm.h>
@@ -47,23 +46,16 @@
ENTRY(csum_partial_copy_generic)
- CFI_STARTPROC
cmpl $3*64, %edx
jle .Lignore
.Lignore:
subq $7*8, %rsp
- CFI_ADJUST_CFA_OFFSET 7*8
movq %rbx, 2*8(%rsp)
- CFI_REL_OFFSET rbx, 2*8
movq %r12, 3*8(%rsp)
- CFI_REL_OFFSET r12, 3*8
movq %r14, 4*8(%rsp)
- CFI_REL_OFFSET r14, 4*8
movq %r13, 5*8(%rsp)
- CFI_REL_OFFSET r13, 5*8
movq %rbp, 6*8(%rsp)
- CFI_REL_OFFSET rbp, 6*8
movq %r8, (%rsp)
movq %r9, 1*8(%rsp)
@@ -206,22 +198,14 @@ ENTRY(csum_partial_copy_generic)
addl %ebx, %eax
adcl %r9d, %eax /* carry */
- CFI_REMEMBER_STATE
.Lende:
movq 2*8(%rsp), %rbx
- CFI_RESTORE rbx
movq 3*8(%rsp), %r12
- CFI_RESTORE r12
movq 4*8(%rsp), %r14
- CFI_RESTORE r14
movq 5*8(%rsp), %r13
- CFI_RESTORE r13
movq 6*8(%rsp), %rbp
- CFI_RESTORE rbp
addq $7*8, %rsp
- CFI_ADJUST_CFA_OFFSET -7*8
ret
- CFI_RESTORE_STATE
/* Exception handlers. Very simple, zeroing is done in the wrappers */
.Lbad_source:
@@ -237,5 +221,4 @@ ENTRY(csum_partial_copy_generic)
jz .Lende
movl $-EFAULT, (%rax)
jmp .Lende
- CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S
index a4512359656a..46668cda4ffd 100644
--- a/arch/x86/lib/getuser.S
+++ b/arch/x86/lib/getuser.S
@@ -26,7 +26,6 @@
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/page_types.h>
#include <asm/errno.h>
#include <asm/asm-offsets.h>
@@ -36,7 +35,6 @@
.text
ENTRY(__get_user_1)
- CFI_STARTPROC
GET_THREAD_INFO(%_ASM_DX)
cmp TI_addr_limit(%_ASM_DX),%_ASM_AX
jae bad_get_user
@@ -45,11 +43,9 @@ ENTRY(__get_user_1)
xor %eax,%eax
ASM_CLAC
ret
- CFI_ENDPROC
ENDPROC(__get_user_1)
ENTRY(__get_user_2)
- CFI_STARTPROC
add $1,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
@@ -60,11 +56,9 @@ ENTRY(__get_user_2)
xor %eax,%eax
ASM_CLAC
ret
- CFI_ENDPROC
ENDPROC(__get_user_2)
ENTRY(__get_user_4)
- CFI_STARTPROC
add $3,%_ASM_AX
jc bad_get_user
GET_THREAD_INFO(%_ASM_DX)
@@ -75,11 +69,9 @@ ENTRY(__get_user_4)
xor %eax,%eax
ASM_CLAC
ret
- CFI_ENDPROC
ENDPROC(__get_user_4)
ENTRY(__get_user_8)
- CFI_STARTPROC
#ifdef CONFIG_X86_64
add $7,%_ASM_AX
jc bad_get_user
@@ -104,28 +96,23 @@ ENTRY(__get_user_8)
ASM_CLAC
ret
#endif
- CFI_ENDPROC
ENDPROC(__get_user_8)
bad_get_user:
- CFI_STARTPROC
xor %edx,%edx
mov $(-EFAULT),%_ASM_AX
ASM_CLAC
ret
- CFI_ENDPROC
END(bad_get_user)
#ifdef CONFIG_X86_32
bad_get_user_8:
- CFI_STARTPROC
xor %edx,%edx
xor %ecx,%ecx
mov $(-EFAULT),%_ASM_AX
ASM_CLAC
ret
- CFI_ENDPROC
END(bad_get_user_8)
#endif
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
index 05a95e713da8..33147fef3452 100644
--- a/arch/x86/lib/iomap_copy_64.S
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -16,15 +16,12 @@
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
/*
* override generic version in lib/iomap_copy.c
*/
ENTRY(__iowrite32_copy)
- CFI_STARTPROC
movl %edx,%ecx
rep movsd
ret
- CFI_ENDPROC
ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index b046664f5a1c..16698bba87de 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -2,7 +2,6 @@
#include <linux/linkage.h>
#include <asm/cpufeature.h>
-#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/*
@@ -53,7 +52,6 @@ ENTRY(memcpy_erms)
ENDPROC(memcpy_erms)
ENTRY(memcpy_orig)
- CFI_STARTPROC
movq %rdi, %rax
cmpq $0x20, %rdx
@@ -178,5 +176,4 @@ ENTRY(memcpy_orig)
.Lend:
retq
- CFI_ENDPROC
ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0f8a0d0331b9..ca2afdd6d98e 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -6,7 +6,6 @@
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
@@ -27,7 +26,6 @@
ENTRY(memmove)
ENTRY(__memmove)
- CFI_STARTPROC
/* Handle more 32 bytes in loop */
mov %rdi, %rax
@@ -207,6 +205,5 @@ ENTRY(__memmove)
movb %r11b, (%rdi)
13:
retq
- CFI_ENDPROC
ENDPROC(__memmove)
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 93118fb23976..2661fad05827 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -1,7 +1,6 @@
/* Copyright 2002 Andi Kleen, SuSE Labs */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
@@ -66,7 +65,6 @@ ENTRY(memset_erms)
ENDPROC(memset_erms)
ENTRY(memset_orig)
- CFI_STARTPROC
movq %rdi,%r10
/* expand byte value */
@@ -78,7 +76,6 @@ ENTRY(memset_orig)
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
- CFI_REMEMBER_STATE
.Lafter_bad_alignment:
movq %rdx,%rcx
@@ -128,7 +125,6 @@ ENTRY(memset_orig)
movq %r10,%rax
ret
- CFI_RESTORE_STATE
.Lbad_alignment:
cmpq $7,%rdx
jbe .Lhandle_7
@@ -139,5 +135,4 @@ ENTRY(memset_orig)
subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
- CFI_ENDPROC
ENDPROC(memset_orig)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index 3ca5218fbece..c81556409bbb 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -1,6 +1,5 @@
#include <linux/linkage.h>
#include <linux/errno.h>
-#include <asm/dwarf2.h>
#include <asm/asm.h>
#include <asm/msr.h>
@@ -13,9 +12,8 @@
*/
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
- CFI_STARTPROC
- pushq_cfi_reg rbx
- pushq_cfi_reg rbp
+ pushq %rbx
+ pushq %rbp
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
@@ -25,7 +23,6 @@ ENTRY(\op\()_safe_regs)
movl 20(%rdi), %ebp
movl 24(%rdi), %esi
movl 28(%rdi), %edi
- CFI_REMEMBER_STATE
1: \op
2: movl %eax, (%r10)
movl %r11d, %eax /* Return value */
@@ -35,16 +32,14 @@ ENTRY(\op\()_safe_regs)
movl %ebp, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
- popq_cfi_reg rbp
- popq_cfi_reg rbx
+ popq %rbp
+ popq %rbx
ret
3:
- CFI_RESTORE_STATE
movl $-EIO, %r11d
jmp 2b
_ASM_EXTABLE(1b, 3b)
- CFI_ENDPROC
ENDPROC(\op\()_safe_regs)
.endm
@@ -52,13 +47,12 @@ ENDPROC(\op\()_safe_regs)
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
- CFI_STARTPROC
- pushl_cfi_reg ebx
- pushl_cfi_reg ebp
- pushl_cfi_reg esi
- pushl_cfi_reg edi
- pushl_cfi $0 /* Return value */
- pushl_cfi %eax
+ pushl %ebx
+ pushl %ebp
+ pushl %esi
+ pushl %edi
+ pushl $0 /* Return value */
+ pushl %eax
movl 4(%eax), %ecx
movl 8(%eax), %edx
movl 12(%eax), %ebx
@@ -66,32 +60,28 @@ ENTRY(\op\()_safe_regs)
movl 24(%eax), %esi
movl 28(%eax), %edi
movl (%eax), %eax
- CFI_REMEMBER_STATE
1: \op
-2: pushl_cfi %eax
+2: pushl %eax
movl 4(%esp), %eax
- popl_cfi (%eax)
+ popl (%eax)
addl $4, %esp
- CFI_ADJUST_CFA_OFFSET -4
movl %ecx, 4(%eax)
movl %edx, 8(%eax)
movl %ebx, 12(%eax)
movl %ebp, 20(%eax)
movl %esi, 24(%eax)
movl %edi, 28(%eax)
- popl_cfi %eax
- popl_cfi_reg edi
- popl_cfi_reg esi
- popl_cfi_reg ebp
- popl_cfi_reg ebx
+ popl %eax
+ popl %edi
+ popl %esi
+ popl %ebp
+ popl %ebx
ret
3:
- CFI_RESTORE_STATE
movl $-EIO, 4(%esp)
jmp 2b
_ASM_EXTABLE(1b, 3b)
- CFI_ENDPROC
ENDPROC(\op\()_safe_regs)
.endm
diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S
index fc6ba17a7eec..e0817a12d323 100644
--- a/arch/x86/lib/putuser.S
+++ b/arch/x86/lib/putuser.S
@@ -11,7 +11,6 @@
* return value.
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
#include <asm/thread_info.h>
#include <asm/errno.h>
#include <asm/asm.h>
@@ -30,11 +29,9 @@
* as they get called from within inline assembly.
*/
-#define ENTER CFI_STARTPROC ; \
- GET_THREAD_INFO(%_ASM_BX)
+#define ENTER GET_THREAD_INFO(%_ASM_BX)
#define EXIT ASM_CLAC ; \
- ret ; \
- CFI_ENDPROC
+ ret
.text
ENTRY(__put_user_1)
@@ -87,7 +84,6 @@ ENTRY(__put_user_8)
ENDPROC(__put_user_8)
bad_put_user:
- CFI_STARTPROC
movl $-EFAULT,%eax
EXIT
END(bad_put_user)
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 2322abe4da3b..40027db99140 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -15,7 +15,6 @@
#include <linux/linkage.h>
#include <asm/alternative-asm.h>
-#include <asm/dwarf2.h>
#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg)
#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l)
@@ -34,10 +33,10 @@
*/
#define save_common_regs \
- pushl_cfi_reg ecx
+ pushl %ecx
#define restore_common_regs \
- popl_cfi_reg ecx
+ popl %ecx
/* Avoid uglifying the argument copying x86-64 needs to do. */
.macro movq src, dst
@@ -64,50 +63,45 @@
*/
#define save_common_regs \
- pushq_cfi_reg rdi; \
- pushq_cfi_reg rsi; \
- pushq_cfi_reg rcx; \
- pushq_cfi_reg r8; \
- pushq_cfi_reg r9; \
- pushq_cfi_reg r10; \
- pushq_cfi_reg r11
+ pushq %rdi; \
+ pushq %rsi; \
+ pushq %rcx; \
+ pushq %r8; \
+ pushq %r9; \
+ pushq %r10; \
+ pushq %r11
#define restore_common_regs \
- popq_cfi_reg r11; \
- popq_cfi_reg r10; \
- popq_cfi_reg r9; \
- popq_cfi_reg r8; \
- popq_cfi_reg rcx; \
- popq_cfi_reg rsi; \
- popq_cfi_reg rdi
+ popq %r11; \
+ popq %r10; \
+ popq %r9; \
+ popq %r8; \
+ popq %rcx; \
+ popq %rsi; \
+ popq %rdi
#endif
/* Fix up special calling conventions */
ENTRY(call_rwsem_down_read_failed)
- CFI_STARTPROC
save_common_regs
- __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
+ __ASM_SIZE(push,) %__ASM_REG(dx)
movq %rax,%rdi
call rwsem_down_read_failed
- __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
restore_common_regs
ret
- CFI_ENDPROC
ENDPROC(call_rwsem_down_read_failed)
ENTRY(call_rwsem_down_write_failed)
- CFI_STARTPROC
save_common_regs
movq %rax,%rdi
call rwsem_down_write_failed
restore_common_regs
ret
- CFI_ENDPROC
ENDPROC(call_rwsem_down_write_failed)
ENTRY(call_rwsem_wake)
- CFI_STARTPROC
/* do nothing if still outstanding active readers */
__ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx)
jnz 1f
@@ -116,17 +110,14 @@ ENTRY(call_rwsem_wake)
call rwsem_wake
restore_common_regs
1: ret
- CFI_ENDPROC
ENDPROC(call_rwsem_wake)
ENTRY(call_rwsem_downgrade_wake)
- CFI_STARTPROC
save_common_regs
- __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
+ __ASM_SIZE(push,) %__ASM_REG(dx)
movq %rax,%rdi
call rwsem_downgrade_wake
- __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
+ __ASM_SIZE(pop,) %__ASM_REG(dx)
restore_common_regs
ret
- CFI_ENDPROC
ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 1d553186c434..8533b46e6bee 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -40,7 +40,7 @@
*/
uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
- [_PAGE_CACHE_MODE_WC ] = _PAGE_PWT | 0 ,
+ [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC ] = _PAGE_PWT | _PAGE_PCD,
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
@@ -50,11 +50,11 @@ EXPORT_SYMBOL(__cachemode2pte_tbl);
uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
- [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_WC,
+ [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC,
[__pte2cm_idx( 0 | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
- [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+ [__pte2cm_idx(_PAGE_PWT | 0 | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 9ca35fc60cfe..a9dc7a37e6a2 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -77,13 +77,13 @@ void __iomem *
iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
{
/*
- * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
- * PAGE_KERNEL_WC maps to PWT, which translates to uncached if the
- * MTRR is UC or WC. UC_MINUS gets the real intention, of the
- * user, which is "WC if the MTRR is WC, UC if you can't do that."
+ * For non-PAT systems, translate non-WB request to UC- just in
+ * case the caller set the PWT bit to prot directly without using
+ * pgprot_writecombine(). UC- translates to uncached if the MTRR
+ * is UC or WC. UC- gets the real intention, of the user, which is
+ * "WC if the MTRR is WC, UC if you can't do that."
*/
- if (!pat_enabled && pgprot_val(prot) ==
- (__PAGE_KERNEL | cachemode2protval(_PAGE_CACHE_MODE_WC)))
+ if (!pat_enabled() && pgprot2cachemode(prot) != _PAGE_CACHE_MODE_WB)
prot = __pgprot(__PAGE_KERNEL |
cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS));
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 70e7444c6835..8405c0c6a535 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -42,6 +42,9 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
case _PAGE_CACHE_MODE_WC:
err = _set_memory_wc(vaddr, nrpages);
break;
+ case _PAGE_CACHE_MODE_WT:
+ err = _set_memory_wt(vaddr, nrpages);
+ break;
case _PAGE_CACHE_MODE_WB:
err = _set_memory_wb(vaddr, nrpages);
break;
@@ -172,6 +175,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
prot = __pgprot(pgprot_val(prot) |
cachemode2protval(_PAGE_CACHE_MODE_WC));
break;
+ case _PAGE_CACHE_MODE_WT:
+ prot = __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+ break;
case _PAGE_CACHE_MODE_WB:
break;
}
@@ -234,10 +241,11 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
/*
* Ideally, this should be:
- * pat_enabled ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
+ * pat_enabled() ? _PAGE_CACHE_MODE_UC : _PAGE_CACHE_MODE_UC_MINUS;
*
* Till we fix all X drivers to use ioremap_wc(), we will use
- * UC MINUS.
+ * UC MINUS. Drivers that are certain they need or can already
+ * be converted over to strong UC can use ioremap_uc().
*/
enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC_MINUS;
@@ -247,6 +255,39 @@ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
EXPORT_SYMBOL(ioremap_nocache);
/**
+ * ioremap_uc - map bus memory into CPU space as strongly uncachable
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_uc performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked with a strong
+ * preference as completely uncachable on the CPU when possible. For non-PAT
+ * systems this ends up setting page-attribute flags PCD=1, PWT=1. For PAT
+ * systems this will set the PAT entry for the pages as strong UC. This call
+ * will honor existing caching rules from things like the PCI bus. Note that
+ * there are other caches and buffers on many busses. In particular driver
+ * authors should read up on PCI writes.
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_uc(resource_size_t phys_addr, unsigned long size)
+{
+ enum page_cache_mode pcm = _PAGE_CACHE_MODE_UC;
+
+ return __ioremap_caller(phys_addr, size, pcm,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL_GPL(ioremap_uc);
+
+/**
* ioremap_wc - map memory into CPU space write combined
* @phys_addr: bus address of the memory
* @size: size of the resource to map
@@ -258,14 +299,28 @@ EXPORT_SYMBOL(ioremap_nocache);
*/
void __iomem *ioremap_wc(resource_size_t phys_addr, unsigned long size)
{
- if (pat_enabled)
- return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WC,
__builtin_return_address(0));
- else
- return ioremap_nocache(phys_addr, size);
}
EXPORT_SYMBOL(ioremap_wc);
+/**
+ * ioremap_wt - map memory into CPU space write through
+ * @phys_addr: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write through.
+ * Write through stores data into memory while keeping the cache up-to-date.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wt(resource_size_t phys_addr, unsigned long size)
+{
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WT,
+ __builtin_return_address(0));
+}
+EXPORT_SYMBOL(ioremap_wt);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
return __ioremap_caller(phys_addr, size, _PAGE_CACHE_MODE_WB,
@@ -331,7 +386,7 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
-int arch_ioremap_pud_supported(void)
+int __init arch_ioremap_pud_supported(void)
{
#ifdef CONFIG_X86_64
return cpu_has_gbpages;
@@ -340,7 +395,7 @@ int arch_ioremap_pud_supported(void)
#endif
}
-int arch_ioremap_pmd_supported(void)
+int __init arch_ioremap_pmd_supported(void)
{
return cpu_has_pse;
}
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 6629f397b467..8ff686aa7e8c 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -9,6 +9,7 @@
#include <linux/random.h>
#include <linux/kernel.h>
#include <linux/mm.h>
+#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 89af288ec674..727158cb3b3c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -14,6 +14,7 @@
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/pci.h>
+#include <linux/vmalloc.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -129,16 +130,15 @@ within(unsigned long addr, unsigned long start, unsigned long end)
*/
void clflush_cache_range(void *vaddr, unsigned int size)
{
- void *vend = vaddr + size - 1;
+ unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1;
+ void *vend = vaddr + size;
+ void *p;
mb();
- for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
- clflushopt(vaddr);
- /*
- * Flush any possible final partial cacheline:
- */
- clflushopt(vend);
+ for (p = (void *)((unsigned long)vaddr & ~clflush_mask);
+ p < vend; p += boot_cpu_data.x86_clflush_size)
+ clflushopt(p);
mb();
}
@@ -418,13 +418,11 @@ phys_addr_t slow_virt_to_phys(void *__virt_addr)
phys_addr_t phys_addr;
unsigned long offset;
enum pg_level level;
- unsigned long psize;
unsigned long pmask;
pte_t *pte;
pte = lookup_address(virt_addr, &level);
BUG_ON(!pte);
- psize = page_level_size(level);
pmask = page_level_mask(level);
offset = virt_addr & ~pmask;
phys_addr = (phys_addr_t)pte_pfn(*pte) << PAGE_SHIFT;
@@ -1468,6 +1466,9 @@ int _set_memory_uc(unsigned long addr, int numpages)
{
/*
* for now UC MINUS. see comments in ioremap_nocache()
+ * If you really need strong UC use ioremap_uc(), but note
+ * that you cannot override IO areas with set_memory_*() as
+ * these helpers cannot work with IO memory.
*/
return change_page_attr_set(&addr, numpages,
cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
@@ -1502,12 +1503,10 @@ EXPORT_SYMBOL(set_memory_uc);
static int _set_memory_array(unsigned long *addr, int addrinarray,
enum page_cache_mode new_type)
{
+ enum page_cache_mode set_type;
int i, j;
int ret;
- /*
- * for now UC MINUS. see comments in ioremap_nocache()
- */
for (i = 0; i < addrinarray; i++) {
ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
new_type, NULL);
@@ -1515,9 +1514,12 @@ static int _set_memory_array(unsigned long *addr, int addrinarray,
goto out_free;
}
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
ret = change_page_attr_set(addr, addrinarray,
- cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
- 1);
+ cachemode2pgprot(set_type), 1);
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(addr, addrinarray,
@@ -1549,6 +1551,12 @@ int set_memory_array_wc(unsigned long *addr, int addrinarray)
}
EXPORT_SYMBOL(set_memory_array_wc);
+int set_memory_array_wt(unsigned long *addr, int addrinarray)
+{
+ return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_memory_array_wt);
+
int _set_memory_wc(unsigned long addr, int numpages)
{
int ret;
@@ -1571,27 +1579,42 @@ int set_memory_wc(unsigned long addr, int numpages)
{
int ret;
- if (!pat_enabled)
- return set_memory_uc(addr, numpages);
-
ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
_PAGE_CACHE_MODE_WC, NULL);
if (ret)
- goto out_err;
+ return ret;
ret = _set_memory_wc(addr, numpages);
if (ret)
- goto out_free;
-
- return 0;
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-out_free:
- free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
-out_err:
return ret;
}
EXPORT_SYMBOL(set_memory_wc);
+int _set_memory_wt(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(&addr, numpages,
+ cachemode2pgprot(_PAGE_CACHE_MODE_WT), 0);
+}
+
+int set_memory_wt(unsigned long addr, int numpages)
+{
+ int ret;
+
+ ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+ _PAGE_CACHE_MODE_WT, NULL);
+ if (ret)
+ return ret;
+
+ ret = _set_memory_wt(addr, numpages);
+ if (ret)
+ free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_memory_wt);
+
int _set_memory_wb(unsigned long addr, int numpages)
{
/* WB cache mode is hard wired to all cache attribute bits being 0 */
@@ -1682,6 +1705,7 @@ static int _set_pages_array(struct page **pages, int addrinarray,
{
unsigned long start;
unsigned long end;
+ enum page_cache_mode set_type;
int i;
int free_idx;
int ret;
@@ -1695,8 +1719,12 @@ static int _set_pages_array(struct page **pages, int addrinarray,
goto err_out;
}
+ /* If WC, set to UC- first and then WC */
+ set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
+ _PAGE_CACHE_MODE_UC_MINUS : new_type;
+
ret = cpa_set_pages_array(pages, addrinarray,
- cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS));
+ cachemode2pgprot(set_type));
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
ret = change_page_attr_set_clr(NULL, addrinarray,
cachemode2pgprot(
@@ -1730,6 +1758,12 @@ int set_pages_array_wc(struct page **pages, int addrinarray)
}
EXPORT_SYMBOL(set_pages_array_wc);
+int set_pages_array_wt(struct page **pages, int addrinarray)
+{
+ return _set_pages_array(pages, addrinarray, _PAGE_CACHE_MODE_WT);
+}
+EXPORT_SYMBOL_GPL(set_pages_array_wt);
+
int set_pages_wb(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 35af6771a95a..188e3e07eeeb 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -33,13 +33,17 @@
#include "pat_internal.h"
#include "mm_internal.h"
-#ifdef CONFIG_X86_PAT
-int __read_mostly pat_enabled = 1;
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
+
+static bool boot_cpu_done;
+
+static int __read_mostly __pat_enabled = IS_ENABLED(CONFIG_X86_PAT);
static inline void pat_disable(const char *reason)
{
- pat_enabled = 0;
- printk(KERN_INFO "%s\n", reason);
+ __pat_enabled = 0;
+ pr_info("x86/PAT: %s\n", reason);
}
static int __init nopat(char *str)
@@ -48,13 +52,12 @@ static int __init nopat(char *str)
return 0;
}
early_param("nopat", nopat);
-#else
-static inline void pat_disable(const char *reason)
+
+bool pat_enabled(void)
{
- (void)reason;
+ return !!__pat_enabled;
}
-#endif
-
+EXPORT_SYMBOL_GPL(pat_enabled);
int pat_debug_enable;
@@ -65,22 +68,24 @@ static int __init pat_debug_setup(char *str)
}
__setup("debugpat", pat_debug_setup);
-static u64 __read_mostly boot_pat_state;
-
#ifdef CONFIG_X86_PAT
/*
- * X86 PAT uses page flags WC and Uncached together to keep track of
- * memory type of pages that have backing page struct. X86 PAT supports 3
- * different memory types, _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC and
- * _PAGE_CACHE_MODE_UC_MINUS and fourth state where page's memory type has not
- * been changed from its default (value of -1 used to denote this).
- * Note we do not support _PAGE_CACHE_MODE_UC here.
+ * X86 PAT uses page flags arch_1 and uncached together to keep track of
+ * memory type of pages that have backing page struct.
+ *
+ * X86 PAT supports 4 different memory types:
+ * - _PAGE_CACHE_MODE_WB
+ * - _PAGE_CACHE_MODE_WC
+ * - _PAGE_CACHE_MODE_UC_MINUS
+ * - _PAGE_CACHE_MODE_WT
+ *
+ * _PAGE_CACHE_MODE_WB is the default type.
*/
-#define _PGMT_DEFAULT 0
+#define _PGMT_WB 0
#define _PGMT_WC (1UL << PG_arch_1)
#define _PGMT_UC_MINUS (1UL << PG_uncached)
-#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1)
+#define _PGMT_WT (1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1)
#define _PGMT_CLEAR_MASK (~_PGMT_MASK)
@@ -88,14 +93,14 @@ static inline enum page_cache_mode get_page_memtype(struct page *pg)
{
unsigned long pg_flags = pg->flags & _PGMT_MASK;
- if (pg_flags == _PGMT_DEFAULT)
- return -1;
+ if (pg_flags == _PGMT_WB)
+ return _PAGE_CACHE_MODE_WB;
else if (pg_flags == _PGMT_WC)
return _PAGE_CACHE_MODE_WC;
else if (pg_flags == _PGMT_UC_MINUS)
return _PAGE_CACHE_MODE_UC_MINUS;
else
- return _PAGE_CACHE_MODE_WB;
+ return _PAGE_CACHE_MODE_WT;
}
static inline void set_page_memtype(struct page *pg,
@@ -112,11 +117,12 @@ static inline void set_page_memtype(struct page *pg,
case _PAGE_CACHE_MODE_UC_MINUS:
memtype_flags = _PGMT_UC_MINUS;
break;
- case _PAGE_CACHE_MODE_WB:
- memtype_flags = _PGMT_WB;
+ case _PAGE_CACHE_MODE_WT:
+ memtype_flags = _PGMT_WT;
break;
+ case _PAGE_CACHE_MODE_WB:
default:
- memtype_flags = _PGMT_DEFAULT;
+ memtype_flags = _PGMT_WB;
break;
}
@@ -174,78 +180,154 @@ static enum page_cache_mode pat_get_cache_mode(unsigned pat_val, char *msg)
* configuration.
* Using lower indices is preferred, so we start with highest index.
*/
-void pat_init_cache_modes(void)
+void pat_init_cache_modes(u64 pat)
{
- int i;
enum page_cache_mode cache;
char pat_msg[33];
- u64 pat;
+ int i;
- rdmsrl(MSR_IA32_CR_PAT, pat);
pat_msg[32] = 0;
for (i = 7; i >= 0; i--) {
cache = pat_get_cache_mode((pat >> (i * 8)) & 7,
pat_msg + 4 * i);
update_cache_mode_entry(i, cache);
}
- pr_info("PAT configuration [0-7]: %s\n", pat_msg);
+ pr_info("x86/PAT: Configuration [0-7]: %s\n", pat_msg);
}
#define PAT(x, y) ((u64)PAT_ ## y << ((x)*8))
-void pat_init(void)
+static void pat_bsp_init(u64 pat)
{
- u64 pat;
- bool boot_cpu = !boot_pat_state;
+ u64 tmp_pat;
- if (!pat_enabled)
+ if (!cpu_has_pat) {
+ pat_disable("PAT not supported by CPU.");
return;
+ }
- if (!cpu_has_pat) {
- if (!boot_pat_state) {
- pat_disable("PAT not supported by CPU.");
- return;
- } else {
- /*
- * If this happens we are on a secondary CPU, but
- * switched to PAT on the boot CPU. We have no way to
- * undo PAT.
- */
- printk(KERN_ERR "PAT enabled, "
- "but not supported by secondary CPU\n");
- BUG();
- }
+ if (!pat_enabled())
+ goto done;
+
+ rdmsrl(MSR_IA32_CR_PAT, tmp_pat);
+ if (!tmp_pat) {
+ pat_disable("PAT MSR is 0, disabled.");
+ return;
}
- /* Set PWT to Write-Combining. All other bits stay the same */
- /*
- * PTE encoding used in Linux:
- * PAT
- * |PCD
- * ||PWT
- * |||
- * 000 WB _PAGE_CACHE_WB
- * 001 WC _PAGE_CACHE_WC
- * 010 UC- _PAGE_CACHE_UC_MINUS
- * 011 UC _PAGE_CACHE_UC
- * PAT bit unused
- */
- pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
- PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
-
- /* Boot CPU check */
- if (!boot_pat_state) {
- rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
- if (!boot_pat_state) {
- pat_disable("PAT read returns always zero, disabled.");
- return;
- }
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+
+done:
+ pat_init_cache_modes(pat);
+}
+
+static void pat_ap_init(u64 pat)
+{
+ if (!pat_enabled())
+ return;
+
+ if (!cpu_has_pat) {
+ /*
+ * If this happens we are on a secondary CPU, but switched to
+ * PAT on the boot CPU. We have no way to undo PAT.
+ */
+ panic("x86/PAT: PAT enabled, but not supported by secondary CPU\n");
}
wrmsrl(MSR_IA32_CR_PAT, pat);
+}
+
+void pat_init(void)
+{
+ u64 pat;
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+
+ if (!pat_enabled()) {
+ /*
+ * No PAT. Emulate the PAT table that corresponds to the two
+ * cache bits, PWT (Write Through) and PCD (Cache Disable). This
+ * setup is the same as the BIOS default setup when the system
+ * has PAT but the "nopat" boot option has been specified. This
+ * emulated PAT table is used when MSR_IA32_CR_PAT returns 0.
+ *
+ * PTE encoding:
+ *
+ * PCD
+ * |PWT PAT
+ * || slot
+ * 00 0 WB : _PAGE_CACHE_MODE_WB
+ * 01 1 WT : _PAGE_CACHE_MODE_WT
+ * 10 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 11 3 UC : _PAGE_CACHE_MODE_UC
+ *
+ * NOTE: When WC or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WT) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WT) | PAT(6, UC_MINUS) | PAT(7, UC);
- if (boot_cpu)
- pat_init_cache_modes();
+ } else if ((c->x86_vendor == X86_VENDOR_INTEL) &&
+ (((c->x86 == 0x6) && (c->x86_model <= 0xd)) ||
+ ((c->x86 == 0xf) && (c->x86_model <= 0x6)))) {
+ /*
+ * PAT support with the lower four entries. Intel Pentium 2,
+ * 3, M, and 4 are affected by PAT errata, which makes the
+ * upper four entries unusable. To be on the safe side, we don't
+ * use those.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * PAT bit unused
+ *
+ * NOTE: When WT or WP is used, it is redirected to UC- per
+ * the default setup in __cachemode2pte_tbl[].
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, UC);
+ } else {
+ /*
+ * Full PAT support. We put WT in slot 7 to improve
+ * robustness in the presence of errata that might cause
+ * the high PAT bit to be ignored. This way, a buggy slot 7
+ * access will hit slot 3, and slot 3 is UC, so at worst
+ * we lose performance without causing a correctness issue.
+ * Pentium 4 erratum N46 is an example for such an erratum,
+ * although we try not to use PAT at all on affected CPUs.
+ *
+ * PTE encoding:
+ * PAT
+ * |PCD
+ * ||PWT PAT
+ * ||| slot
+ * 000 0 WB : _PAGE_CACHE_MODE_WB
+ * 001 1 WC : _PAGE_CACHE_MODE_WC
+ * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS
+ * 011 3 UC : _PAGE_CACHE_MODE_UC
+ * 100 4 WB : Reserved
+ * 101 5 WC : Reserved
+ * 110 6 UC-: Reserved
+ * 111 7 WT : _PAGE_CACHE_MODE_WT
+ *
+ * The reserved slots are unused, but mapped to their
+ * corresponding types in the presence of PAT errata.
+ */
+ pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) |
+ PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT);
+ }
+
+ if (!boot_cpu_done) {
+ pat_bsp_init(pat);
+ boot_cpu_done = true;
+ } else {
+ pat_ap_init(pat);
+ }
}
#undef PAT
@@ -267,9 +349,9 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end,
* request is for WB.
*/
if (req_type == _PAGE_CACHE_MODE_WB) {
- u8 mtrr_type;
+ u8 mtrr_type, uniform;
- mtrr_type = mtrr_type_lookup(start, end);
+ mtrr_type = mtrr_type_lookup(start, end, &uniform);
if (mtrr_type != MTRR_TYPE_WRBACK)
return _PAGE_CACHE_MODE_UC_MINUS;
@@ -324,9 +406,14 @@ static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end)
/*
* For RAM pages, we use page flags to mark the pages with appropriate type.
- * Here we do two pass:
- * - Find the memtype of all the pages in the range, look for any conflicts
- * - In case of no conflicts, set the new memtype for pages in the range
+ * The page flags are limited to four types, WB (default), WC, WT and UC-.
+ * WP request fails with -EINVAL, and UC gets redirected to UC-. Setting
+ * a new memory type is only allowed for a page mapped with the default WB
+ * type.
+ *
+ * Here we do two passes:
+ * - Find the memtype of all the pages in the range, look for any conflicts.
+ * - In case of no conflicts, set the new memtype for pages in the range.
*/
static int reserve_ram_pages_type(u64 start, u64 end,
enum page_cache_mode req_type,
@@ -335,6 +422,12 @@ static int reserve_ram_pages_type(u64 start, u64 end,
struct page *page;
u64 pfn;
+ if (req_type == _PAGE_CACHE_MODE_WP) {
+ if (new_type)
+ *new_type = _PAGE_CACHE_MODE_UC_MINUS;
+ return -EINVAL;
+ }
+
if (req_type == _PAGE_CACHE_MODE_UC) {
/* We do not support strong UC */
WARN_ON_ONCE(1);
@@ -346,8 +439,8 @@ static int reserve_ram_pages_type(u64 start, u64 end,
page = pfn_to_page(pfn);
type = get_page_memtype(page);
- if (type != -1) {
- pr_info("reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
+ if (type != _PAGE_CACHE_MODE_WB) {
+ pr_info("x86/PAT: reserve_ram_pages_type failed [mem %#010Lx-%#010Lx], track 0x%x, req 0x%x\n",
start, end - 1, type, req_type);
if (new_type)
*new_type = type;
@@ -373,7 +466,7 @@ static int free_ram_pages_type(u64 start, u64 end)
for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
page = pfn_to_page(pfn);
- set_page_memtype(page, -1);
+ set_page_memtype(page, _PAGE_CACHE_MODE_WB);
}
return 0;
}
@@ -384,6 +477,7 @@ static int free_ram_pages_type(u64 start, u64 end)
* - _PAGE_CACHE_MODE_WC
* - _PAGE_CACHE_MODE_UC_MINUS
* - _PAGE_CACHE_MODE_UC
+ * - _PAGE_CACHE_MODE_WT
*
* If new_type is NULL, function will return an error if it cannot reserve the
* region with req_type. If new_type is non-NULL, function will return
@@ -400,14 +494,10 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
BUG_ON(start >= end); /* end is exclusive */
- if (!pat_enabled) {
+ if (!pat_enabled()) {
/* This is identical to page table setting without PAT */
- if (new_type) {
- if (req_type == _PAGE_CACHE_MODE_WC)
- *new_type = _PAGE_CACHE_MODE_UC_MINUS;
- else
- *new_type = req_type;
- }
+ if (new_type)
+ *new_type = req_type;
return 0;
}
@@ -451,9 +541,9 @@ int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_type,
err = rbt_memtype_check_insert(new, new_type);
if (err) {
- printk(KERN_INFO "reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
- start, end - 1,
- cattr_name(new->type), cattr_name(req_type));
+ pr_info("x86/PAT: reserve_memtype failed [mem %#010Lx-%#010Lx], track %s, req %s\n",
+ start, end - 1,
+ cattr_name(new->type), cattr_name(req_type));
kfree(new);
spin_unlock(&memtype_lock);
@@ -475,7 +565,7 @@ int free_memtype(u64 start, u64 end)
int is_range_ram;
struct memtype *entry;
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
/* Low ISA region is always mapped WB. No need to track */
@@ -497,8 +587,8 @@ int free_memtype(u64 start, u64 end)
spin_unlock(&memtype_lock);
if (!entry) {
- printk(KERN_INFO "%s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
- current->comm, current->pid, start, end - 1);
+ pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n",
+ current->comm, current->pid, start, end - 1);
return -EINVAL;
}
@@ -517,7 +607,7 @@ int free_memtype(u64 start, u64 end)
* Only to be called when PAT is enabled
*
* Returns _PAGE_CACHE_MODE_WB, _PAGE_CACHE_MODE_WC, _PAGE_CACHE_MODE_UC_MINUS
- * or _PAGE_CACHE_MODE_UC
+ * or _PAGE_CACHE_MODE_WT.
*/
static enum page_cache_mode lookup_memtype(u64 paddr)
{
@@ -529,16 +619,9 @@ static enum page_cache_mode lookup_memtype(u64 paddr)
if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
struct page *page;
- page = pfn_to_page(paddr >> PAGE_SHIFT);
- rettype = get_page_memtype(page);
- /*
- * -1 from get_page_memtype() implies RAM page is in its
- * default state and not reserved, and hence of type WB
- */
- if (rettype == -1)
- rettype = _PAGE_CACHE_MODE_WB;
- return rettype;
+ page = pfn_to_page(paddr >> PAGE_SHIFT);
+ return get_page_memtype(page);
}
spin_lock(&memtype_lock);
@@ -623,13 +706,13 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
u64 to = from + size;
u64 cursor = from;
- if (!pat_enabled)
+ if (!pat_enabled())
return 1;
while (cursor < to) {
if (!devmem_is_allowed(pfn)) {
- printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
- current->comm, from, to - 1);
+ pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+ current->comm, from, to - 1);
return 0;
}
cursor += PAGE_SIZE;
@@ -659,7 +742,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
* caching for the high addresses through the KEN pin, but
* we maintain the tradition of paranoia in this code.
*/
- if (!pat_enabled &&
+ if (!pat_enabled() &&
!(boot_cpu_has(X86_FEATURE_MTRR) ||
boot_cpu_has(X86_FEATURE_K6_MTRR) ||
boot_cpu_has(X86_FEATURE_CYRIX_ARR) ||
@@ -698,8 +781,7 @@ int kernel_map_sync_memtype(u64 base, unsigned long size,
size;
if (ioremap_change_attr((unsigned long)__va(base), id_sz, pcm) < 0) {
- printk(KERN_INFO "%s:%d ioremap_change_attr failed %s "
- "for [mem %#010Lx-%#010Lx]\n",
+ pr_info("x86/PAT: %s:%d ioremap_change_attr failed %s for [mem %#010Lx-%#010Lx]\n",
current->comm, current->pid,
cattr_name(pcm),
base, (unsigned long long)(base + size-1));
@@ -729,12 +811,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
* the type requested matches the type of first page in the range.
*/
if (is_ram) {
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
pcm = lookup_memtype(paddr);
if (want_pcm != pcm) {
- printk(KERN_WARNING "%s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
+ pr_warn("x86/PAT: %s:%d map pfn RAM range req %s for [mem %#010Lx-%#010Lx], got %s\n",
current->comm, current->pid,
cattr_name(want_pcm),
(unsigned long long)paddr,
@@ -755,13 +837,12 @@ static int reserve_pfn_range(u64 paddr, unsigned long size, pgprot_t *vma_prot,
if (strict_prot ||
!is_new_memtype_allowed(paddr, size, want_pcm, pcm)) {
free_memtype(paddr, paddr + size);
- printk(KERN_ERR "%s:%d map pfn expected mapping type %s"
- " for [mem %#010Lx-%#010Lx], got %s\n",
- current->comm, current->pid,
- cattr_name(want_pcm),
- (unsigned long long)paddr,
- (unsigned long long)(paddr + size - 1),
- cattr_name(pcm));
+ pr_err("x86/PAT: %s:%d map pfn expected mapping type %s for [mem %#010Lx-%#010Lx], got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_pcm),
+ (unsigned long long)paddr,
+ (unsigned long long)(paddr + size - 1),
+ cattr_name(pcm));
return -EINVAL;
}
/*
@@ -844,7 +925,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot,
return ret;
}
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
/*
@@ -872,7 +953,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
{
enum page_cache_mode pcm;
- if (!pat_enabled)
+ if (!pat_enabled())
return 0;
/* Set prot based on lookup */
@@ -913,14 +994,18 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
pgprot_t pgprot_writecombine(pgprot_t prot)
{
- if (pat_enabled)
- return __pgprot(pgprot_val(prot) |
+ return __pgprot(pgprot_val(prot) |
cachemode2protval(_PAGE_CACHE_MODE_WC));
- else
- return pgprot_noncached(prot);
}
EXPORT_SYMBOL_GPL(pgprot_writecombine);
+pgprot_t pgprot_writethrough(pgprot_t prot)
+{
+ return __pgprot(pgprot_val(prot) |
+ cachemode2protval(_PAGE_CACHE_MODE_WT));
+}
+EXPORT_SYMBOL_GPL(pgprot_writethrough);
+
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
static struct memtype *memtype_get_idx(loff_t pos)
@@ -996,7 +1081,7 @@ static const struct file_operations memtype_fops = {
static int __init pat_memtype_list_init(void)
{
- if (pat_enabled) {
+ if (pat_enabled()) {
debugfs_create_file("pat_memtype_list", S_IRUSR,
arch_debugfs_dir, NULL, &memtype_fops);
}
diff --git a/arch/x86/mm/pat_internal.h b/arch/x86/mm/pat_internal.h
index f6411620305d..a739bfc40690 100644
--- a/arch/x86/mm/pat_internal.h
+++ b/arch/x86/mm/pat_internal.h
@@ -4,7 +4,7 @@
extern int pat_debug_enable;
#define dprintk(fmt, arg...) \
- do { if (pat_debug_enable) printk(KERN_INFO fmt, ##arg); } while (0)
+ do { if (pat_debug_enable) pr_info("x86/PAT: " fmt, ##arg); } while (0)
struct memtype {
u64 start;
diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c
index 6582adcc8bd9..63931080366a 100644
--- a/arch/x86/mm/pat_rbtree.c
+++ b/arch/x86/mm/pat_rbtree.c
@@ -160,9 +160,9 @@ success:
return 0;
failure:
- printk(KERN_INFO "%s:%d conflicting memory types "
- "%Lx-%Lx %s<->%s\n", current->comm, current->pid, start,
- end, cattr_name(found_type), cattr_name(match->type));
+ pr_info("x86/PAT: %s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid, start, end,
+ cattr_name(found_type), cattr_name(match->type));
return -EBUSY;
}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0b97d2c75df3..fb0a9dd1d6e4 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -563,16 +563,31 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
}
#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+/**
+ * pud_set_huge - setup kernel PUD mapping
+ *
+ * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this
+ * function sets up a huge page only if any of the following conditions are met:
+ *
+ * - MTRRs are disabled, or
+ *
+ * - MTRRs are enabled and the range is completely covered by a single MTRR, or
+ *
+ * - MTRRs are enabled and the corresponding MTRR memory type is WB, which
+ * has no effect on the requested PAT memory type.
+ *
+ * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger
+ * page mapping attempt fails.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
{
- u8 mtrr;
+ u8 mtrr, uniform;
- /*
- * Do not use a huge page when the range is covered by non-WB type
- * of MTRRs.
- */
- mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
- if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK))
return 0;
prot = pgprot_4k_2_large(prot);
@@ -584,17 +599,24 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
return 1;
}
+/**
+ * pmd_set_huge - setup kernel PMD mapping
+ *
+ * See text over pud_set_huge() above.
+ *
+ * Returns 1 on success and 0 on failure.
+ */
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
{
- u8 mtrr;
+ u8 mtrr, uniform;
- /*
- * Do not use a huge page when the range is covered by non-WB type
- * of MTRRs.
- */
- mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
- if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+ mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform);
+ if ((mtrr != MTRR_TYPE_INVALID) && (!uniform) &&
+ (mtrr != MTRR_TYPE_WRBACK)) {
+ pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n",
+ __func__, addr, addr + PMD_SIZE);
return 0;
+ }
prot = pgprot_4k_2_large(prot);
@@ -605,6 +627,11 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
return 1;
}
+/**
+ * pud_clear_huge - clear kernel PUD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PUD map is found).
+ */
int pud_clear_huge(pud_t *pud)
{
if (pud_large(*pud)) {
@@ -615,6 +642,11 @@ int pud_clear_huge(pud_t *pud)
return 0;
}
+/**
+ * pmd_clear_huge - clear kernel PMD mapping when it is set
+ *
+ * Returns 1 on success and 0 on failure (no PMD map is found).
+ */
int pmd_clear_huge(pmd_t *pmd)
{
if (pmd_large(*pmd)) {
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index 6440221ced0d..4093216b3791 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -8,7 +8,6 @@
* of the License.
*/
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
/*
* Calling convention :
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 99f76103c6b7..ddeff4844a10 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -966,7 +966,12 @@ void bpf_int_jit_compile(struct bpf_prog *prog)
}
ctx.cleanup_addr = proglen;
- for (pass = 0; pass < 10; pass++) {
+ /* JITed image shrinks with every pass and the loop iterates
+ * until the image stops shrinking. Very large bpf programs
+ * may converge on the last pass. In such case do one more
+ * pass to emit the final image
+ */
+ for (pass = 0; pass < 10 || image; pass++) {
proglen = do_jit(prog, addrs, image, oldproglen, &ctx);
if (proglen <= 0) {
image = NULL;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index d93963340c3c..14a63ed6fe09 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -482,9 +482,16 @@ struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
{
- struct pci_sysdata *sd = bridge->bus->sysdata;
-
- ACPI_COMPANION_SET(&bridge->dev, sd->companion);
+ /*
+ * We pass NULL as parent to pci_create_root_bus(), so if it is not NULL
+ * here, pci_create_root_bus() has been called by someone else and
+ * sysdata is likely to be different from what we expect. Let it go in
+ * that case.
+ */
+ if (!bridge->dev.parent) {
+ struct pci_sysdata *sd = bridge->bus->sysdata;
+ ACPI_COMPANION_SET(&bridge->dev, sd->companion);
+ }
return 0;
}
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 349c0d32cc0b..0a9f2caf358f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -429,12 +429,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
* Caller can followup with UC MINUS request and add a WC mtrr if there
* is a free mtrr slot.
*/
- if (!pat_enabled && write_combine)
+ if (!pat_enabled() && write_combine)
return -EINVAL;
- if (pat_enabled && write_combine)
+ if (pat_enabled() && write_combine)
prot |= cachemode2protval(_PAGE_CACHE_MODE_WC);
- else if (pat_enabled || boot_cpu_data.x86 > 3)
+ else if (pat_enabled() || boot_cpu_data.x86 > 3)
/*
* ioremap() and ioremap_nocache() defaults to UC MINUS for now.
* To avoid attribute conflicts, request UC MINUS here
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
index a62e0be3a2f1..f1a6c8e86ddd 100644
--- a/arch/x86/platform/Makefile
+++ b/arch/x86/platform/Makefile
@@ -1,4 +1,5 @@
# Platform specific code goes here
+obj-y += atom/
obj-y += ce4100/
obj-y += efi/
obj-y += geode/
diff --git a/arch/x86/platform/atom/Makefile b/arch/x86/platform/atom/Makefile
new file mode 100644
index 000000000000..0a3a40cbc794
--- /dev/null
+++ b/arch/x86/platform/atom/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_PUNIT_ATOM_DEBUG) += punit_atom_debug.o
diff --git a/arch/x86/platform/atom/punit_atom_debug.c b/arch/x86/platform/atom/punit_atom_debug.c
new file mode 100644
index 000000000000..5ca8ead91579
--- /dev/null
+++ b/arch/x86/platform/atom/punit_atom_debug.c
@@ -0,0 +1,183 @@
+/*
+ * Intel SOC Punit device state debug driver
+ * Punit controls power management for North Complex devices (Graphics
+ * blocks, Image Signal Processing, video processing, display, DSP etc.)
+ *
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/io.h>
+#include <asm/cpu_device_id.h>
+#include <asm/iosf_mbi.h>
+
+/* Side band Interface port */
+#define PUNIT_PORT 0x04
+/* Power gate status reg */
+#define PWRGT_STATUS 0x61
+/* Subsystem config/status Video processor */
+#define VED_SS_PM0 0x32
+/* Subsystem config/status ISP (Image Signal Processor) */
+#define ISP_SS_PM0 0x39
+/* Subsystem config/status Input/output controller */
+#define MIO_SS_PM 0x3B
+/* Shift bits for getting status for video, isp and i/o */
+#define SSS_SHIFT 24
+/* Shift bits for getting status for graphics rendering */
+#define RENDER_POS 0
+/* Shift bits for getting status for media control */
+#define MEDIA_POS 2
+/* Shift bits for getting status for Valley View/Baytrail display */
+#define VLV_DISPLAY_POS 6
+/* Subsystem config/status display for Cherry Trail SOC */
+#define CHT_DSP_SSS 0x36
+/* Shift bits for getting status for display */
+#define CHT_DSP_SSS_POS 16
+
+struct punit_device {
+ char *name;
+ int reg;
+ int sss_pos;
+};
+
+static const struct punit_device punit_device_byt[] = {
+ { "GFX RENDER", PWRGT_STATUS, RENDER_POS },
+ { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS },
+ { "DISPLAY", PWRGT_STATUS, VLV_DISPLAY_POS },
+ { "VED", VED_SS_PM0, SSS_SHIFT },
+ { "ISP", ISP_SS_PM0, SSS_SHIFT },
+ { "MIO", MIO_SS_PM, SSS_SHIFT },
+ { NULL }
+};
+
+static const struct punit_device punit_device_cht[] = {
+ { "GFX RENDER", PWRGT_STATUS, RENDER_POS },
+ { "GFX MEDIA", PWRGT_STATUS, MEDIA_POS },
+ { "DISPLAY", CHT_DSP_SSS, CHT_DSP_SSS_POS },
+ { "VED", VED_SS_PM0, SSS_SHIFT },
+ { "ISP", ISP_SS_PM0, SSS_SHIFT },
+ { "MIO", MIO_SS_PM, SSS_SHIFT },
+ { NULL }
+};
+
+static const char * const dstates[] = {"D0", "D0i1", "D0i2", "D0i3"};
+
+static int punit_dev_state_show(struct seq_file *seq_file, void *unused)
+{
+ u32 punit_pwr_status;
+ struct punit_device *punit_devp = seq_file->private;
+ int index;
+ int status;
+
+ seq_puts(seq_file, "\n\nPUNIT NORTH COMPLEX DEVICES :\n");
+ while (punit_devp->name) {
+ status = iosf_mbi_read(PUNIT_PORT, BT_MBI_PMC_READ,
+ punit_devp->reg,
+ &punit_pwr_status);
+ if (status) {
+ seq_printf(seq_file, "%9s : Read Failed\n",
+ punit_devp->name);
+ } else {
+ index = (punit_pwr_status >> punit_devp->sss_pos) & 3;
+ seq_printf(seq_file, "%9s : %s\n", punit_devp->name,
+ dstates[index]);
+ }
+ punit_devp++;
+ }
+
+ return 0;
+}
+
+static int punit_dev_state_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, punit_dev_state_show, inode->i_private);
+}
+
+static const struct file_operations punit_dev_state_ops = {
+ .open = punit_dev_state_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static struct dentry *punit_dbg_file;
+
+static int punit_dbgfs_register(struct punit_device *punit_device)
+{
+ static struct dentry *dev_state;
+
+ punit_dbg_file = debugfs_create_dir("punit_atom", NULL);
+ if (!punit_dbg_file)
+ return -ENXIO;
+
+ dev_state = debugfs_create_file("dev_power_state", S_IFREG | S_IRUGO,
+ punit_dbg_file, punit_device,
+ &punit_dev_state_ops);
+ if (!dev_state) {
+ pr_err("punit_dev_state register failed\n");
+ debugfs_remove(punit_dbg_file);
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static void punit_dbgfs_unregister(void)
+{
+ debugfs_remove_recursive(punit_dbg_file);
+}
+
+#define ICPU(model, drv_data) \
+ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT,\
+ (kernel_ulong_t)&drv_data }
+
+static const struct x86_cpu_id intel_punit_cpu_ids[] = {
+ ICPU(55, punit_device_byt), /* Valleyview, Bay Trail */
+ ICPU(76, punit_device_cht), /* Braswell, Cherry Trail */
+ {}
+};
+
+MODULE_DEVICE_TABLE(x86cpu, intel_punit_cpu_ids);
+
+static int __init punit_atom_debug_init(void)
+{
+ const struct x86_cpu_id *id;
+ int ret;
+
+ id = x86_match_cpu(intel_punit_cpu_ids);
+ if (!id)
+ return -ENODEV;
+
+ ret = punit_dbgfs_register((struct punit_device *)id->driver_data);
+ if (ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static void __exit punit_atom_debug_exit(void)
+{
+ punit_dbgfs_unregister();
+}
+
+module_init(punit_atom_debug_init);
+module_exit(punit_atom_debug_exit);
+
+MODULE_AUTHOR("Kumar P, Mahesh <mahesh.kumar.p@intel.com>");
+MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>");
+MODULE_DESCRIPTION("Driver for Punit devices states debugging");
+MODULE_LICENSE("GPL v2");
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index acb384d24669..a8fecc226946 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -26,7 +26,7 @@ else
obj-y += syscalls_64.o vdso/
-subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \
+subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \
../lib/rwsem.o
endif
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 7e8a1a650435..b9531d343134 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -39,7 +39,8 @@
#define smp_mb() barrier()
#define smp_rmb() barrier()
#define smp_wmb() barrier()
-#define set_mb(var, value) do { var = value; barrier(); } while (0)
+
+#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0)
#define read_barrier_depends() do { } while (0)
#define smp_read_barrier_depends() do { } while (0)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index fe969ac1c65e..a8f57a94785a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1468,6 +1468,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
{
struct physdev_set_iopl set_iopl;
unsigned long initrd_start = 0;
+ u64 pat;
int rc;
if (!xen_start_info)
@@ -1575,8 +1576,8 @@ asmlinkage __visible void __init xen_start_kernel(void)
* Modify the cache mode translation tables to match Xen's PAT
* configuration.
*/
-
- pat_init_cache_modes();
+ rdmsrl(MSR_IA32_CR_PAT, pat);
+ pat_init_cache_modes(pat);
/* keep using Xen gdt for now; no urgent need to change it */
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b47124d4cd67..8b7f18e200aa 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -67,6 +67,7 @@
#include <linux/seq_file.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <asm/cache.h>
#include <asm/setup.h>
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 956374c1edbc..9e2ba5c6e1dd 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -17,6 +17,56 @@
#include "xen-ops.h"
#include "debugfs.h"
+static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
+static DEFINE_PER_CPU(char *, irq_name);
+static bool xen_pvspin = true;
+
+#ifdef CONFIG_QUEUED_SPINLOCKS
+
+#include <asm/qspinlock.h>
+
+static void xen_qlock_kick(int cpu)
+{
+ xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
+}
+
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void xen_qlock_wait(u8 *byte, u8 val)
+{
+ int irq = __this_cpu_read(lock_kicker_irq);
+
+ /* If kicker interrupts not initialized yet, just spin */
+ if (irq == -1)
+ return;
+
+ /* clear pending */
+ xen_clear_irq_pending(irq);
+ barrier();
+
+ /*
+ * We check the byte value after clearing pending IRQ to make sure
+ * that we won't miss a wakeup event because of the clearing.
+ *
+ * The sync_clear_bit() call in xen_clear_irq_pending() is atomic.
+ * So it is effectively a memory barrier for x86.
+ */
+ if (READ_ONCE(*byte) != val)
+ return;
+
+ /*
+ * If an interrupt happens here, it will leave the wakeup irq
+ * pending, which will cause xen_poll_irq() to return
+ * immediately.
+ */
+
+ /* Block until irq becomes pending (or perhaps a spurious wakeup) */
+ xen_poll_irq(irq);
+}
+
+#else /* CONFIG_QUEUED_SPINLOCKS */
+
enum xen_contention_stat {
TAKEN_SLOW,
TAKEN_SLOW_PICKUP,
@@ -100,12 +150,9 @@ struct xen_lock_waiting {
__ticket_t want;
};
-static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
-static DEFINE_PER_CPU(char *, irq_name);
static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting);
static cpumask_t waiting_cpus;
-static bool xen_pvspin = true;
__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want)
{
int irq = __this_cpu_read(lock_kicker_irq);
@@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next)
}
}
}
+#endif /* CONFIG_QUEUED_SPINLOCKS */
static irqreturn_t dummy_handler(int irq, void *dev_id)
{
@@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void)
return;
}
printk(KERN_DEBUG "xen: PV spinlocks enabled\n");
+#ifdef CONFIG_QUEUED_SPINLOCKS
+ __pv_init_lock_hash();
+ pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
+ pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
+ pv_lock_ops.wait = xen_qlock_wait;
+ pv_lock_ops.kick = xen_qlock_kick;
+#else
pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning);
pv_lock_ops.unlock_kick = xen_unlock_kick;
+#endif
}
/*
@@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg)
}
early_param("xen_nopvspin", xen_parse_nopvspin);
-#ifdef CONFIG_XEN_DEBUG_FS
+#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS)
static struct dentry *d_spin_debug;
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 04529e620559..f22667abf7b9 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -114,7 +114,7 @@ RELOC(xen_sysret32, 1b+1)
/* Normal 64-bit system call target */
ENTRY(xen_syscall_target)
undo_xen_syscall
- jmp system_call_after_swapgs
+ jmp entry_SYSCALL_64_after_swapgs
ENDPROC(xen_syscall_target)
#ifdef CONFIG_IA32_EMULATION
@@ -122,13 +122,13 @@ ENDPROC(xen_syscall_target)
/* 32-bit compat syscall target */
ENTRY(xen_syscall32_target)
undo_xen_syscall
- jmp ia32_cstar_target
+ jmp entry_SYSCALL_compat
ENDPROC(xen_syscall32_target)
/* 32-bit compat sysenter target */
ENTRY(xen_sysenter_target)
undo_xen_syscall
- jmp ia32_sysenter_target
+ jmp entry_SYSENTER_compat
ENDPROC(xen_sysenter_target)
#else /* !CONFIG_IA32_EMULATION */