
Merge branch 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 irq updates from Ingo Molnar:
 "Here are the main changes in this tree:

   - Introduce x86-64 IRQ/exception/debug stack guard pages to detect
     stack overflows immediately and deterministically.

   - Clean up over a decade worth of cruft accumulated.

  The outcome of this should be more clear-cut faults/crashes when any
  of the low level x86 CPU stacks overflow, instead of silent memory
  corruption and sporadic failures much later on"
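
For readers unfamiliar with the guard-page trick, here is a rough userspace analogy (not the kernel implementation, just an illustration of why an overflow now faults immediately): map an inaccessible page directly below the stack, so the first store past the bottom takes a page fault instead of scribbling over adjacent data.

    #include <stddef.h>
    #include <sys/mman.h>

    /*
     * Illustration only (userspace analogy, assumes 4K pages): a stack with
     * an inaccessible guard page below it. A write that runs past the bottom
     * of the stack hits the guard page and faults (SIGSEGV) at once, instead
     * of silently corrupting whatever happens to be mapped next to it - the
     * failure mode this series gets rid of for the kernel's CPU stacks.
     */
    static void *alloc_guarded_stack(size_t stacksize)
    {
            const size_t guard = 4096;
            char *base = mmap(NULL, stacksize + guard, PROT_NONE,
                              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (base == MAP_FAILED)
                    return NULL;

            /* Open up the stack itself; the lowest page stays PROT_NONE. */
            if (mprotect(base + guard, stacksize, PROT_READ | PROT_WRITE))
                    return NULL;

            return base + guard + stacksize; /* stack top, grows downwards */
    }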

* 'x86-irq-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  x86/irq: Fix outdated comments
  x86/irq/64: Remove stack overflow debug code
  x86/irq/64: Remap the IRQ stack with guard pages
  x86/irq/64: Split the IRQ stack into its own pages
  x86/irq/64: Init hardirq_stack_ptr during CPU hotplug
  x86/irq/32: Handle irq stack allocation failure proper
  x86/irq/32: Invoke irq_ctx_init() from init_IRQ()
  x86/irq/64: Rename irq_stack_ptr to hardirq_stack_ptr
  x86/irq/32: Rename hard/softirq_stack to hard/softirq_stack_ptr
  x86/irq/32: Make irq stack a character array
  x86/irq/32: Define IRQ_STACK_SIZE
  x86/dumpstack/64: Speedup in_exception_stack()
  x86/exceptions: Split debug IST stack
  x86/exceptions: Enable IST guard pages
  x86/exceptions: Disconnect IST index and stack order
  x86/cpu: Remove orig_ist array
  x86/cpu: Prepare TSS.IST setup for guard pages
  x86/dumpstack/64: Use cpu_entry_area instead of orig_ist
  x86/irq/64: Use cpu entry area instead of orig_ist
  x86/traps: Use cpu_entry_area instead of orig_ist
  ...
Linus Torvalds 2019-05-06 15:56:41 -07:00
commit 8f14772703
33 changed files with 382 additions and 322 deletions

View File

@ -59,7 +59,7 @@ If that assumption is ever broken then the stacks will become corrupt.
The currently assigned IST stacks are :- The currently assigned IST stacks are :-
* DOUBLEFAULT_STACK. EXCEPTION_STKSZ (PAGE_SIZE). * ESTACK_DF. EXCEPTION_STKSZ (PAGE_SIZE).
Used for interrupt 8 - Double Fault Exception (#DF). Used for interrupt 8 - Double Fault Exception (#DF).
@ -68,7 +68,7 @@ The currently assigned IST stacks are :-
Using a separate stack allows the kernel to recover from it well enough Using a separate stack allows the kernel to recover from it well enough
in many cases to still output an oops. in many cases to still output an oops.
* NMI_STACK. EXCEPTION_STKSZ (PAGE_SIZE). * ESTACK_NMI. EXCEPTION_STKSZ (PAGE_SIZE).
Used for non-maskable interrupts (NMI). Used for non-maskable interrupts (NMI).
@ -76,7 +76,7 @@ The currently assigned IST stacks are :-
middle of switching stacks. Using IST for NMI events avoids making middle of switching stacks. Using IST for NMI events avoids making
assumptions about the previous state of the kernel stack. assumptions about the previous state of the kernel stack.
* DEBUG_STACK. DEBUG_STKSZ * ESTACK_DB. EXCEPTION_STKSZ (PAGE_SIZE).
Used for hardware debug interrupts (interrupt 1) and for software Used for hardware debug interrupts (interrupt 1) and for software
debug interrupts (INT3). debug interrupts (INT3).
@ -86,7 +86,12 @@ The currently assigned IST stacks are :-
avoids making assumptions about the previous state of the kernel avoids making assumptions about the previous state of the kernel
stack. stack.
* MCE_STACK. EXCEPTION_STKSZ (PAGE_SIZE). To handle nested #DB correctly there exist two instances of DB stacks. On
#DB entry the IST stackpointer for #DB is switched to the second instance
so a nested #DB starts from a clean stack. The nested #DB switches
the IST stackpointer to a guard hole to catch triple nesting.
* ESTACK_MCE. EXCEPTION_STKSZ (PAGE_SIZE).
Used for interrupt 18 - Machine Check Exception (#MC). Used for interrupt 18 - Machine Check Exception (#MC).
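
A hedged C-flavoured rendering of the #DB stack switch described above (the real code is the shift_ist/ist_offset handling in the entry_64.S hunk further down; handle_debug() is only a placeholder):

    /*
     * Sketch, not the actual entry code. DB_STACK_OFFSET is the distance
     * between the DB and DB1 stacks, so subtracting it points the IST slot
     * at the second (DB1) stack for the duration of the handler; a nested
     * #DB repeats the shift and a third level would land in the unmapped
     * DB2 hole and fault.
     */
    this_cpu_sub(cpu_tss_rw.x86_tss.ist[IST_INDEX_DB], DB_STACK_OFFSET);
    handle_debug();                         /* placeholder for do_debug() */
    this_cpu_add(cpu_tss_rw.x86_tss.ist[IST_INDEX_DB], DB_STACK_OFFSET);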

View File

@ -14,6 +14,7 @@ config X86_32
select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IPC_PARSE_VERSION
select CLKSRC_I8253 select CLKSRC_I8253
select CLONE_BACKWARDS select CLONE_BACKWARDS
select HAVE_DEBUG_STACKOVERFLOW
select MODULES_USE_ELF_REL select MODULES_USE_ELF_REL
select OLD_SIGACTION select OLD_SIGACTION
@ -138,7 +139,6 @@ config X86
select HAVE_COPY_THREAD_TLS select HAVE_COPY_THREAD_TLS
select HAVE_C_RECORDMCOUNT select HAVE_C_RECORDMCOUNT
select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DMA_CONTIGUOUS select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE
select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_DYNAMIC_FTRACE_WITH_REGS

View File

@ -298,7 +298,7 @@ ENTRY(__switch_to_asm)
#ifdef CONFIG_STACKPROTECTOR #ifdef CONFIG_STACKPROTECTOR
movq TASK_stack_canary(%rsi), %rbx movq TASK_stack_canary(%rsi), %rbx
movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset
#endif #endif
#ifdef CONFIG_RETPOLINE #ifdef CONFIG_RETPOLINE
@ -430,8 +430,8 @@ END(irq_entries_start)
* it before we actually move ourselves to the IRQ stack. * it before we actually move ourselves to the IRQ stack.
*/ */
movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
movq PER_CPU_VAR(irq_stack_ptr), %rsp movq PER_CPU_VAR(hardirq_stack_ptr), %rsp
#ifdef CONFIG_DEBUG_ENTRY #ifdef CONFIG_DEBUG_ENTRY
/* /*
@ -840,7 +840,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
/* /*
* Exception entry points. * Exception entry points.
*/ */
#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
/** /**
* idtentry - Generate an IDT entry stub * idtentry - Generate an IDT entry stub
@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
* @paranoid == 2 is special: the stub will never switch stacks. This is for * @paranoid == 2 is special: the stub will never switch stacks. This is for
* #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
*/ */
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0
ENTRY(\sym) ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8 UNWIND_HINT_IRET_REGS offset=\has_error_code*8
@ -924,13 +924,13 @@ ENTRY(\sym)
.endif .endif
.if \shift_ist != -1 .if \shift_ist != -1
subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) subq $\ist_offset, CPU_TSS_IST(\shift_ist)
.endif .endif
call \do_sym call \do_sym
.if \shift_ist != -1 .if \shift_ist != -1
addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) addq $\ist_offset, CPU_TSS_IST(\shift_ist)
.endif .endif
/* these procedures expect "no swapgs" flag in ebx */ /* these procedures expect "no swapgs" flag in ebx */
@ -1128,7 +1128,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
hv_stimer0_callback_vector hv_stimer0_vector_handler hv_stimer0_callback_vector hv_stimer0_vector_handler
#endif /* CONFIG_HYPERV */ #endif /* CONFIG_HYPERV */
idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
idtentry int3 do_int3 has_error_code=0 idtentry int3 do_int3 has_error_code=0
idtentry stack_segment do_stack_segment has_error_code=1 idtentry stack_segment do_stack_segment has_error_code=1

View File

@ -7,6 +7,64 @@
#include <asm/processor.h> #include <asm/processor.h>
#include <asm/intel_ds.h> #include <asm/intel_ds.h>
#ifdef CONFIG_X86_64
/* Macro to enforce the same ordering and stack sizes */
#define ESTACKS_MEMBERS(guardsize, db2_holesize)\
char DF_stack_guard[guardsize]; \
char DF_stack[EXCEPTION_STKSZ]; \
char NMI_stack_guard[guardsize]; \
char NMI_stack[EXCEPTION_STKSZ]; \
char DB2_stack_guard[guardsize]; \
char DB2_stack[db2_holesize]; \
char DB1_stack_guard[guardsize]; \
char DB1_stack[EXCEPTION_STKSZ]; \
char DB_stack_guard[guardsize]; \
char DB_stack[EXCEPTION_STKSZ]; \
char MCE_stack_guard[guardsize]; \
char MCE_stack[EXCEPTION_STKSZ]; \
char IST_top_guard[guardsize]; \
/* The exception stacks' physical storage. No guard pages required */
struct exception_stacks {
ESTACKS_MEMBERS(0, 0)
};
/* The effective cpu entry area mapping with guard pages. */
struct cea_exception_stacks {
ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ)
};
/*
* The exception stack ordering in [cea_]exception_stacks
*/
enum exception_stack_ordering {
ESTACK_DF,
ESTACK_NMI,
ESTACK_DB2,
ESTACK_DB1,
ESTACK_DB,
ESTACK_MCE,
N_EXCEPTION_STACKS
};
#define CEA_ESTACK_SIZE(st) \
sizeof(((struct cea_exception_stacks *)0)->st## _stack)
#define CEA_ESTACK_BOT(ceastp, st) \
((unsigned long)&(ceastp)->st## _stack)
#define CEA_ESTACK_TOP(ceastp, st) \
(CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st))
#define CEA_ESTACK_OFFS(st) \
offsetof(struct cea_exception_stacks, st## _stack)
#define CEA_ESTACK_PAGES \
(sizeof(struct cea_exception_stacks) / PAGE_SIZE)
#endif
/* /*
* cpu_entry_area is a percpu region that contains things needed by the CPU * cpu_entry_area is a percpu region that contains things needed by the CPU
* and early entry/exit code. Real types aren't used for all fields here * and early entry/exit code. Real types aren't used for all fields here
@ -32,12 +90,9 @@ struct cpu_entry_area {
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* /*
* Exception stacks used for IST entries. * Exception stacks used for IST entries with guard pages.
*
* In the future, this should have a separate slot for each stack
* with guard pages between them.
*/ */
char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; struct cea_exception_stacks estacks;
#endif #endif
#ifdef CONFIG_CPU_SUP_INTEL #ifdef CONFIG_CPU_SUP_INTEL
/* /*
@ -57,6 +112,7 @@ struct cpu_entry_area {
#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
extern void setup_cpu_entry_areas(void); extern void setup_cpu_entry_areas(void);
extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
@ -76,4 +132,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu)
return &get_cpu_entry_area(cpu)->entry_stack_page.stack; return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
} }
#define __this_cpu_ist_top_va(name) \
CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name)
#endif #endif
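
For orientation, a hedged back-of-the-envelope evaluation of these helpers, assuming 4K pages and KASAN disabled (so EXCEPTION_STKSZ == PAGE_SIZE == 4096; KASAN changes the stack orders):

    /*
     * Assumed values only:
     *
     *   CEA_ESTACK_OFFS(DF)  ==  4096   (each stack sits above its guard page)
     *   CEA_ESTACK_OFFS(DB1) == 28672
     *   CEA_ESTACK_OFFS(DB)  == 36864
     *   CEA_ESTACK_SIZE(DB)  ==  4096
     *   CEA_ESTACK_PAGES     ==    13   (6 stack slots + 7 guard pages;
     *                                    the DB2 slot is never mapped)
     *   DB_STACK_OFFSET      ==  8192   (DB_stack minus DB1_stack, cf. the
     *                                    asm-offsets.c hunk below)
     *
     * __this_cpu_ist_top_va(DF) therefore resolves to the first byte above
     * this CPU's #DF stack in the cpu entry area - the value that cpu_init()
     * later writes into tss.ist[IST_INDEX_DF].
     */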

View File

@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void)
{ {
__this_cpu_dec(debug_stack_usage); __this_cpu_dec(debug_stack_usage);
} }
int is_debug_stack(unsigned long addr);
void debug_stack_set_zero(void); void debug_stack_set_zero(void);
void debug_stack_reset(void); void debug_stack_reset(void);
#else /* !X86_64 */ #else /* !X86_64 */
static inline int is_debug_stack(unsigned long addr) { return 0; }
static inline void debug_stack_set_zero(void) { } static inline void debug_stack_set_zero(void) { }
static inline void debug_stack_reset(void) { } static inline void debug_stack_reset(void) { }
static inline void debug_stack_usage_inc(void) { } static inline void debug_stack_usage_inc(void) { }

View File

@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq)
return ((irq == 2) ? 9 : irq); return ((irq == 2) ? 9 : irq);
} }
#ifdef CONFIG_X86_32 extern int irq_init_percpu_irqstack(unsigned int cpu);
extern void irq_ctx_init(int cpu);
#else
# define irq_ctx_init(cpu) do { } while (0)
#endif
#define __ARCH_HAS_DO_SOFTIRQ #define __ARCH_HAS_DO_SOFTIRQ

View File

@ -18,8 +18,8 @@
* Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
* Vectors 32 ... 127 : device interrupts * Vectors 32 ... 127 : device interrupts
* Vector 128 : legacy int80 syscall interface * Vector 128 : legacy int80 syscall interface
* Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts * Vectors 129 ... LOCAL_TIMER_VECTOR-1
* Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts
* *
* 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
* *

View File

@ -22,11 +22,9 @@
#define THREAD_SIZE_ORDER 1 #define THREAD_SIZE_ORDER 1
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define DOUBLEFAULT_STACK 1 #define IRQ_STACK_SIZE THREAD_SIZE
#define NMI_STACK 0
#define DEBUG_STACK 0 #define N_EXCEPTION_STACKS 1
#define MCE_STACK 0
#define N_EXCEPTION_STACKS 1
#ifdef CONFIG_X86_PAE #ifdef CONFIG_X86_PAE
/* /*

View File

@ -14,22 +14,20 @@
#define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER)
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
#define CURRENT_MASK (~(THREAD_SIZE - 1))
#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
#define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
#define DOUBLEFAULT_STACK 1 /*
#define NMI_STACK 2 * The index for the tss.ist[] array. The hardware limit is 7 entries.
#define DEBUG_STACK 3 */
#define MCE_STACK 4 #define IST_INDEX_DF 0
#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define IST_INDEX_NMI 1
#define IST_INDEX_DB 2
#define IST_INDEX_MCE 3
/* /*
* Set __PAGE_OFFSET to the most negative possible address + * Set __PAGE_OFFSET to the most negative possible address +

View File

@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw);
#define __KERNEL_TSS_LIMIT \ #define __KERNEL_TSS_LIMIT \
(IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
/* Per CPU interrupt stacks */
struct irq_stack {
char stack[IRQ_STACK_SIZE];
} __aligned(IRQ_STACK_SIZE);
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#else #else
@ -374,38 +381,25 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
#endif #endif
/*
* Save the original ist values for checking stack pointers during debugging
*/
struct orig_ist {
unsigned long ist[7];
};
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
DECLARE_PER_CPU(struct orig_ist, orig_ist); struct fixed_percpu_data {
union irq_stack_union {
char irq_stack[IRQ_STACK_SIZE];
/* /*
* GCC hardcodes the stack canary as %gs:40. Since the * GCC hardcodes the stack canary as %gs:40. Since the
* irq_stack is the object at %gs:0, we reserve the bottom * irq_stack is the object at %gs:0, we reserve the bottom
* 48 bytes of the irq stack for the canary. * 48 bytes of the irq stack for the canary.
*/ */
struct { char gs_base[40];
char gs_base[40]; unsigned long stack_canary;
unsigned long stack_canary;
};
}; };
DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible;
DECLARE_INIT_PER_CPU(irq_stack_union); DECLARE_INIT_PER_CPU(fixed_percpu_data);
static inline unsigned long cpu_kernelmode_gs_base(int cpu) static inline unsigned long cpu_kernelmode_gs_base(int cpu)
{ {
return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
} }
DECLARE_PER_CPU(char *, irq_stack_ptr);
DECLARE_PER_CPU(unsigned int, irq_count); DECLARE_PER_CPU(unsigned int, irq_count);
extern asmlinkage void ignore_sysret(void); extern asmlinkage void ignore_sysret(void);
@ -427,15 +421,8 @@ struct stack_canary {
}; };
DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif #endif
/* /* Per CPU softirq stack pointer */
* per-CPU IRQ handling stacks DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
*/
struct irq_stack {
u32 stack[THREAD_SIZE/sizeof(u32)];
} __aligned(THREAD_SIZE);
DECLARE_PER_CPU(struct irq_stack *, hardirq_stack);
DECLARE_PER_CPU(struct irq_stack *, softirq_stack);
#endif /* X86_64 */ #endif /* X86_64 */
extern unsigned int fpu_kernel_xstate_size; extern unsigned int fpu_kernel_xstate_size;

View File

@ -131,7 +131,7 @@ void native_smp_prepare_boot_cpu(void);
void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_prepare_cpus(unsigned int max_cpus);
void calculate_max_logical_packages(void); void calculate_max_logical_packages(void);
void native_smp_cpus_done(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus);
void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
int native_cpu_disable(void); int native_cpu_disable(void);
int common_cpu_die(unsigned int cpu); int common_cpu_die(unsigned int cpu);

View File

@ -13,7 +13,7 @@
* On x86_64, %gs is shared by percpu area and stack canary. All * On x86_64, %gs is shared by percpu area and stack canary. All
* percpu symbols are zero based and %gs points to the base of percpu * percpu symbols are zero based and %gs points to the base of percpu
* area. The first occupant of the percpu area is always * area. The first occupant of the percpu area is always
* irq_stack_union which contains stack_canary at offset 40. Userland * fixed_percpu_data which contains stack_canary at offset 40. Userland
* %gs is always saved and restored on kernel entry and exit using * %gs is always saved and restored on kernel entry and exit using
* swapgs, so stack protector doesn't add any complexity there. * swapgs, so stack protector doesn't add any complexity there.
* *
@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void)
u64 tsc; u64 tsc;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40);
#endif #endif
/* /*
* We both use the random pool and the current TSC as a source * We both use the random pool and the current TSC as a source
@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void)
current->stack_canary = canary; current->stack_canary = canary;
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
this_cpu_write(irq_stack_union.stack_canary, canary); this_cpu_write(fixed_percpu_data.stack_canary, canary);
#else #else
this_cpu_write(stack_canary.canary, canary); this_cpu_write(stack_canary.canary, canary);
#endif #endif

View File

@ -9,6 +9,8 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/ptrace.h> #include <linux/ptrace.h>
#include <asm/cpu_entry_area.h>
#include <asm/switch_to.h> #include <asm/switch_to.h>
enum stack_type { enum stack_type {

View File

@ -68,10 +68,12 @@ int main(void)
#undef ENTRY #undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_ist, tss_struct, x86_tss.ist);
DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
offsetof(struct cea_exception_stacks, DB1_stack));
BLANK(); BLANK();
#ifdef CONFIG_STACKPROTECTOR #ifdef CONFIG_STACKPROTECTOR
DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary));
BLANK(); BLANK();
#endif #endif

View File

@ -507,19 +507,6 @@ void load_percpu_segment(int cpu)
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
#endif #endif
#ifdef CONFIG_X86_64
/*
* Special IST stacks which the CPU switches to when it calls
* an IST-marked descriptor entry. Up to 7 stacks (hardware
* limit), all of them are 4K, except the debug stack which
* is 8K.
*/
static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
#endif
/* Load the original GDT from the per-cpu structure */ /* Load the original GDT from the per-cpu structure */
void load_direct_gdt(int cpu) void load_direct_gdt(int cpu)
{ {
@ -1511,9 +1498,9 @@ static __init int setup_clearcpuid(char *arg)
__setup("clearcpuid=", setup_clearcpuid); __setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union, DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
irq_stack_union) __aligned(PAGE_SIZE) __visible; fixed_percpu_data) __aligned(PAGE_SIZE) __visible;
EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
/* /*
* The following percpu variables are hot. Align current_task to * The following percpu variables are hot. Align current_task to
@ -1523,9 +1510,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task; &init_task;
EXPORT_PER_CPU_SYMBOL(current_task); EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE;
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
@ -1562,23 +1547,7 @@ void syscall_init(void)
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
} }
/*
* Copies of the original ist values from the tss are only accessed during
* debugging, no special alignment required.
*/
DEFINE_PER_CPU(struct orig_ist, orig_ist);
static DEFINE_PER_CPU(unsigned long, debug_stack_addr);
DEFINE_PER_CPU(int, debug_stack_usage); DEFINE_PER_CPU(int, debug_stack_usage);
int is_debug_stack(unsigned long addr)
{
return __this_cpu_read(debug_stack_usage) ||
(addr <= __this_cpu_read(debug_stack_addr) &&
addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ));
}
NOKPROBE_SYMBOL(is_debug_stack);
DEFINE_PER_CPU(u32, debug_idt_ctr); DEFINE_PER_CPU(u32, debug_idt_ctr);
void debug_stack_set_zero(void) void debug_stack_set_zero(void)
@ -1690,17 +1659,14 @@ static void setup_getcpu(int cpu)
* initialized (naturally) in the bootstrap process, such as the GDT * initialized (naturally) in the bootstrap process, such as the GDT
* and IDT. We reload them nevertheless, this function acts as a * and IDT. We reload them nevertheless, this function acts as a
* 'CPU state barrier', nothing should get across. * 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init for 64 bit
*/ */
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
void cpu_init(void) void cpu_init(void)
{ {
struct orig_ist *oist; int cpu = raw_smp_processor_id();
struct task_struct *me; struct task_struct *me;
struct tss_struct *t; struct tss_struct *t;
unsigned long v;
int cpu = raw_smp_processor_id();
int i; int i;
wait_for_master_cpu(cpu); wait_for_master_cpu(cpu);
@ -1715,7 +1681,6 @@ void cpu_init(void)
load_ucode_ap(); load_ucode_ap();
t = &per_cpu(cpu_tss_rw, cpu); t = &per_cpu(cpu_tss_rw, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
if (this_cpu_read(numa_node) == 0 && if (this_cpu_read(numa_node) == 0 &&
@ -1753,16 +1718,11 @@ void cpu_init(void)
/* /*
* set up and load the per-CPU TSS * set up and load the per-CPU TSS
*/ */
if (!oist->ist[0]) { if (!t->x86_tss.ist[0]) {
char *estacks = get_cpu_entry_area(cpu)->exception_stacks; t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF);
t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI);
for (v = 0; v < N_EXCEPTION_STACKS; v++) { t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB);
estacks += exception_stack_sizes[v]; t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE);
oist->ist[v] = t->x86_tss.ist[v] =
(unsigned long)estacks;
if (v == DEBUG_STACK-1)
per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks;
}
} }
t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;

View File

@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type)
static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
{ {
unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/* /*
* This is a software stack, so 'end' can be a valid stack pointer. * This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty. * It just means the stack is empty.
*/ */
if (stack <= begin || stack > end) if (stack < begin || stack > end)
return false; return false;
info->type = STACK_TYPE_IRQ; info->type = STACK_TYPE_IRQ;
@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
{ {
unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr);
unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); unsigned long *end = begin + (THREAD_SIZE / sizeof(long));
/* /*
* This is a software stack, so 'end' can be a valid stack pointer. * This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty. * It just means the stack is empty.
*/ */
if (stack <= begin || stack > end) if (stack < begin || stack > end)
return false; return false;
info->type = STACK_TYPE_SOFTIRQ; info->type = STACK_TYPE_SOFTIRQ;

View File

@ -16,23 +16,21 @@
#include <linux/bug.h> #include <linux/bug.h>
#include <linux/nmi.h> #include <linux/nmi.h>
#include <asm/cpu_entry_area.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
static char *exception_stack_names[N_EXCEPTION_STACKS] = { static const char * const exception_stack_names[] = {
[ DOUBLEFAULT_STACK-1 ] = "#DF", [ ESTACK_DF ] = "#DF",
[ NMI_STACK-1 ] = "NMI", [ ESTACK_NMI ] = "NMI",
[ DEBUG_STACK-1 ] = "#DB", [ ESTACK_DB2 ] = "#DB2",
[ MCE_STACK-1 ] = "#MC", [ ESTACK_DB1 ] = "#DB1",
}; [ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC",
static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = {
[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
[DEBUG_STACK - 1] = DEBUG_STKSZ
}; };
const char *stack_type_name(enum stack_type type) const char *stack_type_name(enum stack_type type)
{ {
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
if (type == STACK_TYPE_IRQ) if (type == STACK_TYPE_IRQ)
return "IRQ"; return "IRQ";
@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type)
return NULL; return NULL;
} }
/**
* struct estack_pages - Page descriptor for exception stacks
* @offs: Offset from the start of the exception stack area
* @size: Size of the exception stack
* @type: Type to store in the stack_info struct
*/
struct estack_pages {
u32 offs;
u16 size;
u16 type;
};
#define EPAGERANGE(st) \
[PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \
PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \
.offs = CEA_ESTACK_OFFS(st), \
.size = CEA_ESTACK_SIZE(st), \
.type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, }
/*
* Array of exception stack page descriptors. If the stack is larger than
* PAGE_SIZE, all pages covering a particular stack will have the same
* info. The guard pages including the not mapped DB2 stack are zeroed
* out.
*/
static const
struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(DF),
EPAGERANGE(NMI),
EPAGERANGE(DB1),
EPAGERANGE(DB),
EPAGERANGE(MCE),
};
static bool in_exception_stack(unsigned long *stack, struct stack_info *info) static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
{ {
unsigned long *begin, *end; unsigned long begin, end, stk = (unsigned long)stack;
const struct estack_pages *ep;
struct pt_regs *regs; struct pt_regs *regs;
unsigned k; unsigned int k;
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
for (k = 0; k < N_EXCEPTION_STACKS; k++) { begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; end = begin + sizeof(struct cea_exception_stacks);
begin = end - (exception_stack_sizes[k] / sizeof(long)); /* Bail if @stack is outside the exception stack area. */
regs = (struct pt_regs *)end - 1; if (stk < begin || stk >= end)
return false;
if (stack <= begin || stack >= end) /* Calc page offset from start of exception stacks */
continue; k = (stk - begin) >> PAGE_SHIFT;
/* Lookup the page descriptor */
ep = &estack_pages[k];
/* Guard page? */
if (!ep->size)
return false;
info->type = STACK_TYPE_EXCEPTION + k; begin += (unsigned long)ep->offs;
info->begin = begin; end = begin + (unsigned long)ep->size;
info->end = end; regs = (struct pt_regs *)end - 1;
info->next_sp = (unsigned long *)regs->sp;
return true; info->type = ep->type;
} info->begin = (unsigned long *)begin;
info->end = (unsigned long *)end;
return false; info->next_sp = (unsigned long *)regs->sp;
return true;
} }
static bool in_irq_stack(unsigned long *stack, struct stack_info *info) static bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{ {
unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
/* /*
* This is a software stack, so 'end' can be a valid stack pointer. * This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty. * It just means the stack is empty.
*/ */
if (stack <= begin || stack > end) if (stack < begin || stack >= end)
return false; return false;
info->type = STACK_TYPE_IRQ; info->type = STACK_TYPE_IRQ;
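
A hedged worked example of the page-descriptor lookup in the new in_exception_stack() above, with the same 4K/no-KASAN assumptions:

    /*
     * stk sits 37000 bytes into this CPU's cea_exception_stacks, i.e. on
     * DB_stack (pages: 0 DF guard, 1 DF, ..., 8 DB guard, 9 DB, 11 MCE):
     *
     *   k  = (stk - begin) >> PAGE_SHIFT  ==  37000 >> 12  ==  9
     *   ep = &estack_pages[9]  ->  { .offs = 36864, .size = 4096,
     *                                .type = STACK_TYPE_EXCEPTION + ESTACK_DB }
     *   begin += ep->offs;  end = begin + ep->size;     one 4K window
     *
     * Had stk landed on a guard page (e.g. k == 8) or on the unmapped DB2
     * slot (k == 5), ep->size would be 0 and the function returns false.
     * The old code walked all exception stack ranges on every call instead.
     */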

View File

@ -265,7 +265,7 @@ ENDPROC(start_cpu0)
GLOBAL(initial_code) GLOBAL(initial_code)
.quad x86_64_start_kernel .quad x86_64_start_kernel
GLOBAL(initial_gs) GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union) .quad INIT_PER_CPU_VAR(fixed_percpu_data)
GLOBAL(initial_stack) GLOBAL(initial_stack)
/* /*
* The SIZEOF_PTREGS gap is a convention which helps the in-kernel * The SIZEOF_PTREGS gap is a convention which helps the in-kernel

View File

@ -41,13 +41,12 @@ struct idt_data {
#define SYSG(_vector, _addr) \ #define SYSG(_vector, _addr) \
G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS)
/* Interrupt gate with interrupt stack */ /*
* Interrupt gate with interrupt stack. The _ist index is the index in
* the tss.ist[] array, but for the descriptor it needs to start at 1.
*/
#define ISTG(_vector, _addr, _ist) \ #define ISTG(_vector, _addr, _ist) \
G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS)
/* System interrupt gate with interrupt stack */
#define SISTG(_vector, _addr, _ist) \
G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS)
/* Task gate */ /* Task gate */
#define TSKG(_vector, _gdt) \ #define TSKG(_vector, _gdt) \
@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
* cpu_init() when the TSS has been initialized. * cpu_init() when the TSS has been initialized.
*/ */
static const __initconst struct idt_data ist_idts[] = { static const __initconst struct idt_data ist_idts[] = {
ISTG(X86_TRAP_DB, debug, DEBUG_STACK), ISTG(X86_TRAP_DB, debug, IST_INDEX_DB),
ISTG(X86_TRAP_NMI, nmi, NMI_STACK), ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI),
ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF),
#ifdef CONFIG_X86_MCE #ifdef CONFIG_X86_MCE
ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE),
#endif #endif
}; };
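
A hedged example of the index translation the new ISTG() comment describes:

    /*
     * IST_INDEX_DB == 2, so
     *
     *   ISTG(X86_TRAP_DB, debug, IST_INDEX_DB)
     *
     * writes 3 into the gate descriptor's IST field (_ist + 1), while
     * cpu_init() and the shift_ist entry macro keep using the zero-based
     * index 2 for tss.ist[]. Descriptor IST value 0 means "no stack
     * switch", which is why the hardware numbering starts at 1.
     */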

View File

@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; }
static inline void print_stack_overflow(void) { } static inline void print_stack_overflow(void) { }
#endif #endif
DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DEFINE_PER_CPU(struct irq_stack *, softirq_stack); DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr);
static void call_on_stack(void *func, void *stack) static void call_on_stack(void *func, void *stack)
{ {
@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
u32 *isp, *prev_esp, arg1; u32 *isp, *prev_esp, arg1;
curstk = (struct irq_stack *) current_stack(); curstk = (struct irq_stack *) current_stack();
irqstk = __this_cpu_read(hardirq_stack); irqstk = __this_cpu_read(hardirq_stack_ptr);
/* /*
* this is where we switch to the IRQ stack. However, if we are * this is where we switch to the IRQ stack. However, if we are
@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
} }
/* /*
* allocate per-cpu stacks for hardirq and for softirq processing * Allocate per-cpu stacks for hardirq and softirq processing
*/ */
void irq_ctx_init(int cpu) int irq_init_percpu_irqstack(unsigned int cpu)
{ {
struct irq_stack *irqstk; int node = cpu_to_node(cpu);
struct page *ph, *ps;
if (per_cpu(hardirq_stack, cpu)) if (per_cpu(hardirq_stack_ptr, cpu))
return; return 0;
irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
THREADINFO_GFP, if (!ph)
THREAD_SIZE_ORDER)); return -ENOMEM;
per_cpu(hardirq_stack, cpu) = irqstk; ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
if (!ps) {
__free_pages(ph, THREAD_SIZE_ORDER);
return -ENOMEM;
}
irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), per_cpu(hardirq_stack_ptr, cpu) = page_address(ph);
THREADINFO_GFP, per_cpu(softirq_stack_ptr, cpu) = page_address(ps);
THREAD_SIZE_ORDER)); return 0;
per_cpu(softirq_stack, cpu) = irqstk;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu));
} }
void do_softirq_own_stack(void) void do_softirq_own_stack(void)
@ -135,7 +136,7 @@ void do_softirq_own_stack(void)
struct irq_stack *irqstk; struct irq_stack *irqstk;
u32 *isp, *prev_esp; u32 *isp, *prev_esp;
irqstk = __this_cpu_read(softirq_stack); irqstk = __this_cpu_read(softirq_stack_ptr);
/* build the stack frame on the softirq stack */ /* build the stack frame on the softirq stack */
isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); isp = (u32 *) ((char *)irqstk + sizeof(*irqstk));

View File

@ -18,63 +18,64 @@
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/smp.h> #include <linux/smp.h>
#include <linux/sched/task_stack.h> #include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
#include <asm/io_apic.h> #include <asm/io_apic.h>
#include <asm/apic.h> #include <asm/apic.h>
int sysctl_panic_on_stackoverflow; DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible;
DECLARE_INIT_PER_CPU(irq_stack_backing_store);
/*
* Probabilistic stack overflow check:
*
* Only check the stack in process context, because everything else
* runs on the big interrupt stacks. Checking reliably is too expensive,
* so we just check from interrupts.
*/
static inline void stack_overflow_check(struct pt_regs *regs)
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
#define STACK_TOP_MARGIN 128
struct orig_ist *oist;
u64 irq_stack_top, irq_stack_bottom;
u64 estack_top, estack_bottom;
u64 curbase = (u64)task_stack_page(current);
if (user_mode(regs))
return;
if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
regs->sp <= curbase + THREAD_SIZE)
return;
irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) +
STACK_TOP_MARGIN;
irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr);
if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom)
return;
oist = this_cpu_ptr(&orig_ist);
estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN;
estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1];
if (regs->sp >= estack_top && regs->sp <= estack_bottom)
return;
WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n",
current->comm, curbase, regs->sp,
irq_stack_top, irq_stack_bottom,
estack_top, estack_bottom, (void *)regs->ip);
if (sysctl_panic_on_stackoverflow)
panic("low stack detected by irq handler - check messages\n");
#endif
}
bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{ {
stack_overflow_check(regs);
if (IS_ERR_OR_NULL(desc)) if (IS_ERR_OR_NULL(desc))
return false; return false;
generic_handle_irq_desc(desc); generic_handle_irq_desc(desc);
return true; return true;
} }
#ifdef CONFIG_VMAP_STACK
/*
* VMAP the backing store with guard pages
*/
static int map_irq_stack(unsigned int cpu)
{
char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu);
struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE];
void *va;
int i;
for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) {
phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT));
pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
}
va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
if (!va)
return -ENOMEM;
per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
return 0;
}
#else
/*
* If VMAP stacks are disabled due to KASAN, just use the per cpu
* backing store without guard pages.
*/
static int map_irq_stack(unsigned int cpu)
{
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
return 0;
}
#endif
int irq_init_percpu_irqstack(unsigned int cpu)
{
if (per_cpu(hardirq_stack_ptr, cpu))
return 0;
return map_irq_stack(cpu);
}

View File

@ -91,6 +91,8 @@ void __init init_IRQ(void)
for (i = 0; i < nr_legacy_irqs(); i++) for (i = 0; i < nr_legacy_irqs(); i++)
per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);
BUG_ON(irq_init_percpu_irqstack(smp_processor_id()));
x86_init.irqs.intr_init(); x86_init.irqs.intr_init();
} }
@ -104,6 +106,4 @@ void __init native_init_IRQ(void)
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2); setup_irq(2, &irq2);
irq_ctx_init(smp_processor_id());
} }

View File

@ -21,13 +21,14 @@
#include <linux/ratelimit.h> #include <linux/ratelimit.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/atomic.h>
#include <linux/sched/clock.h> #include <linux/sched/clock.h>
#if defined(CONFIG_EDAC) #if defined(CONFIG_EDAC)
#include <linux/edac.h> #include <linux/edac.h>
#endif #endif
#include <linux/atomic.h> #include <asm/cpu_entry_area.h>
#include <asm/traps.h> #include <asm/traps.h>
#include <asm/mach_traps.h> #include <asm/mach_traps.h>
#include <asm/nmi.h> #include <asm/nmi.h>
@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2);
* switch back to the original IDT. * switch back to the original IDT.
*/ */
static DEFINE_PER_CPU(int, update_debug_stack); static DEFINE_PER_CPU(int, update_debug_stack);
static bool notrace is_debug_stack(unsigned long addr)
{
struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
unsigned long top = CEA_ESTACK_TOP(cs, DB);
unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
if (__this_cpu_read(debug_stack_usage))
return true;
/*
* Note, this covers the guard page between DB and DB1 as well to
* avoid two checks. But by all means @addr can never point into
* the guard page.
*/
return addr >= bot && addr < top;
}
NOKPROBE_SYMBOL(is_debug_stack);
#endif #endif
dotraplinkage notrace void dotraplinkage notrace void

View File

@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void)
per_cpu(x86_cpu_to_logical_apicid, cpu) = per_cpu(x86_cpu_to_logical_apicid, cpu) =
early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
#endif #endif
#ifdef CONFIG_X86_64
per_cpu(irq_stack_ptr, cpu) =
per_cpu(irq_stack_union.irq_stack, cpu) +
IRQ_STACK_SIZE;
#endif
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) = per_cpu(x86_cpu_to_node_map, cpu) =
early_per_cpu_map(x86_cpu_to_node_map, cpu); early_per_cpu_map(x86_cpu_to_node_map, cpu);

View File

@ -935,20 +935,27 @@ out:
return boot_error; return boot_error;
} }
void common_cpu_up(unsigned int cpu, struct task_struct *idle) int common_cpu_up(unsigned int cpu, struct task_struct *idle)
{ {
int ret;
/* Just in case we booted with a single CPU. */ /* Just in case we booted with a single CPU. */
alternatives_enable_smp(); alternatives_enable_smp();
per_cpu(current_task, cpu) = idle; per_cpu(current_task, cpu) = idle;
/* Initialize the interrupt stack(s) */
ret = irq_init_percpu_irqstack(cpu);
if (ret)
return ret;
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */ /* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else #else
initial_gs = per_cpu_offset(cpu); initial_gs = per_cpu_offset(cpu);
#endif #endif
return 0;
} }
/* /*
@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
/* the FPU context is blank, nobody can own it */ /* the FPU context is blank, nobody can own it */
per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL;
common_cpu_up(cpu, tidle); err = common_cpu_up(cpu, tidle);
if (err)
return err;
err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered);
if (err) { if (err) {

View File

@ -403,7 +403,8 @@ SECTIONS
*/ */
#define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load
INIT_PER_CPU(gdt_page); INIT_PER_CPU(gdt_page);
INIT_PER_CPU(irq_stack_union); INIT_PER_CPU(fixed_percpu_data);
INIT_PER_CPU(irq_stack_backing_store);
/* /*
* Build-time check on the image size: * Build-time check on the image size:
@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union);
"kernel image bigger than KERNEL_IMAGE_SIZE"); "kernel image bigger than KERNEL_IMAGE_SIZE");
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
. = ASSERT((irq_stack_union == 0), . = ASSERT((fixed_percpu_data == 0),
"irq_stack_union is not at start of per-cpu area"); "fixed_percpu_data is not at start of per-cpu area");
#endif #endif
#endif /* CONFIG_X86_32 */ #endif /* CONFIG_X86_32 */

View File

@ -13,8 +13,8 @@
static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
#endif #endif
struct cpu_entry_area *get_cpu_entry_area(int cpu) struct cpu_entry_area *get_cpu_entry_area(int cpu)
@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
} }
static void __init percpu_setup_debug_store(int cpu) static void __init percpu_setup_debug_store(unsigned int cpu)
{ {
#ifdef CONFIG_CPU_SUP_INTEL #ifdef CONFIG_CPU_SUP_INTEL
int npages; unsigned int npages;
void *cea; void *cea;
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
@ -78,9 +78,43 @@ static void __init percpu_setup_debug_store(int cpu)
#endif #endif
} }
/* Setup the fixmap mappings only once per-processor */ #ifdef CONFIG_X86_64
static void __init setup_cpu_entry_area(int cpu)
#define cea_map_stack(name) do { \
npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \
cea_map_percpu_pages(cea->estacks.name## _stack, \
estacks->name## _stack, npages, PAGE_KERNEL); \
} while (0)
static void __init percpu_setup_exception_stacks(unsigned int cpu)
{ {
struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu);
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
unsigned int npages;
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
per_cpu(cea_exception_stacks, cpu) = &cea->estacks;
/*
* The exceptions stack mappings in the per cpu area are protected
* by guard pages so each stack must be mapped separately. DB2 is
* not mapped; it just exists to catch triple nesting of #DB.
*/
cea_map_stack(DF);
cea_map_stack(NMI);
cea_map_stack(DB1);
cea_map_stack(DB);
cea_map_stack(MCE);
}
#else
static inline void percpu_setup_exception_stacks(unsigned int cpu) {}
#endif
/* Setup the fixmap mappings only once per-processor */
static void __init setup_cpu_entry_area(unsigned int cpu)
{
struct cpu_entry_area *cea = get_cpu_entry_area(cpu);
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ /* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO; pgprot_t gdt_prot = PAGE_KERNEL_RO;
@ -101,10 +135,9 @@ static void __init setup_cpu_entry_area(int cpu)
pgprot_t tss_prot = PAGE_KERNEL; pgprot_t tss_prot = PAGE_KERNEL;
#endif #endif
cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);
gdt_prot);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, cea_map_percpu_pages(&cea->entry_stack_page,
per_cpu_ptr(&entry_stack_storage, cpu), 1, per_cpu_ptr(&entry_stack_storage, cpu), 1,
PAGE_KERNEL); PAGE_KERNEL);
@ -128,22 +161,15 @@ static void __init setup_cpu_entry_area(int cpu)
BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^
offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK);
BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0);
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu),
&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); sizeof(struct tss_struct) / PAGE_SIZE, tss_prot);
#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_32
per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); per_cpu(cpu_entry_area, cpu) = cea;
#endif #endif
#ifdef CONFIG_X86_64 percpu_setup_exception_stacks(cpu);
BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0);
BUILD_BUG_ON(sizeof(exception_stacks) !=
sizeof(((struct cpu_entry_area *)0)->exception_stacks));
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
#endif
percpu_setup_debug_store(cpu); percpu_setup_debug_store(cpu);
} }

View File

@ -28,6 +28,7 @@
#include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/mmu_context.h> /* vma_pkey() */
#include <asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/efi.h> /* efi_recover_from_page_fault()*/
#include <asm/desc.h> /* store_idt(), ... */ #include <asm/desc.h> /* store_idt(), ... */
#include <asm/cpu_entry_area.h> /* exception stack */
#define CREATE_TRACE_POINTS #define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h> #include <asm/trace/exceptions.h>
@ -793,7 +794,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
if (is_vmalloc_addr((void *)address) && if (is_vmalloc_addr((void *)address) &&
(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
/* /*
* We're likely to be running with very little stack space * We're likely to be running with very little stack space
* left. It's plausible that we'd hit this condition but * left. It's plausible that we'd hit this condition but

View File

@ -754,7 +754,7 @@ static void percpu_init(void)
* __per_cpu_load * __per_cpu_load
* *
* The "gold" linker incorrectly associates: * The "gold" linker incorrectly associates:
* init_per_cpu__irq_stack_union * init_per_cpu__fixed_percpu_data
* init_per_cpu__gdt_page * init_per_cpu__gdt_page
*/ */
static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)

View File

@ -361,7 +361,9 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
{ {
int rc; int rc;
common_cpu_up(cpu, idle); rc = common_cpu_up(cpu, idle);
if (rc)
return rc;
xen_setup_runstate_info(cpu); xen_setup_runstate_info(cpu);

View File

@ -40,13 +40,13 @@ ENTRY(startup_xen)
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
/* Set up %gs. /* Set up %gs.
* *
* The base of %gs always points to the bottom of the irqstack * The base of %gs always points to fixed_percpu_data. If the
* union. If the stack protector canary is enabled, it is * stack protector canary is enabled, it is located at %gs:40.
* located at %gs:40. Note that, on SMP, the boot cpu uses * Note that, on SMP, the boot cpu uses init data section until
* init data section till per cpu areas are set up. * the per cpu areas are set up.
*/ */
movl $MSR_GS_BASE,%ecx movl $MSR_GS_BASE,%ecx
movq $INIT_PER_CPU_VAR(irq_stack_union),%rax movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax
cdq cdq
wrmsr wrmsr
#endif #endif

View File

@ -1687,7 +1687,6 @@ void __init xen_init_IRQ(void)
#ifdef CONFIG_X86 #ifdef CONFIG_X86
if (xen_pv_domain()) { if (xen_pv_domain()) {
irq_ctx_init(smp_processor_id());
if (xen_initial_domain()) if (xen_initial_domain())
pci_xen_initial_domain(); pci_xen_initial_domain();
} }

View File

@ -1467,53 +1467,17 @@ static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
} }
#ifdef CONFIG_DEBUG_PAGEALLOC #ifdef CONFIG_DEBUG_PAGEALLOC
static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
unsigned long caller)
{
int size = cachep->object_size;
addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
if (size < 5 * sizeof(unsigned long))
return;
*addr++ = 0x12345678;
*addr++ = caller;
*addr++ = smp_processor_id();
size -= 3 * sizeof(unsigned long);
{
unsigned long *sptr = &caller;
unsigned long svalue;
while (!kstack_end(sptr)) {
svalue = *sptr++;
if (kernel_text_address(svalue)) {
*addr++ = svalue;
size -= sizeof(unsigned long);
if (size <= sizeof(unsigned long))
break;
}
}
}
*addr++ = 0x87654321;
}
static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
int map, unsigned long caller)
{ {
if (!is_debug_pagealloc_cache(cachep)) if (!is_debug_pagealloc_cache(cachep))
return; return;
if (caller)
store_stackinfo(cachep, objp, caller);
kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map); kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
} }
#else #else
static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp, static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
int map, unsigned long caller) {} int map) {}
#endif #endif
@ -1661,7 +1625,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
if (cachep->flags & SLAB_POISON) { if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp); check_poison_obj(cachep, objp);
slab_kernel_map(cachep, objp, 1, 0); slab_kernel_map(cachep, objp, 1);
} }
if (cachep->flags & SLAB_RED_ZONE) { if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@ -2433,7 +2397,7 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
/* need to poison the objs? */ /* need to poison the objs? */
if (cachep->flags & SLAB_POISON) { if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE); poison_obj(cachep, objp, POISON_FREE);
slab_kernel_map(cachep, objp, 0, 0); slab_kernel_map(cachep, objp, 0);
} }
} }
#endif #endif
@ -2812,7 +2776,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_POISON) { if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE); poison_obj(cachep, objp, POISON_FREE);
slab_kernel_map(cachep, objp, 0, caller); slab_kernel_map(cachep, objp, 0);
} }
return objp; return objp;
} }
@ -3076,7 +3040,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
return objp; return objp;
if (cachep->flags & SLAB_POISON) { if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp); check_poison_obj(cachep, objp);
slab_kernel_map(cachep, objp, 1, 0); slab_kernel_map(cachep, objp, 1);
poison_obj(cachep, objp, POISON_INUSE); poison_obj(cachep, objp, POISON_INUSE);
} }
if (cachep->flags & SLAB_STORE_USER) if (cachep->flags & SLAB_STORE_USER)