diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 05b03ecd7933..a3456f34f672 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -3,3 +3,4 @@ generic-y += clkdev.h generic-y += exec.h generic-y += kvm_para.h generic-y += trace_clock.h +generic-y += vtime.h \ No newline at end of file diff --git a/arch/m68k/include/asm/irqflags.h b/arch/m68k/include/asm/irqflags.h index 7ef4115b8c4a..4c6275522e67 100644 --- a/arch/m68k/include/asm/irqflags.h +++ b/arch/m68k/include/asm/irqflags.h @@ -3,7 +3,7 @@ #include #ifdef CONFIG_MMU -#include +#include #endif #include #include diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 650757c300db..704e6f10ae80 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -2,3 +2,4 @@ generic-y += clkdev.h generic-y += rwsem.h generic-y += trace_clock.h +generic-y += vtime.h \ No newline at end of file diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index d2ff41370c0c..f65bd3634519 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -13,9 +13,6 @@ #include -#define __ARCH_HAS_VTIME_ACCOUNT -#define __ARCH_HAS_VTIME_TASK_SWITCH - /* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ typedef unsigned long long __nocast cputime_t; diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h new file mode 100644 index 000000000000..af9896c53eb3 --- /dev/null +++ b/arch/s390/include/asm/vtime.h @@ -0,0 +1,7 @@ +#ifndef _S390_VTIME_H +#define _S390_VTIME_H + +#define __ARCH_HAS_VTIME_ACCOUNT +#define __ARCH_HAS_VTIME_TASK_SWITCH + +#endif /* _S390_VTIME_H */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 9b9c1b78ec67..abcfab55f99b 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "entry.h" diff --git a/include/asm-generic/vtime.h b/include/asm-generic/vtime.h new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index fc09d7b0dacf..158158704c30 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -2,100 +2,110 @@ #define _LINUX_CONTEXT_TRACKING_H #include -#include #include +#include #include -struct context_tracking { - /* - * When active is false, probes are unset in order - * to minimize overhead: TIF flags are cleared - * and calls to user_enter/exit are ignored. This - * may be further optimized using static keys. - */ - bool active; - enum ctx_state { - IN_KERNEL = 0, - IN_USER, - } state; -}; - -static inline void __guest_enter(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags |= PF_VCPU; -} - -static inline void __guest_exit(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags &= ~PF_VCPU; -} #ifdef CONFIG_CONTEXT_TRACKING -DECLARE_PER_CPU(struct context_tracking, context_tracking); +extern void context_tracking_cpu_set(int cpu); -static inline bool context_tracking_in_user(void) +extern void context_tracking_user_enter(void); +extern void context_tracking_user_exit(void); +extern void __context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next); + +static inline void user_enter(void) { - return __this_cpu_read(context_tracking.state) == IN_USER; -} + if (static_key_false(&context_tracking_enabled)) + context_tracking_user_enter(); -static inline bool context_tracking_active(void) +} +static inline void user_exit(void) { - return __this_cpu_read(context_tracking.active); + if (static_key_false(&context_tracking_enabled)) + context_tracking_user_exit(); } -extern void user_enter(void); -extern void user_exit(void); - -extern void guest_enter(void); -extern void guest_exit(void); - static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; + if (!static_key_false(&context_tracking_enabled)) + return 0; + prev_ctx = this_cpu_read(context_tracking.state); - user_exit(); + context_tracking_user_exit(); return prev_ctx; } static inline void exception_exit(enum ctx_state prev_ctx) { - if (prev_ctx == IN_USER) - user_enter(); + if (static_key_false(&context_tracking_enabled)) { + if (prev_ctx == IN_USER) + context_tracking_user_enter(); + } } -extern void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next); +static inline void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) +{ + if (static_key_false(&context_tracking_enabled)) + __context_tracking_task_switch(prev, next); +} #else -static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } static inline void user_exit(void) { } - -static inline void guest_enter(void) -{ - __guest_enter(); -} - -static inline void guest_exit(void) -{ - __guest_exit(); -} - static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_CONTEXT_TRACKING */ + +#ifdef CONFIG_CONTEXT_TRACKING_FORCE +extern void context_tracking_init(void); +#else +static inline void context_tracking_init(void) { } +#endif /* CONFIG_CONTEXT_TRACKING_FORCE */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static inline void guest_enter(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; +} + +static inline void guest_exit(void) +{ + if (vtime_accounting_enabled()) + vtime_guest_exit(current); + else + current->flags &= ~PF_VCPU; +} + +#else +static inline void guest_enter(void) +{ + /* + * This is running in ioctl context so its safe + * to assume that it's the stime pending cputime + * to flush. + */ + vtime_account_system(current); + current->flags |= PF_VCPU; +} + +static inline void guest_exit(void) +{ + /* Flush the guest cputime we spent on the guest */ + vtime_account_system(current); + current->flags &= ~PF_VCPU; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ + #endif diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h new file mode 100644 index 000000000000..0f1979d0674f --- /dev/null +++ b/include/linux/context_tracking_state.h @@ -0,0 +1,39 @@ +#ifndef _LINUX_CONTEXT_TRACKING_STATE_H +#define _LINUX_CONTEXT_TRACKING_STATE_H + +#include +#include + +struct context_tracking { + /* + * When active is false, probes are unset in order + * to minimize overhead: TIF flags are cleared + * and calls to user_enter/exit are ignored. This + * may be further optimized using static keys. + */ + bool active; + enum ctx_state { + IN_KERNEL = 0, + IN_USER, + } state; +}; + +#ifdef CONFIG_CONTEXT_TRACKING +extern struct static_key context_tracking_enabled; +DECLARE_PER_CPU(struct context_tracking, context_tracking); + +static inline bool context_tracking_in_user(void) +{ + return __this_cpu_read(context_tracking.state) == IN_USER; +} + +static inline bool context_tracking_active(void) +{ + return __this_cpu_read(context_tracking.active); +} +#else +static inline bool context_tracking_in_user(void) { return false; } +static inline bool context_tracking_active(void) { return false; } +#endif /* CONFIG_CONTEXT_TRACKING */ + +#endif diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 05bcc0903766..ccfe17c5c8da 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -1,126 +1,11 @@ #ifndef LINUX_HARDIRQ_H #define LINUX_HARDIRQ_H -#include +#include #include #include #include -#include -/* - * We put the hardirq and softirq counter into the preemption - * counter. The bitmask has the following meaning: - * - * - bits 0-7 are the preemption count (max preemption depth: 256) - * - bits 8-15 are the softirq count (max # of softirqs: 256) - * - * The hardirq count can in theory reach the same as NR_IRQS. - * In reality, the number of nested IRQS is limited to the stack - * size as well. For archs with over 1000 IRQS it is not practical - * to expect that they will all nest. We give a max of 10 bits for - * hardirq nesting. An arch may choose to give less than 10 bits. - * m68k expects it to be 8. - * - * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) - * - bit 26 is the NMI_MASK - * - bit 27 is the PREEMPT_ACTIVE flag - * - * PREEMPT_MASK: 0x000000ff - * SOFTIRQ_MASK: 0x0000ff00 - * HARDIRQ_MASK: 0x03ff0000 - * NMI_MASK: 0x04000000 - */ -#define PREEMPT_BITS 8 -#define SOFTIRQ_BITS 8 -#define NMI_BITS 1 - -#define MAX_HARDIRQ_BITS 10 - -#ifndef HARDIRQ_BITS -# define HARDIRQ_BITS MAX_HARDIRQ_BITS -#endif - -#if HARDIRQ_BITS > MAX_HARDIRQ_BITS -#error HARDIRQ_BITS too high! -#endif - -#define PREEMPT_SHIFT 0 -#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) -#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) -#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) - -#define __IRQ_MASK(x) ((1UL << (x))-1) - -#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) -#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) -#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) -#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) - -#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) -#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) -#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) -#define NMI_OFFSET (1UL << NMI_SHIFT) - -#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) - -#ifndef PREEMPT_ACTIVE -#define PREEMPT_ACTIVE_BITS 1 -#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) -#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) -#endif - -#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) -#error PREEMPT_ACTIVE is too low! -#endif - -#define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ - | NMI_MASK)) - -/* - * Are we doing bottom half or hardware interrupt processing? - * Are we in a softirq context? Interrupt context? - * in_softirq - Are we currently processing softirq or have bh disabled? - * in_serving_softirq - Are we currently processing softirq? - */ -#define in_irq() (hardirq_count()) -#define in_softirq() (softirq_count()) -#define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) - -/* - * Are we in NMI context? - */ -#define in_nmi() (preempt_count() & NMI_MASK) - -#if defined(CONFIG_PREEMPT_COUNT) -# define PREEMPT_CHECK_OFFSET 1 -#else -# define PREEMPT_CHECK_OFFSET 0 -#endif - -/* - * Are we running in atomic context? WARNING: this macro cannot - * always detect atomic context; in particular, it cannot know about - * held spinlocks in non-preemptible kernels. Thus it should not be - * used in the general case to determine whether sleeping is possible. - * Do not use in_atomic() in driver code. - */ -#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) - -/* - * Check whether we were atomic before we did preempt_disable(): - * (used by the scheduler, *after* releasing the kernel lock) - */ -#define in_atomic_preempt_off() \ - ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) - -#ifdef CONFIG_PREEMPT_COUNT -# define preemptible() (preempt_count() == 0 && !irqs_disabled()) -#else -# define preemptible() 0 -#endif #if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) extern void synchronize_irq(unsigned int irq); diff --git a/include/linux/preempt_mask.h b/include/linux/preempt_mask.h new file mode 100644 index 000000000000..931bc616219f --- /dev/null +++ b/include/linux/preempt_mask.h @@ -0,0 +1,122 @@ +#ifndef LINUX_PREEMPT_MASK_H +#define LINUX_PREEMPT_MASK_H + +#include +#include + +/* + * We put the hardirq and softirq counter into the preemption + * counter. The bitmask has the following meaning: + * + * - bits 0-7 are the preemption count (max preemption depth: 256) + * - bits 8-15 are the softirq count (max # of softirqs: 256) + * + * The hardirq count can in theory reach the same as NR_IRQS. + * In reality, the number of nested IRQS is limited to the stack + * size as well. For archs with over 1000 IRQS it is not practical + * to expect that they will all nest. We give a max of 10 bits for + * hardirq nesting. An arch may choose to give less than 10 bits. + * m68k expects it to be 8. + * + * - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024) + * - bit 26 is the NMI_MASK + * - bit 27 is the PREEMPT_ACTIVE flag + * + * PREEMPT_MASK: 0x000000ff + * SOFTIRQ_MASK: 0x0000ff00 + * HARDIRQ_MASK: 0x03ff0000 + * NMI_MASK: 0x04000000 + */ +#define PREEMPT_BITS 8 +#define SOFTIRQ_BITS 8 +#define NMI_BITS 1 + +#define MAX_HARDIRQ_BITS 10 + +#ifndef HARDIRQ_BITS +# define HARDIRQ_BITS MAX_HARDIRQ_BITS +#endif + +#if HARDIRQ_BITS > MAX_HARDIRQ_BITS +#error HARDIRQ_BITS too high! +#endif + +#define PREEMPT_SHIFT 0 +#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) +#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) +#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) + +#define __IRQ_MASK(x) ((1UL << (x))-1) + +#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) +#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) +#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) +#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) + +#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) +#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) +#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) +#define NMI_OFFSET (1UL << NMI_SHIFT) + +#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) + +#ifndef PREEMPT_ACTIVE +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +#endif + +#if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) +#error PREEMPT_ACTIVE is too low! +#endif + +#define hardirq_count() (preempt_count() & HARDIRQ_MASK) +#define softirq_count() (preempt_count() & SOFTIRQ_MASK) +#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ + | NMI_MASK)) + +/* + * Are we doing bottom half or hardware interrupt processing? + * Are we in a softirq context? Interrupt context? + * in_softirq - Are we currently processing softirq or have bh disabled? + * in_serving_softirq - Are we currently processing softirq? + */ +#define in_irq() (hardirq_count()) +#define in_softirq() (softirq_count()) +#define in_interrupt() (irq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) + +/* + * Are we in NMI context? + */ +#define in_nmi() (preempt_count() & NMI_MASK) + +#if defined(CONFIG_PREEMPT_COUNT) +# define PREEMPT_CHECK_OFFSET 1 +#else +# define PREEMPT_CHECK_OFFSET 0 +#endif + +/* + * Are we running in atomic context? WARNING: this macro cannot + * always detect atomic context; in particular, it cannot know about + * held spinlocks in non-preemptible kernels. Thus it should not be + * used in the general case to determine whether sleeping is possible. + * Do not use in_atomic() in driver code. + */ +#define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) + +/* + * Check whether we were atomic before we did preempt_disable(): + * (used by the scheduler, *after* releasing the kernel lock) + */ +#define in_atomic_preempt_off() \ + ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) + +#ifdef CONFIG_PREEMPT_COUNT +# define preemptible() (preempt_count() == 0 && !irqs_disabled()) +#else +# define preemptible() 0 +#endif + +#endif /* LINUX_PREEMPT_MASK_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index 62bd8b72873c..5128d33bbb39 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -158,20 +160,51 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +extern bool tick_nohz_full_running; +extern cpumask_var_t tick_nohz_full_mask; + +static inline bool tick_nohz_full_enabled(void) +{ + if (!static_key_false(&context_tracking_enabled)) + return false; + + return tick_nohz_full_running; +} + +static inline bool tick_nohz_full_cpu(int cpu) +{ + if (!tick_nohz_full_enabled()) + return false; + + return cpumask_test_cpu(cpu, tick_nohz_full_mask); +} + extern void tick_nohz_init(void); -extern int tick_nohz_full_cpu(int cpu); -extern void tick_nohz_full_check(void); +extern void __tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); -extern void tick_nohz_task_switch(struct task_struct *tsk); +extern void __tick_nohz_task_switch(struct task_struct *tsk); #else static inline void tick_nohz_init(void) { } -static inline int tick_nohz_full_cpu(int cpu) { return 0; } -static inline void tick_nohz_full_check(void) { } +static inline bool tick_nohz_full_enabled(void) { return false; } +static inline bool tick_nohz_full_cpu(int cpu) { return false; } +static inline void __tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } -static inline void tick_nohz_task_switch(struct task_struct *tsk) { } +static inline void __tick_nohz_task_switch(struct task_struct *tsk) { } #endif +static inline void tick_nohz_full_check(void) +{ + if (tick_nohz_full_enabled()) + __tick_nohz_full_check(); +} + +static inline void tick_nohz_task_switch(struct task_struct *tsk) +{ + if (tick_nohz_full_enabled()) + __tick_nohz_task_switch(tsk); +} + #endif diff --git a/include/linux/vtime.h b/include/linux/vtime.h index b1dd2db80076..f5b72b364bda 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -1,18 +1,68 @@ #ifndef _LINUX_KERNEL_VTIME_H #define _LINUX_KERNEL_VTIME_H +#include +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +#include +#endif + + struct task_struct; +/* + * vtime_accounting_enabled() definitions/declarations + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +static inline bool vtime_accounting_enabled(void) { return true; } +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static inline bool vtime_accounting_enabled(void) +{ + if (static_key_false(&context_tracking_enabled)) { + if (context_tracking_active()) + return true; + } + + return false; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING +static inline bool vtime_accounting_enabled(void) { return false; } +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ + + +/* + * Common vtime APIs + */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef __ARCH_HAS_VTIME_TASK_SWITCH extern void vtime_task_switch(struct task_struct *prev); +#else +extern void vtime_common_task_switch(struct task_struct *prev); +static inline void vtime_task_switch(struct task_struct *prev) +{ + if (vtime_accounting_enabled()) + vtime_common_task_switch(prev); +} +#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */ + extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_user(struct task_struct *tsk); -extern void vtime_account_irq_enter(struct task_struct *tsk); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -static inline bool vtime_accounting_enabled(void) { return true; } -#endif +#ifdef __ARCH_HAS_VTIME_ACCOUNT +extern void vtime_account_irq_enter(struct task_struct *tsk); +#else +extern void vtime_common_account_irq_enter(struct task_struct *tsk); +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (vtime_accounting_enabled()) + vtime_common_account_irq_enter(tsk); +} +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ @@ -20,14 +70,20 @@ static inline void vtime_task_switch(struct task_struct *prev) { } static inline void vtime_account_system(struct task_struct *tsk) { } static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_account_irq_enter(struct task_struct *tsk) { } -static inline bool vtime_accounting_enabled(void) { return false; } -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_account_irq_exit(struct task_struct *tsk); -extern bool vtime_accounting_enabled(void); +extern void vtime_gen_account_irq_exit(struct task_struct *tsk); + +static inline void vtime_account_irq_exit(struct task_struct *tsk) +{ + if (vtime_accounting_enabled()) + vtime_gen_account_irq_exit(tsk); +} + extern void vtime_user_enter(struct task_struct *tsk); + static inline void vtime_user_exit(struct task_struct *tsk) { vtime_account_user(tsk); @@ -35,7 +91,7 @@ static inline void vtime_user_exit(struct task_struct *tsk) extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); -#else +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ static inline void vtime_account_irq_exit(struct task_struct *tsk) { /* On hard|softirq exit we always account to hard|softirq cputime */ diff --git a/include/trace/events/context_tracking.h b/include/trace/events/context_tracking.h new file mode 100644 index 000000000000..ce8007cf29cf --- /dev/null +++ b/include/trace/events/context_tracking.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM context_tracking + +#if !defined(_TRACE_CONTEXT_TRACKING_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CONTEXT_TRACKING_H + +#include + +DECLARE_EVENT_CLASS(context_tracking_user, + + TP_PROTO(int dummy), + + TP_ARGS(dummy), + + TP_STRUCT__entry( + __field( int, dummy ) + ), + + TP_fast_assign( + __entry->dummy = dummy; + ), + + TP_printk("%s", "") +); + +/** + * user_enter - called when the kernel resumes to userspace + * @dummy: dummy arg to make trace event macro happy + * + * This event occurs when the kernel resumes to userspace after + * an exception or a syscall. + */ +DEFINE_EVENT(context_tracking_user, user_enter, + + TP_PROTO(int dummy), + + TP_ARGS(dummy) +); + +/** + * user_exit - called when userspace enters the kernel + * @dummy: dummy arg to make trace event macro happy + * + * This event occurs when userspace enters the kernel through + * an exception or a syscall. + */ +DEFINE_EVENT(context_tracking_user, user_exit, + + TP_PROTO(int dummy), + + TP_ARGS(dummy) +); + + +#endif /* _TRACE_CONTEXT_TRACKING_H */ + +/* This part must be outside protection */ +#include diff --git a/init/Kconfig b/init/Kconfig index 247084be0590..ffbf5d788bf3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -527,13 +527,29 @@ config RCU_USER_QS config CONTEXT_TRACKING_FORCE bool "Force context tracking" depends on CONTEXT_TRACKING - default CONTEXT_TRACKING + default y if !NO_HZ_FULL help - Probe on user/kernel boundaries by default in order to - test the features that rely on it such as userspace RCU extended - quiescent states. - This test is there for debugging until we have a real user like the - full dynticks mode. + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the developpement of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" diff --git a/init/main.c b/init/main.c index d03d2ec2eacf..af310afbef28 100644 --- a/init/main.c +++ b/init/main.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -545,6 +546,7 @@ asmlinkage void __init start_kernel(void) idr_init_cache(); rcu_init(); tick_nohz_init(); + context_tracking_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 383f8231e436..247091bf0587 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -20,22 +20,33 @@ #include #include -DEFINE_PER_CPU(struct context_tracking, context_tracking) = { -#ifdef CONFIG_CONTEXT_TRACKING_FORCE - .active = true, -#endif -}; +#define CREATE_TRACE_POINTS +#include + +struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(context_tracking_enabled); + +DEFINE_PER_CPU(struct context_tracking, context_tracking); +EXPORT_SYMBOL_GPL(context_tracking); + +void context_tracking_cpu_set(int cpu) +{ + if (!per_cpu(context_tracking.active, cpu)) { + per_cpu(context_tracking.active, cpu) = true; + static_key_slow_inc(&context_tracking_enabled); + } +} /** - * user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode. + * context_tracking_user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. * * This function must be called right before we switch from the kernel * to userspace, when it's guaranteed the remaining kernel instructions * to execute won't use any RCU read side critical section because this * function sets RCU in extended quiescent state. */ -void user_enter(void) +void context_tracking_user_enter(void) { unsigned long flags; @@ -54,17 +65,32 @@ void user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if (__this_cpu_read(context_tracking.active) && - __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if (__this_cpu_read(context_tracking.active)) { + trace_user_enter(0); + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ + vtime_user_enter(current); + rcu_user_enter(); + } /* - * At this stage, only low level arch entry code remains and - * then we'll run in userspace. We can assume there won't be - * any RCU read-side critical section until the next call to - * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency - * on the tick. + * Even if context tracking is disabled on this CPU, because it's outside + * the full dynticks mask for example, we still have to keep track of the + * context transitions and states to prevent inconsistency on those of + * other CPUs. + * If a task triggers an exception in userspace, sleep on the exception + * handler and then migrate to another CPU, that new CPU must know where + * the exception returns by the time we call exception_exit(). + * This information can only be provided by the previous CPU when it called + * exception_enter(). + * OTOH we can spare the calls to vtime and RCU when context_tracking.active + * is false because we know that CPU is not tickless. */ - vtime_user_enter(current); - rcu_user_enter(); __this_cpu_write(context_tracking.state, IN_USER); } local_irq_restore(flags); @@ -87,10 +113,9 @@ void user_enter(void) */ void __sched notrace preempt_schedule_context(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_ctx; - if (likely(ti->preempt_count || irqs_disabled())) + if (likely(!preemptible())) return; /* @@ -112,8 +137,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); #endif /* CONFIG_PREEMPT */ /** - * user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. * * This function must be called after we entered the kernel from userspace * before any use of RCU read side critical section. This potentially include @@ -122,7 +147,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void user_exit(void) +void context_tracking_user_exit(void) { unsigned long flags; @@ -131,38 +156,22 @@ void user_exit(void) local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { - /* - * We are going to run code that may use RCU. Inform - * RCU core about that (ie: we may need the tick again). - */ - rcu_user_exit(); - vtime_user_exit(current); + if (__this_cpu_read(context_tracking.active)) { + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). + */ + rcu_user_exit(); + vtime_user_exit(current); + trace_user_exit(0); + } __this_cpu_write(context_tracking.state, IN_KERNEL); } local_irq_restore(flags); } -void guest_enter(void) -{ - if (vtime_accounting_enabled()) - vtime_guest_enter(current); - else - __guest_enter(); -} -EXPORT_SYMBOL_GPL(guest_enter); - -void guest_exit(void) -{ - if (vtime_accounting_enabled()) - vtime_guest_exit(current); - else - __guest_exit(); -} -EXPORT_SYMBOL_GPL(guest_exit); - - /** - * context_tracking_task_switch - context switch the syscall callbacks + * __context_tracking_task_switch - context switch the syscall callbacks * @prev: the task that is being switched out * @next: the task that is being switched in * @@ -174,11 +183,19 @@ EXPORT_SYMBOL_GPL(guest_exit); * migrate to some CPU that doesn't do the context tracking. As such the TIF * flag may not be desired there. */ -void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) +void __context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) { - if (__this_cpu_read(context_tracking.active)) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); - } + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); } + +#ifdef CONFIG_CONTEXT_TRACKING_FORCE +void __init context_tracking_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + context_tracking_cpu_set(cpu); +} +#endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..3fb7acee7326 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2510,13 +2510,11 @@ void __sched schedule_preempt_disabled(void) */ asmlinkage void __sched notrace preempt_schedule(void) { - struct thread_info *ti = current_thread_info(); - /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (likely(ti->preempt_count || irqs_disabled())) + if (likely(!preemptible())) return; do { diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..c1d7493825ae 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ #ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) +void vtime_common_task_switch(struct task_struct *prev) { - if (!vtime_accounting_enabled()) - return; - if (is_idle_task(prev)) vtime_account_idle(prev); else @@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_common_account_irq_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - if (!in_interrupt()) { /* * If we interrupted user, context_tracking_in_user() @@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) } vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -559,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr, { cputime_t rtime, stime, utime, total; - if (vtime_accounting_enabled()) { - *ut = curr->utime; - *st = curr->stime; - return; - } - stime = curr->stime; total = stime + curr->utime; @@ -664,23 +652,17 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); } -void vtime_account_irq_exit(struct task_struct *tsk) +void vtime_gen_account_irq_exit(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); } @@ -688,12 +670,8 @@ void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - if (!vtime_accounting_enabled()) - return; - - delta_cpu = get_vtime_delta(tsk); - write_seqlock(&tsk->vtime_seqlock); + delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); write_sequnlock(&tsk->vtime_seqlock); @@ -701,22 +679,27 @@ void vtime_account_user(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); - tsk->vtime_snap_whence = VTIME_USER; __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_sequnlock(&tsk->vtime_seqlock); } void vtime_guest_enter(struct task_struct *tsk) { + /* + * The flags must be updated under the lock with + * the vtime_snap flush and update. + * That enforces a right ordering and update sequence + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. + */ write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); current->flags |= PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { @@ -725,6 +708,7 @@ void vtime_guest_exit(struct task_struct *tsk) current->flags &= ~PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { @@ -733,11 +717,6 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(delta_cpu); } -bool vtime_accounting_enabled(void) -{ - return context_tracking_active(); -} - void arch_vtime_task_switch(struct task_struct *prev) { write_seqlock(&prev->vtime_seqlock); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..747bbc70f53b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -105,7 +105,6 @@ config NO_HZ_FULL select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN - select CONTEXT_TRACKING_FORCE select IRQ_WORK help Adaptively try to shutdown the tick whenever possible, even when diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index a326f27d7f09..0b479a6a22bb 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -121,7 +121,7 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) BUG_ON(bits > 32); WARN_ON(!irqs_disabled()); read_sched_clock = read; - sched_clock_mask = (1 << bits) - 1; + sched_clock_mask = (1ULL << bits) - 1; cd.rate = rate; /* calculate the mult/shift to convert counter ticks to ns. */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e77edc97e036..adea6fc3ba2a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -148,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) } #ifdef CONFIG_NO_HZ_FULL -static cpumask_var_t nohz_full_mask; -bool have_nohz_full_mask; +cpumask_var_t tick_nohz_full_mask; +bool tick_nohz_full_running; static bool can_stop_full_tick(void) { @@ -182,7 +183,8 @@ static bool can_stop_full_tick(void) * Don't allow the user to think they can get * full NO_HZ with this machine. */ - WARN_ONCE(1, "NO_HZ FULL will not work with unstable sched clock"); + WARN_ONCE(tick_nohz_full_running, + "NO_HZ FULL will not work with unstable sched clock"); return false; } #endif @@ -196,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); * Re-evaluate the need for the tick on the current CPU * and restart it if necessary. */ -void tick_nohz_full_check(void) +void __tick_nohz_full_check(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); @@ -210,7 +212,7 @@ void tick_nohz_full_check(void) static void nohz_full_kick_work_func(struct irq_work *work) { - tick_nohz_full_check(); + __tick_nohz_full_check(); } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { @@ -229,7 +231,7 @@ void tick_nohz_full_kick(void) static void nohz_full_kick_ipi(void *info) { - tick_nohz_full_check(); + __tick_nohz_full_check(); } /* @@ -238,11 +240,11 @@ static void nohz_full_kick_ipi(void *info) */ void tick_nohz_full_kick_all(void) { - if (!have_nohz_full_mask) + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(nohz_full_mask, + smp_call_function_many(tick_nohz_full_mask, nohz_full_kick_ipi, NULL, false); preempt_enable(); } @@ -252,7 +254,7 @@ void tick_nohz_full_kick_all(void) * It might need the tick due to per task/process properties: * perf events, posix cpu timers, ... */ -void tick_nohz_task_switch(struct task_struct *tsk) +void __tick_nohz_task_switch(struct task_struct *tsk) { unsigned long flags; @@ -268,31 +270,23 @@ out: local_irq_restore(flags); } -int tick_nohz_full_cpu(int cpu) -{ - if (!have_nohz_full_mask) - return 0; - - return cpumask_test_cpu(cpu, nohz_full_mask); -} - /* Parse the boot-time nohz CPU list from the kernel parameters. */ static int __init tick_nohz_full_setup(char *str) { int cpu; - alloc_bootmem_cpumask_var(&nohz_full_mask); - if (cpulist_parse(str, nohz_full_mask) < 0) { + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); return 1; } cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, nohz_full_mask)) { + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - have_nohz_full_mask = true; + tick_nohz_full_running = true; return 1; } @@ -310,7 +304,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, * If we handle the timekeeping duty for full dynticks CPUs, * we can't safely shutdown that CPU. */ - if (have_nohz_full_mask && tick_do_timer_cpu == cpu) + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; break; } @@ -329,14 +323,14 @@ static int tick_nohz_init_all(void) int err = -1; #ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { + if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; - cpumask_setall(nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); - have_nohz_full_mask = true; + cpumask_setall(tick_nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + tick_nohz_full_running = true; #endif return err; } @@ -345,17 +339,18 @@ void __init tick_nohz_init(void) { int cpu; - if (!have_nohz_full_mask) { + if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) return; } + for_each_cpu(cpu, tick_nohz_full_mask) + context_tracking_cpu_set(cpu); + cpu_notifier(tick_nohz_cpu_down_callback, 0); - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); } -#else -#define have_nohz_full_mask (0) #endif /* @@ -733,7 +728,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (have_nohz_full_mask) { + if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around