From 47c95a46d0fae07762f0a38aa3709ae63f307048 Mon Sep 17 00:00:00 2001 From: Bin Gao Date: Tue, 15 Nov 2016 12:27:21 -0800 Subject: [PATCH 01/19] x86/tsc: Add X86_FEATURE_TSC_KNOWN_FREQ flag The X86_FEATURE_TSC_RELIABLE flag in Linux kernel implies both reliable (at runtime) and trustable (at calibration). But reliable running and trustable calibration independent of each other. Add a new flag X86_FEATURE_TSC_KNOWN_FREQ, which denotes that the frequency is known (via MSR/CPUID). This flag is only meant to skip the long term calibration on systems which have a known frequency. Add X86_FEATURE_TSC_KNOWN_FREQ to the skip the delayed calibration and leave X86_FEATURE_TSC_RELIABLE in place. After converting the existing users of X86_FEATURE_TSC_RELIABLE to use either both flags or just X86_FEATURE_TSC_KNOWN_FREQ we can seperate the functionality. Suggested-by: Thomas Gleixner Signed-off-by: Bin Gao Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1479241644-234277-2-git-send-email-bin.gao@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/tsc.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a39629206864..7f6a5f88d5ae 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,6 +106,7 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46b2f41f8b05..d2c4ee4e4866 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1283,10 +1283,15 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP; /* - * Trust the results of the earlier calibration on systems - * exporting a reliable TSC. + * When TSC frequency is known (retrieved via MSR or CPUID), we skip + * the refined calibration and directly register it as a clocksource. + * + * We still keep the TSC_RELIABLE flag here to avoid regressions - + * it will be removed after all the conversion for other code paths + * connected to this flag is done. */ - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE) || + boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } From 4ca4df0b7eb06df264b2919759957f6d6ea1822e Mon Sep 17 00:00:00 2001 From: Bin Gao Date: Tue, 15 Nov 2016 12:27:22 -0800 Subject: [PATCH 02/19] x86/tsc: Mark TSC frequency determined by CPUID as known CPUs/SoCs with CPUID leaf 0x15 come with a known frequency and will report the frequency to software via CPUID instruction. This hardware provided frequency is the "real" frequency of TSC. Set the X86_FEATURE_TSC_KNOWN_FREQ flag for such systems to skip the software calibration process. A 24 hours test on one of the CPUID 0x15 capable platforms was conducted. PIT calibrated frequency resulted in more than 3 seconds drift whereas the CPUID determined frequency showed less than 0.5 second drift. 
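For reference, the leaf 0x15 arithmetic used here can be reproduced from user space. A minimal sketch (not the kernel code), assuming GCC/Clang's <cpuid.h> helper and a CPU that enumerates both the TSC/crystal ratio and the crystal frequency (ECX may legitimately read 0 on some parts, in which case the kernel falls back to other means):

  #include <stdio.h>
  #include <cpuid.h>   /* __get_cpuid(), GCC/Clang wrapper around CPUID */

  int main(void)
  {
      unsigned int eax_denominator, ebx_numerator, ecx_crystal_hz, edx;

      /* CPUID.15H: EBX/EAX = TSC to "core crystal clock" ratio,
       * ECX = crystal frequency in Hz (0 if not enumerated). */
      if (!__get_cpuid(0x15, &eax_denominator, &ebx_numerator,
                       &ecx_crystal_hz, &edx))
          return 1;

      if (!eax_denominator || !ebx_numerator || !ecx_crystal_hz)
          return 1;

      printf("TSC frequency: %llu kHz\n",
             (unsigned long long)ecx_crystal_hz / 1000 *
             ebx_numerator / eax_denominator);
      return 0;
  }
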
Signed-off-by: Bin Gao Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1479241644-234277-3-git-send-email-bin.gao@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index d2c4ee4e4866..e58c31959666 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -702,6 +702,13 @@ unsigned long native_calibrate_tsc(void) } } + /* + * TSC frequency determined by CPUID is a "hardware reported" + * frequency and is the most accurate one so far we have. This + * is considered a known frequency. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + return crystal_khz * ebx_numerator / eax_denominator; } From 4635fdc696a8e89eead3ea1712ae6ada38538d40 Mon Sep 17 00:00:00 2001 From: Bin Gao Date: Tue, 15 Nov 2016 12:27:23 -0800 Subject: [PATCH 03/19] x86/tsc: Mark Intel ATOM_GOLDMONT TSC reliable On Intel GOLDMONT Atom SoC TSC is the only available clocksource, so there is no way to do software calibration or have a watchdog clocksource for it. Software calibration is already disabled via the TSC_KNOWN_FREQ flag, but the watchdog requirement still persists, so such systems cannot switch to high resolution/nohz mode. Mark it reliable, so it becomes usable. Hardware teams confirmed that this is safe on that SoC. Signed-off-by: Bin Gao Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1479241644-234277-4-git-send-email-bin.gao@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e58c31959666..f4dfdaa6633c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -709,6 +709,13 @@ unsigned long native_calibrate_tsc(void) */ setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + /* + * For Atom SoCs TSC is the only reliable clocksource. + * Mark TSC reliable so no watchdog on it. + */ + if (boot_cpu_data.x86_model == INTEL_FAM6_ATOM_GOLDMONT) + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return crystal_khz * ebx_numerator / eax_denominator; } From f3a02ecebed7df7d5d68898628dea7a3bfcf03e3 Mon Sep 17 00:00:00 2001 From: Bin Gao Date: Tue, 15 Nov 2016 12:27:24 -0800 Subject: [PATCH 04/19] x86/tsc: Set TSC_KNOWN_FREQ and TSC_RELIABLE flags on Intel Atom SoCs TSC on Intel Atom SoCs capable of determining TSC frequency by MSR is reliable and the frequency is known (provided by HW). On these platforms PIT/HPET is generally not available so calibration won't work at all and there is no other clocksource to act as a watchdog for the TSC, so we have no other choice than to trust it. Set both X86_FEATURE_TSC_KNOWN_FREQ and X86_FEATURE_TSC_RELIABLE flags to make sure the calibration is skipped and no watchdog is required. 
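Whether these forced capability bits took effect on a given system can be checked from user space, since flags set via setup_force_cpu_cap() are exported in /proc/cpuinfo. A small sketch, assuming the two features are rendered there as "tsc_known_freq" and "tsc_reliable" (the usual lowercase spelling of the cpufeatures entries):

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      char line[4096];
      FILE *f = fopen("/proc/cpuinfo", "r");

      if (!f)
          return 1;

      /* Look at the first "flags" line only; all CPUs share forced caps. */
      while (fgets(line, sizeof(line), f)) {
          if (strncmp(line, "flags", 5))
              continue;
          printf("tsc_known_freq: %s\n",
                 strstr(line, " tsc_known_freq") ? "yes" : "no");
          printf("tsc_reliable:   %s\n",
                 strstr(line, " tsc_reliable") ? "yes" : "no");
          break;
      }
      fclose(f);
      return 0;
  }
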
Signed-off-by: Bin Gao Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1479241644-234277-5-git-send-email-bin.gao@linux.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_msr.c | 19 +++++++++++++++++++ arch/x86/platform/intel-mid/mfld.c | 9 +++++++-- arch/x86/platform/intel-mid/mrfld.c | 8 ++++++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 0fe720d64fef..19afdbd7d0a7 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -100,5 +100,24 @@ unsigned long cpu_khz_from_msr(void) #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; #endif + + /* + * TSC frequency determined by MSR is always considered "known" + * because it is reported by HW. + * Another fact is that on MSR capable platforms, PIT/HPET is + * generally not available so calibration won't work at all. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + + /* + * Unfortunately there is no way for hardware to tell whether the + * TSC is reliable. We were told by silicon design team that TSC + * on Atom SoCs are always "reliable". TSC is also the only + * reliable clocksource on these SoCs (HPET is either not present + * or not functional) so mark TSC reliable which removes the + * requirement for a watchdog clocksource. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); + return res; } diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index 1eb47b6298c2..e793fe509971 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -49,8 +49,13 @@ static unsigned long __init mfld_calibrate_tsc(void) fast_calibrate = ratio * fsb; pr_debug("read penwell tsc %lu khz\n", fast_calibrate); lapic_timer_frequency = fsb * 1000 / HZ; - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index 59253db41bbc..e0607c77a1bd 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -78,8 +78,12 @@ static unsigned long __init tangier_calibrate_tsc(void) pr_debug("Setting lapic_timer_frequency = %d\n", lapic_timer_frequency); - /* mark tsc clocksource as reliable */ - set_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC_RELIABLE); + /* + * TSC on Intel Atom SoCs is reliable and of known frequency. + * See tsc_msr.c for details. + */ + setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); return fast_calibrate; } From 984fecebda3b9c8e3d75f8492593da71c58972b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Nov 2016 10:38:09 +0100 Subject: [PATCH 05/19] x86/tsc: Finalize the split of the TSC_RELIABLE flag All places which used the TSC_RELIABLE to skip the delayed calibration have been converted to use the TSC_KNOWN_FREQ flag. Make the immeditate clocksource registration, which skips the long term calibration, solely depend on TSC_KNOWN_FREQ. The TSC_RELIABLE now merily removes the requirement for a watchdog clocksource. 
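Whether the TSC actually ended up registered as the current clocksource, and whether a watchdog later demoted it, is visible through the standard clocksource attributes in sysfs. A small sketch that just prints them:

  #include <stdio.h>

  static void show(const char *path)
  {
      char buf[256];
      FILE *f = fopen(path, "r");

      if (f && fgets(buf, sizeof(buf), f))
          printf("%s: %s", path, buf);   /* file contents end with '\n' */
      if (f)
          fclose(f);
  }

  int main(void)
  {
      show("/sys/devices/system/clocksource/clocksource0/current_clocksource");
      show("/sys/devices/system/clocksource/clocksource0/available_clocksource");
      return 0;
  }
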
Signed-off-by: Thomas Gleixner Cc: Bin Gao Cc: Peter Zijlstra --- arch/x86/kernel/tsc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index f4dfdaa6633c..0ff1ec61d1e4 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1299,13 +1299,8 @@ static int __init init_tsc_clocksource(void) /* * When TSC frequency is known (retrieved via MSR or CPUID), we skip * the refined calibration and directly register it as a clocksource. - * - * We still keep the TSC_RELIABLE flag here to avoid regressions - - * it will be removed after all the conversion for other code paths - * connected to this flag is done. */ - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE) || - boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { + if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) { clocksource_register_khz(&clocksource_tsc, tsc_khz); return 0; } From 7b3d2f6e08ed5eb6bcf6912938f7a542405f8e8e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:33 +0000 Subject: [PATCH 06/19] x86/tsc: Use X86_FEATURE_TSC_ADJUST in detect_art() The art detection uses rdmsrl_safe() to detect the availablity of the TSC_ADJUST MSR. That's pointless because we have a feature bit for this. Use it. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.483561692@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 0ff1ec61d1e4..2b27c5ae9d1f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1057,17 +1057,19 @@ static void detect_art(void) if (boot_cpu_data.cpuid_level < ART_CPUID_LEAF) return; + /* Don't enable ART in a VM, non-stop TSC and TSC_ADJUST required */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || + !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || + !boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + cpuid(ART_CPUID_LEAF, &art_to_tsc_denominator, &art_to_tsc_numerator, unused, unused+1); - /* Don't enable ART in a VM, non-stop TSC required */ - if (boot_cpu_has(X86_FEATURE_HYPERVISOR) || - !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) || - art_to_tsc_denominator < ART_MIN_DENOMINATOR) + if (art_to_tsc_denominator < ART_MIN_DENOMINATOR) return; - if (rdmsrl_safe(MSR_IA32_TSC_ADJUST, &art_to_tsc_offset)) - return; + rdmsrl(MSR_IA32_TSC_ADJUST, art_to_tsc_offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); From bec8520dca0d27c1ddac703f9d0a78275ca2603e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:35 +0000 Subject: [PATCH 07/19] x86/tsc: Detect random warps If time warps can be observed then they should only ever be observed on one CPU. If they are observed on both CPUs then the system is completely hosed. Add a check for this condition and notify if it happens. 
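The distinction can be illustrated with a simplified user-space analogue of check_tsc_warp(): two threads take turns publishing TSC readings under a lock; a warp seen by only one thread counts as a normal warp, while warps showing up on both sides bump random_warps. This is only a sketch — unlike the kernel code it does no CPU pinning and no rdtsc serialization:

  #include <stdio.h>
  #include <pthread.h>
  #include <x86intrin.h>   /* __rdtsc() */

  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
  static unsigned long long last_tsc, max_warp;
  static int nr_warps, random_warps;

  static void *warp_loop(void *arg)
  {
      int cur_warps = 0;

      for (long i = 0; i < 20000000; i++) {
          pthread_mutex_lock(&lock);
          unsigned long long prev = last_tsc;
          unsigned long long now = __rdtsc();
          last_tsc = now;
          if (prev > now) {
              if (max_warp < prev - now)
                  max_warp = prev - now;
              /* nr_warps changed since our last warp: the other
               * thread warped as well -> random warp */
              if (cur_warps != nr_warps)
                  random_warps++;
              nr_warps++;
              cur_warps = nr_warps;
          }
          pthread_mutex_unlock(&lock);
      }
      return NULL;
  }

  int main(void)
  {
      pthread_t t;

      pthread_create(&t, NULL, warp_loop, NULL);
      warp_loop(NULL);
      pthread_join(t, NULL);
      printf("warps=%d random=%d max=%llu\n", nr_warps, random_warps, max_warp);
      return 0;
  }

Build with -pthread; for a meaningful run the two threads would additionally be pinned to different CPUs, e.g. with pthread_setaffinity_np().
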
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.574838461@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 78083bf23ed1..40f8edd55151 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -37,6 +37,7 @@ static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED; static cycles_t last_tsc; static cycles_t max_warp; static int nr_warps; +static int random_warps; /* * TSC-warp measurement loop running on both CPUs. This is not called @@ -45,7 +46,7 @@ static int nr_warps; static void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; - int i; + int i, cur_warps = 0; start = rdtsc_ordered(); /* @@ -85,7 +86,14 @@ static void check_tsc_warp(unsigned int timeout) if (unlikely(prev > now)) { arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); + /* + * Check whether this bounces back and forth. Only + * one CPU should observe time going backwards. + */ + if (cur_warps != nr_warps) + random_warps++; nr_warps++; + cur_warps = nr_warps; arch_spin_unlock(&sync_lock); } } @@ -160,6 +168,8 @@ void check_tsc_sync_source(int cpu) smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); + if (random_warps) + pr_warning("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); } else { pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", @@ -170,6 +180,7 @@ void check_tsc_sync_source(int cpu) * Reset it - just in case we boot another CPU later: */ atomic_set(&start_count, 0); + random_warps = 0; nr_warps = 0; max_warp = 0; last_tsc = 0; From 8b223bc7abe0e30e8d297a24ee6c6c07ef8d0bb9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:36 +0000 Subject: [PATCH 08/19] x86/tsc: Store and check TSC ADJUST MSR The TSC_ADJUST MSR shows whether the TSC has been modified. This is helpful in a two aspects: 1) It allows to detect BIOS wreckage, where SMM code tries to 'hide' the cycles spent by storing the TSC value at SMM entry and restoring it at SMM exit. On affected machines the TSCs run slowly out of sync up to the point where the clocksource watchdog (if available) detects it. The TSC_ADJUST MSR allows to detect the TSC modification before that and eventually restore it. This is also important for SoCs which have no watchdog clocksource and therefore TSC wreckage cannot be detected and acted upon. 2) All threads in a package are required to have the same TSC_ADJUST value. Broken BIOSes break that and as a result the TSC synchronization check fails. The TSC_ADJUST MSR allows to detect the deviation when a CPU comes online. If detected set it to the value of an already online CPU in the same package. This also allows to reduce the number of sync tests because with that in place the test is only required for the first CPU in a package. In principle all CPUs in a system should have the same TSC_ADJUST value even across packages, but with physical CPU hotplug this assumption is not true because the TSC starts with power on, so physical hotplug has to do some trickery to bring the TSC into sync with already running packages, which requires to use an TSC_ADJUST value different from CPUs which got powered earlier. 
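The per-CPU TSC_ADJUST values this series starts recording can also be inspected from user space through the msr driver. A sketch, assuming the msr module is loaded (modprobe msr), root privileges, and IA32_TSC_ADJUST being MSR 0x3b:

  #include <stdio.h>
  #include <stdint.h>
  #include <fcntl.h>
  #include <unistd.h>

  #define MSR_IA32_TSC_ADJUST 0x3b

  int main(void)
  {
      char path[64];
      int64_t val;

      for (int cpu = 0; ; cpu++) {
          snprintf(path, sizeof(path), "/dev/cpu/%d/msr", cpu);
          int fd = open(path, O_RDONLY);
          if (fd < 0)
              break;   /* no more CPUs (or msr module not loaded) */
          /* The MSR number is the file offset; reads are 8 bytes wide. */
          if (pread(fd, &val, sizeof(val), MSR_IA32_TSC_ADJUST) == sizeof(val))
              printf("CPU%d: TSC_ADJUST = %lld\n", cpu, (long long)val);
          close(fd);
      }
      return 0;
  }
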
A final enhancement is the opportunity to compensate for unsynced TSCs accross nodes at boot time and make the TSC usable that way. It won't help for TSCs which run apart due to frequency skew between packages, but this gets detected by the clocksource watchdog later. The first step toward this is to store the TSC_ADJUST value of a starting CPU and compare it with the value of an already online CPU in the same package. If they differ, emit a warning and adjust it to the reference value. The !SMP version just stores the boot value for later verification. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.655323776@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 6 +++ arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/tsc.c | 2 + arch/x86/kernel/tsc_sync.c | 88 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 97 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 33b6365c22fe..1ec0595867d9 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -48,6 +48,12 @@ extern int tsc_clocksource_reliable; extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); +#ifdef CONFIG_X86_TSC +extern void tsc_store_and_check_tsc_adjust(void); +#else +static inline void tsc_store_and_check_tsc_adjust(void) { } +#endif + extern int notsc_setup(char *); extern void tsc_save_sched_clock_state(void); extern void tsc_restore_sched_clock_state(void); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 79076d75bdbf..c0ac317dd372 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -75,7 +75,7 @@ apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SMP) += smpboot.o -obj-$(CONFIG_SMP) += tsc_sync.o +obj-$(CONFIG_X86_TSC) += tsc_sync.o obj-$(CONFIG_SMP) += setup_percpu.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-y += apic/ diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2b27c5ae9d1f..2bb8de4f3b39 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1379,6 +1379,8 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); + else + tsc_store_and_check_tsc_adjust(); check_system_tsc_reliable(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 40f8edd55151..bd2bd5e89d96 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -14,12 +14,95 @@ * ( The serial nature of the boot logic and the CPU hotplug lock * protects against more than 2 CPUs entering this code. 
) */ +#include #include #include #include #include #include +struct tsc_adjust { + s64 bootval; + s64 adjusted; +}; + +static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); + +#ifndef CONFIG_SMP +void __init tsc_store_and_check_tsc_adjust(void) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + cur->adjusted = bootval; + pr_info("TSC ADJUST: Boot CPU0: %lld\n", bootval); +} + +#else /* !CONFIG_SMP */ + +/* + * Store and check the TSC ADJUST MSR if available + */ +void tsc_store_and_check_tsc_adjust(void) +{ + struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + unsigned int refcpu, cpu = smp_processor_id(); + s64 bootval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + cur->bootval = bootval; + + /* + * Check whether this CPU is the first in a package to come up. In + * this case do not check the boot value against another package + * because the package might have been physically hotplugged, where + * TSC_ADJUST is expected to be different. + */ + refcpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); + + if (refcpu >= nr_cpu_ids) { + /* + * First online CPU in a package stores the boot value in + * the adjustment value. This value might change later via + * the sync mechanism. If that fails we still can yell + * about boot values not being consistent. + */ + cur->adjusted = bootval; + pr_info_once("TSC ADJUST: Boot CPU%u: %lld\n", cpu, bootval); + return; + } + + ref = per_cpu_ptr(&tsc_adjust, refcpu); + /* + * Compare the boot value and complain if it differs in the + * package. + */ + if (bootval != ref->bootval) { + pr_warn("TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->bootval, cpu, bootval); + } + /* + * The TSC_ADJUST values in a package must be the same. If the boot + * value on this newly upcoming CPU differs from the adjustment + * value of the already online CPU in this package, set it to that + * adjusted value. + */ + if (bootval != ref->adjusted) { + pr_warn("TSC ADJUST synchronize: Reference CPU%u: %lld CPU%u: %lld\n", + refcpu, ref->adjusted, cpu, bootval); + cur->adjusted = ref->adjusted; + wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + } +} + /* * Entry/exit counters that make sure that both CPUs * run the measurement code at once: @@ -202,6 +285,9 @@ void check_tsc_sync_target(void) if (unsynchronized_tsc() || tsc_clocksource_reliable) return; + /* Store and check the TSC ADJUST MSR */ + tsc_store_and_check_tsc_adjust(); + /* * Register this CPU's participation and wait for the * source CPU to start the measurement: @@ -223,3 +309,5 @@ void check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } + +#endif /* CONFIG_SMP */ From 1d0095feea591bbd94f35d8a98aed746319783e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:37 +0000 Subject: [PATCH 09/19] x86/tsc: Verify TSC_ADJUST from idle When entering idle, it's a good oportunity to verify that the TSC_ADJUST MSR has not been tampered with (BIOS hiding SMM cycles). If tampering is detected, emit a warning and restore it to the previous value. This is especially important for machines, which mark the TSC reliable because there is no watchdog clocksource available (SoCs). This is not sufficient for HPC (NOHZ_FULL) situations where a CPU never goes idle, but adding a timer to do the check periodically is not an option either. 
On a machine, which has this issue, the check triggeres right during boot, so there is a decent chance that the sysadmin will notice. Rate limit the check to once per second and warn only once per cpu. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.732180441@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 2 ++ arch/x86/kernel/process.c | 1 + arch/x86/kernel/tsc_sync.c | 37 +++++++++++++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 1ec0595867d9..b896e9ee65bc 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -50,8 +50,10 @@ extern void check_tsc_sync_target(void); #ifdef CONFIG_X86_TSC extern void tsc_store_and_check_tsc_adjust(void); +extern void tsc_verify_tsc_adjust(void); #else static inline void tsc_store_and_check_tsc_adjust(void) { } +static inline void tsc_verify_tsc_adjust(void) { } #endif extern int notsc_setup(char *); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0888a879120f..4fe5dc84da4f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -277,6 +277,7 @@ void exit_idle(void) void arch_cpu_idle_enter(void) { + tsc_verify_tsc_adjust(); local_touch_nmi(); enter_idle(); } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bd2bd5e89d96..f713e84d1cb4 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -22,12 +22,42 @@ #include struct tsc_adjust { - s64 bootval; - s64 adjusted; + s64 bootval; + s64 adjusted; + unsigned long nextcheck; + bool warned; }; static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); +void tsc_verify_tsc_adjust(void) +{ + struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); + s64 curval; + + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + return; + + /* Rate limit the MSR check */ + if (time_before(jiffies, adj->nextcheck)) + return; + + adj->nextcheck = jiffies + HZ; + + rdmsrl(MSR_IA32_TSC_ADJUST, curval); + if (adj->adjusted == curval) + return; + + /* Restore the original value */ + wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + + if (!adj->warned) { + pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", + smp_processor_id(), adj->adjusted, curval); + adj->warned = true; + } +} + #ifndef CONFIG_SMP void __init tsc_store_and_check_tsc_adjust(void) { @@ -40,6 +70,7 @@ void __init tsc_store_and_check_tsc_adjust(void) rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->adjusted = bootval; + cur->nextcheck = jiffies + HZ; pr_info("TSC ADJUST: Boot CPU0: %lld\n", bootval); } @@ -59,6 +90,8 @@ void tsc_store_and_check_tsc_adjust(void) rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; + cur->nextcheck = jiffies + HZ; + cur->warned = false; /* * Check whether this CPU is the first in a package to come up. In From a36f5136814b6a87601220927cb9ad9ecc731e92 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:39 +0000 Subject: [PATCH 10/19] x86/tsc: Sync test only for the first cpu in a package If the TSC_ADJUST MSR is available all CPUs in a package are forced to the same value. So TSCs cannot be out of sync when the first CPU in the package was in sync. That allows to skip the sync test for all CPUs except the first starting CPU in a package. 
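Which CPU counts as "first in a package" follows the core/package topology the kernel exposes in sysfs — the same information topology_core_cpumask() is built from. A sketch that maps CPUs to packages, assuming the standard topology attributes:

  #include <stdio.h>

  int main(void)
  {
      char path[128];
      int pkg;

      for (int cpu = 0; ; cpu++) {
          snprintf(path, sizeof(path),
                   "/sys/devices/system/cpu/cpu%d/topology/physical_package_id",
                   cpu);
          FILE *f = fopen(path, "r");
          if (!f)
              break;   /* ran out of CPUs */
          if (fscanf(f, "%d", &pkg) == 1)
              printf("CPU%d -> package %d\n", cpu, pkg);
          fclose(f);
      }
      return 0;
  }
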
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.809901363@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 4 ++-- arch/x86/kernel/tsc_sync.c | 37 ++++++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index b896e9ee65bc..04721d54d8d9 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -49,10 +49,10 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); #ifdef CONFIG_X86_TSC -extern void tsc_store_and_check_tsc_adjust(void); +extern bool tsc_store_and_check_tsc_adjust(void); extern void tsc_verify_tsc_adjust(void); #else -static inline void tsc_store_and_check_tsc_adjust(void) { } +static inline bool tsc_store_and_check_tsc_adjust(void) { } static inline void tsc_verify_tsc_adjust(void) { } #endif diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index f713e84d1cb4..fdb3b7befc47 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -59,19 +59,20 @@ void tsc_verify_tsc_adjust(void) } #ifndef CONFIG_SMP -void __init tsc_store_and_check_tsc_adjust(void) +bool __init tsc_store_and_check_tsc_adjust(void) { struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); s64 bootval; if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) - return; + return false; rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->adjusted = bootval; cur->nextcheck = jiffies + HZ; pr_info("TSC ADJUST: Boot CPU0: %lld\n", bootval); + return false; } #else /* !CONFIG_SMP */ @@ -79,14 +80,14 @@ void __init tsc_store_and_check_tsc_adjust(void) /* * Store and check the TSC ADJUST MSR if available */ -void tsc_store_and_check_tsc_adjust(void) +bool tsc_store_and_check_tsc_adjust(void) { struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); unsigned int refcpu, cpu = smp_processor_id(); s64 bootval; if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) - return; + return false; rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; @@ -110,7 +111,7 @@ void tsc_store_and_check_tsc_adjust(void) */ cur->adjusted = bootval; pr_info_once("TSC ADJUST: Boot CPU%u: %lld\n", cpu, bootval); - return; + return false; } ref = per_cpu_ptr(&tsc_adjust, refcpu); @@ -134,6 +135,11 @@ void tsc_store_and_check_tsc_adjust(void) cur->adjusted = ref->adjusted; wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); } + /* + * We have the TSCs forced to be in sync on this package. Skip sync + * test: + */ + return true; } /* @@ -142,6 +148,7 @@ void tsc_store_and_check_tsc_adjust(void) */ static atomic_t start_count; static atomic_t stop_count; +static atomic_t skip_test; /* * We use a raw spinlock in this exceptional case, because @@ -265,10 +272,16 @@ void check_tsc_sync_source(int cpu) atomic_set(&stop_count, 0); /* - * Wait for the target to arrive: + * Wait for the target to start or to skip the test: */ - while (atomic_read(&start_count) != cpus-1) + while (atomic_read(&start_count) != cpus - 1) { + if (atomic_read(&skip_test) > 0) { + atomic_set(&skip_test, 0); + return; + } cpu_relax(); + } + /* * Trigger the target to continue into the measurement too: */ @@ -318,8 +331,14 @@ void check_tsc_sync_target(void) if (unsynchronized_tsc() || tsc_clocksource_reliable) return; - /* Store and check the TSC ADJUST MSR */ - tsc_store_and_check_tsc_adjust(); + /* + * Store, verify and sanitize the TSC adjust register. 
If + * successful skip the test. + */ + if (tsc_store_and_check_tsc_adjust()) { + atomic_inc(&skip_test); + return; + } /* * Register this CPU's participation and wait for the From 4c5e3c63752162262c42424147f319b8571a20af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:40 +0000 Subject: [PATCH 11/19] x86/tsc: Move sync cleanup to a safe place Cleaning up the stop marker on the control CPU is wrong when we want to add retry support. Move the cleanup to the starting CPU. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.892095627@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index fdb3b7befc47..8f394eeb936e 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -266,11 +266,6 @@ void check_tsc_sync_source(int cpu) return; } - /* - * Reset it - in case this is a second bootup: - */ - atomic_set(&stop_count, 0); - /* * Wait for the target to start or to skip the test: */ @@ -360,6 +355,11 @@ void check_tsc_sync_target(void) */ while (atomic_read(&stop_count) != cpus) cpu_relax(); + + /* + * Reset it for the next sync test: + */ + atomic_set(&stop_count, 0); } #endif /* CONFIG_SMP */ From 76d3b85158509cafec5be7675a97ef80118e288e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:41 +0000 Subject: [PATCH 12/19] x86/tsc: Prepare warp test for TSC adjustment To allow TSC compensation cross nodes its necessary to know in which direction the TSC warp was observed. Return the maximum observed value on the calling CPU so the caller can determine the direction later. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134017.970859287@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 8f394eeb936e..9ad074c87e72 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -166,9 +166,9 @@ static int random_warps; * TSC-warp measurement loop running on both CPUs. This is not called * if there is no TSC. */ -static void check_tsc_warp(unsigned int timeout) +static cycles_t check_tsc_warp(unsigned int timeout) { - cycles_t start, now, prev, end; + cycles_t start, now, prev, end, cur_max_warp = 0; int i, cur_warps = 0; start = rdtsc_ordered(); @@ -209,6 +209,7 @@ static void check_tsc_warp(unsigned int timeout) if (unlikely(prev > now)) { arch_spin_lock(&sync_lock); max_warp = max(max_warp, prev - now); + cur_max_warp = max_warp; /* * Check whether this bounces back and forth. Only * one CPU should observe time going backwards. @@ -223,6 +224,7 @@ static void check_tsc_warp(unsigned int timeout) WARN(!(now-start), "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); + return cur_max_warp; } /* From cc4db26899dcd0e6ff0448c77abd8eb61b1a1333 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 19 Nov 2016 13:47:43 +0000 Subject: [PATCH 13/19] x86/tsc: Try to adjust TSC if sync test fails If the first CPU of a package comes online, it is necessary to test whether the TSC is in sync with a CPU on some other package. 
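The cost of losing the TSC clocksource, described in the next paragraph, is easy to quantify: clock_gettime(CLOCK_MONOTONIC) reads the current clocksource, so timing it with "tsc" and then "hpet" selected in /sys/devices/system/clocksource/clocksource0/current_clocksource shows the difference directly. A small benchmark sketch (absolute numbers vary by machine):

  #include <stdio.h>
  #include <time.h>

  int main(void)
  {
      struct timespec ts, start, end;
      const long loops = 1000000;

      clock_gettime(CLOCK_MONOTONIC, &start);
      for (long i = 0; i < loops; i++)
          clock_gettime(CLOCK_MONOTONIC, &ts);
      clock_gettime(CLOCK_MONOTONIC, &end);

      long long ns = (end.tv_sec - start.tv_sec) * 1000000000LL +
                     (end.tv_nsec - start.tv_nsec);
      printf("%lld ns per clock_gettime() call\n", ns / loops);
      return 0;
  }

With the TSC the call typically stays on the vDSO fast path; with HPET it ends up doing an MMIO read, which is an order of magnitude (or more) slower.
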
When a deviation is observed (time going backwards between the two CPUs) the TSC is marked unstable, which is a problem on large machines as they have to fall back to the HPET clocksource, which is insanely slow. It has been attempted to compensate the TSC by adding the offset to the TSC and writing it back some time ago, but this never was merged because it did not turn out to be stable, especially not on older systems. Modern systems have become more stable in that regard and the TSC_ADJUST MSR allows us to compensate for the time deviation in a sane way. If it's available allow up to three synchronization runs and if a time warp is detected the starting CPU can compensate the time warp via the TSC_ADJUST MSR and retry. If the third run still shows a deviation or when random time warps are detected the test terminally fails. Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Yinghai Lu Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161119134018.048237517@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 83 +++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9ad074c87e72..d7f48a640ff5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -149,6 +149,7 @@ bool tsc_store_and_check_tsc_adjust(void) static atomic_t start_count; static atomic_t stop_count; static atomic_t skip_test; +static atomic_t test_runs; /* * We use a raw spinlock in this exceptional case, because @@ -268,6 +269,16 @@ void check_tsc_sync_source(int cpu) return; } + /* + * Set the maximum number of test runs to + * 1 if the CPU does not provide the TSC_ADJUST MSR + * 3 if the MSR is available, so the target can try to adjust + */ + if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) + atomic_set(&test_runs, 1); + else + atomic_set(&test_runs, 3); +retry: /* * Wait for the target to start or to skip the test: */ @@ -289,7 +300,21 @@ void check_tsc_sync_source(int cpu) while (atomic_read(&stop_count) != cpus-1) cpu_relax(); - if (nr_warps) { + /* + * If the test was successful set the number of runs to zero and + * stop. If not, decrement the number of runs an check if we can + * retry. In case of random warps no retry is attempted. + */ + if (!nr_warps) { + atomic_set(&test_runs, 0); + + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); + + } else if (atomic_dec_and_test(&test_runs) || random_warps) { + /* Force it to 0 if random warps brought us here */ + atomic_set(&test_runs, 0); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " @@ -297,9 +322,6 @@ void check_tsc_sync_source(int cpu) if (random_warps) pr_warning("TSC warped randomly between CPUs\n"); mark_tsc_unstable("check_tsc_sync_source failed"); - } else { - pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", - smp_processor_id(), cpu); } /* @@ -315,6 +337,12 @@ void check_tsc_sync_source(int cpu) * Let the target continue with the bootup: */ atomic_inc(&stop_count); + + /* + * Retry, if there is a chance to do so. + */ + if (atomic_read(&test_runs) > 0) + goto retry; } /* @@ -322,6 +350,9 @@ void check_tsc_sync_source(int cpu) */ void check_tsc_sync_target(void) { + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); + unsigned int cpu = smp_processor_id(); + cycles_t cur_max_warp, gbl_max_warp; int cpus = 2; /* Also aborts if there is no TSC. 
*/ @@ -337,6 +368,7 @@ void check_tsc_sync_target(void) return; } +retry: /* * Register this CPU's participation and wait for the * source CPU to start the measurement: @@ -345,7 +377,12 @@ void check_tsc_sync_target(void) while (atomic_read(&start_count) != cpus) cpu_relax(); - check_tsc_warp(loop_timeout(smp_processor_id())); + cur_max_warp = check_tsc_warp(loop_timeout(cpu)); + + /* + * Store the maximum observed warp value for a potential retry: + */ + gbl_max_warp = max_warp; /* * Ok, we are done: @@ -362,6 +399,42 @@ void check_tsc_sync_target(void) * Reset it for the next sync test: */ atomic_set(&stop_count, 0); + + /* + * Check the number of remaining test runs. If not zero, the test + * failed and a retry with adjusted TSC is possible. If zero the + * test was either successful or failed terminally. + */ + if (!atomic_read(&test_runs)) + return; + + /* + * If the warp value of this CPU is 0, then the other CPU + * observed time going backwards so this TSC was ahead and + * needs to move backwards. + */ + if (!cur_max_warp) + cur_max_warp = -gbl_max_warp; + + /* + * Add the result to the previous adjustment value. + * + * The adjustement value is slightly off by the overhead of the + * sync mechanism (observed values are ~200 TSC cycles), but this + * really depends on CPU, node distance and frequency. So + * compensating for this is hard to get right. Experiments show + * that the warp is not longer detectable when the observed warp + * value is used. In the worst case the adjustment needs to go + * through a 3rd run for fine tuning. + */ + cur->adjusted += cur_max_warp; + + pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", + cpu, cur_max_warp, cur->adjusted); + + wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + goto retry; + } #endif /* CONFIG_SMP */ From b836554386cc77f31ab43a8492a2587e0c51d51e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 29 Nov 2016 20:28:31 +0100 Subject: [PATCH 14/19] x86/tsc: Fix broken CONFIG_X86_TSC=n build Add the missing return statement to the inline stub tsc_store_and_check_tsc_adjust() and add the other stubs to make a SMP=y,TSC=n build happy. While at it, remove the unused variable from the UP variant of tsc_store_and_check_tsc_adjust(). 
Fixes: commit ba75fb646931 ("x86/tsc: Sync test only for the first cpu in a package") Reported-by: kbuild test robot Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 9 +++++---- arch/x86/kernel/tsc_sync.c | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 04721d54d8d9..c054eaa7dc94 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -45,15 +45,16 @@ extern int tsc_clocksource_reliable; * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: */ -extern void check_tsc_sync_source(int cpu); -extern void check_tsc_sync_target(void); - #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(void); extern void tsc_verify_tsc_adjust(void); +extern void check_tsc_sync_source(int cpu); +extern void check_tsc_sync_target(void); #else -static inline bool tsc_store_and_check_tsc_adjust(void) { } +static inline bool tsc_store_and_check_tsc_adjust(void) { return false; } static inline void tsc_verify_tsc_adjust(void) { } +static inline void check_tsc_sync_source(int cpu) { } +static inline void check_tsc_sync_target(void) { } #endif extern int notsc_setup(char *); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index d7f48a640ff5..8fde44f8cfec 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -61,7 +61,7 @@ void tsc_verify_tsc_adjust(void) #ifndef CONFIG_SMP bool __init tsc_store_and_check_tsc_adjust(void) { - struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); + struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); s64 bootval; if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) From 31f8a651fc5784a9e6f482be5ef0dd111a535e88 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2016 13:26:58 +0100 Subject: [PATCH 15/19] x86/tsc: Validate cpumask pointer before accessing it 0-day testing encountered a NULL pointer dereference in a cpumask access from tsc_store_and_check_tsc_adjust(). This happens when the function is called on the boot CPU and the topology masks are not yet available due to CPUMASK_OFFSTACK=y. Add a NULL pointer check for the mask pointer. If NULL it's safe to assume that the CPU is the boot CPU and the first one in the package. Fixes: 8b223bc7abe0 ("x86/tsc: Store and check TSC ADJUST MSR") Reported-by: kernel test robot Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc_sync.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 8fde44f8cfec..a75f696011d5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -84,6 +84,7 @@ bool tsc_store_and_check_tsc_adjust(void) { struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); unsigned int refcpu, cpu = smp_processor_id(); + struct cpumask *mask; s64 bootval; if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) @@ -98,9 +99,11 @@ bool tsc_store_and_check_tsc_adjust(void) * Check whether this CPU is the first in a package to come up. In * this case do not check the boot value against another package * because the package might have been physically hotplugged, where - * TSC_ADJUST is expected to be different. + * TSC_ADJUST is expected to be different. When called on the boot + * CPU topology_core_cpumask() might not be available yet. */ - refcpu = cpumask_any_but(topology_core_cpumask(cpu), cpu); + mask = topology_core_cpumask(cpu); + refcpu = mask ? 
cpumask_any_but(mask, cpu) : nr_cpu_ids; if (refcpu >= nr_cpu_ids) { /* From 6a369583178d0b89c2c3919c4456ee22fee0f249 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Dec 2016 13:14:17 +0000 Subject: [PATCH 16/19] x86/tsc: Validate TSC_ADJUST after resume Some 'feature' BIOSes fiddle with the TSC_ADJUST register during suspend/resume which renders the TSC unusable. Add sanity checks into the resume path and restore the original value if it was adjusted. Reported-and-tested-by: Roland Scheidegger Signed-off-by: Thomas Gleixner Cc: Bruce Schlobohm Cc: Kevin Stanton Cc: Peter Zijlstra Cc: Allen Hung Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161213131211.317654500@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 4 ++-- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/tsc.c | 6 ++++++ arch/x86/kernel/tsc_sync.c | 6 +++--- arch/x86/power/cpu.c | 1 + 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c054eaa7dc94..372ad0cd1357 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -47,12 +47,12 @@ extern int tsc_clocksource_reliable; */ #ifdef CONFIG_X86_TSC extern bool tsc_store_and_check_tsc_adjust(void); -extern void tsc_verify_tsc_adjust(void); +extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); #else static inline bool tsc_store_and_check_tsc_adjust(void) { return false; } -static inline void tsc_verify_tsc_adjust(void) { } +static inline void tsc_verify_tsc_adjust(bool resume) { } static inline void check_tsc_sync_source(int cpu) { } static inline void check_tsc_sync_target(void) { } #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4fe5dc84da4f..a67e0f0cdaab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -277,7 +277,7 @@ void exit_idle(void) void arch_cpu_idle_enter(void) { - tsc_verify_tsc_adjust(); + tsc_verify_tsc_adjust(false); local_touch_nmi(); enter_idle(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 2bb8de4f3b39..bfb541a5bb48 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1080,6 +1080,11 @@ static void detect_art(void) static struct clocksource clocksource_tsc; +static void tsc_resume(struct clocksource *cs) +{ + tsc_verify_tsc_adjust(true); +} + /* * We used to compare the TSC to the cycle_last value in the clocksource * structure to avoid a nasty time-warp. 
This can be observed in a @@ -1112,6 +1117,7 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, + .resume = tsc_resume, }; void mark_tsc_unstable(char *reason) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index a75f696011d5..94f2ce5fb159 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -30,7 +30,7 @@ struct tsc_adjust { static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust); -void tsc_verify_tsc_adjust(void) +void tsc_verify_tsc_adjust(bool resume) { struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust); s64 curval; @@ -39,7 +39,7 @@ void tsc_verify_tsc_adjust(void) return; /* Rate limit the MSR check */ - if (time_before(jiffies, adj->nextcheck)) + if (!resume && time_before(jiffies, adj->nextcheck)) return; adj->nextcheck = jiffies + HZ; @@ -51,7 +51,7 @@ void tsc_verify_tsc_adjust(void) /* Restore the original value */ wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); - if (!adj->warned) { + if (!adj->warned || resume) { pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n", smp_processor_id(), adj->adjusted, curval); adj->warned = true; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 53cace2ec0e2..66ade16c7693 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -252,6 +252,7 @@ static void notrace __restore_processor_state(struct saved_context *ctxt) fix_processor_context(); do_fpu_end(); + tsc_verify_tsc_adjust(true); x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); perf_restore_debug_store(); From 5bae156241e05d25171b18ee43e49f103c3f8097 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 13 Dec 2016 13:14:17 +0000 Subject: [PATCH 17/19] x86/tsc: Force TSC_ADJUST register to value >= zero Roland reported that his DELL T5810 sports a value add BIOS which completely wreckages the TSC. The squirmware [(TM) Ingo Molnar] boots with random negative TSC_ADJUST values, different on all CPUs. That renders the TSC useless because the sycnchronization check fails. Roland tested the new TSC_ADJUST mechanism. While it manages to readjust the TSCs he needs to disable the TSC deadline timer, otherwise the machine just stops booting. Deeper investigation unearthed that the TSC deadline timer is sensitive to the TSC_ADJUST value. Writing TSC_ADJUST to a negative value results in an interrupt storm caused by the TSC deadline timer. This does not make any sense and it's hard to imagine what kind of hardware wreckage is behind that misfeature, but it's reliably reproducible on other systems which have TSC_ADJUST and TSC deadline timer. While it would be understandable that a big enough negative value which moves the resulting TSC readout into the negative space could have the described effect, this happens even with a adjust value of -1, which keeps the TSC readout definitely in the positive space. The compare register for the TSC deadline timer is set to a positive value larger than the TSC, but despite not having reached the deadline the interrupt is raised immediately. If this happens on the boot CPU, then the machine dies silently because this setup happens before the NMI watchdog is armed. Further experiments showed that any other adjustment of TSC_ADJUST works as expected as long as it stays in the positive range. The direction of the adjustment has no influence either. See the lkml link for further analysis. 
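The "readout stays positive even with -1" point is plain two's-complement arithmetic: conceptually the value returned by RDTSC is the raw counter plus the signed TSC_ADJUST offset, modulo 2^64. A tiny illustration with made-up numbers:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
      uint64_t raw_tsc = 123456789000ULL;   /* illustrative raw counter value */
      int64_t adjust  = -1;                 /* IA32_TSC_ADJUST is signed 64-bit */

      /* What RDTSC would return: raw counter + adjust, mod 2^64 */
      uint64_t readout = raw_tsc + (uint64_t)adjust;
      printf("readout = %llu\n", (unsigned long long)readout);
      return 0;
  }

The readout is merely one cycle behind and nowhere near wrapping, yet the deadline timer still misbehaves — which is what makes the observed behaviour so inexplicable.
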
Yet another proof for the theory that timers are designed by janitors and the underlying (obviously undocumented) mechanisms which allow BIOSes to wreckage them are considered a feature. Well done Intel - NOT! To address this wreckage add the following sanity measures: - If the TSC_ADJUST value on the boot cpu is not 0, set it to 0 - If the TSC_ADJUST value on any cpu is negative, set it to 0 - Prevent the cross package synchronization mechanism from setting negative TSC_ADJUST values. Reported-and-tested-by: Roland Scheidegger Signed-off-by: Thomas Gleixner Cc: Bruce Schlobohm Cc: Kevin Stanton Cc: Peter Zijlstra Cc: Allen Hung Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20161213131211.397588033@linutronix.de Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/tsc.h | 4 +-- arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/tsc_sync.c | 55 +++++++++++++++++++++++++++----------- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 372ad0cd1357..abb1fdcc545a 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -46,12 +46,12 @@ extern int tsc_clocksource_reliable; * all CPUs/cores: */ #ifdef CONFIG_X86_TSC -extern bool tsc_store_and_check_tsc_adjust(void); +extern bool tsc_store_and_check_tsc_adjust(bool bootcpu); extern void tsc_verify_tsc_adjust(bool resume); extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); #else -static inline bool tsc_store_and_check_tsc_adjust(void) { return false; } +static inline bool tsc_store_and_check_tsc_adjust(bool bootcpu) { return false; } static inline void tsc_verify_tsc_adjust(bool resume) { } static inline void check_tsc_sync_source(int cpu) { } static inline void check_tsc_sync_target(void) { } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index bfb541a5bb48..0aed75a1e31b 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1386,7 +1386,7 @@ void __init tsc_init(void) if (unsynchronized_tsc()) mark_tsc_unstable("TSCs unsynchronized"); else - tsc_store_and_check_tsc_adjust(); + tsc_store_and_check_tsc_adjust(true); check_system_tsc_reliable(); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 94f2ce5fb159..9151f0ce6a42 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -58,8 +58,33 @@ void tsc_verify_tsc_adjust(bool resume) } } +static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, + unsigned int cpu, bool bootcpu) +{ + /* + * First online CPU in a package stores the boot value in the + * adjustment value. This value might change later via the sync + * mechanism. If that fails we still can yell about boot values not + * being consistent. + * + * On the boot cpu we just force set the ADJUST value to 0 if it's + * non zero. We don't do that on non boot cpus because physical + * hotplug should have set the ADJUST register to a value > 0 so + * the TSC is in sync with the already running cpus. + * + * But we always force positive ADJUST values. Otherwise the TSC + * deadline timer creates an interrupt storm. Sigh! 
+ */ + if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0)) { + pr_warn("TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); + wrmsrl(MSR_IA32_TSC_ADJUST, 0); + bootval = 0; + } + cur->adjusted = bootval; +} + #ifndef CONFIG_SMP -bool __init tsc_store_and_check_tsc_adjust(void) +bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) { struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust); s64 bootval; @@ -69,9 +94,8 @@ bool __init tsc_store_and_check_tsc_adjust(void) rdmsrl(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; - cur->adjusted = bootval; cur->nextcheck = jiffies + HZ; - pr_info("TSC ADJUST: Boot CPU0: %lld\n", bootval); + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); return false; } @@ -80,7 +104,7 @@ bool __init tsc_store_and_check_tsc_adjust(void) /* * Store and check the TSC ADJUST MSR if available */ -bool tsc_store_and_check_tsc_adjust(void) +bool tsc_store_and_check_tsc_adjust(bool bootcpu) { struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust); unsigned int refcpu, cpu = smp_processor_id(); @@ -98,22 +122,16 @@ bool tsc_store_and_check_tsc_adjust(void) /* * Check whether this CPU is the first in a package to come up. In * this case do not check the boot value against another package - * because the package might have been physically hotplugged, where - * TSC_ADJUST is expected to be different. When called on the boot - * CPU topology_core_cpumask() might not be available yet. + * because the new package might have been physically hotplugged, + * where TSC_ADJUST is expected to be different. When called on the + * boot CPU topology_core_cpumask() might not be available yet. */ mask = topology_core_cpumask(cpu); refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids; if (refcpu >= nr_cpu_ids) { - /* - * First online CPU in a package stores the boot value in - * the adjustment value. This value might change later via - * the sync mechanism. If that fails we still can yell - * about boot values not being consistent. - */ - cur->adjusted = bootval; - pr_info_once("TSC ADJUST: Boot CPU%u: %lld\n", cpu, bootval); + tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), + bootcpu); return false; } @@ -366,7 +384,7 @@ void check_tsc_sync_target(void) * Store, verify and sanitize the TSC adjust register. If * successful skip the test. */ - if (tsc_store_and_check_tsc_adjust()) { + if (tsc_store_and_check_tsc_adjust(false)) { atomic_inc(&skip_test); return; } @@ -429,8 +447,13 @@ retry: * that the warp is not longer detectable when the observed warp * value is used. In the worst case the adjustment needs to go * through a 3rd run for fine tuning. + * + * But we must make sure that the value doesn't become negative + * otherwise TSC deadline timer will create an interrupt storm. */ cur->adjusted += cur_max_warp; + if (cur->adjusted < 0) + cur->adjusted = 0; pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); From 16588f659257495212ac6b9beaf008d9b19e8165 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 18 Dec 2016 15:06:27 +0100 Subject: [PATCH 18/19] x86/tsc: Annotate printouts as firmware bug Make it more obvious that the BIOS is screwed up. 
Signed-off-by: Thomas Gleixner Cc: Roland Scheidegger Cc: Bruce Schlobohm Cc: Kevin Stanton Cc: Peter Zijlstra Cc: Borislav Petkov --- arch/x86/kernel/tsc_sync.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 9151f0ce6a42..1d8508fd15f7 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -76,7 +76,8 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * deadline timer creates an interrupt storm. Sigh! */ if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0)) { - pr_warn("TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); + pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, + bootval); wrmsrl(MSR_IA32_TSC_ADJUST, 0); bootval = 0; } @@ -141,7 +142,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) * package. */ if (bootval != ref->bootval) { - pr_warn("TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", + pr_warn(FW_BUG "TSC ADJUST differs: Reference CPU%u: %lld CPU%u: %lld\n", refcpu, ref->bootval, cpu, bootval); } /* From 8c9b9d87b855226a823b41a77a05f42324497603 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 18 Dec 2016 15:09:29 +0100 Subject: [PATCH 19/19] x86/tsc: Limit the adjust value further Adjust value 0x80000000 and other values larger than that render the TSC deadline timer disfunctional. We have not yet any information about this from Intel, but experimentation clearly proves that this is a 32/64 bit and sign extension issue. If adjust values larger than that are actually required, which might be the case for physical CPU hotplug, then we need to disable the deadline timer on the affected package/CPUs and use the local APIC timer instead. That requires some surgery in the APIC setup code, so we just limit the ADJUST register value into the known to work range for now and revisit this when Intel comes forth with proper information. Signed-off-by: Thomas Gleixner Cc: Roland Scheidegger Cc: Bruce Schlobohm Cc: Kevin Stanton Cc: Peter Zijlstra Cc: Borislav Petkov --- arch/x86/kernel/tsc_sync.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 1d8508fd15f7..d0db011051a5 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -73,9 +73,11 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, * the TSC is in sync with the already running cpus. * * But we always force positive ADJUST values. Otherwise the TSC - * deadline timer creates an interrupt storm. Sigh! + * deadline timer creates an interrupt storm. We also have to + * prevent values > 0x7FFFFFFF as those wreckage the timer as well. */ - if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0)) { + if ((bootcpu && bootval != 0) || (!bootcpu && bootval < 0) || + (bootval > 0x7FFFFFFF)) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); wrmsrl(MSR_IA32_TSC_ADJUST, 0); @@ -448,13 +450,22 @@ retry: * that the warp is not longer detectable when the observed warp * value is used. In the worst case the adjustment needs to go * through a 3rd run for fine tuning. - * - * But we must make sure that the value doesn't become negative - * otherwise TSC deadline timer will create an interrupt storm. */ cur->adjusted += cur_max_warp; + + /* + * TSC deadline timer stops working or creates an interrupt storm + * with adjust values < 0 and > x07ffffff. 
+ * + * To allow adjust values > 0x7FFFFFFF we need to disable the + * deadline timer and use the local APIC timer, but that requires + * more intrusive changes and we do not have any useful information + * from Intel about the underlying HW wreckage yet. + */ if (cur->adjusted < 0) cur->adjusted = 0; + if (cur->adjusted > 0x7FFFFFFF) + cur->adjusted = 0x7FFFFFFF; pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted);