p***@xen.org
2018-11-19 11:44:44 UTC
commit 9f7c1e765d04c277ee762b41bf7d4ae469f9b9fb
Author: Jan Beulich <***@suse.com>
AuthorDate: Mon Nov 5 11:13:09 2018 +0100
Commit: Jan Beulich <***@suse.com>
CommitDate: Mon Nov 5 11:13:09 2018 +0100
x86: deal with firmware setting bogus TSC_ADJUST values
The system Intel have handed me for AVX512 emulator work ("Gigabyte
Technology Co., Ltd. X299 AORUS Gaming 3 Pro/X299 AORUS Gaming 3
Pro-CF, BIOS F3 12/28/2017") would not come up under Xen - it hung in
the middle of Dom0 PCI initialization. As it turned out, Xen's time
management did not work because of the firmware setting (only) the boot
CPU's TSC_ADJUST MSR to a large negative value (on the order of -2^50).
Follow Linux (also shamelessly stealing their comments) in
- clearing the register for the boot CPU (we don't have a need for
exceptions here yet, as the only exception in Linux is a class of
systems Xen doesn't work on anyway as far as I'm aware),
- forcing non-negative values uniformly (commit 855615eee9 ["x86/tsc:
Remove the TSC_ADJUST clamp"] dropped this, but without this my
Haswell box won't boot anymore),
- syncing the registers within sockets.
Linux, prior to aforementioned commit, capped at 0x7fffffff as well, but as the
description there says this issue has been addressed with a microcode
update. Hence until someone runs into such a system without being able
to update its microcode, I think we should leave out that specific part.
In order to avoid making init_percpu_time() depend on running _before_
set_cpu_sibling_map() (and hence the booting CPU _not_ being accounted
in socket_cpumask[] yet), move that call slightly earlier in
start_secondary().
Signed-off-by: Jan Beulich <***@suse.com>
Acked-by: Andrew Cooper <***@citrix.com>
---
xen/arch/x86/smpboot.c | 3 +-
xen/arch/x86/time.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 43deb82e53..567cece748 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -381,6 +381,8 @@ void start_secondary(void *unused)
smp_callin();
+ set_cpu_sibling_map(cpu);
+
init_percpu_time();
setup_secondary_APIC_clock();
@@ -393,7 +395,6 @@ void start_secondary(void *unused)
/* This must be done before setting cpu_online_map */
spin_debug_enable();
- set_cpu_sibling_map(cpu);
notify_cpu_starting(cpu);
/*
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 03f792e7e5..24d4c2794b 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -88,6 +88,9 @@ static bool __read_mostly using_pit;
/* Boot timestamp, filled in head.S */
u64 __initdata boot_tsc_stamp;
+/* Per-socket TSC_ADJUST values, for secondary cores/threads to sync to. */
+static uint64_t *__read_mostly tsc_adjust;
+
/*
* 32-bit division of integer dividend and integer divisor yielding
* 32-bit fractional quotient.
@@ -1602,6 +1605,56 @@ void init_percpu_time(void)
/* Initial estimate for TSC rate. */
t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
+ if ( tsc_adjust )
+ {
+ unsigned int socket = cpu_to_socket(smp_processor_id());
+ int64_t adj;
+
+ /* For now we don't want to come here for the BSP. */
+ ASSERT(system_state >= SYS_STATE_smp_boot);
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, adj);
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different.
+ */
+ if ( cpumask_weight(socket_cpumask[socket]) == 1 )
+ {
+ /*
+ * On the boot CPU we just force the ADJUST value to 0 if it's non-
+ * zero (in early_time_init()). We don't do that on non-boot CPUs
+ * because physical hotplug should have set the ADJUST register to a
+ * value > 0, so the TSC is in sync with the already running CPUs.
+ *
+ * But we always force non-negative ADJUST values for now.
+ */
+ if ( adj < 0 )
+ {
+ printk(XENLOG_WARNING
+ "TSC ADJUST set to -%lx on CPU%u - clearing\n",
+ -adj, smp_processor_id());
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ adj = 0;
+ }
+ tsc_adjust[socket] = adj;
+ }
+ else if ( adj != tsc_adjust[socket] )
+ {
+ static bool __read_mostly warned;
+
+ if ( !warned )
+ {
+ warned = true;
+ printk(XENLOG_WARNING
+ "Differing TSC ADJUST values within socket(s) - fixing all\n");
+ }
+ wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust[socket]);
+ }
+ }
+
local_irq_save(flags);
now = read_platform_stime(NULL);
tsc = rdtsc_ordered();
@@ -1788,6 +1841,15 @@ int __init init_xen_time(void)
/* Finish platform timer initialization. */
try_platform_timer_tail(false);
+ /*
+ * Setup space to track per-socket TSC_ADJUST values. Don't fiddle with
+ * values if the TSC is not reported as invariant. Ignore allocation
+ * failure here - most systems won't need any adjustment anyway.
+ */
+ if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ boot_cpu_has(X86_FEATURE_ITSC) )
+ tsc_adjust = xzalloc_array(uint64_t, nr_sockets);
+
return 0;
}
@@ -1798,6 +1860,19 @@ void __init early_time_init(void)
struct cpu_time *t = &this_cpu(cpu_time);
u64 tmp;
+ if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ boot_cpu_has(X86_FEATURE_ITSC) )
+ {
+ rdmsrl(MSR_IA32_TSC_ADJUST, tmp);
+ if ( tmp )
+ {
+ printk(XENLOG_WARNING
+ "TSC ADJUST set to %lx on boot CPU - clearing\n", tmp);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ boot_tsc_stamp -= tmp;
+ }
+ }
+
preinit_pit();
tmp = init_platform_timer();
plt_tsc.frequency = tmp;
--
generated by git-patchbot for /home/xen/git/xen.git#master
Author: Jan Beulich <***@suse.com>
AuthorDate: Mon Nov 5 11:13:09 2018 +0100
Commit: Jan Beulich <***@suse.com>
CommitDate: Mon Nov 5 11:13:09 2018 +0100
x86: deal with firmware setting bogus TSC_ADJUST values
The system Intel have handed me for AVX512 emulator work ("Gigabyte
Technology Co., Ltd. X299 AORUS Gaming 3 Pro/X299 AORUS Gaming 3
Pro-CF, BIOS F3 12/28/2017") would not come up under Xen - it hung in
the middle of Dom0 PCI initialization. As it turned out, Xen's time
management did not work because of the firmware setting (only) the boot
CPU's TSC_ADJUST MSR to a large negative value (on the order of -2^50).
Follow Linux (also shamelessly stealing their comments) in
- clearing the register for the boot CPU (we don't have a need for
exceptions here yet, as the only exception in Linux is a class of
systems Xen doesn't work on anyway as far as I'm aware),
- forcing non-negative values uniformly (commit 855615eee9 ["x86/tsc:
Remove the TSC_ADJUST clamp"] dropped this, but without this my
Haswell box won't boot anymore),
- syncing the registers within sockets.
Linux, prior to aforementioned commit, capped at 0x7fffffff as well, but as the
description there says this issue has been addressed with a microcode
update. Hence until someone runs into such a system without being able
to update its microcode, I think we should leave out that specific part.
In order to avoid making init_percpu_time() depend on running _before_
set_cpu_sibling_map() (and hence the booting CPU _not_ being accounted
in socket_cpumask[] yet), move that call slightly earlier in
start_secondary().
Signed-off-by: Jan Beulich <***@suse.com>
Acked-by: Andrew Cooper <***@citrix.com>
---
xen/arch/x86/smpboot.c | 3 +-
xen/arch/x86/time.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 43deb82e53..567cece748 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -381,6 +381,8 @@ void start_secondary(void *unused)
smp_callin();
+ set_cpu_sibling_map(cpu);
+
init_percpu_time();
setup_secondary_APIC_clock();
@@ -393,7 +395,6 @@ void start_secondary(void *unused)
/* This must be done before setting cpu_online_map */
spin_debug_enable();
- set_cpu_sibling_map(cpu);
notify_cpu_starting(cpu);
/*
diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c
index 03f792e7e5..24d4c2794b 100644
--- a/xen/arch/x86/time.c
+++ b/xen/arch/x86/time.c
@@ -88,6 +88,9 @@ static bool __read_mostly using_pit;
/* Boot timestamp, filled in head.S */
u64 __initdata boot_tsc_stamp;
+/* Per-socket TSC_ADJUST values, for secondary cores/threads to sync to. */
+static uint64_t *__read_mostly tsc_adjust;
+
/*
* 32-bit division of integer dividend and integer divisor yielding
* 32-bit fractional quotient.
@@ -1602,6 +1605,56 @@ void init_percpu_time(void)
/* Initial estimate for TSC rate. */
t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
+ if ( tsc_adjust )
+ {
+ unsigned int socket = cpu_to_socket(smp_processor_id());
+ int64_t adj;
+
+ /* For now we don't want to come here for the BSP. */
+ ASSERT(system_state >= SYS_STATE_smp_boot);
+
+ rdmsrl(MSR_IA32_TSC_ADJUST, adj);
+
+ /*
+ * Check whether this CPU is the first in a package to come up. In
+ * this case do not check the boot value against another package
+ * because the new package might have been physically hotplugged,
+ * where TSC_ADJUST is expected to be different.
+ */
+ if ( cpumask_weight(socket_cpumask[socket]) == 1 )
+ {
+ /*
+ * On the boot CPU we just force the ADJUST value to 0 if it's non-
+ * zero (in early_time_init()). We don't do that on non-boot CPUs
+ * because physical hotplug should have set the ADJUST register to a
+ * value > 0, so the TSC is in sync with the already running CPUs.
+ *
+ * But we always force non-negative ADJUST values for now.
+ */
+ if ( adj < 0 )
+ {
+ printk(XENLOG_WARNING
+ "TSC ADJUST set to -%lx on CPU%u - clearing\n",
+ -adj, smp_processor_id());
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ adj = 0;
+ }
+ tsc_adjust[socket] = adj;
+ }
+ else if ( adj != tsc_adjust[socket] )
+ {
+ static bool __read_mostly warned;
+
+ if ( !warned )
+ {
+ warned = true;
+ printk(XENLOG_WARNING
+ "Differing TSC ADJUST values within socket(s) - fixing all\n");
+ }
+ wrmsrl(MSR_IA32_TSC_ADJUST, tsc_adjust[socket]);
+ }
+ }
+
local_irq_save(flags);
now = read_platform_stime(NULL);
tsc = rdtsc_ordered();
@@ -1788,6 +1841,15 @@ int __init init_xen_time(void)
/* Finish platform timer initialization. */
try_platform_timer_tail(false);
+ /*
+ * Setup space to track per-socket TSC_ADJUST values. Don't fiddle with
+ * values if the TSC is not reported as invariant. Ignore allocation
+ * failure here - most systems won't need any adjustment anyway.
+ */
+ if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ boot_cpu_has(X86_FEATURE_ITSC) )
+ tsc_adjust = xzalloc_array(uint64_t, nr_sockets);
+
return 0;
}
@@ -1798,6 +1860,19 @@ void __init early_time_init(void)
struct cpu_time *t = &this_cpu(cpu_time);
u64 tmp;
+ if ( boot_cpu_has(X86_FEATURE_TSC_ADJUST) &&
+ boot_cpu_has(X86_FEATURE_ITSC) )
+ {
+ rdmsrl(MSR_IA32_TSC_ADJUST, tmp);
+ if ( tmp )
+ {
+ printk(XENLOG_WARNING
+ "TSC ADJUST set to %lx on boot CPU - clearing\n", tmp);
+ wrmsrl(MSR_IA32_TSC_ADJUST, 0);
+ boot_tsc_stamp -= tmp;
+ }
+ }
+
preinit_pit();
tmp = init_platform_timer();
plt_tsc.frequency = tmp;
--
generated by git-patchbot for /home/xen/git/xen.git#master