x86/mwait-idle: disable IBRS during long idle
author Peter Zijlstra <peterz@infradead.org>
Thu, 13 Oct 2022 15:55:22 +0000 (17:55 +0200)
committer Jan Beulich <jbeulich@suse.com>
Thu, 13 Oct 2022 15:55:22 +0000 (17:55 +0200)
Having IBRS enabled while the SMT sibling is idle unnecessarily slows
down the running sibling. OTOH, disabling IBRS around idle takes two
MSR writes, which will increase the idle latency.

Therefore, only disable IBRS around deeper idle states. Shallow idle
states are bounded by the tick in duration, since NOHZ is not allowed
for them by virtue of their short target residency.

Only do this for mwait-driven idle, since that keeps interrupts disabled
across idle, which makes disabling IBRS vs IRQ-entry a non-issue.

Note: C6 is a random threshold, most importantly C1 probably shouldn't
disable IBRS, benchmarking needed.

Suggested-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Josh Poimboeuf <jpoimboe@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Origin: git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git bf5835bcdb96
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Roger Pau Monné <roger.pau@citrix.com>
Release-acked-by: Henry Wang <Henry.Wang@arm.com>
xen/arch/x86/cpu/mwait-idle.c
xen/include/xen/cpuidle.h

index 86c47a04c747257d3f38ea6eebe879b3928ff688..f5c83121a8b5572d37c9fbda735d5d6cb8a8bbf1 100644 (file)
@@ -140,6 +140,12 @@ static const struct cpuidle_state {
  */
 #define CPUIDLE_FLAG_TLB_FLUSHED       0x10000
 
+/*
+ * Disable IBRS across idle (when KERNEL_IBRS), is exclusive vs IRQ_ENABLE
+ * above.
+ */
+#define CPUIDLE_FLAG_IBRS              0x20000
+
 /*
  * MWAIT takes an 8-bit "hint" in EAX "suggesting"
  * the C-state (top nibble) and sub-state (bottom nibble)
@@ -530,31 +536,31 @@ static struct cpuidle_state __read_mostly skl_cstates[] = {
        },
        {
                .name = "C6",
-               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 85,
                .target_residency = 200,
        },
        {
                .name = "C7s",
-               .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 124,
                .target_residency = 800,
        },
        {
                .name = "C8",
-               .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 200,
                .target_residency = 800,
        },
        {
                .name = "C9",
-               .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 480,
                .target_residency = 5000,
        },
        {
                .name = "C10",
-               .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 890,
                .target_residency = 5000,
        },
@@ -576,7 +582,7 @@ static struct cpuidle_state __read_mostly skx_cstates[] = {
        },
        {
                .name = "C6",
-               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
+               .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS,
                .exit_latency = 133,
                .target_residency = 600,
        },
@@ -906,6 +912,7 @@ static const struct cpuidle_state snr_cstates[] = {
 static void cf_check mwait_idle(void)
 {
        unsigned int cpu = smp_processor_id();
+       struct cpu_info *info = get_cpu_info();
        struct acpi_processor_power *power = processor_powers[cpu];
        struct acpi_processor_cx *cx = NULL;
        unsigned int next_state;
@@ -932,8 +939,6 @@ static void cf_check mwait_idle(void)
                        pm_idle_save();
                else
                {
-                       struct cpu_info *info = get_cpu_info();
-
                        spec_ctrl_enter_idle(info);
                        safe_halt();
                        spec_ctrl_exit_idle(info);
@@ -960,6 +965,11 @@ static void cf_check mwait_idle(void)
        if ((cx->type >= 3) && errata_c6_workaround())
                cx = power->safe_state;
 
+       if (cx->ibrs_disable) {
+               ASSERT(!cx->irq_enable_early);
+               spec_ctrl_enter_idle(info);
+       }
+
 #if 0 /* XXX Can we/do we need to do something similar on Xen? */
        /*
         * leave_mm() to avoid costly and often unnecessary wakeups
@@ -991,6 +1001,10 @@ static void cf_check mwait_idle(void)
 
        /* Now back in C0. */
        update_idle_stats(power, cx, before, after);
+
+       if (cx->ibrs_disable)
+               spec_ctrl_exit_idle(info);
+
        local_irq_enable();
 
        TRACE_6D(TRC_PM_IDLE_EXIT, cx->type, after,
@@ -1603,6 +1617,8 @@ static int cf_check mwait_idle_cpu_init(
                    /* cstate_restore_tsc() needs to be a no-op */
                    boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                        cx->irq_enable_early = true;
+               if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_IBRS)
+                       cx->ibrs_disable = true;
 
                dev->count++;
        }
index bd24a31e126dde91a88e85e69a8dfcf113f3f7a6..521a8deb04c23bba99178d341d0cc05288b6382d 100644 (file)
@@ -42,7 +42,8 @@ struct acpi_processor_cx
     u8 idx;
     u8 type;         /* ACPI_STATE_Cn */
     u8 entry_method; /* ACPI_CSTATE_EM_xxx */
-    bool irq_enable_early;
+    bool irq_enable_early:1;
+    bool ibrs_disable:1;
     u32 address;
     u32 latency;
     u32 target_residency;