static atomic_t waiting_for_crash_ipi;
static unsigned int crashing_cpu;
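+/* Set once a pcpu has saved its crash state in do_nmi_crash(), so that the
+ * work is only attempted once. */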
+static DEFINE_PER_CPU_READ_MOSTLY(bool_t, crash_save_done);
-static int crash_nmi_callback(struct cpu_user_regs *regs, int cpu)
+/* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
+void __attribute__((noreturn)) do_nmi_crash(struct cpu_user_regs *regs)
{
- /* Don't do anything if this handler is invoked on crashing cpu.
- * Otherwise, system will completely hang. Crashing cpu can get
- * an NMI if system was initially booted with nmi_watchdog parameter.
+ int cpu = smp_processor_id();
+
+ /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
+ ASSERT(cpu != crashing_cpu);
+
+ /* Save crash information and shut down CPU. Attempt only once. */
+ if ( !this_cpu(crash_save_done) )
+ {
+ /* Disable the interrupt stack table for the MCE handler. This
+ * prevents race conditions between clearing MCIP and receiving a
+ * new MCE, during which the exception frame would be clobbered
+ * and the MCE handler would fall into an infinite loop. We are soon
+ * going to disable the NMI watchdog, so the loop would not be
+ * caught.
+ *
+ * We do not need to change the NMI IST, as the nmi_crash
+ * handler is immune to corrupt exception frames, by virtue of
+ * being designed never to return.
+ *
+ * This update is safe from a security point of view, as this
+ * pcpu is never going to try to sysret back to a PV vcpu.
+ */
+ set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
+
+ kexec_crash_save_cpu();
+ __stop_this_cpu();
+
+ this_cpu(crash_save_done) = 1;
+ atomic_dec(&waiting_for_crash_ipi);
+ }
+
+ /* Poor man's self_nmi(). __stop_this_cpu() has reverted the LAPIC
+ * back to its boot state, so we are unable to rely on the regular
+ * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
+ * (The likely scenario is that we have reverted from x2apic mode to
+ * xapic, at which point #GPFs will occur if we use the apic_*
+ * functions)
+ *
+ * The ICR and APIC ID of the LAPIC are still valid even during
+ * software disable (Intel SDM Vol 3, 10.4.7.2). As a result, we
+ * can deliberately queue up another NMI at the LAPIC which will not
+ * be delivered as the hardware NMI latch is currently in effect.
+ * This means that if NMIs become unlatched (e.g. following a
+ * non-fatal MCE), the LAPIC will force us back here rather than
+ * wandering back into regular Xen code.
*/
- if ( cpu == crashing_cpu )
- return 1;
- local_irq_disable();
+ switch ( current_local_apic_mode() )
+ {
+ u32 apic_id;
- kexec_crash_save_cpu();
+ case APIC_MODE_X2APIC:
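+ /* In x2APIC mode the ICR is a single 64bit MSR; the destination
+ * APIC ID lives in the upper 32 bits. */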
+ apic_id = apic_rdmsr(APIC_ID);
- __stop_this_cpu();
+ apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
+ | ((u64)apic_id << 32));
+ break;
+
+ case APIC_MODE_XAPIC:
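+ /* In xAPIC mode the ICR is split across two 32bit MMIO registers;
+ * wait for any earlier IPI to finish delivery, write the
+ * destination to ICR2, then the command to ICR. */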
+ apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));
- atomic_dec(&waiting_for_crash_ipi);
+ while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
+ cpu_relax();
+
+ apic_mem_write(APIC_ICR2, apic_id << 24);
+ apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
+ break;
+
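+ /* LAPIC is disabled or in an unrecognised state; we have no way to
+ * queue an NMI to ourselves, so just halt below. */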
+ default:
+ break;
+ }
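+ /* Spin here with NMIs latched; the queued NMI brings us straight
+ * back if anything unlatches them. */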
for ( ; ; )
halt();
-
- return 1;
}
static void nmi_shootdown_cpus(void)
{
unsigned long msecs;
+ int i, cpu = smp_processor_id();
local_irq_disable();
- crashing_cpu = smp_processor_id();
+ crashing_cpu = cpu;
local_irq_count(crashing_cpu) = 0;
atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1);
- /* Would it be better to replace the trap vector here? */
- set_nmi_callback(crash_nmi_callback);
+
+ /* Change NMI trap handlers. Non-crashing pcpus get nmi_crash which
+ * invokes do_nmi_crash (above), causing them to save their crash
+ * state and fall into a halt loop. The crashing pcpu gets the nop
+ * handler so that an NMI causes it to return to this function ASAP.
+ */
+ for ( i = 0; i < nr_cpu_ids; i++ )
+ {
+ if ( idt_tables[i] == NULL )
+ continue;
+
+ if ( i == cpu )
+ {
+ /*
+ * Disable the interrupt stack tables for this cpu's MCE and NMI
+ * handlers, and alter the NMI handler to have no operation.
+ * Disabling the stack tables prevents stack corruption race
+ * conditions, while changing the handler helps prevent cascading
+ * faults; we are certainly going to crash by this point.
+ *
+ * This update is safe from a security point of view, as this pcpu
+ * is never going to try to sysret back to a PV vcpu.
+ */
+ _set_gate_lower(&idt_tables[i][TRAP_nmi], 14, 0, &trap_nop);
+ set_ist(&idt_tables[i][TRAP_machine_check], IST_NONE);
+ }
+ else
+ {
+ /* Do not update stack table for other pcpus. */
+ _update_gate_addr_lower(&idt_tables[i][TRAP_nmi], &nmi_crash);
+ }
+ }
+
/* Ensure the new callback function is set before sending out the NMI. */
wmb();
.base = (unsigned long)(boot_cpu_gdt_table - FIRST_RESERVED_GDT_ENTRY),
.limit = LAST_RESERVED_GDT_BYTE
};
+ int i;
 /* We are about to permanently jump out of the Xen context into the kexec
  * purgatory code. We really don't want to still be servicing interrupts.
  */
local_irq_disable();
+ /* Now that regular interrupts are disabled, we need to reduce the
+ * impact of interrupts not disabled by 'cli'.
+ *
+ * The NMI handlers have already been set up by nmi_shootdown_cpus(). All
+ * pcpus other than us have the nmi_crash handler, while we have the nop
+ * handler.
+ *
+ * The MCE handlers touch extensive areas of Xen code and data. At this
+ * point, there is nothing we can usefully do, so set the nop handler.
+ */
+ for ( i = 0; i < nr_cpu_ids; i++ )
+ {
+ if ( idt_tables[i] == NULL )
+ continue;
+ _update_gate_addr_lower(&idt_tables[i][TRAP_machine_check], &trap_nop);
+ }
+
+ /* Explicitly enable NMIs on this CPU. Some crashdump kernels do
+ * not like running with NMIs disabled. */
+ enable_nmis();
+
/*
* compat_machine_kexec() returns to idle pagetables, which requires us
* to be running on a static GDT mapping (idle pagetables have no GDT
u64 a, b;
} idt_entry_t;
+/* Write the lower 64 bits of an IDT Entry. This relies on the upper 32
+ * bits of the address not changing, which is a safe assumption as all
+ * functions we are likely to load will live inside the 1GB
+ * code/data/bss address range.
+ *
+ * Ideally, we would use cmpxchg16b, but this is not supported on some
+ * old 64bit-capable AMD processors, and has no safe equivalent.
+ */
+static inline void _write_gate_lower(volatile idt_entry_t *gate,
+ const idt_entry_t *new)
+{
+ ASSERT(gate->b == new->b);
+ gate->a = new->a;
+}
+
#define _set_gate(gate_addr,type,dpl,addr) \
do { \
(gate_addr)->a = 0; \
(1UL << 47); \
} while (0)
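+/* Write a new type, dpl and handler address into the lower 64 bits of an
+ * IDT entry, preserving the existing upper half. */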
+static inline void _set_gate_lower(idt_entry_t *gate, unsigned long type,
+ unsigned long dpl, void *addr)
+{
+ idt_entry_t idte;
+ idte.b = gate->b;
+ idte.a =
+ (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+ ((unsigned long)(dpl) << 45) |
+ ((unsigned long)(type) << 40) |
+ ((unsigned long)(addr) & 0xFFFFUL) |
+ ((unsigned long)__HYPERVISOR_CS64 << 16) |
+ (1UL << 47);
+ _write_gate_lower(gate, &idte);
+}
+
+/* Update the lower half handler of an IDT Entry, without changing any
+ * other configuration. */
+static inline void _update_gate_addr_lower(idt_entry_t *gate, void *addr)
+{
+ idt_entry_t idte;
+ idte.a = gate->a;
+
+ idte.b = ((unsigned long)(addr) >> 32);
+ idte.a &= 0x0000FFFFFFFF0000ULL;
+ idte.a |= (((unsigned long)(addr) & 0xFFFF0000UL) << 32) |
+ ((unsigned long)(addr) & 0xFFFFUL);
+
+ _write_gate_lower(gate, &idte);
+}
+
#define _set_tssldt_desc(desc,addr,limit,type) \
do { \
(desc)[0].b = (desc)[1].b = 0; \