From: Keir Fraser
Date: Mon, 22 Dec 2008 08:12:33 +0000 (+0000)
Subject: Enable CMCI for Intel CPUs
X-Git-Tag: archive/raspbian/4.8.0-1+rpi1~1^2~14026^2~3
X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=8c9a4d8e4c8688f0306eb043e883ed5a589cbbe3;p=xen.git

Enable CMCI for Intel CPUs

Signed-off-by: Yunhong Jiang
Signed-off-by: Liping Ke
---

diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c
index 55a25bce39..7376c66e7c 100644
--- a/xen/arch/x86/apic.c
+++ b/xen/arch/x86/apic.c
@@ -99,8 +99,11 @@ void __init apic_intr_init(void)
     /* Performance Counters Interrupt */
     set_intr_gate(PMU_APIC_VECTOR, pmu_apic_interrupt);
 
-    /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+    /* CMCI Correctable Machine Check Interrupt */
+    set_intr_gate(CMCI_APIC_VECTOR, cmci_interrupt);
+
+    /* thermal monitor LVT interrupt, for P4 and later Intel CPUs */
+#ifdef CONFIG_X86_MCE_THERMAL
     set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
 }
@@ -172,12 +175,17 @@ void clear_local_APIC(void)
     }
 
     /* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_MCE_THERMAL
     if (maxlvt >= 5) {
         v = apic_read(APIC_LVTTHMR);
         apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
     }
 #endif
+
+    if (maxlvt >= 6) {
+        v = apic_read(APIC_CMCI);
+        apic_write_around(APIC_CMCI, v | APIC_LVT_MASKED);
+    }
     /*
      * Clean APIC state for other OSs:
      */
@@ -189,10 +197,13 @@ void clear_local_APIC(void)
     if (maxlvt >= 4)
         apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#ifdef CONFIG_X86_MCE_THERMAL
     if (maxlvt >= 5)
         apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
 #endif
+    if (maxlvt >= 6)
+        apic_write_around(APIC_CMCI, APIC_LVT_MASKED);
+
     v = GET_APIC_VERSION(apic_read(APIC_LVR));
     if (APIC_INTEGRATED(v)) {  /* !82489DX */
         if (maxlvt > 3)        /* Due to Pentium errata 3AP and 11AP.
*/ @@ -597,6 +608,7 @@ static struct { unsigned int apic_spiv; unsigned int apic_lvtt; unsigned int apic_lvtpc; + unsigned int apic_lvtcmci; unsigned int apic_lvt0; unsigned int apic_lvt1; unsigned int apic_lvterr; @@ -608,7 +620,7 @@ static struct { int lapic_suspend(void) { unsigned long flags; - + int maxlvt = get_maxlvt(); if (!apic_pm_state.active) return 0; @@ -620,6 +632,11 @@ int lapic_suspend(void) apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + + if (maxlvt >= 6) { + apic_pm_state.apic_lvtcmci = apic_read(APIC_CMCI); + } + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); @@ -637,6 +654,7 @@ int lapic_resume(void) { unsigned int l, h; unsigned long flags; + int maxlvt = get_maxlvt(); if (!apic_pm_state.active) return 0; @@ -669,6 +687,11 @@ int lapic_resume(void) apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); + + if (maxlvt >= 6) { + apic_write(APIC_CMCI, apic_pm_state.apic_lvtcmci); + } + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); diff --git a/xen/arch/x86/cpu/mcheck/Makefile b/xen/arch/x86/cpu/mcheck/Makefile index 3ecc791402..15fed6eb0b 100644 --- a/xen/arch/x86/cpu/mcheck/Makefile +++ b/xen/arch/x86/cpu/mcheck/Makefile @@ -3,8 +3,7 @@ obj-y += k7.o obj-y += amd_k8.o obj-y += amd_f10.o obj-y += mce.o +obj-y += mce_intel.o obj-y += non-fatal.o -obj-y += p4.o obj-$(x86_32) += p5.o -obj-$(x86_32) += p6.o obj-$(x86_32) += winchip.o diff --git a/xen/arch/x86/cpu/mcheck/k7.c b/xen/arch/x86/cpu/mcheck/k7.c index 045dd433a2..aedd0a0e1f 100644 --- a/xen/arch/x86/cpu/mcheck/k7.c +++ b/xen/arch/x86/cpu/mcheck/k7.c @@ -14,6 +14,7 @@ #include #include "mce.h" +#include "x86_mca.h" /* Machine Check Handler For AMD Athlon/Duron */ static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_code) diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c index d4c6f05751..2331f5d670 100644 --- a/xen/arch/x86/cpu/mcheck/mce.c +++ b/xen/arch/x86/cpu/mcheck/mce.c @@ -27,7 +27,7 @@ EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ * to physical cpus present in the machine. * The more physical cpus are available, the more entries you need. 
*/ -#define MAX_MCINFO 10 +#define MAX_MCINFO 20 struct mc_machine_notify { struct mc_info mc; @@ -110,6 +110,22 @@ static void amd_mcheck_init(struct cpuinfo_x86 *ci) } } +/*check the existence of Machine Check*/ +int mce_available(struct cpuinfo_x86 *c) +{ + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +/*Make sure there are no machine check on offlined or suspended CPUs*/ +void mce_disable_cpu(void) +{ + if (!mce_available(¤t_cpu_data) || mce_disabled == 1) + return; + printk(KERN_DEBUG "MCE: disable mce on CPU%d\n", smp_processor_id()); + clear_in_cr4(X86_CR4_MCE); +} + + /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { @@ -135,11 +151,13 @@ void mcheck_init(struct cpuinfo_x86 *c) #ifndef CONFIG_X86_64 if (c->x86==5) intel_p5_mcheck_init(c); - if (c->x86==6) - intel_p6_mcheck_init(c); #endif - if (c->x86==15) - intel_p4_mcheck_init(c); + /*If it is P6 or P4 family, including CORE 2 DUO series*/ + if (c->x86 == 6 || c->x86==15) + { + printk(KERN_DEBUG "MCE: Intel newly family MC Init\n"); + intel_mcheck_init(c); + } break; #ifndef CONFIG_X86_64 @@ -413,7 +431,7 @@ void x86_mcinfo_dump(struct mc_info *mi) if (mic == NULL) return; if (mic->type != MC_TYPE_BANK) - continue; + goto next; mc_bank = (struct mcinfo_bank *)mic; @@ -426,6 +444,7 @@ void x86_mcinfo_dump(struct mc_info *mi) printk(" at %16"PRIx64, mc_bank->mc_addr); printk("\n"); +next: mic = x86_mcinfo_next(mic); /* next entry */ if ((mic == NULL) || (mic->size == 0)) break; diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h index 8a48ef5fd1..d588a6dc05 100644 --- a/xen/arch/x86/cpu/mcheck/mce.h +++ b/xen/arch/x86/cpu/mcheck/mce.h @@ -1,14 +1,22 @@ #include +#include #include +#include +#include + /* Init functions */ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c); void amd_k7_mcheck_init(struct cpuinfo_x86 *c); void amd_k8_mcheck_init(struct cpuinfo_x86 *c); void amd_f10_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); + + +void intel_mcheck_timer(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +void intel_mcheck_init(struct cpuinfo_x86 *c); +void mce_intel_feature_init(struct cpuinfo_x86 *c); + void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Function pointer used in the handlers to collect additional information @@ -19,6 +27,7 @@ extern int (*mc_callback_bank_extended)(struct mc_info *mi, uint16_t bank, uint64_t status); +int mce_available(struct cpuinfo_x86 *c); /* Helper functions used for collecting error telemetry */ struct mc_info *x86_mcinfo_getptr(void); void x86_mcinfo_clear(struct mc_info *mi); @@ -26,6 +35,3 @@ int x86_mcinfo_add(struct mc_info *mi, void *mcinfo); void x86_mcinfo_dump(struct mc_info *mi); void mc_panic(char *s); -/* Global variables */ -extern int mce_disabled; -extern unsigned int nr_mce_banks; diff --git a/xen/arch/x86/cpu/mcheck/mce_intel.c b/xen/arch/x86/cpu/mcheck/mce_intel.c new file mode 100644 index 0000000000..c053f6b0ee --- /dev/null +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c @@ -0,0 +1,681 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mce.h" +#include "x86_mca.h" + +DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned); + +static int nr_intel_ext_msrs = 0; +static int cmci_support = 0; +extern int firstbank; + +#ifdef CONFIG_X86_MCE_THERMAL +static void unexpected_thermal_interrupt(struct cpu_user_regs *regs) +{ + printk(KERN_ERR 
"Thermal: CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition interrupt handler */ +static void intel_thermal_interrupt(struct cpu_user_regs *regs) +{ + u32 l, h; + unsigned int cpu = smp_processor_id(); + static s_time_t next[NR_CPUS]; + + ack_APIC_irq(); + if (NOW() < next[cpu]) + return; + + next[cpu] = NOW() + MILLISECS(5000); + rdmsr(MSR_IA32_THERM_STATUS, l, h); + if (l & 0x1) { + printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); + printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", + cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + } +} + +/* Thermal interrupt handler for this CPU setup */ +static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) + = unexpected_thermal_interrupt; + +fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs) +{ + irq_enter(); + vendor_thermal_interrupt(regs); + irq_exit(); +} + +/* P4/Xeon Thermal regulation detect and init */ +static void intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + int tm2 = 0; + unsigned int cpu = smp_processor_id(); + + /* Thermal monitoring */ + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; /* -ENODEV */ + + /* Clock modulation */ + if (!cpu_has(c, X86_FEATURE_ACC)) + return; /* -ENODEV */ + + /* first check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already -zwanem. + */ + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1<<3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu); + return; /* -EBUSY */ + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + /* check whether a vector already exists, temporarily masked? */ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ + } + + /* The temperature transition interrupt handler setup */ + h = THERMAL_APIC_VECTOR; /* our delivery vector */ + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + apic_write_around(APIC_LVTTHMR, h); + + rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + + /* ok we're good to go... */ + vendor_thermal_interrupt = intel_thermal_interrupt; + + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); + + l = apic_read (APIC_LVTTHMR); + apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? 
"TM2" : "TM1"); + return; +} +#endif /* CONFIG_X86_MCE_THERMAL */ + +static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext) +{ + if (nr_intel_ext_msrs == 0) + return; + + /*this function will called when CAP(9).MCG_EXT_P = 1*/ + memset(mc_ext, 0, sizeof(struct mcinfo_extended)); + mc_ext->common.type = MC_TYPE_EXTENDED; + mc_ext->common.size = sizeof(mc_ext); + mc_ext->mc_msrs = 10; + + mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX; + rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value); + mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX; + rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value); + mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX; + rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value); + + mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX; + rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value); + mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI; + rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value); + mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI; + rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value); + + mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP; + rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value); + mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP; + rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value); + mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS; + rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value); + mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP; + rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value); +} + +/* machine_check_poll might be called by following types: + * 1. called when do mcheck_init. + * 2. called in cmci interrupt handler + * 3. called in polling handler + * It will generate a new mc_info item if found CE/UC errors. DOM0 is the + * consumer. +*/ +static int machine_check_poll(struct mc_info *mi, int calltype) +{ + int exceptions = (read_cr4() & X86_CR4_MCE); + int i, nr_unit = 0, uc = 0, pcc = 0; + uint64_t status, addr; + struct mcinfo_global mcg; + struct mcinfo_extended mce; + unsigned int cpu; + struct domain *d; + + cpu = smp_processor_id(); + + if (!mi) { + printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n"); + return 0; + } + x86_mcinfo_clear(mi); + + memset(&mcg, 0, sizeof(mcg)); + mcg.common.type = MC_TYPE_GLOBAL; + mcg.common.size = sizeof(mcg); + /*If called from cpu-reset check, don't need to fill them. + *If called from cmci context, we'll try to fill domid by memory addr + */ + mcg.mc_domid = -1; + mcg.mc_vcpuid = -1; + if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET) + mcg.mc_flags = MC_FLAG_POLLED; + else if (calltype == MC_FLAG_CMCI) + mcg.mc_flags = MC_FLAG_CMCI; + mcg.mc_socketid = phys_proc_id[cpu]; + mcg.mc_coreid = cpu_core_id[cpu]; + mcg.mc_apicid = cpu_physical_id(cpu); + mcg.mc_core_threadid = mcg.mc_apicid & ( 1 << (smp_num_siblings - 1)); + rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus); + + for ( i = 0; i < nr_mce_banks; i++ ) { + struct mcinfo_bank mcb; + /*For CMCI, only owners checks the owned MSRs*/ + if ( !test_bit(i, __get_cpu_var(mce_banks_owned)) && + (calltype & MC_FLAG_CMCI) ) + continue; + rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status); + + if (! (status & MCi_STATUS_VAL) ) + continue; + /* + * Uncorrected events are handled by the exception + * handler when it is enabled. But when the exception + * is disabled such as when mcheck_init, log everything. 
+ */ + if ((status & MCi_STATUS_UC) && exceptions) + continue; + + if (status & MCi_STATUS_UC) + uc = 1; + if (status & MCi_STATUS_PCC) + pcc = 1; + + memset(&mcb, 0, sizeof(mcb)); + mcb.common.type = MC_TYPE_BANK; + mcb.common.size = sizeof(mcb); + mcb.mc_bank = i; + mcb.mc_status = status; + if (status & MCi_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc); + if (status & MCi_STATUS_ADDRV) { + rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr); + d = maddr_get_owner(addr); + if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) ) + mcb.mc_domid = d->domain_id; + } + if (cmci_support) + rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2); + if (calltype == MC_FLAG_CMCI) + rdtscll(mcb.mc_tsc); + x86_mcinfo_add(mi, &mcb); + nr_unit++; + add_taint(TAINT_MACHINE_CHECK); + /*Clear state for this bank */ + wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0); + printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%lx]\n", + i, cpu, status); + printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], " + "thread[%d]\n", cpu, mcg.mc_socketid, + mcg.mc_coreid, mcg.mc_apicid, mcg.mc_core_threadid); + + } + /*if pcc = 1, uc must be 1*/ + if (pcc) + mcg.mc_flags |= MC_FLAG_UNCORRECTABLE; + else if (uc) + mcg.mc_flags |= MC_FLAG_RECOVERABLE; + else /*correctable*/ + mcg.mc_flags |= MC_FLAG_CORRECTABLE; + + if (nr_unit && nr_intel_ext_msrs && + (mcg.mc_gstatus & MCG_STATUS_EIPV)) { + intel_get_extended_msrs(&mce); + x86_mcinfo_add(mi, &mce); + } + if (nr_unit) + x86_mcinfo_add(mi, &mcg); + /*Clear global state*/ + return nr_unit; +} + +static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code) +{ + /* MACHINE CHECK Error handler will be sent in another patch, + * simply copy old solutions here. This code will be replaced + * by upcoming machine check patches + */ + + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? */ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + for (i=0; ivcpu[0], VIRQ_MCA)) + send_guest_global_virq(dom0, VIRQ_MCA); + } + irq_exit(); +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + +#ifdef CONFIG_X86_MCE_THERMAL + intel_init_thermal(c); +#endif + intel_init_cmci(c); +} + +static void mce_cap_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + rdmsr (MSR_IA32_MCG_CAP, l, h); + if ((l & MCG_CMCI_P) && cpu_has_apic) + cmci_support = 1; + + nr_mce_banks = l & 0xff; + if (nr_mce_banks > MAX_NR_BANKS) + printk(KERN_WARNING "MCE: exceed max mce banks\n"); + if (l & MCG_EXT_P) + { + nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff; + printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n", + smp_processor_id(), nr_intel_ext_msrs); + } + /* for most of p6 family, bank 0 is an alias bios MSR. + * But after model>1a, bank 0 is available*/ + if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL + && c->x86_model < 0x1A) + firstbank = 1; + else + firstbank = 0; +} + +static void mce_init(void) +{ + u32 l, h; + int i, nr_unit; + struct mc_info *mi = x86_mcinfo_getptr(); + clear_in_cr4(X86_CR4_MCE); + /* log the machine checks left over from the previous reset. 
+ * This also clears all registers*/ + + nr_unit = machine_check_poll(mi, MC_FLAG_RESET); + /*in the boot up stage, not expect inject to DOM0, but go print out + */ + if (nr_unit > 0) + x86_mcinfo_dump(mi); + + set_in_cr4(X86_CR4_MCE); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & MCG_CTL_P) /* Control register present ? */ + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = firstbank; i < nr_mce_banks; i++) + { + /*Some banks are shared across cores, use MCi_CTRL to judge whether + * this bank has been initialized by other cores already.*/ + rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h); + if (!l & !h) + { + /*if ctl is 0, this bank is never initialized*/ + printk(KERN_DEBUG "mce_init: init bank%d\n", i); + wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0); + } + } + if (firstbank) /*if cmci enabled, firstbank = 0*/ + wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); +} + +/*p4/p6 faimily has similar MCA initialization process*/ +void intel_mcheck_init(struct cpuinfo_x86 *c) +{ + + mce_cap_init(c); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); + /* machine check is available */ + machine_check_vector = intel_machine_check; + mce_init(); + mce_intel_feature_init(c); + mce_set_owner(); +} + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll faster. When the poller finds no more + * errors, poll slower +*/ +static struct timer mce_timer; + +#define MCE_PERIOD 4000 +#define MCE_MIN 2000 +#define MCE_MAX 32000 + +static u64 period = MCE_PERIOD; +static int adjust = 0; + +static void mce_intel_checkregs(void *info) +{ + int nr_unit; + struct mc_info *mi = x86_mcinfo_getptr(); + + if( !mce_available(¤t_cpu_data)) + return; + nr_unit = machine_check_poll(mi, MC_FLAG_POLLED); + if (nr_unit) + { + x86_mcinfo_dump(mi); + adjust++; + if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) + send_guest_global_virq(dom0, VIRQ_MCA); + } +} + +static void mce_intel_work_fn(void *data) +{ + on_each_cpu(mce_intel_checkregs, data, 1, 1); + if (adjust) { + period = period / (adjust + 1); + printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval to %ld", + period); + } + else { + period *= 2; + } + if (period > MCE_MAX) + period = MCE_MAX; + if (period < MCE_MIN) + period = MCE_MIN; + set_timer(&mce_timer, NOW() + MILLISECS(period)); + adjust = 0; +} + +void intel_mcheck_timer(struct cpuinfo_x86 *c) +{ + printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n"); + init_timer(&mce_timer, mce_intel_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD)); +} + diff --git a/xen/arch/x86/cpu/mcheck/non-fatal.c b/xen/arch/x86/cpu/mcheck/non-fatal.c index 4984eed757..35982a461b 100644 --- a/xen/arch/x86/cpu/mcheck/non-fatal.c +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c @@ -19,8 +19,8 @@ #include #include "mce.h" - -static int firstbank; +#include "x86_mca.h" +int firstbank = 0; static struct timer mce_timer; #define MCE_PERIOD MILLISECS(15000) @@ -61,13 +61,8 @@ static int __init init_nonfatal_mce_checker(void) struct cpuinfo_x86 *c = &boot_cpu_data; /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) - return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) + if (!mce_available(c)) return -ENODEV; - /* * Check for non-fatal errors every MCE_RATE s */ @@ -85,12 +80,20 @@ static int __init init_nonfatal_mce_checker(void) break; case X86_VENDOR_INTEL: - init_timer(&mce_timer, mce_work_fn, NULL, 0); - 
set_timer(&mce_timer, NOW() + MCE_PERIOD); + /* p5 family is different. P4/P6 and latest CPUs shares the + * same polling methods + */ + if ( c->x86 != 5 ) + { + /* some CPUs or banks don't support cmci, we need to + * enable this feature anyway + */ + intel_mcheck_timer(c); + } break; } - printk(KERN_INFO "MCA: Machine check polling timer started.\n"); + printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n"); return 0; } __initcall(init_nonfatal_mce_checker); diff --git a/xen/arch/x86/cpu/mcheck/p4.c b/xen/arch/x86/cpu/mcheck/p4.c deleted file mode 100644 index bb758e8e37..0000000000 --- a/xen/arch/x86/cpu/mcheck/p4.c +++ /dev/null @@ -1,270 +0,0 @@ -/* - * P4 specific Machine Check Exception Reporting - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "mce.h" - -/* as supported by the P4/Xeon family */ -struct intel_mce_extended_msrs { - u32 eax; - u32 ebx; - u32 ecx; - u32 edx; - u32 esi; - u32 edi; - u32 ebp; - u32 esp; - u32 eflags; - u32 eip; - /* u32 *reserved[]; */ -}; - -static int mce_num_extended_msrs = 0; - - -#ifdef CONFIG_X86_MCE_P4THERMAL -static void unexpected_thermal_interrupt(struct cpu_user_regs *regs) -{ - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", - smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); -} - -/* P4/Xeon Thermal transition interrupt handler */ -static void intel_thermal_interrupt(struct cpu_user_regs *regs) -{ - u32 l, h; - unsigned int cpu = smp_processor_id(); - static s_time_t next[NR_CPUS]; - - ack_APIC_irq(); - - if (NOW() < next[cpu]) - return; - - next[cpu] = NOW() + MILLISECS(5000); - rdmsr(MSR_IA32_THERM_STATUS, l, h); - if (l & 0x1) { - printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); - printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", - cpu); - add_taint(TAINT_MACHINE_CHECK); - } else { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); - } -} - -/* Thermal interrupt handler for this CPU setup */ -static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) = unexpected_thermal_interrupt; - -fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs) -{ - irq_enter(); - vendor_thermal_interrupt(regs); - irq_exit(); -} - -/* P4/Xeon Thermal regulation detect and init */ -static void intel_init_thermal(struct cpuinfo_x86 *c) -{ - u32 l, h; - unsigned int cpu = smp_processor_id(); - - /* Thermal monitoring */ - if (!cpu_has(c, X86_FEATURE_ACPI)) - return; /* -ENODEV */ - - /* Clock modulation */ - if (!cpu_has(c, X86_FEATURE_ACC)) - return; /* -ENODEV */ - - /* first check if its enabled already, in which case there might - * be some SMM goo which handles it, so we can't even put a handler - * since it might be delivered via SMI already -zwanem. - */ - rdmsr (MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); - if ((l & (1<<3)) && (h & APIC_DM_SMI)) { - printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", - cpu); - return; /* -EBUSY */ - } - - /* check whether a vector already exists, temporarily masked? 
*/ - if (h & APIC_VECTOR_MASK) { - printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already " - "installed\n", - cpu, (h & APIC_VECTOR_MASK)); - return; /* -EBUSY */ - } - - /* The temperature transition interrupt handler setup */ - h = THERMAL_APIC_VECTOR; /* our delivery vector */ - h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); - - rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); - - /* ok we're good to go... */ - vendor_thermal_interrupt = intel_thermal_interrupt; - - rdmsr (MSR_IA32_MISC_ENABLE, l, h); - wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); - - l = apic_read (APIC_LVTTHMR); - apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk (KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); - return; -} -#endif /* CONFIG_X86_MCE_P4THERMAL */ - - -/* P4/Xeon Extended MCE MSR retrieval, return 0 if unsupported */ -static inline int intel_get_extended_msrs(struct intel_mce_extended_msrs *r) -{ - u32 h; - - if (mce_num_extended_msrs == 0) - goto done; - - rdmsr (MSR_IA32_MCG_EAX, r->eax, h); - rdmsr (MSR_IA32_MCG_EBX, r->ebx, h); - rdmsr (MSR_IA32_MCG_ECX, r->ecx, h); - rdmsr (MSR_IA32_MCG_EDX, r->edx, h); - rdmsr (MSR_IA32_MCG_ESI, r->esi, h); - rdmsr (MSR_IA32_MCG_EDI, r->edi, h); - rdmsr (MSR_IA32_MCG_EBP, r->ebp, h); - rdmsr (MSR_IA32_MCG_ESP, r->esp, h); - rdmsr (MSR_IA32_MCG_EFLAGS, r->eflags, h); - rdmsr (MSR_IA32_MCG_EIP, r->eip, h); - - /* can we rely on kmalloc to do a dynamic - * allocation for the reserved registers? - */ -done: - return mce_num_extended_msrs; -} - -static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code) -{ - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - struct intel_mce_extended_msrs dbg; - - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? */ - recover=0; - - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - if (intel_get_extended_msrs(&dbg)) { - printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n", - smp_processor_id(), dbg.eip, dbg.eflags); - printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n", - dbg.eax, dbg.ebx, dbg.ecx, dbg.edx); - printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n", - dbg.esi, dbg.edi, dbg.ebp, dbg.esp); - } - - for (i=0; i> 16) & 0xff; - printk (KERN_INFO "CPU%d: Intel P4/Xeon Extended MCE MSRs (%d)" - " available\n", - smp_processor_id(), mce_num_extended_msrs); - -#ifdef CONFIG_X86_MCE_P4THERMAL - /* Check for P4/Xeon Thermal monitor */ - intel_init_thermal(c); -#endif - } -} diff --git a/xen/arch/x86/cpu/mcheck/p6.c b/xen/arch/x86/cpu/mcheck/p6.c deleted file mode 100644 index a2f631ed27..0000000000 --- a/xen/arch/x86/cpu/mcheck/p6.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * P6 specific Machine Check Exception Reporting - * (C) Copyright 2002 Alan Cox - */ - -#include -#include -#include -#include - -#include -#include -#include - -#include "mce.h" - -/* Machine Check Handler For PII/PIII */ -static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code) -{ - int recover=1; - u32 alow, ahigh, high, low; - u32 mcgstl, mcgsth; - int i; - - rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); - if (mcgstl & (1<<0)) /* Recoverable ? 
*/ - recover=0; - - printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", - smp_processor_id(), mcgsth, mcgstl); - - for (i=0; i>18)&0x3) #define SET_APIC_TIMER_BASE(x) (((x)<<18)) diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h index aa9b234370..b490887a64 100644 --- a/xen/include/asm-x86/config.h +++ b/xen/include/asm-x86/config.h @@ -22,7 +22,7 @@ #define CONFIG_X86_IO_APIC 1 #define CONFIG_X86_PM_TIMER 1 #define CONFIG_HPET_TIMER 1 -#define CONFIG_X86_MCE_P4THERMAL 1 +#define CONFIG_X86_MCE_THERMAL 1 #define CONFIG_NUMA 1 #define CONFIG_DISCONTIGMEM 1 #define CONFIG_NUMA_EMU 1 diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h index 920ac7f85e..f0f4bfda43 100644 --- a/xen/include/asm-x86/irq.h +++ b/xen/include/asm-x86/irq.h @@ -33,6 +33,7 @@ fastcall void error_interrupt(void); fastcall void pmu_apic_interrupt(void); fastcall void spurious_interrupt(void); fastcall void thermal_interrupt(void); +fastcall void cmci_interrupt(void); void disable_8259A_irq(unsigned int irq); void enable_8259A_irq(unsigned int irq); diff --git a/xen/include/asm-x86/mach-default/irq_vectors.h b/xen/include/asm-x86/mach-default/irq_vectors.h index 057b2a35b8..30c3f89daf 100644 --- a/xen/include/asm-x86/mach-default/irq_vectors.h +++ b/xen/include/asm-x86/mach-default/irq_vectors.h @@ -10,13 +10,13 @@ #define THERMAL_APIC_VECTOR 0xfa #define LOCAL_TIMER_VECTOR 0xf9 #define PMU_APIC_VECTOR 0xf8 - +#define CMCI_APIC_VECTOR 0xf7 /* * High-priority dynamically-allocated vectors. For interrupts that * must be higher priority than any guest-bound interrupt. */ #define FIRST_HIPRIORITY_VECTOR 0xf0 -#define LAST_HIPRIORITY_VECTOR 0xf7 +#define LAST_HIPRIORITY_VECTOR 0xf6 /* Legacy PIC uses vectors 0xe0-0xef. */ #define FIRST_LEGACY_VECTOR 0xe0 diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h index d3fc446271..4f91ca152d 100644 --- a/xen/include/asm-x86/msr-index.h +++ b/xen/include/asm-x86/msr-index.h @@ -92,8 +92,10 @@ #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MC0_CTL2 0x00000280 +#define CMCI_EN (1UL<<30) +#define CMCI_THRESHOLD_MASK 0x7FFF -#define MSR_IA32_MC1_CTL 0x00000404 #define MSR_IA32_MC1_STATUS 0x00000405 #define MSR_IA32_MC1_ADDR 0x00000406 #define MSR_IA32_MC1_MISC 0x00000407 diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h index 2078d441ec..e48526ae95 100644 --- a/xen/include/asm-x86/smp.h +++ b/xen/include/asm-x86/smp.h @@ -101,7 +101,7 @@ static __inline int logical_smp_processor_id(void) #endif -extern int __cpu_disable(void); +extern int __cpu_disable(int down_cpu); extern void __cpu_die(unsigned int cpu); #endif /* !__ASSEMBLY__ */ diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h index 103d41fd3d..60ae871947 100644 --- a/xen/include/public/arch-x86/xen-mca.h +++ b/xen/include/public/arch-x86/xen-mca.h @@ -106,7 +106,10 @@ struct mcinfo_common { #define MC_FLAG_CORRECTABLE (1 << 0) #define MC_FLAG_UNCORRECTABLE (1 << 1) - +#define MC_FLAG_RECOVERABLE (1 << 2) +#define MC_FLAG_POLLED (1 << 3) +#define MC_FLAG_RESET (1 << 4) +#define MC_FLAG_CMCI (1 << 5) /* contains global x86 mc information */ struct mcinfo_global { struct mcinfo_common common; @@ -115,6 +118,7 @@ struct mcinfo_global { uint16_t mc_domid; uint32_t mc_socketid; /* physical socket of the physical core */ uint16_t mc_coreid; /* physical impacted core */ + uint8_t mc_apicid; uint16_t mc_core_threadid; /* 
core thread of physical core */ uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ uint64_t mc_gstatus; /* global status */ @@ -132,6 +136,8 @@ struct mcinfo_bank { uint64_t mc_addr; /* bank address, only valid * if addr bit is set in mc_status */ uint64_t mc_misc; + uint64_t mc_ctrl2; + uint64_t mc_tsc; }; @@ -150,7 +156,12 @@ struct mcinfo_extended { * multiple times. */ uint32_t mc_msrs; /* Number of msr with valid values. */ - struct mcinfo_msr mc_msr[5]; + /* + * Currently Intel extended MSR (32/64) including all gp registers + * and E(R)DI, E(R)BP, E(R)SP, E(R)FLAGS, E(R)IP, E(R)MISC, only 10 + * of them might be useful. So expend this array to 10. + */ + struct mcinfo_msr mc_msr[10]; }; #define MCINFO_HYPERCALLSIZE 1024 diff --git a/xen/include/xen/stop_machine.h b/xen/include/xen/stop_machine.h index 7d4506869f..750e19ad13 100644 --- a/xen/include/xen/stop_machine.h +++ b/xen/include/xen/stop_machine.h @@ -5,7 +5,7 @@ * stop_machine_run: freeze the machine on all CPUs and run this function * @fn: the function to run * @data: the data ptr for the @fn() - * @cpu: the cpu to run @fn() on (or any, if @cpu == NR_CPUS). + * @cpus: cpus to run @fn() on. * * Description: This causes every other cpu to enter a safe point, with * each of which disables interrupts, and finally interrupts are disabled @@ -14,6 +14,6 @@ * * This can be thought of as a very heavy write lock, equivalent to * grabbing every spinlock in the kernel. */ -int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu); +int stop_machine_run(int (*fn)(void *), void *data, cpumask_t cpu); #endif /* __XEN_STOP_MACHINE_H__ */
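
The per-bank CMCI ownership machinery that mce_intel.c introduces (the per-CPU
mce_banks_owned bitmap, MSR_IA32_MC0_CTL2, CMCI_EN and CMCI_THRESHOLD_MASK) is
only partially visible above. The following is a minimal sketch of how a CPU
typically claims a bank under that scheme; it is not the patch's literal code,
and the helper name cmci_try_claim_bank is invented here for illustration.

static void cmci_try_claim_bank(int i)
{
    u64 val;

    rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

    if (val & CMCI_EN) {
        /* A sibling core already owns this (possibly shared) bank. */
        clear_bit(i, __get_cpu_var(mce_banks_owned));
        return;
    }

    /* Request CMCI with a threshold of one corrected error. */
    val &= ~(u64)CMCI_THRESHOLD_MASK;
    val |= CMCI_EN | 1;
    wrmsrl(MSR_IA32_MC0_CTL2 + i, val);

    /* If CMCI_EN sticks, the bank supports CMCI and this CPU now owns it. */
    rdmsrl(MSR_IA32_MC0_CTL2 + i, val);
    if (val & CMCI_EN)
        set_bit(i, __get_cpu_var(mce_banks_owned));
}

machine_check_poll() then honours this bitmap: in the MC_FLAG_CMCI case it
only reads the MSRs of banks set in mce_banks_owned, which keeps two cores on
one socket from logging the same shared-bank event twice.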
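
Besides the IDT gate installed by apic_intr_init(), each CPU also has to point
its local APIC CMCI LVT entry at CMCI_APIC_VECTOR. The patch's own routine for
that is not shown above; below is a minimal sketch, mirroring the structure of
intel_init_thermal(). The name cmci_enable_lvt and the decision to leave an
already-programmed entry alone are assumptions.

static void cmci_enable_lvt(void)
{
    u32 l = apic_read(APIC_CMCI);

    /* Respect a vector that firmware or an earlier pass already installed. */
    if (l & APIC_VECTOR_MASK)
        return;

    /* Fixed delivery of our vector, unmasked. */
    apic_write_around(APIC_CMCI, CMCI_APIC_VECTOR | APIC_DM_FIXED);
}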
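
Only the tail of the CMCI interrupt handler survives above (the
send_guest_global_virq(dom0, VIRQ_MCA) / irq_exit() fragment). A hedged
reconstruction of its overall shape follows; the name smp_cmci_interrupt is
assumed by analogy with smp_thermal_interrupt, and the exact ordering of the
APIC acknowledge is a guess.

fastcall void smp_cmci_interrupt(struct cpu_user_regs *regs)
{
    int nr_unit;
    struct mc_info *mi = x86_mcinfo_getptr();

    ack_APIC_irq();
    irq_enter();

    /* Scan only the banks this CPU owns, tagging the record as CMCI. */
    nr_unit = machine_check_poll(mi, MC_FLAG_CMCI);
    if (nr_unit) {
        x86_mcinfo_dump(mi);
        /* Hand the telemetry to dom0 if it registered a VIRQ_MCA handler. */
        if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA))
            send_guest_global_virq(dom0, VIRQ_MCA);
    }

    irq_exit();
}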
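
machine_check_poll() folds the per-bank MCi_STATUS bits into one of the
severity flags in xen-mca.h (including the new MC_FLAG_RECOVERABLE): any bank
with PCC set makes the whole event uncorrectable, UC without PCC is reported
as recoverable, and everything else is a corrected error. Restated as a
standalone helper for clarity; this is a sketch, the patch does it inline with
its uc/pcc locals.

static uint32_t mc_classify(int uc, int pcc)
{
    if (pcc)
        return MC_FLAG_UNCORRECTABLE;  /* processor context corrupt */
    if (uc)
        return MC_FLAG_RECOVERABLE;    /* uncorrected, but context intact */
    return MC_FLAG_CORRECTABLE;        /* corrected error (CMCI/poll case) */
}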
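
The polling path (intel_mcheck_timer() / mce_intel_work_fn()) adapts its
interval: each CPU that reported errors in a pass increments adjust, the next
period is divided by adjust + 1, a quiet pass doubles it, and the result is
clamped to [MCE_MIN, MCE_MAX] milliseconds. The same arithmetic as a pure
function, for illustration only (next_poll_period is not a name used by the
patch):

static u64 next_poll_period(u64 period, int adjust)
{
    if (adjust)
        period /= (adjust + 1);
    else
        period *= 2;

    if (period > MCE_MAX)
        period = MCE_MAX;
    if (period < MCE_MIN)
        period = MCE_MIN;

    return period;
}

Starting from MCE_PERIOD (4000 ms), a pass in which two CPUs find errors gives
4000 / 3 ≈ 1333 ms, which the clamp raises back to MCE_MIN (2000 ms); a quiet
pass stretches 4000 ms to 8000 ms, and repeated quiet passes saturate at
MCE_MAX (32000 ms).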
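
mce_init() walks the banks from firstbank upwards and skips any bank whose
IA32_MCi_CTL is already non-zero, on the grounds that a sibling core sharing
the bank has initialized it first. The intended guard, pulled out as a helper
for readability — a sketch, with the name bank_needs_init invented here and
the check written as an explicit both-halves-zero test:

static int bank_needs_init(int i)
{
    u32 l, h;

    rdmsr(MSR_IA32_MC0_CTL + 4 * i, l, h);

    /* An all-zero control register means nobody has set this bank up yet. */
    return !l && !h;
}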