x86: introduce and use scratch CPU mask
author Jan Beulich <jbeulich@suse.com>
Fri, 16 Dec 2016 13:34:34 +0000 (14:34 +0100)
committer Jan Beulich <jbeulich@suse.com>
Fri, 16 Dec 2016 13:34:34 +0000 (14:34 +0100)
__get_page_type(), so far using an on-stack CPU mask variable, is
involved in recursion when e.g. pinning page tables. This means there
may be up to five instances of the function active at a time, implying
five instances of the (up to 512-byte) CPU mask variable. An IRQ
happening at the deepest point of the stack has been observed to cause
a stack overflow with a 4095-pCPU build, when the IRQ handling results
in send_guest_pirq() being called (leading to vcpu_kick() -> ... ->
csched_vcpu_wake() -> __runq_tickle() -> cpumask_raise_softirq(), the
last two of which also have CPU mask variables on their stacks).
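
For scale, here is a minimal standalone sketch of the arithmetic (not
Xen code; NR_CPUS and the rounding merely mirror how a 4095-bit CPU
mask is laid out in 64-bit longs):

    #include <stdio.h>

    #define NR_CPUS 4095
    #define BITS_PER_LONG 64
    /* A CPU mask is a bitmap of NR_CPUS bits, rounded up to whole longs. */
    #define CPUMASK_BYTES \
        (((NR_CPUS + BITS_PER_LONG - 1) / BITS_PER_LONG) * sizeof(long))

    int main(void)
    {
        /* One on-stack mask per active instance of __get_page_type(). */
        printf("one mask: %zu bytes; five instances: %zu bytes\n",
               CPUMASK_BYTES, 5 * CPUMASK_BYTES); /* 512 and 2560 */
        return 0;
    }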

Introduce a per-CPU variable instead, which can then be used by any
code that never runs in IRQ context.

The mask can then also be used by other MMU code and by
msi_compose_msg() (and we will quite likely find further uses down the
road).
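
In short, the resulting pattern is (a condensed sketch stitched
together from the hunks below; all identifiers are those added or
touched by this patch):

    /* xen/include/asm-x86/smp.h */
    DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);

    /* xen/arch/x86/smpboot.c */
    DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);

    /* A consumer, e.g. in __get_page_type(). This is safe only because
     * the code never runs in IRQ context, so nothing can interrupt it
     * and clobber the shared per-CPU mask underneath it. */
    cpumask_t *mask = this_cpu(scratch_cpumask);

    ASSERT(!in_irq());
    cpumask_copy(mask, d->domain_dirty_cpumask);
    tlbflush_filter(mask, page->tlbflush_timestamp);
    if ( !cpumask_empty(mask) )
        flush_tlb_mask(mask);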

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
xen/arch/x86/mm.c
xen/arch/x86/msi.c
xen/arch/x86/smpboot.c
xen/include/asm-x86/smp.h

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 24a521138a5ea0ea89bbbaac06f4649604ab3dea..8a9fe8fa9a4db157e0ed66da35f0b398e5389745 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -2477,6 +2477,7 @@ static int __get_page_type(struct page_info *page, unsigned long type,
     int rc = 0, iommu_ret = 0;
 
     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
+    ASSERT(!in_irq());
 
     for ( ; ; )
     {
@@ -2509,20 +2510,21 @@ static int __get_page_type(struct page_info *page, unsigned long type,
                  * may be unnecessary (e.g., page was GDT/LDT) but those 
                  * circumstances should be very rare.
                  */
-                cpumask_t mask;
+                cpumask_t *mask = this_cpu(scratch_cpumask);
 
-                cpumask_copy(&mask, d->domain_dirty_cpumask);
+                BUG_ON(in_irq());
+                cpumask_copy(mask, d->domain_dirty_cpumask);
 
                 /* Don't flush if the timestamp is old enough */
-                tlbflush_filter(&mask, page->tlbflush_timestamp);
+                tlbflush_filter(mask, page->tlbflush_timestamp);
 
-                if ( unlikely(!cpumask_empty(&mask)) &&
+                if ( unlikely(!cpumask_empty(mask)) &&
                      /* Shadow mode: track only writable pages. */
                      (!shadow_mode_enabled(page_get_owner(page)) ||
                       ((nx & PGT_type_mask) == PGT_writable_page)) )
                 {
                     perfc_incr(need_flush_tlb_flush);
-                    flush_tlb_mask(&mask);
+                    flush_tlb_mask(mask);
                 }
 
                 /* We lose existing type and validity. */
@@ -3403,22 +3405,22 @@ long do_mmuext_op(
         case MMUEXT_TLB_FLUSH_MULTI:
         case MMUEXT_INVLPG_MULTI:
         {
-            cpumask_t pmask;
+            cpumask_t *mask = this_cpu(scratch_cpumask);
 
             if ( unlikely(d != pg_owner) )
                 rc = -EPERM;
             else if ( unlikely(vcpumask_to_pcpumask(d,
                                    guest_handle_to_param(op.arg2.vcpumask,
                                                          const_void),
-                                   &pmask)) )
+                                   mask)) )
                 rc = -EINVAL;
             if ( unlikely(rc) )
                 break;
 
             if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
-                flush_tlb_mask(&pmask);
+                flush_tlb_mask(mask);
             else if ( __addr_ok(op.arg1.linear_addr) )
-                flush_tlb_one_mask(&pmask, op.arg1.linear_addr);
+                flush_tlb_one_mask(mask, op.arg1.linear_addr);
             break;
         }
 
@@ -3456,14 +3458,14 @@ long do_mmuext_op(
             else if ( likely(cache_flush_permitted(d)) )
             {
                 unsigned int cpu;
-                cpumask_t mask;
+                cpumask_t *mask = this_cpu(scratch_cpumask);
 
-                cpumask_clear(&mask);
+                cpumask_clear(mask);
                 for_each_online_cpu(cpu)
-                    if ( !cpumask_intersects(&mask,
+                    if ( !cpumask_intersects(mask,
                                              per_cpu(cpu_sibling_mask, cpu)) )
-                        __cpumask_set_cpu(cpu, &mask);
-                flush_mask(&mask, FLUSH_CACHE);
+                        __cpumask_set_cpu(cpu, mask);
+                flush_mask(mask, FLUSH_CACHE);
             }
             else
             {
@@ -4458,7 +4460,7 @@ static int __do_update_va_mapping(
     struct page_info *gl1pg;
     l1_pgentry_t  *pl1e;
     unsigned long  bmap_ptr, gl1mfn;
-    cpumask_t      pmask;
+    cpumask_t     *mask = NULL;
     int            rc;
 
     perfc_incr(calls_to_update_va);
@@ -4504,15 +4506,17 @@ static int __do_update_va_mapping(
             flush_tlb_local();
             break;
         case UVMF_ALL:
-            flush_tlb_mask(d->domain_dirty_cpumask);
+            mask = d->domain_dirty_cpumask;
             break;
         default:
+            mask = this_cpu(scratch_cpumask);
             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
                                                                      void),
-                                      &pmask);
-            flush_tlb_mask(&pmask);
+                                      mask);
             break;
         }
+        if ( mask )
+            flush_tlb_mask(mask);
         break;
 
     case UVMF_INVLPG:
@@ -4522,15 +4526,17 @@ static int __do_update_va_mapping(
             paging_invlpg(v, va);
             break;
         case UVMF_ALL:
-            flush_tlb_one_mask(d->domain_dirty_cpumask, va);
+            mask = d->domain_dirty_cpumask;
             break;
         default:
+            mask = this_cpu(scratch_cpumask);
             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
                                                                      void),
-                                      &pmask);
-            flush_tlb_one_mask(&pmask, va);
+                                      mask);
             break;
         }
+        if ( mask )
+            flush_tlb_one_mask(mask, va);
         break;
     }
 
diff --git a/xen/arch/x86/msi.c b/xen/arch/x86/msi.c
index 2c38adb1b1ce9d1753109a44471eae5bbe0c948a..6756a14f1e46462b4c9678cc797e863580d7b2d7 100644
--- a/xen/arch/x86/msi.c
+++ b/xen/arch/x86/msi.c
@@ -40,7 +40,6 @@ static void __pci_disable_msix(struct msi_desc *);
 /* bitmap indicate which fixed map is free */
 static DEFINE_SPINLOCK(msix_fixmap_lock);
 static DECLARE_BITMAP(msix_fixmap_pages, FIX_MSIX_MAX_PAGES);
-static DEFINE_PER_CPU(cpumask_var_t, scratch_mask);
 
 static int msix_fixmap_alloc(void)
 {
@@ -167,7 +166,7 @@ void msi_compose_msg(unsigned vector, const cpumask_t *cpu_mask, struct msi_msg
 
     if ( cpu_mask )
     {
-        cpumask_t *mask = this_cpu(scratch_mask);
+        cpumask_t *mask = this_cpu(scratch_cpumask);
 
         if ( !cpumask_intersects(cpu_mask, &cpu_online_map) )
             return;
@@ -1458,43 +1457,12 @@ int pci_restore_msi_state(struct pci_dev *pdev)
     return 0;
 }
 
-static int msi_cpu_callback(
-    struct notifier_block *nfb, unsigned long action, void *hcpu)
-{
-    unsigned int cpu = (unsigned long)hcpu;
-
-    switch ( action )
-    {
-    case CPU_UP_PREPARE:
-        if ( !alloc_cpumask_var(&per_cpu(scratch_mask, cpu)) )
-            return notifier_from_errno(ENOMEM);
-        break;
-    case CPU_UP_CANCELED:
-    case CPU_DEAD:
-        free_cpumask_var(per_cpu(scratch_mask, cpu));
-        break;
-    default:
-        break;
-    }
-
-    return NOTIFY_DONE;
-}
-
-static struct notifier_block msi_cpu_nfb = {
-    .notifier_call = msi_cpu_callback
-};
-
 void __init early_msi_init(void)
 {
     if ( use_msi < 0 )
         use_msi = !(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI);
     if ( !use_msi )
         return;
-
-    register_cpu_notifier(&msi_cpu_nfb);
-    if ( msi_cpu_callback(&msi_cpu_nfb, CPU_UP_PREPARE, NULL) &
-         NOTIFY_STOP_MASK )
-        BUG();
 }
 
 static void dump_msi(unsigned char key)
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index 3a9dd3ea9c54c557b86cc519b4fc9d9341042b80..ba651fb03868c9065e1f7459dfd32d4e6c6fab78 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -56,6 +56,8 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_sibling_mask);
 /* representing HT and core siblings of each logical CPU */
 DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_core_mask);
 
+DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, scratch_cpumask);
+
 cpumask_t cpu_online_map __read_mostly;
 EXPORT_SYMBOL(cpu_online_map);
 
@@ -646,6 +648,7 @@ static void cpu_smpboot_free(unsigned int cpu)
 
     free_cpumask_var(per_cpu(cpu_sibling_mask, cpu));
     free_cpumask_var(per_cpu(cpu_core_mask, cpu));
+    free_cpumask_var(per_cpu(scratch_cpumask, cpu));
 
     if ( per_cpu(stubs.addr, cpu) )
     {
@@ -734,7 +737,8 @@ static int cpu_smpboot_alloc(unsigned int cpu)
         goto oom;
 
     if ( zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, cpu)) &&
-         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) )
+         zalloc_cpumask_var(&per_cpu(cpu_core_mask, cpu)) &&
+         alloc_cpumask_var(&per_cpu(scratch_cpumask, cpu)) )
         return 0;
 
  oom:
@@ -791,7 +795,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
         panic("No memory for socket CPU siblings map");
 
     if ( !zalloc_cpumask_var(&per_cpu(cpu_sibling_mask, 0)) ||
-         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) )
+         !zalloc_cpumask_var(&per_cpu(cpu_core_mask, 0)) ||
+         !alloc_cpumask_var(&per_cpu(scratch_cpumask, 0)) )
         panic("No memory for boot CPU sibling/core maps");
 
     set_cpu_sibling_map(0);
diff --git a/xen/include/asm-x86/smp.h b/xen/include/asm-x86/smp.h
index e3782bbef1a467d7f3387b7e597a03a03466a3b6..95d6cd29fe72160b4b4a6269099b19a702680afc 100644
--- a/xen/include/asm-x86/smp.h
+++ b/xen/include/asm-x86/smp.h
@@ -25,6 +25,7 @@
  */
 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_mask);
 DECLARE_PER_CPU(cpumask_var_t, cpu_core_mask);
+DECLARE_PER_CPU(cpumask_var_t, scratch_cpumask);
 
 void smp_send_nmi_allbutself(void);