x86/EPT: force re-evaluation of memory type as necessary
author     Jan Beulich <jbeulich@suse.com>
           Thu, 10 Apr 2014 14:01:41 +0000 (16:01 +0200)
committer  Jan Beulich <jbeulich@suse.com>
           Thu, 10 Apr 2014 14:01:41 +0000 (16:01 +0200)
The main goal here is to drop the bogus dependency of
epte_get_entry_emt() on d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT].

Any change to state influencing epte_get_entry_emt()'s decision needs
to result in re-calculation. Do this by storing an invalid (reserved)
memory type into EPT entries' emt field, forcing the next access
through such an entry to take an EPT_MISCONFIG VM exit, at which point
the correct type can be re-derived. The IOMMU, which doesn't care
about memory types, is left unaffected.
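
As a rough standalone illustration (simplified C, not part of the
patch): an EPT leaf entry's memory type lives in bits 5:3, where only
the values 0 (UC), 1 (WC), 4 (WT), 5 (WP), and 6 (WB) are
architecturally valid, so writing MTRR_NUM_TYPES (i.e. 7) yields a
deliberately mis-configured entry that faults on next use:

    #include <stdbool.h>
    #include <stdint.h>

    #define MTRR_NUM_TYPES 7    /* one past MTRR_TYPE_WRBACK (6) */

    /* Architecturally valid EMT encodings for an EPT leaf entry. */
    static bool emt_is_valid(unsigned int emt)
    {
        return emt == 0 || emt == 1 || (emt >= 4 && emt <= 6);
    }

    /* Poison only the type bits (5:3); mapping and permission bits stay
     * intact, which is why an IOMMU sharing the tables keeps working. */
    static uint64_t invalidate_emt_bits(uint64_t epte)
    {
        return (epte & ~(7ULL << 3)) | ((uint64_t)MTRR_NUM_TYPES << 3);
    }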

This is done in a hierarchical manner to keep execution time down:
Initially only the top level directory gets invalidated this way. Upon
access, the emt fields of the involved intermediate page table entries
get cleared back to zero, and the leaf entry gets its memory type
properly re-calculated. For 4k leaves, all other entries in the same
directory also get processed, to amortize the cost of the extra VM
exit (this halved the number of these VM exits in my testing).
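
(A condensed sketch of that push-down step, with simplified stand-in
types and no locking; the real logic lives in ept_handle_misconfig()
in the patch below:)

    #define EPT_PAGETABLE_ENTRIES 512
    #define MTRR_NUM_TYPES 7

    struct ept_ent { unsigned int emt; /* other fields elided */ };

    /* Push the invalidation one level down, then mark this level's entry
     * clean again (the patch inserts an smp_wmb() between these steps),
     * so only tables the guest actually touches ever get re-processed. */
    static void push_invalidation_down(struct ept_ent *slot,
                                       struct ept_ent *child_table)
    {
        unsigned int i;

        for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
            child_table[i].emt = MTRR_NUM_TYPES;
        slot->emt = 0;    /* intermediate entries normally carry emt 0 */
    }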

This restoring can result in spurious EPT_MISCONFIG VM exits, since
two vCPU-s may access addresses translated via identical page table
structures at the same time. Rather than simply returning success in
such cases (and risking that a VM exit caused by a real
mis-configuration then results in an endless exit loop instead of the
VM getting killed), a per-vCPU flag is introduced to indicate when
such a spurious VM exit may validly happen: If another one occurs
right after VM re-entry, the flag will generally have been cleared
again, causing the VM to be killed as before on such VM exits.
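
(The flag protocol, condensed into standalone C; repair_walk() and the
vCPU list here are stand-ins, not the patch's real interfaces:)

    #include <stdbool.h>

    struct vcpu { bool ept_spurious_misconfig; struct vcpu *next; };

    static bool repair_walk(void) { return false; }  /* stand-in */

    /* Returns false if the exit neither fixed anything nor was tolerably
     * spurious -- the caller then crashes the VM. */
    static bool handle_misconfig(struct vcpu *curr, struct vcpu *all)
    {
        /* 1: fixed something; -1: tolerated as spurious; 0: bogus exit. */
        int okay = -curr->ept_spurious_misconfig;

        if ( repair_walk() )
        {
            okay = 1;
            /* Any vCPU may now fault on structures just repaired. */
            for ( struct vcpu *v = all; v; v = v->next )
                v->ept_spurious_misconfig = true;
        }

        /* At most one spurious exit in a row gets forgiven. */
        curr->ept_spurious_misconfig = false;

        return okay != 0;
    }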

Note that putting a reserved memory type value in the EPT structures
isn't formally sanctioned by the specification. Intel isn't willing to
adjust the specification to make this or a similar use of the
EPT_MISCONFIG VM exit formally possible, but they have indicated that
our use of it is low risk with regard to forward compatibility.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Tim Deegan <tim@xen.org>
Acked-by: Kevin Tian <kevin.tian@intel.com>
xen/arch/x86/domctl.c
xen/arch/x86/hvm/hvm.c
xen/arch/x86/hvm/mtrr.c
xen/arch/x86/hvm/vmx/vmx.c
xen/arch/x86/mm/p2m-ept.c
xen/arch/x86/mm/p2m.c
xen/common/domctl.c
xen/include/asm-x86/hvm/vmx/vmcs.h
xen/include/asm-x86/hvm/vmx/vmx.h
xen/include/asm-x86/mtrr.h
xen/include/asm-x86/p2m.h

diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c
index 26635ffc47945bbe2ab3b437b5bf2c742e90ad02..d626e6f455bb8bd0d75c0a1e8f8616eb7b6e8946 100644
@@ -83,6 +83,8 @@ long arch_do_domctl(
             ret = ioports_permit_access(d, fp, fp + np - 1);
         else
             ret = ioports_deny_access(d, fp, fp + np - 1);
+        if ( !ret )
+            memory_type_changed(d);
     }
     break;
 
@@ -706,6 +708,8 @@ long arch_do_domctl(
                        ret, add ? "removing" : "denying", d->domain_id,
                        mfn, mfn + nr_mfns - 1);
         }
+        /* Do this unconditionally to cover errors on above failure paths. */
+        memory_type_changed(d);
     }
     break;
 
@@ -792,6 +796,8 @@ long arch_do_domctl(
                        "ioport_map: error %ld denying dom%d access to [%x,%x]\n",
                        ret, d->domain_id, fmp, fmp + np - 1);
         }
+        if ( !ret )
+            memory_type_changed(d);
     }
     break;
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 8c3797e30592986a698d7da5c161d46b26fbf355..46634cd5bc097d4dec4993d0de9d89022ad6fc16 100644
@@ -252,6 +252,9 @@ int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat)
 
     if ( !hvm_funcs.set_guest_pat(v, guest_pat) )
         v->arch.hvm_vcpu.pat_cr = guest_pat;
+
+    memory_type_changed(v->domain);
+
     return 1;
 }
 
diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
index 45c8e7b463d1a25a4014f01c2431b42cdb10de96..ae24efa9727444188fc33778c8034f249f5d5e7b 100644
@@ -431,8 +431,12 @@ bool_t mtrr_def_type_msr_set(struct domain *d, struct mtrr_state *m,
          return 0;
     }
 
-    m->enabled = enabled;
-    m->def_type = def_type;
+    if ( m->enabled != enabled || m->def_type != def_type )
+    {
+        m->enabled = enabled;
+        m->def_type = def_type;
+        memory_type_changed(d);
+    }
 
     return 1;
 }
@@ -452,6 +456,7 @@ bool_t mtrr_fix_range_msr_set(struct domain *d, struct mtrr_state *m,
                 return 0;
 
         fixed_range_base[row] = msr_content;
+        memory_type_changed(d);
     }
 
     return 1;
@@ -496,6 +501,8 @@ bool_t mtrr_var_range_msr_set(
 
     m->overlapped = is_var_mtrr_overlapped(m);
 
+    memory_type_changed(d);
+
     return 1;
 }
 
@@ -690,6 +697,12 @@ static int hvm_load_mtrr_msr(struct domain *d, hvm_domain_context_t *h)
 HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr,
                           1, HVMSR_PER_VCPU);
 
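+/*
+ * EMT re-calculation is needed only when the effective memory type can
+ * actually vary, i.e. with a non-snooping IOMMU in use, and only once
+ * the domain has any vCPU at all.
+ */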
+void memory_type_changed(struct domain *d)
+{
+    if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] )
+        p2m_memory_type_changed(d);
+}
+
 uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
                            uint8_t *ipat, bool_t direct_mmio)
 {
@@ -733,8 +746,7 @@ uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn,
         return MTRR_TYPE_WRBACK;
     }
 
-    gmtrr_mtype = is_hvm_domain(d) && v &&
-                  d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ?
+    gmtrr_mtype = is_hvm_domain(d) && v ?
                   get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) :
                   MTRR_TYPE_WRBACK;
 
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 77ce16787281ef3566b0efbdfe46243d60df9742..180cf6ccc9844e7ef7c14b7dea6d1044fcdfec9c 100644
@@ -3016,6 +3016,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs)
         break;
     }
 
+    case EXIT_REASON_EPT_MISCONFIG:
+    {
+        paddr_t gpa;
+
+        __vmread(GUEST_PHYSICAL_ADDRESS, &gpa);
+        if ( !ept_handle_misconfig(gpa) )
+            goto exit_and_crash;
+        break;
+    }
+
     case EXIT_REASON_MONITOR_TRAP_FLAG:
         v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
         vmx_update_cpu_exec_control(v);
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 99a10845bae69b99e5f6604abe944e3dc366a50a..342fc7048b7f4d78ee50b876b28ce8fdc1eccf15 100644
@@ -270,6 +270,125 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
     return GUEST_TABLE_NORMAL_PAGE;
 }
 
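+/*
+ * Stamp all present entries of the given page table with an invalid EMT,
+ * forcing re-calculation via EPT_MISCONFIG on next access.
+ */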
+static bool_t ept_invalidate_emt(mfn_t mfn)
+{
+    ept_entry_t *epte = map_domain_page(mfn_x(mfn));
+    unsigned int i;
+    bool_t changed = 0;
+
+    for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
+    {
+        ept_entry_t e = atomic_read_ept_entry(&epte[i]);
+
+        if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
+             e.emt == MTRR_NUM_TYPES )
+            continue;
+
+        e.emt = MTRR_NUM_TYPES;
+        atomic_write_ept_entry(&epte[i], e);
+        changed = 1;
+    }
+
+    unmap_domain_page(epte);
+
+    return changed;
+}
+
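+/*
+ * Handle an EPT_MISCONFIG VM exit: re-derive memory types along the page
+ * table walk of gpa.  Returns 0 if nothing was fixed and the exit can't
+ * validly be spurious, in which case the caller crashes the domain.
+ */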
+bool_t ept_handle_misconfig(uint64_t gpa)
+{
+    struct vcpu *curr = current;
+    struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
+    struct ept_data *ept = &p2m->ept;
+    unsigned int level = ept_get_wl(ept);
+    unsigned long gfn = PFN_DOWN(gpa);
+    unsigned long mfn = ept_get_asr(ept);
+    ept_entry_t *epte;
+    int okay;
+
+    if ( !mfn )
+        return 0;
+
+    p2m_lock(p2m);
+
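+    /* A negative value records that this exit may validly be spurious. */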
+    okay = -curr->arch.hvm_vmx.ept_spurious_misconfig;
+    for ( ; ; --level )
+    {
+        ept_entry_t e;
+        unsigned int i;
+
+        epte = map_domain_page(mfn);
+        i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
+        e = atomic_read_ept_entry(&epte[i]);
+
+        if ( level == 0 || is_epte_superpage(&e) )
+        {
+            uint8_t ipat = 0;
+
+            if ( e.emt != MTRR_NUM_TYPES )
+                break;
+
+            if ( level == 0 )
+            {
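+                /* Fix up the entire leaf directory to amortize the exit. */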
+                for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
+                {
+                    e = atomic_read_ept_entry(&epte[i]);
+                    if ( e.emt == MTRR_NUM_TYPES )
+                        e.emt = 0;
+                    if ( !is_epte_valid(&e) || !is_epte_present(&e) )
+                        continue;
+                    e.emt = epte_get_entry_emt(p2m->domain, gfn + i,
+                                               _mfn(e.mfn), &ipat,
+                                               e.sa_p2mt == p2m_mmio_direct);
+                    e.ipat = ipat;
+                    atomic_write_ept_entry(&epte[i], e);
+                }
+            }
+            else
+            {
+                e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
+                                           &ipat,
+                                           e.sa_p2mt == p2m_mmio_direct);
+                e.ipat = ipat;
+                atomic_write_ept_entry(&epte[i], e);
+            }
+
+            okay = 1;
+            break;
+        }
+
+        if ( e.emt == MTRR_NUM_TYPES )
+        {
+            ASSERT(is_epte_present(&e));
+            ept_invalidate_emt(_mfn(e.mfn));
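+            /* Publish the lower level's markers before clearing ours. */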
+            smp_wmb();
+            e.emt = 0;
+            atomic_write_ept_entry(&epte[i], e);
+            unmap_domain_page(epte);
+            okay = 1;
+        }
+        else if ( is_epte_present(&e) && !e.emt )
+            unmap_domain_page(epte);
+        else
+            break;
+
+        mfn = e.mfn;
+    }
+
+    unmap_domain_page(epte);
+    if ( okay > 0 )
+    {
+        struct vcpu *v;
+
+        for_each_vcpu ( curr->domain, v )
+            v->arch.hvm_vmx.ept_spurious_misconfig = 1;
+    }
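+    /* Forgive at most one spurious exit in a row: clear our own flag. */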
+    curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
+    ept_sync_domain(p2m);
+    p2m_unlock(p2m);
+
+    return !!okay;
+}
+
 /*
  * ept_set_entry() computes 'need_modify_vtd_table' for itself,
  * by observing whether any gfn->mfn translations are modified.
@@ -660,6 +779,17 @@ static void ept_change_entry_type_global(struct p2m_domain *p2m,
     ept_sync_domain(p2m);
 }
 
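+/*
+ * Invalidate only the top level table here; lower levels get re-processed
+ * lazily, upon first access, in ept_handle_misconfig().
+ */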
+static void ept_memory_type_changed(struct p2m_domain *p2m)
+{
+    unsigned long mfn = ept_get_asr(&p2m->ept);
+
+    if ( !mfn )
+        return;
+
+    if ( ept_invalidate_emt(_mfn(mfn)) )
+        ept_sync_domain(p2m);
+}
+
 static void __ept_sync_domain(void *info)
 {
     struct ept_data *ept = &((struct p2m_domain *)info)->ept;
@@ -697,6 +827,7 @@ int ept_p2m_init(struct p2m_domain *p2m)
     p2m->set_entry = ept_set_entry;
     p2m->get_entry = ept_get_entry;
     p2m->change_entry_type_global = ept_change_entry_type_global;
+    p2m->memory_type_changed = ept_memory_type_changed;
     p2m->audit_p2m = NULL;
 
     /* Set the memory type used when accessing EPT paging structures. */
@@ -737,6 +868,7 @@ static void ept_dump_p2m_table(unsigned char key)
         [MTRR_TYPE_WRTHROUGH]      = "WT",
         [MTRR_TYPE_WRPROT]         = "WP",
         [MTRR_TYPE_WRBACK]         = "WB",
+        [MTRR_NUM_TYPES]           = "??"
     };
 
     for_each_domain(d)
@@ -750,11 +882,16 @@ static void ept_dump_p2m_table(unsigned char key)
 
         for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += 1UL << order )
         {
+            char c = 0;
+
             gfn_remainder = gfn;
             table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m)));
 
             for ( i = ept_get_wl(ept); i > 0; i-- )
             {
+                ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER));
+                if ( ept_entry->emt == MTRR_NUM_TYPES )
+                    c = '?';
                 ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
                 if ( ret != GUEST_TABLE_NORMAL_PAGE )
                     break;
@@ -775,7 +912,7 @@ static void ept_dump_p2m_table(unsigned char key)
                            memory_types[ept_entry->emt][0],
                            memory_types[ept_entry->emt][1]
                            ?: ept_entry->emt + '0',
-                           ept_entry->ipat ? '!' : ' ');
+                           c ?: ept_entry->ipat ? '!' : ' ');
 
                 if ( !(record_counter++ % 100) )
                     process_pending_softirqs();
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index c38f334a88dd1d805cfd9406e3db3ca0463625ea..d2de237d97bf7f1ed96d3ea9a8c18792ca211d4a 100644
@@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct domain *d,
     p2m_unlock(p2m);
 }
 
+void p2m_memory_type_changed(struct domain *d)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    if ( p2m->memory_type_changed )
+    {
+        p2m_lock(p2m);
+        p2m->memory_type_changed(p2m);
+        p2m_unlock(p2m);
+    }
+}
+
 mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                     p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                     unsigned int *page_order, bool_t locked)
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 5342e5df719d553a6890520ec796be7e7bb53a75..5e807abef7fe8470c1fce0add363724b3c634d17 100644
@@ -815,6 +815,10 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
             ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
         else
             ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
+#ifdef CONFIG_X86
+        if ( !ret )
+            memory_type_changed(d);
+#endif
     }
     break;
 
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 45dcfa1deff47153d322b4b7c6e42460f6b2bbe5..445b39f63bb67efdb10133c38430714d770434c0 100644
@@ -124,6 +124,9 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */
+    bool_t               ept_spurious_misconfig;
+
     /* Is the guest in real mode? */
     uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index d403099003ada2278631837fea04c1a41e7c9df4..2e8cd70c64de56c71c018826c47fc4d774f8ef3e 100644
@@ -520,6 +520,7 @@ int ept_p2m_init(struct p2m_domain *p2m);
 void ept_p2m_uninit(struct p2m_domain *p2m);
 
 void ept_walk_table(struct domain *d, unsigned long gfn);
+bool_t ept_handle_misconfig(uint64_t gpa);
 void setup_ept_dump(void);
 
 void update_guest_eip(void);
diff --git a/xen/include/asm-x86/mtrr.h b/xen/include/asm-x86/mtrr.h
index 1b5993e6bbfca299acad0842bf000abc9a0bee9b..b3f238cbaa796b7baad8b7bff6cce6158afd597a 100644
@@ -88,6 +88,7 @@ extern bool_t mtrr_fix_range_msr_set(struct domain *, struct mtrr_state *,
                                      uint32_t row, uint64_t msr_content);
 extern bool_t mtrr_def_type_msr_set(struct domain *, struct mtrr_state *,
                                     uint64_t msr_content);
+extern void memory_type_changed(struct domain *);
 extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr);
 
 bool_t is_var_mtrr_overlapped(struct mtrr_state *m);
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index d644f82c3cbfdbdff1396d3f66901d456754b955..c64f3d065204317a4d4264c67ac79ca7fb602db4 100644
@@ -233,6 +233,7 @@ struct p2m_domain {
     void               (*change_entry_type_global)(struct p2m_domain *p2m,
                                                    p2m_type_t ot,
                                                    p2m_type_t nt);
+    void               (*memory_type_changed)(struct p2m_domain *p2m);
     
     void               (*write_p2m_entry)(struct p2m_domain *p2m,
                                           unsigned long gfn, l1_pgentry_t *p,
@@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain *d,
 p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
                            p2m_type_t ot, p2m_type_t nt);
 
+/* Report a change affecting memory types. */
+void p2m_memory_type_changed(struct domain *d);
+
 /* Set mmio addresses in the p2m table (for pass-through) */
 int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn);
 int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);