From: Jan Beulich Date: Thu, 10 Apr 2014 14:01:41 +0000 (+0200) Subject: x86/EPT: force re-evaluation of memory type as necessary X-Git-Tag: archive/raspbian/4.8.0-1+rpi1~1^2~5232 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=aa9114edd97b292cd89b3616e3f2089471fd2201;p=xen.git x86/EPT: force re-evaluation of memory type as necessary The main goal here is to drop the bogus dependency of epte_get_entry_emt() on d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT]. Any change to state influencing epte_get_entry_emt()'s decision needs to result in re-calculation. Do this by using the EPT_MISCONFIG VM exit, storing an invalid memory type into EPT's emt field (leaving the IOMMU, which doesn't care about memory types, unaffected). This is being done in a hierarchical manner to keep execution time down: Initially only the top level directory gets invalidated this way. Upon access, the involved intermediate page table levels get cleared back to zero, and the leaf entry gets its field properly set. For 4k leaves all other entries in the same directory also get processed to amortize the cost of the extra VM exit (which halved the number of these VM exits in my testing). This restoring can result in spurious EPT_MISCONFIG VM exits (since two vCPU-s may access addresses involving identical page table structures). Rather than simply returning in such cases (and risking that such a VM exit results from a real mis-configuration, which would then result in an endless loop rather than killing the VM), a per-vCPU flag is being introduced indicating when such a spurious VM exit might validly happen - if another one occurs right after VM re-entry, the flag would generally end up being clear, causing the VM to be killed as before on such VM exits. Note that putting a reserved memory type value in the EPT structures isn't formally sanctioned by the specification. 
Intel isn't willing to adjust the specification to make this or a similar use of the EPT_MISCONFIG VM exit formally possible, but they have indicated that us using this is low risk wrt forward compatibility. Signed-off-by: Jan Beulich Reviewed-by: Tim Deegan Acked-by: Kevin Tian --- diff --git a/xen/arch/x86/domctl.c b/xen/arch/x86/domctl.c index 26635ffc47..d626e6f455 100644 --- a/xen/arch/x86/domctl.c +++ b/xen/arch/x86/domctl.c @@ -83,6 +83,8 @@ long arch_do_domctl( ret = ioports_permit_access(d, fp, fp + np - 1); else ret = ioports_deny_access(d, fp, fp + np - 1); + if ( !ret ) + memory_type_changed(d); } break; @@ -706,6 +708,8 @@ long arch_do_domctl( ret, add ? "removing" : "denying", d->domain_id, mfn, mfn + nr_mfns - 1); } + /* Do this unconditionally to cover errors on above failure paths. */ + memory_type_changed(d); } break; @@ -792,6 +796,8 @@ long arch_do_domctl( "ioport_map: error %ld denying dom%d access to [%x,%x]\n", ret, d->domain_id, fmp, fmp + np - 1); } + if ( !ret ) + memory_type_changed(d); } break; diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c index 8c3797e305..46634cd5bc 100644 --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -252,6 +252,9 @@ int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat) if ( !hvm_funcs.set_guest_pat(v, guest_pat) ) v->arch.hvm_vcpu.pat_cr = guest_pat; + + memory_type_changed(v->domain); + return 1; } diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c index 45c8e7b463..ae24efa972 100644 --- a/xen/arch/x86/hvm/mtrr.c +++ b/xen/arch/x86/hvm/mtrr.c @@ -431,8 +431,12 @@ bool_t mtrr_def_type_msr_set(struct domain *d, struct mtrr_state *m, return 0; } - m->enabled = enabled; - m->def_type = def_type; + if ( m->enabled != enabled || m->def_type != def_type ) + { + m->enabled = enabled; + m->def_type = def_type; + memory_type_changed(d); + } return 1; } @@ -452,6 +456,7 @@ bool_t mtrr_fix_range_msr_set(struct domain *d, struct mtrr_state *m, return 0; fixed_range_base[row] = 
msr_content; + memory_type_changed(d); } return 1; @@ -496,6 +501,8 @@ bool_t mtrr_var_range_msr_set( m->overlapped = is_var_mtrr_overlapped(m); + memory_type_changed(d); + return 1; } @@ -690,6 +697,12 @@ static int hvm_load_mtrr_msr(struct domain *d, hvm_domain_context_t *h) HVM_REGISTER_SAVE_RESTORE(MTRR, hvm_save_mtrr_msr, hvm_load_mtrr_msr, 1, HVMSR_PER_VCPU); +void memory_type_changed(struct domain *d) +{ + if ( iommu_enabled && !iommu_snoop && d->vcpu && d->vcpu[0] ) + p2m_memory_type_changed(d); +} + uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn, uint8_t *ipat, bool_t direct_mmio) { @@ -733,8 +746,7 @@ uint8_t epte_get_entry_emt(struct domain *d, unsigned long gfn, mfn_t mfn, return MTRR_TYPE_WRBACK; } - gmtrr_mtype = is_hvm_domain(d) && v && - d->arch.hvm_domain.params[HVM_PARAM_IDENT_PT] ? + gmtrr_mtype = is_hvm_domain(d) && v ? get_mtrr_type(&v->arch.hvm_vcpu.mtrr, (gfn << PAGE_SHIFT)) : MTRR_TYPE_WRBACK; diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c index 77ce167872..180cf6ccc9 100644 --- a/xen/arch/x86/hvm/vmx/vmx.c +++ b/xen/arch/x86/hvm/vmx/vmx.c @@ -3016,6 +3016,16 @@ void vmx_vmexit_handler(struct cpu_user_regs *regs) break; } + case EXIT_REASON_EPT_MISCONFIG: + { + paddr_t gpa; + + __vmread(GUEST_PHYSICAL_ADDRESS, &gpa); + if ( !ept_handle_misconfig(gpa) ) + goto exit_and_crash; + break; + } + case EXIT_REASON_MONITOR_TRAP_FLAG: v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG; vmx_update_cpu_exec_control(v); diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c index 99a10845ba..342fc7048b 100644 --- a/xen/arch/x86/mm/p2m-ept.c +++ b/xen/arch/x86/mm/p2m-ept.c @@ -270,6 +270,125 @@ static int ept_next_level(struct p2m_domain *p2m, bool_t read_only, return GUEST_TABLE_NORMAL_PAGE; } +static bool_t ept_invalidate_emt(mfn_t mfn) +{ + ept_entry_t *epte = map_domain_page(mfn_x(mfn)); + unsigned int i; + bool_t changed = 0; + + for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ ) + { + 
ept_entry_t e = atomic_read_ept_entry(&epte[i]); + + if ( !is_epte_valid(&e) || !is_epte_present(&e) || + e.emt == MTRR_NUM_TYPES ) + continue; + + e.emt = MTRR_NUM_TYPES; + atomic_write_ept_entry(&epte[i], e); + changed = 1; + } + + unmap_domain_page(epte); + + return changed; +} + +bool_t ept_handle_misconfig(uint64_t gpa) +{ + struct vcpu *curr = current; + struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain); + struct ept_data *ept = &p2m->ept; + unsigned int level = ept_get_wl(ept); + unsigned long gfn = PFN_DOWN(gpa); + unsigned long mfn = ept_get_asr(ept); + ept_entry_t *epte; + int okay; + + if ( !mfn ) + return 0; + + p2m_lock(p2m); + + okay = -curr->arch.hvm_vmx.ept_spurious_misconfig; + for ( ; ; --level ) + { + ept_entry_t e; + unsigned int i; + + epte = map_domain_page(mfn); + i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1); + e = atomic_read_ept_entry(&epte[i]); + + if ( level == 0 || is_epte_superpage(&e) ) + { + uint8_t ipat = 0; + + if ( e.emt != MTRR_NUM_TYPES ) + break; + + if ( level == 0 ) + { + for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i ) + { + e = atomic_read_ept_entry(&epte[i]); + if ( e.emt == MTRR_NUM_TYPES ) + e.emt = 0; + if ( !is_epte_valid(&e) || !is_epte_present(&e) ) + continue; + e.emt = epte_get_entry_emt(p2m->domain, gfn + i, + _mfn(e.mfn), &ipat, + e.sa_p2mt == p2m_mmio_direct); + e.ipat = ipat; + atomic_write_ept_entry(&epte[i], e); + } + } + else + { + e.emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn), + &ipat, + e.sa_p2mt == p2m_mmio_direct); + e.ipat = ipat; + atomic_write_ept_entry(&epte[i], e); + } + + okay = 1; + break; + } + + if ( e.emt == MTRR_NUM_TYPES ) + { + ASSERT(is_epte_present(&e)); + ept_invalidate_emt(_mfn(e.mfn)); + smp_wmb(); + e.emt = 0; + atomic_write_ept_entry(&epte[i], e); + unmap_domain_page(epte); + okay = 1; + } + else if ( is_epte_present(&e) && !e.emt ) + unmap_domain_page(epte); + else + break; + + mfn = e.mfn; + } + + unmap_domain_page(epte); + if ( okay > 0 
) + { + struct vcpu *v; + + for_each_vcpu ( curr->domain, v ) + v->arch.hvm_vmx.ept_spurious_misconfig = 1; + } + curr->arch.hvm_vmx.ept_spurious_misconfig = 0; + ept_sync_domain(p2m); + p2m_unlock(p2m); + + return !!okay; +} + /* * ept_set_entry() computes 'need_modify_vtd_table' for itself, * by observing whether any gfn->mfn translations are modified. @@ -660,6 +779,17 @@ static void ept_change_entry_type_global(struct p2m_domain *p2m, ept_sync_domain(p2m); } +static void ept_memory_type_changed(struct p2m_domain *p2m) +{ + unsigned long mfn = ept_get_asr(&p2m->ept); + + if ( !mfn ) + return; + + if ( ept_invalidate_emt(_mfn(mfn)) ) + ept_sync_domain(p2m); +} + static void __ept_sync_domain(void *info) { struct ept_data *ept = &((struct p2m_domain *)info)->ept; @@ -697,6 +827,7 @@ int ept_p2m_init(struct p2m_domain *p2m) p2m->set_entry = ept_set_entry; p2m->get_entry = ept_get_entry; p2m->change_entry_type_global = ept_change_entry_type_global; + p2m->memory_type_changed = ept_memory_type_changed; p2m->audit_p2m = NULL; /* Set the memory type used when accessing EPT paging structures. */ @@ -737,6 +868,7 @@ static void ept_dump_p2m_table(unsigned char key) [MTRR_TYPE_WRTHROUGH] = "WT", [MTRR_TYPE_WRPROT] = "WP", [MTRR_TYPE_WRBACK] = "WB", + [MTRR_NUM_TYPES] = "??" }; for_each_domain(d) @@ -750,11 +882,16 @@ static void ept_dump_p2m_table(unsigned char key) for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += 1UL << order ) { + char c = 0; + gfn_remainder = gfn; table = map_domain_page(pagetable_get_pfn(p2m_get_pagetable(p2m))); for ( i = ept_get_wl(ept); i > 0; i-- ) { + ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER)); + if ( ept_entry->emt == MTRR_NUM_TYPES ) + c = '?'; ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i); if ( ret != GUEST_TABLE_NORMAL_PAGE ) break; @@ -775,7 +912,7 @@ static void ept_dump_p2m_table(unsigned char key) memory_types[ept_entry->emt][0], memory_types[ept_entry->emt][1] ?: ept_entry->emt + '0', - ept_entry->ipat ? '!' 
: ' '); + c ?: ept_entry->ipat ? '!' : ' '); if ( !(record_counter++ % 100) ) process_pending_softirqs(); diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c index c38f334a88..d2de237d97 100644 --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -200,6 +200,18 @@ void p2m_change_entry_type_global(struct domain *d, p2m_unlock(p2m); } +void p2m_memory_type_changed(struct domain *d) +{ + struct p2m_domain *p2m = p2m_get_hostp2m(d); + + if ( p2m->memory_type_changed ) + { + p2m_lock(p2m); + p2m->memory_type_changed(p2m); + p2m_unlock(p2m); + } +} + mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn, p2m_type_t *t, p2m_access_t *a, p2m_query_t q, unsigned int *page_order, bool_t locked) diff --git a/xen/common/domctl.c b/xen/common/domctl.c index 5342e5df71..5e807abef7 100644 --- a/xen/common/domctl.c +++ b/xen/common/domctl.c @@ -815,6 +815,10 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl) ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1); else ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1); +#ifdef CONFIG_X86 + if ( !ret ) + memory_type_changed(d); +#endif } break; diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h index 45dcfa1def..445b39f63b 100644 --- a/xen/include/asm-x86/hvm/vmx/vmcs.h +++ b/xen/include/asm-x86/hvm/vmx/vmcs.h @@ -124,6 +124,9 @@ struct arch_vmx_struct { unsigned long host_cr0; + /* Do we need to tolerate a spurious EPT_MISCONFIG VM exit? */ + bool_t ept_spurious_misconfig; + /* Is the guest in real mode? */ uint8_t vmx_realmode; /* Are we emulating rather than VMENTERing? 
*/ diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h index d403099003..2e8cd70c64 100644 --- a/xen/include/asm-x86/hvm/vmx/vmx.h +++ b/xen/include/asm-x86/hvm/vmx/vmx.h @@ -520,6 +520,7 @@ int ept_p2m_init(struct p2m_domain *p2m); void ept_p2m_uninit(struct p2m_domain *p2m); void ept_walk_table(struct domain *d, unsigned long gfn); +bool_t ept_handle_misconfig(uint64_t gpa); void setup_ept_dump(void); void update_guest_eip(void); diff --git a/xen/include/asm-x86/mtrr.h b/xen/include/asm-x86/mtrr.h index 1b5993e6bb..b3f238cbaa 100644 --- a/xen/include/asm-x86/mtrr.h +++ b/xen/include/asm-x86/mtrr.h @@ -88,6 +88,7 @@ extern bool_t mtrr_fix_range_msr_set(struct domain *, struct mtrr_state *, uint32_t row, uint64_t msr_content); extern bool_t mtrr_def_type_msr_set(struct domain *, struct mtrr_state *, uint64_t msr_content); +extern void memory_type_changed(struct domain *); extern bool_t pat_msr_set(uint64_t *pat, uint64_t msr); bool_t is_var_mtrr_overlapped(struct mtrr_state *m); diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h index d644f82c3c..c64f3d0652 100644 --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -233,6 +233,7 @@ struct p2m_domain { void (*change_entry_type_global)(struct p2m_domain *p2m, p2m_type_t ot, p2m_type_t nt); + void (*memory_type_changed)(struct p2m_domain *p2m); void (*write_p2m_entry)(struct p2m_domain *p2m, unsigned long gfn, l1_pgentry_t *p, @@ -506,6 +507,9 @@ void p2m_change_type_range(struct domain *d, p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn, p2m_type_t ot, p2m_type_t nt); +/* Report a change affecting memory types. */ +void p2m_memory_type_changed(struct domain *d); + /* Set mmio addresses in the p2m table (for pass-through) */ int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn); int clear_mmio_p2m_entry(struct domain *d, unsigned long gfn);