From 12794ba7df5ceed2f4ccf2818e44dcaaa78c210e Mon Sep 17 00:00:00 2001
From: "kfraser@dhcp93.uk.xensource.com" <kfraser@dhcp93.uk.xensource.com>
Date: Fri, 16 Jun 2006 18:18:55 +0100
Subject: [PATCH] [XEN] Make the spurious page-fault detection logic more robust.

In particular it must be able to handle spurious write faults on
mappings that have been changed from read-only to writable. If a CPU
has a stale read-only entry in its TLB, it is allowed to fault on the
next write access without re-walking the page table.

Signed-off-by: Keir Fraser
---
 xen/arch/x86/traps.c            | 184 +++++++++++++++++++++++---------
 xen/arch/x86/x86_32/traps.c     |  34 ------
 xen/arch/x86/x86_64/traps.c     |  34 ------
 xen/include/asm-x86/processor.h |   8 +-
 4 files changed, 142 insertions(+), 118 deletions(-)

diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index 9da6f65c5b..9b0b7e31c6 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -511,9 +511,9 @@ void propagate_page_fault(unsigned long addr, u16 error_code)
     v->vcpu_info->arch.cr2 = addr;
 
     /* Re-set error_code.user flag appropriately for the guest. */
-    error_code &= ~4;
+    error_code &= ~PGERR_user_mode;
     if ( !guest_kernel_mode(v, guest_cpu_user_regs()) )
-        error_code |= 4;
+        error_code |= PGERR_user_mode;
 
     ti = &v->arch.guest_context.trap_ctxt[TRAP_page_fault];
     tb->flags = TBF_EXCEPTION | TBF_EXCEPTION_ERRCODE;
@@ -578,10 +578,125 @@ static int handle_gdt_ldt_mapping_fault(
     (((va) >= HYPERVISOR_VIRT_START))
 #endif
 
+static int __spurious_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
+{
+    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
+#if CONFIG_PAGING_LEVELS >= 4
+    l4_pgentry_t l4e, *l4t;
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    l3_pgentry_t l3e, *l3t;
+#endif
+    l2_pgentry_t l2e, *l2t;
+    l1_pgentry_t l1e, *l1t;
+    unsigned int required_flags, disallowed_flags;
+
+    required_flags = _PAGE_PRESENT;
+    if ( regs->error_code & PGERR_write_access )
+        required_flags |= _PAGE_RW;
+    if ( regs->error_code & PGERR_user_mode )
+        required_flags |= _PAGE_USER;
+
+    disallowed_flags = 0;
+    if ( regs->error_code & PGERR_instr_fetch )
+        disallowed_flags |= _PAGE_NX;
+
+#if CONFIG_PAGING_LEVELS >= 4
+    l4t = map_domain_page(mfn);
+    l4e = l4t[l4_table_offset(addr)];
+    mfn = l4e_get_pfn(l4e);
+    unmap_domain_page(l4t);
+    if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
+         (l4e_get_flags(l4e) & disallowed_flags) )
+        return 0;
+#endif
+
+#if CONFIG_PAGING_LEVELS >= 3
+    l3t = map_domain_page(mfn);
+    l3e = l3t[l3_table_offset(addr)];
+    mfn = l3e_get_pfn(l3e);
+    unmap_domain_page(l3t);
+#ifdef CONFIG_X86_PAE
+    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+        return 0;
+#else
+    if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
+         (l3e_get_flags(l3e) & disallowed_flags) )
+        return 0;
+#endif
+#endif
+
+    l2t = map_domain_page(mfn);
+    l2e = l2t[l2_table_offset(addr)];
+    mfn = l2e_get_pfn(l2e);
+    unmap_domain_page(l2t);
+    if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
+         (l2e_get_flags(l2e) & disallowed_flags) )
+        return 0;
+    if ( l2e_get_flags(l2e) & _PAGE_PSE )
+        return 1;
+
+    l1t = map_domain_page(mfn);
+    l1e = l1t[l1_table_offset(addr)];
+    mfn = l1e_get_pfn(l1e);
+    unmap_domain_page(l1t);
+    if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
+         (l1e_get_flags(l1e) & disallowed_flags) )
+        return 0;
+    return 1;
+}
+
+static int spurious_page_fault(
+    unsigned long addr, struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    struct domain *d = v->domain;
+    int is_spurious;
+
+    /* Reserved bit violations are never spurious faults. */
+    if ( regs->error_code & PGERR_reserved_bit )
+        return 0;
+
+    LOCK_BIGLOCK(d);
+
+    is_spurious = __spurious_page_fault(addr, regs);
+    if ( is_spurious )
+        goto out;
+
+    /*
+     * The only possible reason for a spurious page fault not to be picked
+     * up already is that a page directory was unhooked by writable page table
+     * logic and then reattached before the faulting VCPU could detect it.
+     */
+    if ( is_idle_domain(d) ||               /* no ptwr in idle domain */
+         IN_HYPERVISOR_RANGE(addr) ||       /* no ptwr on hypervisor addrs */
+         shadow_mode_enabled(d) ||          /* no ptwr logic in shadow mode */
+         (regs->error_code & PGERR_page_present) ) /* not-present fault? */
+        goto out;
+
+    /*
+     * The page directory could have been detached again while we weren't
+     * holding the per-domain lock. Detect that and fix up if it's the case.
+     */
+    if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
+         unlikely(l2_linear_offset(addr) ==
+                  d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
+    {
+        ptwr_flush(d, PTWR_PT_ACTIVE);
+        is_spurious = 1;
+    }
+
+ out:
+    UNLOCK_BIGLOCK(d);
+    return is_spurious;
+}
+
 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
 {
     struct vcpu *v = current;
     struct domain *d = v->domain;
+    int rc;
 
     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
     {
@@ -590,12 +705,20 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
         if ( (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
             return handle_gdt_ldt_mapping_fault(
                 addr - GDT_LDT_VIRT_START, regs);
+        /*
+         * Do not propagate spurious faults in the hypervisor area to the
+         * guest. It cannot fix them up.
+         */
+        LOCK_BIGLOCK(d);
+        rc = __spurious_page_fault(addr, regs);
+        UNLOCK_BIGLOCK(d);
+        return rc;
     }
-    else if ( unlikely(shadow_mode_enabled(d)) )
-    {
+
+    if ( unlikely(shadow_mode_enabled(d)) )
         return shadow_fault(addr, regs);
-    }
-    else if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
+
+    if ( likely(VM_ASSIST(d, VMASST_TYPE_writable_pagetables)) )
     {
         LOCK_BIGLOCK(d);
         if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
@@ -609,7 +732,10 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
 
         if ( guest_kernel_mode(v, regs) &&
              /* Protection violation on write? No reserved-bit violation? */
-             ((regs->error_code & 0xb) == 0x3) &&
+             ((regs->error_code & (PGERR_page_present |
+                                   PGERR_write_access |
+                                   PGERR_reserved_bit)) ==
+              (PGERR_page_present | PGERR_write_access)) &&
              ptwr_do_page_fault(d, addr, regs) )
         {
             UNLOCK_BIGLOCK(d);
@@ -621,46 +747,6 @@ static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
     return 0;
 }
 
-static int spurious_page_fault(unsigned long addr, struct cpu_user_regs *regs)
-{
-    struct vcpu *v = current;
-    struct domain *d = v->domain;
-    int rc;
-
-    /*
-     * The only possible reason for a spurious page fault not to be picked
-     * up already is that a page directory was unhooked by writable page table
-     * logic and then reattached before the faulting VCPU could detect it.
-     */
-    if ( is_idle_domain(d) ||               /* no ptwr in idle domain */
-         IN_HYPERVISOR_RANGE(addr) ||       /* no ptwr on hypervisor addrs */
-         shadow_mode_enabled(d) ||          /* no ptwr logic in shadow mode */
-         ((regs->error_code & 0x1d) != 0) ) /* simple not-present fault? */
-        return 0;
-
-    LOCK_BIGLOCK(d);
-
-    /*
-     * The page directory could have been detached again while we weren't
-     * holding the per-domain lock. Detect that and fix up if it's the case.
-     */
-    if ( unlikely(d->arch.ptwr[PTWR_PT_ACTIVE].l1va) &&
-         unlikely(l2_linear_offset(addr) ==
-                  d->arch.ptwr[PTWR_PT_ACTIVE].l2_idx) )
-    {
-        ptwr_flush(d, PTWR_PT_ACTIVE);
-        rc = 1;
-    }
-    else
-    {
-        /* Okay, walk the page tables. Only check for not-present faults.*/
-        rc = __spurious_page_fault(addr);
-    }
-
-    UNLOCK_BIGLOCK(d);
-    return rc;
-}
-
 /*
  * #PF error code:
  * Bit 0: Protection violation (=1) ; Page not present (=0)
@@ -784,8 +870,8 @@ static inline int admin_io_okay(
     (admin_io_okay(_p, 4, _d, _r) ? outl(_v, _p) : ((void)0))
 
 /* Propagate a fault back to the guest kernel. */
-#define USER_READ_FAULT  4 /* user mode, read fault */
-#define USER_WRITE_FAULT 6 /* user mode, write fault */
+#define USER_READ_FAULT  (PGERR_user_mode)
+#define USER_WRITE_FAULT (PGERR_user_mode | PGERR_write_access)
 #define PAGE_FAULT(_faultaddr, _errcode)        \
 ({  propagate_page_fault(_faultaddr, _errcode); \
     return EXCRET_fault_fixed;                  \
diff --git a/xen/arch/x86/x86_32/traps.c b/xen/arch/x86/x86_32/traps.c
index eb5beb44ad..c064415aad 100644
--- a/xen/arch/x86/x86_32/traps.c
+++ b/xen/arch/x86/x86_32/traps.c
@@ -113,40 +113,6 @@ void show_page_walk(unsigned long addr)
     unmap_domain_page(l1t);
 }
 
-int __spurious_page_fault(unsigned long addr)
-{
-    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-#ifdef CONFIG_X86_PAE
-    l3_pgentry_t l3e, *l3t;
-#endif
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-
-#ifdef CONFIG_X86_PAE
-    l3t = map_domain_page(mfn);
-    l3e = l3t[l3_table_offset(addr)];
-    mfn = l3e_get_pfn(l3e);
-    unmap_domain_page(l3t);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
-#endif
-
-    l2t = map_domain_page(mfn);
-    l2e = l2t[l2_table_offset(addr)];
-    mfn = l2e_get_pfn(l2e);
-    unmap_domain_page(l2t);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
-
-    l1t = map_domain_page(mfn);
-    l1e = l1t[l1_table_offset(addr)];
-    mfn = l1e_get_pfn(l1e);
-    unmap_domain_page(l1t);
-    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
 #define DOUBLEFAULT_STACK_SIZE 1024
 static struct tss_struct doublefault_tss;
 static unsigned char doublefault_stack[DOUBLEFAULT_STACK_SIZE];
diff --git a/xen/arch/x86/x86_64/traps.c b/xen/arch/x86/x86_64/traps.c
index 8c11a5ef4f..b5716c23a7 100644
--- a/xen/arch/x86/x86_64/traps.c
+++ b/xen/arch/x86/x86_64/traps.c
@@ -115,40 +115,6 @@ void show_page_walk(unsigned long addr)
     printk(" L1 = %"PRIpte" %016lx\n", l1e_get_intpte(l1e), pfn);
 }
 
-int __spurious_page_fault(unsigned long addr)
-{
-    unsigned long mfn = read_cr3() >> PAGE_SHIFT;
-    l4_pgentry_t l4e, *l4t;
-    l3_pgentry_t l3e, *l3t;
-    l2_pgentry_t l2e, *l2t;
-    l1_pgentry_t l1e, *l1t;
-
-    l4t = mfn_to_virt(mfn);
-    l4e = l4t[l4_table_offset(addr)];
-    mfn = l4e_get_pfn(l4e);
-    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-        return 0;
-
-    l3t = mfn_to_virt(mfn);
-    l3e = l3t[l3_table_offset(addr)];
-    mfn = l3e_get_pfn(l3e);
-    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-        return 0;
-
-    l2t = mfn_to_virt(mfn);
-    l2e = l2t[l2_table_offset(addr)];
-    mfn = l2e_get_pfn(l2e);
-    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-        return 0;
-    if ( l2e_get_flags(l2e) & _PAGE_PSE )
-        return 1;
-
-    l1t = mfn_to_virt(mfn);
-    l1e = l1t[l1_table_offset(addr)];
-    mfn = l1e_get_pfn(l1e);
-    return !!(l1e_get_flags(l1e) & _PAGE_PRESENT);
-}
-
 asmlinkage void double_fault(void);
 asmlinkage void do_double_fault(struct cpu_user_regs *regs)
 {
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index 2cda6c15f5..f32a763b27 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -129,6 +129,13 @@
 #define _TF_kernel_mode 0
 #define TF_kernel_mode (1<<_TF_kernel_mode)
 
+/* #PF error code values. */
+#define PGERR_page_present (1U<<0)
+#define PGERR_write_access (1U<<1)
+#define PGERR_user_mode    (1U<<2)
+#define PGERR_reserved_bit (1U<<3)
+#define PGERR_instr_fetch  (1U<<4)
+
 #ifndef __ASSEMBLY__
 
 struct domain;
@@ -524,7 +531,6 @@ extern always_inline void prefetchw(const void *x)
 void show_stack(struct cpu_user_regs *regs);
 void show_registers(struct cpu_user_regs *regs);
 void show_page_walk(unsigned long addr);
-int __spurious_page_fault(unsigned long addr);
 asmlinkage void fatal_trap(int trapnr, struct cpu_user_regs *regs);
 
 extern void mtrr_ap_init(void);
-- 
2.30.2
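
Note (illustration, not part of the patch): the new __spurious_page_fault()
treats a fault as spurious only if every flag implied by the #PF error code
is present in the page-table entry at each level of the walk, and no
disallowed flag is set. The standalone C sketch below shows just that mask
logic, using the same PGERR_*/_PAGE_* bit layout the patch introduces.
entry_satisfies_fault() is a hypothetical helper written for this note, not
a Xen function, and the _PAGE_NX bit position here is illustrative only
(the real NX bit is bit 63 of a 64-bit PTE).

    #include <stdint.h>

    #define PGERR_page_present (1U << 0)
    #define PGERR_write_access (1U << 1)
    #define PGERR_user_mode    (1U << 2)
    #define PGERR_reserved_bit (1U << 3)
    #define PGERR_instr_fetch  (1U << 4)

    #define _PAGE_PRESENT      (1U << 0)
    #define _PAGE_RW           (1U << 1)
    #define _PAGE_USER         (1U << 2)
    #define _PAGE_NX           (1U << 31) /* illustrative position only */

    /* Return 1 if one level's entry flags satisfy the access described
     * by the #PF error code, i.e. the fault could be stale-TLB noise. */
    int entry_satisfies_fault(uint32_t entry_flags, uint32_t error_code)
    {
        uint32_t required = _PAGE_PRESENT, disallowed = 0;

        /* Reserved-bit violations are never spurious; no walk needed. */
        if ( error_code & PGERR_reserved_bit )
            return 0;

        if ( error_code & PGERR_write_access )
            required |= _PAGE_RW;    /* a write needs a writable mapping */
        if ( error_code & PGERR_user_mode )
            required |= _PAGE_USER;  /* user access needs a user mapping */
        if ( error_code & PGERR_instr_fetch )
            disallowed |= _PAGE_NX;  /* instruction fetch forbids NX */

        /* Every required flag must be set ("any bit set" is not enough),
         * and no disallowed flag may be set. */
        return ((entry_flags & required) == required) &&
               !(entry_flags & disallowed);
    }

The patch applies this test at each level of the walk (L4/L3/L2/L1, as
configured), stopping early at a PSE superpage. If the walk succeeds, the
present and sufficiently permissive mapping proves the faulting access
would now succeed - e.g. a stale read-only TLB entry for a page that has
since been made writable - so the fault is swallowed and the instruction
retried instead of the fault being propagated.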