[LINUX] Add spurious page-fault detection, intended primarily
for spurious write faults on mappings that have been
changed from read-only to writable. If a CPU has a stale
read-only entry in its TLB, it is allowed to fault on
the next write access without re-walking the page table.

author     kfraser@dhcp93.uk.xensource.com <kfraser@dhcp93.uk.xensource.com>
           Fri, 16 Jun 2006 17:19:40 +0000 (18:19 +0100)
committer  kfraser@dhcp93.uk.xensource.com <kfraser@dhcp93.uk.xensource.com>
           Fri, 16 Jun 2006 17:19:40 +0000 (18:19 +0100)
Signed-off-by: Keir Fraser <keir@xensource.com>
linux-2.6-xen-sparse/arch/i386/mm/fault-xen.c
linux-2.6-xen-sparse/arch/x86_64/mm/fault-xen.c
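
The mechanism, in brief: if a PTE is upgraded from read-only to
writable without a TLB flush, a CPU that still caches the old
read-only translation may raise one write fault; the handler must
notice that the in-memory entry already permits the access and simply
return. A minimal stand-alone sketch of that classification follows,
using mock types and the architectural x86 error-code bits; the names
is_spurious, PTE_* and the sample values are illustrative only, not
part of this patch.

	#include <stdio.h>

	/* x86 page-fault error-code bits (architecturally defined). */
	#define PF_PROT   0x01  /* protection violation, page present */
	#define PF_WRITE  0x02  /* access was a write                 */
	#define PF_USER   0x04  /* access from user mode              */
	#define PF_RSVD   0x08  /* reserved bit set in a paging entry */
	#define PF_INSTR  0x10  /* access was an instruction fetch    */

	/* Mock PTE flags standing in for the live page-table entry. */
	#define PTE_PRESENT 0x1
	#define PTE_WRITE   0x2
	#define PTE_NX      0x4

	/* Return 1 iff the current entry already permits the faulting
	 * access, i.e. the fault is explained by a stale TLB entry. */
	static int is_spurious(unsigned long error_code, unsigned long pte)
	{
		if (error_code & (PF_RSVD | PF_USER))
			return 0;               /* never spurious      */
		if (!(pte & PTE_PRESENT))
			return 0;               /* page really absent  */
		if ((error_code & PF_WRITE) && !(pte & PTE_WRITE))
			return 0;               /* write really denied */
		if ((error_code & PF_INSTR) && (pte & PTE_NX))
			return 0;               /* fetch really denied */
		return 1;                       /* stale TLB entry     */
	}

	int main(void)
	{
		/* Kernel write fault, PTE already upgraded R/O -> R/W: 1. */
		printf("%d\n", is_spurious(PF_WRITE | PF_PROT,
					   PTE_PRESENT | PTE_WRITE));
		/* Same fault, but the PTE is still read-only: 0. */
		printf("%d\n", is_spurious(PF_WRITE | PF_PROT, PTE_PRESENT));
		return 0;
	}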

index 308c0bbe8ab72cbdff2625124035b44dfeb7afb0..16a0155ecbc0ad79932e2299ef2ce37032ad0fe8 100644
@@ -273,6 +273,49 @@ static void dump_fault_path(unsigned long address)
 }
 #endif
 
+static int spurious_fault(struct pt_regs *regs,
+                         unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+#ifdef CONFIG_XEN
+       /* Faults in hypervisor area are never spurious. */
+       if (address >= HYPERVISOR_VIRT_START)
+               return 0;
+#endif
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & 0x0c)
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+       if ((error_code & 0x02) && !pte_write(*pte))
+               return 0;
+#ifdef CONFIG_X86_PAE
+       if ((error_code & 0x10) && (pte_val(*pte) & _PAGE_NX))
+               return 0;
+#endif
+
+       return 1;
+}
 
 /*
  * This routine handles page faults.  It determines the address,
@@ -327,8 +370,16 @@ fastcall void __kprobes do_page_fault(struct pt_regs *regs,
         * protection error (error_code & 1) == 0.
         */
        if (unlikely(address >= TASK_SIZE)) { 
+#ifdef CONFIG_XEN
+               /* Faults in hypervisor area can never be patched up. */
+               if (address >= HYPERVISOR_VIRT_START)
+                       goto bad_area_nosemaphore;
+#endif
                if (!(error_code & 5))
                        goto vmalloc_fault;
+               /* Can take a spurious fault if mapping changes R/O -> R/W. */
+               if (spurious_fault(regs, address, error_code))
+                       return;
                /* 
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
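
For readers comparing the two files: the i386 hunk above tests raw
hex masks where the x86_64 hunk below uses its existing PF_* error-code
constants. Under the bit definitions sketched earlier, the
correspondence is:

	/* i386 spelling         x86_64 spelling                  */
	error_code & 0x0c;    /* error_code & (PF_RSVD|PF_USER)   */
	error_code & 0x02;    /* error_code & PF_WRITE            */
	error_code & 0x10;    /* error_code & PF_INSTR            */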
index dfd2f2df0fdf787d4f4abf1fded10dfd35bef5c2..963446013fdcf35f1840ca00e1c55f46f5a5b815 100644
@@ -307,6 +307,49 @@ int exception_trace = 1;
 #define MEM_LOG(_f, _a...) ((void)0)
 #endif
 
+static int spurious_fault(struct pt_regs *regs,
+                         unsigned long address,
+                         unsigned long error_code)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+#ifdef CONFIG_XEN
+       /* Faults in hypervisor area are never spurious. */
+       if ((address >= HYPERVISOR_VIRT_START) &&
+           (address < HYPERVISOR_VIRT_END))
+               return 0;
+#endif
+
+       /* Reserved-bit violation or user access to kernel space? */
+       if (error_code & (PF_RSVD|PF_USER))
+               return 0;
+
+       pgd = init_mm.pgd + pgd_index(address);
+       if (!pgd_present(*pgd))
+               return 0;
+
+       pud = pud_offset(pgd, address);
+       if (!pud_present(*pud))
+               return 0;
+
+       pmd = pmd_offset(pud, address);
+       if (!pmd_present(*pmd))
+               return 0;
+
+       pte = pte_offset_kernel(pmd, address);
+       if (!pte_present(*pte))
+               return 0;
+       if ((error_code & PF_WRITE) && !pte_write(*pte))
+               return 0;
+       if ((error_code & PF_INSTR) && (pte_val(*pte) & _PAGE_NX))
+               return 0;
+
+       return 1;
+}
+
 /*
  * This routine handles page faults.  It determines the address,
  * and the problem, and then passes it off to one of the appropriate
@@ -361,16 +404,19 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
         */
        if (unlikely(address >= TASK_SIZE64)) {
                /*
-                * Must check for the entire kernel range here: with writable
-                * page tables the hypervisor may temporarily clear PMD
-                * entries.
+                * Don't check for the module range here: its PML4
+                * is always initialized because it's shared with the main
+                * kernel text. Only vmalloc may need PML4 syncups.
                 */
                if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) &&
-                   address >= PAGE_OFFSET) {
+                   (address >= VMALLOC_START && address < VMALLOC_END)) {
                        if (vmalloc_fault(address) < 0)
                                goto bad_area_nosemaphore;
                        return;
                }
+               /* Can take a spurious fault if mapping changes R/O -> R/W. */
+               if (spurious_fault(regs, address, error_code))
+                       return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock.
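
Why a Xen guest sees such faults at all: the hypervisor interface lets
a guest update a PTE without any TLB shootdown, so an upgrade from
read-only to writable can deliberately skip the cross-CPU flush and
let each CPU with a stale translation fault once instead. A hedged
sketch follows; the helper name make_page_writable_lazily and the
include path are illustrative, though HYPERVISOR_update_va_mapping and
the UVMF_NONE flag do exist in this sparse tree.

	#include <asm/hypervisor.h>     /* assumed sparse-tree include */

	static void make_page_writable_lazily(unsigned long va, pte_t pte)
	{
		/* UVMF_NONE: install the new (writable) PTE but flush no
		 * TLBs. A CPU still caching the old read-only entry may
		 * raise one write fault, which spurious_fault() above now
		 * absorbs, so the guest avoids a shootdown per upgrade. */
		HYPERVISOR_update_va_mapping(va, pte, UVMF_NONE);
	}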