From bd1cd81d648447eafd80f5e49cc568e35b5985dd Mon Sep 17 00:00:00 2001
From: Keir Fraser
Date: Wed, 5 Nov 2008 10:57:21 +0000
Subject: [PATCH] x86: PV support for hugepages

Hugepage support must be enabled via the hypervisor command line
option "allowhugepage".

There is currently no support in the tools for saving/restoring/migrating
guests who use hugepages.

Signed-off-by: Dave McCracken
---
 xen/arch/x86/mm.c                 | 87 ++++++++++++++++++++++++++-----
 xen/arch/x86/traps.c              | 10 ++--
 xen/include/asm-x86/mm.h          |  1 +
 xen/include/asm-x86/x86_32/page.h |  2 +-
 xen/include/asm-x86/x86_64/page.h |  2 +-
 5 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index f05ca57586..6eceabe68a 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -160,6 +160,9 @@ unsigned long total_pages;
 
 #define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)
 
+int opt_allow_hugepage;
+boolean_param("allowhugepage", opt_allow_hugepage);
+
 #define l1_disallow_mask(d)                                     \
     ((d != dom_io) &&                                           \
      (rangeset_is_empty((d)->iomem_caps) &&                     \
@@ -586,6 +589,28 @@ static int get_page_and_type_from_pagenr(unsigned long page_nr,
     return rc;
 }
 
+static int get_data_page(
+    struct page_info *page, struct domain *d, int writeable)
+{
+    int rc;
+
+    if ( writeable )
+        rc = get_page_and_type(page, d, PGT_writable_page);
+    else
+        rc = get_page(page, d);
+
+    return rc;
+}
+
+static void put_data_page(
+    struct page_info *page, int writeable)
+{
+    if ( writeable )
+        put_page_and_type(page);
+    else
+        put_page(page);
+}
+
 /*
  * We allow root tables to map each other (a.k.a. linear page tables). It
  * needs some special care with reference counts and access permissions:
@@ -700,10 +725,9 @@ get_page_from_l1e(
      * contribute to writeable mapping refcounts. (This allows the
      * qemu-dm helper process in dom0 to map the domain's memory without
      * messing up the count of "real" writable mappings.) */
-    okay = (((l1f & _PAGE_RW) &&
-             !(unlikely(paging_mode_external(d) && (d != curr->domain))))
-            ? get_page_and_type(page, d, PGT_writable_page)
-            : get_page(page, d));
+    okay = get_data_page(
+        page, d,
+        (l1f & _PAGE_RW) && !(paging_mode_external(d) && (d != curr->domain)));
     if ( !okay )
     {
         MEM_LOG("Error getting mfn %lx (pfn %lx) from L1 entry %" PRIpte
@@ -751,6 +775,7 @@ static int
 get_page_from_l2e(
     l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
 {
+    unsigned long mfn = l2e_get_pfn(l2e);
     int rc;
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
@@ -762,10 +787,37 @@ get_page_from_l2e(
         return -EINVAL;
     }
 
-    rc = get_page_and_type_from_pagenr(
-        l2e_get_pfn(l2e), PGT_l1_page_table, d, 0, 0);
-    if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
-        rc = 0;
+    if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
+    {
+        rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, 0, 0);
+        if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
+            rc = 0;
+    }
+    else if ( !opt_allow_hugepage || (mfn & (L1_PAGETABLE_ENTRIES-1)) )
+    {
+        rc = -EINVAL;
+    }
+    else
+    {
+        unsigned long m = mfn;
+        int writeable = !!(l2e_get_flags(l2e) & _PAGE_RW);
+
+        do {
+            rc = get_data_page(mfn_to_page(m), d, writeable);
+            if ( unlikely(!rc) )
+            {
+                while ( m-- > mfn )
+                    put_data_page(mfn_to_page(m), writeable);
+                return -EINVAL;
+            }
+        } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+
+#ifdef __x86_64__
+        map_pages_to_xen(
+            (unsigned long)mfn_to_virt(mfn), mfn, L1_PAGETABLE_ENTRIES,
+            PAGE_HYPERVISOR | l2e_get_flags(l2e));
+#endif
+    }
 
     return rc;
 }
@@ -954,13 +1006,24 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *d)
  */
 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
 {
-    if ( (l2e_get_flags(l2e) & _PAGE_PRESENT) &&
-         (l2e_get_pfn(l2e) != pfn) )
+    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
+        return 1;
+
+    if ( l2e_get_flags(l2e) & _PAGE_PSE )
+    {
+        unsigned long mfn = l2e_get_pfn(l2e), m = mfn;
+        int writeable = l2e_get_flags(l2e) & _PAGE_RW;
+        ASSERT(opt_allow_hugepage && !(mfn & (L1_PAGETABLE_ENTRIES-1)));
+        do {
+            put_data_page(mfn_to_page(m), writeable);
+        } while ( m++ < (mfn + (L1_PAGETABLE_ENTRIES-1)) );
+    }
+    else
     {
         put_page_and_type(l2e_get_page(l2e));
-        return 0;
     }
-    return 1;
+
+    return 0;
 }
 
 static int __put_page_type(struct page_info *, int preemptible);
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index c4e9d30597..d6e77549dd 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -723,7 +723,8 @@ static void pv_cpuid(struct cpu_user_regs *regs)
     {
         /* Modify Feature Information. */
         __clear_bit(X86_FEATURE_VME, &d);
-        __clear_bit(X86_FEATURE_PSE, &d);
+        if ( !opt_allow_hugepage )
+            __clear_bit(X86_FEATURE_PSE, &d);
         __clear_bit(X86_FEATURE_PGE, &d);
         __clear_bit(X86_FEATURE_MCE, &d);
         __clear_bit(X86_FEATURE_MCA, &d);
@@ -2003,9 +2004,12 @@ static int emulate_privileged_op(struct cpu_user_regs *regs)
         case 4: /* Read CR4 */
             /*
              * Guests can read CR4 to see what features Xen has enabled. We
-             * therefore lie about PGE & PSE as they are unavailable to guests.
+             * therefore lie about PGE as it is unavailable to guests.
+             * Also disallow PSE if hugepages are not enabled.
              */
-            *reg = read_cr4() & ~(X86_CR4_PGE|X86_CR4_PSE);
+            *reg = read_cr4() & ~X86_CR4_PGE;
+            if ( !opt_allow_hugepage )
+                *reg &= ~X86_CR4_PSE;
             break;
 
         default:
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 005b6603e2..d017c4cb56 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -263,6 +263,7 @@ pae_copy_root(struct vcpu *v, l3_pgentry_t *l3tab);
 
 int check_descriptor(const struct domain *, struct desc_struct *d);
 
+extern int opt_allow_hugepage;
 
 /******************************************************************************
  * With shadow pagetables, the different kinds of address start
diff --git a/xen/include/asm-x86/x86_32/page.h b/xen/include/asm-x86/x86_32/page.h
index 16659a1ae3..aef51f51af 100644
--- a/xen/include/asm-x86/x86_32/page.h
+++ b/xen/include/asm-x86/x86_32/page.h
@@ -112,7 +112,7 @@ extern unsigned int PAGE_HYPERVISOR_NOCACHE;
 #define BASE_DISALLOW_MASK (0xFFFFF198U & ~_PAGE_NX)
 
 #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
 #define L3_DISALLOW_MASK 0xFFFFF1FEU /* must-be-zero */
 
 #endif /* __X86_32_PAGE_H__ */
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 948cd656f0..ac44a9a1c1 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -115,7 +115,7 @@ typedef l4_pgentry_t root_pgentry_t;
 #define BASE_DISALLOW_MASK (0xFF800198U & ~_PAGE_NX)
 
 #define L1_DISALLOW_MASK (BASE_DISALLOW_MASK | _PAGE_GNTTAB)
-#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK)
+#define L2_DISALLOW_MASK (BASE_DISALLOW_MASK & ~_PAGE_PSE)
 #define L3_DISALLOW_MASK (BASE_DISALLOW_MASK)
 #define L4_DISALLOW_MASK (BASE_DISALLOW_MASK)
 
-- 
2.30.2