From 9ad9a1609fbe407bde40498428ec465a6bd977a8 Mon Sep 17 00:00:00 2001
From: Keir Fraser <keir.fraser@citrix.com>
Date: Mon, 5 Jan 2009 10:44:39 +0000
Subject: [PATCH] PoD memory 5/9: emergency scan

Implement "emergency scan" for zero pages, to deal with start-of-day
page scrubbers.

If the cache is running out, scan through memory looking for "zero
pages" that we can reclaim for the cache.  This is necessary for
operating systems which have a start-of-day page scrubber which runs
before the balloon driver can balloon down to the target.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
---
 xen/arch/x86/mm/p2m.c         | 296 ++++++++++++++++++++++++++++++++++
 xen/common/grant_table.c      |  29 ++++
 xen/include/asm-x86/p2m.h     |   3 +
 xen/include/xen/grant_table.h |   4 +
 4 files changed, 332 insertions(+)

diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index dbb62237ad..685424aedb 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -496,6 +496,289 @@ p2m_pod_dump_data(struct domain *d)
            p2md->pod.entry_count, p2md->pod.count);
 }
 
+#define superpage_aligned(_x)  (((_x)&((1<<9)-1))==0)
+
+/* Try to reclaim the 2MB superpage at gfn for the PoD cache if it is all
+ * zeroes.  As written, ret is never changed, so this always returns 0.
+ * NOTE(review): the three 512-entry arrays below are large for a stack.
+ * Must be called w/ p2m lock held, page_alloc lock not held */
+static int
+p2m_pod_zero_check_superpage(struct domain *d, unsigned long gfn)
+{
+    mfn_t mfns[1<<9];
+    p2m_type_t types[1<<9];
+    unsigned long * map[1<<9] = { NULL };
+    int ret=0, reset = 0, reset_max = 0;
+    int i, j;
+
+    if ( !superpage_aligned(gfn) )
+        goto out;
+
+    /* Look up the mfns, checking that they share one type and form an
+     * aligned, contiguous run, and mapping each page as we go. */
+    for ( i=0; i<(1<<9); i++ )
+    {
+        mfns[i] = gfn_to_mfn_query(d, gfn + i, types + i);
+
+        /* Conditions that must be met for superpage-superpage:
+         * + All gfns are ram types
+         * + All gfns have the same type
+         * + None of the mfns are used as pagetables
+         * + The first mfn is 2-meg aligned
+         * + All the other mfns are in sequence
+         */
+        if ( p2m_is_ram(types[i])
+             && types[i] == types[0]
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 )
+             && ( ( i == 0 && superpage_aligned(mfn_x(mfns[0])) )
+                  || ( i != 0 && mfn_x(mfns[i]) == mfn_x(mfns[0]) + i ) ) )
+            map[i] = map_domain_page(mfn_x(mfns[i]));
+        else
+            goto out_unmap;
+    }
+
+    /* Quick zero-check: look only at the first 16 words of each page. */
+    for ( i=0; i<(1<<9); i++ )
+    {
+        for ( j=0; j<16; j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        if ( j < 16 )
+            goto out_unmap;
+    }
+
+    /* Mark the range PoD; out_reset restores it if a later check fails. */
+    reset_max = 1<<9;
+    set_p2m_entry(d, gfn,
+                  _mfn(POPULATE_ON_DEMAND_MFN), 9,
+                  p2m_populate_on_demand);
+
+    if ( (mfn_to_page(mfns[0])->u.inuse.type_info & PGT_count_mask) != 0 )
+    {
+        reset = 1;
+        goto out_reset;
+    }
+
+    /* Timing here is important.  We need to make sure not to reclaim
+     * a page which has been grant-mapped to another domain.  But we
+     * can't grab the grant table lock, because we may be invoked from
+     * the grant table code!  So we first remove the page from the
+     * p2m, then check to see if the gpfn has been granted.  Once this
+     * gpfn is marked PoD, any future gfn_to_mfn() call will block
+     * waiting for the p2m lock.  If we find that it has been granted, we
+     * simply restore the old value.
+     */
+    if ( gnttab_is_granted(d, gfn, 9) )
+    {
+        printk("gfn contains grant table %lx\n", gfn);
+        reset = 1;
+        goto out_reset;
+    }
+
+    /* Finally, do a full zero-check */
+    for ( i=0; i < (1<<9); i++ )
+    {
+        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+            if( *(map[i]+j) != 0 )
+            {
+                reset = 1;
+                break;
+            }
+
+        if ( reset )
+            goto out_reset;
+    }
+
+    /* Finally!  We've passed all the checks, and can add the mfn superpage
+     * back on the PoD cache, and account for the new p2m PoD entries */
+    p2m_pod_cache_add(d, mfn_to_page(mfns[0]), 9);
+    d->arch.p2m->pod.entry_count += (1<<9);
+
+out_reset:
+    if ( reset )
+    {
+        if (reset_max == (1<<9) )
+            set_p2m_entry(d, gfn, mfns[0], 9, types[0]);
+        else /* not reached as written: reset implies reset_max == 1<<9 */
+            for ( i=0; i<reset_max; i++)
+                set_p2m_entry(d, gfn + i, mfns[i], 0, types[i]);
+    }
+out_unmap:
+    for ( i=0; i<(1<<9); i++ )
+        if ( map[i] )
+            unmap_domain_page(map[i]);
+out:
+    return ret;
+}
+
+/* Check each of the count gfns in gfns[]; any page found to be all zeroes
+ * (with a zero type count and no grant) is moved to the PoD cache and its
+ * p2m entry left populate-on-demand.  Every mapping made here is unmapped
+ * before returning.  NOTE(review): presumably needs the p2m lock held,
+ * like p2m_pod_zero_check_superpage() -- confirm against callers. */
+static void
+p2m_pod_zero_check(struct domain *d, unsigned long *gfns, int count)
+{
+    mfn_t mfns[count];
+    p2m_type_t types[count];
+    unsigned long * map[count];
+    int i, j;
+
+    /* First, get the gfn list, translate to mfns, and map the pages. */
+    for ( i=0; i<count; i++ )
+    {
+        mfns[i] = gfn_to_mfn_query(d, gfns[i], types + i);
+        /* If this is ram, and not a pagetable, map it; otherwise,
+         * skip. */
+        if ( p2m_is_ram(types[i])
+             && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 ) )
+            map[i] = map_domain_page(mfn_x(mfns[i]));
+        else
+            map[i] = NULL;
+    }
+
+    /* Then, go through and check for pages that look zeroed, replacing
+     * their p2m entries with populate-on-demand entries. */
+    for ( i=0; i<count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        /* Quick zero-check */
+        for ( j=0; j<16; j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        if ( j < 16 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+            continue;
+        }
+
+        /* Try to remove the page, restoring old mapping if it fails. */
+        set_p2m_entry(d, gfns[i],
+                      _mfn(POPULATE_ON_DEMAND_MFN), 0,
+                      p2m_populate_on_demand);
+
+        if ( (mfn_to_page(mfns[i])->u.inuse.type_info & PGT_count_mask) != 0 )
+        {
+            unmap_domain_page(map[i]);
+            map[i] = NULL;
+            set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
+        }
+    }
+
+    /* Now check each page for real */
+    for ( i=0; i < count; i++ )
+    {
+        if(!map[i])
+            continue;
+
+        for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+            if( *(map[i]+j) != 0 )
+                break;
+
+        /* See comment in p2m_pod_zero_check_superpage() re gnttab
+         * check timing.  */
+        if ( j < PAGE_SIZE/sizeof(*map[i])
+             || gnttab_is_granted(d, gfns[i], 0) )
+        {
+            set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
+        }
+        else
+        {
+            /* Add to cache, and account for the new p2m PoD entry */
+            p2m_pod_cache_add(d, mfn_to_page(mfns[i]), 0);
+            d->arch.p2m->pod.entry_count++;
+        }
+
+        /* Unmap on both paths so that no mapping is leaked. */
+        unmap_domain_page(map[i]);
+        map[i] = NULL;
+    }
+}
+
+#define POD_SWEEP_LIMIT 1024
+/* Sweep downwards a superpage at a time from pod.reclaim_super, putting
+ * any zeroed superpages found back on the PoD cache. */
+static void
+p2m_pod_emergency_sweep_super(struct domain *d)
+{
+    struct p2m_domain *p2md = d->arch.p2m;
+    unsigned long i, start, limit;
+
+    if ( p2md->pod.reclaim_super == 0 )
+    {
+        /* Restart one superpage below max_guest.  Guard the subtraction:
+         * with max_guest < 1<<9 it would wrap to a huge bogus gfn. */
+        p2md->pod.reclaim_super = (p2md->pod.max_guest>>9)<<9;
+        if ( p2md->pod.reclaim_super >= (1<<9) )
+            p2md->pod.reclaim_super -= (1<<9);
+    }
+    start = p2md->pod.reclaim_super;
+    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+    for ( i=p2md->pod.reclaim_super ; i > 0 ; i-=(1<<9) )
+    {
+        p2m_pod_zero_check_superpage(d, i);
+        /* Stop once past our limit if we have found *something*.  This is
+         * zero-sum: the cache grows only by re-increasing our 'debt', so
+         * with the p2m lock held (entry_count - count) stays the same. */
+        if ( !list_empty(&p2md->pod.super) &&  i < limit )
+            break;
+    }
+    p2md->pod.reclaim_super = i ? i - (1<<9) : 0;
+}
+
+#define POD_SWEEP_STRIDE  16
+/* Walk gfns downwards from pod.reclaim_single (restarting at pod.max_guest
+ * when that cursor is 0), handing RAM gfns to p2m_pod_zero_check() in
+ * batches of up to POD_SWEEP_STRIDE. */
+static void
+p2m_pod_emergency_sweep(struct domain *d)
+{
+    struct p2m_domain *pod_p2m = d->arch.p2m;
+    unsigned long batch[POD_SWEEP_STRIDE];
+    unsigned long gfn, nr_batched = 0, stop_below;
+    p2m_type_t type;
+
+    if ( pod_p2m->pod.reclaim_single == 0 )
+        pod_p2m->pod.reclaim_single = pod_p2m->pod.max_guest;
+
+    gfn = pod_p2m->pod.reclaim_single;
+    stop_below = (gfn > POD_SWEEP_LIMIT) ? (gfn - POD_SWEEP_LIMIT) : 0;
+
+    /* FIXME: Figure out how to avoid superpages */
+    while ( gfn > 0 )
+    {
+        gfn_to_mfn_query(d, gfn, &type);
+        if ( p2m_is_ram(type) )
+        {
+            batch[nr_batched++] = gfn;
+            BUG_ON(nr_batched > POD_SWEEP_STRIDE);
+            if ( nr_batched == POD_SWEEP_STRIDE )
+            {
+                p2m_pod_zero_check(d, batch, nr_batched);
+                nr_batched = 0;
+            }
+        }
+        /* Stop once below the limit, provided the cache holds *something*.
+         * This is zero-sum: the cache only grows by re-increasing our
+         * 'debt'; under the p2m lock (entry_count - count) is constant. */
+        if ( pod_p2m->pod.count > 0 && gfn < stop_below )
+            break;
+        gfn--;
+    }
+
+    /* Flush a final, partial batch. */
+    if ( nr_batched )
+        p2m_pod_zero_check(d, batch, nr_batched);
+
+    pod_p2m->pod.reclaim_single = gfn ? gfn - 1 : gfn;
+}
+
 static int
 p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
                         mfn_t table_mfn,
@@ -523,6 +806,19 @@ p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
         return 0;
     }
 
+    /* If we're low, start a sweep */
+    if ( order == 9 && list_empty(&p2md->pod.super) )
+        p2m_pod_emergency_sweep_super(d);
+
+    if ( list_empty(&p2md->pod.single) &&
+         ( ( order == 0 )
+           || (order == 9 && list_empty(&p2md->pod.super) ) ) )
+        p2m_pod_emergency_sweep(d);
+
+    /* Keep track of the highest gfn demand-populated by a guest fault */
+    if ( q == p2m_guest && gfn > p2md->pod.max_guest )
+        p2md->pod.max_guest = gfn;
+
     spin_lock(&d->page_alloc_lock);
 
     if ( p2md->pod.count == 0 )
diff --git a/xen/common/grant_table.c b/xen/common/grant_table.c
index afd03fec09..f7d30ddc19 100644
--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -111,6 +111,33 @@ static unsigned inline int max_nr_maptrack_frames(void)
 #define active_entry(t, e) \
     ((t)->active[(e)/ACGNT_PER_PAGE][(e)%ACGNT_PER_PAGE])
 
+/* The p2m emergency sweep code should not reclaim a frame that is currently
+ * grant mapped by another domain.  Checking every other domain's grant
+ * maps would be impractical, so instead we scan this domain's own active
+ * grant entries for the gfn.  Since this may be called as a result of a
+ * grant table op, we can't grab the grant table lock.
+ *
+ * Returns 1 if some active entry's gfn lies in the same 2^order-aligned
+ * block as gfn, 0 otherwise. */
+int
+gnttab_is_granted(struct domain *d, xen_pfn_t gfn, int order)
+{
+    struct grant_table *gt = d->grant_table;
+    const xen_pfn_t block = gfn >> order;
+    int ref;
+
+    /* Compare against the active grant entries, so that pinned
+     * (== currently mapped) entries don't disappear under our feet.
+     * Note that the entry's pin count itself is not consulted here. */
+    for ( ref = 0; ref < nr_grant_entries(gt); ref++ )
+    {
+        if ( (active_entry(gt, ref).gfn >> order) == block )
+            return 1;
+    }
+
+    return 0;
+}
+
 static inline int
 __get_maptrack_handle(
     struct grant_table *t)
@@ -317,6 +344,7 @@ __gnttab_map_grant_ref(
         if ( !act->pin )
         {
             act->domid = scombo.shorts.domid;
+            act->gfn = sha->frame;
             act->frame = gmfn_to_mfn(rd, sha->frame);
         }
     }
@@ -1335,6 +1363,7 @@ __acquire_grant_for_copy(
         if ( !act->pin )
         {
             act->domid = scombo.shorts.domid;
+            act->gfn = sha->frame;
             act->frame = gmfn_to_mfn(rd, sha->frame);
         }
     }
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index e701a7aecb..2d4fd382f0 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -152,6 +152,9 @@ struct p2m_domain {
                          single;       /* Non-super lists                   */
         int              count,        /* # of pages in cache lists         */
                          entry_count;  /* # of pages in p2m marked pod      */
+        unsigned         reclaim_super; /* Last gpfn of a scan */
+        unsigned         reclaim_single; /* Last gpfn of a scan */
+        unsigned         max_guest;    /* gpfn of max guest demand-populate */
     } pod;
 };
 
diff --git a/xen/include/xen/grant_table.h b/xen/include/xen/grant_table.h
index 0164668e2c..85a7c17592 100644
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -32,6 +32,7 @@
 struct active_grant_entry {
     u32           pin;    /* Reference count information.  */
     domid_t       domid;  /* Domain being granted access.  */
+    unsigned long gfn;    /* Guest's idea of the frame being granted. */
     unsigned long frame;  /* Frame being granted.          */
 };
 
@@ -146,4 +147,7 @@ nr_active_grant_frames(struct grant_table *gt)
     return num_act_frames_from_sha_frames(nr_grant_frames(gt));
 }
 
+int
+gnttab_is_granted(struct domain *d, xen_pfn_t gfn, int order);
+
 #endif /* __XEN_GRANT_TABLE_H__ */
-- 
2.30.2