p2md->pod.entry_count, p2md->pod.count);
}
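+
+/* A gfn or mfn is superpage-aligned if its low 9 bits are clear
+ * (512 contiguous 4k pages, i.e. one 2MB superpage). */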
+#define superpage_aligned(_x) (((_x)&((1<<9)-1))==0)
+
+/* Check whether the 2MB range of gfns starting at gfn is backed by a
+ * contiguous, superpage-aligned, all-zero range of mfns; if so, reclaim
+ * the superpage into the PoD cache and mark the gfn range
+ * populate-on-demand.
+ *
+ * Must be called w/ p2m lock held, page_alloc lock not held */
+static int
+p2m_pod_zero_check_superpage(struct domain *d, unsigned long gfn)
+{
+ mfn_t mfns[1<<9];
+ p2m_type_t types[1<<9];
+ unsigned long * map[1<<9] = { NULL };
+ int ret=0, reset = 0, reset_max = 0;
+ int i, j;
+
+ if ( !superpage_aligned(gfn) )
+ goto out;
+
+ /* Look up the mfns, checking that they are contiguous and aligned,
+ * and mapping them. */
+ for ( i=0; i<(1<<9); i++ )
+ {
+ mfns[i] = gfn_to_mfn_query(d, gfn + i, types + i);
+
+ /* Conditions that must be met for the gfn superpage to be backed by
+ * an mfn superpage:
+ * + All gfns are ram types
+ * + All gfns have the same type
+ * + None of the mfns are used as pagetables
+ * + The first mfn is 2-meg aligned
+ * + All the other mfns are in sequence
+ */
+ if ( p2m_is_ram(types[i])
+ && types[i] == types[0]
+ && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 )
+ && ( ( i == 0 && superpage_aligned(mfn_x(mfns[0])) )
+ || ( i != 0 && mfn_x(mfns[i]) == mfn_x(mfns[0]) + i ) ) )
+ map[i] = map_domain_page(mfn_x(mfns[i]));
+ else
+ goto out_unmap;
+ }
+
+ /* Now, do a quick check to see if it may be zero before unmapping. */
+ for ( i=0; i<(1<<9); i++ )
+ {
+ /* Quick zero-check */
+ for ( j=0; j<16; j++ )
+ if ( *(map[i]+j) != 0 )
+ break;
+
+ if ( j < 16 )
+ goto out_unmap;
+ }
+
+ /* Try to remove the page, restoring old mapping if it fails. */
+ reset_max = 1<<9;
+ set_p2m_entry(d, gfn,
+ _mfn(POPULATE_ON_DEMAND_MFN), 9,
+ p2m_populate_on_demand);
+
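+ /* If the page still has a nonzero type count (e.g. it is mapped
+ * writably by another domain), we can't reclaim it; restore the entry. */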
+ if ( (mfn_to_page(mfns[0])->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ reset = 1;
+ goto out_reset;
+ }
+
+ /* Timing here is important. We need to make sure not to reclaim
+ * a page which has been grant-mapped to another domain. But we
+ * can't grab the grant table lock, because we may be invoked from
+ * the grant table code! So we first remove the page from the
+ * p2m, then check to see if the gpfn has been granted. Once this
+ * gpfn is marked PoD, any future gfn_to_mfn() call will block
+ * waiting for the p2m lock. If we find that it has been granted, we
+ * simply restore the old value.
+ */
+ if ( gnttab_is_granted(d, gfn, 9) )
+ {
+ printk("gfn contains grant table %lx\n", gfn);
+ reset = 1;
+ goto out_reset;
+ }
+
+ /* Finally, do a full zero-check */
+ for ( i=0; i < (1<<9); i++ )
+ {
+ for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+ if ( *(map[i]+j) != 0 )
+ {
+ reset = 1;
+ break;
+ }
+
+ if ( reset )
+ goto out_reset;
+ }
+
+ /* Finally! We've passed all the checks, and can add the mfn superpage
+ * back to the PoD cache, and account for the new p2m PoD entries */
+ p2m_pod_cache_add(d, mfn_to_page(mfns[0]), 9);
+ d->arch.p2m->pod.entry_count += (1<<9);
+
+out_reset:
+ if ( reset )
+ {
+ if ( reset_max == (1<<9) )
+ set_p2m_entry(d, gfn, mfns[0], 9, types[0]);
+ else
+ for ( i=0; i<reset_max; i++ )
+ set_p2m_entry(d, gfn + i, mfns[i], 0, types[i]);
+ }
+
+out_unmap:
+ for ( i=0; i<(1<<9); i++ )
+ if ( map[i] )
+ unmap_domain_page(map[i]);
+out:
+ return ret;
+}
+
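+/* Check a batch of individual gfns for all-zero pages, moving any that
+ * qualify into the PoD cache.  Assumed to be called with the p2m lock
+ * held, like the superpage variant above. */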
+static void
+p2m_pod_zero_check(struct domain *d, unsigned long *gfns, int count)
+{
+ mfn_t mfns[count];
+ p2m_type_t types[count];
+ unsigned long * map[count];
+
+ int i, j;
+
+ /* First, get the gfn list, translate to mfns, and map the pages. */
+ for ( i=0; i<count; i++ )
+ {
+ mfns[i] = gfn_to_mfn_query(d, gfns[i], types + i);
+ /* If this is ram, and not a pagetable, map it; otherwise,
+ * skip. */
+ if ( p2m_is_ram(types[i])
+ && ( (mfn_to_page(mfns[i])->count_info & PGC_page_table) == 0 ) )
+ map[i] = map_domain_page(mfn_x(mfns[i]));
+ else
+ map[i] = NULL;
+ }
+
+ /* Then, go through and check for zeroed pages, replacing the p2m entry
+ * with a PoD entry for those that look zero, so they can't be modified
+ * while we do the full check. */
+ for ( i=0; i<count; i++ )
+ {
+ if ( !map[i] )
+ continue;
+
+ /* Quick zero-check */
+ for ( j=0; j<16; j++ )
+ if ( *(map[i]+j) != 0 )
+ break;
+
+ if ( j < 16 )
+ {
+ unmap_domain_page(map[i]);
+ map[i] = NULL;
+ continue;
+ }
+
+ /* Try to remove the page, restoring old mapping if it fails. */
+ set_p2m_entry(d, gfns[i],
+ _mfn(POPULATE_ON_DEMAND_MFN), 0,
+ p2m_populate_on_demand);
+
+ if ( (mfn_to_page(mfns[i])->u.inuse.type_info & PGT_count_mask) != 0 )
+ {
+ unmap_domain_page(map[i]);
+ map[i] = NULL;
+
+ set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
+
+ continue;
+ }
+ }
+
+ /* Now check each page for real */
+ for ( i=0; i < count; i++ )
+ {
+ if ( !map[i] )
+ continue;
+
+ for ( j=0; j<PAGE_SIZE/sizeof(*map[i]); j++ )
+ if ( *(map[i]+j) != 0 )
+ break;
+
+ /* See comment in p2m_pod_zero_check_superpage() re gnttab
+ * check timing. */
+ if ( j < PAGE_SIZE/sizeof(*map[i])
+ || gnttab_is_granted(d, gfns[i], 0) )
+ {
+ /* Restore the old entry.  Unmap here as well, since the 'continue'
+ * skips the unmap at the bottom of the loop. */
+ unmap_domain_page(map[i]);
+ map[i] = NULL;
+ set_p2m_entry(d, gfns[i], mfns[i], 0, types[i]);
+ continue;
+ }
+ else
+ {
+ /* Add to cache, and account for the new p2m PoD entry */
+ p2m_pod_cache_add(d, mfn_to_page(mfns[i]), 0);
+ d->arch.p2m->pod.entry_count++;
+ }
+
+ unmap_domain_page(map[i]);
+ map[i] = NULL;
+ }
+}
+
+#define POD_SWEEP_LIMIT 1024
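+/* Sweep backwards from the last reclaim point looking for zeroed
+ * superpages to put back in the PoD cache, stopping once we are past
+ * POD_SWEEP_LIMIT gfns and the superpage cache is no longer empty. */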
+static void
+p2m_pod_emergency_sweep_super(struct domain *d)
+{
+ struct p2m_domain *p2md = d->arch.p2m;
+ unsigned long i, start, limit;
+
+ if ( p2md->pod.reclaim_super == 0 )
+ {
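+ /* Start from the highest superpage-aligned gfn below the highest
+ * gfn demand-populated by the guest so far */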
+ p2md->pod.reclaim_super = (p2md->pod.max_guest>>9)<<9;
+ p2md->pod.reclaim_super -= (1<<9);
+ }
+
+ start = p2md->pod.reclaim_super;
+ limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+ for ( i=p2md->pod.reclaim_super ; i > 0 ; i-=(1<<9) )
+ {
+ p2m_pod_zero_check_superpage(d, i);
+ /* Stop if we're past our limit and we have found *something*.
+ *
+ * NB that this is a zero-sum game; we're increasing our cache size
+ * by re-increasing our 'debt'. Since we hold the p2m lock,
+ * (entry_count - count) must remain the same. */
+ if ( !list_empty(&p2md->pod.super) && i < limit )
+ break;
+ }
+
+ p2md->pod.reclaim_super = i ? i - (1<<9) : 0;
+}
+
+#define POD_SWEEP_STRIDE 16
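+/* As above, but sweep for individual zeroed pages, checking gfns in
+ * batches of POD_SWEEP_STRIDE. */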
+static void
+p2m_pod_emergency_sweep(struct domain *d)
+{
+ struct p2m_domain *p2md = d->arch.p2m;
+ unsigned long gfns[POD_SWEEP_STRIDE];
+ unsigned long i, j=0, start, limit;
+ p2m_type_t t;
+
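+ /* (Re)start the sweep from the highest gfn demand-populated by the guest */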
+ if ( p2md->pod.reclaim_single == 0 )
+ p2md->pod.reclaim_single = p2md->pod.max_guest;
+
+ start = p2md->pod.reclaim_single;
+ limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
+
+ /* FIXME: Figure out how to avoid superpages */
+ for ( i=p2md->pod.reclaim_single ; i > 0 ; i-- )
+ {
+ gfn_to_mfn_query(d, i, &t);
+ if ( p2m_is_ram(t) )
+ {
+ gfns[j] = i;
+ j++;
+ BUG_ON(j > POD_SWEEP_STRIDE);
+ if ( j == POD_SWEEP_STRIDE )
+ {
+ p2m_pod_zero_check(d, gfns, j);
+ j = 0;
+ }
+ }
+ /* Stop if we're past our limit and we have found *something*.
+ *
+ * NB that this is a zero-sum game; we're increasing our cache size
+ * by re-increasing our 'debt'. Since we hold the p2m lock,
+ * (entry_count - count) must remain the same. */
+ if ( p2md->pod.count > 0 && i < limit )
+ break;
+ }
+
+ if ( j )
+ p2m_pod_zero_check(d, gfns, j);
+
+ p2md->pod.reclaim_single = i ? i - 1 : i;
+}
+
static int
p2m_pod_demand_populate(struct domain *d, unsigned long gfn,
mfn_t table_mfn,
return 0;
}
+ /* If we're low, start a sweep */
+ if ( order == 9 && list_empty(&p2md->pod.super) )
+ p2m_pod_emergency_sweep_super(d);
+
+ if ( list_empty(&p2md->pod.single) &&
+ ( ( order == 0 )
+ || ( order == 9 && list_empty(&p2md->pod.super) ) ) )
+ p2m_pod_emergency_sweep(d);
+
+ /* Keep track of the highest gfn demand-populated by a guest fault */
+ if ( q == p2m_guest && gfn > p2md->pod.max_guest )
+ p2md->pod.max_guest = gfn;
+
spin_lock(&d->page_alloc_lock);
if ( p2md->pod.count == 0 )