bitkeeper revision 1.908 (40aa7a41_qzAxT0SBKFNAXKT6FF62g)
author iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Tue, 18 May 2004 21:04:01 +0000 (21:04 +0000)
committer iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Tue, 18 May 2004 21:04:01 +0000 (21:04 +0000)
live migrate now works on SMP

xc_linux_save/restore gain a debug verify mode (batch marker -1 resends
all pages for the receiver to compare against memory) and a to_fix
bitmap so transient network-buffer pages are retransmitted on the final
pass. Restore clears pending event channels before copying the
shared-info page. The shadow code drops the cross-CPU stall hack in
favour of consistent shadow_lock usage plus a trylock/TLB-flush loop in
shadow_fault to avoid deadlocking against the flush IPI. Xenolinux now
coalesces duplicate stop requests.

13 files changed:
tools/xc/lib/xc.h
tools/xc/lib/xc_linux_restore.c
tools/xc/lib/xc_linux_save.c
tools/xc/lib/xc_private.c
tools/xc/py/Xc.c
xen/arch/i386/smp.c
xen/arch/i386/traps.c
xen/common/domain.c
xen/common/shadow.c
xen/include/xen/shadow.h
xen/net/dev.c
xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c
xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c

tools/xc/lib/xc.h
index 2132d6e7c1ee63d6ee5763dde474dbab14ed382a..223710fad12e5116b22128b8fb959e94c54364df 100644 (file)
@@ -64,6 +64,7 @@ int xc_shadow_control(int xc_handle,
 
 #define XCFLAGS_VERBOSE 1
 #define XCFLAGS_LIVE    2
+#define XCFLAGS_DEBUG   4
 
 int xc_linux_save(int xc_handle,
                   u64 domid, 
tools/xc/lib/xc_linux_restore.c
index 1bbc575889f17b7b9a1c93aeff7ec56aef8944b2..d66e22fd0a6c8e11c7cdf0d17451be532a3b90de 100644 (file)
@@ -67,6 +67,7 @@ int xc_linux_restore(int xc_handle,
     unsigned long mfn, pfn, xpfn;
     unsigned int prev_pc, this_pc;
     int verbose = flags & XCFLAGS_VERBOSE;
+    int verify = 0; 
 
     /* Number of page frames in use by this Linux session. */
     unsigned long nr_pfns;
@@ -106,6 +107,8 @@ int xc_linux_restore(int xc_handle,
 
     int pm_handle = -1;
 
+    /* used by debug verify code */
+    unsigned long buf[PAGE_SIZE/sizeof(unsigned long)];
 
     if ( mlock(&ctxt, sizeof(ctxt) ) )
     {   
@@ -241,8 +244,17 @@ int xc_linux_restore(int xc_handle,
 
        DPRINTF("batch %d\n",j);
        
+       if (j == -1)
+       {
+           verify = 1;
+           printf("Entering page verify mode\n");
+           continue;
+       }
+
        if (j == 0) 
+       {
            break;  // our work here is done
+       }
 
        if( j > MAX_BATCH_SIZE )
        {
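
The save/restore stream is framed as a sequence of int batch markers: a positive value announces that many (pfn, type) words followed by that many page images, -1 (new in this change) flips the receiver into page-verify mode, and 0 ends the page data. A minimal sketch of that dispatch, with hypothetical names, assuming the framing just described:

    /* Classify a batch marker read from the migration stream. */
    enum batch_kind { BATCH_PAGES, BATCH_VERIFY, BATCH_END, BATCH_BAD };

    static enum batch_kind classify_batch(int j, int max_batch)
    {
        if ( j == -1 )        return BATCH_VERIFY; /* sender requests verify mode */
        if ( j == 0 )         return BATCH_END;    /* no more page batches */
        if ( j > max_batch )  return BATCH_BAD;    /* corrupt or truncated stream */
        return BATCH_PAGES;   /* j (pfn,type) words, then j raw pages */
    }
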
@@ -296,7 +308,10 @@ int xc_linux_restore(int xc_handle,
 
            mfn = pfn_to_mfn_table[pfn];
 
-            ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
+           if ( verify )
+               ppage = (unsigned long*) buf;  // debug case
+           else
+               ppage = (unsigned long*) (region_base + i*PAGE_SIZE);
 
            if ( (*readerfn)(readerst, ppage, PAGE_SIZE) )
            {
@@ -364,6 +379,24 @@ int xc_linux_restore(int xc_handle,
 
            } // end of page type switch statement
 
+           if ( verify )
+           {
+               int res = memcmp(buf, (region_base + i*PAGE_SIZE), PAGE_SIZE );
+               if (res)
+               {
+                   int v;
+                   printf("************** pfn=%x type=%x gotcs=%08lx actualcs=%08lx\n",pfn,pfn_type[pfn],csum_page(region_base + i*PAGE_SIZE),csum_page(buf));
+                   for(v=0;v<4;v++)
+                   {
+                       unsigned long * p = (unsigned long *) (region_base + i*PAGE_SIZE);
+                       if ( buf[v] != p[v] )
+                           printf("    %d: %08lx %08lx\n",
+                                  v, buf[v], p[v] );
+                   }
+
+               }
+           }
+
            if ( add_mmu_update(xc_handle, mmu,
                                (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) )
                goto out;
@@ -421,7 +454,6 @@ int xc_linux_restore(int xc_handle,
         goto out;
     }
 
-
     /* Uncanonicalise the suspend-record frame number and poke resume rec. */
     pfn = ctxt.cpu_ctxt.esi;
     if ( (pfn >= nr_pfns) || (pfn_type[pfn] != NONE) )
@@ -464,6 +496,12 @@ int xc_linux_restore(int xc_handle,
     }
     ctxt.pt_base = pfn_to_mfn_table[pfn] << PAGE_SHIFT;
 
+
+    /* clear any pending events and the selector */
+    memset( &(((shared_info_t *)shared_info)->evtchn_pending[0]),
+           0, sizeof (((shared_info_t *)shared_info)->evtchn_pending)+
+           sizeof(((shared_info_t *)shared_info)->evtchn_pending_sel) );
+
     /* Copy saved contents of shared-info page. No checking needed. */
     ppage = map_pfn_writeable(pm_handle, shared_info_frame);
     memcpy(ppage, shared_info, sizeof(shared_info_t));
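
The memset above clears both the pending-event bitmap and the selector with one call, which only works because evtchn_pending_sel sits immediately after evtchn_pending[] in shared_info_t. A compile-time guard along these lines (hypothetical, not in the tree) would pin that layout assumption down:

    #include <stddef.h>

    /* Fails to compile if the selector does not directly follow the
       pending array, which the combined memset relies on. */
    typedef char evtchn_layout_check[
        (offsetof(shared_info_t, evtchn_pending_sel) ==
         offsetof(shared_info_t, evtchn_pending) +
         sizeof(((shared_info_t *)0)->evtchn_pending)) ? 1 : -1];
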
@@ -543,13 +581,18 @@ int xc_linux_restore(int xc_handle,
     op.u.builddomain.ctxt = &ctxt;
     rc = do_dom0_op(xc_handle, &op);
 
-    DPRINTF("Everything OK!\n");
+    /* don't start the domain as we have console etc to set up */
+  
+    if( rc == 0 )
+    {
+       /* Success: print the domain id. */
+       verbose_printf("DOM=%llu\n", dom);
+       return 0;
+    }
 
- out:
-    if ( mmu != NULL )
-        free(mmu);
 
-    if ( rc != 0 )
+ out:
+    if ( rc != 0 )  // destroy if something went wrong
     {
         if ( dom != 0 )
         {
@@ -559,11 +602,9 @@ int xc_linux_restore(int xc_handle,
             (void)do_dom0_op(xc_handle, &op);
         }
     }
-    else
-    {
-        /* Success: print the domain id. */
-        verbose_printf("DOM=%llu\n", dom);
-    }
+
+    if ( mmu != NULL )
+        free(mmu);
 
     if ( pm_handle >= 0 )
         (void)close_pfn_mapper(pm_handle);
@@ -577,5 +618,7 @@ int xc_linux_restore(int xc_handle,
     if ( rc == 0 )
         *pdomid = dom;
 
+    DPRINTF("Restore exit with rc=%d\n",rc);
+
     return rc;
 }
tools/xc/lib/xc_linux_save.c
index 37dd7c6fce089eb78618857ec8cde6eb8478ac62..ceb5f02e15a0c4f9f33780239f3c21e519824ef7 100644 (file)
@@ -12,6 +12,7 @@
 #define BATCH_SIZE 1024   /* 1024 pages (4MB) at a time */
 
 #define DEBUG 0
+#define DDEBUG 0
 
 #if DEBUG
 #define DPRINTF(_f, _a...) printf ( _f , ## _a )
 #define DPRINTF(_f, _a...) ((void)0)
 #endif
 
+#if DDEBUG
+#define DDPRINTF(_f, _a...) printf ( _f , ## _a )
+#else
+#define DDPRINTF(_f, _a...) ((void)0)
+#endif
+
+
+
 /* This may allow us to create a 'quiet' command-line option, if necessary. */
 #define verbose_printf(_f, _a...) \
     do {                          \
@@ -61,6 +70,18 @@ inline int test_bit ( int nr, volatile void * addr)
             (nr % (sizeof(unsigned long)*8) ) ) & 1;
 }
 
+inline void clear_bit ( int nr, volatile void * addr)
+{
+    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] &= 
+       ~(1 << (nr % (sizeof(unsigned long)*8) ) );
+}
+
+inline void set_bit ( int nr, volatile void * addr)
+{
+    ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] |= 
+       (1 << (nr % (sizeof(unsigned long)*8) ) );
+}
+
 
 int xc_linux_save(int xc_handle,
                   u64 domid, 
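
These helpers treat the bitmap as an array of unsigned long, so each word holds 32 bits on i386. Note that the shifted constant is a plain int; that is fine at 32 bits, but a 64-bit build would want 1UL (an int shift of 32-63 bits is undefined, and bit 31 sign-extends when widened). A width-independent sketch for comparison:

    #define BITS_PER_LONG (sizeof(unsigned long) * 8)

    static inline void set_bit_portable(int nr, volatile void *addr)
    {
        ((unsigned long *)addr)[nr / BITS_PER_LONG] |=
            1UL << (nr % BITS_PER_LONG);
    }

    static inline void clear_bit_portable(int nr, volatile void *addr)
    {
        ((unsigned long *)addr)[nr / BITS_PER_LONG] &=
            ~(1UL << (nr % BITS_PER_LONG));
    }
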
@@ -73,6 +94,7 @@ int xc_linux_save(int xc_handle,
     unsigned long mfn;
     int verbose = flags & XCFLAGS_VERBOSE;
     int live = flags & XCFLAGS_LIVE;
+    int debug = flags & XCFLAGS_DEBUG;
     int sent_last_iter, sent_this_iter, max_iters;
 
     /* Remember if we stopped the guest, so we can restart it on exit. */
@@ -89,6 +111,7 @@ int xc_linux_save(int xc_handle,
 
     /* A table containg the type of each PFN (/not/ MFN!). */
     unsigned long *pfn_type = NULL;
+    unsigned long *pfn_batch = NULL;
 
     /* A temporary mapping, and a copy, of one frame of guest memory. */
     unsigned long page[1024];
@@ -115,7 +138,9 @@ int xc_linux_save(int xc_handle,
     unsigned long nr_pfns;
 
     /* bitmap of pages left to send */
-    unsigned long *to_send;
+    unsigned long *to_send, *to_fix;
+
+//live=0;
 
     if ( mlock(&ctxt, sizeof(ctxt) ) )
     {
@@ -274,8 +299,9 @@ int xc_linux_save(int xc_handle,
        int sz = (nr_pfns/8) + 8; // includes slop at end of array
        
        to_send = malloc( sz );
+       to_fix  = calloc( 1, sz );
 
-       if (!to_send)
+       if (!to_send || !to_fix)
        {
            ERROR("Couldn't allocate to_send array");
            goto out;
@@ -292,8 +318,9 @@ int xc_linux_save(int xc_handle,
 
     /* We want zeroed memory so use calloc rather than malloc. */
     pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
+    pfn_batch = calloc(BATCH_SIZE, sizeof(unsigned long));
 
-    if ( (pfn_type == NULL) )
+    if ( (pfn_type == NULL) || (pfn_batch == NULL) )
     {
         errno = ENOMEM;
         goto out;
@@ -370,22 +397,41 @@ int xc_linux_save(int xc_handle,
 
            for( batch = 0; batch < BATCH_SIZE && n < nr_pfns ; n++ )
            {
-               if ( !test_bit(n, to_send ) ) continue;
 
+               if(0 && debug)
+                   fprintf(stderr,"%d pfn= %08lx mfn= %08lx %d   [mfn]= %08lx\n",
+                           iter, n, live_pfn_to_mfn_table[n],
+                           test_bit(n,to_send),
+                           live_mfn_to_pfn_table[live_pfn_to_mfn_table[n]&0xFFFFF]);
+
+
+               if ( !test_bit(n, to_send ) &&
+                   !( last_iter && test_bit(n, to_fix ) ) ) continue;
+               
+               pfn_batch[batch] = n;
                pfn_type[batch] = live_pfn_to_mfn_table[n];
 
                if( pfn_type[batch] == 0x80000004 )
                {
-                   DPRINTF("Skip netbuf pfn %lx. mfn %lx\n",n,pfn_type[batch]);
+                   set_bit( n, to_fix );
+                   if( iter>1 )
+                       DDPRINTF("Urk! netbuf race: iter %d, pfn %lx. mfn %lx\n",
+                              iter,n,pfn_type[batch]);
                    continue;
                }
 
-               if(iter>1) { DPRINTF("pfn=%x mfn=%x\n",n,pfn_type[batch]); }
-               
+               if ( last_iter && test_bit(n, to_fix ) && !test_bit(n, to_send ))
+               {
+                   DPRINTF("Fix! iter %d, pfn %lx. mfn %lx\n",
+                              iter,n,pfn_type[batch]);
+               }
+
+               clear_bit( n, to_fix ); 
+
                batch++;
            }
            
-           DPRINTF("batch %d:%d (n=%d)\n",iter,batch,n);
+           DDPRINTF("batch %d:%d (n=%d)\n",iter,batch,n);
 
            if(batch == 0) goto skip; // vanishingly unlikely...
            
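
The selection test combines two bitmaps: to_send marks pages dirtied since the last scan, while to_fix marks pages that were transient network buffers when scanned (the 0x80000004 marker checked below) and therefore must be retransmitted once the guest is finally stopped. Read as a predicate, using the file's own test_bit (a sketch of the condition above):

    /* Nonzero if pfn n belongs in the current batch. */
    static inline int want_pfn(unsigned long n, unsigned long *to_send,
                               unsigned long *to_fix, int last_iter)
    {
        return test_bit(n, to_send) ||
               (last_iter && test_bit(n, to_fix));
    }
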
@@ -408,15 +454,26 @@ int xc_linux_save(int xc_handle,
            {
                if((pfn_type[j]>>29) == 7)
                {
-                   DPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
+                   DDPRINTF("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
                    continue;
                }
                
+               if(0 && debug)
+                   fprintf(stderr,"%d pfn= %08lx mfn= %08lx [mfn]= %08lx sum= %08lx\n",
+                           iter, 
+                           (pfn_type[j] & PGT_type_mask) | pfn_batch[j],
+                           pfn_type[j],
+                           live_mfn_to_pfn_table[pfn_type[j]&(~PGT_type_mask)],
+                           csum_page(region_base + (PAGE_SIZE*j))
+                       );
+
                /* canonicalise mfn->pfn */
                pfn_type[j] = (pfn_type[j] & PGT_type_mask) |
-                   live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
+                   pfn_batch[j];
+               //live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
+
            }
-           
+
            
            if ( (*writerfn)(writerst, &batch, sizeof(int) ) )
            {
@@ -437,7 +494,7 @@ int xc_linux_save(int xc_handle,
                
                if((pfn_type[j]>>29) == 7)
                {
-                   DPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
+                   DDPRINTF("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
                    continue;
                }
                
@@ -494,6 +551,7 @@ int xc_linux_save(int xc_handle,
                }  /* end of it's a PT page */
                else
                {  /* normal page */
+
                    if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) )
                    {
                        ERROR("Error when writing to state file (5)");
@@ -512,6 +570,23 @@ int xc_linux_save(int xc_handle,
        
        verbose_printf("\b\b\b\b100%% (%d pages)\n", sent_this_iter );
        
+       if ( debug && last_iter )
+       {
+           int minusone = -1;
+           memset( to_send, 0xff, nr_pfns/8 );
+           debug = 0;
+           printf("Entering debug resend-all mode\n");
+    
+           /* send "-1" to put receiver into debug mode */
+           if ( (*writerfn)(writerst, &minusone, sizeof(int)) )
+           {
+               ERROR("Error when writing to state file (6)");
+               goto out;
+           }
+
+           continue;
+       }
+
        if ( last_iter )
            break;
 
@@ -520,7 +595,7 @@ int xc_linux_save(int xc_handle,
            if ( ( sent_this_iter > (sent_last_iter * 0.95) ) ||
                 (iter >= max_iters) || (sent_this_iter < 10) )
            {
-               printf("Start last iteration\n");
+               DPRINTF("Start last iteration\n");
                last_iter = 1;
 
                xc_domain_stop_sync( xc_handle, domid );
@@ -536,6 +611,7 @@ int xc_linux_save(int xc_handle,
            }
 
            sent_last_iter = sent_this_iter;
+
        }
 
 
@@ -609,6 +685,8 @@ out:
 
     if ( pfn_type != NULL )
         free(pfn_type);
+
+    DPRINTF("Save exit rc=%d\n",rc);
     
     return !!rc;
 
tools/xc/lib/xc_private.c
index 41eb2e744a1ad03ce4a0882c9f8305bdc0e0d860..430dc6ec1154eb1aa5d50f5f5a7f1ca5fe61adc4 100644 (file)
@@ -410,7 +410,7 @@ int xc_domain_stop_sync( int xc_handle, domid_t domid )
 
         if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED )
        {
-           printf("\nDomain %lld stopped\n",domid);
+           printf("Domain %lld stopped\n",domid);
             return 0;
        }
        
@@ -420,3 +420,21 @@ int xc_domain_stop_sync( int xc_handle, domid_t domid )
 out:
     return -1;    
 }
+
+/**********************************************************************/
+
+// This is shared between save and restore, and may be useful elsewhere.
+
+unsigned long csum_page ( void * page )
+{
+    int i;
+    unsigned long *p = page;
+    unsigned long long sum=0;
+
+    for (i=0;i<PAGE_SIZE/sizeof(unsigned long);i++)
+    {
+       sum += p[i];
+    }
+
+    return sum ^ (sum>>32);
+}
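
csum_page folds a 64-bit sum of the page's words down to 32 bits with sum ^ (sum >> 32). Being a plain sum it is order-insensitive and weak as a checksum, but it is cheap and good enough to flag gross divergence in the verify path. Typical debug use (buf and dst are hypothetical names for the received image and the resident page):

    if ( csum_page(buf) != csum_page(dst) )
        printf("page mismatch: received %08lx, resident %08lx\n",
               csum_page(buf), csum_page(dst));
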
tools/xc/py/Xc.c
index b2ae143edafd7dd95662e4aea03ef83efec90e72..7bb1d877bdda8a4447b69ad97896061a6e982486 100644 (file)
@@ -191,17 +191,18 @@ static PyObject *pyxc_linux_save(PyObject *self,
 
     u64   dom;
     char *state_file;
-    int   progress = 1, live = -1;
+    int   progress = 1, live = -1, debug = 0;
     unsigned int flags = 0;
 
-    static char *kwd_list[] = { "dom", "state_file", "progress", "live", NULL };
+    static char *kwd_list[] = { "dom", "state_file", "progress", "live", "debug", NULL };
 
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|ii", kwd_list, 
-                                      &dom, &state_file, &progress, &live) )
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|iii", kwd_list, 
+                                      &dom, &state_file, &progress, &live, &debug) )
         return NULL;
 
     if (progress)  flags |= XCFLAGS_VERBOSE;
     if (live == 1) flags |= XCFLAGS_LIVE;
+    if (debug)     flags |= XCFLAGS_DEBUG;
 
     if ( strncmp(state_file,"tcp:", strlen("tcp:")) == 0 )
     {
@@ -362,6 +363,7 @@ static PyObject *pyxc_linux_restore(PyObject *self,
            do { 
                rc = read( (int) fd, ((char*)buf)+tot, count-tot ); 
                if ( rc < 0 ) { perror("READ"); return rc; }
+               if ( rc == 0 ) { printf("read: need %d, tot=%d, got zero\n", (int)count, (int)tot); return -1; }
                tot += rc;
            } 
             while ( tot < count );
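
The added rc == 0 test matters: read() returning zero means end-of-file, and without it a truncated stream left this loop spinning forever. A self-contained version of the same pattern, with EINTR retry added as an assumption rather than taken from the tree:

    #include <errno.h>
    #include <unistd.h>

    /* Read exactly count bytes from fd into buf; -1 on error or early EOF. */
    static int read_exact(int fd, void *buf, size_t count)
    {
        size_t tot = 0;
        while ( tot < count )
        {
            ssize_t rc = read(fd, (char *)buf + tot, count - tot);
            if ( rc < 0 && errno == EINTR )
                continue;          /* interrupted by a signal: retry */
            if ( rc <= 0 )
                return -1;         /* hard error, or EOF mid-record */
            tot += rc;
        }
        return 0;
    }
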
xen/arch/i386/smp.c
index 4989fc5085dba60fcb8152e8c60a6f21dd4910ec..363d61626e411f66e89d244f8c2eccb50dcdbafc 100644 (file)
@@ -212,7 +212,7 @@ static inline void send_IPI_allbutself(int vector)
  */
 
 static spinlock_t flush_lock = SPIN_LOCK_UNLOCKED;
-static volatile unsigned long flush_cpumask;
+volatile unsigned long flush_cpumask;
 
 asmlinkage void smp_invalidate_interrupt(void)
 {
xen/arch/i386/traps.c
index 7250074420bc3af9b6fc2110fdf8a8e6ed568cf7..d30c324804c81926b3e59fbcd613f8b6eea8e287 100644 (file)
@@ -167,7 +167,6 @@ void show_registers(struct pt_regs *regs)
            regs->xfs & 0xffff, regs->xgs & 0xffff, ss);
 
     show_stack(&regs->esp);
-    show_trace(&regs->esp);
 }      
 
 
xen/common/domain.c
index f952ce577ed3e910117e2c9f405d36f2bd43fb57..ee11f20fcd7afb1d62a82a802dd5d7b063341272 100644 (file)
@@ -526,6 +526,16 @@ unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
 
        /* initialise to machine_to_phys_mapping table to likely pfn */
        machine_to_phys_mapping[page-frame_table] = alloc_pfns;
+
+#ifndef NDEBUG
+       {
+           // initialise with magic marker if in DEBUG mode
+           void * a = map_domain_mem( (page-frame_table)<<PAGE_SHIFT );
+           memset( a, 0x80 | (char) p->domain, PAGE_SIZE );
+           unmap_domain_mem( a );
+       }
+#endif
+
     }
 
     p->tot_pages = nr_pages;
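
With this marker in place, every byte of a freshly allocated domain page reads 0x80 | domain, e.g. 0x83838383 word-at-a-time for domain 3, so any page still showing the pattern after a restore was never written by the migration path. A hypothetical helper for spotting such words on a 32-bit build (illustrative only, not part of this change):

    static inline int is_debug_fill(unsigned long word, unsigned int domain)
    {
        unsigned long pat = (0x80 | domain) & 0xff;
        pat |= pat << 8;
        pat |= pat << 16;          /* replicate the byte across 32 bits */
        return word == pat;
    }
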
xen/common/shadow.c
index 44945556e18d1bb7151b088ee40f13bc8b02e5a3..216c3deda1992aebf478eed6ef67fc377cacf75c 100644 (file)
@@ -28,7 +28,19 @@ hypercall lock anyhow (at least initially).
 
 ********/
 
-static spinlock_t cpu_stall_lock; 
+
+/**
+
+FIXME:
+
+1. Flush needs to avoid blowing away the L2 page that another CPU may be using!
+
+fix using cpu_raise_softirq
+
+have a flag to count in, (after switching to init's PTs) 
+spinlock, reload cr3_shadow, unlock
+
+**/
 
 static inline void free_shadow_page( struct mm_struct *m, 
                                      struct pfn_info *pfn_info )
@@ -115,7 +127,7 @@ static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
 
             for (i=0;i<ENTRIES_PER_L1_PAGETABLE;i++)
             {                    
-                if ( spl1e[i] & _PAGE_RW )
+                if ( (spl1e[i] & _PAGE_PRESENT ) && (spl1e[i] & _PAGE_RW) )
                 {
                     work++;
                     spl1e[i] &= ~_PAGE_RW;
@@ -124,6 +136,8 @@ static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
             unmap_domain_mem( spl1e );
         }
     }
+       break;
+
     }
     return work;
 }
@@ -161,7 +175,6 @@ static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
 
 void shadow_mode_init(void)
 {
-    spin_lock_init( &cpu_stall_lock ); 
 }
 
 int shadow_mode_enable( struct task_struct *p, unsigned int mode )
@@ -184,9 +197,9 @@ int shadow_mode_enable( struct task_struct *p, unsigned int mode )
 
     // allocate space for first lot of extra nodes
     m->shadow_ht_extras = kmalloc( sizeof(void*) + 
-                                  (shadow_ht_extra_size * 
-                                   sizeof(struct shadow_status)),
-                                  GFP_KERNEL );
+                                                                  (shadow_ht_extra_size * 
+                                                                       sizeof(struct shadow_status)),
+                                                                  GFP_KERNEL );
 
     if( ! m->shadow_ht_extras )
         goto nomem;
@@ -225,7 +238,7 @@ int shadow_mode_enable( struct task_struct *p, unsigned int mode )
     __shadow_mk_pagetable( m );
     return 0;
 
- nomem:
+nomem:
     return -ENOMEM;
 }
 
@@ -263,7 +276,7 @@ void shadow_mode_disable( struct task_struct *p )
 }
 
 static int shadow_mode_table_op( struct task_struct *p, 
-                                dom0_shadow_control_t *sc )
+                                                                dom0_shadow_control_t *sc )
 {
     unsigned int op = sc->op;
     struct mm_struct *m = &p->mm;
@@ -273,6 +286,8 @@ static int shadow_mode_table_op( struct task_struct *p,
     // tables right now. Calling flush on yourself would be really
     // stupid.
 
+    ASSERT(spin_is_locked(&p->mm.shadow_lock));
+
     if ( m == &current->mm )
     {
         printk("Don't try and flush your own page tables!\n");
@@ -291,48 +306,49 @@ static int shadow_mode_table_op( struct task_struct *p,
    
     case DOM0_SHADOW_CONTROL_OP_CLEAN:
     {
-       int i,j,zero=1;
+               int i,j,zero=1;
                
-       __scan_shadow_table( m, op );
+               __scan_shadow_table( m, op );
+               //    __free_shadow_table( m );
        
-       if( p->tot_pages > sc->pages || 
-           !sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap )
-       {
-           rc = -EINVAL;
-           goto out;
-       }
+               if( p->tot_pages > sc->pages || 
+                       !sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap )
+               {
+                       rc = -EINVAL;
+                       goto out;
+               }
        
-       sc->pages = p->tot_pages;
+               sc->pages = p->tot_pages;
        
 #define chunk (8*1024) // do this in 1KB chunks for L1 cache
        
-       for(i=0;i<p->tot_pages;i+=chunk)
-       {
-           int bytes = ((  ((p->tot_pages-i) > (chunk))?
-                           (chunk):(p->tot_pages-i) ) + 7) / 8;
+               for(i=0;i<p->tot_pages;i+=chunk)
+               {
+                       int bytes = ((  ((p->tot_pages-i) > (chunk))?
+                                                       (chunk):(p->tot_pages-i) ) + 7) / 8;
            
-           copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
-                         p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                         bytes );
+                       copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                                                 p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                                 bytes );
            
-           for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
-           {
-               if( p->mm.shadow_dirty_bitmap[j] != 0 )
-                   zero = 0;
-           }
-
-           memset( p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
-                   0, bytes);
-       }
+                       for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
+                       {
+                               if( p->mm.shadow_dirty_bitmap[j] != 0 )
+                                       zero = 0;
+                       }
 
-       if (zero)
-       {
-           /* might as well stop the domain as an optimization. */
-           if ( p->state != TASK_STOPPED )
-               send_guest_virq(p, VIRQ_STOP);
-       }
-       
-       break;
+                       memset( p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                       0, bytes);
+               }
+
+               if (zero)
+               {
+                       /* might as well stop the domain as an optimization. */
+                       if ( p->state != TASK_STOPPED )
+                               send_guest_virq(p, VIRQ_STOP);
+               }
+
+               break;
     }
     }
 
@@ -352,50 +368,10 @@ out:
 int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc )
 {
     unsigned int cmd = sc->op;
-    int rc = 0, cpu;
-
-    // don't call if already shadowed...
-
-    /* The following is pretty hideous because we don't have a way of
-       synchronously pausing a domain. If it's assigned to the curernt CPU,
-       we don't have to worry -- it can't possibly actually be running.
-       If its on another CPU, for the moment, we do something really gross:
-       we cause the other CPU to spin regardless of what domain it is running. 
-
-       I know this is really grim, but it only lasts a few 10's of
-       microseconds. It needs fixing as soon as the last of the Linux-isms
-       get removed from the task structure...
-
-       Oh, and let's hope someone doesn't repin the CPU while we're here.
-       Also, prey someone else doesn't do this in another domain.
-       At least there's only one dom0 at the moment...
-
-     */
+    int rc = 0;
 
-printk("XXX\n");
     spin_lock(&p->mm.shadow_lock);
 
-printk("SMC irq=%d\n",local_irq_is_enabled());
-    spin_lock( &cpu_stall_lock );              
-    cpu = p->processor;
-printk("got target cpu=%d this cpu=%d\n",cpu, current->processor );
-    if ( cpu != current->processor )
-    {
-       static void cpu_stall(void * data)
-       {
-           if ( current->processor == (int) data )
-           {
-               printk("Stall cpu=%d is locked %d irq=%d\n",(int)data,spin_is_locked(&cpu_stall_lock),local_irq_is_enabled());
-               spin_lock( &cpu_stall_lock );
-               printk("release\n");
-               spin_unlock( &cpu_stall_lock );
-           }
-       }
-printk("before\n");
-       smp_call_function(cpu_stall, (void*)cpu, 1, 0); // don't wait!
-printk("after\n");
-    }
-
     if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
     {
         shadow_mode_disable(p);
@@ -412,18 +388,15 @@ printk("after\n");
     } 
     else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN )
     {
-printk("+");
         rc = shadow_mode_table_op(p, sc);
-printk("=");
     }
     else
     {
         rc = -EINVAL;
     }
 
-    spin_unlock( &cpu_stall_lock );
-printk("SMC- %d\n",rc);
-
+       flush_tlb_cpu(p->processor);
+   
     spin_unlock(&p->mm.shadow_lock);
 
     return rc;
@@ -549,8 +522,6 @@ int shadow_fault( unsigned long va, long error_code )
     unsigned long gpte, spte;
     struct mm_struct *m = &current->mm;
 
-    // we know interrupts are always on entry to the page fault handler 
-
     SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code );
 
     check_pagetable( current, current->mm.pagetable, "pre-sf" );
@@ -573,9 +544,18 @@ int shadow_fault( unsigned long va, long error_code )
         return 0;
     }
 
-    spin_lock(&current->mm.shadow_lock);
     // take the lock and reread gpte
 
+    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
+       {
+               extern volatile unsigned long flush_cpumask;
+               if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
+                       local_flush_tlb();
+               rep_nop();
+       }
+       
+       ASSERT(spin_is_locked(&current->mm.shadow_lock));
+       
     if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
     {
         SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
xen/include/xen/shadow.h
index 587f9178bd5dd040c02c8dcc1432349dbb363991..f1ce8b6689d0b62608de623c1f374f6814788379 100644 (file)
@@ -27,14 +27,14 @@ extern void shadow_mode_init(void);
 extern int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc );
 extern int shadow_fault( unsigned long va, long error_code );
 extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, 
-                                                                               unsigned long *prev_spfn_ptr,
-                                                                               l1_pgentry_t **prev_spl1e_ptr  );
+                                       unsigned long *prev_spfn_ptr,
+                                       l1_pgentry_t **prev_spl1e_ptr  );
 extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
 extern void unshadow_table( unsigned long gpfn, unsigned int type );
 extern int shadow_mode_enable( struct task_struct *p, unsigned int mode );
 extern void shadow_mode_disable( struct task_struct *p );
 extern unsigned long shadow_l2_table( 
-                     struct mm_struct *m, unsigned long gpfn );
+    struct mm_struct *m, unsigned long gpfn );
 
 #define SHADOW_DEBUG 0
 #define SHADOW_HASH_DEBUG 0
@@ -51,24 +51,24 @@ struct shadow_status {
 
 #ifndef NDEBUG
 #define SH_LOG(_f, _a...)                             \
-  printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
-         current->domain , __LINE__ , ## _a )
+printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
+       current->domain , __LINE__ , ## _a )
 #else
 #define SH_LOG(_f, _a...) 
 #endif
 
 #if SHADOW_DEBUG
 #define SH_VLOG(_f, _a...)                             \
-  printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
-         current->domain , __LINE__ , ## _a )
+    printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
+          current->domain , __LINE__ , ## _a )
 #else
 #define SH_VLOG(_f, _a...) 
 #endif
 
 #if 0
 #define SH_VVLOG(_f, _a...)                             \
-  printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
-         current->domain , __LINE__ , ## _a )
+    printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
+          current->domain , __LINE__ , ## _a )
 #else
 #define SH_VVLOG(_f, _a...) 
 #endif
@@ -76,46 +76,57 @@ struct shadow_status {
 
 /************************************************************************/
 
-static inline void mark_dirty( struct mm_struct *m, unsigned int mfn )
+    static inline void __mark_dirty( struct mm_struct *m, unsigned int mfn )
 {
-       unsigned int pfn;
+    unsigned int pfn;
 
-       pfn = machine_to_phys_mapping[mfn];
+    ASSERT(spin_is_locked(&m->shadow_lock));
+       
+    //printk("%08x %08lx\n", mfn, machine_to_phys_mapping[mfn] );
 
-       /* We use values with the top bit set to mark MFNs that aren't
-          really part of the domain's psuedo-physical memory map e.g.
-           the shared info frame. Nothing to do here...
-         */
-       if ( unlikely(pfn & 0x80000000U) ) return; 
+    pfn = machine_to_phys_mapping[mfn];
 
-       ASSERT(m->shadow_dirty_bitmap);
-       if( likely(pfn<m->shadow_dirty_bitmap_size) )
-       {
-               /* use setbit to be smp guest safe. Since the same page is likely to 
-                  get marked dirty many times, examine the bit first before doing the
-                  expensive lock-prefixed opertion */
+    /* We use values with the top bit set to mark MFNs that aren't
+       really part of the domain's pseudo-physical memory map e.g.
+       the shared info frame. Nothing to do here...
+       */
+    if ( unlikely(pfn & 0x80000000U) ) return; 
 
-               if (! test_bit( pfn, m->shadow_dirty_bitmap ) )
-                       set_bit( pfn, m->shadow_dirty_bitmap );
-       }
-       else
-       {
-               extern void show_traceX(void);
-               SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
-                          mfn, pfn, m->shadow_dirty_bitmap_size, m );
-               SH_LOG("dom=%lld caf=%08x taf=%08x\n", 
-                          frame_table[mfn].u.domain->domain,
-                          frame_table[mfn].count_and_flags, 
-                          frame_table[mfn].type_and_flags );
-               //show_traceX();
-       }
+    ASSERT(m->shadow_dirty_bitmap);
+    if( likely(pfn<m->shadow_dirty_bitmap_size) )
+    {
+       /* These updates occur with mm.shadow_lock held */
+       __set_bit( pfn, m->shadow_dirty_bitmap );
+    }
+    else
+    {
+       extern void show_traceX(void);
+       SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
+              mfn, pfn, m->shadow_dirty_bitmap_size, m );
+       SH_LOG("dom=%lld caf=%08x taf=%08x\n", 
+              frame_table[mfn].u.domain->domain,
+              frame_table[mfn].count_and_flags, 
+              frame_table[mfn].type_and_flags );
+       //show_traceX();
+    }
 
 }
 
+
+static inline void mark_dirty( struct mm_struct *m, unsigned int mfn )
+{      
+    ASSERT(local_irq_is_enabled());
+    //if(spin_is_locked(&m->shadow_lock)) printk("+");
+    spin_lock(&m->shadow_lock);
+    __mark_dirty( m, mfn );
+    spin_unlock(&m->shadow_lock);
+}
+
+
 /************************************************************************/
 
 static inline void l1pte_write_fault( struct mm_struct *m, 
-                                                                         unsigned long *gpte_p, unsigned long *spte_p )
+                                     unsigned long *gpte_p, unsigned long *spte_p )
 { 
     unsigned long gpte = *gpte_p;
     unsigned long spte = *spte_p;
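
Splitting mark_dirty into a locking wrapper and __mark_dirty lets callers that already hold mm.shadow_lock skip re-taking it, and because the lock now serialises all writers, the non-atomic __set_bit replaces the old test-then-atomic-set dance. The trade-off, assuming Linux bitops conventions carry over here:

    /* Atomic: safe with no lock held, but a bus-locked RMW on x86. */
    set_bit(pfn, m->shadow_dirty_bitmap);

    /* Non-atomic: a plain RMW, valid only while mm.shadow_lock
       serialises every writer of the bitmap. */
    __set_bit(pfn, m->shadow_dirty_bitmap);
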
@@ -123,17 +134,17 @@ static inline void l1pte_write_fault( struct mm_struct *m,
     switch( m->shadow_mode )
     {
     case SHM_test:
-               spte = gpte;
-               gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
-               spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;                        
-               break;
+       spte = gpte;
+       gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+       spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;                        
+       break;
 
     case SHM_logdirty:
-               spte = gpte;
-               gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
-               spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;                        
-               mark_dirty( m, (gpte >> PAGE_SHIFT) );
-               break;
+       spte = gpte;
+       gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+       spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;                        
+       __mark_dirty( m, (gpte >> PAGE_SHIFT) );
+       break;
     }
 
     *gpte_p = gpte;
@@ -141,7 +152,7 @@ static inline void l1pte_write_fault( struct mm_struct *m,
 }
 
 static inline void l1pte_read_fault( struct mm_struct *m, 
-                                                                        unsigned long *gpte_p, unsigned long *spte_p )
+                                    unsigned long *gpte_p, unsigned long *spte_p )
 { 
     unsigned long gpte = *gpte_p;
     unsigned long spte = *spte_p;
@@ -149,19 +160,19 @@ static inline void l1pte_read_fault( struct mm_struct *m,
     switch( m->shadow_mode )
     {
     case SHM_test:
-               spte = gpte;
-               gpte |= _PAGE_ACCESSED;
-               spte |= _PAGE_ACCESSED;                         
-               if ( ! (gpte & _PAGE_DIRTY ) )
-                       spte &= ~ _PAGE_RW;
-               break;
+       spte = gpte;
+       gpte |= _PAGE_ACCESSED;
+       spte |= _PAGE_ACCESSED;                         
+       if ( ! (gpte & _PAGE_DIRTY ) )
+           spte &= ~ _PAGE_RW;
+       break;
 
     case SHM_logdirty:
-               spte = gpte;
-               gpte |= _PAGE_ACCESSED;
-               spte |= _PAGE_ACCESSED;                         
-               spte &= ~ _PAGE_RW;
-               break;
+       spte = gpte;
+       gpte |= _PAGE_ACCESSED;
+       spte |= _PAGE_ACCESSED;                         
+       spte &= ~ _PAGE_RW;
+       break;
     }
 
     *gpte_p = gpte;
@@ -169,7 +180,7 @@ static inline void l1pte_read_fault( struct mm_struct *m,
 }
 
 static inline void l1pte_no_fault( struct mm_struct *m, 
-                                                                  unsigned long *gpte_p, unsigned long *spte_p )
+                                  unsigned long *gpte_p, unsigned long *spte_p )
 { 
     unsigned long gpte = *gpte_p;
     unsigned long spte = *spte_p;
@@ -177,26 +188,26 @@ static inline void l1pte_no_fault( struct mm_struct *m,
     switch( m->shadow_mode )
     {
     case SHM_test:
-               spte = 0;
-               if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
-                        (_PAGE_PRESENT|_PAGE_ACCESSED) )
-               {
-                       spte = gpte;
-                       if ( ! (gpte & _PAGE_DIRTY ) )
-                               spte &= ~ _PAGE_RW;
-               }
-               break;
+       spte = 0;
+       if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
+            (_PAGE_PRESENT|_PAGE_ACCESSED) )
+       {
+           spte = gpte;
+           if ( ! (gpte & _PAGE_DIRTY ) )
+               spte &= ~ _PAGE_RW;
+       }
+       break;
 
     case SHM_logdirty:
-               spte = 0;
-               if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
-                        (_PAGE_PRESENT|_PAGE_ACCESSED) )
-               {
-                       spte = gpte;
-                       spte &= ~ _PAGE_RW;
-               }
-
-               break;
+       spte = 0;
+       if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) == 
+            (_PAGE_PRESENT|_PAGE_ACCESSED) )
+       {
+           spte = gpte;
+           spte &= ~ _PAGE_RW;
+       }
+
+       break;
     }
 
     *gpte_p = gpte;
@@ -204,27 +215,27 @@ static inline void l1pte_no_fault( struct mm_struct *m,
 }
 
 static inline void l2pde_general( struct mm_struct *m, 
-                          unsigned long *gpde_p, unsigned long *spde_p,
-                          unsigned long sl1pfn)
+                                 unsigned long *gpde_p, unsigned long *spde_p,
+                                 unsigned long sl1pfn)
 {
     unsigned long gpde = *gpde_p;
     unsigned long spde = *spde_p;
 
-       spde = 0;
+    spde = 0;
 
-       if ( sl1pfn )
-       {
-               spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) | 
-                       _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
-               gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
-
-               if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK)  ) )
-               {   
-                       // detect linear map, and keep pointing at guest
-                       SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
-                       spde = gpde & ~_PAGE_RW;
-               }
+    if ( sl1pfn )
+    {
+       spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) | 
+           _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
+       gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
+
+       if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK)  ) )
+       {   
+           // detect linear map, and keep pointing at guest
+           SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
+           spde = gpde & ~_PAGE_RW;
        }
+    }
 
     *gpde_p = gpde;
     *spde_p = spde;
@@ -237,45 +248,45 @@ static inline void l2pde_general( struct mm_struct *m,
 #if SHADOW_HASH_DEBUG
 static void shadow_audit(struct mm_struct *m, int print)
 {
-       int live=0, free=0, j=0, abs;
-       struct shadow_status *a;
+    int live=0, free=0, j=0, abs;
+    struct shadow_status *a;
        
     for(j=0;j<shadow_ht_buckets;j++)
     {
         a = &m->shadow_ht[j];        
-               if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
-               ASSERT((a->pfn&0xf0000000)==0);
-               ASSERT(a->pfn<0x00100000);
-               a=a->next;
+       if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
+       ASSERT((a->pfn&0xf0000000)==0);
+       ASSERT(a->pfn<0x00100000);
+       a=a->next;
         while(a && live<9999)
-               { 
-                       live++; 
-                       if(a->pfn == 0 || a->spfn_and_flags == 0)
-                       {
-                               printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
-                                          live, a->pfn, a->spfn_and_flags, a->next);
-                               BUG();
-                       }
-                       ASSERT(a->pfn);
-                       ASSERT((a->pfn&0xf0000000)==0);
-                       ASSERT(a->pfn<0x00100000);
-                       ASSERT(a->spfn_and_flags&PSH_pfn_mask);
-                       a=a->next; 
-               }
-               ASSERT(live<9999);
+       { 
+           live++; 
+           if(a->pfn == 0 || a->spfn_and_flags == 0)
+           {
+               printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
+                      live, a->pfn, a->spfn_and_flags, a->next);
+               BUG();
+           }
+           ASSERT(a->pfn);
+           ASSERT((a->pfn&0xf0000000)==0);
+           ASSERT(a->pfn<0x00100000);
+           ASSERT(a->spfn_and_flags&PSH_pfn_mask);
+           a=a->next; 
        }
+       ASSERT(live<9999);
+    }
 
     a = m->shadow_ht_free;
     while(a) { free++; a=a->next; }
 
     if(print) printk("Xlive=%d free=%d\n",live,free);
 
-       abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
-       if( abs < -1 || abs > 1 )
-       {
-               printk("live=%d free=%d l1=%d l2=%d\n",live,free,
-                         perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
-               BUG();
+    abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
+    if( abs < -1 || abs > 1 )
+    {
+       printk("live=%d free=%d l1=%d l2=%d\n",live,free,
+              perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
+       BUG();
     }
 
 }
@@ -287,56 +298,56 @@ static void shadow_audit(struct mm_struct *m, int print)
 
 
 static inline struct shadow_status* hash_bucket( struct mm_struct *m,
-                                                                                                unsigned int gpfn )
+                                                unsigned int gpfn )
 {
     return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
 }
 
 
 static inline unsigned long __shadow_status( struct mm_struct *m,
-                                                                                  unsigned int gpfn )
+                                            unsigned int gpfn )
 {
-       struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
+    struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
 
     b = B;
     ob = NULL;
 
-       SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
-       shadow_audit(m,0);  // if in debug mode
+    SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
+    shadow_audit(m,0);  // if in debug mode
 
-       do
+    do
+    {
+       if ( b->pfn == gpfn )
        {
-               if ( b->pfn == gpfn )
-               {
-                       unsigned long t;
-                       struct shadow_status *x;
-
-                       // swap with head
-                       t=B->pfn; B->pfn=b->pfn; b->pfn=t;
-                       t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; 
-                           b->spfn_and_flags=t;
-
-                       if(ob)
-                       {   // pull to front
-                               *ob=b->next;
-                               x=B->next;
-                               B->next=b;      
-                               b->next=x;
-                       }                       
-                       return B->spfn_and_flags;
-               }
+           unsigned long t;
+           struct shadow_status *x;
+
+           // swap with head
+           t=B->pfn; B->pfn=b->pfn; b->pfn=t;
+           t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags; 
+           b->spfn_and_flags=t;
+
+           if(ob)
+           {   // pull to front
+               *ob=b->next;
+               x=B->next;
+               B->next=b;      
+               b->next=x;
+           }                   
+           return B->spfn_and_flags;
+       }
 #if SHADOW_HASH_DEBUG
-               else
-               {
-                       if(b!=B)ASSERT(b->pfn);
-               }
-#endif
-               ob=&b->next;
-               b=b->next;
+       else
+       {
+           if(b!=B)ASSERT(b->pfn);
        }
-       while (b);
+#endif
+       ob=&b->next;
+       b=b->next;
+    }
+    while (b);
 
-       return 0;
+    return 0;
 }
 
 /* we can make this locking more fine grained e.g. per shadow page if it 
@@ -344,112 +355,119 @@ ever becomes a problem, but since we need a spin lock on the hash table
 anyway its probably not worth being too clever. */
 
 static inline unsigned long get_shadow_status( struct mm_struct *m,
-                                                                                  unsigned int gpfn )
+                                              unsigned int gpfn )
 {
-       unsigned long res;
-
-       /* If we get here, we know that this domain is running in shadow mode. 
-          We also know that some sort of update has happened to the underlying
-          page table page: either a PTE has been updated, or the page has
-          changed type. If we're in log dirty mode, we should set the approrpiate
-          bit in the dirty bitmap.
-          NB: the VA update path doesn't use this so needs to be handled 
-          independnetly. 
-        */
-
-       if( m->shadow_mode == SHM_logdirty )
-               mark_dirty( m, gpfn );
+    unsigned long res;
+
+    /* If we get here, we know that this domain is running in shadow mode. 
+       We also know that some sort of update has happened to the underlying
+       page table page: either a PTE has been updated, or the page has
+       changed type. If we're in log dirty mode, we should set the appropriate
+       bit in the dirty bitmap.
+       NB: the VA update path doesn't use this so needs to be handled
+       independently.
+       */
+
+    ASSERT(local_irq_is_enabled());
+    //if(spin_is_locked(&m->shadow_lock)) printk("*");
+    spin_lock(&m->shadow_lock);
+
+    if( m->shadow_mode == SHM_logdirty )
+       __mark_dirty( m, gpfn );
        
-       spin_lock(&m->shadow_lock);
-       res = __shadow_status( m, gpfn );
-       if (!res) spin_unlock(&m->shadow_lock);
-       return res;
+    res = __shadow_status( m, gpfn );
+    if (!res) spin_unlock(&m->shadow_lock);
+    return res;
 }
 
 
 static inline void put_shadow_status( struct mm_struct *m )
 {
-       spin_unlock(&m->shadow_lock);
+    spin_unlock(&m->shadow_lock);
 }
 
 
 static inline void delete_shadow_status( struct mm_struct *m,
-                                                                         unsigned int gpfn )
+                                        unsigned int gpfn )
 {
-       struct shadow_status *b, *B, **ob;
+    struct shadow_status *b, *B, **ob;
 
-       B = b = hash_bucket( m, gpfn );
+    ASSERT(spin_is_locked(&m->shadow_lock));
 
-       SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
-       shadow_audit(m,0);
-       ASSERT(gpfn);
+    B = b = hash_bucket( m, gpfn );
+
+    SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
+    shadow_audit(m,0);
+    ASSERT(gpfn);
 
-       if( b->pfn == gpfn )
+    if( b->pfn == gpfn )
     {
-               if (b->next)
-               {
-                       struct shadow_status *D=b->next;
-                       b->spfn_and_flags = b->next->spfn_and_flags;
-                       b->pfn = b->next->pfn;
-
-                       b->next = b->next->next;
-                       D->next = m->shadow_ht_free;
-                       D->pfn = 0;
-                       D->spfn_and_flags = 0;
-                       m->shadow_ht_free = D;
-               }
-               else
-               {
-                       b->pfn = 0;
-                       b->spfn_and_flags = 0;
-               }
+       if (b->next)
+       {
+           struct shadow_status *D=b->next;
+           b->spfn_and_flags = b->next->spfn_and_flags;
+           b->pfn = b->next->pfn;
+
+           b->next = b->next->next;
+           D->next = m->shadow_ht_free;
+           D->pfn = 0;
+           D->spfn_and_flags = 0;
+           m->shadow_ht_free = D;
+       }
+       else
+       {
+           b->pfn = 0;
+           b->spfn_and_flags = 0;
+       }
 
 #if SHADOW_HASH_DEBUG
-               if( __shadow_status(m,gpfn) ) BUG();  
-               shadow_audit(m,0);
+       if( __shadow_status(m,gpfn) ) BUG();  
+       shadow_audit(m,0);
 #endif
-               return;
+       return;
     }
 
-       ob = &b->next;
-       b=b->next;
+    ob = &b->next;
+    b=b->next;
 
-       do
+    do
+    {
+       if ( b->pfn == gpfn )                   
        {
-               if ( b->pfn == gpfn )                   
-               {
-                       b->pfn = 0;
-                       b->spfn_and_flags = 0;
+           b->pfn = 0;
+           b->spfn_and_flags = 0;
 
-                       // b is in the list
-            *ob=b->next;
-                       b->next = m->shadow_ht_free;
-                       m->shadow_ht_free = b;
+           // b is in the list
+           *ob=b->next;
+           b->next = m->shadow_ht_free;
+           m->shadow_ht_free = b;
 
 #if SHADOW_HASH_DEBUG
-                       if( __shadow_status(m,gpfn) ) BUG();
+           if( __shadow_status(m,gpfn) ) BUG();
 #endif
-                       shadow_audit(m,0);
-                       return;
-               }
-
-               ob = &b->next;
-               b=b->next;
+           shadow_audit(m,0);
+           return;
        }
-       while (b);
 
-       // if we got here, it wasn't in the list
+       ob = &b->next;
+       b=b->next;
+    }
+    while (b);
+
+    // if we got here, it wasn't in the list
     BUG();
 }
 
 
 static inline void set_shadow_status( struct mm_struct *m,
-                                                                         unsigned int gpfn, unsigned long s )
+                                     unsigned int gpfn, unsigned long s )
 {
-       struct shadow_status *b, *B, *extra, **fptr;
+    struct shadow_status *b, *B, *extra, **fptr;
     int i;
 
-       B = b = hash_bucket( m, gpfn );
+    ASSERT(spin_is_locked(&m->shadow_lock));
+
+    B = b = hash_bucket( m, gpfn );
    
     ASSERT(gpfn);
     //ASSERT(s);
@@ -458,106 +476,107 @@ static inline void set_shadow_status( struct mm_struct *m,
 
     shadow_audit(m,0);
 
-       do
+    do
+    {
+       if ( b->pfn == gpfn )                   
        {
-               if ( b->pfn == gpfn )                   
-               {
-                       b->spfn_and_flags = s;
-                       shadow_audit(m,0);
-                       return;
-               }
-
-               b=b->next;
+           b->spfn_and_flags = s;
+           shadow_audit(m,0);
+           return;
        }
-       while (b);
 
-       // if we got here, this is an insert rather than update
+       b=b->next;
+    }
+    while (b);
+
+    // if we got here, this is an insert rather than update
 
     ASSERT( s );  // deletes must have succeeded by here
 
     if ( B->pfn == 0 )
-       {
-               // we can use this head
-        ASSERT( B->next == 0 );
-               B->pfn = gpfn;
-               B->spfn_and_flags = s;
-               shadow_audit(m,0);
-               return;
-       }
+    {
+       // we can use this head
+       ASSERT( B->next == 0 );
+       B->pfn = gpfn;
+       B->spfn_and_flags = s;
+       shadow_audit(m,0);
+       return;
+    }
 
     if( unlikely(m->shadow_ht_free == NULL) )
     {
-        SH_LOG("allocate more shadow hashtable blocks");
+       SH_LOG("allocate more shadow hashtable blocks");
 
-        // we need to allocate more space
-        extra = kmalloc( sizeof(void*) + (shadow_ht_extra_size * 
-                                                          sizeof(struct shadow_status)), GFP_KERNEL );
+       // we need to allocate more space
+       extra = kmalloc( sizeof(void*) + (shadow_ht_extra_size * 
+                                         sizeof(struct shadow_status)), GFP_KERNEL );
 
-           if( ! extra ) BUG(); // should be more graceful here....
+       if( ! extra ) BUG(); // should be more graceful here....
 
-           memset( extra, 0, sizeof(void*) + (shadow_ht_extra_size * 
-                                                          sizeof(struct shadow_status)) );
+       memset( extra, 0, sizeof(void*) + (shadow_ht_extra_size * 
+                                          sizeof(struct shadow_status)) );
 
-        m->shadow_extras_count++;
+       m->shadow_extras_count++;
        
-        // add extras to free list
-           fptr = &m->shadow_ht_free;
-           for ( i=0; i<shadow_ht_extra_size; i++ )
-           {
-                   *fptr = &extra[i];
-                   fptr = &(extra[i].next);
-           }
-           *fptr = NULL;
+       // add extras to free list
+       fptr = &m->shadow_ht_free;
+       for ( i=0; i<shadow_ht_extra_size; i++ )
+       {
+           *fptr = &extra[i];
+           fptr = &(extra[i].next);
+       }
+       *fptr = NULL;
 
-           *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) = 
-                                            m->shadow_ht_extras;
-        m->shadow_ht_extras = extra;
+       *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) = 
+           m->shadow_ht_extras;
+       m->shadow_ht_extras = extra;
 
     }
 
-       // should really put this in B to go right to front
-       b = m->shadow_ht_free;
+    // should really put this in B to go right to front
+    b = m->shadow_ht_free;
     m->shadow_ht_free = b->next;
     b->spfn_and_flags = s;
-       b->pfn = gpfn;
-       b->next = B->next;
-       B->next = b;
+    b->pfn = gpfn;
+    b->next = B->next;
+    B->next = b;
 
-       shadow_audit(m,0);
+    shadow_audit(m,0);
 
-       return;
+    return;
 }
 
 static inline void __shadow_mk_pagetable( struct mm_struct *mm )
 {
-       unsigned long gpfn, spfn=0;
+    unsigned long gpfn, spfn=0;
 
-       gpfn =  pagetable_val(mm->pagetable) >> PAGE_SHIFT;
+    gpfn =  pagetable_val(mm->pagetable) >> PAGE_SHIFT;
                
-       if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
-       {
-               spfn = shadow_l2_table(mm, gpfn );
-       }      
-       mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+    if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
+    {
+       spfn = shadow_l2_table(mm, gpfn );
+    }      
+    mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
 }
 
 static inline void shadow_mk_pagetable( struct mm_struct *mm )
 {
-       SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
-                        pagetable_val(mm->pagetable), mm->shadow_mode );
+    SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
+            pagetable_val(mm->pagetable), mm->shadow_mode );
 
-       if ( unlikely(mm->shadow_mode) )
-       {
+    if ( unlikely(mm->shadow_mode) )
+    {
+       ASSERT(local_irq_is_enabled());
         spin_lock(&mm->shadow_lock);
 
-               __shadow_mk_pagetable( mm );
+       __shadow_mk_pagetable( mm );
 
         spin_unlock(&mm->shadow_lock);         
-       }
+    }
 
-       SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
-                        pagetable_val(mm->pagetable), mm->shadow_mode, 
-                        pagetable_val(mm->shadow_table) );
+    SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
+            pagetable_val(mm->pagetable), mm->shadow_mode, 
+            pagetable_val(mm->shadow_table) );
 
 }
 
@@ -570,3 +589,5 @@ extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
 
 
 #endif /* XEN_SHADOW_H */
+
+
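
One sharp edge in the interface above: get_shadow_status() takes mm.shadow_lock, applies the log-dirty update, and returns with the lock still held whenever it finds an entry; only on a miss does it drop the lock itself. Every successful call must therefore be paired with put_shadow_status(). The expected caller shape:

    unsigned long s;

    if ( (s = get_shadow_status(m, gpfn)) != 0 )
    {
        /* ... use s (spfn and flags); shadow_lock is held here ... */
        put_shadow_status(m);   /* drops mm.shadow_lock */
    }
    /* on a miss the lock has already been released */
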
xen/net/dev.c
index 025256813120c00adbbe3705435d0347cd4a391b..909e586b53df46ec2728e040e68114f3e955c28b 100644 (file)
@@ -2267,7 +2267,19 @@ long flush_bufs_for_vif(net_vif_t *vif)
 
        /* if in shadow mode, mark the PTE as dirty */
        if( p->mm.shadow_mode == SHM_logdirty )
+       {
            mark_dirty( &p->mm, rx->pte_ptr>>PAGE_SHIFT );
+#if 0
+           mark_dirty( &p->mm, rx->buf_pfn ); // XXXXXXX debug
+
+           {
+               unsigned long * p = map_domain_mem( rx->buf_pfn<<PAGE_SHIFT );
+               p[2] = 0xdeadc001;
+               unmap_domain_mem(p);
+           }
+#endif
+
+       }
        /* assume the shadow page table is about to be blown away,
           and that its not worth marking the buffer as dirty */
 
xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c
index a56783932f5b4455dfcff2bc307479bd1a317f7c..fbe5c4ecf605bd9aaa9c338abc3d542ee8cb9ef8 100644 (file)
@@ -366,6 +366,10 @@ static inline void _network_interrupt(struct net_device *dev)
         skb = np->rx_skbs[rx->id];
         ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
 
+/* XXXXX this is unsafe for live migrate -- if we do a scan before this
+point we won't transmit the right mfn! We have to fix this up in 
+xc_linux_save  */
+
         phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
             (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
 
xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c
index b06c6c26b08b92c50d2cc25db30d4e3ca250c0f9..1d1306df2b63217173bf8d1c1469b92040e199b2 100644 (file)
@@ -62,6 +62,9 @@ unsigned long *phys_to_machine_mapping;
 multicall_entry_t multicall_list[8];
 int nr_multicall_ents = 0;
 
+/* used so we treat multiple stop requests as a single one */
+int suspending = 0;
+
 /*
  * Machine setup..
  */
@@ -1204,6 +1207,8 @@ static void stop_task(void *unused)
 
     HYPERVISOR_stop(virt_to_machine(suspend_record) >> PAGE_SHIFT);
 
+    suspending = 0; 
+
     memcpy(&start_info, &suspend_record->resume_info, sizeof(start_info));
 
     set_fixmap(FIX_SHARED_INFO, start_info.shared_info);
@@ -1261,8 +1266,14 @@ static int stop_irq;
 
 static void stop_interrupt(int irq, void *unused, struct pt_regs *regs)
 {
-    stop_tq.routine = stop_task;
-    schedule_task(&stop_tq);
+    if (!suspending)
+    {
+       suspending = 1;
+       stop_tq.routine = stop_task;
+       schedule_task(&stop_tq);        
+    }
+    else
+       printk(KERN_ALERT"Ignoring queued stop request\n");
 }
 
 static int __init setup_stop_event(void)
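
The suspending flag coalesces a burst of stop requests into a single queued task, and is cleared in stop_task() only after HYPERVISOR_stop returns on resume. Since the flag is read and written non-atomically from interrupt context, a hardened variant could use an atomic bit instead; a sketch under that assumption, not what this change does (stop_task() would then clear bit 0 on resume):

    static unsigned long suspend_pending;    /* bit 0: stop already queued */

    static void stop_interrupt(int irq, void *unused, struct pt_regs *regs)
    {
        if ( test_and_set_bit(0, &suspend_pending) )
        {
            printk(KERN_ALERT "Ignoring queued stop request\n");
            return;
        }
        stop_tq.routine = stop_task;
        schedule_task(&stop_tq);
    }
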