bitkeeper revision 1.896 (40a0e9e8M0uaTwE5LBe9sIhr2vdX7Q)
author: iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Tue, 11 May 2004 14:57:44 +0000 (14:57 +0000)
committer: iap10@labyrinth.cl.cam.ac.uk <iap10@labyrinth.cl.cam.ac.uk>
Tue, 11 May 2004 14:57:44 +0000 (14:57 +0000)
Live migration initial checkin.

32 files changed:
tools/examples/xc_dom_control.py
tools/xc/lib/xc.h
tools/xc/lib/xc_domain.c
tools/xc/lib/xc_linux_build.c
tools/xc/lib/xc_linux_restore.c
tools/xc/lib/xc_linux_save.c
tools/xc/lib/xc_private.c
tools/xc/lib/xc_private.h
tools/xc/py/Xc.c
tools/xend/lib/utils.c
xen/common/dom0_ops.c
xen/common/domain.c
xen/common/memory.c
xen/common/network.c
xen/common/shadow.c
xen/drivers/block/xen_block.c
xen/include/asm-i386/processor.h
xen/include/hypervisor-ifs/dom0_ops.h
xen/include/xen/mm.h
xen/include/xen/shadow.h
xen/net/dev.c
xenolinux-2.4.26-sparse/arch/xen/drivers/block/block.c
xenolinux-2.4.26-sparse/arch/xen/drivers/dom0/core.c
xenolinux-2.4.26-sparse/arch/xen/drivers/network/network.c
xenolinux-2.4.26-sparse/arch/xen/kernel/setup.c
xenolinux-2.4.26-sparse/arch/xen/kernel/time.c
xenolinux-2.4.26-sparse/arch/xen/kernel/traps.c
xenolinux-2.4.26-sparse/arch/xen/mm/hypervisor.c
xenolinux-2.4.26-sparse/arch/xen/mm/ioremap.c
xenolinux-2.4.26-sparse/include/asm-xen/hypervisor.h
xenolinux-2.4.26-sparse/include/asm-xen/pgalloc.h
xenolinux-2.4.26-sparse/include/asm-xen/proc_cmd.h

index 4f0bd5de528fbc6daf6e6282629f0e3b8a8996c5..d6cae4f720b4ae01083fa39d00d96fc2e551e57d 100755 (executable)
@@ -139,10 +139,12 @@ elif cmd == 'suspend':
     xc.domain_stop( dom=dom )
     
     while not xc.domain_getinfo( first_dom=dom, max_doms=1 )[0]['stopped']:
-       time.sleep(0.1);
+       print "Sleep..."
+       time.sleep(0.001);
 
     rc = xc.linux_save( dom=dom, state_file=file, progress=1)
     if rc == 0 : xc.domain_destroy( dom=dom, force=1 )
+    else: xc.domain_start( dom=dom )  # sensible for production use
 
 elif cmd == 'cpu_bvtslice':
     if len(sys.argv) < 3:
index a0205bcc6b292aa36d72e6a8862089a5388e948e..2132d6e7c1ee63d6ee5763dde474dbab14ed382a 100644 (file)
@@ -57,7 +57,10 @@ int xc_domain_getinfo(int xc_handle,
 
 int xc_shadow_control(int xc_handle,
                       u64 domid, 
-                      unsigned int sop);
+                      unsigned int sop,
+                     unsigned long *dirty_bitmap,
+                     unsigned long pages);
+
 
 #define XCFLAGS_VERBOSE 1
 #define XCFLAGS_LIVE    2
@@ -247,11 +250,6 @@ int xc_readconsolering(int xc_handle,
 int xc_physinfo(int xc_handle,
                 xc_physinfo_t *info);
 
-
-int xc_shadow_control(int xc_handle,
-                      u64 domid, 
-                      unsigned int sop);
-
 int xc_domain_setname(int xc_handle,
                       u64 domid, 
                       char *name);
index c26a3f87c3f4db501c64437f958518cd81dc9340..6d0dd6d0f3bb76f9fd80f3abf6d7d017c7580b18 100644 (file)
@@ -109,13 +109,24 @@ int xc_domain_getinfo(int xc_handle,
 
 int xc_shadow_control(int xc_handle,
                       u64 domid, 
-                      unsigned int sop)
+                      unsigned int sop,
+                     unsigned long *dirty_bitmap,
+                     unsigned long pages)
 {
+    int rc;
     dom0_op_t op;
     op.cmd = DOM0_SHADOW_CONTROL;
     op.u.shadow_control.domain = (domid_t)domid;
     op.u.shadow_control.op     = sop;
-    return do_dom0_op(xc_handle, &op);
+    op.u.shadow_control.dirty_bitmap = dirty_bitmap;
+    op.u.shadow_control.pages  = pages;
+
+    rc = do_dom0_op(xc_handle, &op);
+
+    if ( rc == 0 )
+       return op.u.shadow_control.pages;
+    else
+       return rc;
 }
 
 int xc_domain_setname(int xc_handle,
index 83debd904d097b0e5b36999e5ab324bd79a9d421..98a3fb6a60e6342d379dbf080040fb677a93d584 100644 (file)
@@ -284,7 +284,7 @@ static int setup_guestos(int xc_handle,
 
     /* shared_info page starts its life empty. */
     shared_info = map_pfn_writeable(pm_handle, shared_info_frame);
-    memset(shared_info, 0, PAGE_SIZE);
+    memset(shared_info, 0, sizeof(shared_info_t));
     /* Mask all upcalls... */
     for ( i = 0; i < MAX_VIRT_CPUS; i++ )
         shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
index 861d38a5f757f319c6f7a51fe4524d7c584e50c2..9b1532159f546e7bb6d17d6eb0134e319a9f6c3a 100644 (file)
@@ -230,9 +230,16 @@ int xc_linux_restore(int xc_handle,
             goto out;
         }
 
-       //printf("batch=%d\n",j);
+       printf("batch %d\n",j);
        
-       if(j==0) break;  // our work here is done
+       if (j == 0) 
+           break;  // our work here is done
+
+       if( j > MAX_BATCH_SIZE )
+       {
+           ERROR("Max batch size exceeded. Giving up.");
+           goto out;
+       }
        
         if ( (*readerfn)(readerst, region_pfn_type, j*sizeof(unsigned long)) )
         {
@@ -242,6 +249,9 @@ int xc_linux_restore(int xc_handle,
 
        for(i=0;i<j;i++)
        {
+            if ((region_pfn_type[i]>>29) == 7)
+               continue;
+
            pfn = region_pfn_type[i] & ~PGT_type_mask;
            mfn = pfn_to_mfn_table[pfn];
            
@@ -261,9 +271,15 @@ int xc_linux_restore(int xc_handle,
            unsigned long *ppage;
 
            pfn = region_pfn_type[i] & ~PGT_type_mask;
+
+//if(n>=nr_pfns || ((region_pfn_type[i] & PGT_type_mask) == L2TAB) ) printf("pfn=%08lx mfn=%x\n",region_pfn_type[i],pfn_to_mfn_table[pfn]);
+
                            
 //if(pfn_type[i])printf("^pfn=%d %08lx\n",pfn,pfn_type[i]);
 
+            if ((region_pfn_type[i]>>29) == 7)
+               continue;
+
             if (pfn>nr_pfns)
            {
                ERROR("pfn out of range");
@@ -304,7 +320,7 @@ int xc_linux_restore(int xc_handle,
 
                        if ( xpfn >= nr_pfns )
                        {
-                           ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns);
+                           ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns);
                            goto out;
                        }
 #if 0
@@ -355,17 +371,19 @@ int xc_linux_restore(int xc_handle,
            default:
                ERROR("Bogus page type %x page table is out of range. i=%d nr_pfns=%d",region_pfn_type[i],i,nr_pfns);
                goto out;
-           }
+
+           } // end of page type switch statement
 
            if ( add_mmu_update(xc_handle, mmu,
                                (mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) )
                goto out;
 
-       }
+       } // end of 'batch' for loop
 
        n+=j; // crude stats
 
     }
+printf("RECEIVED ALL PAGES\n");
 
     mfn_mapper_close( region_mapper );
 
@@ -381,7 +399,10 @@ int xc_linux_restore(int xc_handle,
                                 (pfn_to_mfn_table[i]<<PAGE_SHIFT) | 
                                 MMU_EXTENDED_COMMAND,
                                 MMUEXT_PIN_L1_TABLE) )
+           {
+               printf("ERR pin L1 pfn=%lx mfn=%lx\n");
                 goto out;
+           }
         }
         else if ( pfn_type[i] == L2TAB )
         {
@@ -389,7 +410,10 @@ int xc_linux_restore(int xc_handle,
                                 (pfn_to_mfn_table[i]<<PAGE_SHIFT) | 
                                 MMU_EXTENDED_COMMAND,
                                 MMUEXT_PIN_L2_TABLE) )
+           {
+               printf("ERR pin L2 pfn=%lx mfn=%lx\n");
                 goto out;
+           }
         }
     }
 
@@ -421,6 +445,8 @@ int xc_linux_restore(int xc_handle,
     p_srec->resume_info.flags       = 0;
     unmap_pfn(pm_handle, p_srec);
 
+printf("new shared info is %lx\n", shared_info_frame);
+
     /* Uncanonicalise each GDT frame number. */
     if ( ctxt.gdt_ents > 8192 )
     {
@@ -451,7 +477,7 @@ int xc_linux_restore(int xc_handle,
 
     /* Copy saved contents of shared-info page. No checking needed. */
     ppage = map_pfn_writeable(pm_handle, shared_info_frame);
-    memcpy(ppage, shared_info, PAGE_SIZE);
+    memcpy(ppage, shared_info, sizeof(shared_info_t));
     unmap_pfn(pm_handle, ppage);
 
 
@@ -528,7 +554,9 @@ int xc_linux_restore(int xc_handle,
     op.u.builddomain.ctxt = &ctxt;
     rc = do_dom0_op(xc_handle, &op);
 
+printf("NORMAL EXIT RESTORE\n");
  out:
+printf("EXIT RESTORE\n");
     if ( mmu != NULL )
         free(mmu);
 
index 02e3ffc352d818b0b52b49a6943b0f7742890c4a..cbb1d66fd1a7b031ce358e489cdb0a023c07a9ae 100644 (file)
 /*
  * Returns TRUE if the given machine frame number has a unique mapping
  * in the guest's pseudophysical map.
+ * 0x80000000-3 mark the shared_info, and blk/net rings
  */
 #define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
-    (((_mfn) < (1024*1024)) &&          \
-     (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn)))
-
+    (((_mfn) < (1024*1024)) && \
+     ( ( (live_mfn_to_pfn_table[_mfn] < nr_pfns) && \
+       (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn)) ) || \
+\
+       (live_mfn_to_pfn_table[_mfn] >= 0x80000000 && \
+       live_mfn_to_pfn_table[_mfn] <= 0x80000003 ) || \
+       live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == 0x80000004 )  )
+     
 /* Returns TRUE if MFN is successfully converted to a PFN. */
 #define translate_mfn_to_pfn(_pmfn)         \
 ({                                          \
 })
 
 
+/* test_bit */
+inline int test_bit ( int nr, volatile void * addr)
+{
+    return ( ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> 
+            (nr % (sizeof(unsigned long)*8) ) ) & 1;
+}
+
+
 int xc_linux_save(int xc_handle,
                   u64 domid, 
                  unsigned int flags,
@@ -47,14 +61,11 @@ int xc_linux_save(int xc_handle,
                  void *writerst )
 {
     dom0_op_t op;
-    int rc = 1, i, j, k, n;
+    int rc = 1, i, j, k, n, last_iter, iter = 0;
     unsigned long mfn;
-    unsigned int prev_pc, this_pc;
     int verbose = flags & XCFLAGS_VERBOSE;
-    //int live = flags & XCFLAGS_LIVE;
-
-    /* state of the new MFN mapper */
-    mfn_mapper_t *mapper_handle1, *mapper_handle2;
+    int live = 1; //flags & XCFLAGS_LIVE;     // XXXXXXXXXXXXXXXXXXX
+    int sent_last_iter, sent_this_iter, max_iters;
 
     /* Remember if we stopped the guest, so we can restart it on exit. */
     int we_stopped_it = 0;
@@ -90,8 +101,13 @@ int xc_linux_save(int xc_handle,
     unsigned char *region_base;
 
     /* A temporary mapping, and a copy, of the guest's suspend record. */
-    suspend_record_t *p_srec, srec;
+    suspend_record_t *p_srec;
+
+    /* number of pages we're dealing with */
+    unsigned long nr_pfns;
 
+    /* bitmap of pages left to send */
+    unsigned long *to_send;
 
     if ( mlock(&ctxt, sizeof(ctxt) ) )
     {
@@ -129,21 +145,24 @@ int xc_linux_save(int xc_handle,
             goto out;
         }
 
-        sleep(1);
+        usleep(1000); // 1ms
+       printf("Sleep for 1ms\n");
     }
 
+#if 1
     /* A cheesy test to see whether the domain contains valid state. */
     if ( ctxt.pt_base == 0 )
     {
         ERROR("Domain is not in a valid Linux guest OS state");
         goto out;
     }
+#endif
 
 
     /* Map the suspend-record MFN to pin it. The page must be owned by 
        domid for this to succeed. */
     p_srec = mfn_mapper_map_single(xc_handle, domid,
-                                sizeof(srec), PROT_READ, 
+                                sizeof(*p_srec), PROT_READ, 
                                 ctxt.cpu_ctxt.esi );
 
     if (!p_srec)
@@ -152,10 +171,10 @@ int xc_linux_save(int xc_handle,
         goto out;
     }
 
-    memcpy( &srec, p_srec, sizeof(srec) );
+    nr_pfns = p_srec->nr_pfns;
 
     /* cheesy sanity check */
-    if ( srec.nr_pfns > 1024*1024 )
+    if ( nr_pfns > 1024*1024 )
     {
         ERROR("Invalid state record -- pfn count out of range");
         goto out;
@@ -165,55 +184,13 @@ int xc_linux_save(int xc_handle,
     live_pfn_to_mfn_frame_list = 
        mfn_mapper_map_single(xc_handle, domid, 
                              PAGE_SIZE, PROT_READ, 
-                             srec.pfn_to_mfn_frame_list );
+                             p_srec->pfn_to_mfn_frame_list );
 
     if (!live_pfn_to_mfn_frame_list)
     {
         ERROR("Couldn't map pfn_to_mfn_frame_list");
         goto out;
     }
-   
-
-    if ( (mapper_handle1 = mfn_mapper_init(xc_handle, domid,
-                                          1024*1024, PROT_READ )) 
-        == NULL )
-        goto out;
-       
-    for ( i = 0; i < (srec.nr_pfns+1023)/1024; i++ )
-    {
-       /* Grab a copy of the pfn-to-mfn table frame list. 
-        This has the effect of preventing the page from being freed and
-        given to another domain. (though the domain is stopped anyway...) */
-       mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT, 
-                               live_pfn_to_mfn_frame_list[i],
-                               PAGE_SIZE );
-    }
-    
-    if ( mfn_mapper_flush_queue(mapper_handle1) )
-    {
-        ERROR("Couldn't map pfn_to_mfn table");
-        goto out;
-    }
-
-    live_pfn_to_mfn_table = mfn_mapper_base( mapper_handle1 );
-
-
-
-    /* We want zeroed memory so use calloc rather than malloc. */
-    pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
-
-    if ( (pfn_type == NULL) )
-    {
-        errno = ENOMEM;
-        goto out;
-    }
-
-    if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) )
-    {
-       ERROR("Unable to mlock");
-       goto out;
-    }
-
 
     /* Track the mfn_to_pfn table down from the domains PT */
     {
@@ -233,49 +210,106 @@ int xc_linux_save(int xc_handle,
                                  mfn_to_pfn_table_start_mfn );
     }
 
+    /* Map all the frames of the pfn->mfn table. For migrate to succeed, 
+       the guest must not change which frames are used for this purpose. 
+       (its not clear why it would want to change them, and we'll be OK
+       from a safety POV anyhow. */
 
-    /*
-     * Quick belt and braces sanity check.
-     */
+    live_pfn_to_mfn_table = mfn_mapper_map_batch( xc_handle, domid, 
+                                                 PROT_READ,
+                                                 live_pfn_to_mfn_frame_list,
+                                                 (nr_pfns+1023)/1024 );  
+    if( !live_pfn_to_mfn_table )
+    {
+        PERROR("Couldn't map pfn_to_mfn table");
+        goto out;
+    }
+
+    for(i=0;i<(nr_pfns+1023)/1024 ;i++)
+       printf("LF: %d %x\n",i,live_pfn_to_mfn_frame_list[i]);
 
-    for ( i = 0; i < srec.nr_pfns; i++ )
+
+    /* At this point, we can start the domain again if we're doign a
+       live suspend */
+
+    if( live )
+    { 
+#if 1
+       if ( xc_shadow_control( xc_handle, domid, 
+                          DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
+                          NULL, 0 ) < 0 )
+       {
+           ERROR("Couldn't enable shadow mode");
+           goto out;
+       }
+#endif 
+       if ( xc_domain_start( xc_handle, domid ) < 0 )
+       {
+           ERROR("Couldn't restart domain");
+           goto out;
+       }
+//exit(-1);
+       last_iter = 0;
+       sent_last_iter = 1<<20; // 4GB's worth of pages
+       max_iters = 8; // limit us to 9 time round loop
+    }
+    else
+       last_iter = 1;
+
+
+    /* Setup to_send bitmap */
     {
-        mfn = live_pfn_to_mfn_table[i];
+       int sz = (nr_pfns/8) + 8; // includes slop at end of array
+       
+       to_send = malloc( sz );
 
-       if( live_mfn_to_pfn_table[mfn] != i )
-           printf("i=%d mfn=%d live_mfn_to_pfn_table=%d\n",
-                  i,mfn,live_mfn_to_pfn_table[mfn]);
+       if (!to_send)
+       {
+           ERROR("Couldn't allocate to_send array");
+           goto out;
+       }
+       memset( to_send, 0xff, sz );
+
+       if ( mlock( to_send, sz ) )
+       {
+           PERROR("Unable to mlock to_send");
+           return 1;
+       }
     }
 
 
-    /* Canonicalise the suspend-record frame number. */
-    if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) )
+    /* We want zeroed memory so use calloc rather than malloc. */
+    pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
+
+    if ( (pfn_type == NULL) )
     {
-        ERROR("State record is not in range of pseudophys map");
+        errno = ENOMEM;
         goto out;
     }
 
-    /* Canonicalise each GDT frame number. */
-    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
+    if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) )
     {
-        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) )
-        {
-            ERROR("GDT frame is not in range of pseudophys map");
-            goto out;
-        }
+       ERROR("Unable to mlock");
+       goto out;
     }
 
-    /* Canonicalise the page table base pointer. */
-    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) )
+
+    /*
+     * Quick belt and braces sanity check.
+     */
+
+    for ( i = 0; i < nr_pfns; i++ )
     {
-        ERROR("PT base is not in range of pseudophys map");
-        goto out;
+        mfn = live_pfn_to_mfn_table[i];
+
+       if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0x80000004) )
+           printf("i=0x%x mfn=%x live_mfn_to_pfn_table=%x\n",
+                  i,mfn,live_mfn_to_pfn_table[mfn]);
     }
-    ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT;
 
     /* Canonicalise the pfn-to-mfn table frame-number list. */
     memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
-    for ( i = 0; i < srec.nr_pfns; i += 1024 )
+    for ( i = 0; i < nr_pfns; i += 1024 )
     {
         if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) )
         {
@@ -284,7 +318,7 @@ int xc_linux_save(int xc_handle,
         }
     }
 
-    /* Start writing out the saved-domain record. */
+    /* Map the shared info frame */
     live_shinfo = mfn_mapper_map_single(xc_handle, domid,
                                        PAGE_SIZE, PROT_READ,
                                        shared_info_frame);
@@ -295,164 +329,290 @@ int xc_linux_save(int xc_handle,
         goto out;
     }
 
+    /* Start writing out the saved-domain record. */
+
     if ( (*writerfn)(writerst, "LinuxGuestRecord",    16) ||
          (*writerfn)(writerst, name,                  sizeof(name)) ||
-         (*writerfn)(writerst, &srec.nr_pfns,         sizeof(unsigned long)) ||
-         (*writerfn)(writerst, &ctxt,                 sizeof(ctxt)) ||
-         (*writerfn)(writerst, live_shinfo,           PAGE_SIZE) ||
+         (*writerfn)(writerst, &nr_pfns,              sizeof(unsigned long)) ||
          (*writerfn)(writerst, pfn_to_mfn_frame_list, PAGE_SIZE) )
     {
         ERROR("Error when writing to state file (1)");
         goto out;
     }
-    munmap(live_shinfo, PAGE_SIZE);
-
-    verbose_printf("Saving memory pages:   0%%");
-
-    if ( (mapper_handle2 = mfn_mapper_init(xc_handle, domid,
-                                          BATCH_SIZE*4096, PROT_READ )) 
-        == NULL )
-        goto out;
-
-    region_base = mfn_mapper_base( mapper_handle2 );
 
     /* Now write out each data page, canonicalising page tables as we go... */
-    prev_pc = 0;
-    for ( n = 0; n < srec.nr_pfns; )
+
+    while(1)
     {
-        this_pc = (n * 100) / srec.nr_pfns;
-        if ( (this_pc - prev_pc) >= 5 )
-        {
-            verbose_printf("\b\b\b\b%3d%%", this_pc);
-            prev_pc = this_pc;
-        }
+       unsigned int prev_pc, batch, sent_this_iter;
 
-       for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
-       {               
-           pfn_type[j] = live_pfn_to_mfn_table[i];
-       }
+       iter++;
 
+       sent_this_iter = 0;
+       prev_pc = 0;
+       verbose_printf("Saving memory pages: iter %d   0%%", iter);
 
-       for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
+       n=0;
+       while( n < nr_pfns )
        {
-           /* queue up mappings for all of the pages in this batch */
+           unsigned int this_pc = (n * 100) / nr_pfns;
+           if ( (this_pc - prev_pc) >= 5 )
+           {
+               verbose_printf("\b\b\b\b%3d%%", this_pc);
+               prev_pc = this_pc;
+           }
 
-//printf("region n=%d j=%d i=%d mfn=%d\n",n,j,i,live_pfn_to_mfn_table[i]);
-           mfn_mapper_queue_entry( mapper_handle2, j<<PAGE_SHIFT, 
-                                   live_pfn_to_mfn_table[i],
-                                   PAGE_SIZE );
-       }
 
-       if( mfn_mapper_flush_queue(mapper_handle2) )
-       {
-           ERROR("Couldn't map page region");
-           goto out;
-       }
+           /* load pfn_type[] with the mfn of all the pages we're doing in
+              this batch. */
 
-       if ( get_pfn_type_batch(xc_handle, domid, j, pfn_type) )
-       {
-           ERROR("get_pfn_type_batch failed");
-           goto out;
-       }
-       
-       for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
-       {
-           if((pfn_type[j]>>29) == 7)
+           for( batch = 0; batch < BATCH_SIZE && n < nr_pfns ; n++ )
            {
-               ERROR("bogus page");
-               goto out;
-           }
+               if ( !test_bit(n, to_send ) ) continue;
 
-           /* canonicalise mfn->pfn */
-           pfn_type[j] = (pfn_type[j] & PGT_type_mask) |
-               live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
-           
-/*         if(pfn_type[j]>>29)
-                   printf("i=%d type=%d\n",i,pfn_type[i]);    */
-       }
+               pfn_type[batch] = live_pfn_to_mfn_table[n];
 
+               if( pfn_type[batch] == 0x80000004 )
+               {
+                   //printf("Skip netbuf pfn %lx. mfn %lx\n",n,pfn_type[batch]);
+                   continue;
+               }
 
-       if ( (*writerfn)(writerst, &j, sizeof(int) ) )
-       {
-           ERROR("Error when writing to state file (2)");
-           goto out;
-       }
+//if(iter>1) printf("pfn=%x mfn=%x\n",n,pfn_type[batch]);
+               
+               batch++;
+           }
 
-       if ( (*writerfn)(writerst, pfn_type, sizeof(unsigned long)*j ) )
-       {
-           ERROR("Error when writing to state file (3)");
-           goto out;
-       }
+           for( j = 0; j < batch; j++ )
+           {
 
+               if( (pfn_type[j] &0xfffff) == 0x0000004 )
+               {
+                   printf("XXXXXXXXSkip netbuf entry %d mfn %lx\n",j,pfn_type[j]);
+               }
 
-       for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
-       {
-           /* write out pages in batch */
+               
+           }
 
-           if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) || 
-                ((pfn_type[j] & PGT_type_mask) == L2TAB) )
+           
+           printf("batch %d:%d (n=%d)\n",iter,batch,n);
+
+           if(batch == 0) goto skip; // vanishingly unlikely...
+           
+           if ( (region_base = mfn_mapper_map_batch( xc_handle, domid, 
+                                                     PROT_READ,
+                                                     pfn_type,
+                                                     batch )) == 0)
+           {
+               PERROR("map batch failed");
+               goto out;
+           }
+           
+           if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) )
            {
+               ERROR("get_pfn_type_batch failed");
+               goto out;
+           }
+           
+           for( j = 0; j < batch; j++ )
+           {
+               if((pfn_type[j]>>29) == 7)
+               {
+                   //printf("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
+                   continue;
+               }
+//if((pfn_type[j] & PGT_type_mask) == L2TAB) printf("L2 pfn=%08lx mfn=%lx\n",pfn_type[j],live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]);
                
-               memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
+               /* canonicalise mfn->pfn */
+               pfn_type[j] = (pfn_type[j] & PGT_type_mask) |
+                   live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
+           }
+           
+           
+           if ( (*writerfn)(writerst, &batch, sizeof(int) ) )
+           {
+               ERROR("Error when writing to state file (2)");
+               goto out;
+           }
 
-               for ( k = 0; 
-                     k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ? 
-                          (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); 
-                     k++ )
+           if ( (*writerfn)(writerst, pfn_type, sizeof(unsigned long)*j ) )
+           {
+               ERROR("Error when writing to state file (3)");
+               goto out;
+           }
+           
+           /* entering this loop, pfn_type is now in pfns (Not mfns) */
+           for( j = 0; j < batch; j++ )
+           {
+               /* write out pages in batch */
+               
+               if((pfn_type[j]>>29) == 7)
                {
-                   if ( !(page[k] & _PAGE_PRESENT) ) continue;
-                   mfn = page[k] >> PAGE_SHIFT;                    
-
-                   if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+                   //printf("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
+                   continue;
+               }
+               
+               if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) || 
+                    ((pfn_type[j] & PGT_type_mask) == L2TAB) )
+               {
+                   
+                   memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
+                   
+                   for ( k = 0; 
+                         k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ? 
+                      (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024); 
+                         k++ )
                    {
-                       ERROR("Frame number in pagetable page is invalid");
+                       unsigned long pfn;
+
+                       if ( !(page[k] & _PAGE_PRESENT) ) continue;
+                       mfn = page[k] >> PAGE_SHIFT;                
+                       pfn = live_mfn_to_pfn_table[mfn];
+
+                       if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+                       {
+                           printf("FNI %d : [%08lx,%d] pte=%08lx, mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
+                                  j, pfn_type[j], k,
+                                  page[k], mfn, live_mfn_to_pfn_table[mfn],
+                                  (live_mfn_to_pfn_table[mfn]<nr_pfns)? 
+                               live_pfn_to_mfn_table[live_mfn_to_pfn_table[mfn]]: 0xdeadbeef);
+                           pfn = 0; // be suspicious
+                           
+//                         ERROR("Frame number in pagetable page is invalid");
+//                         goto out;
+
+
+                       }
+                       page[k] &= PAGE_SIZE - 1;
+                       page[k] |= pfn << PAGE_SHIFT;
+                       
+                       /*
+                         printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n",
+                         pfn_type[j]>>29,
+                         j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
+                         */
+                       
+                   } /* end of page table rewrite for loop */
+                   
+                   if ( (*writerfn)(writerst, page, PAGE_SIZE) )
+                   {
+                       ERROR("Error when writing to state file (4)");
+                       goto out;
+                   }
+                   
+               }  /* end of it's a PT page */
+               else
+               {  /* normal page */
+                   if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) )
+                   {
+                       ERROR("Error when writing to state file (5)");
                        goto out;
                    }
-                   page[k] &= PAGE_SIZE - 1;
-                   page[k] |= live_mfn_to_pfn_table[mfn] << PAGE_SHIFT;
-
-                   /*
-                   printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n",
-                          pfn_type[j]>>29,
-                          j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
-                          */
-
                }
+           } /* end of the write out for this batch */
+           
+           sent_this_iter += batch;
 
-               if ( (*writerfn)(writerst, page, PAGE_SIZE) )
-               {
-                   ERROR("Error when writing to state file (4)");
-                   goto out;
-               }
+       } /* end of this while loop for this iteration */
 
+       munmap(region_base, batch*PAGE_SIZE);
+
+    skip: 
+       
+       verbose_printf("\b\b\b\b100%% (%d pages)\n", sent_this_iter );
+       
+       if ( last_iter )
+           break;
 
+       if ( live )
+       {
+           if ( sent_this_iter < (sent_last_iter * 0.95) && iter < max_iters )
+           {
+               // we seem to be doing OK, keep going
            }
            else
            {
-               if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) )
-               {
-                   ERROR("Error when writing to state file (5)");
-                   goto out;
-               }
+               printf("Start last iteration\n");
+               last_iter = 1;
+
+               xc_domain_stop_sync( xc_handle, domid );
+
+           } 
+
+           if ( xc_shadow_control( xc_handle, domid, 
+                                   DOM0_SHADOW_CONTROL_OP_CLEAN,
+                                   to_send, nr_pfns ) != nr_pfns ) 
+           {
+               ERROR("Error flushing shadow PT");
+               goto out;
            }
+
+#if 0
+           if(last_iter) memset(to_send, 0xff, (nr_pfns+7)/8 );
+#endif
+
+           sent_last_iter = sent_this_iter;
        }
-       
-       n+=j; /* i is the master loop counter */
-    }
 
-    verbose_printf("\b\b\b\b100%%\nMemory saved.\n");
+
+    } /* end of while 1 */
+
+printf("All memory is saved\n");
 
     /* Success! */
     rc = 0;
-
+    
     /* Zero terminate */
     if ( (*writerfn)(writerst, &rc, sizeof(int)) )
     {
        ERROR("Error when writing to state file (6)");
        goto out;
     }
-    
 
+    /* Get the final execution context */
+    op.cmd = DOM0_GETDOMAININFO;
+    op.u.getdomaininfo.domain = (domid_t)domid;
+    op.u.getdomaininfo.ctxt = &ctxt;
+    if ( (do_dom0_op(xc_handle, &op) < 0) || 
+        ((u64)op.u.getdomaininfo.domain != domid) )
+    {
+       PERROR("Could not get info on domain");
+       goto out;
+    }
+printf("A\n");    
+    /* Canonicalise the suspend-record frame number. */
+    if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) )
+    {
+        ERROR("State record is not in range of pseudophys map");
+        goto out;
+    }
+printf("B\n");    
+    /* Canonicalise each GDT frame number. */
+    for ( i = 0; i < ctxt.gdt_ents; i += 512 )
+    {
+        if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) )
+        {
+            ERROR("GDT frame is not in range of pseudophys map");
+            goto out;
+        }
+    }
+printf("C\n");    
+    /* Canonicalise the page table base pointer. */
+    if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) )
+    {
+        ERROR("PT base is not in range of pseudophys map");
+        goto out;
+    }
+    ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT;
+printf("D\n");    
+    if ( (*writerfn)(writerst, &ctxt,                 sizeof(ctxt)) ||
+         (*writerfn)(writerst, live_shinfo,           PAGE_SIZE) )
+    {
+        ERROR("Error when writing to state file (1)");
+        goto out;
+    }
+    munmap(live_shinfo, PAGE_SIZE);
+printf("E\n");        
 out:
     /* Restart the domain if we had to stop it to save its state. */
     if ( we_stopped_it )
index d137176ca80e32a158a304612aa82c93011a326b..47931f28ec311613792fec4159c4d7050860747a 100644 (file)
@@ -47,6 +47,31 @@ void unmap_pfn(int pm_handle, void *vaddr)
 
 /*******************/
 
+void * mfn_mapper_map_batch(int xc_handle, domid_t dom, int prot,
+                           unsigned long *arr, int num )
+{
+    privcmd_mmapbatch_t ioctlx; 
+    void *addr;
+    addr = mmap( NULL, num*PAGE_SIZE, prot, MAP_SHARED, xc_handle, 0 );
+    if (addr)
+    {
+       ioctlx.num=num;
+       ioctlx.dom=dom;
+       ioctlx.addr=(unsigned long)addr;
+       ioctlx.arr=arr;
+       if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAPBATCH, &ioctlx ) <0 )
+       {
+           perror("XXXXXXXX");
+           munmap(addr, num*PAGE_SIZE);
+           return 0;
+       }
+    }
+    return addr;
+
+}
+
+/*******************/
+
 void * mfn_mapper_map_single(int xc_handle, domid_t dom,
                             int size, int prot,
                             unsigned long mfn )
@@ -64,7 +89,10 @@ void * mfn_mapper_map_single(int xc_handle, domid_t dom,
        entry.mfn=mfn;
        entry.npages=(size+PAGE_SIZE-1)>>PAGE_SHIFT;
        if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx ) <0 )
+       {
+           munmap(addr, size);
            return 0;
+       }
     }
     return addr;
 }
@@ -295,7 +323,7 @@ static int flush_mmu_updates(int xc_handle, mmu_t *mmu)
 
     hypercall.op     = __HYPERVISOR_mmu_update;
     hypercall.arg[0] = (unsigned long)mmu->updates;
-    hypercall.arg[1] = (unsigned long)mmu->idx;
+    hypercall.arg[1] = (unsigned long)&(mmu->idx);
 
     if ( mlock(mmu->updates, sizeof(mmu->updates)) != 0 )
     {
@@ -342,3 +370,47 @@ int finish_mmu_updates(int xc_handle, mmu_t *mmu)
 {
     return flush_mmu_updates(xc_handle, mmu);
 }
+
+
+/***********************************************************/
+
+/* this function is a hack until we get proper synchronous domain stop */
+
+int xc_domain_stop_sync( int xc_handle, domid_t domid )
+{
+    dom0_op_t op;
+
+    while (1)
+    {
+        op.cmd = DOM0_STOPDOMAIN;
+        op.u.stopdomain.domain = (domid_t)domid;
+        if ( do_dom0_op(xc_handle, &op) != 0 )
+        {
+            PERROR("Stopping target domain failed");
+            goto out;
+        }
+
+        usleep(1000); // 1ms
+       printf("Sleep for 1ms\n");
+
+        op.cmd = DOM0_GETDOMAININFO;
+        op.u.getdomaininfo.domain = (domid_t)domid;
+        op.u.getdomaininfo.ctxt = NULL;
+        if ( (do_dom0_op(xc_handle, &op) < 0) || 
+             ((u64)op.u.getdomaininfo.domain != domid) )
+        {
+            PERROR("Could not get info on domain");
+            goto out;
+        }
+
+        if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED )
+       {
+           printf("Domain %lld stopped\n",domid);
+            return 0;
+       }
+
+    }
+
+out:
+    return -1;    
+}
index 3a2e3ea9f164aa5c226511c8da1ebab16ea519b9..e3eff85e598269a14f04621ee1d45fb718deffd8 100644 (file)
@@ -232,6 +232,9 @@ typedef struct mfn_mapper {
 void * mfn_mapper_map_single(int xc_handle, domid_t dom, int size, int prot, 
                             unsigned long mfn );
 
+void * mfn_mapper_map_batch(int xc_handle, domid_t dom, int prot,
+                           unsigned long *arr, int num );
+
 mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot);
 
 void * mfn_mapper_base(mfn_mapper_t *t);
@@ -245,5 +248,6 @@ void * mfn_mapper_queue_entry(mfn_mapper_t *t, int offset,
 
 /*********************/
 
+int xc_domain_stop_sync( int xc_handle, domid_t dom );
 
 #endif /* __XC_PRIVATE_H__ */
index 929e9f31045d5ef4bd86346e00d3bf13508f862b..97bff12492ee94679c685dc66259ac7458008b6b 100644 (file)
@@ -190,16 +190,17 @@ static PyObject *pyxc_linux_save(PyObject *self,
 
     u64   dom;
     char *state_file;
-    int   progress = 1;
+    int   progress = 1, live = 0;
     unsigned int flags = 0;
 
-    static char *kwd_list[] = { "dom", "state_file", "progress", NULL };
+    static char *kwd_list[] = { "dom", "state_file", "progress", "live", NULL };
 
-    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|i", kwd_list, 
-                                      &dom, &state_file, &progress) )
+    if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|ii", kwd_list, 
+                                      &dom, &state_file, &progress, &live) )
         return NULL;
 
     if (progress) flags |= XCFLAGS_VERBOSE;
+    if (live)     flags |= XCFLAGS_LIVE;
 
     if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0)
     {
@@ -1273,7 +1274,7 @@ static PyObject *pyxc_shadow_control(PyObject *self,
                                       &dom, &op) )
         return NULL;
 
-    if ( xc_shadow_control(xc->xc_handle, dom, op) != 0 )
+    if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0) < 0 )
         return PyErr_SetFromErrno(xc_error);
     
     Py_INCREF(zero);
index 297976e9bec8d3e6b926e293d30c2cd892668761..441b62f15358f1ea5d80d65a4a66a67289be82af 100644 (file)
@@ -723,6 +723,11 @@ static PyObject *xu_port_new(PyObject *self, PyObject *args)
         goto fail4;
     }
 
+    xup->interface->tx_resp_prod = 0;
+    xup->interface->rx_req_prod  = 0;
+    xup->interface->tx_req_prod = 0;
+    xup->interface->rx_resp_prod = 0;
+
     xup->tx_req_cons  = 0;
     xup->tx_resp_prod = 0;
     xup->rx_req_prod  = 0;
index 9370a61a8d60d552f8fa27a3ff2074af4c639b56..dee7552bdd1d5a2445f4907e304516cfe34ebe37 100644 (file)
@@ -525,10 +525,10 @@ long do_dom0_op(dom0_op_t *u_dom0_op)
        p = find_domain_by_id( op->u.shadow_control.domain );
        if ( p )
        {
-            ret = shadow_mode_control(p, op->u.shadow_control.op );
+            ret = shadow_mode_control(p, &op->u.shadow_control );
            put_task_struct(p);
-        }
-       
+           copy_to_user(u_dom0_op, op, sizeof(*op));
+        }      
     }
     break;
 
index a9c40ae98f22451621ac31c55722cd79bbf7941f..b9e8150bfb45ee6751e5d2418ef22011fe80bec5 100644 (file)
@@ -89,9 +89,15 @@ struct task_struct *do_createdomain(domid_t dom_id, unsigned int cpu)
         memset(p->shared_info, 0, PAGE_SIZE);
         SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
         
+       machine_to_phys_mapping[virt_to_phys(p->shared_info) >> PAGE_SHIFT] =
+           0x80000000UL;  // set m2p table to magic marker (helps debug)
+
         p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
         memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
         
+       machine_to_phys_mapping[virt_to_phys(p->mm.perdomain_pt) >> PAGE_SHIFT] =
+           0x0fffdeadUL;  // set m2p table to magic marker (helps debug)
+
         init_blkdev_info(p);
         
         /* Per-domain PCI-device list. */
@@ -486,6 +492,7 @@ void free_all_dom_mem(struct task_struct *p)
 unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
 {
     unsigned int alloc_pfns, nr_pages;
+    struct pfn_info *page;
 
     nr_pages = (kbytes + ((PAGE_SIZE-1)>>10)) >> (PAGE_SHIFT - 10);
     p->max_pages = nr_pages; /* this can now be controlled independently */
@@ -493,13 +500,16 @@ unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
     /* grow the allocation if necessary */
     for ( alloc_pfns = p->tot_pages; alloc_pfns < nr_pages; alloc_pfns++ )
     {
-        if ( unlikely(alloc_domain_page(p) == NULL) ||
+        if ( unlikely((page=alloc_domain_page(p)) == NULL) ||
              unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >> 
                                    (PAGE_SHIFT-10))) )
         {
             free_all_dom_mem(p);
             return -ENOMEM;
         }
+
+       /* initialise the machine_to_phys_mapping table entry to the likely pfn */
+       machine_to_phys_mapping[page-frame_table] = alloc_pfns;
     }
 
     p->tot_pages = nr_pages;
index ed2e5b6e170df202f7ae34f92e4bdfade17d77df..243875f22edd14ae57bb779358acd6ca3407032a 100644 (file)
@@ -213,7 +213,12 @@ void __init init_frametable(unsigned long nr_pages)
        belonging to the machine_to_phys_mapping to CPU0 idle task */
     
     mfn = virt_to_phys((void *)RDWR_MPT_VIRT_START)>>PAGE_SHIFT;
-//    for(i=0;i<nr_pages;i+=1024,mfn++)
+
+    /* initialise to a magic of 0x55555555 so easier to spot bugs later */
+    memset( machine_to_phys_mapping, 0x55, 4*1024*1024 );
+
+    /* The array is sized for a 4GB machine regardless of actual mem size. 
+       This costs 4MB -- may want to fix some day */
     for(i=0;i<1024*1024;i+=1024,mfn++)
     {
        frame_table[mfn].count_and_flags = 1 | PGC_allocated;
@@ -325,7 +330,7 @@ static int get_page_from_pagenr(unsigned long page_nr, struct task_struct *p)
 
     if ( unlikely(!get_page(page, p)) )
     {
-        MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr);
+        MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
         return 0;
     }
 
@@ -944,8 +949,9 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
 }
 
 
-int do_mmu_update(mmu_update_t *ureqs, int count)
+int do_mmu_update(mmu_update_t *ureqs, int * p_count)
 {
+    int count;
     mmu_update_t req;
     unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
     struct pfn_info *page;
@@ -954,6 +960,11 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
     unsigned long prev_spfn = 0;
     l1_pgentry_t *prev_spl1e = 0;
 
+    if ( unlikely( get_user(count, p_count) ) )
+    {
+       return -EFAULT;
+    }
+
     perfc_incrc(calls_to_mmu_update); 
     perfc_addc(num_page_updates, count);
 
@@ -1110,6 +1121,9 @@ int do_mmu_update(mmu_update_t *ureqs, int count)
         percpu_info[cpu].gps = percpu_info[cpu].pts = NULL;
     }
 
+    if ( unlikely(rc) )
+       put_user( count, p_count );
+
     return rc;
 }
 
index 2f9051d9e56c4c5533cb3707b89d4f18c449b1aa..befc929474c47a6a143f02e0ead42413e78f4cfd 100644 (file)
@@ -111,6 +111,9 @@ net_vif_t *create_net_vif(domid_t dom)
     clear_page(new_ring);
     SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
 
+    machine_to_phys_mapping[virt_to_phys(new_ring)>>PAGE_SHIFT] = 
+       0x80000001; // magic value aids debugging
+
     /*
      * Fill in the new vif struct. Note that, while the vif's refcnt is
      * non-zero, we hold a reference to the task structure.
index 1144c0e65e3fd3105b974afb1d261483c024eedc..fe142e3ee9fbc87f0a7ff7bf9dbf72495f3b9d7e 100644 (file)
@@ -123,6 +123,7 @@ static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
     }
     return work;
 }
+
 static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
 {
     int j, work=0;
@@ -150,7 +151,7 @@ static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
         }
         shadow_audit(m,0);
     }
-    SH_LOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+    SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
 }
 
 
@@ -160,7 +161,6 @@ int shadow_mode_enable( struct task_struct *p, unsigned int mode )
     struct shadow_status **fptr;
     int i;
 
-
     spin_lock_init(&m->shadow_lock);
     spin_lock(&m->shadow_lock);
 
@@ -217,7 +217,6 @@ int shadow_mode_enable( struct task_struct *p, unsigned int mode )
 
     // call shadow_mk_pagetable
     shadow_mk_pagetable( m );
-
     return 0;
 
  nomem:
@@ -260,9 +259,12 @@ void shadow_mode_disable( struct task_struct *p )
     kfree( &m->shadow_ht[0] );
 }
 
-static void shadow_mode_table_op( struct task_struct *p, unsigned int op )
+static int shadow_mode_table_op( struct task_struct *p, 
+                                                                 dom0_shadow_control_t *sc )
 {
+       unsigned int op = sc->op;
     struct mm_struct *m = &p->mm;
+       int rc = 0;
 
     // since Dom0 did the hypercall, we should be running with it's page
     // tables right now. Calling flush on yourself would be really
@@ -271,13 +273,13 @@ static void shadow_mode_table_op( struct task_struct *p, unsigned int op )
     if ( m == &current->mm )
     {
         printk("Don't try and flush your own page tables!\n");
-        return;
+        return -EINVAL;
     }
    
 
     spin_lock(&m->shadow_lock);
 
-    SH_LOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
+    SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
 
     shadow_audit(m,1);
 
@@ -288,27 +290,60 @@ static void shadow_mode_table_op( struct task_struct *p, unsigned int op )
         break;
    
     case DOM0_SHADOW_CONTROL_OP_CLEAN:
-        __scan_shadow_table( m, op );
-        // we used to bzero dirty bitmap here, but now leave this to user space
-        // if we were double buffering we'd do the flip here
+       {
+               int i;
+
+           __scan_shadow_table( m, op );
+
+           if( p->tot_pages > sc->pages || 
+                       !sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap )
+           {
+                       rc = -EINVAL;
+                       goto out;
+           }
+           
+           sc->pages = p->tot_pages;
+          
+#define chunk (8*1024) // do this in 1KB chunks for L1 cache
+
+           for(i=0;i<p->tot_pages;i+=chunk)
+           {
+                       int bytes = ((  ((p->tot_pages-i) > (chunk))?
+                               (chunk):(p->tot_pages-i) ) + 7) / 8;
+
+                       copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+                                                 p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                                 bytes );
+
+                       memset( p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+                                  0, bytes);
+               }
+
         break;
+       }
     }
 
+
+out:
+
     spin_unlock(&m->shadow_lock);
 
-    SH_LOG("shadow mode table op : page count %d", m->shadow_page_count);
+    SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
 
     shadow_audit(m,1);
 
     // call shadow_mk_pagetable
     shadow_mk_pagetable( m );
 
+       return rc;
 }
 
 
-int shadow_mode_control( struct task_struct *p, unsigned int op )
+int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc )
 {
     int  we_paused = 0;
+       unsigned int cmd = sc->op;
+       int rc = 0;
  
     // don't call if already shadowed...
 
@@ -321,18 +356,23 @@ int shadow_mode_control( struct task_struct *p, unsigned int op )
         we_paused = 1;
     }
 
-    if ( p->mm.shadow_mode && op == DOM0_SHADOW_CONTROL_OP_OFF )
+    if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
     {
         shadow_mode_disable(p);
     }
-    else if ( op == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
+    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
     {
         if(p->mm.shadow_mode) shadow_mode_disable(p);
         shadow_mode_enable(p, SHM_test);
     } 
-    else if ( p->mm.shadow_mode && op >= DOM0_SHADOW_CONTROL_OP_FLUSH && op<=DOM0_SHADOW_CONTROL_OP_CLEAN )
+    else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
     {
-        shadow_mode_table_op(p, op);
+        if(p->mm.shadow_mode) shadow_mode_disable(p);
+        shadow_mode_enable(p, SHM_logdirty);
+    } 
+    else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN )
+    {
+        rc = shadow_mode_table_op(p, sc);
     }
     else
     {
@@ -341,7 +381,7 @@ int shadow_mode_control( struct task_struct *p, unsigned int op )
     }
 
     if ( we_paused ) wake_up(p);
-    return 0;
+    return rc;
 }
 
 
index f44902b1c9a08e248d0e278d91e03993d6440e3a..6901262cb8fd14b8a711d02e9ff773946a473e00 100644 (file)
@@ -19,6 +19,7 @@
 #include <xen/interrupt.h>
 #include <xen/vbd.h>
 #include <xen/slab.h>
+#include <xen/shadow.h>
 
 /*
  * These are rather arbitrary. They are fairly large because adjacent requests
@@ -358,9 +359,18 @@ static void unlock_buffer(unsigned long buffer,
           pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
           pfn++ )
     {
+
+       /* Find the domain from the frame_table. Yuk... */
+       struct task_struct *p = frame_table[pfn].u.domain;
+
+       if( p->mm.shadow_mode == SHM_logdirty )
+           mark_dirty( &p->mm, pfn );  
+
+
         if ( writeable_buffer )
             put_page_type(&frame_table[pfn]);
         put_page(&frame_table[pfn]);
+
     }
 }
 
@@ -597,6 +607,10 @@ void init_blkdev_info(struct task_struct *p)
     p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
     clear_page(p->blk_ring_base);
     SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
+
+    machine_to_phys_mapping[virt_to_phys(p->blk_ring_base)>>PAGE_SHIFT] =
+       0x80000002; // magic value aids debugging
+
     p->blkdev_list.next = NULL;
     spin_lock_init(&p->vbd_lock);
 }
index 2968e2e4e9dfb9806a0b1caac40ac72b04e210d5..c16d4760163d21a3bc9a5cdf85ccfef0b270fc62 100644 (file)
@@ -449,7 +449,7 @@ struct mm_struct {
     struct shadow_status *shadow_ht;
     struct shadow_status *shadow_ht_free;
     struct shadow_status *shadow_ht_extras; /* extra allocation units */
-    unsigned int *shadow_dirty_bitmap;
+    unsigned long *shadow_dirty_bitmap;
     unsigned int shadow_dirty_bitmap_size;  /* in pages, bit per page */
     unsigned int shadow_page_count;
     unsigned int shadow_max_page_count;
index 024e75ad8304ad9ebc95cba06724c7767aba70d4..0027e9df29deba92cd40cdec75514947a0912aa1 100644 (file)
@@ -243,6 +243,9 @@ typedef struct dom0_shadow_control_st
     /* IN variables. */
     domid_t      domain;
     int          op;
+    unsigned long  *dirty_bitmap; // pointer to mlocked buffer
+    /* IN/OUT variables */
+    unsigned long  pages;  // size of buffer, updated with actual size
 } dom0_shadow_control_t;
 
 #define DOM0_SETDOMAINNAME     26
index 628d20c17eff9f898001b6f2b859569a594003c9..c132ad9662a4064e6bd84e3eb49b3bb852c945b7 100644 (file)
@@ -164,8 +164,8 @@ static inline int get_page(struct pfn_info *page,
              unlikely(x & PGC_zombie) ||             /* Zombie? */
              unlikely(p != domain) )                 /* Wrong owner? */
         {
-            DPRINTK("Error pfn %08lx: ed=%p(%lld), sd=%p(%lld), caf=%08x\n",
-                    page_to_pfn(page), domain, (domain)?domain->domain:1234, p, (p)?p->domain:1234, x);
+            DPRINTK("Error pfn %08lx: ed=%p(%lld), sd=%p(%lld), caf=%08x, taf=%08x\n",
+                    page_to_pfn(page), domain, (domain)?domain->domain:999, p, (p && !((x & PGC_count_mask) == 0))?p->domain:999, x, page->type_and_flags);
             return 0;
         }
         __asm__ __volatile__(
@@ -314,7 +314,7 @@ int check_descriptor(unsigned long a, unsigned long b);
 #define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START)
 
 /* Part of the domain API. */
-int do_mmu_update(mmu_update_t *updates, int count);
+int do_mmu_update(mmu_update_t *updates, int *count);
 
 #define DEFAULT_GDT_ENTRIES     ((LAST_RESERVED_GDT_ENTRY*8)+7)
 #define DEFAULT_GDT_ADDRESS     ((unsigned long)gdt_table)
index fba6fe3dfdf99d0677c454a44a1022e2393b0578..01b46301aa49b23ee20fde59ac6cf56fcad66265 100644 (file)
@@ -23,7 +23,7 @@
 #define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
 #define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
 
-extern int shadow_mode_control( struct task_struct *p, unsigned int op );
+extern int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc );
 extern int shadow_fault( unsigned long va, long error_code );
 extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte, 
                                                                                unsigned long *prev_spfn_ptr,
@@ -50,7 +50,7 @@ struct shadow_status {
 
 #ifndef NDEBUG
 #define SH_LOG(_f, _a...)                             \
-  printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \
+  printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
          current->domain , __LINE__ , ## _a )
 #else
 #define SH_LOG(_f, _a...) 
@@ -58,7 +58,7 @@ struct shadow_status {
 
 #if SHADOW_DEBUG
 #define SH_VLOG(_f, _a...)                             \
-  printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \
+  printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
          current->domain , __LINE__ , ## _a )
 #else
 #define SH_VLOG(_f, _a...) 
@@ -66,19 +66,27 @@ struct shadow_status {
 
 #if 0
 #define SH_VVLOG(_f, _a...)                             \
-  printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \
+  printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
          current->domain , __LINE__ , ## _a )
 #else
 #define SH_VVLOG(_f, _a...) 
 #endif
 
 
-
 /************************************************************************/
 
 static inline void mark_dirty( struct mm_struct *m, unsigned int mfn )
 {
-       unsigned int pfn = machine_to_phys_mapping[mfn];
+       unsigned int pfn;
+
+       pfn = machine_to_phys_mapping[mfn];
+
+       /* We use values with the top bit set to mark MFNs that aren't
+          really part of the domain's pseudo-physical memory map e.g.
+           the shared info frame. Nothing to do here...
+         */
+       if ( unlikely(pfn & 0x80000000U) ) return; 
+
        ASSERT(m->shadow_dirty_bitmap);
        if( likely(pfn<m->shadow_dirty_bitmap_size) )
        {
@@ -91,7 +99,14 @@ static inline void mark_dirty( struct mm_struct *m, unsigned int mfn )
        }
        else
        {
-               SH_LOG("mark_dirty pfn out of range attempt!");
+               extern void show_traceX(void);
+               SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
+                          mfn, pfn, m->shadow_dirty_bitmap_size, m );
+               SH_LOG("dom=%lld caf=%08x taf=%08x\n", 
+                          frame_table[mfn].u.domain->domain,
+                          frame_table[mfn].count_and_flags, 
+                          frame_table[mfn].type_and_flags );
+               //show_traceX();
        }
 
 }
@@ -116,7 +131,7 @@ static inline void l1pte_write_fault( struct mm_struct *m,
                spte = gpte;
                gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
                spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;                        
-               mark_dirty( m, gpte >> PAGE_SHIFT );
+               mark_dirty( m, (gpte >> PAGE_SHIFT) );
                break;
     }
 
@@ -343,7 +358,7 @@ static inline unsigned long get_shadow_status( struct mm_struct *m,
 
        if( m->shadow_mode == SHM_logdirty )
                mark_dirty( m, gpfn );
-
+       
        spin_lock(&m->shadow_lock);
        res = __shadow_status( m, gpfn );
        if (!res) spin_unlock(&m->shadow_lock);
index 5ab01092f0e726218a8eb33ea8a07286f98f6fd1..025256813120c00adbbe3705435d0347cd4a391b 100644 (file)
@@ -547,6 +547,9 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
         goto out;
     }
 
+    machine_to_phys_mapping[new_page - frame_table] = 
+       machine_to_phys_mapping[old_page - frame_table];
+
     if ( p->mm.shadow_mode && 
         (spte_pfn=get_shadow_status(&p->mm, pte_page-frame_table)) )
     {
@@ -557,17 +560,15 @@ void deliver_packet(struct sk_buff *skb, net_vif_t *vif)
        *sptr = new_pte;
        unmap_domain_mem(sptr);
 
-       if( p->mm.shadow_mode == SHM_logdirty )
-               mark_dirty( &p->mm, new_page-frame_table );
-
        put_shadow_status(&p->mm);
     }
-
-    machine_to_phys_mapping[new_page - frame_table] 
-        = machine_to_phys_mapping[old_page - frame_table];
     
     unmap_domain_mem(ptep);
 
+    /* if in shadow mode, mark the buffer as dirty */
+    if( p->mm.shadow_mode == SHM_logdirty )
+       mark_dirty( &p->mm, (new_page-frame_table) );
+
     /* Updates must happen before releasing the descriptor. */
     smp_wmb();
 
@@ -2143,8 +2144,6 @@ static void get_rx_bufs(net_vif_t *vif)
             put_page_and_type(pte_page);
             make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
             goto rx_unmap_and_continue;
-
-           /* XXX IAP should SHADOW_CONFIG do something here? */
         }
 
         /*
@@ -2156,9 +2155,11 @@ static void get_rx_bufs(net_vif_t *vif)
                               0) != 
                       (PGC_allocated | PGC_tlb_flush_on_type_change | 2)) )
         {
-            DPRINTK("Page held more than once %08x %s\n", 
+            DPRINTK("Page held more than once mfn=%x %08x %s\n", 
+                   buf_page-frame_table,
                     buf_page->count_and_flags,
                    (buf_page->u.domain)?buf_page->u.domain->name:"None");
+
             if ( !get_page_type(buf_page, PGT_writeable_page) )
                 put_page(buf_page);
             else if ( cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) !=
@@ -2264,6 +2265,13 @@ long flush_bufs_for_vif(net_vif_t *vif)
 
         put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]);
 
+       /* if in shadow mode, mark the PTE as dirty */
+       if( p->mm.shadow_mode == SHM_logdirty )
+           mark_dirty( &p->mm, rx->pte_ptr>>PAGE_SHIFT );
+       /* assume the shadow page table is about to be blown away,
+          and that its not worth marking the buffer as dirty */
+
+
         make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
     }
     vif->rx_cons = i;
index d00dd98f7b49d4273121d6a2cd77a81958bf5116..43a6a234798aea68e5d5390d7d229e22e8f08103 100644 (file)
@@ -527,8 +527,6 @@ static void reset_xlblk_interface(void)
 {
     block_io_op_t op; 
 
-    nr_pending = 0;
-
     op.cmd = BLOCK_IO_OP_RESET;
     if ( HYPERVISOR_block_io_op(&op) != 0 )
         printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n");
@@ -549,6 +547,8 @@ int __init xlblk_init(void)
 {
     int error; 
 
+    nr_pending = 0;
+
     reset_xlblk_interface();
 
     xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV);
index ccda2c2022fd7fc0ebb617070923bec6d837e0c4..2fc577061e3cbc772f34754311c309a18c13463e 100644 (file)
@@ -103,12 +103,12 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
                if (msg[j].va + (msg[j].npages<<PAGE_SHIFT) > vma->vm_end)
                    return -EINVAL;
 
-               if (rc = direct_remap_area_pages(vma->vm_mm, 
+               if ( (rc = direct_remap_area_pages(vma->vm_mm, 
                                            msg[j].va&PAGE_MASK, 
                                            msg[j].mfn<<PAGE_SHIFT, 
                                            msg[j].npages<<PAGE_SHIFT, 
                                            vma->vm_page_prot,
-                                           mmapcmd.dom))
+                                           mmapcmd.dom)) <0)
                    return rc;
            }
        }
@@ -116,6 +116,91 @@ static int privcmd_ioctl(struct inode *inode, struct file *file,
     }
     break;
 
+    case IOCTL_PRIVCMD_MMAPBATCH:
+    {
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+       mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
+       privcmd_mmapbatch_t m;
+       struct vm_area_struct *vma = NULL;
+       unsigned long *p, addr;
+       unsigned long mfn;
+       int i;
+
+        if ( copy_from_user(&m, (void *)data, sizeof(m)) )
+       { ret = -EFAULT; goto batch_err; }
+
+       vma = find_vma( current->mm, m.addr );
+
+       if (!vma)
+       { ret = -EINVAL; goto batch_err; }
+
+       if (m.addr > PAGE_OFFSET)
+       { ret = -EFAULT; goto batch_err; }
+
+       if (m.addr + (m.num<<PAGE_SHIFT) > vma->vm_end)
+       { ret = -EFAULT; goto batch_err; }
+
+       // everything fits inside the vma
+
+//printk("direct_r_a_p sx=%ld address=%lx macaddr=%lx dom=%lld\n",size,address,machine_addr,domid);
+//    memset( u, 0, sizeof(mmu_update_t)*MAX_DIRECTMAP_MMU_QUEUE );// XXX
+
+
+       if ( m.dom != 0 )
+       {
+           u[0].val  = (unsigned long)(m.dom<<16) & ~0xFFFFUL;
+           u[0].ptr  = (unsigned long)(m.dom<< 0) & ~0xFFFFUL;
+           u[1].val  = (unsigned long)(m.dom>>16) & ~0xFFFFUL;
+           u[1].ptr  = (unsigned long)(m.dom>>32) & ~0xFFFFUL;
+           u[0].ptr |= MMU_EXTENDED_COMMAND;
+           u[0].val |= MMUEXT_SET_SUBJECTDOM_L;
+           u[1].ptr |= MMU_EXTENDED_COMMAND;
+           u[1].val |= MMUEXT_SET_SUBJECTDOM_H;
+           v = w = &u[2];
+       }
+       else
+       {
+           v = w = &u[0];
+       }
+
+       p = m.arr;
+       addr = m.addr;
+//printk("BATCH: arr=%p addr=%lx num=%d u=%p,w=%p\n",p,addr,m.num,u,w);
+       for (i=0; i<m.num; i++, addr+=PAGE_SIZE, p++)
+       {
+           unsigned int count;
+           if ( get_user(mfn, p) ) return -EFAULT;
+
+           v->val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot) |
+               _PAGE_IO;
+
+           __direct_remap_area_pages( vma->vm_mm,
+                                      addr, 
+                                      PAGE_SIZE, 
+                                      v);
+           v++;
+           count = v-u;
+//printk("Q i=%d mfn=%x co=%d v=%p : %lx %lx\n",i,mfn,count,v, w->val,w->ptr);
+
+           if ( HYPERVISOR_mmu_update(u, &count) < 0 )
+           {
+               //printk("Fail %d->%d mfn=%lx\n",v-u,count, w->val);
+               put_user( 0xe0000000 | mfn, p );
+           }
+           v=w;
+       }
+       ret = 0;
+       break;
+
+    batch_err:
+       printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%lx %lx-%lx\n", 
+              ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end);
+       break;
+    }
+    break;
+
+
+
     default:
         ret = -EINVAL;
        break;
index ef54ff7fe9e7102c4f5890f3af1ece9cfa74dcd8..daa8441d7b2ede877eec01341fd99ae7485d129b 100644 (file)
@@ -248,6 +248,8 @@ static void network_alloc_rx_buffers(struct net_device *dev)
         np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr = 
             virt_to_machine(get_ppte(skb->head));
 
+       /* Shadow optimisation: disown this page from p->m map */
+       phys_to_machine_mapping[virt_to_phys(skb->head)>>PAGE_SHIFT] = 0x80000004;
         np->rx_bufs_to_notify++;
     }
     while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE );
@@ -364,6 +366,9 @@ static inline void _network_interrupt(struct net_device *dev)
         skb = np->rx_skbs[rx->id];
         ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
 
+        phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
+            (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
+
         if ( unlikely(rx->status != RING_STATUS_OK) )
         {
             /* Gate this error. We get a (valid) slew of them on suspend. */
@@ -382,9 +387,6 @@ static inline void _network_interrupt(struct net_device *dev)
         skb_shinfo(skb)->nr_frags = 0;
         skb_shinfo(skb)->frag_list = NULL;
                                 
-        phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
-            (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
-
         skb->data = skb->tail = skb->head + rx->offset;
         skb_put(skb, rx->size);
         skb->protocol = eth_type_trans(skb, dev);
index 6be85db7f1641150f57626ff5ac8dff3b7c1d818..b06c6c26b08b92c50d2cc25db30d4e3ca250c0f9 100644 (file)
@@ -1161,11 +1161,11 @@ static void stop_task(void *unused)
         virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
     suspend_record->nr_pfns = max_pfn;
 
-    j = 0;
-    for ( i = 0; i < max_pfn; i += (PAGE_SIZE / sizeof(unsigned long)) )
-        pfn_to_mfn_frame_list[j++] = 
+    for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+    {  
+        pfn_to_mfn_frame_list[j] = 
             virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
-
+    }
     /*
      * NB. This is /not/ a full dev_close() as that loses route information!
      * Instead we do essentialy the same as dev_close() but without notifying
@@ -1207,7 +1207,9 @@ static void stop_task(void *unused)
     memcpy(&start_info, &suspend_record->resume_info, sizeof(start_info));
 
     set_fixmap(FIX_SHARED_INFO, start_info.shared_info);
+
     HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+
     memset(empty_zero_page, 0, PAGE_SIZE);
 
     irq_resume();
index 52920cd0fc9569f40ef24ddcd99c4443e4aad207..3291a0338db1b4a835e839e48fedc5d37965534a 100644 (file)
@@ -62,6 +62,7 @@
 #include <linux/smp.h>
 #include <linux/irq.h>
 #include <linux/sysctl.h>
+#include <linux/sysrq.h>
 
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
 extern rwlock_t xtime_lock;
@@ -581,6 +582,10 @@ static void dbg_time_int(int irq, void *dev_id, struct pt_regs *ptregs)
            timer->expires,(u32)(t_st>>32), (u32)t_st);
     printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n",
            (u32)(processed_system_time>>32), (u32)processed_system_time);
+
+
+    handle_sysrq('t',NULL,NULL,NULL);
+
 }
 
 static struct irqaction dbg_time = {
index 0337cae1ca34cb9bc41356eab83773a9d1c20c2c..78dbb9ef23ffdc552ec9966b1f89a4d8849247ab 100644 (file)
@@ -317,16 +317,17 @@ asmlinkage void do_general_protection(struct pt_regs * regs, long error_code)
                __asm__ __volatile__ ( "sldt %0" : "=r" (ldt) );
                if ( ldt == 0 )
                {
-                       mmu_update_t u;
-                       u.ptr  = MMU_EXTENDED_COMMAND;
-                       u.ptr |= (unsigned long)&default_ldt[0];
-                       u.val  = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT);
-                       if ( unlikely(HYPERVISOR_mmu_update(&u, 1) < 0) )
-                       {
-                               show_trace(NULL);
-                               panic("Failed to install default LDT");
-                       }
-                       return;
+                   int count = 1;
+                   mmu_update_t u;
+                   u.ptr  = MMU_EXTENDED_COMMAND;
+                   u.ptr |= (unsigned long)&default_ldt[0];
+                   u.val  = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT);
+                   if ( unlikely(HYPERVISOR_mmu_update(&u, &count) < 0) )
+                   {
+                       show_trace(NULL);
+                       panic("Failed to install default LDT");
+                   }
+                   return;
                }
        }
 
index c6dc7105766c56e5d613b8aaa66ef843b8dbf47a..daa5ee1d73f993b4b65e4bcbd3af3519afad2abc 100644 (file)
@@ -37,12 +37,13 @@ static void DEBUG_allow_pt_reads(void)
     int i;
     for ( i = idx-1; i >= 0; i-- )
     {
+       int count = 1;
         pte = update_debug_queue[i].ptep;
         if ( pte == NULL ) continue;
         update_debug_queue[i].ptep = NULL;
         update.ptr = virt_to_machine(pte);
         update.val = update_debug_queue[i].pteval;
-        HYPERVISOR_mmu_update(&update, 1);
+        HYPERVISOR_mmu_update(&update, &count);
     }
 }
 static void DEBUG_disallow_pt_read(unsigned long va)
@@ -51,6 +52,7 @@ static void DEBUG_disallow_pt_read(unsigned long va)
     pmd_t *pmd;
     pgd_t *pgd;
     unsigned long pteval;
+    int count = 1;
     /*
      * We may fault because of an already outstanding update.
      * That's okay -- it'll get fixed up in the fault handler.
@@ -62,7 +64,7 @@ static void DEBUG_disallow_pt_read(unsigned long va)
     update.ptr = virt_to_machine(pte);
     pteval = *(unsigned long *)pte;
     update.val = pteval & ~_PAGE_PRESENT;
-    HYPERVISOR_mmu_update(&update, 1);
+    HYPERVISOR_mmu_update(&update, &count);
     update_debug_queue[idx].ptep = pte;
     update_debug_queue[idx].pteval = pteval;
 }
@@ -100,7 +102,7 @@ void MULTICALL_flush_page_update_queue(void)
         wmb(); /* Make sure index is cleared first to avoid double updates. */
         queue_multicall2(__HYPERVISOR_mmu_update, 
                          (unsigned long)update_queue, 
-                         _idx);
+                         &_idx);
     }
     spin_unlock_irqrestore(&update_lock, flags);
 }
@@ -116,7 +118,7 @@ static inline void __flush_page_update_queue(void)
 #endif
     idx = 0;
     wmb(); /* Make sure index is cleared first to avoid double updates. */
-    if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx) < 0) )
+    if ( unlikely(HYPERVISOR_mmu_update(update_queue, &_idx) < 0) )
         panic("Failed to execute MMU updates");
 }
 
index 28a0a4071ae9f336c6626fafd0439c41342fd5e8..dbe706bb9534fe13ba6bb9860af08933d640e879 100644 (file)
 #define direct_mk_pte_phys(physpage, pgprot) \
   __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
 
-static inline int direct_remap_area_pte(pte_t *pte, 
+static inline void direct_remap_area_pte(pte_t *pte, 
                                         unsigned long address, 
                                         unsigned long size,
-                                        unsigned long machine_addr, 
-                                        pgprot_t prot,
-                                        domid_t  domid)
+                                       mmu_update_t **v)
 {
     unsigned long end;
-#define MAX_DIRECTMAP_MMU_QUEUE 130
-    mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v, *w;
 
     address &= ~PMD_MASK;
     end = address + size;
@@ -45,95 +41,55 @@ static inline int direct_remap_area_pte(pte_t *pte,
     if (address >= end)
         BUG();
 
-    /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */
-    if ( domid != 0 )
-    {
-        u[0].val  = (unsigned long)(domid<<16) & ~0xFFFFUL;
-        u[0].ptr  = (unsigned long)(domid<< 0) & ~0xFFFFUL;
-        u[1].val  = (unsigned long)(domid>>16) & ~0xFFFFUL;
-        u[1].ptr  = (unsigned long)(domid>>32) & ~0xFFFFUL;
-        u[0].ptr |= MMU_EXTENDED_COMMAND;
-        u[0].val |= MMUEXT_SET_SUBJECTDOM_L;
-        u[1].ptr |= MMU_EXTENDED_COMMAND;
-        u[1].val |= MMUEXT_SET_SUBJECTDOM_H;
-        v = w = &u[2];
-    }
-    else
-    {
-        v = w = &u[0];
-    }
-
     do {
-        if ( (v-u) == MAX_DIRECTMAP_MMU_QUEUE )
-        {
-            if ( HYPERVISOR_mmu_update(u, MAX_DIRECTMAP_MMU_QUEUE) < 0 )
-                return -EFAULT;
-            v = w;
-        }
-#if 0  /* thanks to new ioctl mmaping interface this is no longer a bug */
+#if 0 // XXX
         if (!pte_none(*pte)) {
             printk("direct_remap_area_pte: page already exists\n");
             BUG();
         }
 #endif
-        v->ptr = virt_to_machine(pte);
-        v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO;
-        v++;
+        (*v)->ptr = virt_to_machine(pte);
+        (*v)++;
         address += PAGE_SIZE;
-        machine_addr += PAGE_SIZE;
         pte++;
     } while (address && (address < end));
-
-    if ( ((v-w) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) )
-        return -EFAULT;
-
-    return 0;
+    return ;
 }
 
 static inline int direct_remap_area_pmd(struct mm_struct *mm,
                                         pmd_t *pmd, 
                                         unsigned long address, 
                                         unsigned long size,
-                                        unsigned long machine_addr,
-                                        pgprot_t prot,
-                                        domid_t  domid)
+                                       mmu_update_t **v)
 {
-    int error = 0;
     unsigned long end;
 
     address &= ~PGDIR_MASK;
     end = address + size;
     if (end > PGDIR_SIZE)
         end = PGDIR_SIZE;
-    machine_addr -= address;
     if (address >= end)
         BUG();
     do {
         pte_t * pte = pte_alloc(mm, pmd, address);
         if (!pte)
             return -ENOMEM;
-        error = direct_remap_area_pte(pte, address, end - address, 
-                                      address + machine_addr, prot, domid);
-        if ( error )
-            break;
+        direct_remap_area_pte(pte, address, end - address, v);
+
         address = (address + PMD_SIZE) & PMD_MASK;
         pmd++;
     } while (address && (address < end));
-    return error;
+    return 0;
 }
  
-int direct_remap_area_pages(struct mm_struct *mm,
-                            unsigned long address, 
-                            unsigned long machine_addr,
-                            unsigned long size, 
-                            pgprot_t prot,
-                            domid_t  domid)
+int __direct_remap_area_pages(struct mm_struct *mm,
+                             unsigned long address, 
+                             unsigned long size, 
+                             mmu_update_t *v)
 {
-    int error = 0;
     pgd_t * dir;
     unsigned long end = address + size;
 
-    machine_addr -= address;
     dir = pgd_offset(mm, address);
     flush_cache_all();
     if (address >= end)
@@ -141,21 +97,89 @@ int direct_remap_area_pages(struct mm_struct *mm,
     spin_lock(&mm->page_table_lock);
     do {
         pmd_t *pmd = pmd_alloc(mm, dir, address);
-        error = -ENOMEM;
         if (!pmd)
-            break;
-        error = direct_remap_area_pmd(mm, pmd, address, end - address,
-                                      machine_addr + address, prot, domid);
-        if (error)
-            break;
+           { spin_unlock(&mm->page_table_lock); return -ENOMEM; }
+        direct_remap_area_pmd(mm, pmd, address, end - address, &v);
         address = (address + PGDIR_SIZE) & PGDIR_MASK;
         dir++;
+
     } while (address && (address < end));
     spin_unlock(&mm->page_table_lock);
     flush_tlb_all();
-    return error;
+    return 0;
 }
 
+
+int direct_remap_area_pages(struct mm_struct *mm,
+                            unsigned long address, 
+                            unsigned long machine_addr,
+                            unsigned long size, 
+                            pgprot_t prot,
+                            domid_t  domid)
+{
+    int i, count;
+    unsigned long start_address;
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+    mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
+
+    if ( domid != 0 )
+    {
+        u[0].val  = (unsigned long)(domid<<16) & ~0xFFFFUL;
+        u[0].ptr  = (unsigned long)(domid<< 0) & ~0xFFFFUL;
+        u[1].val  = (unsigned long)(domid>>16) & ~0xFFFFUL;
+        u[1].ptr  = (unsigned long)(domid>>32) & ~0xFFFFUL;
+        u[0].ptr |= MMU_EXTENDED_COMMAND;
+        u[0].val |= MMUEXT_SET_SUBJECTDOM_L;
+        u[1].ptr |= MMU_EXTENDED_COMMAND;
+        u[1].val |= MMUEXT_SET_SUBJECTDOM_H;
+        v = w = &u[2];
+    }
+    else
+    {
+        v = w = &u[0];
+    }
+
+    start_address = address;
+
+    for(i=0; i<size; 
+       i+=PAGE_SIZE, machine_addr+=PAGE_SIZE, address+=PAGE_SIZE, v++)
+    {
+       if( (v-u) == MAX_DIRECTMAP_MMU_QUEUE )
+       {
+           /* get the ptep's filled in */
+           __direct_remap_area_pages( mm,
+                                      start_address, 
+                                      address-start_address, 
+                                      w);
+           
+           count = v-u;
+           if ( HYPERVISOR_mmu_update(u, &count) < 0 )
+               return -EFAULT;     
+           v=w;
+           start_address = address;
+       }
+
+       /* fill in the machine addresses */
+        v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO;
+    }
+
+    if(v!=w)
+    {
+       /* get the ptep's filled in */
+       __direct_remap_area_pages( mm,
+                                  start_address, 
+                                  address-start_address, 
+                                  w);   
+       count = v-u;
+       if ( HYPERVISOR_mmu_update(u, &count) < 0 )
+           return -EFAULT;         
+
+    }
+    
+    return 0;
+}
+
+
 #endif /* CONFIG_XEN_PRIVILEGED_GUEST */
 
 
index c454728c0e65b240da89d7b01ff93aa46e1eb32c..e8b2bc40b0de7bd9767fa133dabec2f0152f58a2 100644 (file)
@@ -153,7 +153,7 @@ static inline int HYPERVISOR_set_trap_table(trap_info_t *table)
     return ret;
 }
 
-static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count)
+static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int *count)
 {
     int ret;
     __asm__ __volatile__ (
index d853a3f2af155b4f69648d7f171342d78df650e0..143beeeef5cd47f019e452b059487ebc80abfafe 100644 (file)
@@ -276,4 +276,11 @@ extern int direct_remap_area_pages(struct mm_struct *mm,
                                    pgprot_t prot,
                                    domid_t  domid);
 
+extern int __direct_remap_area_pages(struct mm_struct *mm,
+                                    unsigned long address, 
+                                    unsigned long size, 
+                                    mmu_update_t *v);
+
+
+
 #endif /* _I386_PGALLOC_H */
index 3bf03c6064db71f4641996fbde285e31d7f869f9..08e452de15bd7f582d61286bbf7681642e4f777e 100644 (file)
@@ -25,6 +25,13 @@ typedef struct privcmd_mmap {
     privcmd_mmap_entry_t *entry;
 } privcmd_mmap_t; 
 
+typedef struct privcmd_mmapbatch {
+    int num;     // number of pages to populate
+    domid_t dom; // target domain 
+    unsigned long addr;  // virtual address
+    unsigned long *arr; // array of mfns - top nibble set on err
+} privcmd_mmapbatch_t; 
+
 typedef struct privcmd_blkmsg
 {
     unsigned long op;
@@ -50,5 +57,7 @@ typedef struct privcmd_blkmsg
     _IOC(_IOC_NONE, 'P', 1, 0)
 #define IOCTL_PRIVCMD_MMAP             \
     _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
+#define IOCTL_PRIVCMD_MMAPBATCH             \
+    _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t))
 
 #endif /* __PROC_CMD_H__ */