Live migration initial checkin.
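
This checkin adds an iterative pre-copy path to xc_linux_save(): the domain
is placed in log-dirty shadow mode and restarted, every page is sent on the
first pass, and later passes resend only the pages dirtied since the
previous pass (tracked in the 'to_send' bitmap returned by xc_shadow_control).
Once a pass sends nearly as many pages as the one before it, or max_iters
passes have run, the domain is stopped with xc_domain_stop_sync() and a final
pass plus the execution context are written out. Supporting changes: a
DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY mode and a dirty-bitmap-returning
DOM0_SHADOW_CONTROL_OP_CLEAN op in the hypervisor, an IOCTL_PRIVCMD_MMAPBATCH
ioctl and mfn_mapper_map_batch() for batched foreign mappings, magic
machine-to-phys marker values for pages outside the domain's pseudo-physical
map, and HYPERVISOR_mmu_update now taking its count by pointer.

In outline, the new xc_linux_save() loop looks roughly like this (a simplified
sketch, not literal code from the patch; names are those used below except
send_batches_marked_in() and bitmap_size, which are placeholders, and error
handling is omitted):

    xc_shadow_control(xc_handle, domid,
                      DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY, NULL, 0);
    xc_domain_start(xc_handle, domid);       /* guest keeps running */
    memset(to_send, 0xff, bitmap_size);      /* first pass sends every page */

    while ( 1 )
    {
        iter++;
        sent_this_iter = send_batches_marked_in(to_send);

        if ( last_iter )
            break;

        if ( (sent_this_iter >= sent_last_iter * 0.95) || (iter >= max_iters) )
        {
            last_iter = 1;
            xc_domain_stop_sync(xc_handle, domid);  /* pause for the final pass */
        }

        /* fetch-and-clear the dirty log; it becomes the next pass's to_send */
        xc_shadow_control(xc_handle, domid, DOM0_SHADOW_CONTROL_OP_CLEAN,
                          to_send, nr_pfns);
        sent_last_iter = sent_this_iter;
    }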
xc.domain_stop( dom=dom )
while not xc.domain_getinfo( first_dom=dom, max_doms=1 )[0]['stopped']:
- time.sleep(0.1);
+ print "Sleep..."
+ time.sleep(0.001);
rc = xc.linux_save( dom=dom, state_file=file, progress=1)
if rc == 0 : xc.domain_destroy( dom=dom, force=1 )
+ else: xc.domain_start( dom=dom ) # sensible for production use
elif cmd == 'cpu_bvtslice':
if len(sys.argv) < 3:
int xc_shadow_control(int xc_handle,
u64 domid,
- unsigned int sop);
+ unsigned int sop,
+ unsigned long *dirty_bitmap,
+ unsigned long pages);
+
#define XCFLAGS_VERBOSE 1
#define XCFLAGS_LIVE 2
int xc_physinfo(int xc_handle,
xc_physinfo_t *info);
-
-int xc_shadow_control(int xc_handle,
- u64 domid,
- unsigned int sop);
-
int xc_domain_setname(int xc_handle,
u64 domid,
char *name);
int xc_shadow_control(int xc_handle,
u64 domid,
- unsigned int sop)
+ unsigned int sop,
+ unsigned long *dirty_bitmap,
+ unsigned long pages)
{
+ int rc;
dom0_op_t op;
op.cmd = DOM0_SHADOW_CONTROL;
op.u.shadow_control.domain = (domid_t)domid;
op.u.shadow_control.op = sop;
- return do_dom0_op(xc_handle, &op);
+ op.u.shadow_control.dirty_bitmap = dirty_bitmap;
+ op.u.shadow_control.pages = pages;
+
+ rc = do_dom0_op(xc_handle, &op);
+
+ if ( rc == 0 )
+ return op.u.shadow_control.pages;
+ else
+ return rc;
}
int xc_domain_setname(int xc_handle,
/* shared_info page starts its life empty. */
shared_info = map_pfn_writeable(pm_handle, shared_info_frame);
- memset(shared_info, 0, PAGE_SIZE);
+ memset(shared_info, 0, sizeof(shared_info_t));
/* Mask all upcalls... */
for ( i = 0; i < MAX_VIRT_CPUS; i++ )
shared_info->vcpu_data[i].evtchn_upcall_mask = 1;
goto out;
}
- //printf("batch=%d\n",j);
+ printf("batch %d\n",j);
- if(j==0) break; // our work here is done
+ if (j == 0)
+ break; // our work here is done
+
+ if( j > MAX_BATCH_SIZE )
+ {
+ ERROR("Max batch size exceeded. Giving up.");
+ goto out;
+ }
if ( (*readerfn)(readerst, region_pfn_type, j*sizeof(unsigned long)) )
{
for(i=0;i<j;i++)
{
+ if ((region_pfn_type[i]>>29) == 7)
+ continue;
+
pfn = region_pfn_type[i] & ~PGT_type_mask;
mfn = pfn_to_mfn_table[pfn];
unsigned long *ppage;
pfn = region_pfn_type[i] & ~PGT_type_mask;
+
+//if(n>=nr_pfns || ((region_pfn_type[i] & PGT_type_mask) == L2TAB) ) printf("pfn=%08lx mfn=%x\n",region_pfn_type[i],pfn_to_mfn_table[pfn]);
+
//if(pfn_type[i])printf("^pfn=%d %08lx\n",pfn,pfn_type[i]);
+ if ((region_pfn_type[i]>>29) == 7)
+ continue;
+
if (pfn>nr_pfns)
{
ERROR("pfn out of range");
if ( xpfn >= nr_pfns )
{
- ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=%d nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns);
+ ERROR("Frame number in type %d page table is out of range. i=%d k=%d pfn=0x%x nr_pfns=%d",region_pfn_type[i]>>29,i,k,xpfn,nr_pfns);
goto out;
}
#if 0
default:
ERROR("Bogus page type %x page table is out of range. i=%d nr_pfns=%d",region_pfn_type[i],i,nr_pfns);
goto out;
- }
+
+ } // end of page type switch statement
if ( add_mmu_update(xc_handle, mmu,
(mfn<<PAGE_SHIFT) | MMU_MACHPHYS_UPDATE, pfn) )
goto out;
- }
+ } // end of 'batch' for loop
n+=j; // crude stats
}
+printf("RECEIVED ALL PAGES\n");
mfn_mapper_close( region_mapper );
(pfn_to_mfn_table[i]<<PAGE_SHIFT) |
MMU_EXTENDED_COMMAND,
MMUEXT_PIN_L1_TABLE) )
+ {
+ printf("ERR pin L1 pfn=%lx mfn=%lx\n");
goto out;
+ }
}
else if ( pfn_type[i] == L2TAB )
{
(pfn_to_mfn_table[i]<<PAGE_SHIFT) |
MMU_EXTENDED_COMMAND,
MMUEXT_PIN_L2_TABLE) )
+ {
+ printf("ERR pin L2 pfn=%lx mfn=%lx\n");
goto out;
+ }
}
}
p_srec->resume_info.flags = 0;
unmap_pfn(pm_handle, p_srec);
+printf("new shared info is %lx\n", shared_info_frame);
+
/* Uncanonicalise each GDT frame number. */
if ( ctxt.gdt_ents > 8192 )
{
/* Copy saved contents of shared-info page. No checking needed. */
ppage = map_pfn_writeable(pm_handle, shared_info_frame);
- memcpy(ppage, shared_info, PAGE_SIZE);
+ memcpy(ppage, shared_info, sizeof(shared_info_t));
unmap_pfn(pm_handle, ppage);
op.u.builddomain.ctxt = &ctxt;
rc = do_dom0_op(xc_handle, &op);
+printf("NORMAL EXIT RESTORE\n");
out:
+printf("EXIT RESTORE\n");
if ( mmu != NULL )
free(mmu);
/*
* Returns TRUE if the given machine frame number has a unique mapping
* in the guest's pseudophysical map.
+ * m2p entries 0x80000000-3 mark the shared_info and blk/net ring pages;
+ * a p2m entry of 0x80000004 marks a page disowned to the net backend.
*/
#define MFN_IS_IN_PSEUDOPHYS_MAP(_mfn) \
- (((_mfn) < (1024*1024)) && \
- (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn)))
-
+ (((_mfn) < (1024*1024)) && \
+ ( ( (live_mfn_to_pfn_table[_mfn] < nr_pfns) && \
+ (live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == (_mfn)) ) || \
+\
+ (live_mfn_to_pfn_table[_mfn] >= 0x80000000 && \
+ live_mfn_to_pfn_table[_mfn] <= 0x80000003 ) || \
+ live_pfn_to_mfn_table[live_mfn_to_pfn_table[_mfn]] == 0x80000004 ) )
+
/* Returns TRUE if MFN is successfully converted to a PFN. */
#define translate_mfn_to_pfn(_pmfn) \
({ \
})
+/* test_bit: return the value of bit 'nr' in the bitmap at 'addr' */
+inline int test_bit ( int nr, volatile void * addr)
+{
+ return ( ((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >>
+ (nr % (sizeof(unsigned long)*8) ) ) & 1;
+}
+
+
int xc_linux_save(int xc_handle,
u64 domid,
unsigned int flags,
void *writerst )
{
dom0_op_t op;
- int rc = 1, i, j, k, n;
+ int rc = 1, i, j, k, n, last_iter, iter = 0;
unsigned long mfn;
- unsigned int prev_pc, this_pc;
int verbose = flags & XCFLAGS_VERBOSE;
- //int live = flags & XCFLAGS_LIVE;
-
- /* state of the new MFN mapper */
- mfn_mapper_t *mapper_handle1, *mapper_handle2;
+ int live = 1; // XXX hardcoded on for testing; should be: flags & XCFLAGS_LIVE
+ int sent_last_iter, sent_this_iter, max_iters;
/* Remember if we stopped the guest, so we can restart it on exit. */
int we_stopped_it = 0;
unsigned char *region_base;
/* A temporary mapping, and a copy, of the guest's suspend record. */
- suspend_record_t *p_srec, srec;
+ suspend_record_t *p_srec;
+
+ /* number of pages we're dealing with */
+ unsigned long nr_pfns;
+ /* bitmap of pages left to send */
+ unsigned long *to_send;
if ( mlock(&ctxt, sizeof(ctxt) ) )
{
goto out;
}
- sleep(1);
+ usleep(1000); // 1ms
+ printf("Sleep for 1ms\n");
}
+#if 1
/* A cheesy test to see whether the domain contains valid state. */
if ( ctxt.pt_base == 0 )
{
ERROR("Domain is not in a valid Linux guest OS state");
goto out;
}
+#endif
/* Map the suspend-record MFN to pin it. The page must be owned by
domid for this to succeed. */
p_srec = mfn_mapper_map_single(xc_handle, domid,
- sizeof(srec), PROT_READ,
+ sizeof(*p_srec), PROT_READ,
ctxt.cpu_ctxt.esi );
if (!p_srec)
goto out;
}
- memcpy( &srec, p_srec, sizeof(srec) );
+ nr_pfns = p_srec->nr_pfns;
/* cheesy sanity check */
- if ( srec.nr_pfns > 1024*1024 )
+ if ( nr_pfns > 1024*1024 )
{
ERROR("Invalid state record -- pfn count out of range");
goto out;
live_pfn_to_mfn_frame_list =
mfn_mapper_map_single(xc_handle, domid,
PAGE_SIZE, PROT_READ,
- srec.pfn_to_mfn_frame_list );
+ p_srec->pfn_to_mfn_frame_list );
if (!live_pfn_to_mfn_frame_list)
{
ERROR("Couldn't map pfn_to_mfn_frame_list");
goto out;
}
-
-
- if ( (mapper_handle1 = mfn_mapper_init(xc_handle, domid,
- 1024*1024, PROT_READ ))
- == NULL )
- goto out;
-
- for ( i = 0; i < (srec.nr_pfns+1023)/1024; i++ )
- {
- /* Grab a copy of the pfn-to-mfn table frame list.
- This has the effect of preventing the page from being freed and
- given to another domain. (though the domain is stopped anyway...) */
- mfn_mapper_queue_entry( mapper_handle1, i<<PAGE_SHIFT,
- live_pfn_to_mfn_frame_list[i],
- PAGE_SIZE );
- }
-
- if ( mfn_mapper_flush_queue(mapper_handle1) )
- {
- ERROR("Couldn't map pfn_to_mfn table");
- goto out;
- }
-
- live_pfn_to_mfn_table = mfn_mapper_base( mapper_handle1 );
-
-
-
- /* We want zeroed memory so use calloc rather than malloc. */
- pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
-
- if ( (pfn_type == NULL) )
- {
- errno = ENOMEM;
- goto out;
- }
-
- if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) )
- {
- ERROR("Unable to mlock");
- goto out;
- }
-
/* Track the mfn_to_pfn table down from the domains PT */
{
mfn_to_pfn_table_start_mfn );
}
+ /* Map all the frames of the pfn->mfn table. For migrate to succeed,
+ the guest must not change which frames are used for this purpose.
+ (It's not clear why it would want to change them, and we'll be OK
+ from a safety POV anyhow.) */
- /*
- * Quick belt and braces sanity check.
- */
+ live_pfn_to_mfn_table = mfn_mapper_map_batch( xc_handle, domid,
+ PROT_READ,
+ live_pfn_to_mfn_frame_list,
+ (nr_pfns+1023)/1024 );
+ if( !live_pfn_to_mfn_table )
+ {
+ PERROR("Couldn't map pfn_to_mfn table");
+ goto out;
+ }
+
+ for(i=0;i<(nr_pfns+1023)/1024 ;i++)
+ printf("LF: %d %x\n",i,live_pfn_to_mfn_frame_list[i]);
- for ( i = 0; i < srec.nr_pfns; i++ )
+
+ /* At this point, we can start the domain again if we're doing a
+ live suspend */
+
+ if( live )
+ {
+#if 1
+ if ( xc_shadow_control( xc_handle, domid,
+ DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY,
+ NULL, 0 ) < 0 )
+ {
+ ERROR("Couldn't enable shadow mode");
+ goto out;
+ }
+#endif
+ if ( xc_domain_start( xc_handle, domid ) < 0 )
+ {
+ ERROR("Couldn't restart domain");
+ goto out;
+ }
+//exit(-1);
+ last_iter = 0;
+ sent_last_iter = 1<<20; // 4GB's worth of pages
+ max_iters = 8; // limit us to 9 times round the loop
+ }
+ else
+ last_iter = 1;
+
+
+ /* Setup to_send bitmap */
{
- mfn = live_pfn_to_mfn_table[i];
+ int sz = (nr_pfns/8) + 8; // includes slop at end of array
+
+ to_send = malloc( sz );
- if( live_mfn_to_pfn_table[mfn] != i )
- printf("i=%d mfn=%d live_mfn_to_pfn_table=%d\n",
- i,mfn,live_mfn_to_pfn_table[mfn]);
+ if (!to_send)
+ {
+ ERROR("Couldn't allocate to_send array");
+ goto out;
+ }
+ memset( to_send, 0xff, sz );
+
+ if ( mlock( to_send, sz ) )
+ {
+ PERROR("Unable to mlock to_send");
+ goto out;
+ }
}
- /* Canonicalise the suspend-record frame number. */
- if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) )
+ /* We want zeroed memory so use calloc rather than malloc. */
+ pfn_type = calloc(BATCH_SIZE, sizeof(unsigned long));
+
+ if ( (pfn_type == NULL) )
{
- ERROR("State record is not in range of pseudophys map");
+ errno = ENOMEM;
goto out;
}
- /* Canonicalise each GDT frame number. */
- for ( i = 0; i < ctxt.gdt_ents; i += 512 )
+ if ( mlock( pfn_type, BATCH_SIZE * sizeof(unsigned long) ) )
{
- if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) )
- {
- ERROR("GDT frame is not in range of pseudophys map");
- goto out;
- }
+ ERROR("Unable to mlock");
+ goto out;
}
- /* Canonicalise the page table base pointer. */
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) )
+
+ /*
+ * Quick belt and braces sanity check.
+ */
+
+ for ( i = 0; i < nr_pfns; i++ )
{
- ERROR("PT base is not in range of pseudophys map");
- goto out;
+ mfn = live_pfn_to_mfn_table[i];
+
+ if( (live_mfn_to_pfn_table[mfn] != i) && (mfn != 0x80000004) )
+ printf("i=0x%x mfn=%x live_mfn_to_pfn_table=%x\n",
+ i,mfn,live_mfn_to_pfn_table[mfn]);
}
- ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT;
/* Canonicalise the pfn-to-mfn table frame-number list. */
memcpy( pfn_to_mfn_frame_list, live_pfn_to_mfn_frame_list, PAGE_SIZE );
- for ( i = 0; i < srec.nr_pfns; i += 1024 )
+ for ( i = 0; i < nr_pfns; i += 1024 )
{
if ( !translate_mfn_to_pfn(&pfn_to_mfn_frame_list[i/1024]) )
{
}
}
- /* Start writing out the saved-domain record. */
+ /* Map the shared info frame */
live_shinfo = mfn_mapper_map_single(xc_handle, domid,
PAGE_SIZE, PROT_READ,
shared_info_frame);
goto out;
}
+ /* Start writing out the saved-domain record. */
+
if ( (*writerfn)(writerst, "LinuxGuestRecord", 16) ||
(*writerfn)(writerst, name, sizeof(name)) ||
- (*writerfn)(writerst, &srec.nr_pfns, sizeof(unsigned long)) ||
- (*writerfn)(writerst, &ctxt, sizeof(ctxt)) ||
- (*writerfn)(writerst, live_shinfo, PAGE_SIZE) ||
+ (*writerfn)(writerst, &nr_pfns, sizeof(unsigned long)) ||
(*writerfn)(writerst, pfn_to_mfn_frame_list, PAGE_SIZE) )
{
ERROR("Error when writing to state file (1)");
goto out;
}
- munmap(live_shinfo, PAGE_SIZE);
-
- verbose_printf("Saving memory pages: 0%%");
-
- if ( (mapper_handle2 = mfn_mapper_init(xc_handle, domid,
- BATCH_SIZE*4096, PROT_READ ))
- == NULL )
- goto out;
-
- region_base = mfn_mapper_base( mapper_handle2 );
/* Now write out each data page, canonicalising page tables as we go... */
- prev_pc = 0;
- for ( n = 0; n < srec.nr_pfns; )
+
+ while(1)
{
- this_pc = (n * 100) / srec.nr_pfns;
- if ( (this_pc - prev_pc) >= 5 )
- {
- verbose_printf("\b\b\b\b%3d%%", this_pc);
- prev_pc = this_pc;
- }
+ unsigned int prev_pc, batch; /* sent_this_iter is declared at function scope */
- for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
- {
- pfn_type[j] = live_pfn_to_mfn_table[i];
- }
+ iter++;
+ sent_this_iter = 0;
+ prev_pc = 0;
+ verbose_printf("Saving memory pages: iter %d 0%%", iter);
- for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
+ n=0;
+ while( n < nr_pfns )
{
- /* queue up mappings for all of the pages in this batch */
+ unsigned int this_pc = (n * 100) / nr_pfns;
+ if ( (this_pc - prev_pc) >= 5 )
+ {
+ verbose_printf("\b\b\b\b%3d%%", this_pc);
+ prev_pc = this_pc;
+ }
-//printf("region n=%d j=%d i=%d mfn=%d\n",n,j,i,live_pfn_to_mfn_table[i]);
- mfn_mapper_queue_entry( mapper_handle2, j<<PAGE_SHIFT,
- live_pfn_to_mfn_table[i],
- PAGE_SIZE );
- }
- if( mfn_mapper_flush_queue(mapper_handle2) )
- {
- ERROR("Couldn't map page region");
- goto out;
- }
+ /* load pfn_type[] with the mfn of all the pages we're doing in
+ this batch. */
- if ( get_pfn_type_batch(xc_handle, domid, j, pfn_type) )
- {
- ERROR("get_pfn_type_batch failed");
- goto out;
- }
-
- for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
- {
- if((pfn_type[j]>>29) == 7)
+ for( batch = 0; batch < BATCH_SIZE && n < nr_pfns ; n++ )
{
- ERROR("bogus page");
- goto out;
- }
+ if ( !test_bit(n, to_send ) ) continue;
- /* canonicalise mfn->pfn */
- pfn_type[j] = (pfn_type[j] & PGT_type_mask) |
- live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
-
-/* if(pfn_type[j]>>29)
- printf("i=%d type=%d\n",i,pfn_type[i]); */
- }
+ pfn_type[batch] = live_pfn_to_mfn_table[n];
+ if( pfn_type[batch] == 0x80000004 )
+ {
+ //printf("Skip netbuf pfn %lx. mfn %lx\n",n,pfn_type[batch]);
+ continue;
+ }
- if ( (*writerfn)(writerst, &j, sizeof(int) ) )
- {
- ERROR("Error when writing to state file (2)");
- goto out;
- }
+//if(iter>1) printf("pfn=%x mfn=%x\n",n,pfn_type[batch]);
+
+ batch++;
+ }
- if ( (*writerfn)(writerst, pfn_type, sizeof(unsigned long)*j ) )
- {
- ERROR("Error when writing to state file (3)");
- goto out;
- }
+ for( j = 0; j < batch; j++ )
+ {
+ if( (pfn_type[j] &0xfffff) == 0x0000004 )
+ {
+ printf("XXXXXXXXSkip netbuf entry %d mfn %lx\n",j,pfn_type[j]);
+ }
- for( j = 0, i = n; j < BATCH_SIZE && i < srec.nr_pfns ; j++, i++ )
- {
- /* write out pages in batch */
+
+ }
- if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) ||
- ((pfn_type[j] & PGT_type_mask) == L2TAB) )
+
+ printf("batch %d:%d (n=%d)\n",iter,batch,n);
+
+ if(batch == 0) goto skip; // vanishingly unlikely...
+
+ if ( (region_base = mfn_mapper_map_batch( xc_handle, domid,
+ PROT_READ,
+ pfn_type,
+ batch )) == 0)
+ {
+ PERROR("map batch failed");
+ goto out;
+ }
+
+ if ( get_pfn_type_batch(xc_handle, domid, batch, pfn_type) )
{
+ ERROR("get_pfn_type_batch failed");
+ goto out;
+ }
+
+ for( j = 0; j < batch; j++ )
+ {
+ if((pfn_type[j]>>29) == 7)
+ {
+ //printf("type fail: page %i mfn %08lx\n",j,pfn_type[j]);
+ continue;
+ }
+//if((pfn_type[j] & PGT_type_mask) == L2TAB) printf("L2 pfn=%08lx mfn=%lx\n",pfn_type[j],live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask]);
- memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
+ /* canonicalise mfn->pfn */
+ pfn_type[j] = (pfn_type[j] & PGT_type_mask) |
+ live_mfn_to_pfn_table[pfn_type[j]&~PGT_type_mask];
+ }
+
+
+ if ( (*writerfn)(writerst, &batch, sizeof(int) ) )
+ {
+ ERROR("Error when writing to state file (2)");
+ goto out;
+ }
- for ( k = 0;
- k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ?
- (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024);
- k++ )
+ if ( (*writerfn)(writerst, pfn_type, sizeof(unsigned long)*j ) )
+ {
+ ERROR("Error when writing to state file (3)");
+ goto out;
+ }
+
+ /* entering this loop, pfn_type is now in pfns (Not mfns) */
+ for( j = 0; j < batch; j++ )
+ {
+ /* write out pages in batch */
+
+ if((pfn_type[j]>>29) == 7)
{
- if ( !(page[k] & _PAGE_PRESENT) ) continue;
- mfn = page[k] >> PAGE_SHIFT;
-
- if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+ //printf("SKIP BOGUS page %i mfn %08lx\n",j,pfn_type[j]);
+ continue;
+ }
+
+ if ( ((pfn_type[j] & PGT_type_mask) == L1TAB) ||
+ ((pfn_type[j] & PGT_type_mask) == L2TAB) )
+ {
+
+ memcpy(page, region_base + (PAGE_SIZE*j), PAGE_SIZE);
+
+ for ( k = 0;
+ k < (((pfn_type[j] & PGT_type_mask) == L2TAB) ?
+ (HYPERVISOR_VIRT_START >> L2_PAGETABLE_SHIFT) : 1024);
+ k++ )
{
- ERROR("Frame number in pagetable page is invalid");
+ unsigned long pfn;
+
+ if ( !(page[k] & _PAGE_PRESENT) ) continue;
+ mfn = page[k] >> PAGE_SHIFT;
+ pfn = live_mfn_to_pfn_table[mfn];
+
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(mfn) )
+ {
+ printf("FNI %d : [%08lx,%d] pte=%08lx, mfn=%08lx, pfn=%08lx [mfn]=%08lx\n",
+ j, pfn_type[j], k,
+ page[k], mfn, live_mfn_to_pfn_table[mfn],
+ (live_mfn_to_pfn_table[mfn]<nr_pfns)?
+ live_pfn_to_mfn_table[live_mfn_to_pfn_table[mfn]]: 0xdeadbeef);
+ pfn = 0; // be suspicious
+
+// ERROR("Frame number in pagetable page is invalid");
+// goto out;
+
+
+ }
+ page[k] &= PAGE_SIZE - 1;
+ page[k] |= pfn << PAGE_SHIFT;
+
+ /*
+ printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n",
+ pfn_type[j]>>29,
+ j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
+ */
+
+ } /* end of page table rewrite for loop */
+
+ if ( (*writerfn)(writerst, page, PAGE_SIZE) )
+ {
+ ERROR("Error when writing to state file (4)");
+ goto out;
+ }
+
+ } /* end of it's a PT page */
+ else
+ { /* normal page */
+ if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) )
+ {
+ ERROR("Error when writing to state file (5)");
goto out;
}
- page[k] &= PAGE_SIZE - 1;
- page[k] |= live_mfn_to_pfn_table[mfn] << PAGE_SHIFT;
-
- /*
- printf("L%d i=%d pfn=%d mfn=%d k=%d pte=%08lx xpfn=%d\n",
- pfn_type[j]>>29,
- j,i,mfn,k,page[k],page[k]>>PAGE_SHIFT);
- */
-
}
+ } /* end of the write out for this batch */
+
+ sent_this_iter += batch;
- if ( (*writerfn)(writerst, page, PAGE_SIZE) )
- {
- ERROR("Error when writing to state file (4)");
- goto out;
- }
+ munmap(region_base, batch*PAGE_SIZE); /* unmap this batch's region before the next */
+
+ } /* end of this while loop for this iteration */
+
+ skip:
+
+ verbose_printf("\b\b\b\b100%% (%d pages)\n", sent_this_iter );
+
+ if ( last_iter )
+ break;
+ if ( live )
+ {
+ if ( sent_this_iter < (sent_last_iter * 0.95) && iter < max_iters )
+ {
+ // we seem to be doing OK, keep going
}
else
{
- if ( (*writerfn)(writerst, region_base + (PAGE_SIZE*j), PAGE_SIZE) )
- {
- ERROR("Error when writing to state file (5)");
- goto out;
- }
+ printf("Start last iteration\n");
+ last_iter = 1;
+
+ xc_domain_stop_sync( xc_handle, domid );
+
+ }
+
+ if ( xc_shadow_control( xc_handle, domid,
+ DOM0_SHADOW_CONTROL_OP_CLEAN,
+ to_send, nr_pfns ) != nr_pfns )
+ {
+ ERROR("Error flushing shadow PT");
+ goto out;
}
+
+#if 0
+ if(last_iter) memset(to_send, 0xff, (nr_pfns+7)/8 );
+#endif
+
+ sent_last_iter = sent_this_iter;
}
-
- n+=j; /* i is the master loop counter */
- }
- verbose_printf("\b\b\b\b100%%\nMemory saved.\n");
+
+ } /* end of while 1 */
+
+printf("All memory is saved\n");
/* Success! */
rc = 0;
-
+
/* Zero terminate */
if ( (*writerfn)(writerst, &rc, sizeof(int)) )
{
ERROR("Error when writing to state file (6)");
goto out;
}
-
+ /* Get the final execution context */
+ op.cmd = DOM0_GETDOMAININFO;
+ op.u.getdomaininfo.domain = (domid_t)domid;
+ op.u.getdomaininfo.ctxt = &ctxt;
+ if ( (do_dom0_op(xc_handle, &op) < 0) ||
+ ((u64)op.u.getdomaininfo.domain != domid) )
+ {
+ PERROR("Could not get info on domain");
+ goto out;
+ }
+printf("A\n");
+ /* Canonicalise the suspend-record frame number. */
+ if ( !translate_mfn_to_pfn(&ctxt.cpu_ctxt.esi) )
+ {
+ ERROR("State record is not in range of pseudophys map");
+ goto out;
+ }
+printf("B\n");
+ /* Canonicalise each GDT frame number. */
+ for ( i = 0; i < ctxt.gdt_ents; i += 512 )
+ {
+ if ( !translate_mfn_to_pfn(&ctxt.gdt_frames[i]) )
+ {
+ ERROR("GDT frame is not in range of pseudophys map");
+ goto out;
+ }
+ }
+printf("C\n");
+ /* Canonicalise the page table base pointer. */
+ if ( !MFN_IS_IN_PSEUDOPHYS_MAP(ctxt.pt_base >> PAGE_SHIFT) )
+ {
+ ERROR("PT base is not in range of pseudophys map");
+ goto out;
+ }
+ ctxt.pt_base = live_mfn_to_pfn_table[ctxt.pt_base >> PAGE_SHIFT] << PAGE_SHIFT;
+printf("D\n");
+ if ( (*writerfn)(writerst, &ctxt, sizeof(ctxt)) ||
+ (*writerfn)(writerst, live_shinfo, PAGE_SIZE) )
+ {
+ ERROR("Error when writing to state file (1)");
+ goto out;
+ }
+ munmap(live_shinfo, PAGE_SIZE);
+printf("E\n");
out:
/* Restart the domain if we had to stop it to save its state. */
if ( we_stopped_it )
/*******************/
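+
+/* Map a batch of machine frames from domain 'dom' into our address space.
+ * Returns the base of a num*PAGE_SIZE mapping, or 0 on failure.
+ * Illustrative use, mirroring the batch loop in xc_linux_save():
+ *
+ *   region_base = mfn_mapper_map_batch( xc_handle, domid, PROT_READ,
+ *                                       pfn_type, batch );
+ *   ...
+ *   munmap( region_base, batch*PAGE_SIZE );
+ */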
+void * mfn_mapper_map_batch(int xc_handle, domid_t dom, int prot,
+ unsigned long *arr, int num )
+{
+ privcmd_mmapbatch_t ioctlx;
+ void *addr;
+ addr = mmap( NULL, num*PAGE_SIZE, prot, MAP_SHARED, xc_handle, 0 );
+ if ( addr != MAP_FAILED )  /* mmap returns MAP_FAILED, not NULL, on error */
+ {
+ ioctlx.num=num;
+ ioctlx.dom=dom;
+ ioctlx.addr=(unsigned long)addr;
+ ioctlx.arr=arr;
+ if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAPBATCH, &ioctlx ) <0 )
+ {
+ perror("XXXXXXXX");
+ munmap(addr, num*PAGE_SIZE);
+ return 0;
+ }
+ }
+ else
+ return 0;
+
+ return addr;
+
+}
+
+/*******************/
+
void * mfn_mapper_map_single(int xc_handle, domid_t dom,
int size, int prot,
unsigned long mfn )
entry.mfn=mfn;
entry.npages=(size+PAGE_SIZE-1)>>PAGE_SHIFT;
if ( ioctl( xc_handle, IOCTL_PRIVCMD_MMAP, &ioctlx ) <0 )
+ {
+ munmap(addr, size);
return 0;
+ }
}
return addr;
}
hypercall.op = __HYPERVISOR_mmu_update;
hypercall.arg[0] = (unsigned long)mmu->updates;
- hypercall.arg[1] = (unsigned long)mmu->idx;
+ hypercall.arg[1] = (unsigned long)&(mmu->idx);
if ( mlock(mmu->updates, sizeof(mmu->updates)) != 0 )
{
{
return flush_mmu_updates(xc_handle, mmu);
}
+
+
+/***********************************************************/
+
+/* this function is a hack until we get proper synchronous domain stop */
+
+int xc_domain_stop_sync( int xc_handle, domid_t domid )
+{
+ dom0_op_t op;
+
+ while (1)
+ {
+ op.cmd = DOM0_STOPDOMAIN;
+ op.u.stopdomain.domain = (domid_t)domid;
+ if ( do_dom0_op(xc_handle, &op) != 0 )
+ {
+ PERROR("Stopping target domain failed");
+ goto out;
+ }
+
+ usleep(1000); // 1ms
+ printf("Sleep for 1ms\n");
+
+ op.cmd = DOM0_GETDOMAININFO;
+ op.u.getdomaininfo.domain = (domid_t)domid;
+ op.u.getdomaininfo.ctxt = NULL;
+ if ( (do_dom0_op(xc_handle, &op) < 0) ||
+ ((u64)op.u.getdomaininfo.domain != domid) )
+ {
+ PERROR("Could not get info on domain");
+ goto out;
+ }
+
+ if ( op.u.getdomaininfo.state == DOMSTATE_STOPPED )
+ {
+ printf("Domain %lld stopped\n",domid);
+ return 0;
+ }
+
+ }
+
+out:
+ return -1;
+}
void * mfn_mapper_map_single(int xc_handle, domid_t dom, int size, int prot,
unsigned long mfn );
+void * mfn_mapper_map_batch(int xc_handle, domid_t dom, int prot,
+ unsigned long *arr, int num );
+
mfn_mapper_t * mfn_mapper_init(int xc_handle, domid_t dom, int size, int prot);
void * mfn_mapper_base(mfn_mapper_t *t);
/*********************/
+int xc_domain_stop_sync( int xc_handle, domid_t dom );
#endif /* __XC_PRIVATE_H__ */
u64 dom;
char *state_file;
- int progress = 1;
+ int progress = 1, live = 0;
unsigned int flags = 0;
- static char *kwd_list[] = { "dom", "state_file", "progress", NULL };
+ static char *kwd_list[] = { "dom", "state_file", "progress", "live", NULL };
- if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|i", kwd_list,
- &dom, &state_file, &progress) )
+ if ( !PyArg_ParseTupleAndKeywords(args, kwds, "Ls|ii", kwd_list,
+ &dom, &state_file, &progress, &live) )
return NULL;
if (progress) flags |= XCFLAGS_VERBOSE;
+ if (live) flags |= XCFLAGS_LIVE;
if (strncmp(state_file,"tcp:", strlen("tcp:")) == 0)
{
&dom, &op) )
return NULL;
- if ( xc_shadow_control(xc->xc_handle, dom, op) != 0 )
+ if ( xc_shadow_control(xc->xc_handle, dom, op, NULL, 0) < 0 )
return PyErr_SetFromErrno(xc_error);
Py_INCREF(zero);
goto fail4;
}
+ xup->interface->tx_resp_prod = 0;
+ xup->interface->rx_req_prod = 0;
+ xup->interface->tx_req_prod = 0;
+ xup->interface->rx_resp_prod = 0;
+
xup->tx_req_cons = 0;
xup->tx_resp_prod = 0;
xup->rx_req_prod = 0;
p = find_domain_by_id( op->u.shadow_control.domain );
if ( p )
{
- ret = shadow_mode_control(p, op->u.shadow_control.op );
+ ret = shadow_mode_control(p, &op->u.shadow_control );
put_task_struct(p);
- }
-
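+ /* shadow_mode_control may return data in 'op' (e.g. the page count),
+ so copy the op back to the caller */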
+ copy_to_user(u_dom0_op, op, sizeof(*op));
+ }
}
break;
memset(p->shared_info, 0, PAGE_SIZE);
SHARE_PFN_WITH_DOMAIN(virt_to_page(p->shared_info), p);
+ machine_to_phys_mapping[virt_to_phys(p->shared_info) >> PAGE_SHIFT] =
+ 0x80000000UL; // set m2p table to magic marker (helps debug)
+
p->mm.perdomain_pt = (l1_pgentry_t *)get_free_page(GFP_KERNEL);
memset(p->mm.perdomain_pt, 0, PAGE_SIZE);
+ machine_to_phys_mapping[virt_to_phys(p->mm.perdomain_pt) >> PAGE_SHIFT] =
+ 0x0fffdeadUL; // set m2p table to magic marker (helps debug)
+
init_blkdev_info(p);
/* Per-domain PCI-device list. */
unsigned int alloc_new_dom_mem(struct task_struct *p, unsigned int kbytes)
{
unsigned int alloc_pfns, nr_pages;
+ struct pfn_info *page;
nr_pages = (kbytes + ((PAGE_SIZE-1)>>10)) >> (PAGE_SHIFT - 10);
p->max_pages = nr_pages; /* this can now be controlled independently */
/* grow the allocation if necessary */
for ( alloc_pfns = p->tot_pages; alloc_pfns < nr_pages; alloc_pfns++ )
{
- if ( unlikely(alloc_domain_page(p) == NULL) ||
+ if ( unlikely((page=alloc_domain_page(p)) == NULL) ||
unlikely(free_pfns < (SLACK_DOMAIN_MEM_KILOBYTES >>
(PAGE_SHIFT-10))) )
{
free_all_dom_mem(p);
return -ENOMEM;
}
+
+ /* initialise the machine_to_phys_mapping entry to the likely pfn */
+ machine_to_phys_mapping[page-frame_table] = alloc_pfns;
}
p->tot_pages = nr_pages;
belonging to the machine_to_phys_mapping to CPU0 idle task */
mfn = virt_to_phys((void *)RDWR_MPT_VIRT_START)>>PAGE_SHIFT;
-// for(i=0;i<nr_pages;i+=1024,mfn++)
+
+ /* initialise to a magic value of 0x55555555 to make bugs easier to spot later */
+ memset( machine_to_phys_mapping, 0x55, 4*1024*1024 );
+
+ /* The array is sized for a 4GB machine regardless of actual mem size.
+ This costs 4MB -- may want to fix some day */
for(i=0;i<1024*1024;i+=1024,mfn++)
{
frame_table[mfn].count_and_flags = 1 | PGC_allocated;
if ( unlikely(!get_page(page, p)) )
{
- MEM_LOG("Could not get page ref for pfn %08lx\n", page_nr);
+ MEM_LOG("Could not get page ref for pfn %08lx", page_nr);
return 0;
}
}
-int do_mmu_update(mmu_update_t *ureqs, int count)
+int do_mmu_update(mmu_update_t *ureqs, int * p_count)
{
+ int count;
mmu_update_t req;
unsigned long va = 0, deferred_ops, pfn, prev_pfn = 0;
struct pfn_info *page;
unsigned long prev_spfn = 0;
l1_pgentry_t *prev_spl1e = 0;
+ if ( unlikely( get_user(count, p_count) ) )
+ {
+ return -EFAULT;
+ }
+
perfc_incrc(calls_to_mmu_update);
perfc_addc(num_page_updates, count);
percpu_info[cpu].gps = percpu_info[cpu].pts = NULL;
}
+ if ( unlikely(rc) )
+ put_user( count, p_count );
+
return rc;
}
clear_page(new_ring);
SHARE_PFN_WITH_DOMAIN(virt_to_page(new_ring), p);
+ machine_to_phys_mapping[virt_to_phys(new_ring)>>PAGE_SHIFT] =
+ 0x80000001; // magic value aids debugging
+
/*
* Fill in the new vif struct. Note that, while the vif's refcnt is
* non-zero, we hold a reference to the task structure.
}
return work;
}
+
static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
{
int j, work=0;
}
shadow_audit(m,0);
}
- SH_LOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+ SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
}
struct shadow_status **fptr;
int i;
-
spin_lock_init(&m->shadow_lock);
spin_lock(&m->shadow_lock);
// call shadow_mk_pagetable
shadow_mk_pagetable( m );
-
return 0;
nomem:
kfree( &m->shadow_ht[0] );
}
-static void shadow_mode_table_op( struct task_struct *p, unsigned int op )
+static int shadow_mode_table_op( struct task_struct *p,
+ dom0_shadow_control_t *sc )
{
+ unsigned int op = sc->op;
struct mm_struct *m = &p->mm;
+ int rc = 0;
// since Dom0 did the hypercall, we should be running with it's page
// tables right now. Calling flush on yourself would be really
if ( m == ¤t->mm )
{
printk("Don't try and flush your own page tables!\n");
- return;
+ return -EINVAL;
}
spin_lock(&m->shadow_lock);
- SH_LOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
+ SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
shadow_audit(m,1);
break;
case DOM0_SHADOW_CONTROL_OP_CLEAN:
- __scan_shadow_table( m, op );
- // we used to bzero dirty bitmap here, but now leave this to user space
- // if we were double buffering we'd do the flip here
+ {
+ int i;
+
+ __scan_shadow_table( m, op );
+
+ if( p->tot_pages > sc->pages ||
+ !sc->dirty_bitmap || !p->mm.shadow_dirty_bitmap )
+ {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ sc->pages = p->tot_pages;
+
+#define chunk (8*1024) // process 8192 pages (1KB of bitmap) at a time, for the L1 cache
+
+ for(i=0;i<p->tot_pages;i+=chunk)
+ {
+ int bytes = (( ((p->tot_pages-i) > (chunk))?
+ (chunk):(p->tot_pages-i) ) + 7) / 8;
+
+ copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+ bytes );
+
+ memset( p->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+ 0, bytes);
+ }
+
break;
+ }
}
+
+out:
+
spin_unlock(&m->shadow_lock);
- SH_LOG("shadow mode table op : page count %d", m->shadow_page_count);
+ SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
shadow_audit(m,1);
// call shadow_mk_pagetable
shadow_mk_pagetable( m );
+ return rc;
}
-int shadow_mode_control( struct task_struct *p, unsigned int op )
+int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc )
{
int we_paused = 0;
+ unsigned int cmd = sc->op;
+ int rc = 0;
// don't call if already shadowed...
we_paused = 1;
}
- if ( p->mm.shadow_mode && op == DOM0_SHADOW_CONTROL_OP_OFF )
+ if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
{
shadow_mode_disable(p);
}
- else if ( op == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
+ else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
{
if(p->mm.shadow_mode) shadow_mode_disable(p);
shadow_mode_enable(p, SHM_test);
}
- else if ( p->mm.shadow_mode && op >= DOM0_SHADOW_CONTROL_OP_FLUSH && op<=DOM0_SHADOW_CONTROL_OP_CLEAN )
+ else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
{
- shadow_mode_table_op(p, op);
+ if(p->mm.shadow_mode) shadow_mode_disable(p);
+ shadow_mode_enable(p, SHM_logdirty);
+ }
+ else if ( p->mm.shadow_mode && cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH && cmd<=DOM0_SHADOW_CONTROL_OP_CLEAN )
+ {
+ rc = shadow_mode_table_op(p, sc);
}
else
{
}
if ( we_paused ) wake_up(p);
- return 0;
+ return rc;
}
#include <xen/interrupt.h>
#include <xen/vbd.h>
#include <xen/slab.h>
+#include <xen/shadow.h>
/*
* These are rather arbitrary. They are fairly large because adjacent requests
pfn < ((buffer + size + PAGE_SIZE - 1) >> PAGE_SHIFT);
pfn++ )
{
+
+ /* Find the domain from the frame_table. Yuk... */
+ struct task_struct *p = frame_table[pfn].u.domain;
+
+ if( p->mm.shadow_mode == SHM_logdirty )
+ mark_dirty( &p->mm, pfn );
+
+
if ( writeable_buffer )
put_page_type(&frame_table[pfn]);
put_page(&frame_table[pfn]);
+
}
}
p->blk_ring_base = (blk_ring_t *)get_free_page(GFP_KERNEL);
clear_page(p->blk_ring_base);
SHARE_PFN_WITH_DOMAIN(virt_to_page(p->blk_ring_base), p);
+
+ machine_to_phys_mapping[virt_to_phys(p->blk_ring_base)>>PAGE_SHIFT] =
+ 0x80000002; // magic value aids debugging
+
p->blkdev_list.next = NULL;
spin_lock_init(&p->vbd_lock);
}
struct shadow_status *shadow_ht;
struct shadow_status *shadow_ht_free;
struct shadow_status *shadow_ht_extras; /* extra allocation units */
- unsigned int *shadow_dirty_bitmap;
+ unsigned long *shadow_dirty_bitmap;
unsigned int shadow_dirty_bitmap_size; /* in pages, bit per page */
unsigned int shadow_page_count;
unsigned int shadow_max_page_count;
/* IN variables. */
domid_t domain;
int op;
+ unsigned long *dirty_bitmap; // pointer to mlocked buffer
+ /* IN/OUT variables */
+ unsigned long pages; // size of buffer, updated with actual size
} dom0_shadow_control_t;
#define DOM0_SETDOMAINNAME 26
unlikely(x & PGC_zombie) || /* Zombie? */
unlikely(p != domain) ) /* Wrong owner? */
{
- DPRINTK("Error pfn %08lx: ed=%p(%lld), sd=%p(%lld), caf=%08x\n",
- page_to_pfn(page), domain, (domain)?domain->domain:1234, p, (p)?p->domain:1234, x);
+ DPRINTK("Error pfn %08lx: ed=%p(%lld), sd=%p(%lld), caf=%08x, taf=%08x\n",
+ page_to_pfn(page), domain, (domain)?domain->domain:999, p, (p && !((x & PGC_count_mask) == 0))?p->domain:999, x, page->type_and_flags);
return 0;
}
__asm__ __volatile__(
#define machine_to_phys_mapping ((unsigned long *)RDWR_MPT_VIRT_START)
/* Part of the domain API. */
-int do_mmu_update(mmu_update_t *updates, int count);
+int do_mmu_update(mmu_update_t *updates, int *count);
#define DEFAULT_GDT_ENTRIES ((LAST_RESERVED_GDT_ENTRY*8)+7)
#define DEFAULT_GDT_ADDRESS ((unsigned long)gdt_table)
#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
-extern int shadow_mode_control( struct task_struct *p, unsigned int op );
+extern int shadow_mode_control( struct task_struct *p, dom0_shadow_control_t *sc );
extern int shadow_fault( unsigned long va, long error_code );
extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
unsigned long *prev_spfn_ptr,
#ifndef NDEBUG
#define SH_LOG(_f, _a...) \
- printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \
+ printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
current->domain , __LINE__ , ## _a )
#else
#define SH_LOG(_f, _a...)
#if SHADOW_DEBUG
#define SH_VLOG(_f, _a...) \
- printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \
+ printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
current->domain , __LINE__ , ## _a )
#else
#define SH_VLOG(_f, _a...)
#if 0
#define SH_VVLOG(_f, _a...) \
- printk("DOM%llu: (file=shadow.c, line=%d) " _f "\n", \
+ printk("DOM%lld: (file=shadow.c, line=%d) " _f "\n", \
current->domain , __LINE__ , ## _a )
#else
#define SH_VVLOG(_f, _a...)
#endif
-
/************************************************************************/
static inline void mark_dirty( struct mm_struct *m, unsigned int mfn )
{
- unsigned int pfn = machine_to_phys_mapping[mfn];
+ unsigned int pfn;
+
+ pfn = machine_to_phys_mapping[mfn];
+
+ /* We use values with the top bit set to mark MFNs that aren't
+ really part of the domain's pseudo-physical memory map, e.g.
+ the shared info frame. Nothing to do here...
+ */
+ if ( unlikely(pfn & 0x80000000U) ) return;
+
ASSERT(m->shadow_dirty_bitmap);
if( likely(pfn<m->shadow_dirty_bitmap_size) )
{
}
else
{
- SH_LOG("mark_dirty pfn out of range attempt!");
+ extern void show_traceX(void);
+ SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
+ mfn, pfn, m->shadow_dirty_bitmap_size, m );
+ SH_LOG("dom=%lld caf=%08x taf=%08x\n",
+ frame_table[mfn].u.domain->domain,
+ frame_table[mfn].count_and_flags,
+ frame_table[mfn].type_and_flags );
+ //show_traceX();
}
}
spte = gpte;
gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
- mark_dirty( m, gpte >> PAGE_SHIFT );
+ mark_dirty( m, (gpte >> PAGE_SHIFT) );
break;
}
if( m->shadow_mode == SHM_logdirty )
mark_dirty( m, gpfn );
-
+
spin_lock(&m->shadow_lock);
res = __shadow_status( m, gpfn );
if (!res) spin_unlock(&m->shadow_lock);
goto out;
}
+ machine_to_phys_mapping[new_page - frame_table] =
+ machine_to_phys_mapping[old_page - frame_table];
+
if ( p->mm.shadow_mode &&
(spte_pfn=get_shadow_status(&p->mm, pte_page-frame_table)) )
{
*sptr = new_pte;
unmap_domain_mem(sptr);
- if( p->mm.shadow_mode == SHM_logdirty )
- mark_dirty( &p->mm, new_page-frame_table );
-
put_shadow_status(&p->mm);
}
-
- machine_to_phys_mapping[new_page - frame_table]
- = machine_to_phys_mapping[old_page - frame_table];
unmap_domain_mem(ptep);
+ /* if in shadow mode, mark the buffer as dirty */
+ if( p->mm.shadow_mode == SHM_logdirty )
+ mark_dirty( &p->mm, (new_page-frame_table) );
+
/* Updates must happen before releasing the descriptor. */
smp_wmb();
put_page_and_type(pte_page);
make_rx_response(vif, rx.id, 0, RING_STATUS_BAD_PAGE, 0);
goto rx_unmap_and_continue;
-
- /* XXX IAP should SHADOW_CONFIG do something here? */
}
/*
0) !=
(PGC_allocated | PGC_tlb_flush_on_type_change | 2)) )
{
- DPRINTK("Page held more than once %08x %s\n",
+ DPRINTK("Page held more than once mfn=%x %08x %s\n",
+ buf_page-frame_table,
buf_page->count_and_flags,
(buf_page->u.domain)?buf_page->u.domain->name:"None");
+
if ( !get_page_type(buf_page, PGT_writeable_page) )
put_page(buf_page);
else if ( cmpxchg(ptep, pte & ~_PAGE_PRESENT, pte) !=
put_page_and_type(&frame_table[rx->pte_ptr >> PAGE_SHIFT]);
+ /* if in shadow mode, mark the PTE as dirty */
+ if( p->mm.shadow_mode == SHM_logdirty )
+ mark_dirty( &p->mm, rx->pte_ptr>>PAGE_SHIFT );
+ /* assume the shadow page table is about to be blown away,
+ and that it's not worth marking the buffer as dirty */
+
+
make_rx_response(vif, rx->id, 0, RING_STATUS_DROPPED, 0);
}
vif->rx_cons = i;
{
block_io_op_t op;
- nr_pending = 0;
-
op.cmd = BLOCK_IO_OP_RESET;
if ( HYPERVISOR_block_io_op(&op) != 0 )
printk(KERN_ALERT "Possible blkdev trouble: couldn't reset ring\n");
{
int error;
+ nr_pending = 0;
+
reset_xlblk_interface();
xlblk_response_irq = bind_virq_to_irq(VIRQ_BLKDEV);
if (msg[j].va + (msg[j].npages<<PAGE_SHIFT) > vma->vm_end)
return -EINVAL;
- if (rc = direct_remap_area_pages(vma->vm_mm,
+ if ( (rc = direct_remap_area_pages(vma->vm_mm,
msg[j].va&PAGE_MASK,
msg[j].mfn<<PAGE_SHIFT,
msg[j].npages<<PAGE_SHIFT,
vma->vm_page_prot,
- mmapcmd.dom))
+ mmapcmd.dom)) <0)
return rc;
}
}
}
break;
+ case IOCTL_PRIVCMD_MMAPBATCH:
+ {
+#define MAX_DIRECTMAP_MMU_QUEUE 130
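+
+ /* Map an array of arbitrary machine frames belonging to domain m.dom
+ into the calling process's address space at m.addr. One mmu_update
+ entry is queued per page (after the optional SET_SUBJECTDOM pair) and
+ flushed to Xen; frames that Xen refuses to map are reported back by
+ setting the top nibble of the corresponding entry in the user's mfn
+ array. */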
+ mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
+ privcmd_mmapbatch_t m;
+ struct vm_area_struct *vma = NULL;
+ unsigned long *p, addr;
+ unsigned long mfn;
+ int i;
+
+ if ( copy_from_user(&m, (void *)data, sizeof(m)) )
+ { ret = -EFAULT; goto batch_err; }
+
+ vma = find_vma( current->mm, m.addr );
+
+ if (!vma)
+ { ret = -EINVAL; goto batch_err; }
+
+ if (m.addr > PAGE_OFFSET)
+ { ret = -EFAULT; goto batch_err; }
+
+ if (m.addr + (m.num<<PAGE_SHIFT) > vma->vm_end)
+ { ret = -EFAULT; goto batch_err; }
+
+ // everything fits inside the vma
+
+//printk("direct_r_a_p sx=%ld address=%lx macaddr=%lx dom=%lld\n",size,address,machine_addr,domid);
+// memset( u, 0, sizeof(mmu_update_t)*MAX_DIRECTMAP_MMU_QUEUE );// XXX
+
+
+ if ( m.dom != 0 )
+ {
+ u[0].val = (unsigned long)(m.dom<<16) & ~0xFFFFUL;
+ u[0].ptr = (unsigned long)(m.dom<< 0) & ~0xFFFFUL;
+ u[1].val = (unsigned long)(m.dom>>16) & ~0xFFFFUL;
+ u[1].ptr = (unsigned long)(m.dom>>32) & ~0xFFFFUL;
+ u[0].ptr |= MMU_EXTENDED_COMMAND;
+ u[0].val |= MMUEXT_SET_SUBJECTDOM_L;
+ u[1].ptr |= MMU_EXTENDED_COMMAND;
+ u[1].val |= MMUEXT_SET_SUBJECTDOM_H;
+ v = w = &u[2];
+ }
+ else
+ {
+ v = w = &u[0];
+ }
+
+ p = m.arr;
+ addr = m.addr;
+//printk("BATCH: arr=%p addr=%lx num=%d u=%p,w=%p\n",p,addr,m.num,u,w);
+ for (i=0; i<m.num; i++, addr+=PAGE_SIZE, p++)
+ {
+ unsigned int count;
+ if ( get_user(mfn, p) ) return -EFAULT;
+
+ v->val = (mfn << PAGE_SHIFT) | pgprot_val(vma->vm_page_prot) |
+ _PAGE_IO;
+
+ __direct_remap_area_pages( vma->vm_mm,
+ addr,
+ PAGE_SIZE,
+ v);
+ v++;
+ count = v-u;
+//printk("Q i=%d mfn=%x co=%d v=%p : %lx %lx\n",i,mfn,count,v, w->val,w->ptr);
+
+ if ( HYPERVISOR_mmu_update(u, &count) < 0 )
+ {
+ //printk("Fail %d->%d mfn=%lx\n",v-u,count, w->val);
+ put_user( 0xe0000000 | mfn, p );
+ }
+ v=w;
+ }
+ ret = 0;
+ break;
+
+ batch_err:
+ printk("batch_err ret=%d vma=%p addr=%lx num=%d arr=%lx %lx-%lx\n",
+ ret, vma, m.addr, m.num, m.arr, vma->vm_start, vma->vm_end);
+ break;
+ }
+ break;
+
+
+
default:
ret = -EINVAL;
break;
np->net_ring->rx_ring[MASK_NET_RX_IDX(i)].req.addr =
virt_to_machine(get_ppte(skb->head));
+ /* Shadow optimisation: disown this page from p->m map */
+ phys_to_machine_mapping[virt_to_phys(skb->head)>>PAGE_SHIFT] = 0x80000004;
np->rx_bufs_to_notify++;
}
while ( (++i - np->rx_resp_cons) != XENNET_RX_RING_SIZE );
skb = np->rx_skbs[rx->id];
ADD_ID_TO_FREELIST(np->rx_skbs, rx->id);
+ phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
+ (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
+
if ( unlikely(rx->status != RING_STATUS_OK) )
{
/* Gate this error. We get a (valid) slew of them on suspend. */
skb_shinfo(skb)->nr_frags = 0;
skb_shinfo(skb)->frag_list = NULL;
- phys_to_machine_mapping[virt_to_phys(skb->head) >> PAGE_SHIFT] =
- (*(unsigned long *)get_ppte(skb->head)) >> PAGE_SHIFT;
-
skb->data = skb->tail = skb->head + rx->offset;
skb_put(skb, rx->size);
skb->protocol = eth_type_trans(skb, dev);
virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
suspend_record->nr_pfns = max_pfn;
- j = 0;
- for ( i = 0; i < max_pfn; i += (PAGE_SIZE / sizeof(unsigned long)) )
- pfn_to_mfn_frame_list[j++] =
+ for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+ {
+ pfn_to_mfn_frame_list[j] =
virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
-
+ }
/*
* NB. This is /not/ a full dev_close() as that loses route information!
* Instead we do essentialy the same as dev_close() but without notifying
memcpy(&start_info, &suspend_record->resume_info, sizeof(start_info));
set_fixmap(FIX_SHARED_INFO, start_info.shared_info);
+
HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+
memset(empty_zero_page, 0, PAGE_SIZE);
irq_resume();
#include <linux/smp.h>
#include <linux/irq.h>
#include <linux/sysctl.h>
+#include <linux/sysrq.h>
spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
extern rwlock_t xtime_lock;
timer->expires,(u32)(t_st>>32), (u32)t_st);
printk(KERN_ALERT "time: processed_system_time=0x%X:%08X\n",
(u32)(processed_system_time>>32), (u32)processed_system_time);
+
+
+ handle_sysrq('t',NULL,NULL,NULL);
+
}
static struct irqaction dbg_time = {
__asm__ __volatile__ ( "sldt %0" : "=r" (ldt) );
if ( ldt == 0 )
{
- mmu_update_t u;
- u.ptr = MMU_EXTENDED_COMMAND;
- u.ptr |= (unsigned long)&default_ldt[0];
- u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT);
- if ( unlikely(HYPERVISOR_mmu_update(&u, 1) < 0) )
- {
- show_trace(NULL);
- panic("Failed to install default LDT");
- }
- return;
+ int count = 1;
+ mmu_update_t u;
+ u.ptr = MMU_EXTENDED_COMMAND;
+ u.ptr |= (unsigned long)&default_ldt[0];
+ u.val = MMUEXT_SET_LDT | (5 << MMUEXT_CMD_SHIFT);
+ if ( unlikely(HYPERVISOR_mmu_update(&u, &count) < 0) )
+ {
+ show_trace(NULL);
+ panic("Failed to install default LDT");
+ }
+ return;
}
}
int i;
for ( i = idx-1; i >= 0; i-- )
{
+ int count = 1;
pte = update_debug_queue[i].ptep;
if ( pte == NULL ) continue;
update_debug_queue[i].ptep = NULL;
update.ptr = virt_to_machine(pte);
update.val = update_debug_queue[i].pteval;
- HYPERVISOR_mmu_update(&update, 1);
+ HYPERVISOR_mmu_update(&update, &count);
}
}
static void DEBUG_disallow_pt_read(unsigned long va)
pmd_t *pmd;
pgd_t *pgd;
unsigned long pteval;
+ int count = 1;
/*
* We may fault because of an already outstanding update.
* That's okay -- it'll get fixed up in the fault handler.
update.ptr = virt_to_machine(pte);
pteval = *(unsigned long *)pte;
update.val = pteval & ~_PAGE_PRESENT;
- HYPERVISOR_mmu_update(&update, 1);
+ HYPERVISOR_mmu_update(&update, &count);
update_debug_queue[idx].ptep = pte;
update_debug_queue[idx].pteval = pteval;
}
wmb(); /* Make sure index is cleared first to avoid double updates. */
queue_multicall2(__HYPERVISOR_mmu_update,
(unsigned long)update_queue,
- _idx);
+ &_idx);
}
spin_unlock_irqrestore(&update_lock, flags);
}
#endif
idx = 0;
wmb(); /* Make sure index is cleared first to avoid double updates. */
- if ( unlikely(HYPERVISOR_mmu_update(update_queue, _idx) < 0) )
+ if ( unlikely(HYPERVISOR_mmu_update(update_queue, &_idx) < 0) )
panic("Failed to execute MMU updates");
}
#define direct_mk_pte_phys(physpage, pgprot) \
__direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
-static inline int direct_remap_area_pte(pte_t *pte,
+static inline void direct_remap_area_pte(pte_t *pte,
unsigned long address,
unsigned long size,
- unsigned long machine_addr,
- pgprot_t prot,
- domid_t domid)
+ mmu_update_t **v)
{
unsigned long end;
-#define MAX_DIRECTMAP_MMU_QUEUE 130
- mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v, *w;
address &= ~PMD_MASK;
end = address + size;
if (address >= end)
BUG();
- /* If not I/O mapping then specify General-Purpose Subject Domain (GPS). */
- if ( domid != 0 )
- {
- u[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL;
- u[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL;
- u[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL;
- u[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL;
- u[0].ptr |= MMU_EXTENDED_COMMAND;
- u[0].val |= MMUEXT_SET_SUBJECTDOM_L;
- u[1].ptr |= MMU_EXTENDED_COMMAND;
- u[1].val |= MMUEXT_SET_SUBJECTDOM_H;
- v = w = &u[2];
- }
- else
- {
- v = w = &u[0];
- }
-
do {
- if ( (v-u) == MAX_DIRECTMAP_MMU_QUEUE )
- {
- if ( HYPERVISOR_mmu_update(u, MAX_DIRECTMAP_MMU_QUEUE) < 0 )
- return -EFAULT;
- v = w;
- }
-#if 0 /* thanks to new ioctl mmaping interface this is no longer a bug */
+#if 0 // XXX
if (!pte_none(*pte)) {
printk("direct_remap_area_pte: page already exists\n");
BUG();
}
#endif
- v->ptr = virt_to_machine(pte);
- v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO;
- v++;
+ (*v)->ptr = virt_to_machine(pte);
+ (*v)++;
address += PAGE_SIZE;
- machine_addr += PAGE_SIZE;
pte++;
} while (address && (address < end));
-
- if ( ((v-w) != 0) && (HYPERVISOR_mmu_update(u, v-u) < 0) )
- return -EFAULT;
-
- return 0;
+ return ;
}
static inline int direct_remap_area_pmd(struct mm_struct *mm,
pmd_t *pmd,
unsigned long address,
unsigned long size,
- unsigned long machine_addr,
- pgprot_t prot,
- domid_t domid)
+ mmu_update_t **v)
{
- int error = 0;
unsigned long end;
address &= ~PGDIR_MASK;
end = address + size;
if (end > PGDIR_SIZE)
end = PGDIR_SIZE;
- machine_addr -= address;
if (address >= end)
BUG();
do {
pte_t * pte = pte_alloc(mm, pmd, address);
if (!pte)
return -ENOMEM;
- error = direct_remap_area_pte(pte, address, end - address,
- address + machine_addr, prot, domid);
- if ( error )
- break;
+ direct_remap_area_pte(pte, address, end - address, v);
+
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address && (address < end));
- return error;
+ return 0;
}
-int direct_remap_area_pages(struct mm_struct *mm,
- unsigned long address,
- unsigned long machine_addr,
- unsigned long size,
- pgprot_t prot,
- domid_t domid)
+int __direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t *v)
{
- int error = 0;
pgd_t * dir;
unsigned long end = address + size;
- machine_addr -= address;
dir = pgd_offset(mm, address);
flush_cache_all();
if (address >= end)
spin_lock(&mm->page_table_lock);
do {
pmd_t *pmd = pmd_alloc(mm, dir, address);
- error = -ENOMEM;
if (!pmd)
- break;
- error = direct_remap_area_pmd(mm, pmd, address, end - address,
- machine_addr + address, prot, domid);
- if (error)
- break;
+ return -ENOMEM;
+ direct_remap_area_pmd(mm, pmd, address, end - address, &v);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
+
} while (address && (address < end));
spin_unlock(&mm->page_table_lock);
flush_tlb_all();
- return error;
+ return 0;
}
+
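+/*
+ * direct_remap_area_pages() now builds mmu_update requests in two stages:
+ * this function fills in the new machine addresses (the 'val' fields),
+ * while __direct_remap_area_pages() walks the page tables and fills in
+ * the 'ptr' fields with the machine addresses of the ptes. The queue is
+ * flushed to Xen via HYPERVISOR_mmu_update() whenever it reaches
+ * MAX_DIRECTMAP_MMU_QUEUE entries, and once more at the end.
+ */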
+int direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long machine_addr,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid)
+{
+ int i, count;
+ unsigned long start_address;
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+ mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *w, *v;
+
+ if ( domid != 0 )
+ {
+ u[0].val = (unsigned long)(domid<<16) & ~0xFFFFUL;
+ u[0].ptr = (unsigned long)(domid<< 0) & ~0xFFFFUL;
+ u[1].val = (unsigned long)(domid>>16) & ~0xFFFFUL;
+ u[1].ptr = (unsigned long)(domid>>32) & ~0xFFFFUL;
+ u[0].ptr |= MMU_EXTENDED_COMMAND;
+ u[0].val |= MMUEXT_SET_SUBJECTDOM_L;
+ u[1].ptr |= MMU_EXTENDED_COMMAND;
+ u[1].val |= MMUEXT_SET_SUBJECTDOM_H;
+ v = w = &u[2];
+ }
+ else
+ {
+ v = w = &u[0];
+ }
+
+ start_address = address;
+
+ for(i=0; i<size;
+ i+=PAGE_SIZE, machine_addr+=PAGE_SIZE, address+=PAGE_SIZE, v++)
+ {
+ if( (v-u) == MAX_DIRECTMAP_MMU_QUEUE )
+ {
+ /* get the ptep's filled in */
+ __direct_remap_area_pages( mm,
+ start_address,
+ address-start_address,
+ w);
+
+ count = v-u;
+ if ( HYPERVISOR_mmu_update(u, &count) < 0 )
+ return -EFAULT;
+ v=w;
+ start_address = address;
+ }
+
+ /* fill in the machine addresses */
+ v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot) | _PAGE_IO;
+ }
+
+ if(v!=w)
+ {
+ /* get the ptep's filled in */
+ __direct_remap_area_pages( mm,
+ start_address,
+ address-start_address,
+ w);
+ count = v-u;
+ if ( HYPERVISOR_mmu_update(u, &count) < 0 )
+ return -EFAULT;
+
+ }
+
+ return 0;
+}
+
+
#endif /* CONFIG_XEN_PRIVILEGED_GUEST */
return ret;
}
-static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int count)
+static inline int HYPERVISOR_mmu_update(mmu_update_t *req, int *count)
{
int ret;
__asm__ __volatile__ (
pgprot_t prot,
domid_t domid);
+extern int __direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t *v);
+
+
+
#endif /* _I386_PGALLOC_H */
privcmd_mmap_entry_t *entry;
} privcmd_mmap_t;
+typedef struct privcmd_mmapbatch {
+ int num; // number of pages to populate
+ domid_t dom; // target domain
+ unsigned long addr; // virtual address
+ unsigned long *arr; // array of mfns - top nibble set on err
+} privcmd_mmapbatch_t;
+
typedef struct privcmd_blkmsg
{
unsigned long op;
_IOC(_IOC_NONE, 'P', 1, 0)
#define IOCTL_PRIVCMD_MMAP \
_IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t))
+#define IOCTL_PRIVCMD_MMAPBATCH \
+ _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmapbatch_t))
#endif /* __PROC_CMD_H__ */