libxc/restore: support COLO restore
Author:     Wen Congyang <wency@cn.fujitsu.com>
AuthorDate: Wed, 15 Jul 2015 09:18:44 +0000 (17:18 +0800)
Commit:     Changlong Xie <xiecl.fnst@cn.fujitsu.com>
CommitDate: Fri, 1 Apr 2016 03:07:29 +0000 (11:07 +0800)
a. Call the resume/checkpoint/suspend callbacks while the secondary
   VM's state is consistent with the primary's (an illustrative callback
   sketch follows the changed-file list below).
b. Send the dirty pfn list back to the primary at each checkpoint under COLO.
c. Send the store gfn and console gfn to xl before resuming the secondary VM.

Signed-off-by: Yang Hongyang <hongyang.yang@easystack.cn>
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Signed-off-by: Changlong Xie <xiecl.fnst@cn.fujitsu.com>
CC: Andrew Cooper <andrew.cooper3@citrix.com>
Acked-by: Ian Jackson <ian.jackson@eu.citrix.com>
tools/libxc/include/xenguest.h
tools/libxc/xc_sr_common.h
tools/libxc/xc_sr_restore.c
tools/libxl/libxl_colo_restore.c
tools/libxl/libxl_create.c
tools/libxl/libxl_save_msgs_gen.pl

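For orientation before the diff itself: a minimal, hypothetical sketch of how a
restore-side caller (in practice the libxl save/restore helper) might wire up
the callbacks that the new COLO path requires. Everything named colo_* below is
invented for illustration; only the restore_callbacks member names, their
signatures, and the return-value convention (1 = success, 2 = broken channel,
anything else = error) are taken from the hunks that follow.

#include <xenctrl.h>
#include <xenguest.h>

/* Per-restore state the callbacks share; purely illustrative. */
struct colo_ctx {
    xen_pfn_t store_gfn;
    xen_pfn_t console_gfn;
};

static int colo_suspend(void *data)         { return 1; } /* suspend secondary */
static int colo_resume(void *data)          { return 1; } /* resume secondary  */
static int colo_checkpoint(void *data)      { return 1; } /* new checkpoint    */
static int colo_wait_checkpoint(void *data) { return 1; } /* wait for one      */

/*
 * New in this patch: xc_domain_restore() reports the store/console gfns
 * through this callback so xl can finish domain construction before the
 * secondary VM is resumed.
 */
static void colo_restore_results(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
                                 void *data)
{
    struct colo_ctx *c = data;

    c->store_gfn = store_gfn;
    c->console_gfn = console_gfn;
}

static struct colo_ctx colo_ctx;
static struct restore_callbacks colo_cbs = {
    .suspend         = colo_suspend,
    .postcopy        = colo_resume,
    .checkpoint      = colo_checkpoint,
    .wait_checkpoint = colo_wait_checkpoint,
    .restore_results = colo_restore_results,  /* added by this patch */
    .data            = &colo_ctx,
};
/*
 * &colo_cbs would then be passed to xc_domain_restore() together with
 * stream_type == XC_MIG_STREAM_COLO and the back-channel send_back_fd.
 */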
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 8ea5a3c93787e170278c739ecf264e2a2775e524..40902ee4c9d959258e04fe9f749eda55a4eb1a71 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -133,6 +133,14 @@ struct restore_callbacks {
      */
     int (*wait_checkpoint)(void* data);
 
+    /*
+     * Callback to send the store gfn and console gfn to xl,
+     * so that the vm can be resumed before xc_domain_restore()
+     * exits.
+     */
+    void (*restore_results)(xen_pfn_t store_gfn, xen_pfn_t console_gfn,
+                            void *data);
+
     /* to be provided as the last argument to each callback function */
     void* data;
 };
diff --git a/tools/libxc/xc_sr_common.h b/tools/libxc/xc_sr_common.h
index c990664d7c0b2ddb87fb6ac7a06e11ee76c7103a..cf32ab80ffc6c64b701a6076153a028d13bb0928 100644
--- a/tools/libxc/xc_sr_common.h
+++ b/tools/libxc/xc_sr_common.h
@@ -214,6 +214,10 @@ struct xc_sr_context
             struct xc_sr_restore_ops ops;
             struct restore_callbacks *callbacks;
 
+            int send_back_fd;
+            unsigned long p2m_size;
+            xc_hypercall_buffer_t dirty_bitmap_hbuf;
+
             /* From Image Header. */
             uint32_t format_version;
 
@@ -222,13 +226,13 @@ struct xc_sr_context
             uint32_t guest_page_size;
 
             /* Plain VM, or checkpoints over time. */
-            bool checkpointed;
+            int checkpointed;
 
             /* Currently buffering records between a checkpoint */
             bool buffer_all_records;
 
 /*
- * With Remus, we buffer the records sent by the primary at checkpoint,
+ * With Remus/COLO, we buffer the records sent by the primary at checkpoint,
  * in case the primary will fail, we can recover from the last
  * checkpoint state.
  * This should be enough for most of the cases because primary only send
diff --git a/tools/libxc/xc_sr_restore.c b/tools/libxc/xc_sr_restore.c
index 3e4ca7ffdd59c794ea0165bbdfd69663bd97fddf..728edbc009356672a7706765ca3d14e2c6cd7691 100644
--- a/tools/libxc/xc_sr_restore.c
+++ b/tools/libxc/xc_sr_restore.c
@@ -411,6 +411,92 @@ static int handle_page_data(struct xc_sr_context *ctx, struct xc_sr_record *rec)
     return rc;
 }
 
+/*
+ * Send checkpoint dirty pfn list to primary.
+ */
+static int send_checkpoint_dirty_pfn_list(struct xc_sr_context *ctx)
+{
+    xc_interface *xch = ctx->xch;
+    int rc = -1;
+    unsigned count, written;
+    uint64_t i, *pfns = NULL;
+    struct iovec *iov = NULL;
+    xc_shadow_op_stats_t stats = { 0, ctx->restore.p2m_size };
+    struct xc_sr_record rec =
+    {
+        .type = REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST,
+    };
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    if ( xc_shadow_control(
+             xch, ctx->domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+             HYPERCALL_BUFFER(dirty_bitmap), ctx->restore.p2m_size,
+             NULL, 0, &stats) != ctx->restore.p2m_size )
+    {
+        PERROR("Failed to retrieve logdirty bitmap");
+        goto err;
+    }
+
+    for ( i = 0, count = 0; i < ctx->restore.p2m_size; i++ )
+    {
+        if ( test_bit(i, dirty_bitmap) )
+            count++;
+    }
+
+
+    pfns = malloc(count * sizeof(*pfns));
+    if ( !pfns )
+    {
+        ERROR("Unable to allocate %zu bytes of memory for dirty pfn list",
+              count * sizeof(*pfns));
+        goto err;
+    }
+
+    for ( i = 0, written = 0; i < ctx->restore.p2m_size; ++i )
+    {
+        if ( !test_bit(i, dirty_bitmap) )
+            continue;
+
+        if ( written >= count )
+        {
+            ERROR("Dirty pfn list overflow");
+            goto err;
+        }
+
+        pfns[written++] = i;
+    }
+
+    /* iovec[] for writev(). */
+    iov = malloc(3 * sizeof(*iov));
+    if ( !iov )
+    {
+        ERROR("Unable to allocate memory for sending dirty bitmap");
+        goto err;
+    }
+
+    rec.length = count * sizeof(*pfns);
+
+    iov[0].iov_base = &rec.type;
+    iov[0].iov_len = sizeof(rec.type);
+
+    iov[1].iov_base = &rec.length;
+    iov[1].iov_len = sizeof(rec.length);
+
+    iov[2].iov_base = pfns;
+    iov[2].iov_len = count * sizeof(*pfns);
+
+    if ( writev_exact(ctx->restore.send_back_fd, iov, 3) )
+    {
+        PERROR("Failed to write dirty bitmap to stream");
+        goto err;
+    }
+
+    rc = 0;
+ err:
+    return rc;
+}
+
 static int process_record(struct xc_sr_context *ctx, struct xc_sr_record *rec);
 static int handle_checkpoint(struct xc_sr_context *ctx)
 {
@@ -460,6 +546,53 @@ static int handle_checkpoint(struct xc_sr_context *ctx)
     else
         ctx->restore.buffer_all_records = true;
 
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+#define HANDLE_CALLBACK_RETURN_VALUE(ret)                   \
+    do {                                                    \
+        if ( ret == 1 )                                     \
+            rc = 0; /* Success */                           \
+        else                                                \
+        {                                                   \
+            if ( ret == 2 )                                 \
+                rc = BROKEN_CHANNEL;                        \
+            else                                            \
+                rc = -1; /* Some unspecified error */       \
+            goto err;                                       \
+        }                                                   \
+    } while (0)
+
+        /* COLO */
+
+        /* We need to resume the guest */
+        rc = ctx->restore.ops.stream_complete(ctx);
+        if ( rc )
+            goto err;
+
+        ctx->restore.callbacks->restore_results(ctx->restore.xenstore_gfn,
+                                                ctx->restore.console_gfn,
+                                                ctx->restore.callbacks->data);
+
+        /* Resume the secondary vm */
+        ret = ctx->restore.callbacks->postcopy(ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+        /* Wait for a new checkpoint */
+        ret = ctx->restore.callbacks->wait_checkpoint(
+                                                ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+        /* Suspend the secondary vm */
+        ret = ctx->restore.callbacks->suspend(ctx->restore.callbacks->data);
+        HANDLE_CALLBACK_RETURN_VALUE(ret);
+
+#undef HANDLE_CALLBACK_RETURN_VALUE
+
+        rc = send_checkpoint_dirty_pfn_list(ctx);
+        if ( rc )
+            goto err;
+    }
+
  err:
     return rc;
 }
@@ -529,6 +662,21 @@ static int setup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     int rc;
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
+
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+        dirty_bitmap = xc_hypercall_buffer_alloc_pages(xch, dirty_bitmap,
+                                NRPAGES(bitmap_size(ctx->restore.p2m_size)));
+
+        if ( !dirty_bitmap )
+        {
+            ERROR("Unable to allocate memory for dirty bitmap");
+            rc = -1;
+            goto err;
+        }
+    }
 
     rc = ctx->restore.ops.setup(ctx);
     if ( rc )
@@ -562,10 +710,15 @@ static void cleanup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     unsigned i;
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap,
+                                    &ctx->restore.dirty_bitmap_hbuf);
 
     for ( i = 0; i < ctx->restore.buffered_rec_num; i++ )
         free(ctx->restore.buffered_records[i].data);
 
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+        xc_hypercall_buffer_free_pages(xch, dirty_bitmap,
+                                   NRPAGES(bitmap_size(ctx->restore.p2m_size)));
     free(ctx->restore.buffered_records);
     free(ctx->restore.populated_pfns);
     if ( ctx->restore.ops.cleanup(ctx) )
@@ -631,6 +784,15 @@ static int restore(struct xc_sr_context *ctx)
     } while ( rec.type != REC_TYPE_END );
 
  remus_failover:
+
+    if ( ctx->restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+        /* With COLO, we have already called stream_complete */
+        rc = 0;
+        IPRINTF("COLO Failover");
+        goto done;
+    }
+
     /*
      * With Remus, if we reach here, there must be some error on primary,
      * failover from the last checkpoint state.
@@ -667,6 +829,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
                       xc_migration_stream_t stream_type,
                       struct restore_callbacks *callbacks, int send_back_fd)
 {
+    xen_pfn_t nr_pfns;
     struct xc_sr_context ctx =
         {
             .xch = xch,
@@ -680,11 +843,21 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     ctx.restore.xenstore_domid = store_domid;
     ctx.restore.checkpointed = stream_type;
     ctx.restore.callbacks = callbacks;
+    ctx.restore.send_back_fd = send_back_fd;
 
     /* Sanity checks for callbacks. */
     if ( stream_type )
         assert(callbacks->checkpoint);
 
+    if ( ctx.restore.checkpointed == XC_MIG_STREAM_COLO )
+    {
+        /* This is a COLO restore */
+        assert(callbacks->suspend &&
+               callbacks->postcopy &&
+               callbacks->wait_checkpoint &&
+               callbacks->restore_results);
+    }
+
     DPRINTF("fd %d, dom %u, hvm %u, pae %u, superpages %d"
             ", stream_type %d", io_fd, dom, hvm, pae,
             superpages, stream_type);
@@ -706,6 +879,14 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
     if ( read_headers(&ctx) )
         return -1;
 
+    if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
+    {
+        PERROR("Unable to obtain the guest p2m size");
+        return -1;
+    }
+
+    ctx.restore.p2m_size = nr_pfns;
+
     if ( ctx.dominfo.hvm )
     {
         ctx.restore.ops = restore_ops_x86_hvm;
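
The CHECKPOINT_DIRTY_PFN_LIST record built above is written back to the primary
over send_back_fd; the save-side code that consumes it belongs to a separate
patch and is not shown here. As a rough, hypothetical sketch of the wire format
only -- a 32-bit record type, a 32-bit length, then length/8 64-bit pfns,
exactly as assembled in the three iovecs of send_checkpoint_dirty_pfn_list() --
a receiver might look like the following. read_exact() and
REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST are existing libxc names; the function
itself and how it is used are illustrative.

#include <stdint.h>
#include <stdlib.h>

/*
 * Illustrative only -- not part of this patch.  Assumes the usual libxc
 * internal headers for read_exact() (returns 0 on success) and for
 * REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST.
 */
static int recv_checkpoint_dirty_pfn_list(int back_fd)
{
    uint32_t type, length;
    uint64_t *pfns = NULL;
    int rc = -1;

    /* Record header, as written by iov[0] and iov[1] above. */
    if ( read_exact(back_fd, &type, sizeof(type)) ||
         read_exact(back_fd, &length, sizeof(length)) )
        goto out;

    if ( type != REC_TYPE_CHECKPOINT_DIRTY_PFN_LIST ||
         (length % sizeof(*pfns)) != 0 )
        goto out;

    if ( length )
    {
        pfns = malloc(length);
        if ( !pfns || read_exact(back_fd, pfns, length) )
            goto out;
    }

    /*
     * pfns[0 .. length/8 - 1] are the pfns the secondary dirtied since the
     * last checkpoint; the primary would merge them into its own dirty
     * bitmap so those pages are transferred again at the next checkpoint.
     */
    rc = 0;
 out:
    free(pfns);
    return rc;
}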
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index a8f74a793a7dfe8bffa70c2c7f92f9bef540dcc6..04b02d84e25337ef5e82c2ac4afb58eeb087370f 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -126,11 +126,6 @@ static void colo_resume_vm(libxl__egc *egc,
         return;
     }
 
-    /*
-     * TODO: get store gfn and console gfn
-     *  We should call the callback restore_results in
-     *  xc_domain_restore() before resuming the guest.
-     */
     libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
 
     return;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index c58dd7e60256bc49eed9dd611b8ecee9771ee331..d6c794ee5d9dc9e4518a96ee5bd202708697d1bb 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1017,6 +1017,8 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     const int checkpointed_stream = dcs->restore_params.checkpointed_stream;
     libxl__colo_restore_state *const crs = &dcs->crs;
     libxl_domain_build_info *const info = &d_config->b_info;
+    libxl__srm_restore_autogen_callbacks *const callbacks =
+        &dcs->srs.shs.callbacks.restore.a;
 
     if (rc) {
         domcreate_rebuild_done(egc, dcs, rc);
@@ -1044,6 +1046,7 @@ static void domcreate_bootloader_done(libxl__egc *egc,
     }
 
     /* Restore */
+    callbacks->restore_results = libxl__srm_callout_callback_restore_results;
 
     /* COLO only supports HVM now because it does not work very
      * well with pv drivers:
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 6016706d6ef2873388ec0cc1ad2734ca028a7c03..c2243f242c5ea8e17940d099882c1192475d4e36 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -29,8 +29,8 @@ our @msgs = (
     [  6, 'srcxA',  "wait_checkpoint", [] ],
     [  7, 'scxA',   "switch_qemu_logdirty",  [qw(int domid
                                               unsigned enable)] ],
-    [  8, 'r',      "restore_results",       ['unsigned long', 'store_mfn',
-                                              'unsigned long', 'console_mfn'] ],
+    [  8, 'rcx',    "restore_results",       ['unsigned long', 'store_gfn',
+                                              'unsigned long', 'console_gfn'] ],
     [  9, 'srW',    "complete",              [qw(int retval
                                                  int errnoval)] ],
 );