File xen.sr-restore-hvm-legacy-superpage.patch of Package xen

From: Olaf Hering <olaf@aepfle.de>
Date: Mon, 7 Aug 2017 12:58:02 +0000
Subject: sr restore hvm legacy superpage

tools: use superpages during restore of HVM guest

bsc#1035231 - migration of HVM domU does not use superpages on destination dom0
bsc#1055695 - XEN: 11SP4 and 12SP3 HVM guests can not be restored

During creation of an HVM domU, meminit_hvm() tries to map superpages.
After save/restore or migration this mapping is lost; everything is
allocated in single pages, which causes performance degradation after
migration.

Add the necessary code to preallocate a superpage for an incoming chunk
of pfns. In case a pfn was not populated on the sending side, it must be
freed on the receiving side to avoid over-allocation.

The existing populate_pfns() code for x86_pv is moved, largely
unmodified, into its own file.
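
The allocation strategy, condensed from x86_hvm_allocate_pfn() below
(illustrative pseudo-code only, not part of the applied diff; all names
are the patch's own):

    /* For an incoming pfn that carries data and is not yet populated,
     * try progressively smaller extents at the pfn's aligned base. */
    sp.index = pfn >> SUPERPAGE_1GB_SHIFT;
    if ( !x86_hvm_alloc_1g(ctx, &sp) )
    {
        sp.index = pfn >> SUPERPAGE_2MB_SHIFT;
        if ( !x86_hvm_alloc_2m(ctx, &sp) )
        {
            sp.index = 0;
            if ( !x86_hvm_alloc_4k(ctx, &sp) )
                return -1;          /* no room left, errno set by helper */
        }
    }
    /* Each attempt is recorded in the attempted_1g/attempted_2m bitmaps,
     * so a given superpage range is tried at most once. */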

Signed-off-by: Olaf Hering <olaf@aepfle.de>
---
 tools/libs/guest/xg_dom_x86.c            |   5 -
 tools/libs/guest/xg_private.h            |   5 +
 tools/libs/guest/xg_sr_common.h          |  28 +-
 tools/libs/guest/xg_sr_restore.c         |  60 +-
 tools/libs/guest/xg_sr_restore_x86_hvm.c | 381 ++++++++-
 tools/libs/guest/xg_sr_restore_x86_pv.c  |  61 +-
 6 files changed, 467 insertions(+), 73 deletions(-)
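
How over-allocation is corrected, condensed from x86_hvm_populate_page()
and x86_hvm_punch_hole() below (illustrative only; all identifiers are
taken from the diff):

    /* A gap between the previously populated pfn and the current one
     * means an optimistic superpage allocation covered pfns the sender
     * never populated.  During the first iteration, the only one in
     * which superpages are attempted, those pages go back to Xen: */
    if ( ctx->x86.hvm.restore.prev_populated_pfn + 1 < pfn &&
         ctx->x86.hvm.restore.iteration == 0 )
        rc = x86_hvm_punch_hole(ctx,
                                ctx->x86.hvm.restore.prev_populated_pfn + 1,
                                pfn - 1);
    /* x86_hvm_punch_hole() releases each still-allocated pfn in the gap
     * via xc_domain_decrease_reservation_exact() and marks the covered
     * 1GB/2MB ranges as attempted, so they are not allocated again. */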

--- a/tools/libs/guest/xg_dom_x86.c
+++ b/tools/libs/guest/xg_dom_x86.c
@@ -35,29 +35,24 @@
 #include <xen/arch-x86/hvm/start_info.h>
 #include <xen/io/protocols.h>
 
 #include <xen-tools/common-macros.h>
 
 #include "xg_private.h"
 #include "xenctrl.h"
 
 /* ------------------------------------------------------------------------ */
 
 #define SUPERPAGE_BATCH_SIZE 512
 
-#define SUPERPAGE_2MB_SHIFT   9
-#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
-#define SUPERPAGE_1GB_SHIFT   18
-#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
-
 #define X86_CR0_PE 0x01
 #define X86_CR0_ET 0x10
 
 #define X86_DR6_DEFAULT 0xffff0ff0u
 #define X86_DR7_DEFAULT 0x00000400u
 
 #define MTRR_TYPE_WRBACK     6
 #define MTRR_DEF_TYPE_ENABLE (1u << 11)
 
 #define SPECIALPAGE_PAGING   0
 #define SPECIALPAGE_ACCESS   1
 #define SPECIALPAGE_SHARING  2
--- a/tools/libs/guest/xg_private.h
+++ b/tools/libs/guest/xg_private.h
@@ -171,13 +171,18 @@ int pin_table(xc_interface *xch, unsigned int type, unsigned long mfn,
 #define M2P_CHUNKS(_m)  (M2P_SIZE((_m)) >> M2P_SHIFT)
 
 #if defined(__x86_64__) || defined(__i386__)
 #include <xen/lib/x86/cpu-policy.h>
 
 struct xc_cpu_policy {
     struct cpu_policy policy;
     xen_cpuid_leaf_t leaves[CPUID_MAX_SERIALISED_LEAVES];
     xen_msr_entry_t msrs[MSR_MAX_SERIALISED_ENTRIES];
 };
 #endif /* x86 */
 
+#define SUPERPAGE_2MB_SHIFT   9
+#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
+#define SUPERPAGE_1GB_SHIFT   18
+#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
+
 #endif /* XG_PRIVATE_H */
--- a/tools/libs/guest/xg_sr_common.h
+++ b/tools/libs/guest/xg_sr_common.h
@@ -198,24 +198,34 @@ struct xc_sr_restore_ops
      * @returns 0 for success, -1 for failure, with errno appropriately set.
      */
     int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);
 
     /**
      * Set up local environment to restore a domain.
      *
      * This is called once before any common setup has occurred, allowing for
      * guest-specific adjustments to be made to common state.
      */
     int (*setup)(struct xc_sr_context *ctx);
 
+    /**
+     * Populate PFNs
+     *
+     * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+     * unpopulated subset.
+     */
+    int (*populate_pfns)(struct xc_sr_context *ctx, unsigned count,
+                         const xen_pfn_t *original_pfns, const uint32_t *types);
+
+
     /**
      * Process an individual record from the stream.  The caller shall take
      * care of processing common records (e.g. END, PAGE_DATA).
      *
      * @return 0 for success, -1 for failure, or the following sentinels:
      *  - RECORD_NOT_PROCESSED
      *  - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
      *    a failover is needed.
      */
 #define RECORD_NOT_PROCESSED 1
 #define BROKEN_CHANNEL 2
     int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);
@@ -329,24 +339,26 @@ struct xc_sr_context
             int *map_errs;
             xen_pfn_t *pp_pfns;
             xen_pfn_t *pp_mfns;
             void **guest_data;
             struct iovec *iov;
             struct xc_sr_rec_page_data_header *pages;
 
             void *guest_mapping;
             uint32_t nr_mapped_pages;
 
             int send_back_fd;
             unsigned long p2m_size;
+            unsigned long max_pages;
+            unsigned long tot_pages;
             xc_hypercall_buffer_t dirty_bitmap_hbuf;
 
             /* From Image Header. */
             uint32_t format_version;
 
             /* From Domain Header. */
             uint32_t guest_type;
             uint32_t guest_page_size;
 
             /* Currently buffering records between a checkpoint */
             bool buffer_all_records;
 
@@ -462,24 +474,32 @@ struct xc_sr_context
                 {
                     struct
                     {
                         /* Whether qemu enabled logdirty mode, and we should
                          * disable on cleanup. */
                         bool qemu_enabled_logdirty;
                     } save;
 
                     struct
                     {
                         /* HVM context blob. */
                         struct xc_sr_blob context;
+
+                        /* Bitmap of currently allocated PFNs during restore. */
+                        struct sr_bitmap attempted_1g;
+                        struct sr_bitmap attempted_2m;
+                        struct sr_bitmap allocated_pfns;
+                        xen_pfn_t prev_populated_pfn;
+                        xen_pfn_t iteration_tracker_pfn;
+                        unsigned long iteration;
                     } restore;
                 };
             } hvm;
 
         } x86;
     };
 };
 
 extern struct xc_sr_save_ops save_ops_x86_pv;
 extern struct xc_sr_save_ops save_ops_x86_hvm;
 
 extern struct xc_sr_restore_ops restore_ops_x86_pv;
@@ -526,32 +546,24 @@ static inline int write_record(struct xc_sr_context *ctx,
  *
  * On success, the records type and size shall be valid.
  * - If size is 0, data shall be NULL.
  * - If size is non-0, data shall be a buffer allocated by malloc() which must
  *   be passed to free() by the caller.
  *
  * On failure, the contents of the record structure are undefined.
  */
 int read_record_header(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr);
 int read_record_data(struct xc_sr_context *ctx, int fd, struct xc_sr_rhdr *rhdr,
                      struct xc_sr_record *rec);
 
-/*
- * This would ideally be private in restore.c, but is needed by
- * x86_pv_localise_page() if we receive pagetables frames ahead of the
- * contents of the frames they point at.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
-                  const xen_pfn_t *original_pfns, const uint32_t *types);
-
 /* Handle a STATIC_DATA_END record. */
 int handle_static_data_end(struct xc_sr_context *ctx);
 
 /* Page type known to the migration logic? */
 static inline bool is_known_page_type(uint32_t type)
 {
     switch ( type )
     {
     case XEN_DOMCTL_PFINFO_NOTAB:
 
     case XEN_DOMCTL_PFINFO_L1TAB:
     case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
--- a/tools/libs/guest/xg_sr_restore.c
+++ b/tools/libs/guest/xg_sr_restore.c
@@ -62,78 +62,24 @@ static int read_headers(struct xc_sr_context *ctx)
     if ( dhdr.xen_major == 0 )
     {
         IPRINTF("Found %s domain, converted from legacy stream format",
                 dhdr_type_to_str(dhdr.type));
         DPRINTF("  Legacy conversion script version %u", dhdr.xen_minor);
     }
     else
         IPRINTF("Found %s domain from Xen %u.%u",
                 dhdr_type_to_str(dhdr.type), dhdr.xen_major, dhdr.xen_minor);
     return 0;
 }
 
-/*
- * Given a set of pfns, obtain memory from Xen to fill the physmap for the
- * unpopulated subset.  If types is NULL, no page type checking is performed
- * and all unpopulated pfns are populated.
- */
-int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
-                  const xen_pfn_t *original_pfns, const uint32_t *types)
-{
-    xc_interface *xch = ctx->xch;
-    unsigned int i, nr_pfns = 0;
-    int rc = -1;
-
-    for ( i = 0; i < count; ++i )
-    {
-        if ( (!types || page_type_to_populate(types[i])) &&
-             !pfn_is_populated(ctx, original_pfns[i]) )
-        {
-            rc = pfn_set_populated(ctx, original_pfns[i]);
-            if ( rc )
-                goto err;
-            ctx->restore.pp_pfns[nr_pfns] = ctx->restore.pp_mfns[nr_pfns] = original_pfns[i];
-            ++nr_pfns;
-        }
-    }
-
-    if ( nr_pfns )
-    {
-        rc = xc_domain_populate_physmap_exact(
-            xch, ctx->domid, nr_pfns, 0, 0, ctx->restore.pp_mfns);
-        if ( rc )
-        {
-            PERROR("Failed to populate physmap");
-            goto err;
-        }
-
-        for ( i = 0; i < nr_pfns; ++i )
-        {
-            if ( ctx->restore.pp_mfns[i] == INVALID_MFN )
-            {
-                ERROR("Populate physmap failed for pfn %u", i);
-                rc = -1;
-                goto err;
-            }
-
-            ctx->restore.ops.set_gfn(ctx, ctx->restore.pp_pfns[i], ctx->restore.pp_mfns[i]);
-        }
-    }
-
-    rc = 0;
-
- err:
-    return rc;
-}
-
 static int handle_static_data_end_v2(struct xc_sr_context *ctx)
 {
     int rc = 0;
 
 #if defined(__i386__) || defined(__x86_64__)
     xc_interface *xch = ctx->xch;
     /*
      * v2 compatibility only exists for x86 streams.  This is a bit of a
      * bodge, but it is less bad than duplicating handle_page_data() between
      * different architectures.
      */
 
@@ -250,25 +196,26 @@ err:
 /*
  * Populate pfns, if required
  * Fill guest_data with either mapped address or NULL
  * The caller must unmap guest_mapping
  */
 static int map_guest_pages(struct xc_sr_context *ctx,
                            struct xc_sr_rec_page_data_header *pages)
 {
     xc_interface *xch = ctx->xch;
     uint32_t i, p;
     int rc;
 
-    rc = populate_pfns(ctx, pages->count, ctx->restore.pfns, ctx->restore.types);
+    rc = ctx->restore.ops.populate_pfns(ctx, pages->count, ctx->restore.pfns,
+                                        ctx->restore.types);
     if ( rc )
     {
         ERROR("Failed to populate pfns for batch of %u pages", pages->count);
         goto err;
     }
 
     ctx->restore.nr_mapped_pages = 0;
 
     for ( i = 0; i < pages->count; i++ )
     {
         ctx->restore.ops.set_page_type(ctx, ctx->restore.pfns[i], ctx->restore.types[i]);
 
@@ -1065,24 +1012,27 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
 
     ctx.domid = dom;
 
     if ( read_headers(&ctx) )
         return -1;
 
     if ( xc_domain_nr_gpfns(xch, dom, &nr_pfns) < 0 )
     {
         PERROR("Unable to obtain the guest p2m size");
         return -1;
     }
 
+    /* See xc_domain_getinfo */
+    ctx.restore.max_pages = ctx.dominfo.max_pages;
+    ctx.restore.tot_pages = ctx.dominfo.tot_pages;
     ctx.restore.p2m_size = nr_pfns;
     ctx.restore.ops = hvm ? restore_ops_x86_hvm : restore_ops_x86_pv;
 
     if ( restore(&ctx) )
         return -1;
 
     IPRINTF("XenStore: mfn %#"PRIpfn", dom %d, evt %u",
             ctx.restore.xenstore_gfn,
             ctx.restore.xenstore_domid,
             ctx.restore.xenstore_evtchn);
 
     IPRINTF("Console: mfn %#"PRIpfn", dom %d, evt %u",
--- a/tools/libs/guest/xg_sr_restore_x86_hvm.c
+++ b/tools/libs/guest/xg_sr_restore_x86_hvm.c
@@ -121,24 +121,51 @@ static void x86_hvm_set_page_type(struct xc_sr_context *ctx,
 {
     /* no-op */
 }
 
 /* restore_ops function. */
 static int x86_hvm_localise_page(struct xc_sr_context *ctx,
                                  uint32_t type, void *page)
 {
     /* no-op */
     return 0;
 }
 
+static bool x86_hvm_expand_sp_bitmaps(struct xc_sr_context *ctx, unsigned long max_pfn)
+{
+    struct sr_bitmap *bm;
+
+    bm = &ctx->x86.hvm.restore.attempted_1g;
+    if ( !sr_bitmap_expand(bm, max_pfn >> SUPERPAGE_1GB_SHIFT) )
+        return false;
+
+    bm = &ctx->x86.hvm.restore.attempted_2m;
+    if ( !sr_bitmap_expand(bm, max_pfn >> SUPERPAGE_2MB_SHIFT) )
+        return false;
+
+    bm = &ctx->x86.hvm.restore.allocated_pfns;
+    if ( !sr_bitmap_expand(bm, max_pfn) )
+        return false;
+
+    return true;
+}
+
+static void x86_hvm_no_superpage(struct xc_sr_context *ctx, unsigned long addr)
+{
+    unsigned long pfn = addr >> XC_PAGE_SHIFT;
+
+    sr_set_bit(pfn >> SUPERPAGE_1GB_SHIFT, &ctx->x86.hvm.restore.attempted_1g);
+    sr_set_bit(pfn >> SUPERPAGE_2MB_SHIFT, &ctx->x86.hvm.restore.attempted_2m);
+}
+
 /*
  * restore_ops function. Confirms the stream matches the domain.
  */
 static int x86_hvm_setup(struct xc_sr_context *ctx)
 {
     xc_interface *xch = ctx->xch;
     unsigned long max_pfn;
 
     if ( ctx->restore.guest_type != DHDR_TYPE_X86_HVM )
     {
         ERROR("Unable to restore %s domain into an x86 HVM domain",
               dhdr_type_to_str(ctx->restore.guest_type));
@@ -155,30 +182,42 @@ static int x86_hvm_setup(struct xc_sr_context *ctx)
 #ifdef __i386__
     /* Very large domains (> 1TB) will exhaust virtual address space. */
     if ( ctx->restore.p2m_size > 0x0fffffff )
     {
         errno = E2BIG;
         PERROR("Cannot restore this big a guest");
         return -1;
     }
 #endif
 
     max_pfn = max(ctx->restore.p2m_size, ctx->dominfo.max_pages);
     if ( !sr_bitmap_expand(&ctx->restore.populated_pfns, max_pfn) )
-    {
-        PERROR("Unable to allocate memory for populated_pfns bitmap");
-        return -1;
-    }
+        goto out;
+
+    if ( !x86_hvm_expand_sp_bitmaps(ctx, max_pfn) )
+        goto out;
+
+    /* FIXME: distinguish between PVH and HVM */
+    /* No superpage in 1st 2MB due to VGA hole */
+    x86_hvm_no_superpage(ctx, 0xA0000u);
+#define LAPIC_BASE_ADDRESS  0xfee00000u
+#define ACPI_INFO_PHYSICAL_ADDRESS 0xfc000000u
+    x86_hvm_no_superpage(ctx, LAPIC_BASE_ADDRESS);
+    x86_hvm_no_superpage(ctx, ACPI_INFO_PHYSICAL_ADDRESS);
 
     return 0;
+
+out:
+    PERROR("Unable to allocate memory for pfn bitmaps");
+    return -1;
 }
 
 /*
  * restore_ops function.
  */
 static int x86_hvm_process_record(struct xc_sr_context *ctx,
                                   struct xc_sr_record *rec)
 {
     switch ( rec->type )
     {
     case REC_TYPE_X86_TSC_INFO:
         return handle_x86_tsc_info(ctx, rec);
@@ -241,40 +280,374 @@ static int x86_hvm_stream_complete(struct xc_sr_context *ctx)
     if ( rc )
     {
         PERROR("Failed to seed grant table");
         return rc;
     }
 
     return rc;
 }
 
 static int x86_hvm_cleanup(struct xc_sr_context *ctx)
 {
     sr_bitmap_free(&ctx->restore.populated_pfns);
+    sr_bitmap_free(&ctx->x86.hvm.restore.attempted_1g);
+    sr_bitmap_free(&ctx->x86.hvm.restore.attempted_2m);
+    sr_bitmap_free(&ctx->x86.hvm.restore.allocated_pfns);
     free(ctx->x86.hvm.restore.context.ptr);
 
     free(ctx->x86.restore.cpuid.ptr);
     free(ctx->x86.restore.msr.ptr);
 
     return 0;
 }
 
+/*
+ * Set a range of pfns as allocated
+ */
+static void pfn_set_long_allocated(struct xc_sr_context *ctx, xen_pfn_t base_pfn)
+{
+    sr_set_long_bit(base_pfn, &ctx->x86.hvm.restore.allocated_pfns);
+}
+
+static void pfn_set_allocated(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    sr_set_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns);
+}
+
+struct x86_hvm_sp {
+    xen_pfn_t pfn;
+    xen_pfn_t base_pfn;
+    unsigned long index;
+    unsigned long count;
+};
+
+/*
+ * Try to allocate a 1GB page for this pfn, but avoid over-allocation.
+ * If this succeeds, mark the covered 2MB ranges as already attempted.
+ */
+static bool x86_hvm_alloc_1g(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned int order;
+    int i, done;
+    xen_pfn_t extent;
+
+    /* Only one attempt to avoid overlapping allocation */
+    if ( sr_test_and_set_bit(sp->index, &ctx->x86.hvm.restore.attempted_1g) )
+        return false;
+
+    order = SUPERPAGE_1GB_SHIFT;
+    sp->count = SUPERPAGE_1GB_NR_PFNS;
+
+    /* Allocate only if there is room for another superpage */
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    extent = sp->base_pfn = (sp->pfn >> order) << order;
+    done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( done < 0 ) {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( done == 0 )
+        return false;
+
+    DPRINTF("1G %" PRI_xen_pfn "\n", sp->base_pfn);
+
+    /* Mark all 2MB pages as done to avoid overlapping allocation */
+    for ( i = 0; i < (SUPERPAGE_1GB_NR_PFNS/SUPERPAGE_2MB_NR_PFNS); i++ )
+        sr_set_bit((sp->base_pfn >> SUPERPAGE_2MB_SHIFT) + i, &ctx->x86.hvm.restore.attempted_2m);
+
+    return true;
+}
+
+/* Allocate a 2MB page if x86_hvm_alloc_1g failed, avoid over-allocation. */
+static bool x86_hvm_alloc_2m(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned int order;
+    int done;
+    xen_pfn_t extent;
+
+    /* Only one attempt to avoid overlapping allocation */
+    if ( sr_test_and_set_bit(sp->index, &ctx->x86.hvm.restore.attempted_2m) )
+        return false;
+
+    order = SUPERPAGE_2MB_SHIFT;
+    sp->count = SUPERPAGE_2MB_NR_PFNS;
+
+    /* Allocate only if there is room for another superpage */
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages )
+        return false;
+
+    extent = sp->base_pfn = (sp->pfn >> order) << order;
+    done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( done < 0 ) {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( done == 0 )
+        return false;
+
+    DPRINTF("2M %" PRI_xen_pfn "\n", sp->base_pfn);
+    return true;
+}
+
+/* Allocate a single page if x86_hvm_alloc_2m failed. */
+static bool x86_hvm_alloc_4k(struct xc_sr_context *ctx, struct x86_hvm_sp *sp)
+{
+    xc_interface *xch = ctx->xch;
+    unsigned int order;
+    int done;
+    xen_pfn_t extent;
+
+    order = 0;
+    sp->count = 1UL;
+
+    /* Allocate only if there is room for another page */
+    if ( ctx->restore.tot_pages + sp->count > ctx->restore.max_pages ) {
+        errno = E2BIG;
+        return false;
+    }
+
+    extent = sp->base_pfn = (sp->pfn >> order) << order;
+    done = xc_domain_populate_physmap(xch, ctx->domid, 1, order, 0, &extent);
+    if ( done < 0 ) {
+        PERROR("populate_physmap failed.");
+        return false;
+    }
+    if ( done == 0 ) {
+        errno = ENOMEM;
+        return false;
+    }
+
+    DPRINTF("4K %" PRI_xen_pfn "\n", sp->base_pfn);
+    return true;
+}
+/*
+ * Attempt to allocate a superpage where the pfn resides.
+ */
+static int x86_hvm_allocate_pfn(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    bool success;
+    unsigned long idx_1g, idx_2m;
+    struct x86_hvm_sp sp = {
+        .pfn = pfn
+    };
+
+    if ( sr_test_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns) )
+        return 0;
+
+    idx_1g = pfn >> SUPERPAGE_1GB_SHIFT;
+    idx_2m = pfn >> SUPERPAGE_2MB_SHIFT;
+
+    sp.index = idx_1g;
+    success = x86_hvm_alloc_1g(ctx, &sp);
+
+    if ( success == false ) {
+        sp.index = idx_2m;
+        success = x86_hvm_alloc_2m(ctx, &sp);
+    }
+
+    if ( success == false ) {
+        sp.index = 0;
+        success = x86_hvm_alloc_4k(ctx, &sp);
+    }
+
+    if ( success == false )
+        return -1;
+
+    do {
+        if ( sp.count >= BITS_PER_LONG && (sp.count % BITS_PER_LONG) == 0 ) {
+            sp.count -= BITS_PER_LONG;
+            ctx->restore.tot_pages += BITS_PER_LONG;
+            pfn_set_long_allocated(ctx, sp.base_pfn + sp.count);
+        } else {
+            sp.count--;
+            ctx->restore.tot_pages++;
+            pfn_set_allocated(ctx, sp.base_pfn + sp.count);
+        }
+    } while ( sp.count );
+
+    return 0;
+}
+
+/*
+ * Deallocate memory.
+ * An earlier optimistic superpage allocation may have populated pages
+ * beyond gap_end.  That range is not freed here; incoming higher pfns
+ * will release it.
+ */
+static int x86_hvm_punch_hole(struct xc_sr_context *ctx,
+                               xen_pfn_t gap_start, xen_pfn_t gap_end)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t _pfn, pfn;
+    uint32_t domid, freed = 0;
+    int rc;
+
+    pfn = gap_start >> SUPERPAGE_1GB_SHIFT;
+    do
+    {
+        sr_set_bit(pfn, &ctx->x86.hvm.restore.attempted_1g);
+    } while (++pfn <= gap_end >> SUPERPAGE_1GB_SHIFT);
+
+    pfn = gap_start >> SUPERPAGE_2MB_SHIFT;
+    do
+    {
+        sr_set_bit(pfn, &ctx->x86.hvm.restore.attempted_2m);
+    } while (++pfn <= gap_end >> SUPERPAGE_2MB_SHIFT);
+
+    pfn = gap_start;
+
+    while ( pfn <= gap_end )
+    {
+        if ( sr_test_and_clear_bit(pfn, &ctx->x86.hvm.restore.allocated_pfns) )
+        {
+            domid = ctx->domid;
+            _pfn = pfn;
+            rc = xc_domain_decrease_reservation_exact(xch, domid, 1, 0, &_pfn);
+            if ( rc )
+            {
+                PERROR("Failed to release pfn %" PRI_xen_pfn, pfn);
+                return -1;
+            }
+            ctx->restore.tot_pages--;
+            freed++;
+        }
+        pfn++;
+    }
+    if ( freed )
+        DPRINTF("freed %u between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
+                freed, gap_start, gap_end);
+    return 0;
+}
+
+static int x86_hvm_unpopulate_page(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    sr_clear_bit(pfn, &ctx->restore.populated_pfns);
+    return x86_hvm_punch_hole(ctx, pfn, pfn);
+}
+
+static int x86_hvm_populate_page(struct xc_sr_context *ctx, xen_pfn_t pfn)
+{
+    xen_pfn_t gap_start, gap_end;
+    bool has_gap, first_iteration;
+    int rc;
+
+    /*
+     * Check for a gap between the previous populated pfn and this pfn.
+     * In case a gap exists, it is required to punch a hole to release memory,
+     * starting after the previous pfn and before this pfn.
+     *
+     * But: this can be done only during the first iteration, which is the
+     * only place where superpage allocations are attempted. All following
+     * iterations lack the info to properly maintain prev_populated_pfn.
+     */
+    has_gap = ctx->x86.hvm.restore.prev_populated_pfn + 1 < pfn;
+    first_iteration = ctx->x86.hvm.restore.iteration == 0;
+    if ( has_gap && first_iteration )
+    {
+        gap_start = ctx->x86.hvm.restore.prev_populated_pfn + 1;
+        gap_end = pfn - 1;
+
+        rc = x86_hvm_punch_hole(ctx, gap_start, gap_end);
+        if ( rc )
+            goto err;
+    }
+
+    rc = x86_hvm_allocate_pfn(ctx, pfn);
+    if ( rc )
+        goto err;
+    pfn_set_populated(ctx, pfn);
+    ctx->x86.hvm.restore.prev_populated_pfn = pfn;
+
+    rc = 0;
+err:
+    return rc;
+}
+
+/*
+ * Try to allocate superpages.
+ * This works without a memory map because the pfns arrive in increasing order.
+ * All pfn numbers and their types are submitted; only pfns that carry data
+ * also have their contents transmitted.
+ */
+static int x86_hvm_populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                                 const xen_pfn_t *original_pfns,
+                                 const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t pfn, min_pfn, max_pfn;
+    bool to_populate, populated;
+    unsigned i = count;
+    int rc = 0;
+
+    min_pfn = count ? original_pfns[0] : 0;
+    max_pfn = count ? original_pfns[count - 1] : 0;
+    DPRINTF("batch of %u pfns between %" PRI_xen_pfn " %" PRI_xen_pfn "\n",
+            count, min_pfn, max_pfn);
+
+    if ( !x86_hvm_expand_sp_bitmaps(ctx, max_pfn) )
+    {
+        ERROR("Unable to allocate memory for pfn bitmaps");
+        return -1;
+    }
+
+    /*
+     * There is no indicator for a new iteration.
+     * Simulate it by checking if a lower pfn is coming in.
+     * In the end it matters only to know if this iteration is the first one.
+     */
+    if ( min_pfn < ctx->x86.hvm.restore.iteration_tracker_pfn )
+        ctx->x86.hvm.restore.iteration++;
+    ctx->x86.hvm.restore.iteration_tracker_pfn = min_pfn;
+
+    for ( i = 0; i < count; ++i )
+    {
+        pfn = original_pfns[i];
+
+        to_populate = page_type_to_populate(types[i]);
+        populated = pfn_is_populated(ctx, pfn);
+
+        /*
+         * page has data, pfn populated: nothing to do
+         * page has data, pfn not populated: likely never seen before
+         * page has no data, pfn populated: likely ballooned out during migration
+         * page has no data, pfn not populated: nothing to do
+         */
+        if ( to_populate && !populated )
+        {
+            rc = x86_hvm_populate_page(ctx, pfn);
+        } else if ( !to_populate && populated )
+        {
+            rc = x86_hvm_unpopulate_page(ctx, pfn);
+        }
+        if ( rc )
+            break;
+    }
+
+    return rc;
+}
+
+
 struct xc_sr_restore_ops restore_ops_x86_hvm =
 {
     .pfn_is_valid    = x86_hvm_pfn_is_valid,
     .pfn_to_gfn      = x86_hvm_pfn_to_gfn,
     .set_gfn         = x86_hvm_set_gfn,
     .set_page_type   = x86_hvm_set_page_type,
     .localise_page   = x86_hvm_localise_page,
     .setup           = x86_hvm_setup,
+    .populate_pfns   = x86_hvm_populate_pfns,
     .process_record  = x86_hvm_process_record,
     .static_data_complete = x86_static_data_complete,
     .stream_complete = x86_hvm_stream_complete,
     .cleanup         = x86_hvm_cleanup,
 };
 
 /*
  * Local variables:
  * mode: C
  * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4
--- a/tools/libs/guest/xg_sr_restore_x86_pv.c
+++ b/tools/libs/guest/xg_sr_restore_x86_pv.c
@@ -950,24 +950,82 @@ static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
                            xen_pfn_t mfn)
 {
     assert(pfn <= ctx->x86.pv.max_pfn);
 
     if ( ctx->x86.pv.width == sizeof(uint64_t) )
         /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks. */
         ((uint64_t *)ctx->x86.pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn;
     else
         /* 32 bit guest.  Can truncate INVALID_MFN for 64 bit toolstacks. */
         ((uint32_t *)ctx->x86.pv.p2m)[pfn] = mfn;
 }
 
+/*
+ * Given a set of pfns, obtain memory from Xen to fill the physmap for the
+ * unpopulated subset.  If types is NULL, no page type checking is performed
+ * and all unpopulated pfns are populated.
+ */
+static int x86_pv_populate_pfns(struct xc_sr_context *ctx, unsigned count,
+                                const xen_pfn_t *original_pfns,
+                                const uint32_t *types)
+{
+    xc_interface *xch = ctx->xch;
+    xen_pfn_t *mfns = ctx->restore.pp_mfns,
+        *pfns = ctx->restore.pp_pfns;
+    unsigned int i, nr_pfns = 0;
+    int rc = -1;
+
+    for ( i = 0; i < count; ++i )
+    {
+        if ( (!types ||
+              page_type_has_stream_data(types[i])) &&
+             !pfn_is_populated(ctx, original_pfns[i]) )
+        {
+            rc = pfn_set_populated(ctx, original_pfns[i]);
+            if ( rc )
+                goto err;
+            pfns[nr_pfns] = mfns[nr_pfns] = original_pfns[i];
+            ++nr_pfns;
+        }
+    }
+
+    if ( nr_pfns )
+    {
+        rc = xc_domain_populate_physmap_exact(
+            xch, ctx->domid, nr_pfns, 0, 0, mfns);
+        if ( rc )
+        {
+            PERROR("Failed to populate physmap");
+            goto err;
+        }
+
+        for ( i = 0; i < nr_pfns; ++i )
+        {
+            if ( mfns[i] == INVALID_MFN )
+            {
+                ERROR("Populate physmap failed for pfn %u", i);
+                rc = -1;
+                goto err;
+            }
+
+            ctx->restore.ops.set_gfn(ctx, pfns[i], mfns[i]);
+        }
+    }
+
+    rc = 0;
+
+ err:
+    return rc;
+}
+
 /*
  * restore_ops function.  Convert pfns back to mfns in pagetables.  Possibly
  * needs to populate new frames if a PTE is found referring to a frame which
  * hasn't yet been seen from PAGE_DATA records.
  */
 static int x86_pv_localise_page(struct xc_sr_context *ctx,
                                 uint32_t type, void *page)
 {
     xc_interface *xch = ctx->xch;
     uint64_t *table = page;
     uint64_t pte;
     unsigned int i, to_populate;
@@ -994,25 +1052,25 @@ static int x86_pv_localise_page(struct xc_sr_context *ctx,
                 ERROR("PTE truncation detected.  L%u[%u] = %016"PRIx64,
                       type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                 errno = E2BIG;
                 return -1;
             }
 #endif
 
             if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN )
                 pfns[to_populate++] = pfn;
         }
     }
 
-    if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
+    if ( to_populate && x86_pv_populate_pfns(ctx, to_populate, pfns, NULL) )
         return -1;
 
     for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
     {
         pte = table[i];
 
         if ( pte & _PAGE_PRESENT )
         {
             xen_pfn_t mfn, pfn;
 
             pfn = pte_to_frame(pte);
             mfn = pfn_to_mfn(ctx, pfn);
@@ -1191,24 +1249,25 @@ static int x86_pv_cleanup(struct xc_sr_context *ctx)
 
     return 0;
 }
 
 struct xc_sr_restore_ops restore_ops_x86_pv =
 {
     .pfn_is_valid    = x86_pv_pfn_is_valid,
     .pfn_to_gfn      = pfn_to_mfn,
     .set_page_type   = x86_pv_set_page_type,
     .set_gfn         = x86_pv_set_gfn,
     .localise_page   = x86_pv_localise_page,
     .setup           = x86_pv_setup,
+    .populate_pfns   = x86_pv_populate_pfns,
     .process_record  = x86_pv_process_record,
     .static_data_complete = x86_static_data_complete,
     .stream_complete = x86_pv_stream_complete,
     .cleanup         = x86_pv_cleanup,
 };
 
 /*
  * Local variables:
  * mode: C
  * c-file-style: "BSD"
  * c-basic-offset: 4
  * tab-width: 4