File xsa299-4.patch of Package xen.23721

From 39063864e70a30a2ee958953ed71760e7596f202 Mon Sep 17 00:00:00 2001
From: George Dunlap <george.dunlap@citrix.com>
Date: Thu, 10 Oct 2019 17:57:49 +0100
Subject: [PATCH 04/12] x86/mm: Separate out partial_pte tristate into
 individual flags

At the moment, partial_pte is a tri-state that contains two distinct bits
of information:

1. If zero, the pte at index [nr_validated_ptes] is un-validated.  If
   non-zero, the pte was last seen with PGT_partial set.

2. If positive, the pte at index [nr_validated_ptes] does not hold a
   general reference count.  If negative, it does.

To make future patches more clear, separate out this functionality
into two distinct, named bits: PTF_partial_set (for #1) and
PTF_partial_general_ref (for #2).

Additionally, a number of functions which need this information also
take other flags to control behavior (such as `preemptible` and
`defer`).  These are hard to read in the caller (since you only see
'true' or 'false'), and ugly when many are added together.  In
preparation for adding yet another flag in a future patch, collapse
all of these into a single `flag` variable.

NB that this does mean checking for what was previously the '-1'
condition a bit more ugly in the put_page_from_lNe functions (since
you have to check for both partial_set and general ref); but this
clause will go away in a future patch.

Also note that the original comment had an off-by-one error:
partial_flags (like partial_pte before it) concerns
plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1].

No functional change intended.

This is part of XSA-299.

Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
 xen/arch/x86/mm.c        | 166 +++++++++++++++++++++++----------------
 xen/include/asm-x86/mm.h |  41 +++++++---
 2 files changed, 128 insertions(+), 79 deletions(-)

--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -755,22 +755,35 @@ static int get_page_from_pagenr(unsigned
 static int __get_page_type(struct page_info *page, unsigned long type,
                            int preemptible);
 
+/*
+ * The following flags are used to specify behavior of various get and
+ * put commands.  The first two are also stored in page->partial_flags
+ * to indicate the state of the page pointed to by
+ * page->pte[page->nr_validated_entries].  See the comment in mm.h for
+ * more information.
+ */
+#define PTF_partial_set         (1 << 0)
+#define PTF_partial_general_ref (1 << 1)
+#define PTF_preemptible         (1 << 2)
+#define PTF_defer               (1 << 3)
+
 static int get_page_and_type_from_pagenr(unsigned long page_nr,
                                          unsigned long type,
                                          struct domain *d,
-                                         int partial,
-                                         int preemptible)
+                                         unsigned int flags)
 {
     struct page_info *page = mfn_to_page(page_nr);
     int rc;
+    bool_t preemptible = !!(flags & PTF_preemptible),
+           partial_ref = !!(flags & PTF_partial_general_ref);
 
-    if ( likely(partial >= 0) &&
+    if ( likely(!partial_ref) &&
          unlikely(!get_page_from_pagenr(page_nr, d)) )
         return -EINVAL;
 
     rc = __get_page_type(page, type, preemptible);
 
-    if ( unlikely(rc) && partial >= 0 &&
+    if ( unlikely(rc) && !partial_ref &&
          (!preemptible || page != current->arch.old_guest_table) )
         put_page(page);
 
@@ -1229,7 +1242,7 @@ get_page_from_l1e(
 define_get_linear_pagetable(l2);
 static int
 get_page_from_l2e(
-    l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
+    l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags)
 {
     unsigned long mfn = l2e_get_pfn(l2e);
     int rc;
@@ -1245,8 +1258,9 @@ get_page_from_l2e(
 
     if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
     {
-        rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d,
-                                           partial, 0);
+        ASSERT(!(flags & PTF_preemptible));
+
+        rc = get_page_and_type_from_pagenr(mfn, PGT_l1_page_table, d, flags);
         if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
             rc = 0;
         return rc;
@@ -1272,7 +1286,7 @@ get_page_from_l2e(
 define_get_linear_pagetable(l3);
 static int
 get_page_from_l3e(
-    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
+    l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags)
 {
     int rc;
 
@@ -1286,7 +1300,7 @@ get_page_from_l3e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l3e_get_pfn(l3e), PGT_l2_page_table, d, partial, 1);
+        l3e_get_pfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible);
     if ( unlikely(rc == -EINVAL) &&
          !is_pv_32bit_domain(d) &&
          get_l3_linear_pagetable(l3e, pfn, d) )
@@ -1298,7 +1312,7 @@ get_page_from_l3e(
 define_get_linear_pagetable(l4);
 static int
 get_page_from_l4e(
-    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
+    l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags)
 {
     int rc;
 
@@ -1312,7 +1326,7 @@ get_page_from_l4e(
     }
 
     rc = get_page_and_type_from_pagenr(
-        l4e_get_pfn(l4e), PGT_l3_page_table, d, partial, 1);
+        l4e_get_pfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible);
     if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
         rc = 0;
 
@@ -1442,7 +1456,7 @@ void put_page_from_l1e(l1_pgentry_t l1e,
  * Note also that this automatically deals correctly with linear p.t.'s.
  */
 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
-                             int partial, bool_t defer)
+                             unsigned int flags)
 {
     int rc = 0;
 
@@ -1456,12 +1470,13 @@ static int put_page_from_l2e(l2_pgentry_
         struct page_info *pg = l2e_get_page(l2e);
         struct page_info *ptpg = mfn_to_page(pfn);
 
-        if ( unlikely(partial > 0) )
+        if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+              PTF_partial_set )
         {
-            ASSERT(!defer);
+            ASSERT(!(flags & PTF_defer));
             rc = _put_page_type(pg, 1, ptpg);
         }
-        else if ( defer )
+        else if ( flags & PTF_defer )
         {
             current->arch.old_guest_ptpg = ptpg;
             current->arch.old_guest_table = pg;
@@ -1478,7 +1493,7 @@ static int put_page_from_l2e(l2_pgentry_
 }
 
 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-                             int partial, bool_t defer)
+                             unsigned int flags)
 {
     struct page_info *pg;
     int rc;
@@ -1501,13 +1516,14 @@ static int put_page_from_l3e(l3_pgentry_
 
     pg = l3e_get_page(l3e);
 
-    if ( unlikely(partial > 0) )
+    if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+         PTF_partial_set )
     {
-        ASSERT(!defer);
+        ASSERT(!(flags & PTF_defer));
         return _put_page_type(pg, 1, mfn_to_page(pfn));
     }
 
-    if ( defer )
+    if ( flags & PTF_defer )
     {
         current->arch.old_guest_ptpg = mfn_to_page(pfn);
         current->arch.old_guest_table = pg;
@@ -1522,7 +1538,7 @@ static int put_page_from_l3e(l3_pgentry_
 }
 
 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-                             int partial, bool_t defer)
+                             unsigned int flags)
 {
     int rc = 1;
 
@@ -1531,13 +1547,14 @@ static int put_page_from_l4e(l4_pgentry_
     {
         struct page_info *pg = l4e_get_page(l4e);
 
-        if ( unlikely(partial > 0) )
+        if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+              PTF_partial_set )
         {
-            ASSERT(!defer);
+            ASSERT(!(flags & PTF_defer));
             return _put_page_type(pg, 1, mfn_to_page(pfn));
         }
 
-        if ( defer )
+        if ( flags & PTF_defer )
         {
             current->arch.old_guest_ptpg = mfn_to_page(pfn);
             current->arch.old_guest_table = pg;
@@ -1649,12 +1666,13 @@ static int alloc_l2_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l2_pgentry_t  *pl2e;
     unsigned int   i;
-    int            rc = 0, partial = page->partial_pte;
+    int            rc = 0;
+    unsigned int   partial_flags = page->partial_flags;
 
     pl2e = map_domain_page(_mfn(pfn));
 
     for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
-          i++, partial = 0 )
+          i++, partial_flags = 0 )
     {
         if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
         {
@@ -1664,18 +1682,19 @@ static int alloc_l2_table(struct page_in
         }
 
         if ( !is_guest_l2_slot(d, type, i) ||
-             (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
+             (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
             continue;
 
         if ( rc == -ERESTART )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = partial ?: 1;
+            /* Set 'set', retain 'general ref' */
+            page->partial_flags = partial_flags | PTF_partial_set;
         }
         else if ( rc == -EINTR && i )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 0;
+            page->partial_flags = 0;
             rc = -ERESTART;
         }
         else if ( rc < 0 && rc != -EINTR )
@@ -1684,7 +1703,7 @@ static int alloc_l2_table(struct page_in
             if ( i )
             {
                 page->nr_validated_ptes = i;
-                page->partial_pte = 0;
+                page->partial_flags = 0;
                 current->arch.old_guest_ptpg = NULL;
                 current->arch.old_guest_table = page;
             }
@@ -1714,7 +1733,8 @@ static int alloc_l3_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l3_pgentry_t  *pl3e;
     unsigned int   i;
-    int            rc = 0, partial = page->partial_pte;
+    int            rc = 0;
+    unsigned int   partial_flags = page->partial_flags;
 
     pl3e = map_domain_page(_mfn(pfn));
 
@@ -1729,7 +1749,7 @@ static int alloc_l3_table(struct page_in
         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
 
     for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
-          i++, partial = 0 )
+          i++, partial_flags = 0 )
     {
         if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
         {
@@ -1747,21 +1767,23 @@ static int alloc_l3_table(struct page_in
                 rc = get_page_and_type_from_pagenr(l3e_get_pfn(pl3e[i]),
                                                    PGT_l2_page_table |
                                                    PGT_pae_xen_l2,
-                                                   d, partial, 1);
+                                                   d,
+                                                   partial_flags | PTF_preemptible);
         }
         else if ( !is_guest_l3_slot(i) ||
-                  (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
+                  (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
             continue;
 
         if ( rc == -ERESTART )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = partial ?: 1;
+            /* Set 'set', leave 'general ref' set if this entry was set */
+            page->partial_flags = partial_flags | PTF_partial_set;
         }
         else if ( rc == -EINTR && i )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 0;
+            page->partial_flags = 0;
             rc = -ERESTART;
         }
         if ( rc < 0 )
@@ -1778,7 +1800,7 @@ static int alloc_l3_table(struct page_in
         if ( i )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = 0;
+            page->partial_flags = 0;
             current->arch.old_guest_ptpg = NULL;
             current->arch.old_guest_table = page;
         }
@@ -1844,19 +1866,21 @@ static int alloc_l4_table(struct page_in
     unsigned long  pfn = page_to_mfn(page);
     l4_pgentry_t  *pl4e = map_domain_page(_mfn(pfn));
     unsigned int   i;
-    int            rc = 0, partial = page->partial_pte;
+    int            rc = 0;
+    unsigned int   partial_flags = page->partial_flags;
 
     for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
-          i++, partial = 0 )
+          i++, partial_flags = 0 )
     {
         if ( !is_guest_l4_slot(d, i) ||
-             (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
+             (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
             continue;
 
         if ( rc == -ERESTART )
         {
             page->nr_validated_ptes = i;
-            page->partial_pte = partial ?: 1;
+            /* Set 'set', leave 'general ref' set if this entry was set */
+            page->partial_flags = partial_flags | PTF_partial_set;
         }
         else if ( rc < 0 )
         {
@@ -1865,7 +1889,7 @@ static int alloc_l4_table(struct page_in
             if ( i )
             {
                 page->nr_validated_ptes = i;
-                page->partial_pte = 0;
+                page->partial_flags = 0;
                 if ( rc == -EINTR )
                     rc = -ERESTART;
                 else
@@ -1919,19 +1943,20 @@ static int free_l2_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l2_pgentry_t *pl2e;
-    int rc = 0, partial = page->partial_pte;
-    unsigned int i = page->nr_validated_ptes - !partial;
+    int rc = 0;
+    unsigned int partial_flags = page->partial_flags,
+        i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
 
     pl2e = map_domain_page(_mfn(pfn));
 
     for ( ; ; )
     {
         if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
-            rc = put_page_from_l2e(pl2e[i], pfn, partial, 0);
+            rc = put_page_from_l2e(pl2e[i], pfn, partial_flags);
         if ( rc < 0 )
             break;
 
-        partial = 0;
+        partial_flags = 0;
 
         if ( !i-- )
             break;
@@ -1953,12 +1978,14 @@ static int free_l2_table(struct page_inf
     else if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = partial ?: -1;
+        page->partial_flags = (partial_flags & PTF_partial_set) ?
+            partial_flags :
+            (PTF_partial_set | PTF_partial_general_ref);
     }
     else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
     {
         page->nr_validated_ptes = i + 1;
-        page->partial_pte = 0;
+        page->partial_flags = 0;
         rc = -ERESTART;
     }
 
@@ -1970,8 +1997,9 @@ static int free_l3_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l3_pgentry_t *pl3e;
-    int rc = 0, partial = page->partial_pte;
-    unsigned int  i = page->nr_validated_ptes - !partial;
+    int rc = 0;
+    unsigned int partial_flags = page->partial_flags,
+        i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
 
     pl3e = map_domain_page(_mfn(pfn));
 
@@ -1979,11 +2007,11 @@ static int free_l3_table(struct page_inf
     {
         if ( is_guest_l3_slot(i) )
         {
-            rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
+            rc = put_page_from_l3e(pl3e[i], pfn, partial_flags);
             if ( rc < 0 )
                 break;
 
-            partial = 0;
+            partial_flags = 0;
             if ( rc == 0 )
                 unadjust_guest_l3e(pl3e[i], d);
         }
@@ -2003,12 +2031,14 @@ static int free_l3_table(struct page_inf
     if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = partial ?: -1;
+        page->partial_flags = (partial_flags & PTF_partial_set) ?
+            partial_flags :
+            (PTF_partial_set | PTF_partial_general_ref);
     }
     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
     {
         page->nr_validated_ptes = i + 1;
-        page->partial_pte = 0;
+        page->partial_flags = 0;
         rc = -ERESTART;
     }
     return rc > 0 ? 0 : rc;
@@ -2019,26 +2049,29 @@ static int free_l4_table(struct page_inf
     struct domain *d = page_get_owner(page);
     unsigned long pfn = page_to_mfn(page);
     l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
-    int rc = 0, partial = page->partial_pte;
-    unsigned int  i = page->nr_validated_ptes - !partial;
+    int rc = 0;
+    unsigned partial_flags = page->partial_flags,
+        i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
 
     do {
         if ( is_guest_l4_slot(d, i) )
-            rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
+            rc = put_page_from_l4e(pl4e[i], pfn, partial_flags);
         if ( rc < 0 )
             break;
-        partial = 0;
+        partial_flags = 0;
     } while ( i-- );
 
     if ( rc == -ERESTART )
     {
         page->nr_validated_ptes = i;
-        page->partial_pte = partial ?: -1;
+        page->partial_flags = (partial_flags & PTF_partial_set) ?
+            partial_flags :
+            (PTF_partial_set | PTF_partial_general_ref);
     }
     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
     {
         page->nr_validated_ptes = i + 1;
-        page->partial_pte = 0;
+        page->partial_flags = 0;
         rc = -ERESTART;
     }
 
@@ -2316,7 +2349,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
         return -EBUSY;
     }
 
-    put_page_from_l2e(ol2e, pfn, 0, 1);
+    put_page_from_l2e(ol2e, pfn, PTF_defer);
 
     return rc;
 }
@@ -2390,7 +2423,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
         if ( !create_pae_xen_mappings(d, pl3e) )
             BUG();
 
-    put_page_from_l3e(ol3e, pfn, 0, 1);
+    put_page_from_l3e(ol3e, pfn, PTF_defer);
     return rc;
 }
 
@@ -2452,7 +2485,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
         return -EFAULT;
     }
 
-    put_page_from_l4e(ol4e, pfn, 0, 1);
+    put_page_from_l4e(ol4e, pfn, PTF_defer);
     return rc;
 }
 
@@ -2716,7 +2749,7 @@ int free_page_type(struct page_info *pag
     if ( !(type & PGT_partial) )
     {
         page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
-        page->partial_pte = 0;
+        page->partial_flags = 0;
     }
 
     switch ( type & PGT_type_mask )
@@ -2997,7 +3030,7 @@ static int __get_page_type(struct page_i
         if ( !(x & PGT_partial) )
         {
             page->nr_validated_ptes = 0;
-            page->partial_pte = 0;
+            page->partial_flags = 0;
         }
         page->linear_pt_count = 0;
         rc = alloc_page_type(page, type, preemptible);
@@ -3362,7 +3395,8 @@ int new_guest_cr3(unsigned long mfn)
 
     rc = paging_mode_refcounts(d)
          ? (get_page_from_pagenr(mfn, d) ? 0 : -EINVAL)
-         : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d, 0, 1);
+         : get_page_and_type_from_pagenr(mfn, PGT_root_page_table, d,
+                                         PTF_preemptible);
     switch ( rc )
     {
     case 0:
@@ -3750,7 +3784,7 @@ long do_mmuext_op(
                     rc = get_page_from_pagenr(op.arg1.mfn, d) ? 0 : -EINVAL;
                 else
                     rc = get_page_and_type_from_pagenr(
-                        op.arg1.mfn, PGT_root_page_table, d, 0, 1);
+                        op.arg1.mfn, PGT_root_page_table, d, PTF_preemptible);
 
                 if ( unlikely(rc) )
                 {
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -141,19 +141,34 @@ struct page_info
          * setting the flag must not drop that reference, whereas the instance
          * clearing it will have to.
          *
-         * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
-         * been partially validated. This implies that the general reference
-         * to the page (acquired from get_page_from_lNe()) would be dropped
-         * (again due to the apparent failure) and hence must be re-acquired
-         * when resuming the validation, but must not be dropped when picking
-         * up the page for invalidation.
-         *
-         * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
-         * been partially invalidated. This is basically the opposite case of
-         * above, i.e. the general reference to the page was not dropped in
-         * put_page_from_lNe() (due to the apparent failure), and hence it
-         * must be dropped when the put operation is resumed (and completes),
-         * but it must not be acquired if picking up the page for validation.
+         * If partial_flags & PTF_partial_set is set, then the page at
+         * at @nr_validated_ptes had PGT_partial set as a result of an
+         * operation on the current page.  (That page may or may not
+         * still have PGT_partial set.)
+         *
+         * If PTF_partial_general_ref is set, then the PTE at
+         * @nr_validated_ptef holds a general reference count for the
+         * page.
+         *
+         * This happens:
+         * - During de-validation, if de-validation of the page was
+         *   interrupted
+         * - During validation, if an invalid entry is encountered and
+         *   validation is preemptible
+         * - During validation, if PTF_partial_general_ref was set on
+         *   this entry to begin with (perhaps because we're picking
+         *   up from a partial de-validation).
+         *
+         * When resuming validation, if PTF_partial_general_ref is clear,
+         * then a general reference must be re-acquired; if it is set, no
+         * reference should be acquired.
+         *
+         * When resuming de-validation, if PTF_partial_general_ref is
+         * clear, no reference should be dropped; if it is set, a
+         * reference should be dropped.
+         *
+         * NB that PTF_partial_set and PTF_partial_general_ref are
+         * defined in mm.c, the only place where they are used.
          *
          * The 3rd field, @linear_pt_count, indicates
          * - by a positive value, how many same-level page table entries a page
@@ -164,7 +179,7 @@ struct page_info
         struct {
             u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
             u16 :16 - PAGETABLE_ORDER - 1 - 2;
-            s16 partial_pte:2;
+            u16 partial_flags:2;
             s16 linear_pt_count;
         };
 
openSUSE Build Service is sponsored by