File 5dbaf9ce-x86-mm-split-partial_pte-tristate.patch of Package xen.16553
# Commit 1b6fa638d21006d3c0a3038132c6cb326d8bba08
# Date 2019-10-31 16:12:14 +0100
# Author George Dunlap <george.dunlap@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/mm: Separate out partial_pte tristate into individual flags
At the moment, partial_pte is a tri-state that contains two distinct bits
of information:
1. If zero, the pte at index [nr_validated_ptes] is un-validated. If
non-zero, the pte was last seen with PGT_partial set.
2. If positive, the pte at index [nr_validated_ptes] does not hold a
general reference count. If negative, it does.
To make future patches more clear, separate out this functionality
into two distinct, named bits: PTF_partial_set (for #1) and
PTF_partial_general_ref (for #2).
Additionally, a number of functions which need this information also
take other flags to control behavior (such as `preemptible` and
`defer`). These are hard to read in the caller (since you only see
'true' or 'false'), and ugly when many are added together. In
preparation for adding yet another flag in a future patch, collapse
all of these into a single `flag` variable.
NB that this does mean checking for what was previously the '-1'
condition a bit more ugly in the put_page_from_lNe functions (since
you have to check for both partial_set and general ref); but this
clause will go away in a future patch.
Also note that the original comment had an off-by-one error:
partial_flags (like partial_pte before it) concerns
plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1].
No functional change intended.
This is part of XSA-299.
Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -651,20 +651,34 @@ static int alloc_segdesc_page(struct pag
static int __get_page_type(struct page_info *page, unsigned long type,
int preemptible);
+/*
+ * The following flags are used to specify behavior of various get and
+ * put commands. The first two are also stored in page->partial_flags
+ * to indicate the state of the page pointed to by
+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
+ * more information.
+ */
+#define PTF_partial_set (1 << 0)
+#define PTF_partial_general_ref (1 << 1)
+#define PTF_preemptible (1 << 2)
+#define PTF_defer (1 << 3)
+
static int get_page_and_type_from_mfn(
mfn_t mfn, unsigned long type, struct domain *d,
- int partial, int preemptible)
+ unsigned int flags)
{
struct page_info *page = mfn_to_page(mfn);
int rc;
+ bool preemptible = flags & PTF_preemptible,
+ partial_ref = flags & PTF_partial_general_ref;
- if ( likely(partial >= 0) &&
+ if ( likely(!partial_ref) &&
unlikely(!get_page_from_mfn(mfn, d)) )
return -EINVAL;
rc = __get_page_type(page, type, preemptible);
- if ( unlikely(rc) && partial >= 0 &&
+ if ( unlikely(rc) && !partial_ref &&
(!preemptible || page != current->arch.old_guest_table) )
put_page(page);
@@ -1146,7 +1160,7 @@ get_page_from_l1e(
define_get_linear_pagetable(l2);
static int
get_page_from_l2e(
- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags)
{
unsigned long mfn = l2e_get_pfn(l2e);
int rc;
@@ -1163,8 +1177,9 @@ get_page_from_l2e(
if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
{
- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
- partial, false);
+ ASSERT(!(flags & PTF_preemptible));
+
+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags);
if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
rc = 0;
return rc;
@@ -1183,7 +1198,7 @@ get_page_from_l2e(
define_get_linear_pagetable(l3);
static int
get_page_from_l3e(
- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags)
{
int rc;
@@ -1198,7 +1213,7 @@ get_page_from_l3e(
}
rc = get_page_and_type_from_mfn(
- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
+ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible);
if ( unlikely(rc == -EINVAL) &&
!is_pv_32bit_domain(d) &&
get_l3_linear_pagetable(l3e, pfn, d) )
@@ -1216,7 +1231,7 @@ get_page_from_l3e(
define_get_linear_pagetable(l4);
static int
get_page_from_l4e(
- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags)
{
int rc;
@@ -1231,7 +1246,7 @@ get_page_from_l4e(
}
rc = get_page_and_type_from_mfn(
- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
+ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible);
if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
rc = 0;
@@ -1306,7 +1321,7 @@ void put_page_from_l1e(l1_pgentry_t l1e,
* Note also that this automatically deals correctly with linear p.t.'s.
*/
static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- int partial, bool defer)
+ unsigned int flags)
{
int rc = 0;
@@ -1326,12 +1341,13 @@ static int put_page_from_l2e(l2_pgentry_
struct page_info *pg = l2e_get_page(l2e);
struct page_info *ptpg = mfn_to_page(_mfn(pfn));
- if ( unlikely(partial > 0) )
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
{
- ASSERT(!defer);
+ ASSERT(!(flags & PTF_defer));
rc = _put_page_type(pg, true, ptpg);
}
- else if ( defer )
+ else if ( flags & PTF_defer )
{
current->arch.old_guest_ptpg = ptpg;
current->arch.old_guest_table = pg;
@@ -1348,7 +1364,7 @@ static int put_page_from_l2e(l2_pgentry_
}
static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- int partial, bool defer)
+ unsigned int flags)
{
struct page_info *pg;
int rc;
@@ -1371,13 +1387,14 @@ static int put_page_from_l3e(l3_pgentry_
pg = l3e_get_page(l3e);
- if ( unlikely(partial > 0) )
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
{
- ASSERT(!defer);
+ ASSERT(!(flags & PTF_defer));
return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
}
- if ( defer )
+ if ( flags & PTF_defer )
{
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
current->arch.old_guest_table = pg;
@@ -1392,7 +1409,7 @@ static int put_page_from_l3e(l3_pgentry_
}
static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- int partial, bool defer)
+ unsigned int flags)
{
int rc = 1;
@@ -1401,13 +1418,14 @@ static int put_page_from_l4e(l4_pgentry_
{
struct page_info *pg = l4e_get_page(l4e);
- if ( unlikely(partial > 0) )
+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
+ PTF_partial_set )
{
- ASSERT(!defer);
+ ASSERT(!(flags & PTF_defer));
return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
}
- if ( defer )
+ if ( flags & PTF_defer )
{
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
current->arch.old_guest_table = pg;
@@ -1514,12 +1532,13 @@ static int alloc_l2_table(struct page_in
unsigned long pfn = mfn_x(page_to_mfn(page));
l2_pgentry_t *pl2e;
unsigned int i;
- int rc = 0, partial = page->partial_pte;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;
pl2e = map_domain_page(_mfn(pfn));
for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
- i++, partial = 0 )
+ i++, partial_flags = 0 )
{
if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
{
@@ -1529,18 +1548,19 @@ static int alloc_l2_table(struct page_in
}
if ( !is_guest_l2_slot(d, type, i) ||
- (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
+ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
continue;
if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
+ /* Set 'set', retain 'general ref' */
+ page->partial_flags = partial_flags | PTF_partial_set;
}
else if ( rc == -EINTR && i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
else if ( rc < 0 && rc != -EINTR )
@@ -1549,7 +1569,7 @@ static int alloc_l2_table(struct page_in
if ( i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
@@ -1573,7 +1593,8 @@ static int alloc_l3_table(struct page_in
unsigned long pfn = mfn_x(page_to_mfn(page));
l3_pgentry_t *pl3e;
unsigned int i;
- int rc = 0, partial = page->partial_pte;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;
pl3e = map_domain_page(_mfn(pfn));
@@ -1588,7 +1609,7 @@ static int alloc_l3_table(struct page_in
memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
- i++, partial = 0 )
+ i++, partial_flags = 0 )
{
if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
{
@@ -1605,20 +1626,22 @@ static int alloc_l3_table(struct page_in
else
rc = get_page_and_type_from_mfn(
l3e_get_mfn(pl3e[i]),
- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
+ PGT_l2_page_table | PGT_pae_xen_l2, d,
+ partial_flags | PTF_preemptible);
}
- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
+ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
continue;
if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+ page->partial_flags = partial_flags | PTF_partial_set;
}
else if ( rc == -EINTR && i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
if ( rc < 0 )
@@ -1635,7 +1658,7 @@ static int alloc_l3_table(struct page_in
if ( i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
}
@@ -1767,19 +1790,21 @@ static int alloc_l4_table(struct page_in
unsigned long pfn = mfn_x(page_to_mfn(page));
l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
unsigned int i;
- int rc = 0, partial = page->partial_pte;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags;
for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
- i++, partial = 0 )
+ i++, partial_flags = 0 )
{
if ( !is_guest_l4_slot(d, i) ||
- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
+ (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
continue;
if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: 1;
+ /* Set 'set', leave 'general ref' set if this entry was set */
+ page->partial_flags = partial_flags | PTF_partial_set;
}
else if ( rc < 0 )
{
@@ -1789,7 +1814,7 @@ static int alloc_l4_table(struct page_in
if ( i )
{
page->nr_validated_ptes = i;
- page->partial_pte = 0;
+ page->partial_flags = 0;
if ( rc == -EINTR )
rc = -ERESTART;
else
@@ -1842,19 +1867,20 @@ static int free_l2_table(struct page_inf
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
l2_pgentry_t *pl2e;
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags,
+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
pl2e = map_domain_page(_mfn(pfn));
for ( ; ; )
{
if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
- rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
+ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags);
if ( rc < 0 )
break;
- partial = 0;
+ partial_flags = 0;
if ( !i-- )
break;
@@ -1876,12 +1902,14 @@ static int free_l2_table(struct page_inf
else if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
+ page->partial_flags = (partial_flags & PTF_partial_set) ?
+ partial_flags :
+ (PTF_partial_set | PTF_partial_general_ref);
}
else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
{
page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
@@ -1893,18 +1921,19 @@ static int free_l3_table(struct page_inf
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
l3_pgentry_t *pl3e;
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
+ int rc = 0;
+ unsigned int partial_flags = page->partial_flags,
+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
pl3e = map_domain_page(_mfn(pfn));
for ( ; ; )
{
- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
+ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags);
if ( rc < 0 )
break;
- partial = 0;
+ partial_flags = 0;
if ( rc == 0 )
pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
@@ -1923,12 +1952,14 @@ static int free_l3_table(struct page_inf
if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
+ page->partial_flags = (partial_flags & PTF_partial_set) ?
+ partial_flags :
+ (PTF_partial_set | PTF_partial_general_ref);
}
else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
{
page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
return rc > 0 ? 0 : rc;
@@ -1939,26 +1970,29 @@ static int free_l4_table(struct page_inf
struct domain *d = page_get_owner(page);
unsigned long pfn = mfn_x(page_to_mfn(page));
l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
- int rc = 0, partial = page->partial_pte;
- unsigned int i = page->nr_validated_ptes - !partial;
+ int rc = 0;
+ unsigned partial_flags = page->partial_flags,
+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
do {
if ( is_guest_l4_slot(d, i) )
- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
+ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags);
if ( rc < 0 )
break;
- partial = 0;
+ partial_flags = 0;
} while ( i-- );
if ( rc == -ERESTART )
{
page->nr_validated_ptes = i;
- page->partial_pte = partial ?: -1;
+ page->partial_flags = (partial_flags & PTF_partial_set) ?
+ partial_flags :
+ (PTF_partial_set | PTF_partial_general_ref);
}
else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
{
page->nr_validated_ptes = i + 1;
- page->partial_pte = 0;
+ page->partial_flags = 0;
rc = -ERESTART;
}
@@ -2180,7 +2214,7 @@ static int mod_l2_entry(l2_pgentry_t *pl
return -EBUSY;
}
- put_page_from_l2e(ol2e, pfn, 0, true);
+ put_page_from_l2e(ol2e, pfn, PTF_defer);
return rc;
}
@@ -2248,7 +2282,7 @@ static int mod_l3_entry(l3_pgentry_t *pl
if ( !create_pae_xen_mappings(d, pl3e) )
BUG();
- put_page_from_l3e(ol3e, pfn, 0, 1);
+ put_page_from_l3e(ol3e, pfn, PTF_defer);
return rc;
}
@@ -2311,7 +2345,7 @@ static int mod_l4_entry(l4_pgentry_t *pl
return -EFAULT;
}
- put_page_from_l4e(ol4e, pfn, 0, 1);
+ put_page_from_l4e(ol4e, pfn, PTF_defer);
return rc;
}
@@ -2577,7 +2611,7 @@ int free_page_type(struct page_info *pag
if ( !(type & PGT_partial) )
{
page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
- page->partial_pte = 0;
+ page->partial_flags = 0;
}
switch ( type & PGT_type_mask )
@@ -2862,7 +2896,7 @@ static int __get_page_type(struct page_i
if ( !(x & PGT_partial) )
{
page->nr_validated_ptes = 0;
- page->partial_pte = 0;
+ page->partial_flags = 0;
}
page->linear_pt_count = 0;
rc = alloc_page_type(page, type, preemptible);
@@ -3037,7 +3071,8 @@ int new_guest_cr3(mfn_t mfn)
rc = paging_mode_refcounts(d)
? (get_page_from_mfn(mfn, d) ? 0 : -EINVAL)
- : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
+ : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d,
+ PTF_preemptible);
switch ( rc )
{
case 0:
@@ -3420,7 +3455,7 @@ long do_mmuext_op(
if ( op.arg1.mfn != 0 )
{
rc = get_page_and_type_from_mfn(
- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
+ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible);
if ( unlikely(rc) )
{
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -157,19 +157,34 @@ struct page_info
* setting the flag must not drop that reference, whereas the instance
* clearing it will have to.
*
- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
- * been partially validated. This implies that the general reference
- * to the page (acquired from get_page_from_lNe()) would be dropped
- * (again due to the apparent failure) and hence must be re-acquired
- * when resuming the validation, but must not be dropped when picking
- * up the page for invalidation.
- *
- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
- * been partially invalidated. This is basically the opposite case of
- * above, i.e. the general reference to the page was not dropped in
- * put_page_from_lNe() (due to the apparent failure), and hence it
- * must be dropped when the put operation is resumed (and completes),
- * but it must not be acquired if picking up the page for validation.
+ * If partial_flags & PTF_partial_set is set, then the page at
+ * at @nr_validated_ptes had PGT_partial set as a result of an
+ * operation on the current page. (That page may or may not
+ * still have PGT_partial set.)
+ *
+ * If PTF_partial_general_ref is set, then the PTE at
+ * @nr_validated_ptef holds a general reference count for the
+ * page.
+ *
+ * This happens:
+ * - During de-validation, if de-validation of the page was
+ * interrupted
+ * - During validation, if an invalid entry is encountered and
+ * validation is preemptible
+ * - During validation, if PTF_partial_general_ref was set on
+ * this entry to begin with (perhaps because we're picking
+ * up from a partial de-validation).
+ *
+ * When resuming validation, if PTF_partial_general_ref is clear,
+ * then a general reference must be re-acquired; if it is set, no
+ * reference should be acquired.
+ *
+ * When resuming de-validation, if PTF_partial_general_ref is
+ * clear, no reference should be dropped; if it is set, a
+ * reference should be dropped.
+ *
+ * NB that PTF_partial_set and PTF_partial_general_ref are
+ * defined in mm.c, the only place where they are used.
*
* The 3rd field, @linear_pt_count, indicates
* - by a positive value, how many same-level page table entries a page
@@ -180,7 +195,7 @@ struct page_info
struct {
u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
u16 :16 - PAGETABLE_ORDER - 1 - 2;
- s16 partial_pte:2;
+ u16 partial_flags:2;
s16 linear_pt_count;
};