File 5dbafad5-x86-mm-dont-drop-type-ref-unless.patch of Package xen.16553
# Commit c40b33d72630dcfa506d6fd856532d6152cb97dc
# Date 2019-10-31 16:16:37 +0100
# Author George Dunlap <george.dunlap@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/mm: Don't drop a type ref unless you held a ref to begin with
Validation and de-validation of pagetable trees may take arbitrarily
large amounts of time, and so must be preemptible. This is indicated
by setting the PGT_partial bit in the type_info, and setting
nr_validated_entries and partial_flags appropriately. Specifically,
if the entry at [nr_validated_entries] is partially validated,
partial_flags should have the PGT_partial_set bit set, and the entry
should hold a general reference count. During de-validation,
put_page_type() is called on partially validated entries.
Unfortunately, there are a number of issues with the current algorithm.
First, doing a "normal" put_page_type() is not safe when no type ref
is held: there is nothing to stop another vcpu from coming along and
picking up validation again: at which point the put_page_type may drop
the only page ref on an in-use page. Some examples are listed in the
appendix.
The core issue is that put_page_type() is being called both to clean
up PGT_partial, and to drop a type count; and has no way of knowing
which is which; and so if in between, PGT_partial is cleared,
put_page_type() will drop the type ref erroneously.
What is needed is to distinguish between two states:
- Dropping a type ref which is held
- Cleaning up a page which has been partially de/validated
Fix this by telling put_page_type() which of the two activities you
intend.
When cleaning up a partial de/validation, take no action unless you
find a page partially validated.
If put_page_type() is called without PTF_partial_set, and finds the
page in a PGT_partial state anyway, then there's certainly been a
misaccounting somewhere, and carrying on would almost certainly cause
a security issue, so crash the host instead.
In put_page_from_lNe, pass partial_flags on to _put_page_type().
old_guest_table may be set either with a fully validated page (when
using the "deferred put" pattern), or with a partially validated page
(when a normal "de-validation" is interrupted, or when a validation
fails part-way through due to invalid entries). Add a flag,
old_guest_table_partial, to indicate which of these it is, and use
that to pass the appropriate flag to _put_page_type().
While here, delete stray trailing whitespace.
This is part of XSA-299.
Reported-by: George Dunlap <george.dunlap@citrix.com>
Signed-off-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
-----
Appendix:
Suppose page A, when interpreted as an l3 pagetable, contains all
valid entries; and suppose A[x] points to page B, which when
interpreted as an l2 pagetable, contains all valid entries.
P1: PIN_L3_TABLE
A -> PGT_l3_table | 1 | valid
B -> PGT_l2_table | 1 | valid
P1: UNPIN_TABLE
> Arrange to interrupt after B has been de-validated
B:
type_info -> PGT_l2_table | 0
A:
type_info -> PGT_l3_table | 1 | partial
     nr_validated_entries -> (less than x)
P2: mod_l4_entry to point to A
> Arrange for this to be interrupted while B is being validated
B:
type_info -> PGT_l2_table | 1 | partial
    (nr_validated_entries &c set as appropriate)
A:
type_info -> PGT_l3_table | 1 | partial
nr_validated_entries -> x
partial_pte = 1
P3: mod_l3_entry some other unrelated l3 to point to B:
B:
type_info -> PGT_l2_table | 1
P1: Restart UNPIN_TABLE
At this point, since A.nr_validated_entries == x and A.partial_pte !=
0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping
its type count to 0 while it's still being pointed to by some other l3
A similar issue arises with old_guest_table. Consider the following
scenario:
Suppose A is a page which, when interpreted as an l2, has valid entries
until entry x, which is invalid.
V1: PIN_L2_TABLE(A)
<Validate until we try to validate [x], get -EINVAL>
A -> PGT_l2_table | 1 | PGT_partial
V1 -> old_guest_table = A
<delayed>
V2: PIN_L2_TABLE(A)
<Pick up where V1 left off, try to re-validate [x], get -EINVAL>
A -> PGT_l2_table | 1 | PGT_partial
V2 -> old_guest_table = A
<restart>
put_old_guest_table()
_put_page_type(A)
A -> PGT_l2_table | 0
V1: <restart>
put_old_guest_table()
_put_page_type(A) # UNDERFLOW
Indeed, it is possible to engineer for old_guest_table for every vcpu
a guest has to point to the same page.
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1075,9 +1075,15 @@ int arch_set_info_guest(
rc = -ERESTART;
/* Fallthrough */
case -ERESTART:
+ /*
+ * NB that we're putting the kernel-mode table
+ * here, which we've already successfully
+ * validated above; hence partial = false;
+ */
v->arch.old_guest_ptpg = NULL;
v->arch.old_guest_table =
pagetable_get_page(v->arch.guest_table);
+ v->arch.old_guest_table_partial = false;
v->arch.guest_table = pagetable_null();
break;
default:
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -1384,10 +1384,11 @@ static int put_page_from_l2e(l2_pgentry_
{
current->arch.old_guest_ptpg = ptpg;
current->arch.old_guest_table = pg;
+ current->arch.old_guest_table_partial = false;
}
else
{
- rc = _put_page_type(pg, PTF_preemptible, ptpg);
+ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg);
if ( likely(!rc) )
put_page(pg);
}
@@ -1410,6 +1411,7 @@ static int put_page_from_l3e(l3_pgentry_
unsigned long mfn = l3e_get_pfn(l3e);
int writeable = l3e_get_flags(l3e) & _PAGE_RW;
+ ASSERT(!(flags & PTF_partial_set));
ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
do {
put_data_page(mfn_to_page(_mfn(mfn)), writeable);
@@ -1422,12 +1424,14 @@ static int put_page_from_l3e(l3_pgentry_
if ( flags & PTF_defer )
{
+ ASSERT(!(flags & PTF_partial_set));
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
current->arch.old_guest_table = pg;
+ current->arch.old_guest_table_partial = false;
return 0;
}
- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn)));
if ( likely(!rc) )
put_page(pg);
@@ -1446,12 +1450,15 @@ static int put_page_from_l4e(l4_pgentry_
if ( flags & PTF_defer )
{
+ ASSERT(!(flags & PTF_partial_set));
current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
current->arch.old_guest_table = pg;
+ current->arch.old_guest_table_partial = false;
return 0;
}
- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
+ rc = _put_page_type(pg, flags | PTF_preemptible,
+ mfn_to_page(_mfn(pfn)));
if ( likely(!rc) )
put_page(pg);
}
@@ -1556,6 +1563,14 @@ static int alloc_l2_table(struct page_in
pl2e = map_domain_page(_mfn(pfn));
+ /*
+ * NB that alloc_l2_table will never set partial_pte on an l2; but
+ * free_l2_table might if a linear_pagetable entry is interrupted
+ * partway through de-validation. In that circumstance,
+ * get_page_from_l2e() will always return -EINVAL; and we must
+ * retain the type ref by doing the normal partial_flags tracking.
+ */
+
for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
i++, partial_flags = 0 )
{
@@ -1610,6 +1625,7 @@ static int alloc_l2_table(struct page_in
page->partial_flags = partial_flags;
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
+ current->arch.old_guest_table_partial = true;
}
}
if ( rc < 0 )
@@ -1712,12 +1728,16 @@ static int alloc_l3_table(struct page_in
* builds.
*/
if ( current->arch.old_guest_table == l3e_get_page(l3e) )
+ {
+ ASSERT(current->arch.old_guest_table_partial);
page->partial_flags = PTF_partial_set;
+ }
else
ASSERT_UNREACHABLE();
}
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
+ current->arch.old_guest_table_partial = true;
}
while ( i-- > 0 )
pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
@@ -1891,12 +1911,16 @@ static int alloc_l4_table(struct page_in
* builds.
*/
if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
+ {
+ ASSERT(current->arch.old_guest_table_partial);
page->partial_flags = PTF_partial_set;
+ }
else
ASSERT_UNREACHABLE();
}
current->arch.old_guest_ptpg = NULL;
current->arch.old_guest_table = page;
+ current->arch.old_guest_table_partial = true;
}
}
}
@@ -2760,6 +2784,28 @@ static int _put_page_type(struct page_in
x = y;
nx = x - 1;
+ /*
+ * Is this expected to do a full reference drop, or only
+ * cleanup partial validation / devalidation?
+ *
+ * If the former, the caller must hold a "full" type ref;
+ * which means the page must be validated. If the page is
+ * *not* fully validated, continuing would almost certainly
+ * open up a security hole. An exception to this is during
+ * domain destruction, where PGT_validated can be dropped
+ * without dropping a type ref.
+ *
+ * If the latter, do nothing unless type PGT_partial is set.
+ * If it is set, the type count must be 1.
+ */
+ if ( !(flags & PTF_partial_set) )
+ BUG_ON((x & PGT_partial) ||
+ !((x & PGT_validated) || page_get_owner(page)->is_dying));
+ else if ( !(x & PGT_partial) )
+ return 0;
+ else
+ BUG_ON((x & PGT_count_mask) != 1);
+
ASSERT((x & PGT_count_mask) != 0);
if ( unlikely((nx & PGT_count_mask) == 0) )
@@ -3012,17 +3058,34 @@ int put_old_guest_table(struct vcpu *v)
if ( !v->arch.old_guest_table )
return 0;
- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
- v->arch.old_guest_ptpg) )
+ rc = _put_page_type(v->arch.old_guest_table,
+ PTF_preemptible |
+ ( v->arch.old_guest_table_partial ?
+ PTF_partial_set : 0 ),
+ v->arch.old_guest_ptpg);
+
+ if ( rc == -ERESTART || rc == -EINTR )
{
- case -EINTR:
- case -ERESTART:
+ v->arch.old_guest_table_partial = (rc == -ERESTART);
return -ERESTART;
- case 0:
- put_page(v->arch.old_guest_table);
}
+ /*
+ * It shouldn't be possible for _put_page_type() to return
+ * anything else at the moment; but if it does happen in
+ * production, leaking the type ref is probably the best thing to
+ * do. Either way, drop the general ref held by old_guest_table.
+ */
+ ASSERT(rc == 0);
+
+ put_page(v->arch.old_guest_table);
v->arch.old_guest_table = NULL;
+ v->arch.old_guest_ptpg = NULL;
+ /*
+ * Safest default if someone sets old_guest_table without
+ * explicitly setting old_guest_table_partial.
+ */
+ v->arch.old_guest_table_partial = true;
return rc;
}
@@ -3175,11 +3238,11 @@ int new_guest_cr3(mfn_t mfn)
switch ( rc = put_page_and_type_preemptible(page) )
{
case -EINTR:
- rc = -ERESTART;
- /* fallthrough */
case -ERESTART:
curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
+ rc = -ERESTART;
break;
default:
BUG_ON(rc);
@@ -3448,6 +3511,7 @@ long do_mmuext_op(
{
curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
+ curr->arch.old_guest_table_partial = false;
}
}
}
@@ -3482,6 +3546,11 @@ long do_mmuext_op(
case -ERESTART:
curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
+ /*
+ * EINTR means we still hold the type ref; ERESTART
+ * means PGT_partial holds the type ref
+ */
+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
rc = 0;
break;
default:
@@ -3550,11 +3619,15 @@ long do_mmuext_op(
switch ( rc = put_page_and_type_preemptible(page) )
{
case -EINTR:
- rc = -ERESTART;
- /* fallthrough */
case -ERESTART:
curr->arch.old_guest_ptpg = NULL;
curr->arch.old_guest_table = page;
+ /*
+ * EINTR means we still hold the type ref;
+ * ERESTART means PGT_partial holds the ref
+ */
+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
+ rc = -ERESTART;
break;
default:
BUG_ON(rc);
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -309,7 +309,7 @@ struct arch_domain
struct paging_domain paging;
struct p2m_domain *p2m;
- /* To enforce lock ordering in the pod code wrt the
+ /* To enforce lock ordering in the pod code wrt the
* page_alloc lock */
int page_alloc_unlock_level;
@@ -548,6 +548,8 @@ struct arch_vcpu
struct page_info *old_guest_table; /* partially destructed pagetable */
struct page_info *old_guest_ptpg; /* containing page table of the */
/* former, if any */
+ bool old_guest_table_partial; /* Are we dropping a type ref, or just
+ * finishing up a partial de-validation? */
/* guest_table holds a ref to the page, and also a type-count unless
* shadow refcounts are in use */
pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */