File 62a1e649-x86-track-and-flush-non-coherent.patch of Package xen.25153
# Commit c1c9cae3a9633054b177c5de21ad7268162b2f2c
# Date 2022-06-09 14:23:37 +0200
# Author Andrew Cooper <andrew.cooper3@citrix.com>
# Committer Jan Beulich <jbeulich@suse.com>
x86/pv: Track and flush non-coherent mappings of RAM
There are legitimate uses of WC mappings of RAM, e.g. for DMA buffers with
devices that make non-coherent writes.  The Linux sound subsystem makes
extensive use of this technique.
For such usecases, the guest's DMA buffer is mapped and consistently used as
WC, and Xen doesn't interact with the buffer.
However, a mischevious guest can use WC mappings to deliberately create
non-coherency between the cache and RAM, and use this to trick Xen into
validating a pagetable which isn't actually safe.
Allocate a new PGT_non_coherent to track the non-coherency of mappings.  Set
it whenever a non-coherent writeable mapping is created.  If the page is used
as anything other than PGT_writable_page, force a cache flush before
validation.  Also force a cache flush before the page is returned to the heap.
This is CVE-2022-26364, part of XSA-402.
Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -997,6 +997,15 @@ get_page_from_l1e(
         return -EACCES;
     }
 
+    /*
+     * Track writeable non-coherent mappings to RAM pages, to trigger a cache
+     * flush later if the target is used as anything but a PGT_writeable page.
+     * We care about all writeable mappings, including foreign mappings.
+     */
+    if ( !boot_cpu_has(X86_FEATURE_XEN_SELFSNOOP) &&
+         (l1f & (PAGE_CACHE_ATTRS | _PAGE_RW)) == (_PAGE_WC | _PAGE_RW) )
+        set_bit(_PGT_non_coherent, &page->u.inuse.type_info);
+
     return 0;
 
  could_not_pin:
@@ -2454,6 +2463,19 @@ static int cleanup_page_mappings(struct
         }
     }
 
+    /*
+     * Flush the cache if there were previously non-coherent writeable
+     * mappings of this page.  This forces the page to be coherent before it
+     * is freed back to the heap.
+     */
+    if ( __test_and_clear_bit(_PGT_non_coherent, &page->u.inuse.type_info) )
+    {
+        void *addr = __map_domain_page(page);
+
+        cache_flush(addr, PAGE_SIZE);
+        unmap_domain_page(addr);
+    }
+
     return rc;
 }
 
@@ -3029,6 +3051,22 @@ static int _get_page_type(struct page_in
     if ( unlikely(!(nx & PGT_validated)) )
     {
         /*
+         * Flush the cache if there were previously non-coherent mappings of
+         * this page, and we're trying to use it as anything other than a
+         * writeable page.  This forces the page to be coherent before we
+         * validate its contents for safety.
+         */
+        if ( (nx & PGT_non_coherent) && type != PGT_writable_page )
+        {
+            void *addr = __map_domain_page(page);
+
+            cache_flush(addr, PAGE_SIZE);
+            unmap_domain_page(addr);
+
+            page->u.inuse.type_info &= ~PGT_non_coherent;
+        }
+
+        /*
          * No special validation needed for writable or shared pages.  Page
          * tables and GDT/LDT need to have their contents audited.
          *
--- a/xen/arch/x86/pv/grant_table.c
+++ b/xen/arch/x86/pv/grant_table.c
@@ -109,7 +109,17 @@ int create_grant_pv_mapping(uint64_t add
 
     ol1e = *pl1e;
     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+    {
+        /*
+         * We always create mappings in this path.  However, our caller,
+         * map_grant_ref(), only passes potentially non-zero cache_flags for
+         * MMIO frames, so this path doesn't create non-coherent mappings of
+         * RAM frames and there's no need to calculate PGT_non_coherent.
+         */
+        ASSERT(!cache_flags || is_iomem_page(frame));
+
         rc = GNTST_okay;
+    }
 
  out_unlock:
     page_unlock(page);
@@ -294,7 +304,18 @@ int replace_grant_pv_mapping(uint64_t ad
                  l1e_get_flags(ol1e), addr, grant_pte_flags);
 
     if ( UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, curr, 0) )
+    {
+        /*
+         * Generally, replace_grant_pv_mapping() is used to destroy mappings
+         * (n1le = l1e_empty()), but it can be a present mapping on the
+         * GNTABOP_unmap_and_replace path.
+         *
+         * In such cases, the PTE is fully transplanted from its old location
+         * via steal_linear_addr(), so we need not perform PGT_non_coherent
+         * checking here.
+         */
         rc = GNTST_okay;
+    }
 
  out_unlock:
     page_unlock(page);
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -53,8 +53,12 @@
 #define _PGT_partial      PG_shift(8)
 #define PGT_partial       PG_mask(1, 8)
 
+/* Has this page been mapped writeable with a non-coherent memory type? */
+#define _PGT_non_coherent PG_shift(9)
+#define PGT_non_coherent  PG_mask(1, 9)
+
  /* Count of uses of this frame as its current type. */
-#define PGT_count_width   PG_shift(8)
+#define PGT_count_width   PG_shift(9)
 #define PGT_count_mask    ((1UL<<PGT_count_width)-1)
 
 /* Are the 'type mask' bits identical? */