File xsa378-6.patch of Package xen.21119

AMD/IOMMU: re-arrange exclusion range and unity map recording

The spec makes no provisions for OS behavior here to depend on the
amount of RAM found on the system. While the spec may not sufficiently
clearly distinguish both kinds of regions, they are surely meant to be
separate things: Only regions with ACPI_IVMD_EXCLUSION_RANGE set should
be candidates for putting in the exclusion range registers. (As there's
only a single such pair of registers per IOMMU, secondary non-adjacent
regions with the flag set already get converted to unity mapped
regions.)

First of all, drop the dependency on max_page. With commit b4f042236ae0
("AMD/IOMMU: Cease using a dynamic height for the IOMMU pagetables") the
use of it here was stale anyway; it was bogus already before, as it
didn't account for max_page getting increased later on. Simply try an
exclusion range registration first, and if it fails (for being
unsuitable or non-mergeable), register a unity mapping range.

With this various local variables become unnecessary and hence get
dropped at the same time.

With the max_page boundary dropped for using unity maps, the minimum
page table tree height now needs both recording and enforcing in
amd_iommu_domain_init(). Since we can't predict which devices may get
assigned to a domain, our only option is to uniformly force at least
that height for all domains, now that the height isn't dynamic anymore.

Further don't make use of the exclusion range unless ACPI data says so.

Note that exclusion range registration in
register_range_for_all_devices() is on a best effort basis. Hence unity
map entries also registered are redundant when the former succeeded, but
they also do no harm. Improvements in this area can be done later imo.

Also adjust types where suitable without touching extra lines.

This is part of XSA-378.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>

--- a/xen/drivers/passthrough/amd/iommu_acpi.c
+++ b/xen/drivers/passthrough/amd/iommu_acpi.c
@@ -99,12 +99,8 @@ static struct amd_iommu * __init find_io
 }
 
 static int __init reserve_iommu_exclusion_range(
-    struct amd_iommu *iommu, uint64_t base, uint64_t limit,
-    bool all, bool iw, bool ir)
+    struct amd_iommu *iommu, paddr_t base, paddr_t limit, bool all)
 {
-    if ( !ir || !iw )
-        return -EPERM;
-
     /* need to extend exclusion range? */
     if ( iommu->exclusion_enable )
     {
@@ -133,14 +129,18 @@ static int __init reserve_unity_map_for_
 {
     struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg);
     struct ivrs_unity_map *unity_map = ivrs_mappings[bdf].unity_map;
+    int paging_mode = amd_iommu_get_paging_mode(PFN_UP(base + length));
+
+    if ( paging_mode < 0 )
+        return paging_mode;
 
     /* Check for overlaps. */
     for ( ; unity_map; unity_map = unity_map->next )
     {
         /*
          * Exact matches are okay. This can in particular happen when
-         * register_exclusion_range_for_device() calls here twice for the
-         * same (s,b,d,f).
+         * register_range_for_device() calls here twice for the same
+         * (s,b,d,f).
          */
         if ( base == unity_map->addr && length == unity_map->length &&
              ir == unity_map->read && iw == unity_map->write )
@@ -168,55 +168,52 @@ static int __init reserve_unity_map_for_
     unity_map->next = ivrs_mappings[bdf].unity_map;
     ivrs_mappings[bdf].unity_map = unity_map;
 
+    if ( paging_mode > amd_iommu_min_paging_mode )
+        amd_iommu_min_paging_mode = paging_mode;
+
     return 0;
 }
 
-static int __init register_exclusion_range_for_all_devices(
-    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+static int __init register_range_for_all_devices(
+    paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion)
 {
     int seg = 0; /* XXX */
-    unsigned long range_top, iommu_top, length;
     struct amd_iommu *iommu;
-    unsigned int bdf;
     int rc = 0;
 
     /* is part of exclusion range inside of IOMMU virtual address space? */
     /* note: 'limit' parameter is assumed to be page-aligned */
-    range_top = limit + PAGE_SIZE;
-    iommu_top = max_page * PAGE_SIZE;
-    if ( base < iommu_top )
-    {
-        if ( range_top > iommu_top )
-            range_top = iommu_top;
-        length = range_top - base;
-        /* reserve r/w unity-mapped page entries for devices */
-        /* note: these entries are part of the exclusion range */
-        for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ )
-            rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir);
-        /* push 'base' just outside of virtual address space */
-        base = iommu_top;
-    }
-    /* register IOMMU exclusion range settings */
-    if ( !rc && limit >= iommu_top )
+    if ( exclusion )
     {
         for_each_amd_iommu( iommu )
         {
-            rc = reserve_iommu_exclusion_range(iommu, base, limit,
-                                               true /* all */, iw, ir);
-            if ( rc )
-                break;
+            int ret = reserve_iommu_exclusion_range(iommu, base, limit,
+                                                    true /* all */);
+
+            if ( ret && !rc )
+                rc = ret;
         }
     }
 
+    if ( !exclusion || rc )
+    {
+        paddr_t length = limit + PAGE_SIZE - base;
+        unsigned int bdf;
+
+        /* reserve r/w unity-mapped page entries for devices */
+        for ( bdf = rc = 0; !rc && bdf < ivrs_bdf_entries; bdf++ )
+            rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir);
+    }
+
     return rc;
 }
 
-static int __init register_exclusion_range_for_device(
-    u16 bdf, unsigned long base, unsigned long limit, u8 iw, u8 ir)
+static int __init register_range_for_device(
+    unsigned int bdf, paddr_t base, paddr_t limit,
+    bool iw, bool ir, bool exclusion)
 {
     int seg = 0; /* XXX */
     struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(seg);
-    unsigned long range_top, iommu_top, length;
     struct amd_iommu *iommu;
     u16 req;
     int rc = 0;
@@ -230,27 +227,19 @@ static int __init register_exclusion_ran
     req = ivrs_mappings[bdf].dte_requestor_id;
 
     /* note: 'limit' parameter is assumed to be page-aligned */
-    range_top = limit + PAGE_SIZE;
-    iommu_top = max_page * PAGE_SIZE;
-    if ( base < iommu_top )
-    {
-        if ( range_top > iommu_top )
-            range_top = iommu_top;
-        length = range_top - base;
+    if ( exclusion )
+        rc = reserve_iommu_exclusion_range(iommu, base, limit,
+                                           false /* all */);
+    if ( !exclusion || rc )
+    {
+        paddr_t length = limit + PAGE_SIZE - base;
+
         /* reserve unity-mapped page entries for device */
-        /* note: these entries are part of the exclusion range */
         rc = reserve_unity_map_for_device(seg, bdf, base, length, iw, ir) ?:
              reserve_unity_map_for_device(seg, req, base, length, iw, ir);
-
-        /* push 'base' just outside of virtual address space */
-        base = iommu_top;
     }
-
-    /* register IOMMU exclusion range settings for device */
-    if ( !rc && limit >= iommu_top  )
+    else
     {
-        rc = reserve_iommu_exclusion_range(iommu, base, limit,
-                                           false /* all */, iw, ir);
         ivrs_mappings[bdf].dte_allow_exclusion = IOMMU_CONTROL_ENABLED;
         ivrs_mappings[req].dte_allow_exclusion = IOMMU_CONTROL_ENABLED;
     }
@@ -258,53 +247,42 @@ static int __init register_exclusion_ran
     return rc;
 }
 
-static int __init register_exclusion_range_for_iommu_devices(
-    struct amd_iommu *iommu,
-    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+static int __init register_range_for_iommu_devices(
+    struct amd_iommu *iommu, paddr_t base, paddr_t limit,
+    bool iw, bool ir, bool exclusion)
 {
-    unsigned long range_top, iommu_top, length;
+    /* note: 'limit' parameter is assumed to be page-aligned */
+    paddr_t length = limit + PAGE_SIZE - base;
     unsigned int bdf;
     u16 req;
-    int rc = 0;
+    int rc;
 
-    /* is part of exclusion range inside of IOMMU virtual address space? */
-    /* note: 'limit' parameter is assumed to be page-aligned */
-    range_top = limit + PAGE_SIZE;
-    iommu_top = max_page * PAGE_SIZE;
-    if ( base < iommu_top )
-    {
-        if ( range_top > iommu_top )
-            range_top = iommu_top;
-        length = range_top - base;
-        /* reserve r/w unity-mapped page entries for devices */
-        /* note: these entries are part of the exclusion range */
-        for ( bdf = 0; !rc && bdf < ivrs_bdf_entries; bdf++ )
-        {
-            if ( iommu == find_iommu_for_device(iommu->seg, bdf) )
-            {
-                req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id;
-                rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length,
-                                                  iw, ir) ?:
-                     reserve_unity_map_for_device(iommu->seg, req, base, length,
-                                                  iw, ir);
-            }
-        }
-
-        /* push 'base' just outside of virtual address space */
-        base = iommu_top;
+    if ( exclusion )
+    {
+        rc = reserve_iommu_exclusion_range(iommu, base, limit, true /* all */);
+        if ( !rc )
+            return 0;
     }
 
-    /* register IOMMU exclusion range settings */
-    if ( !rc && limit >= iommu_top )
-        rc = reserve_iommu_exclusion_range(iommu, base, limit,
-                                           true /* all */, iw, ir);
+    /* reserve unity-mapped page entries for devices */
+    for ( bdf = rc = 0; !rc && bdf < ivrs_bdf_entries; bdf++ )
+    {
+        if ( iommu != find_iommu_for_device(iommu->seg, bdf) )
+            continue;
+
+        req = get_ivrs_mappings(iommu->seg)[bdf].dte_requestor_id;
+        rc = reserve_unity_map_for_device(iommu->seg, bdf, base, length,
+                                          iw, ir) ?:
+             reserve_unity_map_for_device(iommu->seg, req, base, length,
+                                          iw, ir);
+    }
 
     return rc;
 }
 
 static int __init parse_ivmd_device_select(
     const struct acpi_ivrs_memory *ivmd_block,
-    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+    paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion)
 {
     u16 bdf;
 
@@ -315,12 +293,12 @@ static int __init parse_ivmd_device_sele
         return -ENODEV;
     }
 
-    return register_exclusion_range_for_device(bdf, base, limit, iw, ir);
+    return register_range_for_device(bdf, base, limit, iw, ir, exclusion);
 }
 
 static int __init parse_ivmd_device_range(
     const struct acpi_ivrs_memory *ivmd_block,
-    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+    paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion)
 {
     unsigned int first_bdf, last_bdf, bdf;
     int error;
@@ -342,15 +320,15 @@ static int __init parse_ivmd_device_rang
     }
 
     for ( bdf = first_bdf, error = 0; (bdf <= last_bdf) && !error; bdf++ )
-        error = register_exclusion_range_for_device(
-            bdf, base, limit, iw, ir);
+        error = register_range_for_device(
+            bdf, base, limit, iw, ir, exclusion);
 
     return error;
 }
 
 static int __init parse_ivmd_device_iommu(
     const struct acpi_ivrs_memory *ivmd_block,
-    unsigned long base, unsigned long limit, u8 iw, u8 ir)
+    paddr_t base, paddr_t limit, bool iw, bool ir, bool exclusion)
 {
     int seg = 0; /* XXX */
     struct amd_iommu *iommu;
@@ -365,14 +343,14 @@ static int __init parse_ivmd_device_iomm
         return -ENODEV;
     }
 
-    return register_exclusion_range_for_iommu_devices(
-        iommu, base, limit, iw, ir);
+    return register_range_for_iommu_devices(
+        iommu, base, limit, iw, ir, exclusion);
 }
 
 static int __init parse_ivmd_block(const struct acpi_ivrs_memory *ivmd_block)
 {
     unsigned long start_addr, mem_length, base, limit;
-    u8 iw, ir;
+    bool iw = true, ir = true, exclusion = false;
 
     if ( ivmd_block->header.length < sizeof(*ivmd_block) )
     {
@@ -389,13 +367,11 @@ static int __init parse_ivmd_block(const
                     ivmd_block->header.type, start_addr, mem_length);
 
     if ( ivmd_block->header.flags & ACPI_IVMD_EXCLUSION_RANGE )
-        iw = ir = IOMMU_CONTROL_ENABLED;
+        exclusion = true;
     else if ( ivmd_block->header.flags & ACPI_IVMD_UNITY )
     {
-        iw = ivmd_block->header.flags & ACPI_IVMD_READ ?
-            IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED;
-        ir = ivmd_block->header.flags & ACPI_IVMD_WRITE ?
-            IOMMU_CONTROL_ENABLED : IOMMU_CONTROL_DISABLED;
+        iw = ivmd_block->header.flags & ACPI_IVMD_READ;
+        ir = ivmd_block->header.flags & ACPI_IVMD_WRITE;
     }
     else
     {
@@ -406,20 +382,20 @@ static int __init parse_ivmd_block(const
     switch( ivmd_block->header.type )
     {
     case ACPI_IVRS_TYPE_MEMORY_ALL:
-        return register_exclusion_range_for_all_devices(
-            base, limit, iw, ir);
+        return register_range_for_all_devices(
+            base, limit, iw, ir, exclusion);
 
     case ACPI_IVRS_TYPE_MEMORY_ONE:
-        return parse_ivmd_device_select(ivmd_block,
-                                        base, limit, iw, ir);
+        return parse_ivmd_device_select(ivmd_block, base, limit,
+                                        iw, ir, exclusion);
 
     case ACPI_IVRS_TYPE_MEMORY_RANGE:
-        return parse_ivmd_device_range(ivmd_block,
-                                       base, limit, iw, ir);
+        return parse_ivmd_device_range(ivmd_block, base, limit,
+                                       iw, ir, exclusion);
 
     case ACPI_IVRS_TYPE_MEMORY_IOMMU:
-        return parse_ivmd_device_iommu(ivmd_block,
-                                       base, limit, iw, ir);
+        return parse_ivmd_device_iommu(ivmd_block, base, limit,
+                                       iw, ir, exclusion);
 
     default:
         AMD_IOMMU_DEBUG("IVMD Error: Invalid Block Type!\n");
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -222,21 +222,7 @@ static int __must_check allocate_domain_
     return rc;
 }
 
-static int get_paging_mode(unsigned long max_frames)
-{
-    int level = 1;
-
-    BUG_ON(!max_frames);
-
-    while ( max_frames > PTE_PER_TABLE_SIZE )
-    {
-        max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT;
-        if ( ++level > 6 )
-            return -ENOMEM;
-    }
-
-    return level;
-}
+int __read_mostly amd_iommu_min_paging_mode = 1;
 
 static int amd_iommu_domain_init(struct domain *d)
 {
@@ -249,11 +235,13 @@ static int amd_iommu_domain_init(struct
      * - HVM could in principle use 3 or 4 depending on how much guest
      *   physical address space we give it, but this isn't known yet so use 4
      *   unilaterally.
+     * - Unity maps may require an even higher number.
      */
-    hd->arch.paging_mode = get_paging_mode(
-        is_hvm_domain(d)
-        ? 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT)
-        : get_upper_mfn_bound() + 1);
+    hd->arch.paging_mode = max(amd_iommu_get_paging_mode(
+            is_hvm_domain(d)
+            ? 1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT)
+            : get_upper_mfn_bound() + 1),
+        amd_iommu_min_paging_mode);
 
     return 0;
 }
--- a/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
+++ b/xen/include/asm-x86/hvm/svm/amd-iommu-proto.h
@@ -124,6 +124,8 @@ extern struct hpet_sbdf {
     } init;
 } hpet_sbdf;
 
+extern int amd_iommu_min_paging_mode;
+
 extern void *shared_intremap_table;
 extern unsigned long *shared_intremap_inuse;
 
@@ -167,6 +169,22 @@ static inline unsigned long region_to_pa
     return (PAGE_ALIGN(addr + size) - (addr & PAGE_MASK)) >> PAGE_SHIFT;
 }
 
+static inline int amd_iommu_get_paging_mode(unsigned long max_frames)
+{
+    int level = 1;
+
+    BUG_ON(!max_frames);
+
+    while ( max_frames > PTE_PER_TABLE_SIZE )
+    {
+        max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT;
+        if ( ++level > 6 )
+            return -ENOMEM;
+    }
+
+    return level;
+}
+
 static inline struct page_info* alloc_amd_iommu_pgtable(void)
 {
     struct page_info *pg;
openSUSE Build Service is sponsored by