File xsa400-12.patch of Package xen

From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU/x86: use per-device page tables for quarantining

Devices with RMRRs / unity mapped regions must not be left disconnected
from the mappings of these regions, because it is unspecified how/when
these memory regions may be accessed (at least as long as it's not
certain that the device has been fully quiesced). Hence even the page
tables used when quarantining such devices need to have mappings of
those regions. This implies installing page tables in the first place
even when not in scratch-page quarantining mode.

This is CVE-2022-26361 / part of XSA-400.

While for the purpose here it would be sufficient to have devices with
RMRRs / unity mapped regions use per-device page tables, extend this to
all devices (in scratch-page quarantining mode). This allows the leaf
pages to be mapped r/w, thus covering also memory writes (rather than
just reads) issued by non-quiescent devices.

Set up quarantine page tables as late as possible, yet early enough to
not encounter failure during de-assign. This means setup generally
happens in assign_device(), while (for now) the one in deassign_device()
is there mainly to be on the safe side.

As to the removal of QUARANTINE_SKIP() from domain_context_unmap_one():
I think this was never really needed there, as the function explicitly
deals with finding a non-present context entry. Leaving it there would
require propagating pgd_maddr into the function (as was done by "VT-d:
prepare for per-device quarantine page tables" for
domain_context_mapping_one()).

In VT-d's DID allocation function don't require the IOMMU lock to be
held anymore: All involved code paths hold pcidevs_lock, so this way we
avoid the need to acquire the IOMMU lock around the new call to
context_set_domain_id().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/include/asm-x86/pci.h
+++ b/xen/include/asm-x86/pci.h
@@ -1,6 +1,8 @@
 #ifndef __X86_PCI_H__
 #define __X86_PCI_H__
 
+#include <xen/mm.h>
+
 #define CF8_BDF(cf8)     (  ((cf8) & 0x00ffff00) >> 8)
 #define CF8_ADDR_LO(cf8) (   (cf8) & 0x000000fc)
 #define CF8_ADDR_HI(cf8) (  ((cf8) & 0x0f000000) >> 16)
@@ -18,7 +20,18 @@ struct arch_pci_dev {
      * them don't race (de)initialization and hence don't strictly need any
      * locking.
      */
+    union {
+        /* Subset of struct arch_iommu's fields, to be used in dom_io. */
+        struct {
+            uint64_t pgd_maddr;
+        } vtd;
+        struct {
+            struct page_info *root_table;
+        } amd;
+    };
     domid_t pseudo_domid;
+    mfn_t leaf_mfn;
+    struct page_list_head pgtables_list;
 };
 
 int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
--- a/xen/drivers/passthrough/amd/iommu.h
+++ b/xen/drivers/passthrough/amd/iommu.h
@@ -237,7 +237,8 @@ int amd_iommu_init_late(void);
 int amd_iommu_update_ivrs_mapping_acpi(void);
 int iov_adjust_irq_affinities(void);
 
-int amd_iommu_quarantine_init(struct domain *d);
+int amd_iommu_quarantine_init(struct pci_dev *pdev, bool scratch_page);
+void amd_iommu_quarantine_teardown(struct pci_dev *pdev);
 
 /* mapping functions */
 int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn,
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -598,64 +598,138 @@ int amd_iommu_get_reserved_device_memory
     return 0;
 }
 
-int __init amd_iommu_quarantine_init(struct domain *d)
+static int fill_qpt(union amd_iommu_pte *this, unsigned int level,
+                    struct page_info *pgs[IOMMU_MAX_PT_LEVELS])
 {
-    struct domain_iommu *hd = dom_iommu(d);
+    struct domain_iommu *hd = dom_iommu(dom_io);
+    unsigned int i;
+    int rc = 0;
+
+    for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i )
+    {
+        union amd_iommu_pte *pte = &this[i], *next;
+
+        if ( !pte->pr )
+        {
+            if ( !pgs[level] )
+            {
+                /*
+                 * The pgtable allocator is fine for the leaf page, as well as
+                 * page table pages, and the resulting allocations are always
+                 * zeroed.
+                 */
+                pgs[level] = iommu_alloc_pgtable(hd);
+                if ( !pgs[level] )
+                {
+                    rc = -ENOMEM;
+                    break;
+                }
+
+                if ( level )
+                {
+                    next = __map_domain_page(pgs[level]);
+                    rc = fill_qpt(next, level - 1, pgs);
+                    unmap_domain_page(next);
+                }
+            }
+
+            /*
+             * PDEs are essentially a subset of PTEs, so this function
+             * is fine to use even at the leaf.
+             */
+            set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level,
+                                  true, true);
+        }
+        else if ( level && pte->next_level )
+        {
+            next = map_domain_page(_mfn(pte->mfn));
+            rc = fill_qpt(next, level - 1, pgs);
+            unmap_domain_page(next);
+        }
+    }
+
+    return rc;
+}
+
+int amd_iommu_quarantine_init(struct pci_dev *pdev, bool scratch_page)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
     unsigned long end_gfn =
         1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
     unsigned int level = amd_iommu_get_paging_mode(end_gfn);
-    union amd_iommu_pte *table;
+    unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
+    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    int rc;
+
+    ASSERT(pcidevs_locked());
+    ASSERT(!hd->arch.amd.root_table);
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
 
-    if ( hd->arch.amd.root_table )
-    {
-        ASSERT_UNREACHABLE();
+    if ( !scratch_page && !ivrs_mappings[req_id].unity_map )
         return 0;
-    }
 
-    spin_lock(&hd->arch.mapping_lock);
+    ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID);
 
-    hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
-    if ( !hd->arch.amd.root_table )
-        goto out;
+    if ( pdev->arch.amd.root_table )
+    {
+        clear_domain_page(pdev->arch.leaf_mfn);
+        return 0;
+    }
 
-    table = __map_domain_page(hd->arch.amd.root_table);
-    while ( level )
+    pdev->arch.amd.root_table = iommu_alloc_pgtable(hd);
+    if ( !pdev->arch.amd.root_table )
+        return -ENOMEM;
+
+    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
+    hd->arch.amd.root_table = pdev->arch.amd.root_table;
+
+    rc = amd_iommu_reserve_domain_unity_map(dom_io,
+                                            ivrs_mappings[req_id].unity_map,
+                                            0);
+
+    iommu_identity_map_teardown(dom_io);
+    hd->arch.amd.root_table = NULL;
+
+    if ( rc )
+        AMD_IOMMU_WARN("%pp: quarantine unity mapping failed\n", &pdev->sbdf);
+    else if ( scratch_page )
     {
-        struct page_info *pg;
-        unsigned int i;
+        union amd_iommu_pte *root;
+        struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {};
 
-        /*
-         * The pgtable allocator is fine for the leaf page, as well as
-         * page table pages, and the resulting allocations are always
-         * zeroed.
-         */
-        pg = iommu_alloc_pgtable(hd);
-        if ( !pg )
-            break;
+        spin_lock(&hd->arch.mapping_lock);
 
-        for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
-        {
-            union amd_iommu_pte *pde = &table[i];
+        root = __map_domain_page(pdev->arch.amd.root_table);
+        rc = fill_qpt(root, level - 1, pgs);
+        unmap_domain_page(root);
 
-            /*
-             * PDEs are essentially a subset of PTEs, so this function
-             * is fine to use even at the leaf.
-             */
-            set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1,
-                                  false, true);
-        }
+        pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
 
-        unmap_domain_page(table);
-        table = __map_domain_page(pg);
-        level--;
+        spin_unlock(&hd->arch.mapping_lock);
     }
-    unmap_domain_page(table);
 
- out:
-    spin_unlock(&hd->arch.mapping_lock);
+    page_list_move(&pdev->arch.pgtables_list, &hd->arch.pgtables.list);
+
+    if ( rc )
+        amd_iommu_quarantine_teardown(pdev);
+
+    return rc;
+}
+
+void amd_iommu_quarantine_teardown(struct pci_dev *pdev)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
+
+    ASSERT(pcidevs_locked());
+
+    if ( !pdev->arch.amd.root_table )
+        return;
 
-    /* Pages leaked in failure case */
-    return level ? -ENOMEM : 0;
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
+    page_list_move(&hd->arch.pgtables.list, &pdev->arch.pgtables_list);
+    while ( iommu_free_pgtables(dom_io) == -ERESTART )
+        /* nothing */;
+    pdev->arch.amd.root_table = NULL;
 }
 
 /*
--- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -26,7 +26,7 @@
 #include "../ats.h"
 
 /* dom_io is used as a sentinel for quarantined devices */
-#define QUARANTINE_SKIP(d) ((d) == dom_io && !dom_iommu(d)->arch.amd.root_table)
+#define QUARANTINE_SKIP(d, p) ((d) == dom_io && !(p)->arch.amd.root_table)
 
 static bool_t __read_mostly init_done;
 
@@ -125,8 +125,10 @@ static int __must_check amd_iommu_setup_
     u8 bus = pdev->bus;
     struct domain_iommu *hd = dom_iommu(domain);
     const struct ivrs_mappings *ivrs_dev;
+    const struct page_info *root_pg;
+    domid_t domid;
 
-    if ( QUARANTINE_SKIP(domain) )
+    if ( QUARANTINE_SKIP(domain, pdev) )
         return 0;
 
     BUG_ON(!hd->arch.amd.paging_mode || !iommu->dev_table.buffer);
@@ -147,14 +149,25 @@ static int __must_check amd_iommu_setup_
     dte = &table[req_id];
     ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
 
+    if ( domain != dom_io )
+    {
+        root_pg = hd->arch.amd.root_table;
+        domid = domain->domain_id;
+    }
+    else
+    {
+        root_pg = pdev->arch.amd.root_table;
+        domid = pdev->arch.pseudo_domid;
+    }
+
     spin_lock_irqsave(&iommu->lock, flags);
 
     if ( !dte->v || !dte->tv )
     {
         /* bind DTE to domain page-tables */
         rc = amd_iommu_set_root_page_table(
-                 dte, page_to_maddr(hd->arch.amd.root_table),
-                 domain->domain_id, hd->arch.amd.paging_mode, sr_flags);
+                 dte, page_to_maddr(root_pg), domid,
+                 hd->arch.amd.paging_mode, sr_flags);
         if ( rc )
         {
             ASSERT(rc < 0);
@@ -181,7 +194,7 @@ static int __must_check amd_iommu_setup_
 
         amd_iommu_flush_device(iommu, req_id);
     }
-    else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.amd.root_table)) )
+    else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
     {
         /*
          * Strictly speaking if the device is the only one with this requestor
@@ -194,8 +207,8 @@ static int __must_check amd_iommu_setup_
             rc = -EOPNOTSUPP;
         else
             rc = amd_iommu_set_root_page_table(
-                     dte, page_to_maddr(hd->arch.amd.root_table),
-                     domain->domain_id, hd->arch.amd.paging_mode, sr_flags);
+                     dte, page_to_maddr(root_pg), domid,
+                     hd->arch.amd.paging_mode, sr_flags);
         if ( rc < 0 )
         {
             spin_unlock_irqrestore(&iommu->lock, flags);
@@ -214,6 +227,7 @@ static int __must_check amd_iommu_setup_
               * intended anyway.
               */
              !pdev->domain->is_dying &&
+             pdev->domain != dom_io &&
              (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
               pdev->phantom_stride) )
             AMD_IOMMU_WARN(" %pp: reassignment may cause %pd data corruption\n",
@@ -246,9 +260,8 @@ static int __must_check amd_iommu_setup_
     AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
                     "root table = %#"PRIx64", "
                     "domain = %d, paging mode = %d\n",
-                    req_id, pdev->type,
-                    page_to_maddr(hd->arch.amd.root_table),
-                    domain->domain_id, hd->arch.amd.paging_mode);
+                    req_id, pdev->type, page_to_maddr(root_pg),
+                    domid, hd->arch.amd.paging_mode);
 
     ASSERT(pcidevs_locked());
 
@@ -327,7 +340,7 @@ int amd_iommu_alloc_root(struct domain *
 {
     struct domain_iommu *hd = dom_iommu(d);
 
-    if ( unlikely(!hd->arch.amd.root_table) )
+    if ( unlikely(!hd->arch.amd.root_table) && d != dom_io )
     {
         hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
         if ( !hd->arch.amd.root_table )
@@ -391,7 +404,7 @@ static void amd_iommu_disable_domain_dev
     int req_id;
     u8 bus = pdev->bus;
 
-    if ( QUARANTINE_SKIP(domain) )
+    if ( QUARANTINE_SKIP(domain, pdev) )
         return;
 
     ASSERT(pcidevs_locked());
@@ -430,7 +443,7 @@ static void amd_iommu_disable_domain_dev
 
         AMD_IOMMU_DEBUG("Disable: device id = %#x, "
                         "domain = %d, paging mode = %d\n",
-                        req_id,  domain->domain_id,
+                        req_id, dte->domain_id,
                         dom_iommu(domain)->arch.amd.paging_mode);
     }
     else
@@ -453,7 +466,7 @@ static int reassign_device(struct domain
         return -ENODEV;
     }
 
-    if ( !QUARANTINE_SKIP(target) )
+    if ( !QUARANTINE_SKIP(target, pdev) )
     {
         rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
         if ( rc )
@@ -655,6 +668,8 @@ static int amd_iommu_remove_device(u8 de
         AMD_IOMMU_WARN("%pd: unity unmapping failed for %pp\n",
                        pdev->domain, &pdev->sbdf);
 
+    amd_iommu_quarantine_teardown(pdev);
+
     iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
     pdev->arch.pseudo_domid = DOMID_INVALID;
 
--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -443,21 +443,22 @@ int iommu_iotlb_flush_all(struct domain
     return rc;
 }
 
-static int __init iommu_quarantine_init(void)
+int iommu_quarantine_dev_init(device_t *dev)
 {
     const struct domain_iommu *hd = dom_iommu(dom_io);
-    int rc;
 
-    dom_io->options |= XEN_DOMCTL_CDF_iommu;
+    if ( !iommu_quarantine || !hd->platform_ops->quarantine_init )
+        return 0;
 
-    rc = iommu_domain_init(dom_io, 0);
-    if ( rc || iommu_quarantine < IOMMU_quarantine_scratch_page )
-        return rc;
+    return iommu_call(hd->platform_ops, quarantine_init,
+                      dev, iommu_quarantine == IOMMU_quarantine_scratch_page);
+}
 
-    if ( !hd->platform_ops->quarantine_init )
-        return 0;
+static int __init iommu_quarantine_init(void)
+{
+    dom_io->options |= XEN_DOMCTL_CDF_iommu;
 
-    return hd->platform_ops->quarantine_init(dom_io);
+    return iommu_domain_init(dom_io, 0);
 }
 
 int __init iommu_setup(void)
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -852,9 +852,16 @@ static int deassign_device(struct domain
         return -ENODEV;
 
     /* De-assignment from dom_io should de-quarantine the device */
-    target = ((pdev->quarantine || iommu_quarantine) &&
-              pdev->domain != dom_io) ?
-        dom_io : hardware_domain;
+    if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io )
+    {
+        ret = iommu_quarantine_dev_init(pci_to_dev(pdev));
+        if ( ret )
+           return ret;
+
+        target = dom_io;
+    }
+    else
+        target = hardware_domain;
 
     while ( pdev->phantom_stride )
     {
@@ -1424,6 +1431,13 @@ static int assign_device(struct domain *
     if ( rc )
         goto done;
 
+    if ( pdev->domain != dom_io )
+    {
+        rc = iommu_quarantine_dev_init(pci_to_dev(pdev));
+        if ( rc )
+            goto done;
+    }
+
     pdev->fault.count = 0;
 
     if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -45,6 +45,11 @@
 
 /* dom_io is used as a sentinel for quarantined devices */
 #define QUARANTINE_SKIP(d, pgd_maddr) ((d) == dom_io && !(pgd_maddr))
+#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
+                                             : (pdev)->arch.pseudo_domid)
+#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
+                                 ? dom_iommu(d)->arch.vtd.pgd_maddr \
+                                 : (pdev)->arch.vtd.pgd_maddr)
 
 /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
 bool __read_mostly untrusted_msi;
@@ -88,13 +93,18 @@ static int get_iommu_did(domid_t domid,
 
 #define DID_FIELD_WIDTH 16
 #define DID_HIGH_OFFSET 8
+
+/*
+ * This function may have "context" passed as NULL, to merely obtain a DID
+ * for "domid".
+ */
 static int context_set_domain_id(struct context_entry *context,
                                  domid_t domid, struct vtd_iommu *iommu)
 {
     unsigned long nr_dom, i;
     int found = 0;
 
-    ASSERT(spin_is_locked(&iommu->lock));
+    ASSERT(pcidevs_locked());
 
     nr_dom = cap_ndoms(iommu->cap);
     i = find_first_bit(iommu->domid_bitmap, nr_dom);
@@ -120,8 +130,13 @@ static int context_set_domain_id(struct
     }
 
     set_bit(i, iommu->domid_bitmap);
-    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
-    context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+
+    if ( context )
+    {
+        context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+        context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+    }
+
     return 0;
 }
 
@@ -171,8 +186,12 @@ static void check_cleanup_domid_map(cons
                                     const struct pci_dev *exclude,
                                     struct vtd_iommu *iommu)
 {
-    bool found = any_pdev_behind_iommu(d, exclude, iommu);
+    bool found;
+
+    if ( d == dom_io )
+        return;
 
+    found = any_pdev_behind_iommu(d, exclude, iommu);
     /*
      * Hidden devices are associated with DomXEN but usable by the hardware
      * domain. Hence they need considering here as well.
@@ -1426,7 +1445,7 @@ int domain_context_mapping_one(
         domid = iommu->domid_map[prev_did];
         if ( domid < DOMID_FIRST_RESERVED )
             prev_dom = rcu_lock_domain_by_id(domid);
-        else if ( domid == DOMID_IO )
+        else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK )
             prev_dom = rcu_lock_domain(dom_io);
         if ( !prev_dom )
         {
@@ -1582,15 +1601,12 @@ int domain_context_mapping_one(
     {
         if ( !prev_dom )
             ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                           domain->domain_id);
+                                           DEVICE_DOMID(domain, pdev));
         else if ( prev_dom != domain ) /* Avoid infinite recursion. */
-        {
-            hd = dom_iommu(prev_dom);
             ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
-                                             domain->domain_id,
-                                             hd->arch.vtd.pgd_maddr,
+                                             DEVICE_DOMID(prev_dom, pdev),
+                                             DEVICE_PGTABLE(prev_dom, pdev),
                                              mode & MAP_WITH_RMRR) < 0;
-        }
         else
             ret = 1;
 
@@ -1612,7 +1628,7 @@ static int domain_context_mapping(struct
 {
     const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
     const struct acpi_rmrr_unit *rmrr;
-    paddr_t pgd_maddr = dom_iommu(domain)->arch.vtd.pgd_maddr;
+    paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev);
     domid_t orig_domid = pdev->arch.pseudo_domid;
     int ret = 0;
     unsigned int i, mode = 0;
@@ -1641,7 +1657,7 @@ static int domain_context_mapping(struct
         break;
     }
 
-    if ( domain != pdev->domain )
+    if ( domain != pdev->domain && pdev->domain != dom_io )
     {
         if ( pdev->domain->is_dying )
             mode |= MAP_OWNER_DYING;
@@ -1683,8 +1699,8 @@ static int domain_context_mapping(struct
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:PCIe: map %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
-        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, domain->domain_id, pgd_maddr,
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev,
+                                         DEVICE_DOMID(domain, pdev), pgd_maddr,
                                          mode);
         if ( ret > 0 )
             ret = 0;
@@ -1710,8 +1726,8 @@ static int domain_context_mapping(struct
                    domain, &PCI_SBDF3(seg, bus, devfn));
 
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, domain->domain_id, pgd_maddr,
-                                         mode);
+                                         pdev, DEVICE_DOMID(domain, pdev),
+                                         pgd_maddr, mode);
         if ( ret < 0 )
             break;
         prev_present = ret;
@@ -1739,8 +1755,8 @@ static int domain_context_mapping(struct
          */
         if ( ret >= 0 )
             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                             NULL, domain->domain_id, pgd_maddr,
-                                             mode);
+                                             NULL, DEVICE_DOMID(domain, pdev),
+                                             pgd_maddr, mode);
 
         /*
          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
@@ -1755,8 +1771,8 @@ static int domain_context_mapping(struct
         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
              (secbus != pdev->bus || pdev->devfn != 0) )
             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-                                             NULL, domain->domain_id, pgd_maddr,
-                                             mode);
+                                             NULL, DEVICE_DOMID(domain, pdev),
+                                             pgd_maddr, mode);
 
         if ( ret )
         {
@@ -1798,9 +1814,6 @@ int domain_context_unmap_one(
     int iommu_domid, rc, ret;
     bool_t flush_dev_iotlb;
 
-    if ( QUARANTINE_SKIP(domain, dom_iommu(domain)->arch.vtd.pgd_maddr) )
-        return 0;
-
     ASSERT(pcidevs_locked());
     spin_lock(&iommu->lock);
 
@@ -1902,7 +1915,7 @@ static const struct acpi_drhd_unit *doma
             printk(VTDPREFIX "%pd:PCIe: unmap %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                       domain->domain_id);
+                                       DEVICE_DOMID(domain, pdev));
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
             disable_ats_device(pdev);
 
@@ -1916,7 +1929,7 @@ static const struct acpi_drhd_unit *doma
             printk(VTDPREFIX "%pd:PCI: unmap %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                       domain->domain_id);
+                                       DEVICE_DOMID(domain, pdev));
         if ( ret )
             break;
 
@@ -1939,18 +1952,12 @@ static const struct acpi_drhd_unit *doma
             break;
         }
 
+        ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+                                       DEVICE_DOMID(domain, pdev));
         /* PCIe to PCI/PCIx bridge */
-        if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
-        {
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-                                           domain->domain_id);
-            if ( !ret )
-                ret = domain_context_unmap_one(domain, iommu, secbus, 0,
-                                               domain->domain_id);
-        }
-        else /* Legacy PCI bridge */
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-                                           domain->domain_id);
+        if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+            ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+                                           DEVICE_DOMID(domain, pdev));
 
         break;
 
@@ -1961,7 +1968,7 @@ static const struct acpi_drhd_unit *doma
     }
 
     if ( !ret && pdev->devfn == devfn &&
-         !QUARANTINE_SKIP(domain, dom_iommu(domain)->arch.vtd.pgd_maddr) )
+         !QUARANTINE_SKIP(domain, pdev->arch.vtd.pgd_maddr) )
         check_cleanup_domid_map(domain, pdev, iommu);
 
     return drhd;
@@ -1994,6 +2001,26 @@ static void iommu_domain_teardown(struct
     XFREE(hd->arch.vtd.iommu_bitmap);
 }
 
+static void quarantine_teardown(struct pci_dev *pdev,
+                                const struct acpi_drhd_unit *drhd)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
+
+    ASSERT(pcidevs_locked());
+
+    if ( !pdev->arch.vtd.pgd_maddr )
+        return;
+
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
+    page_list_move(&hd->arch.pgtables.list, &pdev->arch.pgtables_list);
+    while ( iommu_free_pgtables(dom_io) == -ERESTART )
+        /* nothing */;
+    pdev->arch.vtd.pgd_maddr = 0;
+
+    if ( drhd )
+        cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu);
+}
+
 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
                                              mfn_t mfn, unsigned int flags,
                                              unsigned int *flush_flags)
@@ -2218,6 +2245,8 @@ static int intel_iommu_remove_device(u8
                                rmrr->end_address, 0);
     }
 
+    quarantine_teardown(pdev, drhd);
+
     if ( drhd )
     {
         iommu_free_domid(pdev->arch.pseudo_domid,
@@ -2576,7 +2605,7 @@ static int reassign_device_ownership(
 {
     int ret;
 
-    if ( !QUARANTINE_SKIP(target, dom_iommu(target)->arch.vtd.pgd_maddr) )
+    if ( !QUARANTINE_SKIP(target, pdev->arch.vtd.pgd_maddr) )
     {
         if ( !has_arch_pdevs(target) )
             vmx_pi_hooks_assign(target);
@@ -2592,7 +2621,7 @@ static int reassign_device_ownership(
         ret = domain_context_mapping(target, devfn, pdev);
 
         if ( !ret && pdev->devfn == devfn &&
-             !QUARANTINE_SKIP(source, dom_iommu(source)->arch.vtd.pgd_maddr) )
+             !QUARANTINE_SKIP(source, pdev->arch.vtd.pgd_maddr) )
         {
             const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
 
@@ -2913,69 +2942,135 @@ static void vtd_dump_page_tables(struct
                               agaw_to_level(hd->arch.vtd.agaw), 0, 0);
 }
 
-static int __init intel_iommu_quarantine_init(struct domain *d)
+static int fill_qpt(struct dma_pte *this, unsigned int level,
+                    struct page_info *pgs[6])
 {
-    struct domain_iommu *hd = dom_iommu(d);
+    struct domain_iommu *hd = dom_iommu(dom_io);
+    unsigned int i;
+    int rc = 0;
+
+    for ( i = 0; !rc && i < PTE_NUM; ++i )
+    {
+        struct dma_pte *pte = &this[i], *next;
+
+        if ( !dma_pte_present(*pte) )
+        {
+            if ( !pgs[level] )
+            {
+                /*
+                 * The pgtable allocator is fine for the leaf page, as well as
+                 * page table pages, and the resulting allocations are always
+                 * zeroed.
+                 */
+                pgs[level] = iommu_alloc_pgtable(hd);
+                if ( !pgs[level] )
+                {
+                    rc = -ENOMEM;
+                    break;
+                }
+
+                if ( level )
+                {
+                    next = map_vtd_domain_page(page_to_maddr(pgs[level]));
+                    rc = fill_qpt(next, level - 1, pgs);
+                    unmap_vtd_domain_page(next);
+                }
+            }
+
+            dma_set_pte_addr(*pte, page_to_maddr(pgs[level]));
+            dma_set_pte_readable(*pte);
+            dma_set_pte_writable(*pte);
+        }
+        else if ( level && !dma_pte_superpage(*pte) )
+        {
+            next = map_vtd_domain_page(dma_pte_addr(*pte));
+            rc = fill_qpt(next, level - 1, pgs);
+            unmap_vtd_domain_page(next);
+        }
+    }
+
+    return rc;
+}
+
+static int intel_iommu_quarantine_init(struct pci_dev *pdev, bool scratch_page)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
     struct page_info *pg;
-    struct dma_pte *parent;
     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
     unsigned int level = agaw_to_level(agaw);
-    int rc = 0;
+    const struct acpi_drhd_unit *drhd;
+    const struct acpi_rmrr_unit *rmrr;
+    unsigned int i, bdf;
+    bool rmrr_found = false;
+    int rc;
 
-    spin_lock(&hd->arch.mapping_lock);
+    ASSERT(pcidevs_locked());
+    ASSERT(!hd->arch.vtd.pgd_maddr);
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
 
-    if ( hd->arch.vtd.pgd_maddr )
+    if ( pdev->arch.vtd.pgd_maddr )
     {
-        ASSERT_UNREACHABLE();
-        goto out;
+        clear_domain_page(pdev->arch.leaf_mfn);
+        return 0;
     }
 
-    pg = iommu_alloc_pgtable(hd);
+    drhd = acpi_find_matched_drhd_unit(pdev);
+    if ( !drhd )
+        return -ENODEV;
 
-    rc = -ENOMEM;
+    pg = iommu_alloc_pgtable(hd);
     if ( !pg )
-        goto out;
+        return -ENOMEM;
 
+    rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu);
+
+    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
     hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
 
-    parent = map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
-    while ( level )
+    for_each_rmrr_device ( rmrr, bdf, i )
     {
-        uint64_t maddr;
-        unsigned int offset;
-
-        /*
-         * The pgtable allocator is fine for the leaf page, as well as
-         * page table pages, and the resulting allocations are always
-         * zeroed.
-         */
-        pg = iommu_alloc_pgtable(hd);
-
-        if ( !pg )
-            goto out;
+        if ( rc )
+            break;
 
-        maddr = page_to_maddr(pg);
-        for ( offset = 0; offset < PTE_NUM; offset++ )
+        if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf )
         {
-            struct dma_pte *pte = &parent[offset];
+            rmrr_found = true;
 
-            dma_set_pte_addr(*pte, maddr);
-            dma_set_pte_readable(*pte);
+            rc = iommu_identity_mapping(dom_io, p2m_access_rw,
+                                        rmrr->base_address, rmrr->end_address,
+                                        0);
+            if ( rc )
+                printk(XENLOG_ERR VTDPREFIX
+                       "%pp: RMRR quarantine mapping failed\n",
+                       &pdev->sbdf);
         }
-        iommu_sync_cache(parent, PAGE_SIZE);
+    }
 
-        unmap_vtd_domain_page(parent);
-        parent = map_vtd_domain_page(maddr);
-        level--;
+    iommu_identity_map_teardown(dom_io);
+    hd->arch.vtd.pgd_maddr = 0;
+    pdev->arch.vtd.pgd_maddr = page_to_maddr(pg);
+
+    if ( !rc && scratch_page )
+    {
+        struct dma_pte *root;
+        struct page_info *pgs[6] = {};
+
+        spin_lock(&hd->arch.mapping_lock);
+
+        root = map_vtd_domain_page(pdev->arch.vtd.pgd_maddr);
+        rc = fill_qpt(root, level - 1, pgs);
+        unmap_vtd_domain_page(root);
+
+        pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
+
+        spin_unlock(&hd->arch.mapping_lock);
     }
-    unmap_vtd_domain_page(parent);
 
-    rc = 0;
+    page_list_move(&pdev->arch.pgtables_list, &hd->arch.pgtables.list);
 
- out:
-    spin_unlock(&hd->arch.mapping_lock);
+    if ( rc || (!scratch_page && !rmrr_found) )
+        quarantine_teardown(pdev, drhd);
 
-    /* Pages may be leaked in failure case */
     return rc;
 }
 
--- a/xen/drivers/passthrough/vtd/iommu.h
+++ b/xen/drivers/passthrough/vtd/iommu.h
@@ -482,7 +482,7 @@ struct vtd_iommu {
     u32 nr_pt_levels;
     u64	cap;
     u64	ecap;
-    spinlock_t lock; /* protect context, domain ids */
+    spinlock_t lock; /* protect context */
     spinlock_t register_lock; /* protect iommu register handling */
     u64 root_maddr; /* root entry machine address */
     nodeid_t node;
--- a/xen/include/xen/iommu.h
+++ b/xen/include/xen/iommu.h
@@ -233,7 +233,7 @@ typedef int iommu_grdm_t(xen_pfn_t start
 struct iommu_ops {
     int (*init)(struct domain *d);
     void (*hwdom_init)(struct domain *d);
-    int (*quarantine_init)(struct domain *d);
+    int (*quarantine_init)(device_t *dev, bool scratch_page);
     int (*add_device)(u8 devfn, device_t *dev);
     int (*enable_device)(device_t *dev);
     int (*remove_device)(u8 devfn, device_t *dev);
@@ -350,6 +350,7 @@ int __must_check iommu_suspend(void);
 void iommu_resume(void);
 void iommu_crash_shutdown(void);
 int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
+int iommu_quarantine_dev_init(device_t *dev);
 
 #ifdef CONFIG_HAS_PCI
 int iommu_do_pci_domctl(struct xen_domctl *, struct domain *d,
openSUSE Build Service is sponsored by