
File: [cvs.NetBSD.org] / pkgsrc / sysutils / xenkernel415 / patches / Attic / patch-XSA400

Revision 1.1, Fri Jun 24 13:07:52 2022 UTC by bouyer
Branch: MAIN
CVS Tags: pkgsrc-2022Q2-base, pkgsrc-2022Q2

Apply patches for Xen security advisories XSA-397 through XSA-402, and XSA-404
(XSA-403 not yet released).
Bump PKGREVISION

$NetBSD: patch-XSA400,v 1.1 2022/06/24 13:07:52 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: fix (de)assign ordering when RMRRs are in use

In the event that the RMRR mappings are essential for device operation,
they should be established before updating the device's context entry,
while they should be torn down only after the device's context entry was
successfully updated.

Also adjust a related log message.

This is CVE-2022-26358 / part of XSA-400.

Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
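
To make the required ordering concrete, here is an illustrative sketch with
hypothetical stub names (not Xen code): on assignment the RMRR identity
mappings are established in the target before the device's context entry is
pointed at them, and are only unwound if that update fails; on deassignment
the context entry is retargeted (to dom_io) first and the source's mappings
are torn down afterwards.

#include <stdio.h>

static int map_rmrrs(const char *dom)    { printf("map RMRRs for %s\n", dom);   return 0; }
static int unmap_rmrrs(const char *dom)  { printf("unmap RMRRs for %s\n", dom); return 0; }
static int update_context_entry(const char *dom) { printf("context entry -> %s\n", dom); return 0; }

static int assign(const char *target)
{
    if ( map_rmrrs(target) )             /* 1. mappings must exist first ...        */
        return -1;
    if ( update_context_entry(target) )  /* 2. ... then point the device at them    */
    {
        unmap_rmrrs(target);             /* unwind on failure */
        return -1;
    }
    return 0;
}

static int deassign(const char *source)
{
    if ( update_context_entry("dom_io") ) /* 1. retarget the device first ...        */
        return -1;
    return unmap_rmrrs(source);           /* 2. ... only then drop the mappings      */
}

int main(void)
{
    assign("domU");
    deassign("domU");
    return 0;
}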

--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -2392,6 +2392,10 @@ static int reassign_device_ownership(
 {
     int ret;
 
+    ret = domain_context_unmap(source, devfn, pdev);
+    if ( ret )
+        return ret;
+
     /*
      * Devices assigned to untrusted domains (here assumed to be any domU)
      * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
@@ -2428,10 +2432,6 @@ static int reassign_device_ownership(
             }
     }
 
-    ret = domain_context_unmap(source, devfn, pdev);
-    if ( ret )
-        return ret;
-
     if ( devfn == pdev->devfn && pdev->domain != dom_io )
     {
         list_move(&pdev->domain_list, &dom_io->pdev_list);
@@ -2507,9 +2507,8 @@ static int intel_iommu_assign_device(
         }
     }
 
-    ret = reassign_device_ownership(s, d, devfn, pdev);
-    if ( ret || d == dom_io )
-        return ret;
+    if ( d == dom_io )
+        return reassign_device_ownership(s, d, devfn, pdev);
 
     /* Setup rmrr identity mapping */
     for_each_rmrr_device( rmrr, bdf, i )
@@ -2522,20 +2521,37 @@ static int intel_iommu_assign_device(
                                          rmrr->end_address, flag);
             if ( ret )
             {
-                int rc;
-
-                rc = reassign_device_ownership(d, s, devfn, pdev);
                 printk(XENLOG_G_ERR VTDPREFIX
-                       " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
-                       rmrr->base_address, rmrr->end_address,
-                       d->domain_id, ret);
-                if ( rc )
-                {
-                    printk(XENLOG_ERR VTDPREFIX
-                           " failed to reclaim %pp from %pd (%d)\n",
-                           &PCI_SBDF3(seg, bus, devfn), d, rc);
-                    domain_crash(d);
-                }
+                       "%pd: cannot map reserved region [%"PRIx64",%"PRIx64"]: %d\n",
+                       d, rmrr->base_address, rmrr->end_address, ret);
+                break;
+            }
+        }
+    }
+
+    if ( !ret )
+        ret = reassign_device_ownership(s, d, devfn, pdev);
+
+    /* See reassign_device_ownership() for the hwdom aspect. */
+    if ( !ret || is_hardware_domain(d) )
+        return ret;
+
+    for_each_rmrr_device( rmrr, bdf, i )
+    {
+        if ( rmrr->segment == seg &&
+             PCI_BUS(bdf) == bus &&
+             PCI_DEVFN2(bdf) == devfn )
+        {
+            int rc = iommu_identity_mapping(d, p2m_access_x,
+                                            rmrr->base_address,
+                                            rmrr->end_address, 0);
+
+            if ( rc && rc != -ENOENT )
+            {
+                printk(XENLOG_ERR VTDPREFIX
+                       "%pd: cannot unmap reserved region [%"PRIx64",%"PRIx64"]: %d\n",
+                       d, rmrr->base_address, rmrr->end_address, rc);
+                domain_crash(d);
                 break;
             }
         }
From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: fix add/remove ordering when RMRRs are in use

In the event that the RMRR mappings are essential for device operation,
they should be established before updating the device's context entry,
while they should be torn down only after the device's context entry was
successfully cleared.

Also switch to %pd in related log messages.

Fixes: fa88cfadf918 ("vt-d: Map RMRR in intel_iommu_add_device() if the device has RMRR")
Fixes: 8b99f4400b69 ("VT-d: fix RMRR related error handling")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
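
A compact sketch of the reordered hot-add/hot-remove flow, again with
hypothetical stubs rather than Xen code: unity mappings are set up before the
context entry is written and torn down only after it has been cleared, and a
context-mapping failure on add is now propagated to the caller instead of
being swallowed.

#include <stdio.h>

static int  map_rmrrs(void)     { puts("map RMRRs");           return 0; }
static void unmap_rmrrs(void)   { puts("unmap RMRRs"); }
static int  context_map(void)   { puts("write context entry"); return 0; }
static int  context_unmap(void) { puts("clear context entry"); return 0; }

static int add_device(void)
{
    if ( map_rmrrs() )
        fprintf(stderr, "RMRR mapping failed\n");  /* logged, not fatal */

    return context_map();        /* failure now reaches the caller */
}

static int remove_device(void)
{
    int ret = context_unmap();

    if ( ret )
        return ret;              /* entry still live: keep the mappings intact */

    unmap_rmrrs();
    return 0;
}

int main(void)
{
    int rc = add_device();

    return rc ? rc : remove_device();
}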

--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -1981,14 +1981,6 @@ static int intel_iommu_add_device(u8 dev
     if ( !pdev->domain )
         return -EINVAL;
 
-    ret = domain_context_mapping(pdev->domain, devfn, pdev);
-    if ( ret )
-    {
-        dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
-                pdev->domain->domain_id);
-        return ret;
-    }
-
     for_each_rmrr_device ( rmrr, bdf, i )
     {
         if ( rmrr->segment == pdev->seg &&
@@ -2005,12 +1997,17 @@ static int intel_iommu_add_device(u8 dev
                                          rmrr->base_address, rmrr->end_address,
                                          0);
             if ( ret )
-                dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
-                        pdev->domain->domain_id);
+                dprintk(XENLOG_ERR VTDPREFIX, "%pd: RMRR mapping failed\n",
+                        pdev->domain);
         }
     }
 
-    return 0;
+    ret = domain_context_mapping(pdev->domain, devfn, pdev);
+    if ( ret )
+        dprintk(XENLOG_ERR VTDPREFIX, "%pd: context mapping failed\n",
+                pdev->domain);
+
+    return ret;
 }
 
 static int intel_iommu_enable_device(struct pci_dev *pdev)
@@ -2032,11 +2029,15 @@ static int intel_iommu_remove_device(u8
 {
     struct acpi_rmrr_unit *rmrr;
     u16 bdf;
-    int i;
+    int ret, i;
 
     if ( !pdev->domain )
         return -EINVAL;
 
+    ret = domain_context_unmap(pdev->domain, devfn, pdev);
+    if ( ret )
+        return ret;
+
     for_each_rmrr_device ( rmrr, bdf, i )
     {
         if ( rmrr->segment != pdev->seg ||
@@ -2052,7 +2053,7 @@ static int intel_iommu_remove_device(u8
                                rmrr->end_address, 0);
     }
 
-    return domain_context_unmap(pdev->domain, devfn, pdev);
+    return 0;
 }
 
 static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU/x86: tighten iommu_alloc_pgtable()'s parameter

This is to make more obvious that nothing outside of domain_iommu(d)
actually changes or is otherwise needed by the function.

No functional change intended.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
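
As an aside, a minimal illustration of the interface-narrowing idea (made-up
types and names, not Xen code): passing only the sub-structure a function
actually uses documents, at the type level, that nothing else in the
containing object is touched, which is what switching iommu_alloc_pgtable()
from struct domain to struct domain_iommu achieves.

struct iommu_state { unsigned int nr_pgtables; };           /* stands in for struct domain_iommu */
struct fake_domain { int id; struct iommu_state iommu; };    /* stands in for struct domain       */

/* Before: the whole domain is passed, obscuring what may be modified. */
static void alloc_pgtable_old(struct fake_domain *d)
{
    d->iommu.nr_pgtables++;
}

/* After: only the IOMMU state is passed, mirroring
 * iommu_alloc_pgtable(struct domain_iommu *hd). */
static void alloc_pgtable_new(struct iommu_state *hd)
{
    hd->nr_pgtables++;
}

int main(void)
{
    struct fake_domain d = { .id = 1 };

    alloc_pgtable_old(&d);
    alloc_pgtable_new(&d.iommu);   /* caller now passes the dom_iommu(d) equivalent */

    return d.iommu.nr_pgtables == 2 ? 0 : 1;
}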

--- xen/include/asm-x86/iommu.h.orig
+++ xen/include/asm-x86/iommu.h
@@ -143,7 +143,8 @@ int pi_update_irte(const struct pi_desc
 })
 
 int __must_check iommu_free_pgtables(struct domain *d);
-struct page_info *__must_check iommu_alloc_pgtable(struct domain *d);
+struct domain_iommu;
+struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
 
 #endif /* !__ARCH_X86_IOMMU_H__ */
 /*
--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -184,7 +184,7 @@ static int iommu_pde_from_dfn(struct dom
     unsigned long  next_table_mfn;
     unsigned int level;
     struct page_info *table;
-    const struct domain_iommu *hd = dom_iommu(d);
+    struct domain_iommu *hd = dom_iommu(d);
 
     table = hd->arch.amd.root_table;
     level = hd->arch.amd.paging_mode;
@@ -220,7 +220,7 @@ static int iommu_pde_from_dfn(struct dom
             mfn = next_table_mfn;
 
             /* allocate lower level page table */
-            table = iommu_alloc_pgtable(d);
+            table = iommu_alloc_pgtable(hd);
             if ( table == NULL )
             {
                 AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n");
@@ -250,7 +250,7 @@ static int iommu_pde_from_dfn(struct dom
 
             if ( next_table_mfn == 0 )
             {
-                table = iommu_alloc_pgtable(d);
+                table = iommu_alloc_pgtable(hd);
                 if ( table == NULL )
                 {
                     AMD_IOMMU_DEBUG("Cannot allocate I/O page table\n");
@@ -483,7 +483,7 @@ int __init amd_iommu_quarantine_init(str
 
     spin_lock(&hd->arch.mapping_lock);
 
-    hd->arch.amd.root_table = iommu_alloc_pgtable(d);
+    hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
     if ( !hd->arch.amd.root_table )
         goto out;
 
@@ -498,7 +498,7 @@ int __init amd_iommu_quarantine_init(str
          * page table pages, and the resulting allocations are always
          * zeroed.
          */
-        pg = iommu_alloc_pgtable(d);
+        pg = iommu_alloc_pgtable(hd);
         if ( !pg )
             break;
 
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -208,7 +208,7 @@ int amd_iommu_alloc_root(struct domain *
 
     if ( unlikely(!hd->arch.amd.root_table) )
     {
-        hd->arch.amd.root_table = iommu_alloc_pgtable(d);
+        hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
         if ( !hd->arch.amd.root_table )
             return -ENOMEM;
     }
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -327,7 +327,7 @@ static u64 addr_to_dma_page_maddr(struct
     {
         struct page_info *pg;
 
-        if ( !alloc || !(pg = iommu_alloc_pgtable(domain)) )
+        if ( !alloc || !(pg = iommu_alloc_pgtable(hd)) )
             goto out;
 
         hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
@@ -347,7 +347,7 @@ static u64 addr_to_dma_page_maddr(struct
             if ( !alloc )
                 break;
 
-            pg = iommu_alloc_pgtable(domain);
+            pg = iommu_alloc_pgtable(hd);
             if ( !pg )
                 break;
 
@@ -2761,7 +2761,7 @@ static int __init intel_iommu_quarantine
         goto out;
     }
 
-    pg = iommu_alloc_pgtable(d);
+    pg = iommu_alloc_pgtable(hd);
 
     rc = -ENOMEM;
     if ( !pg )
@@ -2780,7 +2780,7 @@ static int __init intel_iommu_quarantine
          * page table pages, and the resulting allocations are always
          * zeroed.
          */
-        pg = iommu_alloc_pgtable(d);
+        pg = iommu_alloc_pgtable(hd);
 
         if ( !pg )
             goto out;
--- xen/drivers/passthrough/x86/iommu.c.orig
+++ xen/drivers/passthrough/x86/iommu.c
@@ -415,9 +415,8 @@ int iommu_free_pgtables(struct domain *d
     return 0;
 }
 
-struct page_info *iommu_alloc_pgtable(struct domain *d)
+struct page_info *iommu_alloc_pgtable(struct domain_iommu *hd)
 {
-    struct domain_iommu *hd = dom_iommu(d);
     unsigned int memflags = 0;
     struct page_info *pg;
     void *p;
From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: drop ownership checking from domain_context_mapping_one()

Despite putting in quite a bit of effort it was not possible to
establish why exactly this code exists (beyond possibly sanity
checking). Instead of a subsequent change further complicating this
logic, simply get rid of it.

Take the opportunity and move the respective unmap_vtd_domain_page() out
of the locked region.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>

--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -121,28 +121,6 @@ static int context_set_domain_id(struct
     return 0;
 }
 
-static int context_get_domain_id(struct context_entry *context,
-                                 struct vtd_iommu *iommu)
-{
-    unsigned long dom_index, nr_dom;
-    int domid = -1;
-
-    if (iommu && context)
-    {
-        nr_dom = cap_ndoms(iommu->cap);
-
-        dom_index = context_domain_id(*context);
-
-        if ( dom_index < nr_dom && iommu->domid_map )
-            domid = iommu->domid_map[dom_index];
-        else
-            dprintk(XENLOG_DEBUG VTDPREFIX,
-                    "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
-                    dom_index, nr_dom);
-    }
-    return domid;
-}
-
 static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
 {
     int iommu_domid = domain_iommu_domid(domain, iommu);
@@ -1404,44 +1382,9 @@ int domain_context_mapping_one(
 
     if ( context_present(*context) )
     {
-        int res = 0;
-
-        /* Try to get domain ownership from device structure.  If that's
-         * not available, try to read it from the context itself. */
-        if ( pdev )
-        {
-            if ( pdev->domain != domain )
-            {
-                printk(XENLOG_G_INFO VTDPREFIX "%pd: %pp owned by %pd",
-                       domain, &PCI_SBDF3(seg, bus, devfn),
-                       pdev->domain);
-                res = -EINVAL;
-            }
-        }
-        else
-        {
-            int cdomain;
-            cdomain = context_get_domain_id(context, iommu);
-            
-            if ( cdomain < 0 )
-            {
-                printk(XENLOG_G_WARNING VTDPREFIX
-                       "%pd: %pp mapped, but can't find owner\n",
-                       domain, &PCI_SBDF3(seg, bus, devfn));
-                res = -EINVAL;
-            }
-            else if ( cdomain != domain->domain_id )
-            {
-                printk(XENLOG_G_INFO VTDPREFIX
-                       "%pd: %pp already mapped to d%d",
-                       domain, &PCI_SBDF3(seg, bus, devfn), cdomain);
-                res = -EINVAL;
-            }
-        }
-
-        unmap_vtd_domain_page(context_entries);
         spin_unlock(&iommu->lock);
-        return res;
+        unmap_vtd_domain_page(context_entries);
+        return 0;
     }
 
     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: re-assign devices directly

Devices with RMRRs, due to it being unspecified how/when the specified
memory regions may get accessed, may not be left disconnected from their
respective mappings (as long as it's not certain that the device has
been fully quiesced). Hence rather than unmapping the old context and
then mapping the new one, re-assignment needs to be done in a single
step.

This is CVE-2022-26359 / part of XSA-400.

Reported-by: Roger Pau Monné <roger.pau@citrix.com>

Similarly quarantining scratch-page mode relies on page tables to be
continuously wired up.

To avoid complicating things more than necessary, treat all devices
mostly equally, i.e. regardless of their association with any RMRRs. The
main difference is when it comes to updating context entries, which need
to be atomic when there are RMRRs. Yet atomicity can only be achieved
with CMPXCHG16B, availability of which we can't take for granted.

The seemingly complicated choice of non-negative return values for
domain_context_mapping_one() is to limit code churn: This way callers
passing NULL for pdev don't need to be fiddled with.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
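
For illustration, a minimal sketch (generic C using GCC builtins, not Xen's
cmpxchg16b() helper) of the atomic 128-bit context-entry update performed when
CMPXCHG16B is available: snapshot the live entry, build the replacement
locally, and install it only if the entry is still unchanged; a mismatch means
something modified the entry behind our back, and the operation is abandoned
(-EILSEQ in the patch).

/* Compile on x86-64 with: gcc -mcx16 example.c -latomic */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

union ctx_entry {
    struct { uint64_t lo, hi; };
    unsigned __int128 full;
};

static bool update_entry(union ctx_entry *live, unsigned __int128 expected,
                         union ctx_entry next)
{
    /* Equivalent in spirit to Xen's cmpxchg16b(live, &expected, &next.full). */
    return __atomic_compare_exchange_n(&live->full, &expected, next.full,
                                       false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
    union ctx_entry entry = { .lo = 0x1, .hi = 0x2 };
    unsigned __int128 snapshot = entry.full;           /* the "old" value in the patch */
    union ctx_entry next = { .lo = 0x101, .hi = 0x202 };

    if ( update_entry(&entry, snapshot, next) )
        puts("entry replaced atomically");
    else
        puts("entry changed behind our back; bail out");
    return 0;
}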

--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -84,7 +84,8 @@ void free_pgtable_maddr(u64 maddr);
 void *map_vtd_domain_page(u64 maddr);
 void unmap_vtd_domain_page(const void *va);
 int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
-                               u8 bus, u8 devfn, const struct pci_dev *);
+                               uint8_t bus, uint8_t devfn,
+                               const struct pci_dev *pdev, unsigned int mode);
 int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
                              u8 bus, u8 devfn);
 int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
@@ -103,8 +104,8 @@ int is_igd_vt_enabled_quirk(void);
 void platform_quirks_init(void);
 void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
 void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
-int __must_check me_wifi_quirk(struct domain *domain,
-                               u8 bus, u8 devfn, int map);
+int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
+                               uint8_t devfn, unsigned int mode);
 void pci_vtd_quirk(const struct pci_dev *);
 void quirk_iommu_caps(struct vtd_iommu *iommu);
 
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -117,6 +117,7 @@ static int context_set_domain_id(struct
     }
 
     set_bit(i, iommu->domid_bitmap);
+    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
     context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
     return 0;
 }
@@ -1362,15 +1363,27 @@ static void __hwdom_init intel_iommu_hwd
     }
 }
 
+/*
+ * This function returns
+ * - a negative errno value upon error,
+ * - zero upon success when previously the entry was non-present, or this isn't
+ *   the "main" request for a device (pdev == NULL), or for no-op quarantining
+ *   assignments,
+ * - positive (one) upon success when previously the entry was present and this
+ *   is the "main" request for a device (pdev != NULL).
+ */
 int domain_context_mapping_one(
     struct domain *domain,
     struct vtd_iommu *iommu,
-    u8 bus, u8 devfn, const struct pci_dev *pdev)
+    uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
+    unsigned int mode)
 {
     struct domain_iommu *hd = dom_iommu(domain);
-    struct context_entry *context, *context_entries;
+    struct context_entry *context, *context_entries, lctxt;
+    __uint128_t old;
     u64 maddr, pgd_maddr;
-    u16 seg = iommu->drhd->segment;
+    uint16_t seg = iommu->drhd->segment, prev_did = 0;
+    struct domain *prev_dom = NULL;
     int rc, ret;
     bool_t flush_dev_iotlb;
 
@@ -1379,17 +1392,32 @@ int domain_context_mapping_one(
     maddr = bus_to_context_maddr(iommu, bus);
     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
     context = &context_entries[devfn];
+    old = (lctxt = *context).full;
 
-    if ( context_present(*context) )
+    if ( context_present(lctxt) )
     {
-        spin_unlock(&iommu->lock);
-        unmap_vtd_domain_page(context_entries);
-        return 0;
+        domid_t domid;
+
+        prev_did = context_domain_id(lctxt);
+        domid = iommu->domid_map[prev_did];
+        if ( domid < DOMID_FIRST_RESERVED )
+            prev_dom = rcu_lock_domain_by_id(domid);
+        else if ( domid == DOMID_IO )
+            prev_dom = rcu_lock_domain(dom_io);
+        if ( !prev_dom )
+        {
+            spin_unlock(&iommu->lock);
+            unmap_vtd_domain_page(context_entries);
+            dprintk(XENLOG_DEBUG VTDPREFIX,
+                    "no domain for did %u (nr_dom %u)\n",
+                    prev_did, cap_ndoms(iommu->cap));
+            return -ESRCH;
+        }
     }
 
     if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
     {
-        context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
+        context_set_translation_type(lctxt, CONTEXT_TT_PASS_THRU);
     }
     else
     {
@@ -1401,36 +1429,107 @@ int domain_context_mapping_one(
             spin_unlock(&hd->arch.mapping_lock);
             spin_unlock(&iommu->lock);
             unmap_vtd_domain_page(context_entries);
+            if ( prev_dom )
+                rcu_unlock_domain(prev_dom);
             return -ENOMEM;
         }
 
-        context_set_address_root(*context, pgd_maddr);
+        context_set_address_root(lctxt, pgd_maddr);
         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
-            context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
+            context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
         else
-            context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
+            context_set_translation_type(lctxt, CONTEXT_TT_MULTI_LEVEL);
 
         spin_unlock(&hd->arch.mapping_lock);
     }
 
-    if ( context_set_domain_id(context, domain, iommu) )
+    if ( context_set_domain_id(&lctxt, domain, iommu) )
     {
+    unlock:
         spin_unlock(&iommu->lock);
         unmap_vtd_domain_page(context_entries);
+        if ( prev_dom )
+            rcu_unlock_domain(prev_dom);
         return -EFAULT;
     }
 
-    context_set_address_width(*context, level_to_agaw(iommu->nr_pt_levels));
-    context_set_fault_enable(*context);
-    context_set_present(*context);
+    if ( !prev_dom )
+    {
+        context_set_address_width(lctxt, level_to_agaw(iommu->nr_pt_levels));
+        context_set_fault_enable(lctxt);
+        context_set_present(lctxt);
+    }
+    else if ( prev_dom == domain )
+    {
+        ASSERT(lctxt.full == context->full);
+        rc = !!pdev;
+        goto unlock;
+    }
+    else
+    {
+        ASSERT(context_address_width(lctxt) ==
+               level_to_agaw(iommu->nr_pt_levels));
+        ASSERT(!context_fault_disable(lctxt));
+    }
+
+    if ( cpu_has_cx16 )
+    {
+        __uint128_t res = cmpxchg16b(context, &old, &lctxt.full);
+
+        /*
+         * Hardware does not update the context entry behind our backs,
+         * so the return value should match "old".
+         */
+        if ( res != old )
+        {
+            if ( pdev )
+                check_cleanup_domid_map(domain, pdev, iommu);
+            printk(XENLOG_ERR
+                   "%pp: unexpected context entry %016lx_%016lx (expected %016lx_%016lx)\n",
+                   &PCI_SBDF3(pdev->seg, pdev->bus, devfn),
+                   (uint64_t)(res >> 64), (uint64_t)res,
+                   (uint64_t)(old >> 64), (uint64_t)old);
+            rc = -EILSEQ;
+            goto unlock;
+        }
+    }
+    else if ( !prev_dom || !(mode & MAP_WITH_RMRR) )
+    {
+        context_clear_present(*context);
+        iommu_sync_cache(context, sizeof(*context));
+
+        write_atomic(&context->hi, lctxt.hi);
+        /* No barrier should be needed between these two. */
+        write_atomic(&context->lo, lctxt.lo);
+    }
+    else /* Best effort, updating DID last. */
+    {
+         /*
+          * By non-atomically updating the context entry's DID field last,
+          * during a short window in time TLB entries with the old domain ID
+          * but the new page tables may be inserted.  This could affect I/O
+          * of other devices using this same (old) domain ID.  Such updating
+          * therefore is not a problem if this was the only device associated
+          * with the old domain ID.  Diverting I/O of any of a dying domain's
+          * devices to the quarantine page tables is intended anyway.
+          */
+        if ( !(mode & (MAP_OWNER_DYING | MAP_SINGLE_DEVICE)) )
+            printk(XENLOG_WARNING VTDPREFIX
+                   " %pp: reassignment may cause %pd data corruption\n",
+                   &PCI_SBDF3(seg, bus, devfn), prev_dom);
+
+        write_atomic(&context->lo, lctxt.lo);
+        /* No barrier should be needed between these two. */
+        write_atomic(&context->hi, lctxt.hi);
+    }
+
     iommu_sync_cache(context, sizeof(struct context_entry));
     spin_unlock(&iommu->lock);
 
-    /* Context entry was previously non-present (with domid 0). */
-    rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
-                                    DMA_CCMD_MASK_NOBIT, 1);
+    rc = iommu_flush_context_device(iommu, prev_did, PCI_BDF2(bus, devfn),
+                                    DMA_CCMD_MASK_NOBIT, !prev_dom);
     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
-    ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
+    ret = iommu_flush_iotlb_dsi(iommu, prev_did, !prev_dom, flush_dev_iotlb);
 
     /*
      * The current logic for returns:
@@ -1451,17 +1550,26 @@ int domain_context_mapping_one(
     unmap_vtd_domain_page(context_entries);
 
     if ( !seg && !rc )
-        rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
+        rc = me_wifi_quirk(domain, bus, devfn, mode);
 
     if ( rc )
     {
-        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+        if ( !prev_dom )
+            ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+        else if ( prev_dom != domain ) /* Avoid infinite recursion. */
+            ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+                                             mode & MAP_WITH_RMRR) < 0;
+        else
+            ret = 1;
 
         if ( !ret && pdev && pdev->devfn == devfn )
             check_cleanup_domid_map(domain, pdev, iommu);
     }
 
-    return rc;
+    if ( prev_dom )
+        rcu_unlock_domain(prev_dom);
+
+    return rc ?: pdev && prev_dom;
 }
 
 static int domain_context_unmap(struct domain *d, uint8_t devfn,
@@ -1471,8 +1579,10 @@ static int domain_context_mapping(struct
                                   struct pci_dev *pdev)
 {
     struct acpi_drhd_unit *drhd;
+    const struct acpi_rmrr_unit *rmrr;
     int ret = 0;
-    uint16_t seg = pdev->seg;
+    unsigned int i, mode = 0;
+    uint16_t seg = pdev->seg, bdf;
     uint8_t bus = pdev->bus, secbus;
 
     drhd = acpi_find_matched_drhd_unit(pdev);
@@ -1492,8 +1602,29 @@ static int domain_context_mapping(struct
 
     ASSERT(pcidevs_locked());
 
+    for_each_rmrr_device( rmrr, bdf, i )
+    {
+        if ( rmrr->segment != pdev->seg || bdf != pdev->sbdf.bdf )
+            continue;
+
+        mode |= MAP_WITH_RMRR;
+        break;
+    }
+
+    if ( domain != pdev->domain )
+    {
+        if ( pdev->domain->is_dying )
+            mode |= MAP_OWNER_DYING;
+        else if ( drhd &&
+                  !any_pdev_behind_iommu(pdev->domain, pdev, drhd->iommu) &&
+                  !pdev->phantom_stride )
+            mode |= MAP_SINGLE_DEVICE;
+    }
+
     switch ( pdev->type )
     {
+        bool prev_present;
+
     case DEV_TYPE_PCI_HOST_BRIDGE:
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:Hostbridge: skip %pp map\n",
@@ -1512,7 +1643,9 @@ static int domain_context_mapping(struct
             printk(VTDPREFIX "%pd:PCIe: map %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev);
+                                         pdev, mode);
+        if ( ret > 0 )
+            ret = 0;
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
             enable_ats_device(pdev, &drhd->iommu->ats_devices);
 
@@ -1524,9 +1657,10 @@ static int domain_context_mapping(struct
                    domain, &PCI_SBDF3(seg, bus, devfn));
 
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev);
-        if ( ret )
+                                         pdev, mode);
+        if ( ret < 0 )
             break;
+        prev_present = ret;
 
         if ( (ret = find_upstream_bridge(seg, &bus, &devfn, &secbus)) < 1 )
         {
@@ -1534,6 +1668,15 @@ static int domain_context_mapping(struct
                 break;
             ret = -ENXIO;
         }
+        /*
+         * Strictly speaking if the device is the only one behind this bridge
+         * and the only one with this (secbus,0,0) tuple, it could be allowed
+         * to be re-assigned regardless of RMRR presence.  But let's deal with
+         * that case only if it is actually found in the wild.
+         */
+        else if ( prev_present && (mode & MAP_WITH_RMRR) &&
+                  domain != pdev->domain )
+            ret = -EOPNOTSUPP;
 
         /*
          * Mapping a bridge should, if anything, pass the struct pci_dev of
@@ -1542,7 +1685,7 @@ static int domain_context_mapping(struct
          */
         if ( ret >= 0 )
             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                             NULL);
+                                             NULL, mode);
 
         /*
          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
@@ -1557,10 +1700,15 @@ static int domain_context_mapping(struct
         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
              (secbus != pdev->bus || pdev->devfn != 0) )
             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-                                             NULL);
+                                             NULL, mode);
 
         if ( ret )
-            domain_context_unmap(domain, devfn, pdev);
+        {
+            if ( !prev_present )
+                domain_context_unmap(domain, devfn, pdev);
+            else if ( pdev->domain != domain ) /* Avoid infinite recursion. */
+                domain_context_mapping(pdev->domain, devfn, pdev);
+        }
 
         break;
 
@@ -2336,9 +2484,8 @@ static int reassign_device_ownership(
 {
     int ret;
 
-    ret = domain_context_unmap(source, devfn, pdev);
-    if ( ret )
-        return ret;
+    if ( !has_arch_pdevs(target) )
+        vmx_pi_hooks_assign(target);
 
     /*
      * Devices assigned to untrusted domains (here assumed to be any domU)
@@ -2348,6 +2495,31 @@ static int reassign_device_ownership(
     if ( (target != hardware_domain) && !iommu_intremap )
         untrusted_msi = true;
 
+    ret = domain_context_mapping(target, devfn, pdev);
+    if ( ret )
+    {
+        if ( !has_arch_pdevs(target) )
+            vmx_pi_hooks_deassign(target);
+        return ret;
+    }
+
+    if ( pdev->devfn == devfn )
+    {
+        const struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
+
+        if ( drhd )
+            check_cleanup_domid_map(source, pdev, drhd->iommu);
+    }
+
+    if ( devfn == pdev->devfn && pdev->domain != target )
+    {
+        list_move(&pdev->domain_list, &target->pdev_list);
+        pdev->domain = target;
+    }
+
+    if ( !has_arch_pdevs(source) )
+        vmx_pi_hooks_deassign(source);
+
     /*
      * If the device belongs to the hardware domain, and it has RMRR, don't
      * remove it from the hardware domain, because BIOS may use RMRR at
@@ -2376,34 +2548,7 @@ static int reassign_device_ownership(
             }
     }
 
-    if ( devfn == pdev->devfn && pdev->domain != dom_io )
-    {
-        list_move(&pdev->domain_list, &dom_io->pdev_list);
-        pdev->domain = dom_io;
-    }
-
-    if ( !has_arch_pdevs(source) )
-        vmx_pi_hooks_deassign(source);
-
-    if ( !has_arch_pdevs(target) )
-        vmx_pi_hooks_assign(target);
-
-    ret = domain_context_mapping(target, devfn, pdev);
-    if ( ret )
-    {
-        if ( !has_arch_pdevs(target) )
-            vmx_pi_hooks_deassign(target);
-
-        return ret;
-    }
-
-    if ( devfn == pdev->devfn && pdev->domain != target )
-    {
-        list_move(&pdev->domain_list, &target->pdev_list);
-        pdev->domain = target;
-    }
-
-    return ret;
+    return 0;
 }
 
 static int intel_iommu_assign_device(
--- xen/drivers/passthrough/vtd/iommu.h.orig
+++ xen/drivers/passthrough/vtd/iommu.h
@@ -202,8 +202,12 @@ struct root_entry {
     do {(root).val |= ((value) & PAGE_MASK_4K);} while(0)
 
 struct context_entry {
-    u64 lo;
-    u64 hi;
+    union {
+        struct {
+            uint64_t lo, hi;
+        };
+        __uint128_t full;
+    };
 };
 #define ROOT_ENTRY_NR (PAGE_SIZE_4K/sizeof(struct root_entry))
 #define context_present(c) ((c).lo & 1)
--- xen/drivers/passthrough/vtd/quirks.c.orig
+++ xen/drivers/passthrough/vtd/quirks.c
@@ -344,7 +344,8 @@ void __init platform_quirks_init(void)
  */
 
 static int __must_check map_me_phantom_function(struct domain *domain,
-                                                u32 dev, int map)
+                                                unsigned int dev,
+                                                unsigned int mode)
 {
     struct acpi_drhd_unit *drhd;
     struct pci_dev *pdev;
@@ -355,9 +356,9 @@ static int __must_check map_me_phantom_f
     drhd = acpi_find_matched_drhd_unit(pdev);
 
     /* map or unmap ME phantom function */
-    if ( map )
+    if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
         rc = domain_context_mapping_one(domain, drhd->iommu, 0,
-                                        PCI_DEVFN(dev, 7), NULL);
+                                        PCI_DEVFN(dev, 7), NULL, mode);
     else
         rc = domain_context_unmap_one(domain, drhd->iommu, 0,
                                       PCI_DEVFN(dev, 7));
@@ -365,7 +366,8 @@ static int __must_check map_me_phantom_f
     return rc;
 }
 
-int me_wifi_quirk(struct domain *domain, u8 bus, u8 devfn, int map)
+int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
+                  unsigned int mode)
 {
     u32 id;
     int rc = 0;
@@ -389,7 +391,7 @@ int me_wifi_quirk(struct domain *domain,
             case 0x423b8086:
             case 0x423c8086:
             case 0x423d8086:
-                rc = map_me_phantom_function(domain, 3, map);
+                rc = map_me_phantom_function(domain, 3, mode);
                 break;
             default:
                 break;
@@ -415,7 +417,7 @@ int me_wifi_quirk(struct domain *domain,
             case 0x42388086:        /* Puma Peak */
             case 0x422b8086:
             case 0x422c8086:
-                rc = map_me_phantom_function(domain, 22, map);
+                rc = map_me_phantom_function(domain, 22, mode);
                 break;
             default:
                 break;
--- xen/drivers/passthrough/vtd/vtd.h.orig
+++ xen/drivers/passthrough/vtd/vtd.h
@@ -22,8 +22,14 @@
 
 #include <xen/iommu.h>
 
-#define MAP_ME_PHANTOM_FUNC      1
-#define UNMAP_ME_PHANTOM_FUNC    0
+/*
+ * Values for domain_context_mapping_one()'s and me_wifi_quirk()'s "mode"
+ * parameters.
+ */
+#define MAP_WITH_RMRR         (1u << 0)
+#define MAP_OWNER_DYING       (1u << 1)
+#define MAP_SINGLE_DEVICE     (1u << 2)
+#define UNMAP_ME_PHANTOM_FUNC (1u << 3)
 
 /* Allow for both IOAPIC and IOSAPIC. */
 #define IO_xAPIC_route_entry IO_APIC_route_entry
From: Jan Beulich <jbeulich@suse.com>
Subject: AMD/IOMMU: re-assign devices directly

Devices with unity map ranges, due to it being unspecified how/when
these memory ranges may get accessed, may not be left disconnected from
their unity mappings (as long as it's not certain that the device has
been fully quiesced). Hence rather than tearing down the old root page
table pointer and then establishing the new one, re-assignment needs to
be done in a single step.

This is CVE-2022-26360 / part of XSA-400.

Reported-by: Roger Pau Monné <roger.pau@citrix.com>

Similarly quarantining scratch-page mode relies on page tables to be
continuously wired up.

To avoid complicating things more than necessary, treat all devices
mostly equally, i.e. regardless of their association with any unity map
ranges.  The main difference is when it comes to updating DTEs, which need
to be atomic when there are unity mappings. Yet atomicity can only be
achieved with CMPXCHG16B, availability of which we can't take for granted.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
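
A minimal sketch (made-up field layout and helper names, not the patch's code)
of the best-effort fallback described above for hardware without CMPXCHG16B:
the 128-bit DTE is rewritten as two ordered 64-bit stores, with the quadword
carrying the domain ID written last, so the transient window only ever pairs
the old domain ID with the new page tables; as the comment added by the patch
explains, that is harmless when no other active device shares the old ID.

#include <stdint.h>
#include <stdio.h>

struct dte128 {
    uint64_t lo;   /* page-table root, permissions, ...        */
    uint64_t hi;   /* the domain ID lives here in this sketch  */
};

static void update_dte_best_effort(volatile struct dte128 *dte,
                                   uint64_t new_lo, uint64_t new_hi_with_domid)
{
    /* Volatile stores keep the compiler from reordering them, loosely
     * mirroring write_atomic() in the patch. */
    dte->lo = new_lo;                /* new page-table root goes in first ... */
    dte->hi = new_hi_with_domid;     /* ... and the domain-ID quadword last   */
}

int main(void)
{
    struct dte128 dte = { .lo = 0x1000, .hi = 0x0001 };

    update_dte_best_effort(&dte, 0x2000, 0x0002);
    printf("lo=%#llx hi=%#llx\n",
           (unsigned long long)dte.lo, (unsigned long long)dte.hi);
    return 0;
}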

--- xen/drivers/passthrough/amd/iommu.h.orig
+++ xen/drivers/passthrough/amd/iommu.h
@@ -247,9 +247,13 @@ void amd_iommu_set_intremap_table(struct
                                   const void *ptr,
                                   const struct amd_iommu *iommu,
                                   bool valid);
-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
-				   uint64_t root_ptr, uint16_t domain_id,
-				   uint8_t paging_mode, bool valid);
+#define SET_ROOT_VALID          (1u << 0)
+#define SET_ROOT_WITH_UNITY_MAP (1u << 1)
+int __must_check amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+                                               uint64_t root_ptr,
+                                               uint16_t domain_id,
+                                               uint8_t paging_mode,
+                                               unsigned int flags);
 void iommu_dte_add_device_entry(struct amd_iommu_dte *dte,
                                 const struct ivrs_mappings *ivrs_dev);
 
--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -114,10 +114,69 @@ static unsigned int set_iommu_ptes_prese
     return flush_flags;
 }
 
-void amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
-                                   uint64_t root_ptr, uint16_t domain_id,
-                                   uint8_t paging_mode, bool valid)
+/*
+ * This function returns
+ * - -errno for errors,
+ * - 0 for a successful update, atomic when necessary
+ * - 1 for a successful but non-atomic update, which may need to be warned
+ *   about by the caller.
+ */
+int amd_iommu_set_root_page_table(struct amd_iommu_dte *dte,
+                                  uint64_t root_ptr, uint16_t domain_id,
+                                  uint8_t paging_mode, unsigned int flags)
 {
+    bool valid = flags & SET_ROOT_VALID;
+
+    if ( dte->v && dte->tv &&
+         (cpu_has_cx16 || (flags & SET_ROOT_WITH_UNITY_MAP)) )
+    {
+        union {
+            struct amd_iommu_dte dte;
+            uint64_t raw64[4];
+            __uint128_t raw128[2];
+        } ldte = { .dte = *dte };
+        __uint128_t old = ldte.raw128[0];
+        int ret = 0;
+
+        ldte.dte.domain_id = domain_id;
+        ldte.dte.pt_root = paddr_to_pfn(root_ptr);
+        ldte.dte.iw = true;
+        ldte.dte.ir = true;
+        ldte.dte.paging_mode = paging_mode;
+        ldte.dte.v = valid;
+
+        if ( cpu_has_cx16 )
+        {
+            __uint128_t res = cmpxchg16b(dte, &old, &ldte.raw128[0]);
+
+            /*
+             * Hardware does not update the DTE behind our backs, so the
+             * return value should match "old".
+             */
+            if ( res != old )
+            {
+                printk(XENLOG_ERR
+                       "Dom%d: unexpected DTE %016lx_%016lx (expected %016lx_%016lx)\n",
+                       domain_id,
+                       (uint64_t)(res >> 64), (uint64_t)res,
+                       (uint64_t)(old >> 64), (uint64_t)old);
+                ret = -EILSEQ;
+            }
+        }
+        else /* Best effort, updating domain_id last. */
+        {
+            uint64_t *ptr = (void *)dte;
+
+            write_atomic(ptr + 0, ldte.raw64[0]);
+            /* No barrier should be needed between these two. */
+            write_atomic(ptr + 1, ldte.raw64[1]);
+
+            ret = 1;
+        }
+
+        return ret;
+    }
+
     if ( valid || dte->v )
     {
         dte->tv = false;
@@ -132,6 +191,8 @@ void amd_iommu_set_root_page_table(struc
     smp_wmb();
     dte->tv = true;
     dte->v = valid;
+
+    return 0;
 }
 
 void amd_iommu_set_intremap_table(
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -81,41 +81,82 @@ int get_dma_requestor_id(uint16_t seg, u
     return req_id;
 }
 
-static void amd_iommu_setup_domain_device(
+static int __must_check allocate_domain_resources(struct domain *d)
+{
+    struct domain_iommu *hd = dom_iommu(d);
+    int rc;
+
+    spin_lock(&hd->arch.mapping_lock);
+    rc = amd_iommu_alloc_root(d);
+    spin_unlock(&hd->arch.mapping_lock);
+
+    return rc;
+}
+
+static bool any_pdev_behind_iommu(const struct domain *d,
+                                  const struct pci_dev *exclude,
+                                  const struct amd_iommu *iommu)
+{
+    const struct pci_dev *pdev;
+
+    for_each_pdev ( d, pdev )
+    {
+        if ( pdev == exclude )
+            continue;
+
+        if ( find_iommu_for_device(pdev->seg, pdev->sbdf.bdf) == iommu )
+            return true;
+    }
+
+    return false;
+}
+
+static int __must_check amd_iommu_setup_domain_device(
     struct domain *domain, struct amd_iommu *iommu,
     uint8_t devfn, struct pci_dev *pdev)
 {
     struct amd_iommu_dte *table, *dte;
     unsigned long flags;
-    int req_id, valid = 1;
+    unsigned int req_id, sr_flags;
+    int rc;
     u8 bus = pdev->bus;
     const struct domain_iommu *hd = dom_iommu(domain);
+    const struct ivrs_mappings *ivrs_dev;
+
+    BUG_ON(!hd->arch.amd.paging_mode || !iommu->dev_table.buffer);
 
-    BUG_ON( !hd->arch.amd.root_table ||
-            !hd->arch.amd.paging_mode ||
-            !iommu->dev_table.buffer );
+    rc = allocate_domain_resources(domain);
+    if ( rc )
+        return rc;
 
-    if ( iommu_hwdom_passthrough && is_hardware_domain(domain) )
-        valid = 0;
+    req_id = get_dma_requestor_id(iommu->seg, pdev->sbdf.bdf);
+    ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
+    sr_flags = (iommu_hwdom_passthrough && is_hardware_domain(domain)
+                ? 0 : SET_ROOT_VALID)
+               | (ivrs_dev->unity_map ? SET_ROOT_WITH_UNITY_MAP : 0);
 
     /* get device-table entry */
     req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
     table = iommu->dev_table.buffer;
     dte = &table[req_id];
+    ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
 
     spin_lock_irqsave(&iommu->lock, flags);
 
     if ( !dte->v || !dte->tv )
     {
-        const struct ivrs_mappings *ivrs_dev;
-
         /* bind DTE to domain page-tables */
-        amd_iommu_set_root_page_table(
-            dte, page_to_maddr(hd->arch.amd.root_table),
-            domain->domain_id, hd->arch.amd.paging_mode, valid);
+        rc = amd_iommu_set_root_page_table(
+                 dte, page_to_maddr(hd->arch.amd.root_table),
+                 domain->domain_id, hd->arch.amd.paging_mode, sr_flags);
+        if ( rc )
+        {
+            ASSERT(rc < 0);
+            spin_unlock_irqrestore(&iommu->lock, flags);
+            return rc;
+        }
 
         /* Undo what amd_iommu_disable_domain_device() may have done. */
-        ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
         if ( dte->it_root )
         {
             dte->int_ctl = IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED;
@@ -130,17 +171,73 @@ static void amd_iommu_setup_domain_devic
             dte->i = ats_enabled;
 
         amd_iommu_flush_device(iommu, req_id);
+    }
+    else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.amd.root_table)) )
+    {
+        /*
+         * Strictly speaking if the device is the only one with this requestor
+         * ID, it could be allowed to be re-assigned regardless of unity map
+         * presence.  But let's deal with that case only if it is actually
+         * found in the wild.
+         */
+        if ( req_id != PCI_BDF2(bus, devfn) &&
+             (sr_flags & SET_ROOT_WITH_UNITY_MAP) )
+            rc = -EOPNOTSUPP;
+        else
+            rc = amd_iommu_set_root_page_table(
+                     dte, page_to_maddr(hd->arch.amd.root_table),
+                     domain->domain_id, hd->arch.amd.paging_mode, sr_flags);
+        if ( rc < 0 )
+        {
+            spin_unlock_irqrestore(&iommu->lock, flags);
+            return rc;
+        }
+        if ( rc &&
+             domain != pdev->domain &&
+             /*
+              * By non-atomically updating the DTE's domain ID field last,
+              * during a short window in time TLB entries with the old domain
+              * ID but the new page tables may have been inserted.  This could
+              * affect I/O of other devices using this same (old) domain ID.
+              * Such updating therefore is not a problem if this was the only
+              * device associated with the old domain ID.  Diverting I/O of any
+              * of a dying domain's devices to the quarantine page tables is
+              * intended anyway.
+              */
+             !pdev->domain->is_dying &&
+             (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
+              pdev->phantom_stride) )
+            printk(" %pp: reassignment may cause %pd data corruption\n",
+                   &PCI_SBDF3(pdev->seg, bus, devfn), pdev->domain);
+
+        /*
+         * Check remaining settings are still in place from an earlier call
+         * here. They're all independent of the domain, so should not have
+         * changed.
+         */
+        if ( dte->it_root )
+            ASSERT(dte->int_ctl == IOMMU_DEV_TABLE_INT_CONTROL_TRANSLATED);
+        ASSERT(dte->iv == iommu_intremap);
+        ASSERT(dte->ex == ivrs_dev->dte_allow_exclusion);
+        ASSERT(dte->sys_mgt == MASK_EXTR(ivrs_dev->device_flags,
+                                         ACPI_IVHD_SYSTEM_MGMT));
 
-        AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
-                        "root table = %#"PRIx64", "
-                        "domain = %d, paging mode = %d\n",
-                        req_id, pdev->type,
-                        page_to_maddr(hd->arch.amd.root_table),
-                        domain->domain_id, hd->arch.amd.paging_mode);
+        if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
+             iommu_has_cap(iommu, PCI_CAP_IOTLB_SHIFT) )
+            ASSERT(dte->i == ats_enabled);
+
+        amd_iommu_flush_device(iommu, req_id);
     }
 
     spin_unlock_irqrestore(&iommu->lock, flags);
 
+    AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
+                    "root table = %#"PRIx64", "
+                    "domain = %d, paging mode = %d\n",
+                    req_id, pdev->type,
+                    page_to_maddr(hd->arch.amd.root_table),
+                    domain->domain_id, hd->arch.amd.paging_mode);
+
     ASSERT(pcidevs_locked());
 
     if ( pci_ats_device(iommu->seg, bus, pdev->devfn) &&
@@ -151,6 +248,8 @@ static void amd_iommu_setup_domain_devic
 
         amd_iommu_flush_iotlb(devfn, pdev, INV_IOMMU_ALL_PAGES_ADDRESS, 0);
     }
+
+    return 0;
 }
 
 int __init acpi_ivrs_init(void)
@@ -216,18 +315,6 @@ int amd_iommu_alloc_root(struct domain *
     return 0;
 }
 
-static int __must_check allocate_domain_resources(struct domain *d)
-{
-    struct domain_iommu *hd = dom_iommu(d);
-    int rc;
-
-    spin_lock(&hd->arch.mapping_lock);
-    rc = amd_iommu_alloc_root(d);
-    spin_unlock(&hd->arch.mapping_lock);
-
-    return rc;
-}
-
 int __read_mostly amd_iommu_min_paging_mode = 1;
 
 static int amd_iommu_domain_init(struct domain *d)
@@ -340,7 +427,15 @@ static int reassign_device(struct domain
         return -ENODEV;
     }
 
-    amd_iommu_disable_domain_device(source, iommu, devfn, pdev);
+    rc = amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
+    if ( rc )
+        return rc;
+
+    if ( devfn == pdev->devfn && pdev->domain != target )
+    {
+        list_move(&pdev->domain_list, &target->pdev_list);
+        pdev->domain = target;
+    }
 
     /*
      * If the device belongs to the hardware domain, and it has a unity mapping,
@@ -356,26 +451,9 @@ static int reassign_device(struct domain
             return rc;
     }
 
-    if ( devfn == pdev->devfn && pdev->domain != dom_io )
-    {
-        list_move(&pdev->domain_list, &dom_io->pdev_list);
-        pdev->domain = dom_io;
-    }
-
-    rc = allocate_domain_resources(target);
-    if ( rc )
-        return rc;
-
-    amd_iommu_setup_domain_device(target, iommu, devfn, pdev);
     AMD_IOMMU_DEBUG("Re-assign %pp from dom%d to dom%d\n",
                     &pdev->sbdf, source->domain_id, target->domain_id);
 
-    if ( devfn == pdev->devfn && pdev->domain != target )
-    {
-        list_move(&pdev->domain_list, &target->pdev_list);
-        pdev->domain = target;
-    }
-
     return 0;
 }
 
@@ -490,8 +568,7 @@ static int amd_iommu_add_device(u8 devfn
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
 
-    amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
-    return 0;
+    return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
 }
 
 static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: prepare for per-device quarantine page tables (part I)

Arrange for domain ID and page table root to be passed around, the latter in
particular to domain_pgd_maddr() such that taking it from the per-domain
fields can be overridden.

No functional change intended.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>

--- xen/drivers/passthrough/vtd/extern.h.orig
+++ xen/drivers/passthrough/vtd/extern.h
@@ -85,9 +85,10 @@ void *map_vtd_domain_page(u64 maddr);
 void unmap_vtd_domain_page(const void *va);
 int domain_context_mapping_one(struct domain *domain, struct vtd_iommu *iommu,
                                uint8_t bus, uint8_t devfn,
-                               const struct pci_dev *pdev, unsigned int mode);
+                               const struct pci_dev *pdev, domid_t domid,
+                               paddr_t pgd_maddr, unsigned int mode);
 int domain_context_unmap_one(struct domain *domain, struct vtd_iommu *iommu,
-                             u8 bus, u8 devfn);
+                             uint8_t bus, uint8_t devfn, domid_t domid);
 int intel_iommu_get_reserved_device_memory(iommu_grdm_t *func, void *ctxt);
 
 unsigned int io_apic_read_remap_rte(unsigned int apic, unsigned int reg);
@@ -105,7 +106,8 @@ void platform_quirks_init(void);
 void vtd_ops_preamble_quirk(struct vtd_iommu *iommu);
 void vtd_ops_postamble_quirk(struct vtd_iommu *iommu);
 int __must_check me_wifi_quirk(struct domain *domain, uint8_t bus,
-                               uint8_t devfn, unsigned int mode);
+                               uint8_t devfn, domid_t domid, paddr_t pgd_maddr,
+                               unsigned int mode);
 void pci_vtd_quirk(const struct pci_dev *);
 void quirk_iommu_caps(struct vtd_iommu *iommu);
 
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -355,15 +355,17 @@ static u64 addr_to_dma_page_maddr(struct
     return pte_maddr;
 }
 
-static uint64_t domain_pgd_maddr(struct domain *d, unsigned int nr_pt_levels)
+static paddr_t domain_pgd_maddr(struct domain *d, paddr_t pgd_maddr,
+                                unsigned int nr_pt_levels)
 {
     struct domain_iommu *hd = dom_iommu(d);
-    uint64_t pgd_maddr;
     unsigned int agaw;
 
     ASSERT(spin_is_locked(&hd->arch.mapping_lock));
 
-    if ( iommu_use_hap_pt(d) )
+    if ( pgd_maddr )
+        /* nothing */;
+    else if ( iommu_use_hap_pt(d) )
     {
         pagetable_t pgt = p2m_get_pagetable(p2m_get_hostp2m(d));
 
@@ -1376,12 +1378,12 @@ int domain_context_mapping_one(
     struct domain *domain,
     struct vtd_iommu *iommu,
     uint8_t bus, uint8_t devfn, const struct pci_dev *pdev,
-    unsigned int mode)
+    domid_t domid, paddr_t pgd_maddr, unsigned int mode)
 {
     struct domain_iommu *hd = dom_iommu(domain);
     struct context_entry *context, *context_entries, lctxt;
     __uint128_t old;
-    u64 maddr, pgd_maddr;
+    uint64_t maddr;
     uint16_t seg = iommu->drhd->segment, prev_did = 0;
     struct domain *prev_dom = NULL;
     int rc, ret;
@@ -1421,10 +1423,12 @@ int domain_context_mapping_one(
     }
     else
     {
+        paddr_t root;
+
         spin_lock(&hd->arch.mapping_lock);
 
-        pgd_maddr = domain_pgd_maddr(domain, iommu->nr_pt_levels);
-        if ( !pgd_maddr )
+        root = domain_pgd_maddr(domain, pgd_maddr, iommu->nr_pt_levels);
+        if ( !root )
         {
             spin_unlock(&hd->arch.mapping_lock);
             spin_unlock(&iommu->lock);
@@ -1434,7 +1438,7 @@ int domain_context_mapping_one(
             return -ENOMEM;
         }
 
-        context_set_address_root(lctxt, pgd_maddr);
+        context_set_address_root(lctxt, root);
         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
             context_set_translation_type(lctxt, CONTEXT_TT_DEV_IOTLB);
         else
@@ -1550,15 +1554,21 @@ int domain_context_mapping_one(
     unmap_vtd_domain_page(context_entries);
 
     if ( !seg && !rc )
-        rc = me_wifi_quirk(domain, bus, devfn, mode);
+        rc = me_wifi_quirk(domain, bus, devfn, domid, pgd_maddr, mode);
 
     if ( rc )
     {
         if ( !prev_dom )
-            ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+            ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+                                           domain->domain_id);
         else if ( prev_dom != domain ) /* Avoid infinite recursion. */
+        {
+            hd = dom_iommu(prev_dom);
             ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
+                                             domain->domain_id,
+                                             hd->arch.vtd.pgd_maddr,
                                              mode & MAP_WITH_RMRR) < 0;
+        }
         else
             ret = 1;
 
@@ -1580,6 +1590,7 @@ static int domain_context_mapping(struct
 {
     struct acpi_drhd_unit *drhd;
     const struct acpi_rmrr_unit *rmrr;
+    paddr_t pgd_maddr = dom_iommu(domain)->arch.vtd.pgd_maddr;
     int ret = 0;
     unsigned int i, mode = 0;
     uint16_t seg = pdev->seg, bdf;
@@ -1643,7 +1654,8 @@ static int domain_context_mapping(struct
             printk(VTDPREFIX "%pd:PCIe: map %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, mode);
+                                         pdev, domain->domain_id, pgd_maddr,
+                                         mode);
         if ( ret > 0 )
             ret = 0;
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
@@ -1657,7 +1669,8 @@ static int domain_context_mapping(struct
                    domain, &PCI_SBDF3(seg, bus, devfn));
 
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, mode);
+                                         pdev, domain->domain_id, pgd_maddr,
+                                         mode);
         if ( ret < 0 )
             break;
         prev_present = ret;
@@ -1685,7 +1698,8 @@ static int domain_context_mapping(struct
          */
         if ( ret >= 0 )
             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                             NULL, mode);
+                                             NULL, domain->domain_id, pgd_maddr,
+                                             mode);
 
         /*
          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
@@ -1700,7 +1714,8 @@ static int domain_context_mapping(struct
         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
              (secbus != pdev->bus || pdev->devfn != 0) )
             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-                                             NULL, mode);
+                                             NULL, domain->domain_id, pgd_maddr,
+                                             mode);
 
         if ( ret )
         {
@@ -1728,7 +1743,7 @@ static int domain_context_mapping(struct
 int domain_context_unmap_one(
     struct domain *domain,
     struct vtd_iommu *iommu,
-    u8 bus, u8 devfn)
+    uint8_t bus, uint8_t devfn, domid_t domid)
 {
     struct context_entry *context, *context_entries;
     u64 maddr;
@@ -1786,7 +1801,7 @@ int domain_context_unmap_one(
     unmap_vtd_domain_page(context_entries);
 
     if ( !iommu->drhd->segment && !rc )
-        rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
+        rc = me_wifi_quirk(domain, bus, devfn, domid, 0, UNMAP_ME_PHANTOM_FUNC);
 
     if ( rc && !is_hardware_domain(domain) && domain != dom_io )
     {
@@ -1837,7 +1852,8 @@ static int domain_context_unmap(struct d
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:PCIe: unmap %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
-        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+        ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+                                       domain->domain_id);
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
             disable_ats_device(pdev);
 
@@ -1847,7 +1863,8 @@ static int domain_context_unmap(struct d
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:PCI: unmap %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
-        ret = domain_context_unmap_one(domain, iommu, bus, devfn);
+        ret = domain_context_unmap_one(domain, iommu, bus, devfn,
+                                       domain->domain_id);
         if ( ret )
             break;
 
@@ -1873,12 +1890,15 @@ static int domain_context_unmap(struct d
         /* PCIe to PCI/PCIx bridge */
         if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
         {
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
+            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+                                           domain->domain_id);
             if ( !ret )
-                ret = domain_context_unmap_one(domain, iommu, secbus, 0);
+                ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+                                               domain->domain_id);
         }
         else /* Legacy PCI bridge */
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
+            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+                                           domain->domain_id);
 
         break;
 
--- xen/drivers/passthrough/vtd/quirks.c.orig
+++ xen/drivers/passthrough/vtd/quirks.c
@@ -345,6 +345,8 @@ void __init platform_quirks_init(void)
 
 static int __must_check map_me_phantom_function(struct domain *domain,
                                                 unsigned int dev,
+                                                domid_t domid,
+                                                paddr_t pgd_maddr,
                                                 unsigned int mode)
 {
     struct acpi_drhd_unit *drhd;
@@ -358,16 +360,17 @@ static int __must_check map_me_phantom_f
     /* map or unmap ME phantom function */
     if ( !(mode & UNMAP_ME_PHANTOM_FUNC) )
         rc = domain_context_mapping_one(domain, drhd->iommu, 0,
-                                        PCI_DEVFN(dev, 7), NULL, mode);
+                                        PCI_DEVFN(dev, 7), NULL,
+                                        domid, pgd_maddr, mode);
     else
         rc = domain_context_unmap_one(domain, drhd->iommu, 0,
-                                      PCI_DEVFN(dev, 7));
+                                      PCI_DEVFN(dev, 7), domid);
 
     return rc;
 }
 
 int me_wifi_quirk(struct domain *domain, uint8_t bus, uint8_t devfn,
-                  unsigned int mode)
+                  domid_t domid, paddr_t pgd_maddr, unsigned int mode)
 {
     u32 id;
     int rc = 0;
@@ -391,7 +394,7 @@ int me_wifi_quirk(struct domain *domain,
             case 0x423b8086:
             case 0x423c8086:
             case 0x423d8086:
-                rc = map_me_phantom_function(domain, 3, mode);
+                rc = map_me_phantom_function(domain, 3, domid, pgd_maddr, mode);
                 break;
             default:
                 break;
@@ -417,7 +420,7 @@ int me_wifi_quirk(struct domain *domain,
             case 0x42388086:        /* Puma Peak */
             case 0x422b8086:
             case 0x422c8086:
-                rc = map_me_phantom_function(domain, 22, mode);
+                rc = map_me_phantom_function(domain, 22, domid, pgd_maddr, mode);
                 break;
             default:
                 break;
From: Jan Beulich <jbeulich@suse.com>
Subject: VT-d: prepare for per-device quarantine page tables (part II)

Replace the passing of struct domain * with domid_t, in preparation for
per-device quarantine page tables, which will also require per-device
pseudo domain IDs that aren't going to be associated with any struct
domain instances.

No functional change intended (except for slightly adjusted log message
text).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
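
The change is mechanical: functions which previously took a struct domain *
only to read its domain_id now take the ID directly, so that later patches
can pass per-device pseudo IDs which have no struct domain behind them. As a
rough sketch of the resulting calling convention (the prototypes are quoted
from the hunks below; remaining parameters and bodies are unchanged):

    /* Before: the DID lookup needed a struct domain just for its ID. */
    static int domain_iommu_domid(struct domain *d, struct vtd_iommu *iommu);

    /* After: callers pass a bare domid_t, which may later be a pseudo ID
     * with no associated struct domain.  The "warn" flag replaces the old
     * !d->is_dying check, since a pseudo ID has no dying state to consult. */
    static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu,
                             bool warn);

    /* A typical call site then reads: */
    iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying);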

--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -59,8 +59,8 @@ static struct tasklet vtd_fault_tasklet;
 static int setup_hwdom_device(u8 devfn, struct pci_dev *);
 static void setup_hwdom_rmrr(struct domain *d);
 
-static int domain_iommu_domid(struct domain *d,
-                              struct vtd_iommu *iommu)
+static int get_iommu_did(domid_t domid, const struct vtd_iommu *iommu,
+                         bool warn)
 {
     unsigned long nr_dom, i;
 
@@ -68,16 +68,16 @@ static int domain_iommu_domid(struct dom
     i = find_first_bit(iommu->domid_bitmap, nr_dom);
     while ( i < nr_dom )
     {
-        if ( iommu->domid_map[i] == d->domain_id )
+        if ( iommu->domid_map[i] == domid )
             return i;
 
         i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
     }
 
-    if ( !d->is_dying )
+    if ( warn )
         dprintk(XENLOG_ERR VTDPREFIX,
-                "Cannot get valid iommu %u domid: %pd\n",
-                iommu->index, d);
+                "No valid iommu %u domid for Dom%d\n",
+                iommu->index, domid);
 
     return -1;
 }
@@ -85,8 +85,7 @@ static int domain_iommu_domid(struct dom
 #define DID_FIELD_WIDTH 16
 #define DID_HIGH_OFFSET 8
 static int context_set_domain_id(struct context_entry *context,
-                                 struct domain *d,
-                                 struct vtd_iommu *iommu)
+                                 domid_t domid, struct vtd_iommu *iommu)
 {
     unsigned long nr_dom, i;
     int found = 0;
@@ -97,7 +96,7 @@ static int context_set_domain_id(struct
     i = find_first_bit(iommu->domid_bitmap, nr_dom);
     while ( i < nr_dom )
     {
-        if ( iommu->domid_map[i] == d->domain_id )
+        if ( iommu->domid_map[i] == domid )
         {
             found = 1;
             break;
@@ -113,7 +112,7 @@ static int context_set_domain_id(struct
             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
             return -EFAULT;
         }
-        iommu->domid_map[i] = d->domain_id;
+        iommu->domid_map[i] = domid;
     }
 
     set_bit(i, iommu->domid_bitmap);
@@ -122,9 +121,9 @@ static int context_set_domain_id(struct
     return 0;
 }
 
-static void cleanup_domid_map(struct domain *domain, struct vtd_iommu *iommu)
+static void cleanup_domid_map(domid_t domid, struct vtd_iommu *iommu)
 {
-    int iommu_domid = domain_iommu_domid(domain, iommu);
+    int iommu_domid = get_iommu_did(domid, iommu, false);
 
     if ( iommu_domid >= 0 )
     {
@@ -180,7 +179,7 @@ static void check_cleanup_domid_map(stru
     if ( !found )
     {
         clear_bit(iommu->index, &dom_iommu(d)->arch.vtd.iommu_bitmap);
-        cleanup_domid_map(d, iommu);
+        cleanup_domid_map(d->domain_id, iommu);
     }
 }
 
@@ -687,7 +686,7 @@ static int __must_check iommu_flush_iotl
             continue;
 
         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
-        iommu_domid= domain_iommu_domid(d, iommu);
+        iommu_domid = get_iommu_did(d->domain_id, iommu, !d->is_dying);
         if ( iommu_domid == -1 )
             continue;
 
@@ -1447,7 +1446,7 @@ int domain_context_mapping_one(
         spin_unlock(&hd->arch.mapping_lock);
     }
 
-    if ( context_set_domain_id(&lctxt, domain, iommu) )
+    if ( context_set_domain_id(&lctxt, domid, iommu) )
     {
     unlock:
         spin_unlock(&iommu->lock);
@@ -1768,7 +1767,7 @@ int domain_context_unmap_one(
     context_clear_entry(*context);
     iommu_sync_cache(context, sizeof(struct context_entry));
 
-    iommu_domid= domain_iommu_domid(domain, iommu);
+    iommu_domid = get_iommu_did(domid, iommu, !domain->is_dying);
     if ( iommu_domid == -1 )
     {
         spin_unlock(&iommu->lock);
@@ -1938,7 +1937,7 @@ static void iommu_domain_teardown(struct
     ASSERT(!hd->arch.vtd.pgd_maddr);
 
     for_each_drhd_unit ( drhd )
-        cleanup_domid_map(d, drhd->iommu);
+        cleanup_domid_map(d->domain_id, drhd->iommu);
 }
 
 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU/x86: maintain a per-device pseudo domain ID

In order to subsequently enable per-device quarantine page tables, we'll
need domain-ID-like identifiers to be inserted in the respective device
(AMD) or context (Intel) table entries alongside the per-device page
table root addresses.

Make use of "real" domain IDs occupying only half of the value range
coverable by domid_t.

Note that in VT-d's iommu_alloc() I didn't want to introduce new memory
leaks in case of error, but existing ones don't get plugged - that'll be
the subject of a later change.

The VT-d changes are slightly asymmetric, but this way we can avoid
assigning pseudo domain IDs to devices which would never be mapped,
while still avoiding the addition of a new parameter to
domain_context_unmap().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
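
To illustrate the ID-space split: real domain IDs never exceed DOMID_MASK,
so the per-device pseudo IDs are taken from the other half of the 16-bit
space. Below is a minimal sketch of the bitmap allocator introduced in
xen/drivers/passthrough/x86/iommu.c further down; the round-robin start
cursor, the locking assertion and the ZERO_BLOCK_PTR handling of the real
iommu_alloc_domid() are omitted, and alloc_pseudo_domid is only an
illustrative name:

    /* Bit 0 of the map corresponds to pseudo ID DOMID_MASK + 1. */
    static domid_t alloc_pseudo_domid(unsigned long *map)
    {
        unsigned int idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK);

        if ( idx >= UINT16_MAX - DOMID_MASK )
            return DOMID_INVALID;           /* pseudo ID space exhausted */

        __set_bit(idx, map);

        return idx | (DOMID_MASK + 1);      /* land in the "pseudo" half */
    }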

--- xen/include/asm-x86/iommu.h.orig
+++ xen/include/asm-x86/iommu.h
@@ -142,6 +142,10 @@ int pi_update_irte(const struct pi_desc
         iommu_vcall(ops, sync_cache, addr, size);       \
 })
 
+unsigned long *iommu_init_domid(void);
+domid_t iommu_alloc_domid(unsigned long *map);
+void iommu_free_domid(domid_t domid, unsigned long *map);
+
 int __must_check iommu_free_pgtables(struct domain *d);
 struct domain_iommu;
 struct page_info *__must_check iommu_alloc_pgtable(struct domain_iommu *hd);
--- xen/include/asm-x86/pci.h.orig
+++ xen/include/asm-x86/pci.h
@@ -15,6 +15,12 @@
 
 struct arch_pci_dev {
     vmask_t used_vectors;
+    /*
+     * These fields are (de)initialized under pcidevs-lock. Other uses of
+     * them don't race (de)initialization and hence don't strictly need any
+     * locking.
+     */
+    domid_t pseudo_domid;
 };
 
 int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
--- xen/drivers/passthrough/amd/iommu.h.orig
+++ xen/drivers/passthrough/amd/iommu.h
@@ -96,6 +96,7 @@ struct amd_iommu {
     struct ring_buffer cmd_buffer;
     struct ring_buffer event_log;
     struct ring_buffer ppr_log;
+    unsigned long *domid_map;
 
     int exclusion_enable;
     int exclusion_allow_all;
--- xen/drivers/passthrough/amd/iommu_detect.c.orig
+++ xen/drivers/passthrough/amd/iommu_detect.c
@@ -180,6 +180,11 @@ int __init amd_iommu_detect_one_acpi(
     if ( rt )
         goto out;
 
+    iommu->domid_map = iommu_init_domid();
+    rt = -ENOMEM;
+    if ( !iommu->domid_map )
+        goto out;
+
     rt = pci_ro_device(iommu->seg, bus, PCI_DEVFN(dev, func));
     if ( rt )
         printk(XENLOG_ERR "Could not mark config space of %pp read-only (%d)\n",
@@ -190,7 +195,10 @@ int __init amd_iommu_detect_one_acpi(
 
  out:
     if ( rt )
+    {
+        xfree(iommu->domid_map);
         xfree(iommu);
+    }
 
     return rt;
 }
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -508,6 +508,8 @@ static int amd_iommu_add_device(u8 devfn
     struct amd_iommu *iommu;
     u16 bdf;
     struct ivrs_mappings *ivrs_mappings;
+    bool fresh_domid = false;
+    int ret;
 
     if ( !pdev->domain )
         return -EINVAL;
@@ -568,7 +570,22 @@ static int amd_iommu_add_device(u8 devfn
         spin_unlock_irqrestore(&iommu->lock, flags);
     }
 
-    return amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
+    if ( iommu_quarantine && pdev->arch.pseudo_domid == DOMID_INVALID )
+    {
+        pdev->arch.pseudo_domid = iommu_alloc_domid(iommu->domid_map);
+        if ( pdev->arch.pseudo_domid == DOMID_INVALID )
+            return -ENOSPC;
+        fresh_domid = true;
+    }
+
+    ret = amd_iommu_setup_domain_device(pdev->domain, iommu, devfn, pdev);
+    if ( ret && fresh_domid )
+    {
+        iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
+        pdev->arch.pseudo_domid = DOMID_INVALID;
+    }
+
+    return ret;
 }
 
 static int amd_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
@@ -591,6 +608,9 @@ static int amd_iommu_remove_device(u8 de
 
     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
 
+    iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
+    pdev->arch.pseudo_domid = DOMID_INVALID;
+
     ivrs_mappings = get_ivrs_mappings(pdev->seg);
     bdf = PCI_BDF2(pdev->bus, devfn);
     if ( amd_iommu_perdev_intremap &&
--- xen/drivers/passthrough/pci.c.orig
+++ xen/drivers/passthrough/pci.c
@@ -327,6 +327,7 @@ static struct pci_dev *alloc_pdev(struct
     *((u8*) &pdev->bus) = bus;
     *((u8*) &pdev->devfn) = devfn;
     pdev->domain = NULL;
+    pdev->arch.pseudo_domid = DOMID_INVALID;
     INIT_LIST_HEAD(&pdev->msi_list);
 
     pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
@@ -1276,8 +1277,12 @@ static int _dump_pci_devices(struct pci_
 
     list_for_each_entry ( pdev, &pseg->alldevs_list, alldevs_list )
     {
-        printk("%pp - %pd - node %-3d - MSIs < ",
-               &pdev->sbdf, pdev->domain,
+        printk("%pp - ", &pdev->sbdf);
+        if ( pdev->domain == dom_io )
+            printk("DomIO:%x", pdev->arch.pseudo_domid);
+        else
+            printk("%pd", pdev->domain);
+        printk(" - node %-3d - MSIs < ",
                (pdev->node != NUMA_NO_NODE) ? pdev->node : -1);
         list_for_each_entry ( msi, &pdev->msi_list, list )
                printk("%d ", msi->irq);
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -22,6 +22,7 @@
 #include <xen/sched.h>
 #include <xen/xmalloc.h>
 #include <xen/domain_page.h>
+#include <xen/err.h>
 #include <xen/iocap.h>
 #include <xen/iommu.h>
 #include <xen/numa.h>
@@ -1204,7 +1205,7 @@ int __init iommu_alloc(struct acpi_drhd_
 {
     struct vtd_iommu *iommu;
     unsigned long sagaw, nr_dom;
-    int agaw;
+    int agaw, rc;
 
     if ( nr_iommus >= MAX_IOMMUS )
     {
@@ -1297,7 +1298,16 @@ int __init iommu_alloc(struct acpi_drhd_
     if ( !iommu->domid_map )
         return -ENOMEM;
 
+    iommu->pseudo_domid_map = iommu_init_domid();
+    rc = -ENOMEM;
+    if ( !iommu->pseudo_domid_map )
+        goto free;
+
     return 0;
+
+ free:
+    iommu_free(drhd);
+    return rc;
 }
 
 void __init iommu_free(struct acpi_drhd_unit *drhd)
@@ -1320,6 +1330,7 @@ void __init iommu_free(struct acpi_drhd_
 
     xfree(iommu->domid_bitmap);
     xfree(iommu->domid_map);
+    xfree(iommu->pseudo_domid_map);
 
     if ( iommu->msi.irq >= 0 )
         destroy_irq(iommu->msi.irq);
@@ -1581,8 +1592,8 @@ int domain_context_mapping_one(
     return rc ?: pdev && prev_dom;
 }
 
-static int domain_context_unmap(struct domain *d, uint8_t devfn,
-                                struct pci_dev *pdev);
+static const struct acpi_drhd_unit *domain_context_unmap(
+    struct domain *d, uint8_t devfn, struct pci_dev *pdev);
 
 static int domain_context_mapping(struct domain *domain, u8 devfn,
                                   struct pci_dev *pdev)
@@ -1590,6 +1601,7 @@ static int domain_context_mapping(struct
     struct acpi_drhd_unit *drhd;
     const struct acpi_rmrr_unit *rmrr;
     paddr_t pgd_maddr = dom_iommu(domain)->arch.vtd.pgd_maddr;
+    domid_t orig_domid = pdev->arch.pseudo_domid;
     int ret = 0;
     unsigned int i, mode = 0;
     uint16_t seg = pdev->seg, bdf;
@@ -1649,6 +1661,14 @@ static int domain_context_mapping(struct
         break;
 
     case DEV_TYPE_PCIe_ENDPOINT:
+        if ( iommu_quarantine && orig_domid == DOMID_INVALID )
+        {
+            pdev->arch.pseudo_domid =
+                iommu_alloc_domid(drhd->iommu->pseudo_domid_map);
+            if ( pdev->arch.pseudo_domid == DOMID_INVALID )
+                return -ENOSPC;
+        }
+
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:PCIe: map %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
@@ -1663,6 +1683,14 @@ static int domain_context_mapping(struct
         break;
 
     case DEV_TYPE_PCI:
+        if ( iommu_quarantine && orig_domid == DOMID_INVALID )
+        {
+            pdev->arch.pseudo_domid =
+                iommu_alloc_domid(drhd->iommu->pseudo_domid_map);
+            if ( pdev->arch.pseudo_domid == DOMID_INVALID )
+                return -ENOSPC;
+        }
+
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:PCI: map %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
@@ -1736,6 +1764,13 @@ static int domain_context_mapping(struct
     if ( !ret && devfn == pdev->devfn )
         pci_vtd_quirk(pdev);
 
+    if ( ret && drhd && orig_domid == DOMID_INVALID )
+    {
+        iommu_free_domid(pdev->arch.pseudo_domid,
+                         drhd->iommu->pseudo_domid_map);
+        pdev->arch.pseudo_domid = DOMID_INVALID;
+    }
+
     return ret;
 }
 
@@ -1818,8 +1853,10 @@ int domain_context_unmap_one(
     return rc;
 }
 
-static int domain_context_unmap(struct domain *domain, u8 devfn,
-                                struct pci_dev *pdev)
+static const struct acpi_drhd_unit *domain_context_unmap(
+    struct domain *domain,
+    uint8_t devfn,
+    struct pci_dev *pdev)
 {
     struct acpi_drhd_unit *drhd;
     struct vtd_iommu *iommu;
@@ -1829,7 +1866,7 @@ static int domain_context_unmap(struct d
 
     drhd = acpi_find_matched_drhd_unit(pdev);
     if ( !drhd )
-        return -ENODEV;
+        return ERR_PTR(-ENODEV);
     iommu = drhd->iommu;
 
     switch ( pdev->type )
@@ -1839,7 +1876,7 @@ static int domain_context_unmap(struct d
             printk(VTDPREFIX "%pd:Hostbridge: skip %pp unmap\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         if ( !is_hardware_domain(domain) )
-            return -EPERM;
+            return ERR_PTR(-EPERM);
         goto out;
 
     case DEV_TYPE_PCIe_BRIDGE:
@@ -1912,7 +1949,7 @@ static int domain_context_unmap(struct d
         check_cleanup_domid_map(domain, pdev, iommu);
 
 out:
-    return ret;
+    return ret ? ERR_PTR(ret) : drhd;
 }
 
 static void iommu_clear_root_pgtable(struct domain *d)
@@ -2137,16 +2174,17 @@ static int intel_iommu_enable_device(str
 
 static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
 {
+    const struct acpi_drhd_unit *drhd;
     struct acpi_rmrr_unit *rmrr;
     u16 bdf;
-    int ret, i;
+    unsigned int i;
 
     if ( !pdev->domain )
         return -EINVAL;
 
-    ret = domain_context_unmap(pdev->domain, devfn, pdev);
-    if ( ret )
-        return ret;
+    drhd = domain_context_unmap(pdev->domain, devfn, pdev);
+    if ( IS_ERR(drhd) )
+        return PTR_ERR(drhd);
 
     for_each_rmrr_device ( rmrr, bdf, i )
     {
@@ -2163,6 +2201,13 @@ static int intel_iommu_remove_device(u8
                                rmrr->end_address, 0);
     }
 
+    if ( drhd )
+    {
+        iommu_free_domid(pdev->arch.pseudo_domid,
+                         drhd->iommu->pseudo_domid_map);
+        pdev->arch.pseudo_domid = DOMID_INVALID;
+    }
+
     return 0;
 }
 
--- xen/drivers/passthrough/vtd/iommu.h.orig
+++ xen/drivers/passthrough/vtd/iommu.h
@@ -535,6 +535,7 @@ struct vtd_iommu {
     } flush;
 
     struct list_head ats_devices;
+    unsigned long *pseudo_domid_map; /* "pseudo" domain id bitmap */
     unsigned long *domid_bitmap;  /* domain id bitmap */
     u16 *domid_map;               /* domain id mapping array */
     uint32_t version;
--- xen/drivers/passthrough/x86/iommu.c.orig
+++ xen/drivers/passthrough/x86/iommu.c
@@ -386,6 +386,53 @@ void __hwdom_init arch_iommu_hwdom_init(
         return;
 }
 
+unsigned long *__init iommu_init_domid(void)
+{
+    if ( !iommu_quarantine )
+        return ZERO_BLOCK_PTR;
+
+    BUILD_BUG_ON(DOMID_MASK * 2U >= UINT16_MAX);
+
+    return xzalloc_array(unsigned long,
+                         BITS_TO_LONGS(UINT16_MAX - DOMID_MASK));
+}
+
+domid_t iommu_alloc_domid(unsigned long *map)
+{
+    /*
+     * This is used uniformly across all IOMMUs, such that on typical
+     * systems we wouldn't re-use the same ID very quickly (perhaps never).
+     */
+    static unsigned int start;
+    unsigned int idx = find_next_zero_bit(map, UINT16_MAX - DOMID_MASK, start);
+
+    ASSERT(pcidevs_locked());
+
+    if ( idx >= UINT16_MAX - DOMID_MASK )
+        idx = find_first_zero_bit(map, UINT16_MAX - DOMID_MASK);
+    if ( idx >= UINT16_MAX - DOMID_MASK )
+        return DOMID_INVALID;
+
+    __set_bit(idx, map);
+
+    start = idx + 1;
+
+    return idx | (DOMID_MASK + 1);
+}
+
+void iommu_free_domid(domid_t domid, unsigned long *map)
+{
+    ASSERT(pcidevs_locked());
+
+    if ( domid == DOMID_INVALID )
+        return;
+
+    ASSERT(domid > DOMID_MASK);
+
+    if ( !__test_and_clear_bit(domid & DOMID_MASK, map) )
+        BUG();
+}
+
 int iommu_free_pgtables(struct domain *d)
 {
     struct domain_iommu *hd = dom_iommu(d);
From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU/x86: drop TLB flushes from quarantine_init() hooks

The page tables just created aren't hooked up yet anywhere, so there's
nothing that could be present in any TLB, and hence nothing to flush.
Dropping this flush is, at least on the VT-d side, a prereq to per-
device domain ID use when quarantining devices, as dom_io isn't going
to be assigned a DID anymore: The warning in get_iommu_did() would
trigger.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>

--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -584,8 +584,6 @@ int __init amd_iommu_quarantine_init(str
  out:
     spin_unlock(&hd->arch.mapping_lock);
 
-    amd_iommu_flush_all_pages(d);
-
     /* Pages leaked in failure case */
     return level ? -ENOMEM : 0;
 }
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -2958,9 +2958,6 @@ static int __init intel_iommu_quarantine
  out:
     spin_unlock(&hd->arch.mapping_lock);
 
-    if ( !rc )
-        rc = iommu_flush_iotlb_all(d);
-
     /* Pages may be leaked in failure case */
     return rc;
 }
From: Jan Beulich <jbeulich@suse.com>
Subject: AMD/IOMMU: abstract maximum number of page table levels

We will want to use the constant elsewhere.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>

--- xen/drivers/passthrough/amd/iommu.h.orig
+++ xen/drivers/passthrough/amd/iommu.h
@@ -358,7 +358,7 @@ static inline int amd_iommu_get_paging_m
     while ( max_frames > PTE_PER_TABLE_SIZE )
     {
         max_frames = PTE_PER_TABLE_ALIGN(max_frames) >> PTE_PER_TABLE_SHIFT;
-        if ( ++level > 6 )
+        if ( ++level > IOMMU_MAX_PT_LEVELS )
             return -ENOMEM;
     }
 
--- xen/drivers/passthrough/amd/iommu-defs.h.orig
+++ xen/drivers/passthrough/amd/iommu-defs.h
@@ -106,6 +106,7 @@ struct amd_iommu_dte {
     bool tv:1;
     unsigned int :5;
     unsigned int had:2;
+#define IOMMU_MAX_PT_LEVELS 6
     unsigned int paging_mode:3;
     uint64_t pt_root:40;
     bool ppr:1;
--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -250,7 +250,7 @@ static int iommu_pde_from_dfn(struct dom
     table = hd->arch.amd.root_table;
     level = hd->arch.amd.paging_mode;
 
-    BUG_ON( table == NULL || level < 1 || level > 6 );
+    BUG_ON( table == NULL || level < 1 || level > IOMMU_MAX_PT_LEVELS );
 
     /*
      * A frame number past what the current page tables can represent can't
From: Jan Beulich <jbeulich@suse.com>
Subject: IOMMU/x86: use per-device page tables for quarantining

Devices with RMRRs / unity mapped regions may not be left disconnected
from the mappings of these regions (as long as it's not certain that the
device has been fully quiesced), since it is unspecified how/when these
memory regions may be accessed. Hence even the page tables used when
quarantining such devices need to have mappings of those regions. This
implies installing page tables in the first place even when not in
scratch-page quarantining mode.

This is CVE-2022-26361 / part of XSA-400.

While for the purpose here it would be sufficient to have devices with
RMRRs / unity mapped regions use per-device page tables, extend this to
all devices (in scratch-page quarantining mode). This allows the leaf
pages to be mapped r/w, thus also covering memory writes (rather than
just reads) issued by non-quiescent devices.

Set up quarantine page tables as late as possible, yet early enough to
not encounter failure during de-assign. This means setup generally
happens in assign_device(), while (for now) the one in deassign_device()
is there mainly to be on the safe side.

In VT-d's DID allocation function don't require the IOMMU lock to be
held anymore: All involved code paths hold pcidevs_lock, so this way we
avoid the need to acquire the IOMMU lock around the new call to
context_set_domain_id().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Paul Durrant <paul@xen.org>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
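
The crux on the VT-d side is that context entries written for dom_io now
point at the per-device page tables and pseudo DID kept in struct
arch_pci_dev rather than at dom_io's shared ones; the two helpers below
(quoted from the iommu.c hunk further down) capture that selection, while
the AMD side makes the equivalent choice open-coded in
amd_iommu_setup_domain_device():

    #define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
                                                 : (pdev)->arch.pseudo_domid)
    #define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
                                     ? dom_iommu(d)->arch.vtd.pgd_maddr \
                                     : (pdev)->arch.vtd.pgd_maddr)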

--- xen/arch/x86/mm/p2m.c.orig
+++ xen/arch/x86/mm/p2m.c
@@ -1468,7 +1468,7 @@ int set_identity_p2m_entry(struct domain
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
     int ret;
 
-    if ( !paging_mode_translate(p2m->domain) )
+    if ( !paging_mode_translate(d) )
     {
         if ( !is_iommu_enabled(d) )
             return 0;
--- xen/include/asm-x86/pci.h.orig
+++ xen/include/asm-x86/pci.h
@@ -1,6 +1,8 @@
 #ifndef __X86_PCI_H__
 #define __X86_PCI_H__
 
+#include <xen/mm.h>
+
 #define CF8_BDF(cf8)     (  ((cf8) & 0x00ffff00) >> 8)
 #define CF8_ADDR_LO(cf8) (   (cf8) & 0x000000fc)
 #define CF8_ADDR_HI(cf8) (  ((cf8) & 0x0f000000) >> 16)
@@ -20,7 +22,18 @@ struct arch_pci_dev {
      * them don't race (de)initialization and hence don't strictly need any
      * locking.
      */
+    union {
+        /* Subset of struct arch_iommu's fields, to be used in dom_io. */
+        struct {
+            uint64_t pgd_maddr;
+        } vtd;
+        struct {
+            struct page_info *root_table;
+        } amd;
+    };
     domid_t pseudo_domid;
+    mfn_t leaf_mfn;
+    struct page_list_head pgtables_list;
 };
 
 int pci_conf_write_intercept(unsigned int seg, unsigned int bdf,
--- xen/drivers/passthrough/amd/iommu.h.orig
+++ xen/drivers/passthrough/amd/iommu.h
@@ -223,7 +223,8 @@ int amd_iommu_init_late(void);
 int amd_iommu_update_ivrs_mapping_acpi(void);
 int iov_adjust_irq_affinities(void);
 
-int amd_iommu_quarantine_init(struct domain *d);
+int amd_iommu_quarantine_init(struct pci_dev *pdev);
+void amd_iommu_quarantine_teardown(struct pci_dev *pdev);
 
 /* mapping functions */
 int __must_check amd_iommu_map_page(struct domain *d, dfn_t dfn,
--- xen/drivers/passthrough/amd/iommu_map.c.orig
+++ xen/drivers/passthrough/amd/iommu_map.c
@@ -528,64 +528,135 @@ int amd_iommu_reserve_domain_unity_unmap
     return rc;
 }
 
-int __init amd_iommu_quarantine_init(struct domain *d)
+static int fill_qpt(union amd_iommu_pte *this, unsigned int level,
+                    struct page_info *pgs[IOMMU_MAX_PT_LEVELS])
 {
-    struct domain_iommu *hd = dom_iommu(d);
+    struct domain_iommu *hd = dom_iommu(dom_io);
+    unsigned int i;
+    int rc = 0;
+
+    for ( i = 0; !rc && i < PTE_PER_TABLE_SIZE; ++i )
+    {
+        union amd_iommu_pte *pte = &this[i], *next;
+
+        if ( !pte->pr )
+        {
+            if ( !pgs[level] )
+            {
+                /*
+                 * The pgtable allocator is fine for the leaf page, as well as
+                 * page table pages, and the resulting allocations are always
+                 * zeroed.
+                 */
+                pgs[level] = iommu_alloc_pgtable(hd);
+                if ( !pgs[level] )
+                {
+                    rc = -ENOMEM;
+                    break;
+                }
+
+                if ( level )
+                {
+                    next = __map_domain_page(pgs[level]);
+                    rc = fill_qpt(next, level - 1, pgs);
+                    unmap_domain_page(next);
+                }
+            }
+
+            /*
+             * PDEs are essentially a subset of PTEs, so this function
+             * is fine to use even at the leaf.
+             */
+            set_iommu_pde_present(pte, mfn_x(page_to_mfn(pgs[level])), level,
+                                  true, true);
+        }
+        else if ( level && pte->next_level )
+        {
+            next = map_domain_page(_mfn(pte->mfn));
+            rc = fill_qpt(next, level - 1, pgs);
+            unmap_domain_page(next);
+        }
+    }
+
+    return rc;
+}
+
+int amd_iommu_quarantine_init(struct pci_dev *pdev)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
     unsigned long end_gfn =
         1ul << (DEFAULT_DOMAIN_ADDRESS_WIDTH - PAGE_SHIFT);
     unsigned int level = amd_iommu_get_paging_mode(end_gfn);
-    union amd_iommu_pte *table;
+    unsigned int req_id = get_dma_requestor_id(pdev->seg, pdev->sbdf.bdf);
+    const struct ivrs_mappings *ivrs_mappings = get_ivrs_mappings(pdev->seg);
+    int rc;
 
-    if ( hd->arch.amd.root_table )
+    ASSERT(pcidevs_locked());
+    ASSERT(!hd->arch.amd.root_table);
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
+
+    ASSERT(pdev->arch.pseudo_domid != DOMID_INVALID);
+
+    if ( pdev->arch.amd.root_table )
     {
-        ASSERT_UNREACHABLE();
+        clear_domain_page(pdev->arch.leaf_mfn);
         return 0;
     }
 
-    spin_lock(&hd->arch.mapping_lock);
-
-    hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
-    if ( !hd->arch.amd.root_table )
-        goto out;
-
-    table = __map_domain_page(hd->arch.amd.root_table);
-    while ( level )
+    pdev->arch.amd.root_table = iommu_alloc_pgtable(hd);
+    if ( !pdev->arch.amd.root_table )
+        return -ENOMEM;
+
+    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
+    hd->arch.amd.root_table = pdev->arch.amd.root_table;
+
+    rc = amd_iommu_reserve_domain_unity_map(dom_io,
+                                            ivrs_mappings[req_id].unity_map,
+                                            0);
+
+    iommu_identity_map_teardown(dom_io);
+    hd->arch.amd.root_table = NULL;
+
+    if ( rc )
+        printk("%pp: quarantine unity mapping failed\n", &pdev->sbdf);
+    else
     {
-        struct page_info *pg;
-        unsigned int i;
+        union amd_iommu_pte *root;
+        struct page_info *pgs[IOMMU_MAX_PT_LEVELS] = {};
 
-        /*
-         * The pgtable allocator is fine for the leaf page, as well as
-         * page table pages, and the resulting allocations are always
-         * zeroed.
-         */
-        pg = iommu_alloc_pgtable(hd);
-        if ( !pg )
-            break;
+        spin_lock(&hd->arch.mapping_lock);
 
-        for ( i = 0; i < PTE_PER_TABLE_SIZE; i++ )
-        {
-            union amd_iommu_pte *pde = &table[i];
+        root = __map_domain_page(pdev->arch.amd.root_table);
+        rc = fill_qpt(root, level - 1, pgs);
+        unmap_domain_page(root);
 
-            /*
-             * PDEs are essentially a subset of PTEs, so this function
-             * is fine to use even at the leaf.
-             */
-            set_iommu_pde_present(pde, mfn_x(page_to_mfn(pg)), level - 1,
-                                  false, true);
-        }
+        pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
 
-        unmap_domain_page(table);
-        table = __map_domain_page(pg);
-        level--;
+        spin_unlock(&hd->arch.mapping_lock);
     }
-    unmap_domain_page(table);
 
- out:
-    spin_unlock(&hd->arch.mapping_lock);
+    page_list_move(&pdev->arch.pgtables_list, &hd->arch.pgtables.list);
+
+    if ( rc )
+        amd_iommu_quarantine_teardown(pdev);
+
+    return rc;
+}
+
+void amd_iommu_quarantine_teardown(struct pci_dev *pdev)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
+
+    ASSERT(pcidevs_locked());
+
+    if ( !pdev->arch.amd.root_table )
+        return;
 
-    /* Pages leaked in failure case */
-    return level ? -ENOMEM : 0;
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
+    page_list_move(&hd->arch.pgtables.list, &pdev->arch.pgtables_list);
+    while ( iommu_free_pgtables(dom_io) == -ERESTART )
+        /* nothing */;
+    pdev->arch.amd.root_table = NULL;
 }
 
 /*
--- xen/drivers/passthrough/amd/pci_amd_iommu.c.orig
+++ xen/drivers/passthrough/amd/pci_amd_iommu.c
@@ -122,6 +122,8 @@ static int __must_check amd_iommu_setup_
     u8 bus = pdev->bus;
     const struct domain_iommu *hd = dom_iommu(domain);
     const struct ivrs_mappings *ivrs_dev;
+    const struct page_info *root_pg;
+    domid_t domid;
 
     BUG_ON(!hd->arch.amd.paging_mode || !iommu->dev_table.buffer);
 
@@ -141,14 +143,25 @@ static int __must_check amd_iommu_setup_
     dte = &table[req_id];
     ivrs_dev = &get_ivrs_mappings(iommu->seg)[req_id];
 
+    if ( domain != dom_io )
+    {
+        root_pg = hd->arch.amd.root_table;
+        domid = domain->domain_id;
+    }
+    else
+    {
+        root_pg = pdev->arch.amd.root_table;
+        domid = pdev->arch.pseudo_domid;
+    }
+
     spin_lock_irqsave(&iommu->lock, flags);
 
     if ( !dte->v || !dte->tv )
     {
         /* bind DTE to domain page-tables */
         rc = amd_iommu_set_root_page_table(
-                 dte, page_to_maddr(hd->arch.amd.root_table),
-                 domain->domain_id, hd->arch.amd.paging_mode, sr_flags);
+                 dte, page_to_maddr(root_pg), domid,
+                 hd->arch.amd.paging_mode, sr_flags);
         if ( rc )
         {
             ASSERT(rc < 0);
@@ -172,7 +185,7 @@ static int __must_check amd_iommu_setup_
 
         amd_iommu_flush_device(iommu, req_id);
     }
-    else if ( dte->pt_root != mfn_x(page_to_mfn(hd->arch.amd.root_table)) )
+    else if ( dte->pt_root != mfn_x(page_to_mfn(root_pg)) )
     {
         /*
          * Strictly speaking if the device is the only one with this requestor
@@ -185,8 +198,8 @@ static int __must_check amd_iommu_setup_
             rc = -EOPNOTSUPP;
         else
             rc = amd_iommu_set_root_page_table(
-                     dte, page_to_maddr(hd->arch.amd.root_table),
-                     domain->domain_id, hd->arch.amd.paging_mode, sr_flags);
+                     dte, page_to_maddr(root_pg), domid,
+                     hd->arch.amd.paging_mode, sr_flags);
         if ( rc < 0 )
         {
             spin_unlock_irqrestore(&iommu->lock, flags);
@@ -205,6 +218,7 @@ static int __must_check amd_iommu_setup_
               * intended anyway.
               */
              !pdev->domain->is_dying &&
+             pdev->domain != dom_io &&
              (any_pdev_behind_iommu(pdev->domain, pdev, iommu) ||
               pdev->phantom_stride) )
             printk(" %pp: reassignment may cause %pd data corruption\n",
@@ -234,9 +248,8 @@ static int __must_check amd_iommu_setup_
     AMD_IOMMU_DEBUG("Setup I/O page table: device id = %#x, type = %#x, "
                     "root table = %#"PRIx64", "
                     "domain = %d, paging mode = %d\n",
-                    req_id, pdev->type,
-                    page_to_maddr(hd->arch.amd.root_table),
-                    domain->domain_id, hd->arch.amd.paging_mode);
+                    req_id, pdev->type, page_to_maddr(root_pg),
+                    domid, hd->arch.amd.paging_mode);
 
     ASSERT(pcidevs_locked());
 
@@ -305,7 +318,7 @@ int amd_iommu_alloc_root(struct domain *
 {
     struct domain_iommu *hd = dom_iommu(d);
 
-    if ( unlikely(!hd->arch.amd.root_table) )
+    if ( unlikely(!hd->arch.amd.root_table) && d != dom_io )
     {
         hd->arch.amd.root_table = iommu_alloc_pgtable(hd);
         if ( !hd->arch.amd.root_table )
@@ -396,7 +409,7 @@ static void amd_iommu_disable_domain_dev
 
         AMD_IOMMU_DEBUG("Disable: device id = %#x, "
                         "domain = %d, paging mode = %d\n",
-                        req_id,  domain->domain_id,
+                        req_id, dte->domain_id,
                         dom_iommu(domain)->arch.amd.paging_mode);
     }
     spin_unlock_irqrestore(&iommu->lock, flags);
@@ -608,6 +621,8 @@ static int amd_iommu_remove_device(u8 de
 
     amd_iommu_disable_domain_device(pdev->domain, iommu, devfn, pdev);
 
+    amd_iommu_quarantine_teardown(pdev);
+
     iommu_free_domid(pdev->arch.pseudo_domid, iommu->domid_map);
     pdev->arch.pseudo_domid = DOMID_INVALID;
 
--- xen/drivers/passthrough/iommu.c.orig
+++ xen/drivers/passthrough/iommu.c
@@ -424,21 +424,21 @@ int iommu_iotlb_flush_all(struct domain
     return rc;
 }
 
-static int __init iommu_quarantine_init(void)
+int iommu_quarantine_dev_init(device_t *dev)
 {
     const struct domain_iommu *hd = dom_iommu(dom_io);
-    int rc;
 
-    dom_io->options |= XEN_DOMCTL_CDF_iommu;
+    if ( !iommu_quarantine || !hd->platform_ops->quarantine_init )
+        return 0;
 
-    rc = iommu_domain_init(dom_io, 0);
-    if ( rc )
-        return rc;
+    return iommu_call(hd->platform_ops, quarantine_init, dev);
+}
 
-    if ( !hd->platform_ops->quarantine_init )
-        return 0;
+static int __init iommu_quarantine_init(void)
+{
+    dom_io->options |= XEN_DOMCTL_CDF_iommu;
 
-    return hd->platform_ops->quarantine_init(dom_io);
+    return iommu_domain_init(dom_io, 0);
 }
 
 int __init iommu_setup(void)
--- xen/drivers/passthrough/pci.c.orig
+++ xen/drivers/passthrough/pci.c
@@ -858,9 +858,16 @@ static int deassign_device(struct domain
         return -ENODEV;
 
     /* De-assignment from dom_io should de-quarantine the device */
-    target = ((pdev->quarantine || iommu_quarantine) &&
-              pdev->domain != dom_io) ?
-        dom_io : hardware_domain;
+    if ( (pdev->quarantine || iommu_quarantine) && pdev->domain != dom_io )
+    {
+        ret = iommu_quarantine_dev_init(pci_to_dev(pdev));
+        if ( ret )
+           return ret;
+
+        target = dom_io;
+    }
+    else
+        target = hardware_domain;
 
     while ( pdev->phantom_stride )
     {
@@ -1441,6 +1448,13 @@ static int assign_device(struct domain *
         msixtbl_init(d);
     }
 
+    if ( pdev->domain != dom_io )
+    {
+        rc = iommu_quarantine_dev_init(pci_to_dev(pdev));
+        if ( rc )
+            goto done;
+    }
+
     pdev->fault.count = 0;
 
     if ( (rc = hd->platform_ops->assign_device(d, devfn, pci_to_dev(pdev), flag)) )
--- xen/drivers/passthrough/vtd/iommu.c.orig
+++ xen/drivers/passthrough/vtd/iommu.c
@@ -43,6 +43,12 @@
 #include "vtd.h"
 #include "../ats.h"
 
+#define DEVICE_DOMID(d, pdev) ((d) != dom_io ? (d)->domain_id \
+                                             : (pdev)->arch.pseudo_domid)
+#define DEVICE_PGTABLE(d, pdev) ((d) != dom_io \
+                                 ? dom_iommu(d)->arch.vtd.pgd_maddr \
+                                 : (pdev)->arch.vtd.pgd_maddr)
+
 /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
 bool __read_mostly untrusted_msi;
 
@@ -85,13 +91,18 @@ static int get_iommu_did(domid_t domid,
 
 #define DID_FIELD_WIDTH 16
 #define DID_HIGH_OFFSET 8
+
+/*
+ * This function may have "context" passed as NULL, to merely obtain a DID
+ * for "domid".
+ */
 static int context_set_domain_id(struct context_entry *context,
                                  domid_t domid, struct vtd_iommu *iommu)
 {
     unsigned long nr_dom, i;
     int found = 0;
 
-    ASSERT(spin_is_locked(&iommu->lock));
+    ASSERT(pcidevs_locked());
 
     nr_dom = cap_ndoms(iommu->cap);
     i = find_first_bit(iommu->domid_bitmap, nr_dom);
@@ -117,8 +128,13 @@ static int context_set_domain_id(struct
     }
 
     set_bit(i, iommu->domid_bitmap);
-    context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
-    context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+
+    if ( context )
+    {
+        context->hi &= ~(((1 << DID_FIELD_WIDTH) - 1) << DID_HIGH_OFFSET);
+        context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
+    }
+
     return 0;
 }
 
@@ -168,8 +184,12 @@ static void check_cleanup_domid_map(stru
                                     const struct pci_dev *exclude,
                                     struct vtd_iommu *iommu)
 {
-    bool found = any_pdev_behind_iommu(d, exclude, iommu);
+    bool found;
+
+    if ( d == dom_io )
+        return;
 
+    found = any_pdev_behind_iommu(d, exclude, iommu);
     /*
      * Hidden devices are associated with DomXEN but usable by the hardware
      * domain. Hence they need considering here as well.
@@ -1414,7 +1434,7 @@ int domain_context_mapping_one(
         domid = iommu->domid_map[prev_did];
         if ( domid < DOMID_FIRST_RESERVED )
             prev_dom = rcu_lock_domain_by_id(domid);
-        else if ( domid == DOMID_IO )
+        else if ( pdev ? domid == pdev->arch.pseudo_domid : domid > DOMID_MASK )
             prev_dom = rcu_lock_domain(dom_io);
         if ( !prev_dom )
         {
@@ -1570,15 +1590,12 @@ int domain_context_mapping_one(
     {
         if ( !prev_dom )
             ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                           domain->domain_id);
+                                           DEVICE_DOMID(domain, pdev));
         else if ( prev_dom != domain ) /* Avoid infinite recursion. */
-        {
-            hd = dom_iommu(prev_dom);
             ret = domain_context_mapping_one(prev_dom, iommu, bus, devfn, pdev,
-                                             domain->domain_id,
-                                             hd->arch.vtd.pgd_maddr,
+                                             DEVICE_DOMID(prev_dom, pdev),
+                                             DEVICE_PGTABLE(prev_dom, pdev),
                                              mode & MAP_WITH_RMRR) < 0;
-        }
         else
             ret = 1;
 
@@ -1600,7 +1617,7 @@ static int domain_context_mapping(struct
 {
     struct acpi_drhd_unit *drhd;
     const struct acpi_rmrr_unit *rmrr;
-    paddr_t pgd_maddr = dom_iommu(domain)->arch.vtd.pgd_maddr;
+    paddr_t pgd_maddr = DEVICE_PGTABLE(domain, pdev);
     domid_t orig_domid = pdev->arch.pseudo_domid;
     int ret = 0;
     unsigned int i, mode = 0;
@@ -1633,7 +1650,7 @@ static int domain_context_mapping(struct
         break;
     }
 
-    if ( domain != pdev->domain )
+    if ( domain != pdev->domain && pdev->domain != dom_io )
     {
         if ( pdev->domain->is_dying )
             mode |= MAP_OWNER_DYING;
@@ -1672,8 +1689,8 @@ static int domain_context_mapping(struct
         if ( iommu_debug )
             printk(VTDPREFIX "%pd:PCIe: map %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
-        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, domain->domain_id, pgd_maddr,
+        ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn, pdev,
+                                         DEVICE_DOMID(domain, pdev), pgd_maddr,
                                          mode);
         if ( ret > 0 )
             ret = 0;
@@ -1696,8 +1713,8 @@ static int domain_context_mapping(struct
                    domain, &PCI_SBDF3(seg, bus, devfn));
 
         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                         pdev, domain->domain_id, pgd_maddr,
-                                         mode);
+                                         pdev, DEVICE_DOMID(domain, pdev),
+                                         pgd_maddr, mode);
         if ( ret < 0 )
             break;
         prev_present = ret;
@@ -1725,8 +1742,8 @@ static int domain_context_mapping(struct
          */
         if ( ret >= 0 )
             ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
-                                             NULL, domain->domain_id, pgd_maddr,
-                                             mode);
+                                             NULL, DEVICE_DOMID(domain, pdev),
+                                             pgd_maddr, mode);
 
         /*
          * Devices behind PCIe-to-PCI/PCIx bridge may generate different
@@ -1741,8 +1758,8 @@ static int domain_context_mapping(struct
         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
              (secbus != pdev->bus || pdev->devfn != 0) )
             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
-                                             NULL, domain->domain_id, pgd_maddr,
-                                             mode);
+                                             NULL, DEVICE_DOMID(domain, pdev),
+                                             pgd_maddr, mode);
 
         if ( ret )
         {
@@ -1889,7 +1906,7 @@ static const struct acpi_drhd_unit *doma
             printk(VTDPREFIX "%pd:PCIe: unmap %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                       domain->domain_id);
+                                       DEVICE_DOMID(domain, pdev));
         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
             disable_ats_device(pdev);
 
@@ -1900,7 +1917,7 @@ static const struct acpi_drhd_unit *doma
             printk(VTDPREFIX "%pd:PCI: unmap %pp\n",
                    domain, &PCI_SBDF3(seg, bus, devfn));
         ret = domain_context_unmap_one(domain, iommu, bus, devfn,
-                                       domain->domain_id);
+                                       DEVICE_DOMID(domain, pdev));
         if ( ret )
             break;
 
@@ -1923,18 +1940,12 @@ static const struct acpi_drhd_unit *doma
             break;
         }
 
+        ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
+                                       DEVICE_DOMID(domain, pdev));
         /* PCIe to PCI/PCIx bridge */
-        if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
-        {
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-                                           domain->domain_id);
-            if ( !ret )
-                ret = domain_context_unmap_one(domain, iommu, secbus, 0,
-                                               domain->domain_id);
-        }
-        else /* Legacy PCI bridge */
-            ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn,
-                                           domain->domain_id);
+        if ( !ret && pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
+            ret = domain_context_unmap_one(domain, iommu, secbus, 0,
+                                           DEVICE_DOMID(domain, pdev));
 
         break;
 
@@ -1977,6 +1988,26 @@ static void iommu_domain_teardown(struct
         cleanup_domid_map(d->domain_id, drhd->iommu);
 }
 
+static void quarantine_teardown(struct pci_dev *pdev,
+                                const struct acpi_drhd_unit *drhd)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
+
+    ASSERT(pcidevs_locked());
+
+    if ( !pdev->arch.vtd.pgd_maddr )
+        return;
+
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
+    page_list_move(&hd->arch.pgtables.list, &pdev->arch.pgtables_list);
+    while ( iommu_free_pgtables(dom_io) == -ERESTART )
+        /* nothing */;
+    pdev->arch.vtd.pgd_maddr = 0;
+
+    if ( drhd )
+        cleanup_domid_map(pdev->arch.pseudo_domid, drhd->iommu);
+}
+
 static int __must_check intel_iommu_map_page(struct domain *d, dfn_t dfn,
                                              mfn_t mfn, unsigned int flags,
                                              unsigned int *flush_flags)
@@ -2201,6 +2232,8 @@ static int intel_iommu_remove_device(u8
                                rmrr->end_address, 0);
     }
 
+    quarantine_teardown(pdev, drhd);
+
     if ( drhd )
     {
         iommu_free_domid(pdev->arch.pseudo_domid,
@@ -2896,69 +2929,135 @@ static void vtd_dump_page_tables(struct
                               agaw_to_level(hd->arch.vtd.agaw), 0, 0);
 }
 
-static int __init intel_iommu_quarantine_init(struct domain *d)
+static int fill_qpt(struct dma_pte *this, unsigned int level,
+                    struct page_info *pgs[6])
 {
-    struct domain_iommu *hd = dom_iommu(d);
+    struct domain_iommu *hd = dom_iommu(dom_io);
+    unsigned int i;
+    int rc = 0;
+
+    for ( i = 0; !rc && i < PTE_NUM; ++i )
+    {
+        struct dma_pte *pte = &this[i], *next;
+
+        if ( !dma_pte_present(*pte) )
+        {
+            if ( !pgs[level] )
+            {
+                /*
+                 * The pgtable allocator is fine for the leaf page, as well as
+                 * page table pages, and the resulting allocations are always
+                 * zeroed.
+                 */
+                pgs[level] = iommu_alloc_pgtable(hd);
+                if ( !pgs[level] )
+                {
+                    rc = -ENOMEM;
+                    break;
+                }
+
+                if ( level )
+                {
+                    next = map_vtd_domain_page(page_to_maddr(pgs[level]));
+                    rc = fill_qpt(next, level - 1, pgs);
+                    unmap_vtd_domain_page(next);
+                }
+            }
+
+            dma_set_pte_addr(*pte, page_to_maddr(pgs[level]));
+            dma_set_pte_readable(*pte);
+            dma_set_pte_writable(*pte);
+        }
+        else if ( level && !dma_pte_superpage(*pte) )
+        {
+            next = map_vtd_domain_page(dma_pte_addr(*pte));
+            rc = fill_qpt(next, level - 1, pgs);
+            unmap_vtd_domain_page(next);
+        }
+    }
+
+    return rc;
+}
+
+static int intel_iommu_quarantine_init(struct pci_dev *pdev)
+{
+    struct domain_iommu *hd = dom_iommu(dom_io);
     struct page_info *pg;
-    struct dma_pte *parent;
     unsigned int agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
     unsigned int level = agaw_to_level(agaw);
-    int rc = 0;
+    const struct acpi_drhd_unit *drhd;
+    const struct acpi_rmrr_unit *rmrr;
+    unsigned int i, bdf;
+    bool rmrr_found = false;
+    int rc;
 
-    spin_lock(&hd->arch.mapping_lock);
+    ASSERT(pcidevs_locked());
+    ASSERT(!hd->arch.vtd.pgd_maddr);
+    ASSERT(page_list_empty(&hd->arch.pgtables.list));
 
-    if ( hd->arch.vtd.pgd_maddr )
+    if ( pdev->arch.vtd.pgd_maddr )
     {
-        ASSERT_UNREACHABLE();
-        goto out;
+        clear_domain_page(pdev->arch.leaf_mfn);
+        return 0;
     }
 
-    pg = iommu_alloc_pgtable(hd);
+    drhd = acpi_find_matched_drhd_unit(pdev);
+    if ( !drhd )
+        return -ENODEV;
 
-    rc = -ENOMEM;
+    pg = iommu_alloc_pgtable(hd);
     if ( !pg )
-        goto out;
+        return -ENOMEM;
 
+    rc = context_set_domain_id(NULL, pdev->arch.pseudo_domid, drhd->iommu);
+
+    /* Transiently install the root into DomIO, for iommu_identity_mapping(). */
     hd->arch.vtd.pgd_maddr = page_to_maddr(pg);
 
-    parent = map_vtd_domain_page(hd->arch.vtd.pgd_maddr);
-    while ( level )
+    for_each_rmrr_device ( rmrr, bdf, i )
     {
-        uint64_t maddr;
-        unsigned int offset;
-
-        /*
-         * The pgtable allocator is fine for the leaf page, as well as
-         * page table pages, and the resulting allocations are always
-         * zeroed.
-         */
-        pg = iommu_alloc_pgtable(hd);
-
-        if ( !pg )
-            goto out;
+        if ( rc )
+            break;
 
-        maddr = page_to_maddr(pg);
-        for ( offset = 0; offset < PTE_NUM; offset++ )
+        if ( rmrr->segment == pdev->seg && bdf == pdev->sbdf.bdf )
         {
-            struct dma_pte *pte = &parent[offset];
+            rmrr_found = true;
 
-            dma_set_pte_addr(*pte, maddr);
-            dma_set_pte_readable(*pte);
+            rc = iommu_identity_mapping(dom_io, p2m_access_rw,
+                                        rmrr->base_address, rmrr->end_address,
+                                        0);
+            if ( rc )
+                printk(XENLOG_ERR VTDPREFIX
+                       "%pp: RMRR quarantine mapping failed\n",
+                       &pdev->sbdf);
         }
-        iommu_sync_cache(parent, PAGE_SIZE);
+    }
 
-        unmap_vtd_domain_page(parent);
-        parent = map_vtd_domain_page(maddr);
-        level--;
+    iommu_identity_map_teardown(dom_io);
+    hd->arch.vtd.pgd_maddr = 0;
+    pdev->arch.vtd.pgd_maddr = page_to_maddr(pg);
+
+    if ( !rc )
+    {
+        struct dma_pte *root;
+        struct page_info *pgs[6] = {};
+
+        spin_lock(&hd->arch.mapping_lock);
+
+        root = map_vtd_domain_page(pdev->arch.vtd.pgd_maddr);
+        rc = fill_qpt(root, level - 1, pgs);
+        unmap_vtd_domain_page(root);
+
+        pdev->arch.leaf_mfn = page_to_mfn(pgs[0]);
+
+        spin_unlock(&hd->arch.mapping_lock);
     }
-    unmap_vtd_domain_page(parent);
 
-    rc = 0;
+    page_list_move(&pdev->arch.pgtables_list, &hd->arch.pgtables.list);
 
- out:
-    spin_unlock(&hd->arch.mapping_lock);
+    if ( rc )
+        quarantine_teardown(pdev, drhd);
 
-    /* Pages may be leaked in failure case */
     return rc;
 }
 
--- xen/drivers/passthrough/vtd/iommu.h.orig
+++ xen/drivers/passthrough/vtd/iommu.h
@@ -509,7 +509,7 @@ struct vtd_iommu {
     u32 nr_pt_levels;
     u64	cap;
     u64	ecap;
-    spinlock_t lock; /* protect context, domain ids */
+    spinlock_t lock; /* protect context */
     spinlock_t register_lock; /* protect iommu register handling */
     u64 root_maddr; /* root entry machine address */
     nodeid_t node;
--- xen/include/xen/iommu.h.orig
+++ xen/include/xen/iommu.h
@@ -234,7 +234,7 @@ typedef int iommu_grdm_t(xen_pfn_t start
 struct iommu_ops {
     int (*init)(struct domain *d);
     void (*hwdom_init)(struct domain *d);
-    int (*quarantine_init)(struct domain *d);
+    int (*quarantine_init)(device_t *dev);
     int (*add_device)(u8 devfn, device_t *dev);
     int (*enable_device)(device_t *dev);
     int (*remove_device)(u8 devfn, device_t *dev);
@@ -352,6 +352,7 @@ int __must_check iommu_suspend(void);
 void iommu_resume(void);
 void iommu_crash_shutdown(void);
 int iommu_get_reserved_device_memory(iommu_grdm_t *, void *);
+int iommu_quarantine_dev_init(device_t *dev);
 
 #ifdef CONFIG_HAS_PCI
 int iommu_do_pci_domctl(struct xen_domctl *, struct domain *d,