File 67977677-AMD-IOMMU-atomically-update-IRTE.patch of Package xen.37689
# Commit b953a99da98d63a7c827248abc450d4e8e015ab6
# Date 2025-01-27 13:05:11 +0100
# Author Roger Pau Monne <roger.pau@citrix.com>
# Committer Roger Pau Monne <roger.pau@citrix.com>
iommu/amd: atomically update IRTE
Either when using a 32bit Interrupt Remapping Entry or a 128bit one update
the entry atomically, by using cmpxchg unconditionally as IOMMU depends on
it. No longer disable the entry by setting RemapEn = 0 ahead of updating
it. As a consequence of not toggling RemapEn ahead of the update the
Interrupt Remapping Table needs to be flushed after the entry update.
This avoids a window where the IRTE has RemapEn = 0, which can lead to
IO_PAGE_FAULT if the underlying interrupt source is not masked.
There's no guidance in AMD-Vi specification about how IRTE update should be
performed as opposed to DTE updating which has specific guidance. However
DTE updating claims that reads will always be at least 128bits in size, and
hence for the purposes here assume that reads and caching of the IRTE
entries in either 32 or 128 bit format will be done atomically from
the IOMMU.
Note that as part of introducing a new raw128 field in the IRTE struct, the
current raw field is renamed to raw64 to explicitly contain the size in the
field name.
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
--- a/xen/drivers/passthrough/amd/iommu_intr.c
+++ b/xen/drivers/passthrough/amd/iommu_intr.c
@@ -39,7 +39,8 @@ union irte32 {
};
union irte128 {
- uint64_t raw[2];
+ uint64_t raw64[2];
+ __uint128_t raw128;
struct {
bool remap_en:1;
bool sup_io_pf:1;
@@ -186,7 +187,7 @@ static void free_intremap_entry(const st
if ( iommu->ctrl.ga_en )
{
- ACCESS_ONCE(entry.ptr128->raw[0]) = 0;
+ ACCESS_ONCE(entry.ptr128->raw64[0]) = 0;
/*
* Low half (containing RemapEn) needs to be cleared first. Note that
* strictly speaking smp_wmb() isn't enough, as conceptually it expands
@@ -196,7 +197,7 @@ static void free_intremap_entry(const st
* variant will do.
*/
smp_wmb();
- entry.ptr128->raw[1] = 0;
+ entry.ptr128->raw64[1] = 0;
}
else
ACCESS_ONCE(entry.ptr32->raw) = 0;
@@ -211,7 +212,7 @@ static void update_intremap_entry(const
{
if ( iommu->ctrl.ga_en )
{
- union irte128 irte = {
+ const union irte128 irte = {
.full = {
.remap_en = true,
.int_type = int_type,
@@ -221,19 +222,26 @@ static void update_intremap_entry(const
.vector = vector,
},
};
+ __uint128_t old = entry.ptr128->raw128;
+ __uint128_t res = cmpxchg16b(&entry.ptr128->raw128, &old,
+ &irte.raw128);
- ASSERT(!entry.ptr128->full.remap_en);
- entry.ptr128->raw[1] = irte.raw[1];
/*
- * High half needs to be set before low one (containing RemapEn). See
- * comment in free_intremap_entry() regarding the choice of barrier.
+ * Hardware does not update the IRTE behind our backs, so the return
+ * value should match "old".
*/
- smp_wmb();
- ACCESS_ONCE(entry.ptr128->raw[0]) = irte.raw[0];
+ if ( res != old )
+ {
+ printk(XENLOG_ERR
+ "unexpected IRTE %016lx_%016lx (expected %016lx_%016lx)\n",
+ (uint64_t)(res >> 64), (uint64_t)res,
+ (uint64_t)(old >> 64), (uint64_t)old);
+ ASSERT_UNREACHABLE();
+ }
}
else
{
- union irte32 irte = {
+ const union irte32 irte = {
.flds = {
.remap_en = true,
.int_type = int_type,
@@ -298,21 +306,13 @@ static int update_intremap_entry_from_io
entry = get_intremap_entry(iommu, req_id, offset);
- /* The RemapEn fields match for all formats. */
- while ( iommu->enabled && entry.ptr32->flds.remap_en )
- {
- entry.ptr32->flds.remap_en = false;
- spin_unlock(lock);
-
- amd_iommu_flush_intremap(iommu, req_id);
-
- spin_lock(lock);
- }
-
update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
spin_unlock_irqrestore(lock, flags);
+ if ( !fresh )
+ amd_iommu_flush_intremap(iommu, req_id);
+
set_rte_index(rte, offset);
return 0;
@@ -321,7 +321,7 @@ static int update_intremap_entry_from_io
void cf_check amd_iommu_ioapic_update_ire(
unsigned int apic, unsigned int pin, uint64_t rte)
{
- struct IO_APIC_route_entry old_rte, new_rte;
+ struct IO_APIC_route_entry new_rte;
int seg, bdf, rc;
struct amd_iommu *iommu;
unsigned int idx;
@@ -345,14 +345,6 @@ void cf_check amd_iommu_ioapic_update_ir
return;
}
- old_rte = __ioapic_read_entry(apic, pin, true);
- /* mask the interrupt while we change the intremap table */
- if ( !old_rte.mask )
- {
- old_rte.mask = 1;
- __ioapic_write_entry(apic, pin, true, old_rte);
- }
-
/* Update interrupt remapping entry */
rc = update_intremap_entry_from_ioapic(
bdf, iommu, &new_rte,
@@ -424,6 +416,7 @@ static int update_intremap_entry_from_ms
uint8_t delivery_mode, vector, dest_mode;
spinlock_t *lock;
unsigned int dest, offset, i;
+ bool fresh = false;
req_id = get_dma_requestor_id(iommu->seg, bdf);
alias_id = get_intremap_requestor_id(iommu->seg, bdf);
@@ -467,26 +460,21 @@ static int update_intremap_entry_from_ms
return -ENOSPC;
}
*remap_index = offset;
+ fresh = true;
}
entry = get_intremap_entry(iommu, req_id, offset);
- /* The RemapEn fields match for all formats. */
- while ( iommu->enabled && entry.ptr32->flds.remap_en )
- {
- entry.ptr32->flds.remap_en = false;
- spin_unlock(lock);
+ update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
+ spin_unlock_irqrestore(lock, flags);
+ if ( !fresh )
+ {
amd_iommu_flush_intremap(iommu, req_id);
if ( alias_id != req_id )
amd_iommu_flush_intremap(iommu, alias_id);
-
- spin_lock(lock);
}
- update_intremap_entry(iommu, entry, vector, delivery_mode, dest_mode, dest);
- spin_unlock_irqrestore(lock, flags);
-
*data = (msg->data & ~(INTREMAP_MAX_ENTRIES - 1)) | offset;
/*
@@ -734,7 +722,7 @@ static void dump_intremap_table(const st
for ( count = 0; count < nr; count++ )
{
if ( iommu->ctrl.ga_en
- ? !tbl.ptr128[count].raw[0] && !tbl.ptr128[count].raw[1]
+ ? !tbl.ptr128[count].raw64[0] && !tbl.ptr128[count].raw64[1]
: !tbl.ptr32[count].raw )
continue;
@@ -747,7 +735,8 @@ static void dump_intremap_table(const st
if ( iommu->ctrl.ga_en )
printk(" IRTE[%03x] %016lx_%016lx\n",
- count, tbl.ptr128[count].raw[1], tbl.ptr128[count].raw[0]);
+ count, tbl.ptr128[count].raw64[1],
+ tbl.ptr128[count].raw64[0]);
else
printk(" IRTE[%03x] %08x\n", count, tbl.ptr32[count].raw);
}