File xsa380-2.patch of Package xen

gnttab: replace mapkind()

mapkind() doesn't scale very well with larger maptrack entry counts,
using a brute force linear search through all entries, with the only
option of an early loop exit if a matching writable entry was found.
Introduce a radix tree alongside the main maptrack table, thus
allowing much faster MFN-based lookup. To avoid the need to actually
allocate space for the individual nodes, encode the two counters in the
node pointers themselves, thus limiting the number of permitted
simultaneous r/o and r/w mappings of the same MFN to 2³¹-1 (64-bit) /
2¹⁵-1 (32-bit) each.

To avoid enforcing an unnecessarily low bound on the number of
simultaneous mappings of a single MFN, introduce
radix_tree_{ulong_to_ptr,ptr_to_ulong} paralleling
radix_tree_{int_to_ptr,ptr_to_int}.

As a consequence locking changes are also applicable: With there no
longer being any inspection of the remote domain's active entries,
there's also no need anymore to hold the remote domain's grant table
lock. And since we're no longer iterating over the local domain's map
track table, the lock in map_grant_ref() can also be dropped before the
new maptrack entry actually gets populated.

As a nice side effect this also reduces the number of IOMMU operations
in unmap_common(): Previously we would have "established" a readable
mapping whenever we didn't find a writable entry anymore (yet, of
course, at least one readable one). But we only need to do this if we
actually dropped the last writable entry, not if there were none already
before.

This is part of XSA-380.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

# Commit b6da9d0414d69c2682214ee3ecf9816fcac500d0
# Date 2021-08-27 10:54:46 +0200
# Author Jan Beulich <jbeulich@suse.com>
# Committer Jan Beulich <jbeulich@suse.com>
gnttab: avoid triggering assertion in radix_tree_ulong_to_ptr()

Relevant quotes from the C11 standard:

"Except where explicitly stated otherwise, for the purposes of this
 subclause unnamed members of objects of structure and union type do not
 participate in initialization. Unnamed members of structure objects
 have indeterminate value even after initialization."

"If there are fewer initializers in a brace-enclosed list than there are
 elements or members of an aggregate, [...], the remainder of the
 aggregate shall be initialized implicitly the same as objects that have
 static storage duration."

"If an object that has static or thread storage duration is not
 initialized explicitly, then:
 [...]
 — if it is an aggregate, every member is initialized (recursively)
   according to these rules, and any padding is initialized to zero
   bits;
 [...]"

"A bit-field declaration with no declarator, but only a colon and a
 width, indicates an unnamed bit-field." Footnote: "An unnamed bit-field
 structure member is useful for padding to conform to externally imposed
 layouts."

"There may be unnamed padding within a structure object, but not at its
 beginning."

Which makes me conclude:
- Whether an unnamed bit-field member is an unnamed member or padding is
  unclear, and hence also whether the last quote above would render the
  big endian case of the structure declaration invalid.
- Whether the number of members of an aggregate includes unnamed ones is
  also not really clear.
- The initializer in map_grant_ref() initializes all fields of the "cnt"
  sub-structure of the union, so assuming the second quote above applies
  here (indirectly), the compiler isn't required to implicitly
  initialize the rest (i.e. in particular any padding) like would happen
  for static storage duration objects.

Gcc 7.4.1 can be observed (apparently in debug builds only) to translate
aforementioned initializer to a read-modify-write operation of a stack
variable, leaving unchanged the top two bits of whatever was previously
in that stack slot. Clearly if either of the two bits were set,
radix_tree_ulong_to_ptr()'s assertion would trigger.

Therefore, to be on the safe side, add an explicit padding field for the
non-big-endian-bitfields case and give a dummy name to both padding
fields.

Fixes: 9781b51efde2 ("gnttab: replace mapkind()")
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

--- a/xen/common/grant_table.c
+++ b/xen/common/grant_table.c
@@ -308,34 +308,6 @@ static int __get_paged_frame(unsigned lo
     return rc;
 }
 
-static inline void
-double_gt_lock(struct grant_table *lgt, struct grant_table *rgt)
-{
-    /*
-     * See mapkind() for why the write lock is also required for the
-     * remote domain.
-     */
-    if ( lgt < rgt )
-    {
-        grant_write_lock(lgt);
-        grant_write_lock(rgt);
-    }
-    else
-    {
-        if ( lgt != rgt )
-            grant_write_lock(rgt);
-        grant_write_lock(lgt);
-    }
-}
-
-static inline void
-double_gt_unlock(struct grant_table *lgt, struct grant_table *rgt)
-{
-    grant_write_unlock(lgt);
-    if ( lgt != rgt )
-        grant_write_unlock(rgt);
-}
-
 static inline int
 __get_maptrack_handle(
     struct grant_table *t,
@@ -754,41 +726,20 @@ static struct active_grant_entry *grant_
     return ERR_PTR(-EINVAL);
 }
 
-#define MAPKIND_READ 1
-#define MAPKIND_WRITE 2
-static unsigned int mapkind(
-    struct grant_table *lgt, const struct domain *rd, unsigned long mfn)
-{
-    struct grant_mapping *map;
-    grant_handle_t handle;
-    unsigned int kind = 0;
-
-    /*
-     * Must have the local domain's grant table write lock when
-     * iterating over its maptrack entries.
-     */
-    ASSERT(percpu_rw_is_write_locked(&lgt->lock));
-    /*
-     * Must have the remote domain's grant table write lock while
-     * counting its active entries.
-     */
-    ASSERT(percpu_rw_is_write_locked(&rd->grant_table->lock));
-
-    for ( handle = 0; !(kind & MAPKIND_WRITE) &&
-                      handle < lgt->maptrack_limit; handle++ )
-    {
-        smp_rmb();
-        map = &maptrack_entry(lgt, handle);
-        if ( !(map->flags & (GNTMAP_device_map|GNTMAP_host_map)) ||
-             map->domid != rd->domain_id )
-            continue;
-        if ( _active_entry(rd->grant_table, map->ref).frame == mfn )
-            kind |= map->flags & GNTMAP_readonly ?
-                    MAPKIND_READ : MAPKIND_WRITE;
-    }
-
-    return kind;
-}
+union maptrack_node {
+    struct {
+        /* Radix tree slot pointers use two of the bits. */
+#ifdef __BIG_ENDIAN_BITFIELD
+        unsigned long _0 : 2;
+#endif
+        unsigned long rd : BITS_PER_LONG / 2 - 1;
+        unsigned long wr : BITS_PER_LONG / 2 - 1;
+#ifndef __BIG_ENDIAN_BITFIELD
+        unsigned long _0 : 2;
+#endif
+    } cnt;
+    unsigned long raw;
+};
 
 /*
  * Returns 0 if TLB flush / invalidate required by caller.
@@ -813,7 +764,6 @@ __gnttab_map_grant_ref(
     struct grant_mapping *mt;
     grant_entry_header_t *shah;
     uint16_t *status;
-    bool_t need_iommu;
 
     led = current;
     ld = led->domain;
@@ -1015,31 +965,75 @@ __gnttab_map_grant_ref(
         goto undo_out;
     }
 
-    need_iommu = gnttab_need_iommu_mapping(ld);
-    if ( need_iommu )
+    if ( gnttab_need_iommu_mapping(ld) )
     {
+        union maptrack_node node = {
+            .cnt.rd = !!(op->flags & GNTMAP_readonly),
+            .cnt.wr = !(op->flags & GNTMAP_readonly),
+        };
+        int err;
+        void **slot = NULL;
         unsigned int kind;
 
-        double_gt_lock(lgt, rgt);
+        grant_write_lock(lgt);
+
+        err = radix_tree_insert(&lgt->maptrack_tree, frame,
+                                radix_tree_ulong_to_ptr(node.raw));
+        if ( err == -EEXIST )
+        {
+            slot = radix_tree_lookup_slot(&lgt->maptrack_tree, frame);
+            if ( likely(slot) )
+            {
+                node.raw = radix_tree_ptr_to_ulong(*slot);
+                err = -EBUSY;
+
+                /* Update node only when refcount doesn't overflow. */
+                if ( op->flags & GNTMAP_readonly ? ++node.cnt.rd
+                                                 : ++node.cnt.wr )
+                {
+                    radix_tree_replace_slot(slot,
+                                            radix_tree_ulong_to_ptr(node.raw));
+                    err = 0;
+                }
+            }
+            else
+                ASSERT_UNREACHABLE();
+        }
 
         /*
          * We're not translated, so we know that dfns and mfns are
          * the same things, so the IOMMU entry is always 1-to-1.
          */
-        kind = mapkind(lgt, rd, frame);
-        if ( !(op->flags & GNTMAP_readonly) &&
-             !(kind & MAPKIND_WRITE) )
+        if ( !(op->flags & GNTMAP_readonly) && node.cnt.wr == 1 )
             kind = IOMMUF_readable | IOMMUF_writable;
-        else if ( !kind )
+        else if ( (op->flags & GNTMAP_readonly) &&
+                  node.cnt.rd == 1 && !node.cnt.wr )
             kind = IOMMUF_readable;
         else
             kind = 0;
-        if ( kind && iommu_map_page(ld, frame, frame, kind) )
+        if ( err ||
+             (kind && iommu_map_page(ld, frame, frame, kind)) )
         {
-            double_gt_unlock(lgt, rgt);
+            if ( !err )
+            {
+                if ( slot )
+                {
+                    op->flags & GNTMAP_readonly ? node.cnt.rd--
+                                                : node.cnt.wr--;
+                    radix_tree_replace_slot(slot,
+                                            radix_tree_ulong_to_ptr(node.raw));
+                }
+                else
+                    radix_tree_delete(&lgt->maptrack_tree, frame);
+            }
+
             rc = GNTST_general_error;
-            goto undo_out;
         }
+
+        grant_write_unlock(lgt);
+
+        if ( rc != GNTST_okay )
+            goto undo_out;
     }
 
     TRACE_1D(TRC_MEM_PAGE_GRANT_MAP, op->dom);
@@ -1047,10 +1041,6 @@ __gnttab_map_grant_ref(
     /*
      * All maptrack entry users check mt->flags first before using the
      * other fields so just ensure the flags field is stored last.
-     *
-     * However, if gnttab_need_iommu_mapping() then this would race
-     * with a concurrent mapkind() call (on an unmap, for example)
-     * and a lock is required.
      */
     mt = &maptrack_entry(lgt, handle);
     mt->domid = op->dom;
@@ -1058,9 +1048,6 @@ __gnttab_map_grant_ref(
     smp_wmb();
     write_atomic(&mt->flags, op->flags);
 
-    if ( need_iommu )
-        double_gt_unlock(lgt, rgt);
-
     op->dev_bus_addr = (u64)frame << PAGE_SHIFT;
     op->handle       = handle;
     op->status       = GNTST_okay;
@@ -1280,18 +1267,33 @@ __gnttab_unmap_common(
 
     if ( rc == GNTST_okay && gnttab_need_iommu_mapping(ld) )
     {
-        unsigned int kind;
+        void **slot;
+        union maptrack_node node;
         int err = 0;
 
-        double_gt_lock(lgt, rgt);
+        grant_write_lock(lgt);
+        slot = radix_tree_lookup_slot(&lgt->maptrack_tree, op->frame);
+        node.raw = likely(slot) ? radix_tree_ptr_to_ulong(*slot) : 0;
+
+        /* Refcount must not underflow. */
+        if ( !(flags & GNTMAP_readonly ? node.cnt.rd--
+                                       : node.cnt.wr--) )
+            BUG();
 
-        kind = mapkind(lgt, rd, op->frame);
-        if ( !kind )
+        if ( !node.raw )
             err = iommu_unmap_page(ld, op->frame);
-        else if ( !(kind & MAPKIND_WRITE) )
+        else if ( !(flags & GNTMAP_readonly) && !node.cnt.wr )
             err = iommu_map_page(ld, op->frame, op->frame, IOMMUF_readable);
 
-        double_gt_unlock(lgt, rgt);
+        if ( err )
+            ;
+        else if ( !node.raw )
+            radix_tree_delete(&lgt->maptrack_tree, op->frame);
+        else
+            radix_tree_replace_slot(slot,
+                                    radix_tree_ulong_to_ptr(node.raw));
+
+        grant_write_unlock(lgt);
 
         if ( err )
             rc = GNTST_general_error;
@@ -3409,6 +3411,8 @@ grant_table_create(
     if ( t->maptrack == NULL )
         goto no_mem_2;
 
+    radix_tree_init(&t->maptrack_tree);
+
     /* Shared grant table. */
     if ( (t->shared_raw = xzalloc_array(void *, max_grant_frames)) == NULL )
         goto no_mem_3;
@@ -3470,6 +3474,8 @@ int gnttab_release_mappings(struct domai
 
     for ( handle = gt->maptrack_limit; handle; )
     {
+        unsigned long mfn;
+
         /*
          * Deal with full pages such that their freeing (in the body of the
          * if()) remains simple.
@@ -3571,17 +3577,31 @@ int gnttab_release_mappings(struct domai
         if ( act->pin == 0 )
             gnttab_clear_flag(_GTF_reading, status);
 
+        mfn = act->frame;
+
         active_entry_release(act);
         grant_read_unlock(rgt);
 
         rcu_unlock_domain(rd);
 
         map->flags = 0;
+
+        /*
+         * This is excessive in that a single such call would suffice per
+         * mapped MFN (or none at all, if no entry was ever inserted). But it
+         * should be the common case for an MFN to be mapped just once, and
+         * this way we don't need to further maintain the counters. We also
+         * don't want to leave cleaning up of the tree as a whole to the end
+         * of the function, as this could take quite some time.
+         */
+        radix_tree_delete(&gt->maptrack_tree, mfn);
     }
 
     gt->maptrack_limit = 0;
     FREE_XENHEAP_PAGE(gt->maptrack[0]);
 
+    radix_tree_destroy(&gt->maptrack_tree, NULL);
+
     return 0;
 }
 
--- a/xen/include/xen/grant_table.h
+++ b/xen/include/xen/grant_table.h
@@ -23,6 +23,7 @@
 #ifndef __XEN_GRANT_TABLE_H__
 #define __XEN_GRANT_TABLE_H__
 
+#include <xen/radix-tree.h>
 #include <xen/rwlock.h>
 #include <public/grant_table.h>
 #include <asm/page.h>
@@ -75,9 +76,14 @@ struct grant_table {
     grant_status_t       **status;
     /* Active grant table. */
     struct active_grant_entry **active;
-    /* Mapping tracking table per vcpu. */
+    /* Handle-indexed tracking table of mappings. */
     struct grant_mapping **maptrack;
     /*
+     * MFN-indexed tracking tree of mappings, if needed.  Note that this is
+     * protected by @lock, not @maptrack_lock.
+     */
+    struct radix_tree_root maptrack_tree;
+    /*
      * Number of available maptrack entries.  For cleanup purposes it is
      * important to realize that this field and @maptrack further down will
      * only ever be accessed by the local domain.  Thus it is okay to clean
--- a/xen/include/xen/radix-tree.h
+++ b/xen/include/xen/radix-tree.h
@@ -190,6 +190,25 @@ static inline int radix_tree_ptr_to_int(
     return (int)((long)ptr >> 2);
 }
 
+/**
+ * radix_tree_{ulong_to_ptr,ptr_to_ulong}:
+ *
+ * Same for unsigned long values. Beware though that only BITS_PER_LONG-2
+ * bits are actually usable for the value.
+ */
+static inline void *radix_tree_ulong_to_ptr(unsigned long val)
+{
+    unsigned long ptr = (val << 2) | 0x2;
+    ASSERT((ptr >> 2) == val);
+    return (void *)ptr;
+}
+
+static inline unsigned long radix_tree_ptr_to_ulong(void *ptr)
+{
+    ASSERT(((unsigned long)ptr & 0x3) == 0x2);
+    return (unsigned long)ptr >> 2;
+}
+
 int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
openSUSE Build Service is sponsored by