File 1001-udev-use-lock-when-selecting-the-highest-priority-de.patch of Package systemd.30673

From 212aa5e819ced67e152a4efc43e0fab38a3f60a0 Mon Sep 17 00:00:00 2001
From: Franck Bui <fbui@suse.com>
Date: Wed, 3 Mar 2021 17:34:39 +0100
Subject: [PATCH 1001/1001] udev: use lock when selecting the highest priority
 devlink

Commit 30f6dce62cb3a738b20253f2192270607c31b55b introduced a lock-less
algorithm, which is a kind of spinlock in userspace spinning on a heavy loop
(!?!), that is supposed to fix the race that might happen when multiple
workers/devices claim for the same symlink.

But the algorithm is boggus as every worker will prefer itself and will try to
"steal" the link from another worker with the same priority.

Rather than trying to fix the algorithm and close all possible races, let's use
a much simplest approach which consists in using a lock. This was originally
implemented by Martin Wilck: https://github.com/systemd/systemd/pull/8667 but
somehow upstream preferred the more complex lock-less algo without any proof of
benefits (quite the opposite since there're only drawback AFAICS).

In fact the figures we collected so far tends to show that the lock approach is
even faster than the racy version without any fix.

This patch basically drop the lock-less algo introduced by commit
30f6dce62cb3a738b20253f2192270607c31b55b and reuses Martin's lock approach with
some minor improvements.

Compare to the old implementation (before commit 30f6dce62cb3a738b20253f21), it
changes the moment when the symlink are created: it's now done *after* udev DB
has been updated, hence removing the risk for another worker to overwrite a
just created symlink with a higher priority which has not yet been registed in
the DB. Latest versions of upstream calls update_devnode() twice, see
https://github.com/systemd/systemd/pull/19560#issuecomment-836693914 for the
"rationale".

Update v2:
----------
v249.13 includes a bunch of reworks of this lock-less algorithm, which confirm
how this design was a bad idea (upstream eventually got rid of it and replaced
it with the lock approach since v252). This patch was rebased to drop these
changes as they're useless when the link directories are protected with locks.

v249.13 also introduced a new format for the link directory entries. I'm not
sure how relevant such changes are for a stable release but it increases the
risk of regression significantly. So this part was also dropped.

[fbui: fixes bsc#1181192]

Update v3:
---------

Optimize the case where hundred workers claim the same symlink with the same
priority.

Each time a new device claimed a symlink, the worker searched for the device
claiming the symlink with the highest prio by examining the prio of each device
(including the new one) and recreate the symlink accordingly. However this can
be improved by stopping the search as soon as it encounters a device with the
same prio since in this case it can assume that the symlink is currently owned
by a device with equal or greater prio.

[fbui: fixes bsc#1203141]
---
 src/udev/udev-event.c |   9 +-
 src/udev/udev-node.c  | 321 ++++++++++--------------------------------
 2 files changed, 82 insertions(+), 248 deletions(-)

diff --git a/src/udev/udev-event.c b/src/udev/udev-event.c
index 2661ed7933..173cd37618 100644
--- a/src/udev/udev-event.c
+++ b/src/udev/udev-event.c
@@ -1051,10 +1051,6 @@ int udev_event_execute_rules(
         if (r < 0)
                 return r;
 
-        r = update_devnode(event);
-        if (r < 0)
-                return r;
-
         /* preserve old, or get new initialization timestamp */
         r = device_ensure_usec_initialized(dev, event->dev_db_clone);
         if (r < 0)
@@ -1069,8 +1065,11 @@ int udev_event_execute_rules(
         if (r < 0)
                 return log_device_debug_errno(dev, r, "Failed to update database under /run/udev/data/: %m");
 
-        device_set_is_initialized(dev);
+        r = update_devnode(event);
+        if (r < 0)
+                return r;
 
+        device_set_is_initialized(dev);
         return 0;
 }
 
diff --git a/src/udev/udev-node.c b/src/udev/udev-node.c
index d9309efa25..7be933a90f 100644
--- a/src/udev/udev-node.c
+++ b/src/udev/udev-node.c
@@ -17,26 +17,19 @@
 #include "format-util.h"
 #include "fs-util.h"
 #include "hexdecoct.h"
+#include "lockfile-util.h"
 #include "mkdir.h"
-#include "parse-util.h"
 #include "path-util.h"
-#include "random-util.h"
 #include "selinux-util.h"
 #include "smack-util.h"
 #include "stat-util.h"
 #include "stdio-util.h"
 #include "string-util.h"
 #include "strxcpyx.h"
-#include "time-util.h"
 #include "udev-node.h"
 #include "user-util.h"
 
 #define CREATE_LINK_MAX_RETRIES        128
-#define LINK_UPDATE_MAX_RETRIES        128
-#define CREATE_STACK_LINK_MAX_RETRIES  128
-#define UPDATE_TIMESTAMP_MAX_RETRIES   128
-#define MAX_RANDOM_DELAY (250 * USEC_PER_MSEC)
-#define MIN_RANDOM_DELAY ( 50 * USEC_PER_MSEC)
 #define UDEV_NODE_HASH_KEY SD_ID128_MAKE(b9,6a,f1,ce,40,31,44,1a,9e,19,ec,8b,ae,f3,e3,2f)
 
 static int create_symlink(const char *target, const char *slink) {
@@ -122,8 +115,14 @@ static int link_find_prioritized(sd_device *dev, bool add, const char *stackdir,
         assert(stackdir);
         assert(ret);
 
-        /* Find device node of device with highest priority. This returns 1 if a device found, 0 if no
-         * device found, or a negative errno. */
+        /* When 'add' is true, search for a device having the same or a higher prio. If such a device is
+         * found, return 0 and 'ret' is unchanged. Otherwise return 1 and 'ret' contains the device node of
+         * the passed device.
+         *
+         * When 'add' is false, find device node of device with highest priority. This returns 1 if a device
+         * found 0 if no device found.
+         *
+         * In both cases it returns a negative errno in case of errors. */
 
         if (add) {
                 const char *devnode;
@@ -139,6 +138,8 @@ static int link_find_prioritized(sd_device *dev, bool add, const char *stackdir,
                 target = strdup(devnode);
                 if (!target)
                         return -ENOMEM;
+
+                log_device_debug(dev, "Attempting to claim '%s' with priority %i", stackdir, priority);
         }
 
         dir = opendir(stackdir);
@@ -157,67 +158,50 @@ static int link_find_prioritized(sd_device *dev, bool add, const char *stackdir,
                 return r;
 
         FOREACH_DIRENT_ALL(dent, dir, break) {
-                _cleanup_free_ char *path = NULL, *buf = NULL;
-                int tmp_prio;
+                _cleanup_(sd_device_unrefp) sd_device *dev_db = NULL;
+                const char *devnode;
+                int db_prio = 0;
 
+                if (dent->d_name[0] == '\0')
+                        break;
                 if (dent->d_name[0] == '.')
                         continue;
 
-                /* skip ourself */
+                /* did we find ourself? */
                 if (streq(dent->d_name, id))
                         continue;
 
-                path = path_join(stackdir, dent->d_name);
-                if (!path)
-                        return -ENOMEM;
-
-                if (readlink_malloc(path, &buf) >= 0) {
-                        char *devnode;
-
-                        /* New format. The devnode and priority can be obtained from symlink. */
-
-                        devnode = strchr(buf, ':');
-                        if (!devnode || devnode == buf)
-                                continue;
+                log_device_debug(dev, "Found '%s' claiming '%s'", dent->d_name, stackdir);
 
-                        *(devnode++) = '\0';
-                        if (!path_startswith(devnode, "/dev"))
-                                continue;
-
-                        if (safe_atoi(buf, &tmp_prio) < 0)
-                                continue;
-
-                        if (target && tmp_prio <= priority)
-                                continue;
-
-                        r = free_and_strdup(&target, devnode);
-                        if (r < 0)
-                                return r;
-                } else {
-                        _cleanup_(sd_device_unrefp) sd_device *tmp_dev = NULL;
-                        const char *devnode;
+                if (sd_device_new_from_device_id(&dev_db, dent->d_name) < 0)
+                        continue;
 
-                        /* Old format. The devnode and priority must be obtained from uevent and
-                         * udev database files. */
+                if (sd_device_get_devname(dev_db, &devnode) < 0)
+                        continue;
 
-                        if (sd_device_new_from_device_id(&tmp_dev, dent->d_name) < 0)
-                                continue;
+                if (device_get_devlink_priority(dev_db, &db_prio) < 0)
+                        continue;
 
-                        if (device_get_devlink_priority(tmp_dev, &tmp_prio) < 0)
-                                continue;
+                if (target && db_prio < priority)
+                        continue;
 
-                        if (target && tmp_prio <= priority)
-                                continue;
+                if (add) {
+                        assert(target);
+                        /* Exit early as we found an existing device claiming the symlink with same or higher
+                         * prio. */
+                        log_device_debug(dev, "Giving up '%s' in favor of %s (prio=%i)", stackdir, dent->d_name, db_prio);
+                        return 0;
+                }
 
-                        if (sd_device_get_devname(tmp_dev, &devnode) < 0)
-                                continue;
+                if (target && db_prio == priority)
+                        continue;
 
-                        r = free_and_strdup(&target, devnode);
-                        if (r < 0)
-                                return r;
-                }
+                log_device_debug(dev_db, "Device claims priority %i for '%s'", db_prio, stackdir);
 
-                priority = tmp_prio;
+                r = free_and_strdup(&target, devnode);
+                if (r < 0)
+                        return r;
+                priority = db_prio;
         }
 
         *ret = TAKE_PTR(target);
@@ -266,160 +250,12 @@ toolong:
         return size - 1;
 }
 
-static int update_timestamp(sd_device *dev, const char *path, struct stat *prev) {
-        assert(path);
-        assert(prev);
-
-        /* Even if a symlink in the stack directory is created/removed, the mtime of the directory may
-         * not be changed. Why? Let's consider the following situation. For simplicity, let's assume
-         * there exist two udev workers (A and B) and all of them calls link_update() for the same
-         * devlink simultaneously.
-         *
-         * 1. A creates/removes a symlink in the stack directory.
-         * 2. A calls the first stat() in the loop of link_update().
-         * 3. A calls link_find_prioritized().
-         * 4. B creates/removes another symlink in the stack directory, so the result of the step 3 is outdated.
-         * 5. B finishes link_update().
-         * 6. A creates/removes devlink according to the outdated result in the step 3.
-         * 7. A calls the second stat() in the loop of link_update().
-         *
-         * If these 7 steps are processed in this order within a short time period that kernel's timer
-         * does not increase, then even if the contents in the stack directory is changed, the results
-         * of two stat() called by A shows the same timestamp, and A cannot detect the change.
-         *
-         * By calling this function after creating/removing symlinks in the stack directory, the
-         * timestamp of the stack directory is always increased at least in the above step 5, so A can
-         * detect the update. */
-
-        if ((prev->st_mode & S_IFMT) == 0)
-                return 0; /* Does not exist, or previous stat() failed. */
-
-        for (unsigned i = 0; i < UPDATE_TIMESTAMP_MAX_RETRIES; i++) {
-                struct stat st;
-
-                if (stat(path, &st) < 0)
-                        return -errno;
-
-                if (!stat_inode_unmodified(prev, &st))
-                        return 0;
-
-                log_device_debug(dev,
-                                 "%s is modified, but its timestamp is not changed, "
-                                 "updating timestamp after 10ms.",
-                                 path);
-
-                (void) usleep(10 * USEC_PER_MSEC);
-                if (utimensat(AT_FDCWD, path, NULL, 0) < 0)
-                        return -errno;
-        }
-
-        return -ELOOP;
-}
-
-static int update_stack_directory(sd_device *dev, const char *dirname, bool add) {
-        _cleanup_free_ char *filename = NULL, *data = NULL, *buf = NULL;
-        const char *devname, *id;
-        struct stat st = {};
-        int priority, r;
-
-        assert(dev);
-        assert(dirname);
-
-        r = device_get_device_id(dev, &id);
-        if (r < 0)
-                return log_device_debug_errno(dev, r, "Failed to get device id: %m");
-
-        filename = path_join(dirname, id);
-        if (!filename)
-                return log_oom_debug();
-
-        if (!add) {
-                int unlink_error = 0, stat_error = 0;
-
-                if (stat(dirname, &st) < 0) {
-                        if (errno == ENOENT)
-                                return 0; /* The stack directory is already removed. That's OK. */
-                        stat_error = -errno;
-                }
-
-                if (unlink(filename) < 0)
-                        unlink_error = -errno;
-
-                if (rmdir(dirname) >= 0 || errno == ENOENT)
-                        return 0;
-
-                if (unlink_error < 0) {
-                        if (unlink_error == -ENOENT)
-                                return 0;
-
-                        /* If we failed to remove the symlink, then there is almost nothing we can do. */
-                        return log_device_debug_errno(dev, unlink_error, "Failed to remove %s: %m", filename);
-                }
-
-                if (stat_error < 0)
-                        return log_device_debug_errno(dev, stat_error, "Failed to stat %s: %m", dirname);
-
-                /* The symlink was removed. Check if the timestamp of directory is changed. */
-                r = update_timestamp(dev, dirname, &st);
-                if (r < 0 && r != -ENOENT)
-                        return log_device_debug_errno(dev, r, "Failed to update timestamp of %s: %m", dirname);
-
-                return 0;
-        }
-
-        r = sd_device_get_devname(dev, &devname);
-        if (r < 0)
-                return log_device_debug_errno(dev, r, "Failed to get device node: %m");
-
-        r = device_get_devlink_priority(dev, &priority);
-        if (r < 0)
-                return log_device_debug_errno(dev, r, "Failed to get priority of device node symlink: %m");
-
-        if (asprintf(&data, "%i:%s", priority, devname) < 0)
-                return log_oom_debug();
-
-        if (readlink_malloc(filename, &buf) >= 0 && streq(buf, data))
-                return 0;
-
-        if (unlink(filename) < 0 && errno != ENOENT)
-                log_device_debug_errno(dev, errno, "Failed to remove %s, ignoring: %m", filename);
-
-        for (unsigned j = 0; j < CREATE_STACK_LINK_MAX_RETRIES; j++) {
-                /* This may fail with -ENOENT when the parent directory is removed during
-                 * creating the file by another udevd worker. */
-                r = mkdir_p(dirname, 0755);
-                if (r == -ENOENT)
-                        continue;
-                if (r < 0)
-                        return log_device_debug_errno(dev, r, "Failed to create directory %s: %m", dirname);
-
-                if (stat(dirname, &st) < 0) {
-                        if (errno == ENOENT)
-                                continue;
-                        return log_device_debug_errno(dev, errno, "Failed to stat %s: %m", dirname);
-                }
-
-                if (symlink(data, filename) < 0) {
-                        if (errno == ENOENT)
-                                continue;
-                        return log_device_debug_errno(dev, errno, "Failed to create symbolic link %s: %m", filename);
-                }
-
-                /* The symlink was created. Check if the timestamp of directory is changed. */
-                r = update_timestamp(dev, dirname, &st);
-                if (r < 0)
-                        return log_device_debug_errno(dev, r, "Failed to update timestamp of %s: %m", dirname);
-
-                return 0;
-        }
-
-        return log_device_debug_errno(dev, SYNTHETIC_ERRNO(ELOOP), "Failed to create symbolic link %s: %m", filename);
-}
-
 /* manage "stack of names" with possibly specified device priorities */
 static int link_update(sd_device *dev, const char *slink_in, bool add) {
-        _cleanup_free_ char *slink = NULL, *dirname = NULL;
-        const char *slink_name;
+        _cleanup_(release_lock_file) LockFile lf = LOCK_FILE_INIT;
+        _cleanup_free_ char *slink = NULL, *filename = NULL, *dirname = NULL;
+        _cleanup_free_ char *target = NULL;
+        const char *slink_name, *id;
         char name_enc[NAME_MAX+1];
         int r;
 
@@ -439,57 +275,56 @@ static int link_update(sd_device *dev, const char *slink_in, bool add) {
                 return log_device_debug_errno(dev, SYNTHETIC_ERRNO(EINVAL),
                                               "Invalid symbolic link of device node: %s", slink);
 
+        r = device_get_device_id(dev, &id);
+        if (r < 0)
+                return log_device_debug_errno(dev, r, "Failed to get device id: %m");
+
         (void) udev_node_escape_path(slink_name, name_enc, sizeof(name_enc));
         dirname = path_join("/run/udev/links", name_enc);
         if (!dirname)
                 return log_oom_debug();
 
-        r = update_stack_directory(dev, dirname, add);
-        if (r < 0)
-                return r;
-
-        for (unsigned i = 0; i < LINK_UPDATE_MAX_RETRIES; i++) {
-                _cleanup_free_ char *target = NULL;
-                struct stat st1 = {}, st2 = {};
-
-                if (i > 0) {
-                        char buf[FORMAT_TIMESPAN_MAX];
-                        usec_t delay = MIN_RANDOM_DELAY + random_u64_range(MAX_RANDOM_DELAY - MIN_RANDOM_DELAY);
+        filename = path_join(dirname, id);
+        if (!filename)
+                return log_oom_debug();
 
-                        log_device_debug(dev, "Directory %s was updated, retrying to update devlink %s after %s.",
-                                         dirname, slink, format_timespan(buf, sizeof(buf), delay, USEC_PER_MSEC));
-                        (void) usleep(delay);
-                }
+        mkdir_parents(dirname, 0755);
+        r = make_lock_file_for(dirname, LOCK_EX, &lf);
+        if (r < 0)
+                return log_error_errno(r, "failed to lock %s", dirname);
 
-                if (stat(dirname, &st1) < 0 && errno != ENOENT)
-                        return log_device_debug_errno(dev, errno, "Failed to stat %s: %m", dirname);
+        if (!add) {
+                if (unlink(filename) < 0 && errno != ENOENT)
+                        log_device_error_errno(dev, errno, "Failed to remove %s, ignoring: %m", filename);
 
-                r = link_find_prioritized(dev, add, dirname, &target);
+                (void) rmdir(dirname);
+        } else {
+                r = touch_file(filename, true, USEC_INFINITY, UID_INVALID, GID_INVALID, 0444);
                 if (r < 0)
-                        return log_device_debug_errno(dev, r, "Failed to determine device node with the highest priority for '%s': %m", slink);
-                if (r == 0) {
+                        return log_device_error_errno(dev, r, "Failed to create %s: %m", filename);
+        }
+
+        r = link_find_prioritized(dev, add, dirname, &target);
+        if (r < 0)
+                return log_device_error_errno(dev, r, "Failed to find highest priority for symlink '%s': %m", slink);
+        if (r == 0) {
+                if (!add) {
                         log_device_debug(dev, "No reference left for '%s', removing", slink);
 
-                        if (unlink(slink) < 0 && errno != ENOENT)
-                                log_device_debug_errno(dev, errno, "Failed to remove '%s', ignoring: %m", slink);
+                        if (unlink(slink) < 0)
+                                return log_device_error_errno(dev, errno,
+                                                              "Failed to remove '%s', ignoring: %m", slink);
 
                         (void) rmdir_parents(slink, "/dev");
-                        return 0;
                 }
-
-                r = node_symlink(dev, target, slink);
-                if (r < 0)
-                        return r;
-
-                if (stat(dirname, &st2) < 0 && errno != ENOENT)
-                        return log_device_debug_errno(dev, errno, "Failed to stat %s: %m", dirname);
-
-                if (((st1.st_mode & S_IFMT) == 0 && (st2.st_mode & S_IFMT) == 0) ||
-                    stat_inode_unmodified(&st1, &st2))
-                        return 0;
+                return 0;
         }
 
-        return -ELOOP;
+        r = node_symlink(dev, target, slink);
+        if (r < 0)
+                (void) unlink(filename);
+
+        return r;
 }
 
 static int device_get_devpath_by_devnum(sd_device *dev, char **ret) {
-- 
2.35.3

openSUSE Build Service is sponsored by