Diffstat (limited to 'src/libsystemd/sd-event/sd-event.c')
 src/libsystemd/sd-event/sd-event.c | 180
 1 file changed, 151 insertions(+), 29 deletions(-)
diff --git a/src/libsystemd/sd-event/sd-event.c b/src/libsystemd/sd-event/sd-event.c
index b06e00fae2..37efe3f425 100644
--- a/src/libsystemd/sd-event/sd-event.c
+++ b/src/libsystemd/sd-event/sd-event.c
@@ -1820,6 +1820,29 @@ static void event_free_inode_data(
free(d);
}
+static void event_gc_inotify_data(
+ sd_event *e,
+ struct inotify_data *d) {
+
+ assert(e);
+
+ /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
+ * any inode with it anymore, which in turn happens if no event source of this priority is interested
+ * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
+ * (under the expectation that the GC is called again once the counter is decremented). */
+
+ if (!d)
+ return;
+
+ if (!hashmap_isempty(d->inodes))
+ return;
+
+ if (d->n_busy > 0)
+ return;
+
+ event_free_inotify_data(e, d);
+}
+
static void event_gc_inode_data(
sd_event *e,
struct inode_data *d) {
@@ -1837,8 +1860,7 @@ static void event_gc_inode_data(
inotify_data = d->inotify_data;
event_free_inode_data(e, d);
- if (inotify_data && hashmap_isempty(inotify_data->inodes))
- event_free_inotify_data(e, inotify_data);
+ event_gc_inotify_data(e, inotify_data);
}
static int event_make_inode_data(
@@ -1967,24 +1989,25 @@ static int inotify_exit_callback(sd_event_source *s, const struct inotify_event
return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}
-_public_ int sd_event_add_inotify(
+static int event_add_inotify_fd_internal(
sd_event *e,
sd_event_source **ret,
- const char *path,
+ int fd,
+ bool donate,
uint32_t mask,
sd_event_inotify_handler_t callback,
void *userdata) {
+ _cleanup_close_ int donated_fd = donate ? fd : -1;
+ _cleanup_(source_freep) sd_event_source *s = NULL;
struct inotify_data *inotify_data = NULL;
struct inode_data *inode_data = NULL;
- _cleanup_close_ int fd = -1;
- _cleanup_(source_freep) sd_event_source *s = NULL;
struct stat st;
int r;
assert_return(e, -EINVAL);
assert_return(e = event_resolve(e), -ENOPKG);
- assert_return(path, -EINVAL);
+ assert_return(fd >= 0, -EBADF);
assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
assert_return(!event_pid_changed(e), -ECHILD);
@@ -1997,12 +2020,6 @@ _public_ int sd_event_add_inotify(
if (mask & IN_MASK_ADD)
return -EINVAL;
- fd = open(path, O_PATH|O_CLOEXEC|
- (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
- (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
- if (fd < 0)
- return -errno;
-
if (fstat(fd, &st) < 0)
return -errno;
@@ -2022,14 +2039,24 @@ _public_ int sd_event_add_inotify(
r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
if (r < 0) {
- event_free_inotify_data(e, inotify_data);
+ event_gc_inotify_data(e, inotify_data);
return r;
}
/* Keep the O_PATH fd around until the first iteration of the loop, so that we can still change the priority of
* the event source, until then, for which we need the original inode. */
if (inode_data->fd < 0) {
- inode_data->fd = TAKE_FD(fd);
+ if (donated_fd >= 0)
+ inode_data->fd = TAKE_FD(donated_fd);
+ else {
+ inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
+ if (inode_data->fd < 0) {
+ r = -errno;
+ event_gc_inode_data(e, inode_data);
+ return r;
+ }
+ }
+
LIST_PREPEND(to_close, e->inode_data_to_close, inode_data);
}
@@ -2042,8 +2069,6 @@ _public_ int sd_event_add_inotify(
if (r < 0)
return r;
- (void) sd_event_source_set_description(s, path);
-
if (ret)
*ret = s;
TAKE_PTR(s);
@@ -2051,6 +2076,48 @@ _public_ int sd_event_add_inotify(
return 0;
}
+_public_ int sd_event_add_inotify_fd(
+ sd_event *e,
+ sd_event_source **ret,
+ int fd,
+ uint32_t mask,
+ sd_event_inotify_handler_t callback,
+ void *userdata) {
+
+ return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
+}
+
+_public_ int sd_event_add_inotify(
+ sd_event *e,
+ sd_event_source **ret,
+ const char *path,
+ uint32_t mask,
+ sd_event_inotify_handler_t callback,
+ void *userdata) {
+
+ sd_event_source *s;
+ int fd, r;
+
+ assert_return(path, -EINVAL);
+
+ fd = open(path, O_PATH|O_CLOEXEC|
+ (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
+ (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
+ if (fd < 0)
+ return -errno;
+
+ r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
+ if (r < 0)
+ return r;
+
+ (void) sd_event_source_set_description(s, path);
+
+ if (ret)
+ *ret = s;
+
+ return r;
+}
+
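The new sd_event_add_inotify_fd() entry point takes an already-open O_PATH file descriptor rather than a path. Because it calls the internal helper with donate=false, the descriptor is duplicated (F_DUPFD_CLOEXEC) when needed and ownership stays with the caller, who may close it as soon as the call returns. A minimal usage sketch under that assumption; the handler, function and directory names are illustrative and not part of this change:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/inotify.h>
#include <systemd/sd-event.h>

/* Hypothetical handler: called for every queued inotify event on the watched inode. */
static int on_change(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        return 0;
}

int watch_dir(sd_event *e, const char *dir) {
        int fd, r;

        fd = open(dir, O_PATH|O_CLOEXEC|O_DIRECTORY);
        if (fd < 0)
                return -errno;

        /* Passing NULL for ret leaves the source "floating", i.e. owned by the event loop.
         * The loop keeps its own duplicate of fd (or reuses an existing watch on the same
         * inode), hence closing our copy right after the call is fine on success and failure. */
        r = sd_event_add_inotify_fd(e, NULL, fd, IN_CREATE|IN_MOVED_TO, on_change, NULL);
        close(fd);

        return r;
}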
static sd_event_source* event_source_free(sd_event_source *s) {
if (!s)
return NULL;
@@ -2841,7 +2908,7 @@ fail:
return r;
}
-static int event_source_leave_ratelimit(sd_event_source *s) {
+static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
int r;
assert(s);
@@ -2873,6 +2940,30 @@ static int event_source_leave_ratelimit(sd_event_source *s) {
ratelimit_reset(&s->rate_limit);
log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
+
+ if (run_callback && s->ratelimit_expire_callback) {
+ s->dispatching = true;
+ r = s->ratelimit_expire_callback(s, s->userdata);
+ s->dispatching = false;
+
+ if (r < 0) {
+ log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
+ strna(s->description),
+ event_source_type_to_string(s->type),
+ s->exit_on_failure ? "exiting" : "disabling");
+
+ if (s->exit_on_failure)
+ (void) sd_event_exit(s->event, r);
+ }
+
+ if (s->n_ref == 0)
+ source_free(s);
+ else if (r < 0)
+ sd_event_source_set_enabled(s, SD_EVENT_OFF);
+
+ return 1;
+ }
+
return 0;
fail:
@@ -3072,6 +3163,7 @@ static int process_timer(
struct clock_data *d) {
sd_event_source *s;
+ bool callback_invoked = false;
int r;
assert(e);
@@ -3089,9 +3181,11 @@ static int process_timer(
* again. */
assert(s->ratelimited);
- r = event_source_leave_ratelimit(s);
+ r = event_source_leave_ratelimit(s, /* run_callback */ true);
if (r < 0)
return r;
+ else if (r == 1)
+ callback_invoked = true;
continue;
}
@@ -3106,7 +3200,7 @@ static int process_timer(
event_source_time_prioq_reshuffle(s);
}
- return 0;
+ return callback_invoked;
}
static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
@@ -3556,13 +3650,23 @@ static int source_dispatch(sd_event_source *s) {
sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
assert(d->buffer_filled >= sz);
+ /* If the inotify callback destroys the event source then this likely means we don't need to
+ * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
+ * free it immediately, then we couldn't drop the event from the inotify event queue without
+ * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
+ * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
+ * explicitly GC it after we are done dropping the inotify event from the buffer. */
+ d->n_busy++;
r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
+ d->n_busy--;
- /* When no event is pending anymore on this inotify object, then let's drop the event from the
- * buffer. */
+ /* When no event is pending anymore on this inotify object, then let's drop the event from
+ * the inotify event queue buffer. */
if (d->n_pending == 0)
event_inotify_data_drop(e, d, sz);
+ /* Now we don't want to access 'd' anymore, it's OK to GC now. */
+ event_gc_inotify_data(e, d);
break;
}
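The n_busy counter exists for handlers that drop the last reference to their own event source from inside the callback: source_disconnect() then runs event_gc_inode_data(), which now defers freeing the inotify_data until source_dispatch() has dropped the event from the queue buffer and calls event_gc_inotify_data() itself. A hypothetical handler that triggers exactly that path (names are illustrative):

/* Hypothetical: stop watching once the inode itself is gone. Unreffing the source here
 * previously risked freeing the inotify_data while source_dispatch() still needed it;
 * with the busy counter the object survives until the queued event has been dropped. */
static int on_gone(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
        if (ev->mask & (IN_DELETE_SELF|IN_IGNORED))
                sd_event_source_unref(s);

        return 0;
}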
@@ -3587,7 +3691,7 @@ static int source_dispatch(sd_event_source *s) {
if (s->n_ref == 0)
source_free(s);
else if (r < 0)
- sd_event_source_set_enabled(s, SD_EVENT_OFF);
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
return 1;
}
@@ -3628,7 +3732,7 @@ static int event_prepare(sd_event *e) {
if (s->n_ref == 0)
source_free(s);
else if (r < 0)
- sd_event_source_set_enabled(s, SD_EVENT_OFF);
+ assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
}
return 0;
@@ -4020,15 +4124,15 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
if (r < 0)
goto finish;
- r = process_timer(e, e->timestamp.realtime, &e->realtime);
+ r = process_inotify(e);
if (r < 0)
goto finish;
- r = process_timer(e, e->timestamp.boottime, &e->boottime);
+ r = process_timer(e, e->timestamp.realtime, &e->realtime);
if (r < 0)
goto finish;
- r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
+ r = process_timer(e, e->timestamp.boottime, &e->boottime);
if (r < 0)
goto finish;
@@ -4040,9 +4144,20 @@ _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
if (r < 0)
goto finish;
- r = process_inotify(e);
+ r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
if (r < 0)
goto finish;
+ else if (r == 1) {
+ /* The ratelimit expiry callback was called. Let's postpone processing pending sources and
+ * put the loop back into its initial state, so that the next iteration also evaluates
+ * sources that were potentially re-enabled by the callback.
+ *
+ * Wondering why we treat only this invocation of process_timer() differently? Once an event
+ * source is ratelimited we essentially turn it into a CLOCK_MONOTONIC timer, hence the
+ * ratelimit expiry callback is never triggered for any other timer type. */
+ r = 0;
+ goto finish;
+ }
if (event_next_pending(e)) {
e->state = SD_EVENT_PENDING;
@@ -4411,7 +4526,7 @@ _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval
/* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
* non-ratelimited. */
- r = event_source_leave_ratelimit(s);
+ r = event_source_leave_ratelimit(s, /* run_callback */ false);
if (r < 0)
return r;
@@ -4419,6 +4534,13 @@ _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval
return 0;
}
+_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
+ assert_return(s, -EINVAL);
+
+ s->ratelimit_expire_callback = callback;
+ return 0;
+}
+
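Combined with sd_event_source_set_ratelimit(), the new setter lets callers learn when a source leaves its rate-limit state; per the process_timer() changes above, the callback is invoked from CLOCK_MONOTONIC timer processing once the interval elapses. A hypothetical setup (handler and function names are illustrative):

/* Hypothetical expiry handler: runs once the source is no longer rate-limited, e.g. to
 * catch up on work that was deferred while dispatching was throttled. */
static int on_ratelimit_expired(sd_event_source *s, void *userdata) {
        return 0;
}

int setup_ratelimit(sd_event_source *s) {
        int r;

        /* Allow at most 5 dispatches per 1 s interval; beyond that the source is paused. */
        r = sd_event_source_set_ratelimit(s, 1000000 /* µs */, 5);
        if (r < 0)
                return r;

        return sd_event_source_set_ratelimit_expire_callback(s, on_ratelimit_expired);
}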
_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
assert_return(s, -EINVAL);