When PRECISE_TIMER is set with epoll, use timerfd for microsecond precision

The epoll interface ordinarily gives us one-millisecond precision, so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE timer. But when the user has set the new PRECISE_TIMER flag for an event_base (either via the EVENT_BASE_FLAG_PRECISE_TIMER flag or via the EVENT_PRECISE_TIMER environment variable), they presumably want finer granularity. On Linux kernels that are not too old (2.6.27 or later, where TFD_NONBLOCK and TFD_CLOEXEC are available), we can achieve this using the timerfd mechanism, which accepts nanosecond granularity and understands POSIX clocks. It's a little more expensive than just calling epoll_wait(), so we won't do it by default.
author: Nick Mathewson <nickm@torproject.org> 2012-04-26 16:22:03 -0400
committer: Nick Mathewson <nickm@torproject.org> 2012-04-26 16:42:21 -0400
commit: 26c75828b75e4c14fbbdce9212d3114d9926af1f (patch)
tree: cd1322987378cb5a9b81299da5b23f9977a13447 /epoll.c
parent: 7428c78a959210951409803455092edff4bdea35 (diff)
download: libevent-26c75828b75e4c14fbbdce9212d3114d9926af1f.tar.gz
1 files changed, 83 insertions, 1 deletions
diff --git a/epoll.c b/epoll.c
index a40939c4..edd4e18b 100644
--- a/epoll.c
+++ b/epoll.c
@@ -47,6 +47,9 @@
 #ifdef EVENT__HAVE_FCNTL_H
 #include <fcntl.h>
 #endif
+#ifdef EVENT__HAVE_SYS_TIMERFD_H
+#include <sys/timerfd.h>
+#endif
 
 #include "event-internal.h"
 #include "evsignal-internal.h"
@@ -57,10 +60,24 @@
 #include "changelist-internal.h"
 #include "time-internal.h"
 
+#if defined(EVENT__HAVE_SYS_TIMERFD_H) &&			  \
+	defined(EVENT__HAVE_TIMERFD_CREATE) &&			  \
+	defined(HAVE_POSIX_MONOTONIC) && defined(TFD_NONBLOCK) && \
+	defined(TFD_CLOEXEC)
+/* Note that we only use timerfd if TFD_NONBLOCK and TFD_CLOEXEC are available
+   and working.  This means that we can't support it on 2.6.25 (where timerfd
+   was introduced) or 2.6.26, since 2.6.27 introduced those flags.
+ */
+#define USING_TIMERFD
+#endif
+
 struct epollop {
 	struct epoll_event *events;
 	int nevents;
 	int epfd;
+#ifdef USING_TIMERFD
+	int timerfd;
+#endif
 };
 
 static void *epoll_init(struct event_base *);
@@ -147,8 +164,38 @@ epoll_init(struct event_base *base)
 
 	if ((base->flags & EVENT_BASE_FLAG_EPOLL_USE_CHANGELIST) != 0 ||
 	    ((base->flags & EVENT_BASE_FLAG_IGNORE_ENV) == 0 &&
-		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL))
+		evutil_getenv_("EVENT_EPOLL_USE_CHANGELIST") != NULL)) {
+
 		base->evsel = &epollops_changelist;
+	}
+
+#ifdef USING_TIMERFD
+	/*
+	  The epoll interface ordinarily gives us one-millisecond precision,
+	  so on Linux it makes perfect sense to use the CLOCK_MONOTONIC_COARSE
+	  timer.  But when the user has set the new PRECISE_TIMER flag for an
+	  event_base, we can try to use timerfd to give them finer granularity.
+	*/
+	if ((base->flags & EVENT_BASE_FLAG_PRECISE_TIMER) &&
+	    base->monotonic_timer.monotonic_clock == CLOCK_MONOTONIC) {
+		int fd;
+		fd = epollop->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
+		if (epollop->timerfd >= 0) {
+			struct epoll_event epev;
+			epev.data.fd = epollop->timerfd;
+			epev.events = EPOLLIN;
+			if (epoll_ctl(epollop->epfd, EPOLL_CTL_ADD, fd, &epev) < 0) {
+				event_warn("epoll_ctl(timerfd)");
+				close(fd);
+				epollop->timerfd = -1;
+			}
+		} else {
+			event_warn("timerfd_create");
+		}
+	} else {
+		epollop->timerfd = -1;
+	}
+#endif
 
 	evsig_init_(base);
 
@@ -509,6 +556,33 @@ epoll_dispatch(struct event_base *base, struct timeval *tv)
 	int i, res;
 	long timeout = -1;
 
+#ifdef USING_TIMERFD
+	if (epollop->timerfd >= 0) {
+		struct itimerspec is;
+		is.it_interval.tv_sec = 0;
+		is.it_interval.tv_nsec = 0;
+		if (tv == NULL) {
+			/* No timeout; disarm the timer. */
+			is.it_value.tv_sec = 0;
+			is.it_value.tv_nsec = 0;
+		} else {
+			if (tv->tv_sec == 0 && tv->tv_usec == 0) {
+				/* we need to exit immediately; timerfd can't
+				 * do that. */
+				timeout = 0;
+			}
+			is.it_value.tv_sec = tv->tv_sec;
+			is.it_value.tv_nsec = tv->tv_usec * 1000;
+		}
+		/* TODO: we could avoid unnecessary syscalls here by only
+		   calling timerfd_settime when the top timeout changes, or
+		   when we're called with a different timeval.
+		*/
+		if (timerfd_settime(epollop->timerfd, 0, &is, NULL) < 0) {
+			event_warn("timerfd_settime");
+		}
+	} else
+#endif
 	if (tv != NULL) {
 		timeout = evutil_tv_to_msec_(tv);
 		if (timeout < 0 || timeout > MAX_EPOLL_TIMEOUT_MSEC) {
@@ -542,6 +616,10 @@ epoll_dispatch(struct event_base *base, struct timeval *tv)
 	for (i = 0; i < res; i++) {
 		int what = events[i].events;
 		short ev = 0;
+#ifdef USING_TIMERFD
+		if (events[i].data.fd == epollop->timerfd)
+			continue;
+#endif
 
 		if (what & (EPOLLHUP|EPOLLERR)) {
 			ev = EV_READ | EV_WRITE;
@@ -586,6 +664,10 @@ epoll_dealloc(struct event_base *base)
 		mm_free(epollop->events);
 	if (epollop->epfd >= 0)
 		close(epollop->epfd);
+#ifdef USING_TIMERFD
+	if (epollop->timerfd >= 0)
+		close(epollop->timerfd);
+#endif
 
 	memset(epollop, 0, sizeof(struct epollop));
 	mm_free(epollop);
author	Nick Mathewson <nickm@torproject.org>	2012-04-26 16:22:03 -0400
committer	Nick Mathewson <nickm@torproject.org>	2012-04-26 16:42:21 -0400
commit	26c75828b75e4c14fbbdce9212d3114d9926af1f (patch)
tree	cd1322987378cb5a9b81299da5b23f9977a13447 /epoll.c
parent	7428c78a959210951409803455092edff4bdea35 (diff)
download	libevent-26c75828b75e4c14fbbdce9212d3114d9926af1f.tar.gz