/* * Copyright 2000-2003 Niels Provos * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include #include #ifdef HAVE_SYS_TIME_H #include #else #include #endif #include #include #include #include #include #include #include #include #ifdef HAVE_FCNTL_H #include #endif #include "event.h" #include "event-internal.h" #include "evsignal.h" #include "log.h" /* due to limitations in the epoll interface, we need to keep track of * all file descriptors outself. */ struct evepoll { struct event *evread; struct event *evwrite; }; struct epollop { struct evepoll *fds; int nfds; struct epoll_event *events; int nevents; int epfd; }; static void *epoll_init (struct event_base *); static int epoll_add (void *, struct event *); static int epoll_del (void *, struct event *); static int epoll_dispatch (struct event_base *, void *, struct timeval *); static void epoll_dealloc (struct event_base *, void *); const struct eventop epollops = { "epoll", epoll_init, epoll_add, epoll_del, epoll_dispatch, epoll_dealloc, 1 /* need reinit */ }; #ifdef HAVE_SETFD #define FD_CLOSEONEXEC(x) do { \ if (fcntl(x, F_SETFD, 1) == -1) \ event_warn("fcntl(%d, F_SETFD)", x); \ } while (0) #else #define FD_CLOSEONEXEC(x) #endif #define NEVENT 32000 /* On Linux kernels at least up to 2.6.24.4, epoll can't handle timeout * values bigger than (LONG_MAX - 999ULL)/HZ. HZ in the wild can be * as big as 1000, and LONG_MAX can be as small as (1<<31)-1, so the * largest number of msec we can support here is 2147482. Let's * round that down by 47 seconds. */ #define MAX_EPOLL_TIMEOUT_MSEC (35*60*1000) static void * epoll_init(struct event_base *base) { int epfd, nfiles = NEVENT; struct rlimit rl; struct epollop *epollop; /* Disable epollueue when this environment variable is set */ if (getenv("EVENT_NOEPOLL")) return (NULL); if (getrlimit(RLIMIT_NOFILE, &rl) == 0 && rl.rlim_cur != RLIM_INFINITY) { /* * Solaris is somewhat retarded - it's important to drop * backwards compatibility when making changes. So, don't * dare to put rl.rlim_cur here. */ nfiles = rl.rlim_cur - 1; } /* Initalize the kernel queue */ if ((epfd = epoll_create(nfiles)) == -1) { if (errno != ENOSYS) event_warn("epoll_create"); return (NULL); } FD_CLOSEONEXEC(epfd); if (!(epollop = calloc(1, sizeof(struct epollop)))) return (NULL); epollop->epfd = epfd; /* Initalize fields */ epollop->events = malloc(nfiles * sizeof(struct epoll_event)); if (epollop->events == NULL) { free(epollop); return (NULL); } epollop->nevents = nfiles; epollop->fds = calloc(nfiles, sizeof(struct evepoll)); if (epollop->fds == NULL) { free(epollop->events); free(epollop); return (NULL); } epollop->nfds = nfiles; evsignal_init(base); return (epollop); } static int epoll_recalc(struct event_base *base, void *arg, int max) { struct epollop *epollop = arg; if (max >= epollop->nfds) { struct evepoll *fds; int nfds; nfds = epollop->nfds; while (nfds <= max) nfds <<= 1; fds = realloc(epollop->fds, nfds * sizeof(struct evepoll)); if (fds == NULL) { event_warn("realloc"); return (-1); } epollop->fds = fds; memset(fds + epollop->nfds, 0, (nfds - epollop->nfds) * sizeof(struct evepoll)); epollop->nfds = nfds; } return (0); } static int epoll_dispatch(struct event_base *base, void *arg, struct timeval *tv) { struct epollop *epollop = arg; struct epoll_event *events = epollop->events; struct evepoll *evep; int i, res, timeout = -1; if (tv != NULL) timeout = tv->tv_sec * 1000 + (tv->tv_usec + 999) / 1000; if (timeout > MAX_EPOLL_TIMEOUT_MSEC) { /* Linux kernels can wait forever if the timeout is too big; * see comment on MAX_EPOLL_TIMEOUT_MSEC. */ timeout = MAX_EPOLL_TIMEOUT_MSEC; } res = epoll_wait(epollop->epfd, events, epollop->nevents, timeout); if (res == -1) { if (errno != EINTR) { event_warn("epoll_wait"); return (-1); } evsignal_process(base); return (0); } else if (base->sig.evsignal_caught) { evsignal_process(base); } event_debug(("%s: epoll_wait reports %d", __func__, res)); for (i = 0; i < res; i++) { int what = events[i].events; struct event *evread = NULL, *evwrite = NULL; int fd = events[i].data.fd; if (fd < 0 || fd >= epollop->nfds) continue; evep = &epollop->fds[fd]; if (what & (EPOLLHUP|EPOLLERR)) { evread = evep->evread; evwrite = evep->evwrite; } else { if (what & EPOLLIN) { evread = evep->evread; } if (what & EPOLLOUT) { evwrite = evep->evwrite; } } if (!(evread||evwrite)) continue; if (evread != NULL) event_active(evread, EV_READ, 1); if (evwrite != NULL) event_active(evwrite, EV_WRITE, 1); } return (0); } static int epoll_add(void *arg, struct event *ev) { struct epollop *epollop = arg; struct epoll_event epev = {0, {0}}; struct evepoll *evep; int fd, op, events; if (ev->ev_events & EV_SIGNAL) return (evsignal_add(ev)); fd = ev->ev_fd; if (fd >= epollop->nfds) { /* Extent the file descriptor array as necessary */ if (epoll_recalc(ev->ev_base, epollop, fd) == -1) return (-1); } evep = &epollop->fds[fd]; op = EPOLL_CTL_ADD; events = 0; if (evep->evread != NULL) { events |= EPOLLIN; op = EPOLL_CTL_MOD; } if (evep->evwrite != NULL) { events |= EPOLLOUT; op = EPOLL_CTL_MOD; } if (ev->ev_events & EV_READ) events |= EPOLLIN; if (ev->ev_events & EV_WRITE) events |= EPOLLOUT; epev.data.fd = fd; epev.events = events; if (epoll_ctl(epollop->epfd, op, ev->ev_fd, &epev) == -1) return (-1); /* Update events responsible */ if (ev->ev_events & EV_READ) evep->evread = ev; if (ev->ev_events & EV_WRITE) evep->evwrite = ev; return (0); } static int epoll_del(void *arg, struct event *ev) { struct epollop *epollop = arg; struct epoll_event epev = {0, {0}}; struct evepoll *evep; int fd, events, op; int needwritedelete = 1, needreaddelete = 1; if (ev->ev_events & EV_SIGNAL) return (evsignal_del(ev)); fd = ev->ev_fd; if (fd >= epollop->nfds) return (0); evep = &epollop->fds[fd]; op = EPOLL_CTL_DEL; events = 0; if (ev->ev_events & EV_READ) events |= EPOLLIN; if (ev->ev_events & EV_WRITE) events |= EPOLLOUT; if ((events & (EPOLLIN|EPOLLOUT)) != (EPOLLIN|EPOLLOUT)) { if ((events & EPOLLIN) && evep->evwrite != NULL) { needwritedelete = 0; events = EPOLLOUT; op = EPOLL_CTL_MOD; } else if ((events & EPOLLOUT) && evep->evread != NULL) { needreaddelete = 0; events = EPOLLIN; op = EPOLL_CTL_MOD; } } epev.events = events; epev.data.fd = fd; if (needreaddelete) evep->evread = NULL; if (needwritedelete) evep->evwrite = NULL; if (epoll_ctl(epollop->epfd, op, fd, &epev) == -1) return (-1); return (0); } static void epoll_dealloc(struct event_base *base, void *arg) { struct epollop *epollop = arg; evsignal_dealloc(base); if (epollop->fds) free(epollop->fds); if (epollop->events) free(epollop->events); if (epollop->epfd >= 0) close(epollop->epfd); memset(epollop, 0, sizeof(struct epollop)); free(epollop); }