summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2018-02-23 17:42:24 +0100
committerantirez <antirez@gmail.com>2018-02-23 17:42:24 +0100
commit2500e124e94c257708b53ce00a37e67bd02f81d8 (patch)
tree5f5d1e67c186448916f483e256f93c9074d9a84d
parent813960dbdd86b88b509b2946dbaa023e0ae8b1b9 (diff)
downloadredis-2500e124e94c257708b53ce00a37e67bd02f81d8.tar.gz
ae.c: introduce the concept of read->write barrier.
AOF fsync=always, and certain Redis Cluster bus operations, require fsyncing data to disk before replying with an acknowledgment. In such cases, in order to implement Group Commits, we want to be sure that queries that are read in a given cycle of the event loop are never served to clients in the same event loop iteration. This way, by using the event loop "before sleep" callback, we can fsync the information just one time before returning into the event loop for the next cycle. This is much more efficient compared to calling fsync() multiple times. Unfortunately, because of a bug, this was not always guaranteed: the way the events happened to be installed was the sole thing that controlled this behavior. Normally this problem is hard to trigger when AOF is enabled with fsync=always, because we try to flush the output buffers to the socket directly in the beforeSleep() function of Redis. However, if the output buffers are full, we actually install a write event, and in such a case this bug could happen. This change to ae.c modifies the event loop implementation to make this concept explicit. Write events that are registered with AE_WRITABLE|AE_BARRIER are guaranteed to never fire after the readable event was fired for the same file descriptor. In this way we are sure that data is persisted to disk before the client performing the operation receives an acknowledgment. However, note that this semantics does not provide all the guarantees that one may believe are automatically provided. Take the example of the blocking list operations in Redis. With AOF and fsync=always we could have: Client A doing: BLPOP myqueue 0 Client B doing: RPUSH myqueue a b c In this scenario, Client A will get the "a" element immediately after Client B's RPUSH is executed, even before the operation is persisted. However, when Client B gets the acknowledgment, it can be sure that "b,c" are already safe on disk inside the list.
What to note here is that it cannot be assumed that Client A receiving the element is a guarantee that the operation succeeded from the point of view of Client B. This is due to the fact that the barrier exists within the same socket, and not between different sockets. However, in the case above, the element "a" was not going to be persisted regardless, so it is a pretty synthetic argument.
-rw-r--r--src/ae.c22
-rw-r--r--src/ae.h13
2 files changed, 29 insertions, 6 deletions
diff --git a/src/ae.c b/src/ae.c
index 742388d85..0fd5b4612 100644
--- a/src/ae.c
+++ b/src/ae.c
@@ -159,6 +159,10 @@ void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
aeFileEvent *fe = &eventLoop->events[fd];
if (fe->mask == AE_NONE) return;
+ /* We want to always remove AE_BARRIER if set when AE_WRITABLE
+ * is removed. */
+ if (mask & AE_WRITABLE) mask |= AE_BARRIER;
+
aeApiDelEvent(eventLoop, fd, mask);
fe->mask = fe->mask & (~mask);
if (fd == eventLoop->maxfd && fe->mask == AE_NONE) {
@@ -421,8 +425,22 @@ int aeProcessEvents(aeEventLoop *eventLoop, int flags)
fe->rfileProc(eventLoop,fd,fe->clientData,mask);
}
if (fe->mask & mask & AE_WRITABLE) {
- if (!rfired || fe->wfileProc != fe->rfileProc)
- fe->wfileProc(eventLoop,fd,fe->clientData,mask);
+ int can_fire = 1;
+ if (rfired) {
+ /* The previous event fired? We do not want this to
+ * fire again if:
+ *
+ * 1. The handler is the same as the READABLE event.
+ * 2. If there AE_BARRIER is set, to signal that we
+ * are never allowed to fire WRITABLE after READABLE
+ * in the same iteration. */
+ if (fe->wfileProc == fe->rfileProc ||
+ fe->mask & AE_BARRIER)
+ {
+ can_fire = 0;
+ }
+ }
+ if (can_fire) fe->wfileProc(eventLoop,fd,fe->clientData,mask);
}
processed++;
}
diff --git a/src/ae.h b/src/ae.h
index c49bfe233..df5174838 100644
--- a/src/ae.h
+++ b/src/ae.h
@@ -38,9 +38,14 @@
#define AE_OK 0
#define AE_ERR -1
-#define AE_NONE 0
-#define AE_READABLE 1
-#define AE_WRITABLE 2
+#define AE_NONE 0 /* No events registered. */
+#define AE_READABLE 1 /* Fire when descriptor is readable. */
+#define AE_WRITABLE 2 /* Fire when descriptor is writable. */
+#define AE_BARRIER 4 /* With WRITABLE, never fire the event if the
+ READABLE event already fired in the same event
+ loop iteration. Useful when you want to persist
+ things to disk before sending replies, and want
+ to do that in a group fashion. */
#define AE_FILE_EVENTS 1
#define AE_TIME_EVENTS 2
@@ -64,7 +69,7 @@ typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop);
/* File event structure */
typedef struct aeFileEvent {
- int mask; /* one of AE_(READABLE|WRITABLE) */
+ int mask; /* one of AE_(READABLE|WRITABLE|BARRIER) */
aeFileProc *rfileProc;
aeFileProc *wfileProc;
void *clientData;