summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorantirez <antirez@gmail.com>2010-06-22 00:07:48 +0200
committerantirez <antirez@gmail.com>2010-07-01 14:38:51 +0200
commite2641e09cc0daf44f63f654230f72d22acf3a9af (patch)
treef0443876d28414f7c80787593e5f35a9f9c87747 /src
parentc2ff0e90b8ce84d7b966622ffe0178303bb0a625 (diff)
downloadredis-e2641e09cc0daf44f63f654230f72d22acf3a9af.tar.gz
redis.c split into many different C files.
networking related stuff moved into networking.c moved more code more work on layout of source code SDS instantaneous memory saving. By Pieter and Salvatore at VMware ;) cleanly compiling again after the first split, now splitting it in more C files moving more things around... work in progress split replication code splitting more Sets split Hash split replication split even more splitting more splitting minor change
Diffstat (limited to 'src')
-rw-r--r--src/Makefile111
-rw-r--r--src/adlist.c325
-rw-r--r--src/adlist.h92
-rw-r--r--src/ae.c390
-rw-r--r--src/ae.h117
-rw-r--r--src/ae_epoll.c91
-rw-r--r--src/ae_kqueue.c93
-rw-r--r--src/ae_select.c72
-rw-r--r--src/anet.c270
-rw-r--r--src/anet.h49
-rw-r--r--src/aof.c694
-rw-r--r--src/config.c438
-rw-r--r--src/config.h45
-rw-r--r--src/db.c508
-rw-r--r--src/debug.c309
-rw-r--r--src/dict.c727
-rw-r--r--src/dict.h151
-rw-r--r--src/fmacros.h15
-rw-r--r--src/linenoise.c433
-rw-r--r--src/linenoise.h41
-rw-r--r--src/lzf.h100
-rw-r--r--src/lzfP.h159
-rw-r--r--src/lzf_c.c295
-rw-r--r--src/lzf_d.c150
-rwxr-xr-xsrc/mkreleasehdr.sh9
-rw-r--r--src/multi.c266
-rw-r--r--src/networking.c589
-rw-r--r--src/object.c405
-rw-r--r--src/pqsort.c197
-rw-r--r--src/pqsort.h15
-rw-r--r--src/pubsub.c259
-rw-r--r--src/rdb.c886
-rw-r--r--src/redis-benchmark.c665
-rw-r--r--src/redis-check-aof.c185
-rw-r--r--src/redis-check-dump.c671
-rw-r--r--src/redis-cli.c493
-rw-r--r--src/redis.c1516
-rw-r--r--src/redis.h885
-rw-r--r--src/release.c13
-rw-r--r--src/replication.c475
-rw-r--r--src/sds.c384
-rw-r--r--src/sds.h74
-rw-r--r--src/sha1.c276
-rw-r--r--src/sha1.h17
-rw-r--r--src/solarisfixes.h21
-rw-r--r--src/sort.c383
-rw-r--r--src/t_hash.c397
-rw-r--r--src/t_list.c829
-rw-r--r--src/t_set.c349
-rw-r--r--src/t_string.c251
-rw-r--r--src/t_zset.c985
-rw-r--r--src/util.c223
-rw-r--r--src/version.h1
-rw-r--r--src/vm.c1126
-rw-r--r--src/ziplist.c959
-rw-r--r--src/ziplist.h15
-rw-r--r--src/zipmap.c455
-rw-r--r--src/zipmap.h48
-rw-r--r--src/zmalloc.c158
-rw-r--r--src/zmalloc.h41
60 files changed, 20196 insertions, 0 deletions
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 000000000..3cba3c069
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,111 @@
+# Redis Makefile
+# Copyright (C) 2009 Salvatore Sanfilippo <antirez at gmail dot com>
+# This file is released under the BSD license, see the COPYING file
+
+# Run mkreleasehdr.sh once, while the Makefile is parsed (presumably it
+# generates release.h, which release.o lists as a prerequisite -- confirm).
+release_hdr := $(shell sh -c './mkreleasehdr.sh')
+# Kernel name as printed by `uname -s`, or "not" if uname fails.
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+# Optimization level; override with e.g. `make OPTIMIZATION=-O0`.
+OPTIMIZATION?=-O2
+# Per-platform compiler flags (CFLAGS) and link libraries (CCLINK).
+# Both use ?= so values from the environment or command line win.
+ifeq ($(uname_S),SunOS)
+ CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W -D__EXTENSIONS__ -D_XPG6
+ CCLINK?= -ldl -lnsl -lsocket -lm -lpthread
+else
+ CFLAGS?= -std=c99 -pedantic $(OPTIMIZATION) -Wall -W $(ARCH) $(PROF)
+ CCLINK?= -lm -pthread
+endif
+# Flags used on the link command lines. ARCH and PROF are empty unless set
+# by the 32bit/gprof/gcov targets or by the user.
+CCOPT= $(CFLAGS) $(CCLINK) $(ARCH) $(PROF)
+# Debugging flags, added to both compile and link commands.
+DEBUG?= -g -rdynamic -ggdb
+
+# Object files linked into each program.
+OBJ = adlist.o ae.o anet.o dict.o redis.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o vm.o pubsub.o multi.o debug.o sort.o
+BENCHOBJ = ae.o anet.o redis-benchmark.o sds.o adlist.o zmalloc.o
+CLIOBJ = anet.o sds.o adlist.o redis-cli.o zmalloc.o linenoise.o
+CHECKDUMPOBJ = redis-check-dump.o lzf_c.o lzf_d.o
+CHECKAOFOBJ = redis-check-aof.o
+
+# File names of the programs produced by the link rules.
+PRGNAME = redis-server
+BENCHPRGNAME = redis-benchmark
+CLIPRGNAME = redis-cli
+CHECKDUMPPRGNAME = redis-check-dump
+CHECKAOFPRGNAME = redis-check-aof
+
+# Default goal: build every program. 'all' is a command name, not a file,
+# so declare it phony; otherwise a stray file called 'all' would make the
+# default build a no-op.
+.PHONY: all
+all: redis-server redis-benchmark redis-cli redis-check-dump redis-check-aof
+
+# Deps (use make dep to generate this)
+adlist.o: adlist.c adlist.h zmalloc.h
+# ae.c #includes exactly one of the three multiplexing backends, chosen at
+# compile time from config.h. `make dep` only records the one selected on
+# the machine where it was run, so list all of them: a change to any
+# backend must rebuild ae.o on every platform.
+ae.o: ae.c ae.h zmalloc.h config.h ae_epoll.c ae_kqueue.c ae_select.c
+ae_epoll.o: ae_epoll.c
+ae_kqueue.o: ae_kqueue.c
+ae_select.o: ae_select.c
+anet.o: anet.c fmacros.h anet.h
+dict.o: dict.c fmacros.h dict.h zmalloc.h
+linenoise.o: linenoise.c fmacros.h
+lzf_c.o: lzf_c.c lzfP.h
+lzf_d.o: lzf_d.c lzfP.h
+pqsort.o: pqsort.c
+redis-benchmark.o: redis-benchmark.c fmacros.h ae.h anet.h sds.h adlist.h \
+ zmalloc.h
+redis-check-aof.o: redis-check-aof.c fmacros.h config.h
+redis-check-dump.o: redis-check-dump.c lzf.h
+redis-cli.o: redis-cli.c fmacros.h anet.h sds.h adlist.h zmalloc.h \
+ linenoise.h
+redis.o: redis.c fmacros.h config.h redis.h ae.h sds.h anet.h dict.h \
+ adlist.h zmalloc.h lzf.h pqsort.h zipmap.h ziplist.h sha1.h
+release.o: release.c release.h
+sds.o: sds.c sds.h zmalloc.h
+sha1.o: sha1.c sha1.h
+ziplist.o: ziplist.c zmalloc.h ziplist.h
+zipmap.o: zipmap.c zmalloc.h
+zmalloc.o: zmalloc.c config.h
+
+# Link rules.
+#
+# $(CCLINK) is repeated AFTER the object files on purpose: linkers resolve
+# libraries left to right, so with static libraries or --as-needed a
+# library named before the objects that use it is dropped and the link
+# fails with undefined references (e.g. to libm). The copy already inside
+# $(CCOPT) is harmless.
+redis-server: $(OBJ)
+	$(CC) -o $(PRGNAME) $(CCOPT) $(DEBUG) $(OBJ) $(CCLINK)
+	@echo ""
+	@echo "Hint: To run 'make test' is a good idea ;)"
+	@echo ""
+
+redis-benchmark: $(BENCHOBJ)
+	$(CC) -o $(BENCHPRGNAME) $(CCOPT) $(DEBUG) $(BENCHOBJ) $(CCLINK)
+
+redis-cli: $(CLIOBJ)
+	$(CC) -o $(CLIPRGNAME) $(CCOPT) $(DEBUG) $(CLIOBJ) $(CCLINK)
+
+redis-check-dump: $(CHECKDUMPOBJ)
+	$(CC) -o $(CHECKDUMPPRGNAME) $(CCOPT) $(DEBUG) $(CHECKDUMPOBJ) $(CCLINK)
+
+redis-check-aof: $(CHECKAOFOBJ)
+	$(CC) -o $(CHECKAOFPRGNAME) $(CCOPT) $(DEBUG) $(CHECKAOFOBJ) $(CCLINK)
+
+# Compile a single C source file into the matching object file.
+.c.o:
+	$(CC) -c $(CFLAGS) $(DEBUG) $(COMPILE_TIME) $<
+
+# The targets below are commands, not files. Declaring them phony keeps
+# them working even if a file with the same name (e.g. 'test' or 'log')
+# shows up in this directory.
+.PHONY: clean dep test bench log
+
+# Remove the programs, object files and gcov data.
+clean:
+	rm -rf $(PRGNAME) $(BENCHPRGNAME) $(CLIPRGNAME) $(CHECKDUMPPRGNAME) $(CHECKAOFPRGNAME) *.o *.gcda *.gcno *.gcov
+
+# Print header dependencies for every C file (paste into the Deps section).
+dep:
+	$(CC) -MM *.c
+
+# Run the Tcl test suite from the parent directory; TAGS selects tests.
+test:
+	(cd ..; tclsh8.5 tests/test_helper.tcl --tags "${TAGS}")
+
+# Run redis-benchmark from this directory.
+bench:
+	./redis-benchmark
+
+# Regenerate the Changelog file from the git history.
+log:
+	git log '--pretty=format:%ad %s (%cn)' --date=short > Changelog
+
+# Convenience targets that re-run the default build with extra variables.
+# They use $(MAKE) rather than a literal `make` so that the same make
+# binary is used and -j/-n/-k and the jobserver are passed down to the
+# sub-make. None of them names a file, hence .PHONY.
+.PHONY: 32bit gprof gcov noopt 32bitgprof
+
+32bit:
+	@echo ""
+	@echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386"
+	@echo ""
+	$(MAKE) ARCH="-m32"
+
+gprof:
+	$(MAKE) PROF="-pg"
+
+gcov:
+	$(MAKE) PROF="-fprofile-arcs -ftest-coverage"
+
+noopt:
+	$(MAKE) OPTIMIZATION=""
+
+32bitgprof:
+	$(MAKE) PROF="-pg" ARCH="-arch i386"
diff --git a/src/adlist.c b/src/adlist.c
new file mode 100644
index 000000000..015012f5c
--- /dev/null
+++ b/src/adlist.c
@@ -0,0 +1,325 @@
+/* adlist.c - A generic doubly linked list implementation
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include <stdlib.h>
+#include "adlist.h"
+#include "zmalloc.h"
+
+/* Allocate and initialize an empty list.
+ *
+ * The new list has no nodes and no dup/free/match methods installed.
+ * Release it with listRelease().
+ *
+ * Returns the new list, or NULL if the allocation fails. */
+list *listCreate(void)
+{
+    struct list *l = zmalloc(sizeof(*l));
+
+    if (l == NULL) return NULL;
+    l->len = 0;
+    l->head = NULL;
+    l->tail = NULL;
+    l->dup = NULL;
+    l->free = NULL;
+    l->match = NULL;
+    return l;
+}
+
+/* Destroy a list: every node is freed, then the list header itself.
+ *
+ * When a free method is installed it is called on each node value before
+ * the node is released. Exactly list->len nodes are visited.
+ *
+ * This function can't fail. */
+void listRelease(list *list)
+{
+    listNode *node = list->head;
+    unsigned int remaining;
+
+    for (remaining = list->len; remaining > 0; remaining--) {
+        listNode *following = node->next;
+
+        if (list->free != NULL) list->free(node->value);
+        zfree(node);
+        node = following;
+    }
+    zfree(list);
+}
+
+/* Push a new node holding 'value' in front of the current head.
+ *
+ * Returns 'list' on success. If the node can't be allocated NULL is
+ * returned and the list is left untouched. */
+list *listAddNodeHead(list *list, void *value)
+{
+    listNode *fresh = zmalloc(sizeof(*fresh));
+
+    if (fresh == NULL) return NULL;
+    fresh->value = value;
+    fresh->prev = NULL;
+    if (list->len != 0) {
+        /* Link in front of the old head. */
+        fresh->next = list->head;
+        list->head->prev = fresh;
+        list->head = fresh;
+    } else {
+        /* First node: it is both head and tail. */
+        fresh->next = NULL;
+        list->head = list->tail = fresh;
+    }
+    list->len++;
+    return list;
+}
+
+/* Append a new node holding 'value' after the current tail.
+ *
+ * Returns 'list' on success. If the node can't be allocated NULL is
+ * returned and the list is left untouched. */
+list *listAddNodeTail(list *list, void *value)
+{
+    listNode *fresh = zmalloc(sizeof(*fresh));
+
+    if (fresh == NULL) return NULL;
+    fresh->value = value;
+    fresh->next = NULL;
+    if (list->len != 0) {
+        /* Link after the old tail. */
+        fresh->prev = list->tail;
+        list->tail->next = fresh;
+        list->tail = fresh;
+    } else {
+        /* First node: it is both head and tail. */
+        fresh->prev = NULL;
+        list->head = list->tail = fresh;
+    }
+    list->len++;
+    return list;
+}
+
+/* Insert a new node holding 'value' next to 'old_node': right after it
+ * when 'after' is non-zero, right before it otherwise. The list head or
+ * tail is updated when 'old_node' was at that end.
+ *
+ * Returns 'list' on success, or NULL (list unchanged) if the node can't
+ * be allocated. */
+list *listInsertNode(list *list, listNode *old_node, void *value, int after) {
+    listNode *fresh = zmalloc(sizeof(*fresh));
+
+    if (fresh == NULL) return NULL;
+    fresh->value = value;
+    if (!after) {
+        fresh->prev = old_node->prev;
+        fresh->next = old_node;
+        if (old_node == list->head) list->head = fresh;
+    } else {
+        fresh->prev = old_node;
+        fresh->next = old_node->next;
+        if (old_node == list->tail) list->tail = fresh;
+    }
+    /* Make both neighbours (when present) point back at the new node. */
+    if (fresh->next != NULL) fresh->next->prev = fresh;
+    if (fresh->prev != NULL) fresh->prev->next = fresh;
+    list->len++;
+    return list;
+}
+
+/* Unlink 'node' from 'list' and free it.
+ *
+ * If the list has a free method it is called on the node value; otherwise
+ * the value is left to the caller.
+ *
+ * This function can't fail. */
+void listDelNode(list *list, listNode *node)
+{
+    listNode *before = node->prev;
+    listNode *behind = node->next;
+
+    if (before != NULL)
+        before->next = behind;
+    else
+        list->head = behind;    /* node was the head */
+    if (behind != NULL)
+        behind->prev = before;
+    else
+        list->tail = before;    /* node was the tail */
+    if (list->free) list->free(node->value);
+    zfree(node);
+    list->len--;
+}
+
+/* Allocate an iterator over 'list'. With AL_START_HEAD the iteration
+ * starts at the head, with any other direction it starts at the tail.
+ * Every call to listNext() then yields the next element.
+ *
+ * Returns the iterator (release it with listReleaseIterator()), or NULL
+ * if the allocation fails. */
+listIter *listGetIterator(list *list, int direction)
+{
+    listIter *it = zmalloc(sizeof(*it));
+
+    if (it == NULL) return NULL;
+    it->direction = direction;
+    it->next = (direction == AL_START_HEAD) ? list->head : list->tail;
+    return it;
+}
+
+/* Free an iterator obtained from listGetIterator(). The list itself is
+ * not touched. */
+void listReleaseIterator(listIter *iter)
+{
+    zfree(iter);
+}
+
+/* Reset the caller-provided iterator 'li' so that the next listNext()
+ * returns the head of 'list' and the iteration proceeds towards the tail.
+ * Nothing is allocated. */
+void listRewind(list *list, listIter *li)
+{
+    li->direction = AL_START_HEAD;
+    li->next = list->head;
+}
+
+/* Reset the caller-provided iterator 'li' so that the next listNext()
+ * returns the tail of 'list' and the iteration proceeds towards the head.
+ * Nothing is allocated. */
+void listRewindTail(list *list, listIter *li)
+{
+    li->direction = AL_START_TAIL;
+    li->next = list->tail;
+}
+
+/* Advance the iterator and return the element it was pointing at, or
+ * NULL once the iteration is over.
+ *
+ * The iterator moves past the returned node before returning it, so the
+ * returned node may be removed with listDelNode(); removing other nodes
+ * while iterating is not supported.
+ *
+ * Typical usage:
+ *
+ * iter = listGetIterator(list,<direction>);
+ * while ((node = listNext(iter)) != NULL) {
+ *     doSomethingWith(listNodeValue(node));
+ * }
+ *
+ * */
+listNode *listNext(listIter *iter)
+{
+    listNode *node = iter->next;
+
+    if (node == NULL) return NULL;
+    iter->next = (iter->direction == AL_START_HEAD) ? node->next : node->prev;
+    return node;
+}
+
+/* Duplicate the whole list. On out of memory NULL is returned.
+ * On success a copy of the original list is returned.
+ *
+ * The 'Dup' method set with listSetDupMethod() function is used
+ * to copy the node value (a NULL result from it is treated as an error).
+ * Otherwise the same pointer value of the original node is used as value
+ * of the copied node.
+ *
+ * The original list both on success or error is never modified. */
+list *listDup(list *orig)
+{
+    list *copy;
+    listIter iter;
+    listNode *node;
+
+    if ((copy = listCreate()) == NULL)
+        return NULL;
+    copy->dup = orig->dup;
+    copy->free = orig->free;
+    copy->match = orig->match;
+    /* Iterate with an iterator on the stack: nothing to allocate, so
+     * there is no allocation failure to handle and nothing to release
+     * on the error paths. */
+    listRewind(orig, &iter);
+    while ((node = listNext(&iter)) != NULL) {
+        void *value;
+
+        if (copy->dup) {
+            value = copy->dup(node->value);
+            if (value == NULL) {
+                listRelease(copy);
+                return NULL;
+            }
+        } else {
+            value = node->value;
+        }
+        if (listAddNodeTail(copy, value) == NULL) {
+            if (copy->dup) {
+                /* The duplicate is not owned by any node yet, so
+                 * listRelease() would not free it. */
+                if (copy->free) copy->free(value);
+            } else {
+                /* Values are shared with 'orig': releasing the partial
+                 * copy must not run the free method on them. */
+                copy->free = NULL;
+            }
+            listRelease(copy);
+            return NULL;
+        }
+    }
+    return copy;
+}
+
+/* Search the list for a node matching a given key.
+ * The match is performed using the 'match' method
+ * set with listSetMatchMethod(). If no 'match' method
+ * is set, the 'value' pointer of every node is directly
+ * compared with the 'key' pointer.
+ *
+ * On success the first matching node pointer is returned
+ * (search starts from head). If no matching node exists
+ * NULL is returned. */
+listNode *listSearchKey(list *list, void *key)
+{
+    listIter iter;
+    listNode *node;
+
+    /* A stack iterator needs no allocation, so the search cannot fail
+     * (or dereference a NULL iterator) when memory is short. */
+    listRewind(list, &iter);
+    while ((node = listNext(&iter)) != NULL) {
+        if (list->match) {
+            if (list->match(node->value, key)) return node;
+        } else {
+            if (key == node->value) return node;
+        }
+    }
+    return NULL;
+}
+
+/* Return the element at the specified zero-based index
+ * where 0 is the head, 1 is the element next to head
+ * and so on. Negative integers are used in order to count
+ * from the tail, -1 is the last element, -2 the penultimate
+ * and so on. If the index is out of range NULL is returned. */
+listNode *listIndex(list *list, int index) {
+    listNode *n;
+
+    if (index < 0) {
+        /* Number of steps back from the tail. Written as -(index+1)
+         * instead of (-index)-1 so that INT_MIN does not overflow. */
+        index = -(index+1);
+        n = list->tail;
+        while(index-- && n) n = n->prev;
+    } else {
+        n = list->head;
+        while(index-- && n) n = n->next;
+    }
+    return n;
+}
diff --git a/src/adlist.h b/src/adlist.h
new file mode 100644
index 000000000..a1209f62f
--- /dev/null
+++ b/src/adlist.h
@@ -0,0 +1,92 @@
+/* adlist.h - A generic doubly linked list implementation
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __ADLIST_H__
+#define __ADLIST_H__
+
+/* Node, List, and Iterator are the only data structures used currently.
+ *
+ * listNode is one element of the doubly linked list, listIter is the
+ * cursor advanced by listNext(), and list is the list header holding the
+ * two ends, the optional per-list methods and the node count. */
+
+typedef struct listNode {
+ struct listNode *prev; /* previous node, NULL for the head */
+ struct listNode *next; /* next node, NULL for the tail */
+ void *value; /* user pointer stored in this node */
+} listNode;
+
+typedef struct listIter {
+ listNode *next; /* node the next listNext() call returns, NULL at the end */
+ int direction; /* AL_START_HEAD or AL_START_TAIL */
+} listIter;
+
+typedef struct list {
+ listNode *head; /* first node, NULL when the list is empty */
+ listNode *tail; /* last node, NULL when the list is empty */
+ void *(*dup)(void *ptr); /* optional: copies a value, used by listDup() */
+ void (*free)(void *ptr); /* optional: called on a value when its node is freed */
+ int (*match)(void *ptr, void *key); /* optional: non-zero if the value matches key */
+ unsigned int len; /* number of nodes */
+} list;
+
+/* Functions implemented as macros.
+ *
+ * Every macro expands to a plain field access, so the argument is
+ * evaluated exactly once and the result can also be assigned to. */
+#define listLength(l) ((l)->len)
+#define listFirst(l) ((l)->head)
+#define listLast(l) ((l)->tail)
+#define listPrevNode(n) ((n)->prev)
+#define listNextNode(n) ((n)->next)
+#define listNodeValue(n) ((n)->value)
+
+/* Install the optional per-list methods. */
+#define listSetDupMethod(l,m) ((l)->dup = (m))
+#define listSetFreeMethod(l,m) ((l)->free = (m))
+#define listSetMatchMethod(l,m) ((l)->match = (m))
+
+/* Read back the per-list methods. listGetFreeMethod() follows the naming
+ * of the other two getters; listGetFree() is the historical name and is
+ * kept for existing callers. */
+#define listGetDupMethod(l) ((l)->dup)
+#define listGetFree(l) ((l)->free)
+#define listGetFreeMethod(l) ((l)->free)
+#define listGetMatchMethod(l) ((l)->match)
+
+/* Prototypes. The functions returning 'list *' return the list passed in
+ * on success (listCreate/listDup return the new list) and NULL when an
+ * allocation fails. */
+list *listCreate(void);
+void listRelease(list *list);
+list *listAddNodeHead(list *list, void *value);
+list *listAddNodeTail(list *list, void *value);
+list *listInsertNode(list *list, listNode *old_node, void *value, int after); /* after != 0: insert after old_node, else before */
+void listDelNode(list *list, listNode *node);
+listIter *listGetIterator(list *list, int direction); /* heap iterator, NULL on allocation failure */
+listNode *listNext(listIter *iter); /* NULL when the iteration is over */
+void listReleaseIterator(listIter *iter);
+list *listDup(list *orig);
+listNode *listSearchKey(list *list, void *key); /* NULL if nothing matches */
+listNode *listIndex(list *list, int index); /* negative index counts from the tail */
+void listRewind(list *list, listIter *li); /* reset caller-owned iterator to the head */
+void listRewindTail(list *list, listIter *li); /* reset caller-owned iterator to the tail */
+
+/* Directions for iterators: the value stored in listIter.direction and
+ * accepted by listGetIterator(). */
+#define AL_START_HEAD 0 /* start at the head, listNext() follows 'next' links */
+#define AL_START_TAIL 1 /* start at the tail, listNext() follows 'prev' links */
+
+#endif /* __ADLIST_H__ */
diff --git a/src/ae.c b/src/ae.c
new file mode 100644
index 000000000..c7918ee1d
--- /dev/null
+++ b/src/ae.c
@@ -0,0 +1,390 @@
+/* A simple event-driven programming library. Originally I wrote this code
+ * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated
+ * it in form of a library for easy reuse.
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+#include "ae.h"
+#include "zmalloc.h"
+#include "config.h"
+
+/* Include the best multiplexing layer supported by this system.
+ * The following should be ordered by performances, descending. */
+#ifdef HAVE_EPOLL
+#include "ae_epoll.c"
+#else
+ #ifdef HAVE_KQUEUE
+ #include "ae_kqueue.c"
+ #else
+ #include "ae_select.c"
+ #endif
+#endif
+
+/* Allocate an event loop with no time events, no registered file events
+ * and no before-sleep hook, and initialize the multiplexing backend.
+ *
+ * Returns the new loop, or NULL if the allocation or aeApiCreate() fails. */
+aeEventLoop *aeCreateEventLoop(void)
+{
+    aeEventLoop *loop = zmalloc(sizeof(*loop));
+    int fd;
+
+    if (loop == NULL) return NULL;
+    loop->timeEventHead = NULL;
+    loop->timeEventNextId = 0;
+    loop->stop = 0;
+    loop->maxfd = -1;
+    loop->beforesleep = NULL;
+    if (aeApiCreate(loop) == -1) {
+        zfree(loop);
+        return NULL;
+    }
+    /* A slot whose mask is AE_NONE means "nothing registered for this
+     * fd": mark every slot that way. */
+    fd = 0;
+    while (fd < AE_SETSIZE) loop->events[fd++].mask = AE_NONE;
+    return loop;
+}
+
+/* Destroy an event loop created with aeCreateEventLoop().
+ *
+ * Time events still registered are freed together with the loop so they
+ * are not leaked. Their finalizers are NOT invoked: callers that need the
+ * finalizer to run must call aeDeleteTimeEvent() before destroying the
+ * loop. */
+void aeDeleteEventLoop(aeEventLoop *eventLoop) {
+    aeTimeEvent *te = eventLoop->timeEventHead;
+
+    while (te != NULL) {
+        aeTimeEvent *next = te->next;
+
+        zfree(te);
+        te = next;
+    }
+    eventLoop->timeEventHead = NULL;
+    aeApiFree(eventLoop);
+    zfree(eventLoop);
+}
+
+/* Set the loop's stop flag. aeMain() tests the flag before every
+ * iteration and returns once it is set. */
+void aeStop(aeEventLoop *eventLoop)
+{
+    eventLoop->stop = 1;
+}
+
+/* Register 'proc' as the handler called when 'fd' becomes readable and/or
+ * writable, according to 'mask' (AE_READABLE, AE_WRITABLE or both).
+ * 'clientData' is handed back to the handler and replaces the value
+ * stored by any earlier registration for the same fd.
+ *
+ * Returns AE_OK on success. Returns AE_ERR if 'fd' is outside
+ * [0, AE_SETSIZE) or if the multiplexing backend refuses the event. */
+int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
+        aeFileProc *proc, void *clientData)
+{
+    /* A negative fd (e.g. the -1 of a failed socket call) would index
+     * before the start of the events array. */
+    if (fd < 0 || fd >= AE_SETSIZE) return AE_ERR;
+    aeFileEvent *fe = &eventLoop->events[fd];
+
+    if (aeApiAddEvent(eventLoop, fd, mask) == -1)
+        return AE_ERR;
+    fe->mask |= mask;
+    if (mask & AE_READABLE) fe->rfileProc = proc;
+    if (mask & AE_WRITABLE) fe->wfileProc = proc;
+    fe->clientData = clientData;
+    if (fd > eventLoop->maxfd)
+        eventLoop->maxfd = fd;
+    return AE_OK;
+}
+
+/* Stop monitoring 'fd' for the events in 'mask'. Events registered for
+ * the fd but not named in 'mask' stay active.
+ *
+ * Nothing happens if 'fd' is outside [0, AE_SETSIZE) or has no event
+ * registered. */
+void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask)
+{
+    /* A negative fd would index before the start of the events array. */
+    if (fd < 0 || fd >= AE_SETSIZE) return;
+    aeFileEvent *fe = &eventLoop->events[fd];
+
+    if (fe->mask == AE_NONE) return;
+    fe->mask = fe->mask & (~mask);
+    if (fd == eventLoop->maxfd && fe->mask == AE_NONE) {
+        /* The highest fd is gone: scan down for the new highest fd that
+         * still has an event; maxfd becomes -1 if there is none. */
+        int j;
+
+        for (j = eventLoop->maxfd-1; j >= 0; j--)
+            if (eventLoop->events[j].mask != AE_NONE) break;
+        eventLoop->maxfd = j;
+    }
+    aeApiDelEvent(eventLoop, fd, mask);
+}
+
+/* Store the current wall-clock time, as reported by gettimeofday(), into
+ * '*seconds' (whole seconds) and '*milliseconds' (the sub-second part,
+ * truncated to milliseconds). */
+static void aeGetTime(long *seconds, long *milliseconds)
+{
+    struct timeval now;
+
+    gettimeofday(&now, NULL);
+    *seconds = now.tv_sec;
+    *milliseconds = now.tv_usec / 1000;
+}
+
+/* Compute the absolute time that lies 'milliseconds' after now and store
+ * it as seconds in '*sec' plus a millisecond part in '*ms'. A millisecond
+ * part reaching 1000 is carried over into the seconds. */
+static void aeAddMillisecondsToNow(long long milliseconds, long *sec, long *ms)
+{
+    long now_sec, now_ms, target_sec, target_ms;
+
+    aeGetTime(&now_sec, &now_ms);
+    target_sec = now_sec + milliseconds/1000;
+    target_ms = now_ms + milliseconds%1000;
+    if (target_ms >= 1000) {
+        target_ms -= 1000;
+        target_sec++;
+    }
+    *sec = target_sec;
+    *ms = target_ms;
+}
+
+/* Register a timer that fires 'milliseconds' from now.
+ *
+ * 'proc' is called when the timer expires, 'clientData' is passed to it
+ * and to 'finalizerProc'; the finalizer (may be NULL) is called by
+ * aeDeleteTimeEvent() when the event is removed. The event is linked at
+ * the head of the loop's time event list.
+ *
+ * Returns the id of the new event, or AE_ERR if the allocation fails.
+ * The id counter is advanced in both cases. */
+long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
+        aeTimeProc *proc, void *clientData,
+        aeEventFinalizerProc *finalizerProc)
+{
+    long long id = eventLoop->timeEventNextId++;
+    aeTimeEvent *ev = zmalloc(sizeof(*ev));
+
+    if (ev == NULL) return AE_ERR;
+    ev->id = id;
+    aeAddMillisecondsToNow(milliseconds, &ev->when_sec, &ev->when_ms);
+    ev->timeProc = proc;
+    ev->finalizerProc = finalizerProc;
+    ev->clientData = clientData;
+    /* Push on the head of the list. */
+    ev->next = eventLoop->timeEventHead;
+    eventLoop->timeEventHead = ev;
+    return id;
+}
+
+/* Remove the time event with the given 'id'.
+ *
+ * The event is unlinked first, then its finalizer (if any) is called with
+ * the event's clientData, then the event is freed.
+ *
+ * Returns AE_OK if the event was found, AE_ERR otherwise. */
+int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id)
+{
+    /* 'link' is the pointer that points at the current event: the list
+     * head at first, then the 'next' field of the previous event. */
+    aeTimeEvent **link = &eventLoop->timeEventHead;
+    aeTimeEvent *cur;
+
+    for (cur = *link; cur != NULL; cur = *link) {
+        if (cur->id != id) {
+            link = &cur->next;
+            continue;
+        }
+        *link = cur->next;
+        if (cur->finalizerProc)
+            cur->finalizerProc(eventLoop, cur->clientData);
+        zfree(cur);
+        return AE_OK;
+    }
+    return AE_ERR; /* NO event with the specified ID found */
+}
+
+/* Find the time event that expires first.
+ *
+ * The result tells the caller how long the polling call may sleep
+ * without delaying any timer. Returns NULL when no timer is registered.
+ * Among events with the same expire time the one met first in the list
+ * is returned.
+ *
+ * The scan is O(N) because the list is unsorted. Keeping the list sorted
+ * would make this lookup O(1) at the price of O(N) insertion. */
+static aeTimeEvent *aeSearchNearestTimer(aeEventLoop *eventLoop)
+{
+    aeTimeEvent *best = NULL;
+    aeTimeEvent *cur;
+
+    for (cur = eventLoop->timeEventHead; cur != NULL; cur = cur->next) {
+        if (best == NULL) {
+            best = cur;
+            continue;
+        }
+        if (cur->when_sec < best->when_sec)
+            best = cur;
+        else if (cur->when_sec == best->when_sec &&
+                 cur->when_ms < best->when_ms)
+            best = cur;
+    }
+    return best;
+}
+
+/* Process time events: call the handler of every time event whose expire
+ * time is not in the future.
+ *
+ * A handler returning AE_NOMORE has its event deleted; any other return
+ * value re-arms the event that many milliseconds from now.
+ *
+ * Returns the number of handlers called.
+ *
+ * NOTE(review): 'te' is dereferenced after its handler returns; if a
+ * handler deletes its own event and does not return AE_NOMORE this looks
+ * like a use of freed memory -- confirm against callers. */
+static int processTimeEvents(aeEventLoop *eventLoop) {
+ int processed = 0;
+ aeTimeEvent *te;
+ long long maxId;
+
+ te = eventLoop->timeEventHead;
+ maxId = eventLoop->timeEventNextId-1; /* highest id existing right now */
+ while(te) {
+ long now_sec, now_ms;
+ long long id;
+
+ if (te->id > maxId) { /* created by a handler during this call: skip */
+ te = te->next;
+ continue;
+ }
+ aeGetTime(&now_sec, &now_ms);
+ if (now_sec > te->when_sec ||
+ (now_sec == te->when_sec && now_ms >= te->when_ms))
+ {
+ int retval;
+
+ id = te->id;
+ retval = te->timeProc(eventLoop, id, te->clientData);
+ processed++;
+ /* After an event is processed our time event list may
+ * no longer be the same, so we restart from head.
+ * Still we make sure not to process events registered
+ * by the event handlers themselves, in order not to loop forever.
+ * To do so we saved the max ID we want to handle.
+ *
+ * FUTURE OPTIMIZATIONS:
+ * Note that this is NOT great algorithmically. Redis uses
+ * a single time event so it's not a problem but the right
+ * way to do this is to add the new elements on head, and
+ * to flag deleted elements in a special way for later
+ * deletion (putting references to the nodes to delete into
+ * another linked list). */
+ if (retval != AE_NOMORE) {
+ aeAddMillisecondsToNow(retval,&te->when_sec,&te->when_ms);
+ } else {
+ aeDeleteTimeEvent(eventLoop, id);
+ }
+ te = eventLoop->timeEventHead;
+ } else {
+ te = te->next;
+ }
+ }
+ return processed;
+}
+
+/* Process every pending file event, then every pending time event
+ * (the polling call comes first, time events are checked after it).
+ * Without special flags the function sleeps until some file event
+ * fires, or until the next time event occurs (if any).
+ *
+ * If flags is 0, the function does nothing and returns.
+ * if flags has AE_ALL_EVENTS set, all the kind of events are processed.
+ * if flags has AE_FILE_EVENTS set, file events are processed.
+ * if flags has AE_TIME_EVENTS set, time events are processed.
+ * if flags has AE_DONT_WAIT set the function returns ASAP, as soon as all
+ * the events that can be processed without waiting are processed.
+ *
+ * NOTE(review): the polling branch below is entered whenever some fd is
+ * registered (maxfd != -1), even if AE_FILE_EVENTS is not set -- confirm
+ * this is intended.
+ *
+ * The function returns the number of events processed. */
+int aeProcessEvents(aeEventLoop *eventLoop, int flags)
+{
+ int processed = 0, numevents;
+
+ /* Nothing to do? return ASAP */
+ if (!(flags & AE_TIME_EVENTS) && !(flags & AE_FILE_EVENTS)) return 0;
+
+ /* Note that we want call select() even if there are no
+ * file events to process as long as we want to process time
+ * events, in order to sleep until the next time event is ready
+ * to fire. */
+ if (eventLoop->maxfd != -1 ||
+ ((flags & AE_TIME_EVENTS) && !(flags & AE_DONT_WAIT))) {
+ int j;
+ aeTimeEvent *shortest = NULL;
+ struct timeval tv, *tvp;
+
+ if (flags & AE_TIME_EVENTS && !(flags & AE_DONT_WAIT))
+ shortest = aeSearchNearestTimer(eventLoop);
+ if (shortest) {
+ long now_sec, now_ms;
+
+ /* Calculate the time missing for the nearest
+ * timer to fire. */
+ aeGetTime(&now_sec, &now_ms);
+ tvp = &tv;
+ tvp->tv_sec = shortest->when_sec - now_sec;
+ if (shortest->when_ms < now_ms) {
+ /* Borrow one second for the millisecond subtraction. */
+ tvp->tv_usec = ((shortest->when_ms+1000) - now_ms)*1000;
+ tvp->tv_sec --;
+ } else {
+ tvp->tv_usec = (shortest->when_ms - now_ms)*1000;
+ }
+ /* A timer already expired yields a negative value: clamp to zero. */
+ if (tvp->tv_sec < 0) tvp->tv_sec = 0;
+ if (tvp->tv_usec < 0) tvp->tv_usec = 0;
+ } else {
+ /* If we have to check for events but need to return
+ * ASAP because of AE_DONT_WAIT we need to set the timeout
+ * to zero */
+ if (flags & AE_DONT_WAIT) {
+ tv.tv_sec = tv.tv_usec = 0;
+ tvp = &tv;
+ } else {
+ /* Otherwise we can block */
+ tvp = NULL; /* wait forever */
+ }
+ }
+
+ numevents = aeApiPoll(eventLoop, tvp);
+ for (j = 0; j < numevents; j++) {
+ aeFileEvent *fe = &eventLoop->events[eventLoop->fired[j].fd];
+ int mask = eventLoop->fired[j].mask;
+ int fd = eventLoop->fired[j].fd;
+ int rfired = 0;
+
+ /* note the fe->mask & mask & ... code: maybe an already processed
+ * event removed an element that fired and we still didn't
+ * processed, so we check if the event is still valid. */
+ if (fe->mask & mask & AE_READABLE) {
+ rfired = 1;
+ fe->rfileProc(eventLoop,fd,fe->clientData,mask);
+ }
+ if (fe->mask & mask & AE_WRITABLE) {
+ /* Don't call the same handler twice for one fired fd. */
+ if (!rfired || fe->wfileProc != fe->rfileProc)
+ fe->wfileProc(eventLoop,fd,fe->clientData,mask);
+ }
+ processed++;
+ }
+ }
+ /* Check time events */
+ if (flags & AE_TIME_EVENTS)
+ processed += processTimeEvents(eventLoop);
+
+ return processed; /* return the number of processed file/time events */
+}
+
+/* Wait up to 'milliseconds' for the file descriptor 'fd' to become
+ * readable and/or writable, as requested by 'mask' (AE_READABLE,
+ * AE_WRITABLE or both). Exceptional conditions are not monitored.
+ *
+ * Returns a mask of the conditions that are true, 0 if the timeout
+ * expired, or -1 on error. An 'fd' that cannot be stored in an fd_set
+ * (negative, or >= FD_SETSIZE) is reported as an error as well. */
+int aeWait(int fd, int mask, long long milliseconds) {
+    struct timeval tv;
+    fd_set rfds, wfds, efds;
+    int retmask = 0, retval;
+
+    /* FD_SET() on a descriptor outside [0, FD_SETSIZE) is undefined
+     * behaviour: it writes outside the fd_set. */
+    if (fd < 0 || fd >= FD_SETSIZE) return -1;
+
+    tv.tv_sec = milliseconds/1000;
+    tv.tv_usec = (milliseconds%1000)*1000;
+    FD_ZERO(&rfds);
+    FD_ZERO(&wfds);
+    FD_ZERO(&efds);
+
+    if (mask & AE_READABLE) FD_SET(fd,&rfds);
+    if (mask & AE_WRITABLE) FD_SET(fd,&wfds);
+    if ((retval = select(fd+1, &rfds, &wfds, &efds, &tv)) > 0) {
+        if (FD_ISSET(fd,&rfds)) retmask |= AE_READABLE;
+        if (FD_ISSET(fd,&wfds)) retmask |= AE_WRITABLE;
+        return retmask;
+    } else {
+        return retval;
+    }
+}
+
+/* Run the event loop until the stop flag is set (see aeStop()).
+ *
+ * The flag is cleared on entry. Each iteration first calls the
+ * before-sleep hook, when one is installed, and then processes all kinds
+ * of events. */
+void aeMain(aeEventLoop *eventLoop)
+{
+    for (eventLoop->stop = 0; !eventLoop->stop; ) {
+        if (eventLoop->beforesleep)
+            eventLoop->beforesleep(eventLoop);
+        aeProcessEvents(eventLoop, AE_ALL_EVENTS);
+    }
+}
+
+/* Return the name reported by the multiplexing backend compiled in
+ * (the result of its aeApiName() function). */
+char *aeGetApiName(void)
+{
+    return aeApiName();
+}
+
+/* Install 'beforesleep' as the hook aeMain() calls at the start of every
+ * iteration, before processing events. Pass NULL to remove the hook. */
+void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep)
+{
+    eventLoop->beforesleep = beforesleep;
+}
diff --git a/src/ae.h b/src/ae.h
new file mode 100644
index 000000000..a9db18ed9
--- /dev/null
+++ b/src/ae.h
@@ -0,0 +1,117 @@
+/* A simple event-driven programming library. Originally I wrote this code
+ * for the Jim's event-loop (Jim is a Tcl interpreter) but later translated
+ * it in form of a library for easy reuse.
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __AE_H__
+#define __AE_H__
+
+ /* Size of the events[] and fired[] arrays in aeEventLoop, which are
+  * indexed by file descriptor. */
+ #define AE_SETSIZE (1024*10) /* Max number of fd supported */
+
+ /* Generic status codes. */
+ #define AE_OK 0
+ #define AE_ERR -1
+
+ /* File event mask bits; AE_READABLE and AE_WRITABLE may be OR-ed. */
+ #define AE_NONE 0
+ #define AE_READABLE 1
+ #define AE_WRITABLE 2
+
+ /* Flags accepted by aeProcessEvents(). */
+ #define AE_FILE_EVENTS 1
+ #define AE_TIME_EVENTS 2
+ #define AE_ALL_EVENTS (AE_FILE_EVENTS|AE_TIME_EVENTS)
+ /* NOTE(review): presumably asks aeProcessEvents() not to block; the code
+  * interpreting this flag is not visible here -- confirm in ae.c. */
+ #define AE_DONT_WAIT 4
+
+ /* NOTE(review): presumably returned by a time event handler to signal it
+  * should not be rescheduled -- confirm in ae.c. */
+ #define AE_NOMORE -1
+
+ /* Macros */
+ /* Evaluate V as a void expression (silences unused-variable warnings). */
+ #define AE_NOTUSED(V) ((void) V)
+
+ struct aeEventLoop;
+
+ /* Types and data structures */
+ /* Handler for a file event; 'mask' is the set of conditions that fired. */
+ typedef void aeFileProc(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask);
+ /* Handler for a time event identified by 'id'. */
+ typedef int aeTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData);
+ /* Finalizer attached to a time event. */
+ typedef void aeEventFinalizerProc(struct aeEventLoop *eventLoop, void *clientData);
+ /* Hook called by aeMain() before each loop iteration. */
+ typedef void aeBeforeSleepProc(struct aeEventLoop *eventLoop);
+
+ /* File event structure: one slot per file descriptor in
+  * aeEventLoop.events[], indexed by fd. */
+ typedef struct aeFileEvent {
+ int mask; /* AE_NONE, or an OR of AE_READABLE / AE_WRITABLE */
+ aeFileProc *rfileProc; /* called when the fd fires as readable */
+ aeFileProc *wfileProc; /* called when the fd fires as writable */
+ void *clientData; /* opaque pointer passed to both handlers */
+ } aeFileEvent;
+
+ /* Time event structure: a node of the singly linked list rooted at
+  * aeEventLoop.timeEventHead. */
+ typedef struct aeTimeEvent {
+ long long id; /* time event identifier. */
+ long when_sec; /* seconds */
+ long when_ms; /* milliseconds */
+ aeTimeProc *timeProc; /* handler for this time event */
+ aeEventFinalizerProc *finalizerProc; /* optional finalizer callback */
+ void *clientData; /* opaque pointer passed to the callbacks */
+ struct aeTimeEvent *next; /* next time event in the list */
+ } aeTimeEvent;
+
+ /* A fired event: filled in by the backend's aeApiPoll() and consumed by
+  * aeProcessEvents(). */
+ typedef struct aeFiredEvent {
+ int fd; /* file descriptor that became ready */
+ int mask; /* AE_READABLE / AE_WRITABLE conditions that fired */
+ } aeFiredEvent;
+
+ /* State of an event based program */
+ typedef struct aeEventLoop {
+ int maxfd; /* upper bound of the fds scanned by the select() backend */
+ long long timeEventNextId; /* NOTE(review): presumably the id given to the next time event -- confirm in ae.c */
+ aeFileEvent events[AE_SETSIZE]; /* Registered events */
+ aeFiredEvent fired[AE_SETSIZE]; /* Fired events */
+ aeTimeEvent *timeEventHead; /* head of the time event list */
+ int stop; /* aeMain() keeps looping while this is zero */
+ void *apidata; /* This is used for polling API specific data */
+ aeBeforeSleepProc *beforesleep; /* optional hook run by aeMain() before each iteration */
+ } aeEventLoop;
+
+ /* Prototypes */
+ /* Event loop lifecycle. */
+ aeEventLoop *aeCreateEventLoop(void);
+ void aeDeleteEventLoop(aeEventLoop *eventLoop);
+ void aeStop(aeEventLoop *eventLoop);
+ /* File events: 'mask' is an OR of AE_READABLE / AE_WRITABLE. */
+ int aeCreateFileEvent(aeEventLoop *eventLoop, int fd, int mask,
+ aeFileProc *proc, void *clientData);
+ void aeDeleteFileEvent(aeEventLoop *eventLoop, int fd, int mask);
+ /* Time events. */
+ long long aeCreateTimeEvent(aeEventLoop *eventLoop, long long milliseconds,
+ aeTimeProc *proc, void *clientData,
+ aeEventFinalizerProc *finalizerProc);
+ int aeDeleteTimeEvent(aeEventLoop *eventLoop, long long id);
+ /* Returns the number of processed file/time events. */
+ int aeProcessEvents(aeEventLoop *eventLoop, int flags);
+ /* Wait on a single fd outside of the event loop, using select(2). */
+ int aeWait(int fd, int mask, long long milliseconds);
+ /* Loop until the 'stop' flag of the event loop is set. */
+ void aeMain(aeEventLoop *eventLoop);
+ /* Name of the polling backend in use. */
+ char *aeGetApiName(void);
+ void aeSetBeforeSleepProc(aeEventLoop *eventLoop, aeBeforeSleepProc *beforesleep);
+
+#endif
diff --git a/src/ae_epoll.c b/src/ae_epoll.c
new file mode 100644
index 000000000..d48977b65
--- /dev/null
+++ b/src/ae_epoll.c
@@ -0,0 +1,91 @@
+/* Linux epoll(2) based ae.c module
+ * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <sys/epoll.h>
+
+ /* epoll backend private state, stored in aeEventLoop.apidata. */
+ typedef struct aeApiState {
+ int epfd; /* descriptor returned by epoll_create() */
+ struct epoll_event events[AE_SETSIZE]; /* output buffer for epoll_wait() */
+ } aeApiState;
+
+ /* Allocate the epoll backend state and attach it to 'eventLoop'.
+  * Returns 0 on success, -1 if the allocation or epoll_create() fails.
+  * On failure nothing is left allocated and eventLoop->apidata is not
+  * modified. */
+ static int aeApiCreate(aeEventLoop *eventLoop) {
+     aeApiState *state = zmalloc(sizeof(aeApiState));
+
+     if (!state) return -1;
+     state->epfd = epoll_create(1024); /* 1024 is just an hint for the kernel */
+     if (state->epfd == -1) {
+         /* Don't leak the state when the kernel refuses the epoll fd. */
+         zfree(state);
+         return -1;
+     }
+     eventLoop->apidata = state;
+     return 0;
+ }
+
+ /* Release the epoll backend state: close the epoll descriptor and free
+  * the structure stored in eventLoop->apidata. */
+ static void aeApiFree(aeEventLoop *eventLoop) {
+ aeApiState *state = eventLoop->apidata;
+
+ close(state->epfd);
+ zfree(state);
+ }
+
+ /* Start monitoring 'fd' for the conditions in 'mask', in addition to the
+  * ones already recorded in eventLoop->events[fd].mask.
+  * Returns 0 on success, -1 if epoll_ctl() fails. */
+ static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+     aeApiState *state = eventLoop->apidata;
+     int registered = eventLoop->events[fd].mask;
+     int wanted = mask | registered; /* merge old events */
+     struct epoll_event ee;
+     int op;
+
+     /* An fd that is not monitored yet needs ADD, otherwise MOD. */
+     if (registered == AE_NONE)
+         op = EPOLL_CTL_ADD;
+     else
+         op = EPOLL_CTL_MOD;
+
+     ee.data.u64 = 0; /* avoid valgrind warning */
+     ee.data.fd = fd;
+     ee.events = 0;
+     if (wanted & AE_READABLE) ee.events |= EPOLLIN;
+     if (wanted & AE_WRITABLE) ee.events |= EPOLLOUT;
+
+     return (epoll_ctl(state->epfd,op,fd,&ee) == -1) ? -1 : 0;
+ }
+
+ /* Stop monitoring 'fd' for the conditions in 'delmask'. If other
+  * conditions remain registered the epoll entry is modified, otherwise it
+  * is removed. Errors from epoll_ctl() are ignored. */
+ static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int delmask) {
+     aeApiState *state = eventLoop->apidata;
+     int remaining = eventLoop->events[fd].mask & (~delmask);
+     struct epoll_event ee;
+     int op;
+
+     ee.data.u64 = 0; /* avoid valgrind warning */
+     ee.data.fd = fd;
+     ee.events = 0;
+     if (remaining & AE_READABLE) ee.events |= EPOLLIN;
+     if (remaining & AE_WRITABLE) ee.events |= EPOLLOUT;
+
+     /* Note, Kernel < 2.6.9 requires a non null event pointer even for
+      * EPOLL_CTL_DEL. */
+     op = (remaining != AE_NONE) ? EPOLL_CTL_MOD : EPOLL_CTL_DEL;
+     epoll_ctl(state->epfd,op,fd,&ee);
+ }
+
+ /* Wait for events with epoll_wait() and copy them into
+  * eventLoop->fired[]. 'tvp' is the maximum time to wait, or NULL to
+  * block indefinitely. Returns the number of fired events (0 on timeout
+  * or error). */
+ static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+     aeApiState *state = eventLoop->apidata;
+     int nready, i;
+
+     nready = epoll_wait(state->epfd,state->events,AE_SETSIZE,
+             tvp ? (tvp->tv_sec*1000 + tvp->tv_usec/1000) : -1);
+     if (nready <= 0) return 0;
+
+     for (i = 0; i < nready; i++) {
+         struct epoll_event *e = &state->events[i];
+         int fired = 0;
+
+         if (e->events & EPOLLIN) fired |= AE_READABLE;
+         if (e->events & EPOLLOUT) fired |= AE_WRITABLE;
+         eventLoop->fired[i].fd = e->data.fd;
+         eventLoop->fired[i].mask = fired;
+     }
+     return nready;
+ }
+
+ /* Name of this polling backend. */
+ static char *aeApiName(void) {
+ return "epoll";
+ }
diff --git a/src/ae_kqueue.c b/src/ae_kqueue.c
new file mode 100644
index 000000000..04c3536ba
--- /dev/null
+++ b/src/ae_kqueue.c
@@ -0,0 +1,93 @@
+/* Kqueue(2)-based ae.c module
+ * Copyright (C) 2009 Harish Mallipeddi - harish.mallipeddi@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+
+ /* kqueue backend private state, stored in aeEventLoop.apidata. */
+ typedef struct aeApiState {
+ int kqfd; /* descriptor returned by kqueue() */
+ struct kevent events[AE_SETSIZE]; /* output buffer for kevent() */
+ } aeApiState;
+
+ /* Allocate the kqueue backend state and attach it to 'eventLoop'.
+  * Returns 0 on success, -1 if the allocation or kqueue() fails.
+  * On failure nothing is left allocated and eventLoop->apidata is not
+  * modified. */
+ static int aeApiCreate(aeEventLoop *eventLoop) {
+     aeApiState *state = zmalloc(sizeof(aeApiState));
+
+     if (!state) return -1;
+     state->kqfd = kqueue();
+     if (state->kqfd == -1) {
+         /* Don't leak the state when the kernel queue can't be created. */
+         zfree(state);
+         return -1;
+     }
+     eventLoop->apidata = state;
+
+     return 0;
+ }
+
+ /* Release the kqueue backend state: close the kqueue descriptor and free
+  * the structure stored in eventLoop->apidata. */
+ static void aeApiFree(aeEventLoop *eventLoop) {
+ aeApiState *state = eventLoop->apidata;
+
+ close(state->kqfd);
+ zfree(state);
+ }
+
+ /* Register 'fd' with the kernel queue for every condition set in 'mask'.
+  * Each filter is added with its own kevent() call. Returns 0 on success,
+  * -1 as soon as one registration fails. */
+ static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+     aeApiState *state = eventLoop->apidata;
+     struct kevent change;
+     int wantread = mask & AE_READABLE;
+     int wantwrite = mask & AE_WRITABLE;
+
+     if (wantread) {
+         EV_SET(&change, fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
+         if (kevent(state->kqfd, &change, 1, NULL, 0, NULL) == -1)
+             return -1;
+     }
+     if (wantwrite) {
+         EV_SET(&change, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
+         if (kevent(state->kqfd, &change, 1, NULL, 0, NULL) == -1)
+             return -1;
+     }
+     return 0;
+ }
+
+ /* Remove from the kernel queue the filters of 'fd' that correspond to the
+  * conditions set in 'mask'. Errors from kevent() are ignored. */
+ static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
+     aeApiState *state = eventLoop->apidata;
+     struct kevent change;
+     int dropread = mask & AE_READABLE;
+     int dropwrite = mask & AE_WRITABLE;
+
+     if (dropread) {
+         EV_SET(&change, fd, EVFILT_READ, EV_DELETE, 0, 0, NULL);
+         kevent(state->kqfd, &change, 1, NULL, 0, NULL);
+     }
+     if (dropwrite) {
+         EV_SET(&change, fd, EVFILT_WRITE, EV_DELETE, 0, 0, NULL);
+         kevent(state->kqfd, &change, 1, NULL, 0, NULL);
+     }
+ }
+
+ /* Wait for events with kevent() and copy them into eventLoop->fired[].
+  * 'tvp' is the maximum time to wait, or NULL to block indefinitely.
+  * Returns the number of fired events (0 on timeout or error). */
+ static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+     aeApiState *state = eventLoop->apidata;
+     struct timespec timeout, *tsp = NULL;
+     int nready, i;
+
+     /* kevent() wants a timespec: convert microseconds to nanoseconds. */
+     if (tvp != NULL) {
+         timeout.tv_sec = tvp->tv_sec;
+         timeout.tv_nsec = tvp->tv_usec * 1000;
+         tsp = &timeout;
+     }
+     nready = kevent(state->kqfd, NULL, 0, state->events, AE_SETSIZE, tsp);
+     if (nready <= 0) return 0;
+
+     for (i = 0; i < nready; i++) {
+         struct kevent *e = &state->events[i];
+         int fired = 0;
+
+         if (e->filter == EVFILT_READ) fired |= AE_READABLE;
+         if (e->filter == EVFILT_WRITE) fired |= AE_WRITABLE;
+         eventLoop->fired[i].fd = e->ident;
+         eventLoop->fired[i].mask = fired;
+     }
+     return nready;
+ }
+
+ /* Name of this polling backend. */
+ static char *aeApiName(void) {
+ return "kqueue";
+ }
diff --git a/src/ae_select.c b/src/ae_select.c
new file mode 100644
index 000000000..43f5867f3
--- /dev/null
+++ b/src/ae_select.c
@@ -0,0 +1,72 @@
+/* Select()-based ae.c module
+ * Copyright (C) 2009-2010 Salvatore Sanfilippo - antirez@gmail.com
+ * Released under the BSD license. See the COPYING file for more info. */
+
+#include <string.h>
+
+ /* select() backend private state, stored in aeEventLoop.apidata. */
+ typedef struct aeApiState {
+ /* Master sets: the descriptors currently registered for read / write. */
+ fd_set rfds, wfds;
+ /* We need to have a copy of the fd sets as it's not safe to reuse
+ * FD sets after select(). */
+ fd_set _rfds, _wfds;
+ } aeApiState;
+
+ /* Allocate the select() backend state, with both master sets empty, and
+  * attach it to 'eventLoop'. Returns 0 on success, -1 if the allocation
+  * fails. */
+ static int aeApiCreate(aeEventLoop *eventLoop) {
+     aeApiState *st = zmalloc(sizeof(aeApiState));
+
+     if (st == NULL) return -1;
+     FD_ZERO(&st->rfds);
+     FD_ZERO(&st->wfds);
+     eventLoop->apidata = st;
+     return 0;
+ }
+
+ /* Release the select() backend state stored in eventLoop->apidata. */
+ static void aeApiFree(aeEventLoop *eventLoop) {
+ zfree(eventLoop->apidata);
+ }
+
+ /* Add 'fd' to the master read and/or write set according to 'mask'.
+  * Always returns 0. */
+ static int aeApiAddEvent(aeEventLoop *eventLoop, int fd, int mask) {
+     aeApiState *st = eventLoop->apidata;
+
+     if (mask & AE_READABLE) {
+         FD_SET(fd,&st->rfds);
+     }
+     if (mask & AE_WRITABLE) {
+         FD_SET(fd,&st->wfds);
+     }
+     return 0;
+ }
+
+ /* Remove 'fd' from the master read and/or write set according to
+  * 'mask'. */
+ static void aeApiDelEvent(aeEventLoop *eventLoop, int fd, int mask) {
+     aeApiState *st = eventLoop->apidata;
+
+     if (mask & AE_READABLE) {
+         FD_CLR(fd,&st->rfds);
+     }
+     if (mask & AE_WRITABLE) {
+         FD_CLR(fd,&st->wfds);
+     }
+ }
+
+ /* Wait for events with select() and report the ready descriptors in
+  * eventLoop->fired[]. 'tvp' is the maximum time to wait, or NULL to
+  * block indefinitely. Returns the number of fired events (0 on timeout
+  * or error).
+  *
+  * Only descriptors for which at least one registered condition is
+  * actually set are reported: a registered but idle fd is not added to
+  * fired[], so the returned count matches the real number of ready
+  * descriptors. */
+ static int aeApiPoll(aeEventLoop *eventLoop, struct timeval *tvp) {
+     aeApiState *state = eventLoop->apidata;
+     int retval, j, numevents = 0;
+
+     /* select() modifies the sets it is given: work on copies so the
+      * master sets keep describing what is registered. */
+     memcpy(&state->_rfds,&state->rfds,sizeof(fd_set));
+     memcpy(&state->_wfds,&state->wfds,sizeof(fd_set));
+
+     retval = select(eventLoop->maxfd+1,
+                 &state->_rfds,&state->_wfds,NULL,tvp);
+     if (retval > 0) {
+         for (j = 0; j <= eventLoop->maxfd; j++) {
+             int mask = 0;
+             aeFileEvent *fe = &eventLoop->events[j];
+
+             if (fe->mask == AE_NONE) continue;
+             if (fe->mask & AE_READABLE && FD_ISSET(j,&state->_rfds))
+                 mask |= AE_READABLE;
+             if (fe->mask & AE_WRITABLE && FD_ISSET(j,&state->_wfds))
+                 mask |= AE_WRITABLE;
+             /* Registered but not ready: nothing fired for this fd. */
+             if (mask == AE_NONE) continue;
+             eventLoop->fired[numevents].fd = j;
+             eventLoop->fired[numevents].mask = mask;
+             numevents++;
+         }
+     }
+     return numevents;
+ }
+
+ /* Name of this polling backend. */
+ static char *aeApiName(void) {
+ return "select";
+ }
diff --git a/src/anet.c b/src/anet.c
new file mode 100644
index 000000000..4fe811a11
--- /dev/null
+++ b/src/anet.c
@@ -0,0 +1,270 @@
+/* anet.c -- Basic TCP socket stuff made a bit less boring
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fmacros.h"
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <netdb.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "anet.h"
+
+ /* Format a printf-style message into 'err', writing at most ANET_ERR_LEN
+  * bytes. Nothing is done when 'err' is NULL, so callers may pass NULL if
+  * they are not interested in the error text. */
+ static void anetSetError(char *err, const char *fmt, ...)
+ {
+     if (err != NULL) {
+         va_list args;
+
+         va_start(args, fmt);
+         vsnprintf(err, ANET_ERR_LEN, fmt, args);
+         va_end(args);
+     }
+ }
+
+ /* Put 'fd' in non blocking mode by adding O_NONBLOCK to its file status
+  * flags. Returns ANET_OK on success; on failure ANET_ERR is returned and
+  * a description is stored in 'err'. */
+ int anetNonBlock(char *err, int fd)
+ {
+     /* Note that fcntl(2) for F_GETFL and F_SETFL can't be
+      * interrupted by a signal. */
+     int current = fcntl(fd, F_GETFL);
+
+     if (current == -1) {
+         anetSetError(err, "fcntl(F_GETFL): %s\n", strerror(errno));
+         return ANET_ERR;
+     }
+     if (fcntl(fd, F_SETFL, current | O_NONBLOCK) == -1) {
+         anetSetError(err, "fcntl(F_SETFL,O_NONBLOCK): %s\n", strerror(errno));
+         return ANET_ERR;
+     }
+     return ANET_OK;
+ }
+
+ /* Enable the TCP_NODELAY option on socket 'fd'. Returns ANET_OK on
+  * success; on failure ANET_ERR is returned and a description is stored
+  * in 'err'. */
+ int anetTcpNoDelay(char *err, int fd)
+ {
+     int enable = 1;
+     int rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &enable, sizeof(enable));
+
+     if (rc != -1) return ANET_OK;
+     anetSetError(err, "setsockopt TCP_NODELAY: %s\n", strerror(errno));
+     return ANET_ERR;
+ }
+
+ /* Set the SO_SNDBUF option of socket 'fd' to 'buffsize'. Returns ANET_OK
+  * on success; on failure ANET_ERR is returned and a description is
+  * stored in 'err'. */
+ int anetSetSendBuffer(char *err, int fd, int buffsize)
+ {
+     int rc = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buffsize, sizeof(buffsize));
+
+     if (rc != -1) return ANET_OK;
+     anetSetError(err, "setsockopt SO_SNDBUF: %s\n", strerror(errno));
+     return ANET_ERR;
+ }
+
+ /* Enable the SO_KEEPALIVE option on socket 'fd'. Returns ANET_OK on
+  * success; on failure ANET_ERR is returned and a description is stored
+  * in 'err'. */
+ int anetTcpKeepAlive(char *err, int fd)
+ {
+     int enable = 1;
+     int rc = setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &enable, sizeof(enable));
+
+     if (rc != -1) return ANET_OK;
+     anetSetError(err, "setsockopt SO_KEEPALIVE: %s\n", strerror(errno));
+     return ANET_ERR;
+ }
+
+ /* Resolve 'host' to an IPv4 address and store its dotted-quad text form
+  * in 'ipbuf'. 'host' is first parsed as a numeric address; only if that
+  * fails gethostbyname() is used. Returns ANET_OK on success; if the name
+  * can't be resolved ANET_ERR is returned and a description is stored in
+  * 'err'.
+  *
+  * The result is copied with strcpy(): 'ipbuf' must be large enough for
+  * the string produced by inet_ntoa(). */
+ int anetResolve(char *err, char *host, char *ipbuf)
+ {
+     struct sockaddr_in sa;
+     struct hostent *entry;
+
+     sa.sin_family = AF_INET;
+     if (inet_aton(host, &sa.sin_addr) != 0) {
+         /* Already a numeric address. */
+         strcpy(ipbuf,inet_ntoa(sa.sin_addr));
+         return ANET_OK;
+     }
+
+     entry = gethostbyname(host);
+     if (entry == NULL) {
+         anetSetError(err, "can't resolve: %s\n", host);
+         return ANET_ERR;
+     }
+     memcpy(&sa.sin_addr, entry->h_addr, sizeof(struct in_addr));
+     strcpy(ipbuf,inet_ntoa(sa.sin_addr));
+     return ANET_OK;
+ }
+
+ /* Flags for anetTcpGenericConnect(). */
+ #define ANET_CONNECT_NONE 0
+ #define ANET_CONNECT_NONBLOCK 1
+
+ /* Create a TCP socket and connect it to 'addr':'port'. 'addr' is parsed
+  * as a numeric IPv4 address first and resolved with gethostbyname() if
+  * that fails. With ANET_CONNECT_NONBLOCK in 'flags' the socket is made
+  * non blocking before connect(), and a connection still in progress
+  * (EINPROGRESS) is reported as success.
+  *
+  * Returns the socket descriptor on success. On failure ANET_ERR is
+  * returned, a description is stored in 'err', and the socket is closed
+  * on every error path so no descriptor is leaked. */
+ static int anetTcpGenericConnect(char *err, char *addr, int port, int flags)
+ {
+     int s, on = 1;
+     struct sockaddr_in sa;
+
+     if ((s = socket(AF_INET, SOCK_STREAM, 0)) == -1) {
+         anetSetError(err, "creating socket: %s\n", strerror(errno));
+         return ANET_ERR;
+     }
+     /* Make sure connection-intensive things like the redis benchmark
+      * will be able to close/open sockets a zillion of times */
+     setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+     sa.sin_family = AF_INET;
+     sa.sin_port = htons(port);
+     if (inet_aton(addr, &sa.sin_addr) == 0) {
+         struct hostent *he;
+
+         he = gethostbyname(addr);
+         if (he == NULL) {
+             anetSetError(err, "can't resolve: %s\n", addr);
+             close(s);
+             return ANET_ERR;
+         }
+         memcpy(&sa.sin_addr, he->h_addr, sizeof(struct in_addr));
+     }
+     if (flags & ANET_CONNECT_NONBLOCK) {
+         if (anetNonBlock(err,s) != ANET_OK) {
+             /* anetNonBlock() already filled 'err': just release the
+              * socket instead of leaking it. */
+             close(s);
+             return ANET_ERR;
+         }
+     }
+     if (connect(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) {
+         if (errno == EINPROGRESS &&
+             flags & ANET_CONNECT_NONBLOCK)
+             return s;
+
+         anetSetError(err, "connect: %s\n", strerror(errno));
+         close(s);
+         return ANET_ERR;
+     }
+     return s;
+ }
+
+ /* Connect to 'addr':'port' with a blocking connect(). Returns the socket
+  * descriptor, or ANET_ERR with a description stored in 'err'. */
+ int anetTcpConnect(char *err, char *addr, int port)
+ {
+ return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONE);
+ }
+
+ /* Connect to 'addr':'port' using a non blocking socket: the returned
+  * descriptor may refer to a connection that is still in progress.
+  * Returns the socket descriptor, or ANET_ERR with a description stored
+  * in 'err'. */
+ int anetTcpNonBlockConnect(char *err, char *addr, int port)
+ {
+ return anetTcpGenericConnect(err,addr,port,ANET_CONNECT_NONBLOCK);
+ }
+
+ /* Like read(2), but keep reading until 'count' bytes have been stored in
+  * 'buf'. Returns the number of bytes read, which is smaller than 'count'
+  * only if EOF is met first, or -1 if read(2) fails. */
+ int anetRead(int fd, char *buf, int count)
+ {
+     int done = 0;
+
+     while (done != count) {
+         int got = read(fd, buf+done, count-done);
+
+         if (got == -1) return -1;
+         if (got == 0) break; /* EOF: report what we have so far */
+         done += got;
+     }
+     return done;
+ }
+
+ /* Like write(2), but keep writing until 'count' bytes of 'buf' have been
+  * written. Returns the number of bytes written, which is smaller than
+  * 'count' only if write(2) returns 0, or -1 if write(2) fails. */
+ int anetWrite(int fd, char *buf, int count)
+ {
+     int done = 0;
+
+     while (done != count) {
+         int sent = write(fd, buf+done, count-done);
+
+         if (sent == -1) return -1;
+         if (sent == 0) break; /* no progress: report what was written */
+         done += sent;
+     }
+     return done;
+ }
+
+ /* Create a TCP socket listening on 'port'. The socket is bound to
+  * 'bindaddr' (a numeric IPv4 address) when it is not NULL, to
+  * INADDR_ANY otherwise. SO_REUSEADDR is set before binding.
+  *
+  * Returns the listening socket on success. On failure ANET_ERR is
+  * returned, a description is stored in 'err' and the socket is closed. */
+ int anetTcpServer(char *err, int port, char *bindaddr)
+ {
+     struct sockaddr_in sa;
+     int reuse = 1;
+     int s = socket(AF_INET, SOCK_STREAM, 0);
+
+     if (s == -1) {
+         anetSetError(err, "socket: %s\n", strerror(errno));
+         return ANET_ERR;
+     }
+     if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)) == -1) {
+         anetSetError(err, "setsockopt SO_REUSEADDR: %s\n", strerror(errno));
+         goto fail;
+     }
+
+     memset(&sa,0,sizeof(sa));
+     sa.sin_family = AF_INET;
+     sa.sin_port = htons(port);
+     sa.sin_addr.s_addr = htonl(INADDR_ANY);
+     if (bindaddr && inet_aton(bindaddr, &sa.sin_addr) == 0) {
+         anetSetError(err, "Invalid bind address\n");
+         goto fail;
+     }
+
+     if (bind(s, (struct sockaddr*)&sa, sizeof(sa)) == -1) {
+         anetSetError(err, "bind: %s\n", strerror(errno));
+         goto fail;
+     }
+     if (listen(s, 511) == -1) { /* the magic 511 constant is from nginx */
+         anetSetError(err, "listen: %s\n", strerror(errno));
+         goto fail;
+     }
+     return s;
+
+ fail:
+     /* The error text has been captured already: release the socket. */
+     close(s);
+     return ANET_ERR;
+ }
+
+ /* Accept a connection on the listening socket 'serversock', retrying
+  * when accept(2) is interrupted by a signal (EINTR).
+  *
+  * On success the new socket is returned; if 'ip' is not NULL the peer
+  * address is copied there in dotted-quad form, and if 'port' is not NULL
+  * the peer port is stored there in host byte order. On failure ANET_ERR
+  * is returned and a description is stored in 'err'. */
+ int anetAccept(char *err, int serversock, char *ip, int *port)
+ {
+     struct sockaddr_in sa;
+     unsigned int saLen;
+     int fd;
+
+     do {
+         saLen = sizeof(sa);
+         fd = accept(serversock, (struct sockaddr*)&sa, &saLen);
+     } while (fd == -1 && errno == EINTR);
+
+     if (fd == -1) {
+         anetSetError(err, "accept: %s\n", strerror(errno));
+         return ANET_ERR;
+     }
+     if (ip) strcpy(ip,inet_ntoa(sa.sin_addr));
+     if (port) *port = ntohs(sa.sin_port);
+     return fd;
+ }
diff --git a/src/anet.h b/src/anet.h
new file mode 100644
index 000000000..ce0f47787
--- /dev/null
+++ b/src/anet.h
@@ -0,0 +1,49 @@
+/* anet.c -- Basic TCP socket stuff made a bit less boring
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ANET_H
+#define ANET_H
+
+ /* Status codes returned by the anet functions. Functions returning a
+  * socket descriptor return ANET_ERR on failure. */
+ #define ANET_OK 0
+ #define ANET_ERR -1
+ /* Maximum number of bytes written into the 'err' buffer that callers
+  * pass to the functions below. */
+ #define ANET_ERR_LEN 256
+
+ /* Connection setup */
+ int anetTcpConnect(char *err, char *addr, int port);
+ int anetTcpNonBlockConnect(char *err, char *addr, int port);
+ int anetTcpServer(char *err, int port, char *bindaddr);
+ int anetAccept(char *err, int serversock, char *ip, int *port);
+ int anetResolve(char *err, char *host, char *ipbuf);
+
+ /* Blocking read / write helpers */
+ int anetRead(int fd, char *buf, int count);
+ int anetWrite(int fd, char *buf, int count);
+
+ /* Socket options */
+ int anetNonBlock(char *err, int fd);
+ int anetTcpNoDelay(char *err, int fd);
+ int anetTcpKeepAlive(char *err, int fd);
+ int anetSetSendBuffer(char *err, int fd, int buffsize);
+
+#endif
diff --git a/src/aof.c b/src/aof.c
new file mode 100644
index 000000000..51054b296
--- /dev/null
+++ b/src/aof.c
@@ -0,0 +1,694 @@
+#include "redis.h"
+
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+ /* Called when the user switches from "appendonly yes" to "appendonly no"
+  * at runtime using the CONFIG command.
+  *
+  * The pending AOF buffer is flushed and synced, the AOF file is closed
+  * and the AOF state is reset. If a background AOF rewrite is in progress
+  * its child is killed and the buffer of accumulated differences is
+  * discarded. A background RDB save (bgsavechildpid) is a different
+  * child and is left untouched. */
+ void stopAppendOnly(void) {
+     flushAppendOnlyFile();
+     aof_fsync(server.appendfd);
+     close(server.appendfd);
+
+     server.appendfd = -1;
+     server.appendseldb = -1;
+     server.appendonly = 0;
+     /* rewrite operation in progress? kill it, wait child exit */
+     if (server.bgrewritechildpid != -1) {
+         int statloc;
+
+         if (kill(server.bgrewritechildpid,SIGKILL) != -1)
+             wait3(&statloc,0,NULL);
+         /* reset the buffer accumulating changes while the child saves */
+         sdsfree(server.bgrewritebuf);
+         server.bgrewritebuf = sdsempty();
+         server.bgrewritechildpid = -1;
+     }
+ }
+
+ /* Called when the user switches from "appendonly no" to "appendonly yes"
+  * at runtime using the CONFIG command.
+  *
+  * Opens (creating it if needed) the append only file and starts a
+  * background rewrite. Returns REDIS_OK on success. On failure REDIS_ERR
+  * is returned and the AOF state is rolled back: server.appendonly is 0
+  * and server.appendfd is -1, so that no other code path believes AOF is
+  * active or uses a closed descriptor. */
+ int startAppendOnly(void) {
+     server.appendonly = 1;
+     server.lastfsync = time(NULL);
+     server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
+     if (server.appendfd == -1) {
+         redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, but I can't open the AOF file: %s",strerror(errno));
+         /* open() already left appendfd at -1: only the flag needs undoing. */
+         server.appendonly = 0;
+         return REDIS_ERR;
+     }
+     if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
+         server.appendonly = 0;
+         close(server.appendfd);
+         server.appendfd = -1;
+         /* The format has no conversion: no extra argument is passed. */
+         redisLog(REDIS_WARNING,"Used tried to switch on AOF via CONFIG, I can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.");
+         return REDIS_ERR;
+     }
+     return REDIS_OK;
+ }
+
+ /* Write the append only file buffer on disk.
+ *
+ * Since we are required to write the AOF before replying to the client,
+ * and the only way the client socket can get a write is entering the
+ * event loop, we accumulate all the AOF writes in a memory
+ * buffer and write it on disk using this function just before entering
+ * the event loop again.
+ *
+ * The function returns without doing anything if the buffer is empty.
+ * A failed or short write is considered fatal: it is logged and the
+ * process exits. */
+ void flushAppendOnlyFile(void) {
+ time_t now;
+ ssize_t nwritten;
+
+ if (sdslen(server.aofbuf) == 0) return;
+
+ /* We want to perform a single write. This should be guaranteed atomic
+ * at least if the filesystem we are writing is a real physical one.
+ * While this will save us against the server being killed I don't think
+ * there is much to do about the whole server stopping for power problems
+ * or alike */
+ nwritten = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
+ if (nwritten != (signed)sdslen(server.aofbuf)) {
+ /* Ooops, we are in troubles. The best thing to do for now is
+ * aborting instead of giving the illusion that everything is
+ * working as expected. */
+ if (nwritten == -1) {
+ redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
+ } else {
+ /* NOTE(review): on a short write write(2) did not fail, so errno
+ * is not set by it and the text logged here may be unrelated. */
+ redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
+ }
+ exit(1);
+ }
+ /* Everything was written: restart from an empty buffer. */
+ sdsfree(server.aofbuf);
+ server.aofbuf = sdsempty();
+
+ /* Don't Fsync if no-appendfsync-on-rewrite is set to yes and we have
+ * children performing heavy I/O on disk. */
+ if (server.no_appendfsync_on_rewrite &&
+ (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
+ return;
+ /* Fsync if needed: always with APPENDFSYNC_ALWAYS, or when more than one
+ * second elapsed since the last fsync with APPENDFSYNC_EVERYSEC. */
+ now = time(NULL);
+ if (server.appendfsync == APPENDFSYNC_ALWAYS ||
+ (server.appendfsync == APPENDFSYNC_EVERYSEC &&
+ now-server.lastfsync > 1))
+ {
+ /* aof_fsync is defined as fdatasync() for Linux in order to avoid
+ * flushing metadata. */
+ aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
+ server.lastfsync = now;
+ }
+ }
+
+ /* Append to 'buf' the command made of the 'argc' arguments in 'argv',
+  * encoded as "*<argc>\r\n" followed by one "$<len>\r\n<payload>\r\n"
+  * entry per argument. Every argument is decoded to a string object
+  * before being emitted. Returns the (possibly reallocated) buffer. */
+ sds catAppendOnlyGenericCommand(sds buf, int argc, robj **argv) {
+     int i = 0;
+
+     buf = sdscatprintf(buf,"*%d\r\n",argc);
+     while (i < argc) {
+         robj *decoded = getDecodedObject(argv[i]);
+
+         buf = sdscatprintf(buf,"$%lu\r\n",(unsigned long)sdslen(decoded->ptr));
+         buf = sdscatlen(buf,decoded->ptr,sdslen(decoded->ptr));
+         buf = sdscatlen(buf,"\r\n",2);
+         decrRefCount(decoded);
+         i++;
+     }
+     return buf;
+ }
+
+ /* Append to 'buf' an EXPIREAT command for 'key'. 'seconds' is a relative
+  * time to live: it is added to the current unix time to obtain the
+  * absolute expire time written in the command. Returns the (possibly
+  * reallocated) buffer. */
+ sds catAppendOnlyExpireAtCommand(sds buf, robj *key, robj *seconds) {
+     robj *cmdargs[3];
+     robj *ttl;
+     long deadline;
+
+     /* Make sure we can use strtol */
+     ttl = getDecodedObject(seconds);
+     deadline = time(NULL)+strtol(ttl->ptr,NULL,10);
+     decrRefCount(ttl);
+
+     cmdargs[0] = createStringObject("EXPIREAT",8);
+     cmdargs[1] = key;
+     cmdargs[2] = createObject(REDIS_STRING,
+                     sdscatprintf(sdsempty(),"%ld",deadline));
+     buf = catAppendOnlyGenericCommand(buf, 3, cmdargs);
+
+     /* 'key' belongs to the caller: only release what was created here. */
+     decrRefCount(cmdargs[0]);
+     decrRefCount(cmdargs[2]);
+     return buf;
+ }
+
+ /* Append the command 'cmd', executed against database 'dictid' with
+ * arguments 'argv' / 'argc', to the in-memory AOF buffer (and to the
+ * rewrite buffer when a background rewrite is in progress).
+ *
+ * A SELECT is emitted first when 'dictid' differs from the database of
+ * the previously appended command. EXPIRE is translated into EXPIREAT and
+ * SETEX into SET + EXPIREAT. */
+ void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
+ sds buf = sdsempty();
+ robj *tmpargv[3];
+
+ /* The DB this command was targeting is not the same as the last command
+ * we appended. To issue a SELECT command is needed. */
+ if (dictid != server.appendseldb) {
+ char seldb[64];
+
+ snprintf(seldb,sizeof(seldb),"%d",dictid);
+ buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
+ (unsigned long)strlen(seldb),seldb);
+ server.appendseldb = dictid;
+ }
+
+ if (cmd->proc == expireCommand) {
+ /* Translate EXPIRE into EXPIREAT */
+ buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
+ } else if (cmd->proc == setexCommand) {
+ /* Translate SETEX to SET and EXPIREAT */
+ /* argv[1], argv[2] and argv[3] are used as key, seconds and value. */
+ tmpargv[0] = createStringObject("SET",3);
+ tmpargv[1] = argv[1];
+ tmpargv[2] = argv[3];
+ buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
+ decrRefCount(tmpargv[0]);
+ buf = catAppendOnlyExpireAtCommand(buf,argv[1],argv[2]);
+ } else {
+ buf = catAppendOnlyGenericCommand(buf,argc,argv);
+ }
+
+ /* Append to the AOF buffer. This will be flushed on disk just before
+ * re-entering the event loop, so before the client will get a
+ * positive reply about the operation performed. */
+ server.aofbuf = sdscatlen(server.aofbuf,buf,sdslen(buf));
+
+ /* If a background append only file rewriting is in progress we want to
+ * accumulate the differences between the child DB and the current one
+ * in a buffer, so that when the child process will do its work we
+ * can append the differences to the new append only file. */
+ if (server.bgrewritechildpid != -1)
+ server.bgrewritebuf = sdscatlen(server.bgrewritebuf,buf,sdslen(buf));
+
+ sdsfree(buf);
+ }
+
+ /* In Redis commands are always executed in the context of a client, so in
+ * order to load the append only file we need to create a fake client.
+ *
+ * The returned client has no socket (fd is -1), an empty query buffer, no
+ * arguments and an empty reply list. It must be released with
+ * freeFakeClient(). */
+ struct redisClient *createFakeClient(void) {
+ struct redisClient *c = zmalloc(sizeof(*c));
+
+ selectDb(c,0);
+ c->fd = -1;
+ c->querybuf = sdsempty();
+ c->argc = 0;
+ c->argv = NULL;
+ c->flags = 0;
+ /* We set the fake client as a slave waiting for the synchronization
+ * so that Redis will not try to send replies to this client. */
+ c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
+ /* Replies are accumulated in this list; the caller discards them. */
+ c->reply = listCreate();
+ listSetFreeMethod(c->reply,decrRefCount);
+ listSetDupMethod(c->reply,dupClientReplyValue);
+ initClientMultiState(c);
+ return c;
+ }
+
+ /* Release a client created with createFakeClient(): its query buffer,
+ * reply list, MULTI state and the client structure itself. */
+ void freeFakeClient(struct redisClient *c) {
+ sdsfree(c->querybuf);
+ listRelease(c->reply);
+ freeClientMultiState(c);
+ zfree(c);
+ }
+
+ /* Replay the append log file. On success REDIS_OK is returned. On non fatal
+  * error (the append only file is zero-length) REDIS_ERR is returned. On
+  * fatal error an error message is logged and the program exits.
+  *
+  * Fatal errors are: the file can't be opened, a read error, a truncated
+  * or badly formatted file, an unknown command, a command with no
+  * arguments, or a MULTI left open at the end of the file. */
+ int loadAppendOnlyFile(char *filename) {
+     struct redisClient *fakeClient;
+     FILE *fp = fopen(filename,"r");
+     struct redis_stat sb;
+     int appendonly = server.appendonly;
+
+     /* Check the stream before using it: fileno(NULL) is not valid. */
+     if (fp == NULL) {
+         redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
+         exit(1);
+     }
+
+     if (redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
+         /* Nothing to replay: don't leak the stream on this early return. */
+         fclose(fp);
+         return REDIS_ERR;
+     }
+
+     /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
+      * to the same file we're about to read. */
+     server.appendonly = 0;
+
+     fakeClient = createFakeClient();
+     while(1) {
+         int argc, j;
+         unsigned long len;
+         robj **argv;
+         char buf[128];
+         sds argsds;
+         struct redisCommand *cmd;
+         int force_swapout;
+
+         if (fgets(buf,sizeof(buf),fp) == NULL) {
+             if (feof(fp))
+                 break;
+             else
+                 goto readerr;
+         }
+         if (buf[0] != '*') goto fmterr;
+         argc = atoi(buf+1);
+         /* A command has at least its name: argv[0] is used below. */
+         if (argc < 1) goto fmterr;
+         argv = zmalloc(sizeof(robj*)*argc);
+         for (j = 0; j < argc; j++) {
+             if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
+             if (buf[0] != '$') goto fmterr;
+             len = strtol(buf+1,NULL,10);
+             argsds = sdsnewlen(NULL,len);
+             if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
+             argv[j] = createObject(REDIS_STRING,argsds);
+             if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
+         }
+
+         /* Command lookup */
+         cmd = lookupCommand(argv[0]->ptr);
+         if (!cmd) {
+             redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
+             exit(1);
+         }
+         /* Try object encoding */
+         if (cmd->flags & REDIS_CMD_BULK)
+             argv[argc-1] = tryObjectEncoding(argv[argc-1]);
+         /* Run the command in the context of a fake client */
+         fakeClient->argc = argc;
+         fakeClient->argv = argv;
+         cmd->proc(fakeClient);
+         /* Discard the reply objects list from the fake client */
+         while(listLength(fakeClient->reply))
+             listDelNode(fakeClient->reply,listFirst(fakeClient->reply));
+         /* Clean up, ready for the next command */
+         for (j = 0; j < argc; j++) decrRefCount(argv[j]);
+         zfree(argv);
+         /* Handle swapping while loading big datasets when VM is on */
+         force_swapout = 0;
+         if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
+             force_swapout = 1;
+
+         if (server.vm_enabled && force_swapout) {
+             while (zmalloc_used_memory() > server.vm_max_memory) {
+                 if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
+             }
+         }
+     }
+
+     /* This point can only be reached when EOF is reached without errors.
+      * If the client is in the middle of a MULTI/EXEC, log error and quit. */
+     if (fakeClient->flags & REDIS_MULTI) goto readerr;
+
+     fclose(fp);
+     freeFakeClient(fakeClient);
+     server.appendonly = appendonly;
+     return REDIS_OK;
+
+ readerr:
+     if (feof(fp)) {
+         redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
+     } else {
+         redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
+     }
+     exit(1);
+ fmterr:
+     redisLog(REDIS_WARNING,"Bad file format reading the append only file");
+     exit(1);
+ }
+
+/* Emit the 'len' bytes starting at 's' into 'fp' as a binary-safe bulk
+ * string: $<count>\r\n<payload>\r\n
+ *
+ * Returns 1 on success and 0 as soon as one of the fwrite() calls fails. */
+int fwriteBulkString(FILE *fp, char *s, unsigned long len) {
+    char header[128];
+    int hlen = 0;
+
+    /* Header: '$', the payload length in decimal, then CRLF. */
+    header[hlen++] = '$';
+    hlen += ll2string(header+hlen,sizeof(header)-1,len);
+    header[hlen++] = '\r';
+    header[hlen++] = '\n';
+
+    if (fwrite(header,hlen,1,fp) == 0) return 0;
+    /* A zero length payload is legal: nothing goes between the two CRLFs. */
+    if (len > 0 && fwrite(s,len,1,fp) == 0) return 0;
+    return fwrite("\r\n",2,1,fp) != 0;
+}
+
+/* Emit the double 'd' into 'fp' in bulk format $<count>\r\n<payload>\r\n
+ * where the payload is 'd' rendered with the "%.17g" format.
+ *
+ * Returns 1 on success, 0 if one of the fwrite() calls fails. */
+int fwriteBulkDouble(FILE *fp, double d) {
+    char payload[128], header[128];
+    size_t plen;
+
+    /* The payload buffer already carries the trailing CRLF, so the length
+     * advertised in the header is the string length minus those two bytes. */
+    snprintf(payload,sizeof(payload),"%.17g\r\n",d);
+    plen = strlen(payload);
+    snprintf(header,sizeof(header),"$%lu\r\n",(unsigned long)plen-2);
+
+    if (fwrite(header,strlen(header),1,fp) == 0) return 0;
+    if (fwrite(payload,plen,1,fp) == 0) return 0;
+    return 1;
+}
+
+/* Emit the long long 'l' into 'fp' in bulk format
+ * $<count>\r\n<payload>\r\n using a single fwrite() call.
+ *
+ * Returns 1 on success, 0 if the write fails. */
+int fwriteBulkLongLong(FILE *fp, long long l) {
+    char digits[128], out[128];
+    unsigned int ndigits, outlen;
+
+    /* NOTE(review): 'digits' is printed with %s below, so ll2string() is
+     * assumed to null-terminate its output -- confirm against util code. */
+    ndigits = ll2string(digits,32,l);
+    outlen = snprintf(out,sizeof(out),"$%u\r\n%s\r\n",ndigits,digits);
+    return fwrite(out,outlen,1,fp) != 0;
+}
+
+/* Write the string object 'obj' into 'fp' in bulk format, delegating to
+ * fwriteBulkLongLong() for integer encoded objects and to
+ * fwriteBulkString() for raw (sds) encoded ones.
+ *
+ * Returns what the delegate returns: 1 on success, 0 on write error.
+ * Any other encoding triggers redisPanic(). */
+int fwriteBulkObject(FILE *fp, robj *obj) {
+    /* Avoid using getDecodedObject to help copy-on-write (we are often
+     * in a child process when this function is called). */
+    if (obj->encoding == REDIS_ENCODING_INT) {
+        return fwriteBulkLongLong(fp,(long)obj->ptr);
+    } else if (obj->encoding == REDIS_ENCODING_RAW) {
+        return fwriteBulkString(fp,obj->ptr,sdslen(obj->ptr));
+    }
+    redisPanic("Unknown string encoding");
+    /* Only reached if redisPanic() ever returns: report a failure instead
+     * of falling off the end of a non-void function. */
+    return 0;
+}
+
+/* Write a sequence of commands able to fully rebuild the dataset into
+ * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
+ *
+ * The commands are first written into a temp file named after the current
+ * pid, which is then flushed, synced and rename(2)d to 'filename', so that
+ * the destination only changes if the whole dump was produced correctly.
+ *
+ * Keys whose expire time is already in the past are not emitted at all.
+ *
+ * Returns REDIS_OK on success. On any error the temp file is removed, the
+ * iterators and the swapped-value preview still alive are released, and
+ * REDIS_ERR is returned. */
+int rewriteAppendOnlyFile(char *filename) {
+    dictIterator *di = NULL;    /* Iterator over the DB being dumped. */
+    dictIterator *subdi = NULL; /* Iterator over the set/zset/hash value. */
+    robj *preview = NULL;       /* Value loaded from swap we have to free. */
+    dictEntry *de;
+    FILE *fp;
+    char tmpfile[256];
+    int j, saved_errno;
+    time_t now = time(NULL);
+
+    /* Note that we have to use a different temp name here compared to the
+     * one used by rewriteAppendOnlyFileBackground() function. */
+    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
+    fp = fopen(tmpfile,"w");
+    if (!fp) {
+        redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
+        return REDIS_ERR;
+    }
+    for (j = 0; j < server.dbnum; j++) {
+        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
+        redisDb *db = server.db+j;
+        dict *d = db->dict;
+        if (dictSize(d) == 0) continue;
+        di = dictGetIterator(d);
+        if (!di) {
+            fclose(fp);
+            unlink(tmpfile); /* Don't leave a partial temp file around. */
+            return REDIS_ERR;
+        }
+
+        /* SELECT the new DB */
+        if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
+        if (fwriteBulkLongLong(fp,j) == 0) goto werr;
+
+        /* Iterate this DB writing every entry */
+        while((de = dictNext(di)) != NULL) {
+            sds keystr = dictGetEntryKey(de);
+            robj key, *o = dictGetEntryVal(de);
+            time_t expiretime;
+
+            initStaticStringObject(key,keystr);
+            expiretime = getExpire(db,&key);
+            /* If this key is already expired skip it. The test is done
+             * before anything is written (and before a swapped value is
+             * loaded) so the key is not emitted without its expire, and
+             * there is no preview object to release. */
+            if (expiretime != -1 && expiretime < now) continue;
+
+            /* If the value for this key is swapped, load a preview in memory.
+             * We remember it in 'preview' as we need to free that object
+             * once done, while in-memory values are used as they are in
+             * order to avoid copy-on-write of pages if we are forked() */
+            if (server.vm_enabled &&
+                o->storage != REDIS_VM_MEMORY &&
+                o->storage != REDIS_VM_SWAPPING)
+            {
+                preview = vmPreviewObject(o);
+                o = preview;
+            }
+
+            /* Save the key and associated value */
+            if (o->type == REDIS_STRING) {
+                /* Emit a SET command */
+                char cmd[]="*3\r\n$3\r\nSET\r\n";
+                if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                /* Key and value */
+                if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                if (fwriteBulkObject(fp,o) == 0) goto werr;
+            } else if (o->type == REDIS_LIST) {
+                /* Emit the RPUSHes needed to rebuild the list */
+                char cmd[]="*3\r\n$5\r\nRPUSH\r\n";
+                if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+                    unsigned char *zl = o->ptr;
+                    unsigned char *p = ziplistIndex(zl,0);
+                    unsigned char *vstr;
+                    unsigned int vlen;
+                    long long vlong;
+
+                    while(ziplistGet(p,&vstr,&vlen,&vlong)) {
+                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                        if (vstr) {
+                            if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
+                                goto werr;
+                        } else {
+                            if (fwriteBulkLongLong(fp,vlong) == 0)
+                                goto werr;
+                        }
+                        p = ziplistNext(zl,p);
+                    }
+                } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
+                    list *list = o->ptr;
+                    listNode *ln;
+                    listIter li;
+
+                    listRewind(list,&li);
+                    while((ln = listNext(&li))) {
+                        robj *eleobj = listNodeValue(ln);
+
+                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                        if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
+                    }
+                } else {
+                    redisPanic("Unknown list encoding");
+                }
+            } else if (o->type == REDIS_SET) {
+                /* Emit the SADDs needed to rebuild the set */
+                char cmd[]="*3\r\n$4\r\nSADD\r\n";
+                dictEntry *subde;
+
+                subdi = dictGetIterator(o->ptr);
+                while((subde = dictNext(subdi)) != NULL) {
+                    robj *eleobj = dictGetEntryKey(subde);
+
+                    if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                    if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                    if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
+                }
+                dictReleaseIterator(subdi);
+                subdi = NULL;
+            } else if (o->type == REDIS_ZSET) {
+                /* Emit the ZADDs needed to rebuild the sorted set */
+                char cmd[]="*4\r\n$4\r\nZADD\r\n";
+                zset *zs = o->ptr;
+                dictEntry *subde;
+
+                subdi = dictGetIterator(zs->dict);
+                while((subde = dictNext(subdi)) != NULL) {
+                    robj *eleobj = dictGetEntryKey(subde);
+                    double *score = dictGetEntryVal(subde);
+
+                    if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                    if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                    if (fwriteBulkDouble(fp,*score) == 0) goto werr;
+                    if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
+                }
+                dictReleaseIterator(subdi);
+                subdi = NULL;
+            } else if (o->type == REDIS_HASH) {
+                char cmd[]="*4\r\n$4\r\nHSET\r\n";
+
+                /* Emit the HSETs needed to rebuild the hash */
+                if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+                    unsigned char *p = zipmapRewind(o->ptr);
+                    unsigned char *field, *val;
+                    unsigned int flen, vlen;
+
+                    while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
+                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                        /* The bulk writers signal failure with 0, not -1. */
+                        if (fwriteBulkString(fp,(char*)field,flen) == 0)
+                            goto werr;
+                        if (fwriteBulkString(fp,(char*)val,vlen) == 0)
+                            goto werr;
+                    }
+                } else {
+                    dictEntry *subde;
+
+                    subdi = dictGetIterator(o->ptr);
+                    while((subde = dictNext(subdi)) != NULL) {
+                        robj *field = dictGetEntryKey(subde);
+                        robj *val = dictGetEntryVal(subde);
+
+                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                        if (fwriteBulkObject(fp,field) == 0) goto werr;
+                        if (fwriteBulkObject(fp,val) == 0) goto werr;
+                    }
+                    dictReleaseIterator(subdi);
+                    subdi = NULL;
+                }
+            } else {
+                redisPanic("Unknown object type");
+            }
+            /* Save the expire time */
+            if (expiretime != -1) {
+                char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";
+                if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
+                if (fwriteBulkObject(fp,&key) == 0) goto werr;
+                if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
+            }
+            if (preview) {
+                decrRefCount(preview);
+                preview = NULL;
+            }
+        }
+        dictReleaseIterator(di);
+        di = NULL; /* Not to be released again by the error path. */
+    }
+
+    /* Make sure data will not remain on the OS's output buffers */
+    if (fflush(fp) == EOF) goto werr;
+    if (aof_fsync(fileno(fp)) == -1) goto werr;
+    if (fclose(fp) == EOF) {
+        fp = NULL; /* The stream is gone even if fclose() failed. */
+        goto werr;
+    }
+
+    /* Use RENAME to make sure the DB file is changed atomically only
+     * if the generate DB file is ok. */
+    if (rename(tmpfile,filename) == -1) {
+        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
+        unlink(tmpfile);
+        return REDIS_ERR;
+    }
+    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
+    return REDIS_OK;
+
+werr:
+    /* Save errno now: fclose() and unlink() below may overwrite it. */
+    saved_errno = errno;
+    if (fp) fclose(fp);
+    unlink(tmpfile);
+    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(saved_errno));
+    if (subdi) dictReleaseIterator(subdi);
+    if (di) dictReleaseIterator(di);
+    if (preview) decrRefCount(preview);
+    return REDIS_ERR;
+}
+
+/* This is how rewriting of the append only file in background works:
+ *
+ * 1) The user calls BGREWRITEAOF
+ * 2) Redis calls this function, that forks():
+ *    2a) the child rewrites the append only file in a temp file.
+ *    2b) the parent accumulates differences in server.bgrewritebuf.
+ * 3) When the child finishes '2a' it exits.
+ * 4) The parent will trap the exit code, if it's OK, will append the
+ *    data accumulated into server.bgrewritebuf into the temp file, and
+ *    finally will rename(2) the temp file in the actual file name.
+ *    Then the new file is reopened as the new append only file. Profit!
+ *
+ * Returns REDIS_ERR if a rewrite is already in progress or fork() fails,
+ * REDIS_OK (in the parent) when the child was started.
+ */
+int rewriteAppendOnlyFileBackground(void) {
+    pid_t childpid;
+
+    if (server.bgrewritechildpid != -1) return REDIS_ERR;
+    if (server.vm_enabled) waitEmptyIOJobsQueue();
+
+    childpid = fork();
+    if (childpid == 0) {
+        /* Child: never returns, the exit code tells the parent how the
+         * rewrite went (0 = ok, 1 = error). */
+        char tmpfile[256];
+        int retval;
+
+        if (server.vm_enabled) vmReopenSwapFile();
+        close(server.fd);
+        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
+        retval = rewriteAppendOnlyFile(tmpfile);
+        _exit(retval == REDIS_OK ? 0 : 1);
+    }
+
+    /* Parent */
+    if (childpid == -1) {
+        redisLog(REDIS_WARNING,
+            "Can't rewrite append only file in background: fork: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    redisLog(REDIS_NOTICE,
+        "Background append only file rewriting started by pid %d",childpid);
+    server.bgrewritechildpid = childpid;
+    updateDictResizePolicy();
+    /* We set appendseldb to -1 in order to force the next call to the
+     * feedAppendOnlyFile() to issue a SELECT command, so the differences
+     * accumulated by the parent into server.bgrewritebuf will start
+     * with a SELECT statement and it will be safe to merge. */
+    server.appendseldb = -1;
+    return REDIS_OK;
+}
+
+/* BGREWRITEAOF command: start a background rewrite of the append only file
+ * and reply with a status line, or with an error if a rewrite is already
+ * running or could not be started. */
+void bgrewriteaofCommand(redisClient *c) {
+    if (server.bgrewritechildpid != -1) {
+        addReplySds(c,sdsnew("-ERR background append only file rewriting already in progress\r\n"));
+        return;
+    }
+    if (rewriteAppendOnlyFileBackground() != REDIS_OK) {
+        addReply(c,shared.err);
+        return;
+    }
+    addReplySds(c,sdsnew("+Background append only file rewriting started\r\n"));
+}
+
+/* Remove the temp file a background AOF rewrite child with pid 'childpid'
+ * writes to. The name is built exactly as the child builds it in
+ * rewriteAppendOnlyFileBackground(). The unlink() result is ignored. */
+void aofRemoveTempFile(pid_t childpid) {
+    char path[256];
+
+    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof", (int) childpid);
+    unlink(path);
+}
+
+/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
+ * Handle this.
+ *
+ * 'statloc' is the child status word, decoded here with WEXITSTATUS(),
+ * WIFSIGNALED() and WTERMSIG().
+ *
+ * If the child exited with code 0, the data accumulated by the parent in
+ * server.bgrewritebuf is appended to the child temp file, which is then
+ * renamed to server.appendfilename; when server.appendfd is not -1 the
+ * descriptor of the new file replaces it. Every step that fails is logged
+ * and jumps to the common cleanup.
+ *
+ * In every case (success, error, signal) the cleanup code resets
+ * server.bgrewritebuf to an empty string, calls aofRemoveTempFile() and
+ * sets server.bgrewritechildpid back to -1. */
+void backgroundRewriteDoneHandler(int statloc) {
+ int exitcode = WEXITSTATUS(statloc);
+ int bysignal = WIFSIGNALED(statloc);
+
+ if (!bysignal && exitcode == 0) {
+ int fd;
+ char tmpfile[256];
+
+ redisLog(REDIS_NOTICE,
+ "Background append only file rewriting terminated with success");
+ /* Now it's time to flush the differences accumulated by the parent */
+ snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
+ fd = open(tmpfile,O_WRONLY|O_APPEND);
+ if (fd == -1) {
+ redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
+ goto cleanup;
+ }
+ /* Flush our data... */
+ /* A single write() is attempted: a short write is handled as an error,
+ * there is no retry loop. */
+ if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
+ (signed) sdslen(server.bgrewritebuf)) {
+ redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
+ close(fd);
+ goto cleanup;
+ }
+ redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
+ /* Now our work is to rename the temp file into the stable file. And
+ * switch the file descriptor used by the server for append only. */
+ if (rename(tmpfile,server.appendfilename) == -1) {
+ redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
+ close(fd);
+ goto cleanup;
+ }
+ /* Mission completed... almost */
+ redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
+ if (server.appendfd != -1) {
+ /* If append only is actually enabled... */
+ /* 'fd' was opened on the temp name before the rename and is kept
+ * as the new append only descriptor. */
+ close(server.appendfd);
+ server.appendfd = fd;
+ if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
+ server.appendseldb = -1; /* Make sure it will issue SELECT */
+ redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
+ } else {
+ /* If append only is disabled we just generate a dump in this
+ * format. Why not? */
+ close(fd);
+ }
+ } else if (!bysignal && exitcode != 0) {
+ redisLog(REDIS_WARNING, "Background append only file rewriting error");
+ } else {
+ redisLog(REDIS_WARNING,
+ "Background append only file rewriting terminated by signal %d",
+ WTERMSIG(statloc));
+ }
+cleanup:
+ /* Reached both by falling through and via goto. After a successful
+ * rename the unlink() done by aofRemoveTempFile() targets the temp name,
+ * which was already renamed away. */
+ sdsfree(server.bgrewritebuf);
+ server.bgrewritebuf = sdsempty();
+ aofRemoveTempFile(server.bgrewritechildpid);
+ server.bgrewritechildpid = -1;
+}
diff --git a/src/config.c b/src/config.c
new file mode 100644
index 000000000..6d946ee0c
--- /dev/null
+++ b/src/config.c
@@ -0,0 +1,438 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * Config file parsing
+ *----------------------------------------------------------------------------*/
+
+/* Convert the string 's' into a boolean, ignoring case:
+ * "yes" -> 1, "no" -> 0, anything else -> -1. */
+int yesnotoi(char *s) {
+    if (strcasecmp(s,"yes") == 0) return 1;
+    if (strcasecmp(s,"no") == 0) return 0;
+    return -1;
+}
+
+/* Append a new save point (seconds, changes) at the end of the
+ * server.saveparams array, growing the array by one element. */
+void appendServerSaveParams(time_t seconds, int changes) {
+    struct saveparam *slot;
+
+    server.saveparams = zrealloc(server.saveparams,
+        sizeof(struct saveparam)*(server.saveparamslen+1));
+    slot = &server.saveparams[server.saveparamslen++];
+    slot->seconds = seconds;
+    slot->changes = changes;
+}
+
+/* Drop every configured save point: the array is freed and both the
+ * pointer and the element count are reset. */
+void resetServerSaveParams() {
+    zfree(server.saveparams);
+    server.saveparamslen = 0;
+    server.saveparams = NULL;
+}
+
+/* Load the configuration from 'filename' into the global 'server'
+ * structure. The special name "-" means standard input.
+ *
+ * The file is read line by line: lines are trimmed, blank lines and lines
+ * starting with '#' are skipped, every other line is split on single
+ * spaces into a directive name followed by its arguments. The "include"
+ * directive loads another file recursively.
+ *
+ * This function never reports an error to the caller: an unreadable file,
+ * an unknown directive, a wrong number of arguments or an invalid value
+ * all terminate the process with exit(1) after printing a diagnostic.
+ *
+ * I agree, this is a very rudimental way to load a configuration...
+ * will improve later if the config gets more complex */
+void loadServerConfig(char *filename) {
+    FILE *fp;
+    char buf[REDIS_CONFIGLINE_MAX+1], *err = NULL;
+    int linenum = 0;
+    sds line = NULL;
+
+    if (filename[0] == '-' && filename[1] == '\0')
+        fp = stdin;
+    else {
+        if ((fp = fopen(filename,"r")) == NULL) {
+            redisLog(REDIS_WARNING, "Fatal error, can't open config file '%s'", filename);
+            exit(1);
+        }
+    }
+
+    while(fgets(buf,REDIS_CONFIGLINE_MAX+1,fp) != NULL) {
+        sds *argv;
+        int argc, j;
+
+        linenum++;
+        line = sdsnew(buf);
+        line = sdstrim(line," \t\r\n");
+
+        /* Skip comments and blank lines*/
+        if (line[0] == '#' || line[0] == '\0') {
+            sdsfree(line);
+            continue;
+        }
+
+        /* Split into arguments */
+        argv = sdssplitlen(line,sdslen(line)," ",1,&argc);
+        sdstolower(argv[0]);
+
+        /* Execute config directives */
+        if (!strcasecmp(argv[0],"timeout") && argc == 2) {
+            server.maxidletime = atoi(argv[1]);
+            if (server.maxidletime < 0) {
+                err = "Invalid timeout value"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"port") && argc == 2) {
+            server.port = atoi(argv[1]);
+            if (server.port < 1 || server.port > 65535) {
+                err = "Invalid port"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"bind") && argc == 2) {
+            server.bindaddr = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"save") && argc == 3) {
+            int seconds = atoi(argv[1]);
+            int changes = atoi(argv[2]);
+            if (seconds < 1 || changes < 0) {
+                err = "Invalid save parameters"; goto loaderr;
+            }
+            appendServerSaveParams(seconds,changes);
+        } else if (!strcasecmp(argv[0],"dir") && argc == 2) {
+            if (chdir(argv[1]) == -1) {
+                redisLog(REDIS_WARNING,"Can't chdir to '%s': %s",
+                    argv[1], strerror(errno));
+                exit(1);
+            }
+        } else if (!strcasecmp(argv[0],"loglevel") && argc == 2) {
+            if (!strcasecmp(argv[1],"debug")) server.verbosity = REDIS_DEBUG;
+            else if (!strcasecmp(argv[1],"verbose")) server.verbosity = REDIS_VERBOSE;
+            else if (!strcasecmp(argv[1],"notice")) server.verbosity = REDIS_NOTICE;
+            else if (!strcasecmp(argv[1],"warning")) server.verbosity = REDIS_WARNING;
+            else {
+                /* The message lists every level accepted above. */
+                err = "Invalid log level. Must be one of debug, verbose, notice, warning";
+                goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"logfile") && argc == 2) {
+            FILE *logfp;
+
+            server.logfile = zstrdup(argv[1]);
+            if (!strcasecmp(server.logfile,"stdout")) {
+                zfree(server.logfile);
+                server.logfile = NULL;
+            }
+            if (server.logfile) {
+                /* Test if we are able to open the file. The server will not
+                 * be able to abort just for this problem later... */
+                logfp = fopen(server.logfile,"a");
+                if (logfp == NULL) {
+                    err = sdscatprintf(sdsempty(),
+                        "Can't open the log file: %s", strerror(errno));
+                    goto loaderr;
+                }
+                fclose(logfp);
+            }
+        } else if (!strcasecmp(argv[0],"databases") && argc == 2) {
+            server.dbnum = atoi(argv[1]);
+            if (server.dbnum < 1) {
+                err = "Invalid number of databases"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"include") && argc == 2) {
+            loadServerConfig(argv[1]);
+        } else if (!strcasecmp(argv[0],"maxclients") && argc == 2) {
+            server.maxclients = atoi(argv[1]);
+        } else if (!strcasecmp(argv[0],"maxmemory") && argc == 2) {
+            server.maxmemory = memtoll(argv[1],NULL);
+        } else if (!strcasecmp(argv[0],"slaveof") && argc == 3) {
+            /* Release the host set by a previous "slaveof" line, if any.
+             * sdsfree() is assumed to accept NULL -- TODO confirm. */
+            sdsfree(server.masterhost);
+            server.masterhost = sdsnew(argv[1]);
+            server.masterport = atoi(argv[2]);
+            server.replstate = REDIS_REPL_CONNECT;
+        } else if (!strcasecmp(argv[0],"masterauth") && argc == 2) {
+            /* Same free-then-set sequence used by CONFIG SET, so that a
+             * repeated directive does not leak the previous value. */
+            zfree(server.masterauth);
+            server.masterauth = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"glueoutputbuf") && argc == 2) {
+            if ((server.glueoutputbuf = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"rdbcompression") && argc == 2) {
+            if ((server.rdbcompression = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"activerehashing") && argc == 2) {
+            if ((server.activerehashing = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"daemonize") && argc == 2) {
+            if ((server.daemonize = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"appendonly") && argc == 2) {
+            if ((server.appendonly = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"appendfilename") && argc == 2) {
+            zfree(server.appendfilename);
+            server.appendfilename = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"no-appendfsync-on-rewrite")
+                   && argc == 2) {
+            if ((server.no_appendfsync_on_rewrite= yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"appendfsync") && argc == 2) {
+            if (!strcasecmp(argv[1],"no")) {
+                server.appendfsync = APPENDFSYNC_NO;
+            } else if (!strcasecmp(argv[1],"always")) {
+                server.appendfsync = APPENDFSYNC_ALWAYS;
+            } else if (!strcasecmp(argv[1],"everysec")) {
+                server.appendfsync = APPENDFSYNC_EVERYSEC;
+            } else {
+                err = "argument must be 'no', 'always' or 'everysec'";
+                goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"requirepass") && argc == 2) {
+            /* Free-then-set as CONFIG SET does, see "masterauth" above. */
+            zfree(server.requirepass);
+            server.requirepass = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"pidfile") && argc == 2) {
+            zfree(server.pidfile);
+            server.pidfile = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"dbfilename") && argc == 2) {
+            zfree(server.dbfilename);
+            server.dbfilename = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"vm-enabled") && argc == 2) {
+            if ((server.vm_enabled = yesnotoi(argv[1])) == -1) {
+                err = "argument must be 'yes' or 'no'"; goto loaderr;
+            }
+        } else if (!strcasecmp(argv[0],"vm-swap-file") && argc == 2) {
+            zfree(server.vm_swap_file);
+            server.vm_swap_file = zstrdup(argv[1]);
+        } else if (!strcasecmp(argv[0],"vm-max-memory") && argc == 2) {
+            server.vm_max_memory = memtoll(argv[1],NULL);
+        } else if (!strcasecmp(argv[0],"vm-page-size") && argc == 2) {
+            server.vm_page_size = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"vm-pages") && argc == 2) {
+            server.vm_pages = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"vm-max-threads") && argc == 2) {
+            server.vm_max_threads = strtoll(argv[1], NULL, 10);
+        } else if (!strcasecmp(argv[0],"hash-max-zipmap-entries") && argc == 2){
+            server.hash_max_zipmap_entries = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"hash-max-zipmap-value") && argc == 2){
+            server.hash_max_zipmap_value = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"list-max-ziplist-entries") && argc == 2){
+            server.list_max_ziplist_entries = memtoll(argv[1], NULL);
+        } else if (!strcasecmp(argv[0],"list-max-ziplist-value") && argc == 2){
+            server.list_max_ziplist_value = memtoll(argv[1], NULL);
+        } else {
+            err = "Bad directive or wrong number of arguments"; goto loaderr;
+        }
+        for (j = 0; j < argc; j++)
+            sdsfree(argv[j]);
+        zfree(argv);
+        sdsfree(line);
+    }
+    if (fp != stdin) fclose(fp);
+    return;
+
+loaderr:
+    /* No cleanup here: the process is about to exit. */
+    fprintf(stderr, "\n*** FATAL CONFIG FILE ERROR ***\n");
+    fprintf(stderr, "Reading the configuration file, at line %d\n", linenum);
+    fprintf(stderr, ">>> '%s'\n", line);
+    fprintf(stderr, "%s\n", err);
+    exit(1);
+}
+
+/*-----------------------------------------------------------------------------
+ * CONFIG command for remote configuration
+ *----------------------------------------------------------------------------*/
+
+/* CONFIG SET <parameter> <value>
+ *
+ * c->argv[2] is the parameter name (matched ignoring case) and c->argv[3]
+ * the new value. On success shared.ok is sent. An unsupported parameter,
+ * an invalid value, or a failure turning AOF on each produce a specific
+ * "-ERR" reply. The decoded value object is released on every path. */
+void configSetCommand(redisClient *c) {
+    robj *o = getDecodedObject(c->argv[3]);
+    long long ll;
+
+    if (!strcasecmp(c->argv[2]->ptr,"dbfilename")) {
+        zfree(server.dbfilename);
+        server.dbfilename = zstrdup(o->ptr);
+    } else if (!strcasecmp(c->argv[2]->ptr,"requirepass")) {
+        zfree(server.requirepass);
+        server.requirepass = zstrdup(o->ptr);
+    } else if (!strcasecmp(c->argv[2]->ptr,"masterauth")) {
+        zfree(server.masterauth);
+        server.masterauth = zstrdup(o->ptr);
+    } else if (!strcasecmp(c->argv[2]->ptr,"maxmemory")) {
+        if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
+            ll < 0) goto badfmt;
+        server.maxmemory = ll;
+    } else if (!strcasecmp(c->argv[2]->ptr,"timeout")) {
+        if (getLongLongFromObject(o,&ll) == REDIS_ERR ||
+            ll < 0 || ll > LONG_MAX) goto badfmt;
+        server.maxidletime = ll;
+    } else if (!strcasecmp(c->argv[2]->ptr,"appendfsync")) {
+        if (!strcasecmp(o->ptr,"no")) {
+            server.appendfsync = APPENDFSYNC_NO;
+        } else if (!strcasecmp(o->ptr,"everysec")) {
+            server.appendfsync = APPENDFSYNC_EVERYSEC;
+        } else if (!strcasecmp(o->ptr,"always")) {
+            server.appendfsync = APPENDFSYNC_ALWAYS;
+        } else {
+            goto badfmt;
+        }
+    } else if (!strcasecmp(c->argv[2]->ptr,"no-appendfsync-on-rewrite")) {
+        int yn = yesnotoi(o->ptr);
+
+        if (yn == -1) goto badfmt;
+        server.no_appendfsync_on_rewrite = yn;
+    } else if (!strcasecmp(c->argv[2]->ptr,"appendonly")) {
+        int old = server.appendonly;
+        int new = yesnotoi(o->ptr);
+
+        if (new == -1) goto badfmt;
+        if (old != new) {
+            if (new == 0) {
+                stopAppendOnly();
+            } else {
+                if (startAppendOnly() == REDIS_ERR) {
+                    addReplySds(c,sdscatprintf(sdsempty(),
+                        "-ERR Unable to turn on AOF. Check server logs.\r\n"));
+                    decrRefCount(o);
+                    return;
+                }
+            }
+        }
+    } else if (!strcasecmp(c->argv[2]->ptr,"save")) {
+        int vlen, j;
+        sds *v = sdssplitlen(o->ptr,sdslen(o->ptr)," ",1,&vlen);
+
+        /* Perform sanity check before setting the new config:
+         * - Even number of args
+         * - Every token is a non empty, fully numeric string
+         * - Seconds >= 1, changes >= 0, both within the range of the
+         *   type they are checked against below */
+        if (vlen & 1) {
+            sdsfreesplitres(v,vlen);
+            goto badfmt;
+        }
+        for (j = 0; j < vlen; j++) {
+            char *eptr;
+            long long val;
+
+            /* Keep the full strtoll() result: storing it into a long may
+             * truncate it where long is 32 bit, letting out of range
+             * values pass the checks. */
+            val = strtoll(v[j], &eptr, 10);
+            if (v[j][0] == '\0' || eptr[0] != '\0' ||
+                ((j & 1) == 0 && (val < 1 || val > LONG_MAX)) ||
+                ((j & 1) == 1 && (val < 0 || val > INT_MAX))) {
+                sdsfreesplitres(v,vlen);
+                goto badfmt;
+            }
+        }
+        /* Finally set the new config */
+        resetServerSaveParams();
+        for (j = 0; j < vlen; j += 2) {
+            time_t seconds;
+            int changes;
+
+            seconds = strtoll(v[j],NULL,10);
+            changes = strtoll(v[j+1],NULL,10);
+            appendServerSaveParams(seconds, changes);
+        }
+        sdsfreesplitres(v,vlen);
+    } else {
+        addReplySds(c,sdscatprintf(sdsempty(),
+            "-ERR not supported CONFIG parameter %s\r\n",
+            (char*)c->argv[2]->ptr));
+        decrRefCount(o);
+        return;
+    }
+    decrRefCount(o);
+    addReply(c,shared.ok);
+    return;
+
+badfmt: /* Bad format errors */
+    addReplySds(c,sdscatprintf(sdsempty(),
+        "-ERR invalid argument '%s' for CONFIG SET '%s'\r\n",
+            (char*)o->ptr,
+            (char*)c->argv[2]->ptr));
+    decrRefCount(o);
+}
+
+/* CONFIG GET <pattern>
+ *
+ * Reply with a multi bulk of name/value pairs, one pair for every
+ * parameter handled below whose name matches the pattern in c->argv[2]
+ * (matching is done with stringmatch()).
+ *
+ * The number of pairs is not known until all the parameters have been
+ * tested, so the multi bulk header is queued first as an object with a
+ * NULL ptr and filled in at the very end. */
+void configGetCommand(redisClient *c) {
+ robj *o = getDecodedObject(c->argv[2]);
+ robj *lenobj = createObject(REDIS_STRING,NULL);
+ char *pattern = o->ptr;
+ int matches = 0;
+
+ /* lenobj is still written to at the end of the function, after this
+ * decrRefCount(): presumably addReply() keeps its own reference to the
+ * object -- confirm against the networking code. */
+ addReply(c,lenobj);
+ decrRefCount(lenobj);
+
+ if (stringmatch(pattern,"dbfilename",0)) {
+ addReplyBulkCString(c,"dbfilename");
+ addReplyBulkCString(c,server.dbfilename);
+ matches++;
+ }
+ /* NOTE(review): requirepass and masterauth are passed as they are; if
+ * they can be NULL when unset, addReplyBulkCString() has to cope with a
+ * NULL string -- confirm. */
+ if (stringmatch(pattern,"requirepass",0)) {
+ addReplyBulkCString(c,"requirepass");
+ addReplyBulkCString(c,server.requirepass);
+ matches++;
+ }
+ if (stringmatch(pattern,"masterauth",0)) {
+ addReplyBulkCString(c,"masterauth");
+ addReplyBulkCString(c,server.masterauth);
+ matches++;
+ }
+ if (stringmatch(pattern,"maxmemory",0)) {
+ char buf[128];
+
+ ll2string(buf,128,server.maxmemory);
+ addReplyBulkCString(c,"maxmemory");
+ addReplyBulkCString(c,buf);
+ matches++;
+ }
+ if (stringmatch(pattern,"timeout",0)) {
+ char buf[128];
+
+ ll2string(buf,128,server.maxidletime);
+ addReplyBulkCString(c,"timeout");
+ addReplyBulkCString(c,buf);
+ matches++;
+ }
+ if (stringmatch(pattern,"appendonly",0)) {
+ addReplyBulkCString(c,"appendonly");
+ addReplyBulkCString(c,server.appendonly ? "yes" : "no");
+ matches++;
+ }
+ if (stringmatch(pattern,"no-appendfsync-on-rewrite",0)) {
+ addReplyBulkCString(c,"no-appendfsync-on-rewrite");
+ addReplyBulkCString(c,server.no_appendfsync_on_rewrite ? "yes" : "no");
+ matches++;
+ }
+ if (stringmatch(pattern,"appendfsync",0)) {
+ char *policy;
+
+ switch(server.appendfsync) {
+ case APPENDFSYNC_NO: policy = "no"; break;
+ case APPENDFSYNC_EVERYSEC: policy = "everysec"; break;
+ case APPENDFSYNC_ALWAYS: policy = "always"; break;
+ default: policy = "unknown"; break; /* too harmless to panic */
+ }
+ addReplyBulkCString(c,"appendfsync");
+ addReplyBulkCString(c,policy);
+ matches++;
+ }
+ if (stringmatch(pattern,"save",0)) {
+ sds buf = sdsempty();
+ int j;
+
+ /* Build "<seconds> <changes>" pairs separated by a single space, the
+ * same format CONFIG SET save splits on. */
+ for (j = 0; j < server.saveparamslen; j++) {
+ buf = sdscatprintf(buf,"%ld %d",
+ server.saveparams[j].seconds,
+ server.saveparams[j].changes);
+ if (j != server.saveparamslen-1)
+ buf = sdscatlen(buf," ",1);
+ }
+ addReplyBulkCString(c,"save");
+ addReplyBulkCString(c,buf);
+ sdsfree(buf);
+ matches++;
+ }
+ decrRefCount(o);
+ /* Every match contributed two bulk replies: name and value. */
+ lenobj->ptr = sdscatprintf(sdsempty(),"*%d\r\n",matches*2);
+}
+
+/* CONFIG <subcommand> ... entry point.
+ *
+ * Dispatches on c->argv[1] (ignoring case) to SET, GET or RESETSTAT after
+ * checking the exact number of arguments each one requires. An unknown
+ * subcommand or a wrong arity gets an "-ERR" reply. */
+void configCommand(redisClient *c) {
+    char *subcmd = c->argv[1]->ptr;
+
+    if (!strcasecmp(subcmd,"set")) {
+        if (c->argc != 4) goto badarity;
+        configSetCommand(c);
+        return;
+    }
+    if (!strcasecmp(subcmd,"get")) {
+        if (c->argc != 3) goto badarity;
+        configGetCommand(c);
+        return;
+    }
+    if (!strcasecmp(subcmd,"resetstat")) {
+        if (c->argc != 2) goto badarity;
+        /* Zero the counters and restart the stats clock from now. */
+        server.stat_numcommands = 0;
+        server.stat_numconnections = 0;
+        server.stat_expiredkeys = 0;
+        server.stat_starttime = time(NULL);
+        addReply(c,shared.ok);
+        return;
+    }
+    addReplySds(c,sdscatprintf(sdsempty(),
+        "-ERR CONFIG subcommand must be one of GET, SET, RESETSTAT\r\n"));
+    return;
+
+badarity:
+    addReplySds(c,sdscatprintf(sdsempty(),
+        "-ERR Wrong number of arguments for CONFIG %s\r\n",
+        subcmd));
+}
diff --git a/src/config.h b/src/config.h
new file mode 100644
index 000000000..6e98fbb2c
--- /dev/null
+++ b/src/config.h
@@ -0,0 +1,45 @@
+/* Compile time, per platform feature selection. This header only defines
+ * macros: HAVE_* flags for optional features and redis_* / aof_* aliases
+ * bound to the platform specific function to call. */
+#ifndef __CONFIG_H
+#define __CONFIG_H
+
+/* MAC_OS_X_VERSION_10_6 is tested with defined() further down in this
+ * file; this is the only header included before those tests on Apple. */
+#ifdef __APPLE__
+#include <AvailabilityMacros.h>
+#endif
+
+/* test for malloc_size() */
+/* Only Apple platforms get HAVE_MALLOC_SIZE and redis_malloc_size() here. */
+#ifdef __APPLE__
+#include <malloc/malloc.h>
+#define HAVE_MALLOC_SIZE 1
+#define redis_malloc_size(p) malloc_size(p)
+#endif
+
+/* define redis_fstat to fstat or fstat64() */
+/* The 64 suffixed variants are selected only on Apple platforms where
+ * MAC_OS_X_VERSION_10_6 is not defined. */
+#if defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
+#define redis_fstat fstat64
+#define redis_stat stat64
+#else
+#define redis_fstat fstat
+#define redis_stat stat
+#endif
+
+/* test for backtrace() */
+#if defined(__APPLE__) || defined(__linux__)
+#define HAVE_BACKTRACE 1
+#endif
+
+/* test for polling API */
+/* HAVE_EPOLL on Linux, HAVE_KQUEUE on the BSDs and on Apple platforms
+ * defining MAC_OS_X_VERSION_10_6; neither flag is defined elsewhere. */
+#ifdef __linux__
+#define HAVE_EPOLL 1
+#endif
+
+#if (defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)) || defined(__FreeBSD__) || defined(__OpenBSD__) || defined (__NetBSD__)
+#define HAVE_KQUEUE 1
+#endif
+
+/* define aof_fsync to fdatasync() in Linux and fsync() for all the rest */
+#ifdef __linux__
+#define aof_fsync fdatasync
+#else
+#define aof_fsync fsync
+#endif
+
+#endif
diff --git a/src/db.c b/src/db.c
new file mode 100644
index 000000000..e1e82cb22
--- /dev/null
+++ b/src/db.c
@@ -0,0 +1,508 @@
+#include "redis.h"
+
+#include <signal.h>
+
+/*-----------------------------------------------------------------------------
+ * C-level DB API
+ *----------------------------------------------------------------------------*/
+
+/* Low level lookup of 'key' in 'db'. Returns the stored value, or NULL when
+ * the key is not in the main dictionary.
+ *
+ * When the VM is enabled the value state is also handled here: a value in
+ * memory (or being swapped out) gets its LRU clock refreshed, and a pending
+ * swap-out is cancelled; any other value is loaded back through
+ * vmLoadObject() and stored again in the dictionary entry. */
+robj *lookupKey(redisDb *db, robj *key) {
+    dictEntry *entry = dictFind(db->dict,key->ptr);
+    robj *obj;
+
+    if (entry == NULL) return NULL;
+    obj = dictGetEntryVal(entry);
+
+    /* Nothing else to do when the VM is off. */
+    if (!server.vm_enabled) return obj;
+
+    if (obj->storage != REDIS_VM_MEMORY &&
+        obj->storage != REDIS_VM_SWAPPING)
+    {
+        /* Remember if a load was in progress before replacing the value. */
+        int wasloading = (obj->storage == REDIS_VM_LOADING);
+
+        /* The value lives on disk: bring it back in memory. */
+        redisAssert(obj->type == REDIS_VMPOINTER);
+        obj = vmLoadObject(obj);
+        dictGetEntryVal(entry) = obj;
+
+        /* Clients blocked by the VM subsystem may be waiting for
+         * this very key. */
+        if (wasloading) handleClientsBlockedOnSwappedKey(db,key);
+        return obj;
+    }
+
+    /* In memory: abort a swap-out in progress, if any... */
+    if (obj->storage == REDIS_VM_SWAPPING)
+        vmCancelThreadedIOJob(obj);
+    /* ...and record the access time for the aging algorithm. */
+    obj->lru = server.lruclock;
+    return obj;
+}
+
+/* Lookup for read operations: expireIfNeeded() is run on the key first,
+ * then the plain lookupKey() result is returned (NULL if missing). */
+robj *lookupKeyRead(redisDb *db, robj *key) {
+    robj *val;
+
+    expireIfNeeded(db,key);
+    val = lookupKey(db,key);
+    return val;
+}
+
+/* Lookup for write operations: deleteIfVolatile() is run on the key, the
+ * key is passed to touchWatchedKey(), and finally the lookupKey() result is
+ * returned (NULL if missing). */
+robj *lookupKeyWrite(redisDb *db, robj *key) {
+    robj *val;
+
+    deleteIfVolatile(db,key);
+    touchWatchedKey(db,key);
+    val = lookupKey(db,key);
+    return val;
+}
+
+/* Like lookupKeyRead() on the client's selected DB, but when the key is
+ * missing 'reply' is sent to the client before returning NULL. */
+robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply) {
+    robj *found = lookupKeyRead(c->db, key);
+
+    if (found == NULL) addReply(c,reply);
+    return found;
+}
+
+/* Like lookupKeyWrite() on the client's selected DB, but when the key is
+ * missing 'reply' is sent to the client before returning NULL. */
+robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply) {
+    robj *found = lookupKeyWrite(c->db, key);
+
+    if (found == NULL) addReply(c,reply);
+    return found;
+}
+
+/* Insert 'key' -> 'val' in the DB only if the key is not already there.
+ *
+ * Returns REDIS_ERR (and changes nothing) when the key exists, REDIS_OK
+ * otherwise. The dictionary receives a private sds copy of the key name;
+ * 'val' is stored as-is, so the caller should increment its refcount. */
+int dbAdd(redisDb *db, robj *key, robj *val) {
+    sds keycopy;
+
+    /* The lookup comes first: the key name is duplicated only when an
+     * insertion is really going to happen. */
+    if (dictFind(db->dict, key->ptr) != NULL) return REDIS_ERR;
+
+    keycopy = sdsdup(key->ptr);
+    dictAdd(db->dict, keycopy, val);
+    return REDIS_OK;
+}
+
+/* Set 'key' to 'val' whether or not the key already exists.
+ *
+ * An existing key has its value replaced through dictReplace() and 0 is
+ * returned. A missing key is added with a private sds copy of its name
+ * (exactly what dbAdd() does) and 1 is returned. */
+int dbReplace(redisDb *db, robj *key, robj *val) {
+    if (dictFind(db->dict,key->ptr) != NULL) {
+        dictReplace(db->dict, key->ptr, val);
+        return 0;
+    }
+    dictAdd(db->dict, sdsdup(key->ptr), val);
+    return 1;
+}
+
+/* Return 1 if 'key' is present in the main dictionary of 'db', else 0. */
+int dbExists(redisDb *db, robj *key) {
+    dictEntry *de = dictFind(db->dict,key->ptr);
+
+    return de != NULL;
+}
+
+/* Pick a random key of 'db' and return it as a newly created string object,
+ * or NULL if dictGetRandomKey() finds nothing.
+ *
+ * Keys with an expire set are passed to expireIfNeeded(): when that reports
+ * the key as removed, another random key is tried. */
+robj *dbRandomKey(redisDb *db) {
+    for (;;) {
+        struct dictEntry *entry = dictGetRandomKey(db->dict);
+        sds name;
+        robj *nameobj;
+
+        if (entry == NULL) return NULL;
+
+        name = dictGetEntryKey(entry);
+        nameobj = createStringObject(name,sdslen(name));
+
+        /* Good to go if the key is not volatile, or is volatile but
+         * still alive. */
+        if (!dictFind(db->expires,name) || !expireIfNeeded(db,nameobj))
+            return nameobj;
+
+        /* This one just expired: drop the object and draw again. */
+        decrRefCount(nameobj);
+    }
+}
+
+/* Remove 'key' from the DB together with its expire entry, if any.
+ * Returns 1 if the key was found in the main dictionary and deleted,
+ * 0 otherwise. */
+int dbDelete(redisDb *db, robj *key) {
+    int retval;
+
+    /* The expires dict shares the key sds with the main dict, so removing
+     * the entry there does not free the key name. */
+    if (dictSize(db->expires) > 0) dictDelete(db->expires,key->ptr);
+
+    retval = dictDelete(db->dict,key->ptr);
+    return retval == DICT_OK;
+}
+
+/* Empty every database of the server (both the main and the expires
+ * dictionaries). Returns the sum of the main dictionary sizes measured
+ * right before each one was emptied. */
+long long emptyDb() {
+    long long removed = 0;
+    int j = 0;
+
+    while (j < server.dbnum) {
+        redisDb *db = server.db+j;
+
+        removed += dictSize(db->dict);
+        dictEmpty(db->dict);
+        dictEmpty(db->expires);
+        j++;
+    }
+    return removed;
+}
+
+/* Make the database with index 'id' the current one for client 'c'.
+ * Returns REDIS_OK on success, REDIS_ERR (client untouched) if 'id' is
+ * outside the 0 .. server.dbnum-1 range. */
+int selectDb(redisClient *c, int id) {
+    if (id >= 0 && id < server.dbnum) {
+        c->db = &server.db[id];
+        return REDIS_OK;
+    }
+    return REDIS_ERR;
+}
+
+/*-----------------------------------------------------------------------------
+ * Type agnostic commands operating on the key space
+ *----------------------------------------------------------------------------*/
+
+/* FLUSHDB: empty the database currently selected by the client. The number
+ * of keys it held is added to the dirty counter. */
+void flushdbCommand(redisClient *c) {
+    redisDb *db = c->db;
+
+    server.dirty += dictSize(db->dict);
+    touchWatchedKeysOnFlush(db->id);
+    dictEmpty(db->dict);
+    dictEmpty(db->expires);
+    addReply(c,shared.ok);
+}
+
+/* FLUSHALL: empty all the databases, reply, then save the (now empty)
+ * dataset on disk with rdbSave(). */
+void flushallCommand(redisClient *c) {
+    /* NOTE(review): -1 presumably stands for "every database" -- confirm
+     * against touchWatchedKeysOnFlush(). */
+    touchWatchedKeysOnFlush(-1);
+    server.dirty += emptyDb();
+    addReply(c,shared.ok);
+
+    /* If a background save child exists, kill it and remove its temp file
+     * before saving from this process. */
+    if (server.bgsavechildpid != -1) {
+        kill(server.bgsavechildpid,SIGKILL);
+        rdbRemoveTempFile(server.bgsavechildpid);
+    }
+
+    rdbSave(server.dbfilename);
+    /* The dirty counter is bumped once more after the save. */
+    server.dirty++;
+}
+
+/* DEL key [key ...]: delete every listed key and reply with the number of
+ * keys that were really removed. */
+void delCommand(redisClient *c) {
+    int idx, removed = 0;
+
+    for (idx = 1; idx < c->argc; idx++) {
+        robj *k = c->argv[idx];
+
+        if (!dbDelete(c->db,k)) continue; /* Was not there. */
+        touchWatchedKey(c->db,k);
+        server.dirty++;
+        removed++;
+    }
+    addReplyLongLong(c,removed);
+}
+
+/* EXISTS key: reply with the shared "one" object if the key exists after
+ * expireIfNeeded() ran on it, with the shared "zero" object otherwise. */
+void existsCommand(redisClient *c) {
+    robj *key = c->argv[1];
+
+    expireIfNeeded(c->db,key);
+    addReply(c, dbExists(c->db,key) ? shared.cone : shared.czero);
+}
+
+/* SELECT index: switch the client to the given database. The argument is
+ * converted with atoi(); an index rejected by selectDb() produces an error
+ * reply. */
+void selectCommand(redisClient *c) {
+    int id = atoi(c->argv[1]->ptr);
+
+    if (selectDb(c,id) != REDIS_ERR) {
+        addReply(c,shared.ok);
+        return;
+    }
+    addReplySds(c,sdsnew("-ERR invalid DB index\r\n"));
+}
+
+/* RANDOMKEY: reply with a random key name of the current DB as a bulk, or
+ * with a null bulk when dbRandomKey() returns no key. */
+void randomkeyCommand(redisClient *c) {
+    robj *key = dbRandomKey(c->db);
+
+    if (key != NULL) {
+        addReplyBulk(c,key);
+        decrRefCount(key); /* dbRandomKey() created it for us. */
+    } else {
+        addReply(c,shared.nullbulk);
+    }
+}
+
+/* KEYS pattern: reply with a multi bulk of all the key names matching the
+ * glob-style pattern.
+ *
+ * The multi bulk count is not known in advance, so an object with a NULL
+ * payload is queued first and its payload is filled in once the matching
+ * keys have been counted. */
+void keysCommand(redisClient *c) {
+    sds pattern = c->argv[1]->ptr;
+    int plen = sdslen(pattern);
+    /* The "*" pattern matches everything: skip the matching function. */
+    int allkeys = (pattern[0] == '*' && pattern[1] == '\0');
+    unsigned long matched = 0;
+    robj *header = createObject(REDIS_STRING,NULL);
+    dictIterator *iter;
+    dictEntry *entry;
+
+    iter = dictGetIterator(c->db->dict);
+    addReply(c,header);
+    /* NOTE(review): 'header' is still written below, so addReply() is
+     * presumably keeping its own reference -- confirm. */
+    decrRefCount(header);
+    while ((entry = dictNext(iter)) != NULL) {
+        sds name = dictGetEntryKey(entry);
+        robj *nameobj;
+
+        if (!allkeys && !stringmatchlen(pattern,plen,name,sdslen(name),0))
+            continue;
+
+        nameobj = createStringObject(name,sdslen(name));
+        /* Keys found to be expired are left out of the reply. */
+        if (expireIfNeeded(c->db,nameobj) == 0) {
+            addReplyBulk(c,nameobj);
+            matched++;
+        }
+        decrRefCount(nameobj);
+    }
+    dictReleaseIterator(iter);
+    header->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",matched);
+}
+
+/* DBSIZE: integer reply with the size of the current DB main dictionary. */
+void dbsizeCommand(redisClient *c) {
+    sds reply = sdscatprintf(sdsempty(),":%lu\r\n",dictSize(c->db->dict));
+
+    addReplySds(c,reply);
+}
+
+/* LASTSAVE: integer reply with the value of server.lastsave.
+ * NOTE(review): %lu expects an unsigned long -- confirm it matches the
+ * declared type of server.lastsave. */
+void lastsaveCommand(redisClient *c) {
+    sds reply = sdscatprintf(sdsempty(),":%lu\r\n",server.lastsave);
+
+    addReplySds(c,reply);
+}
+
+/* TYPE key: status reply with the name of the type of the value stored at
+ * key ("none" when the key does not exist, "unknown" for a type code that
+ * is not one of the five handled here). */
+void typeCommand(redisClient *c) {
+    robj *o = lookupKeyRead(c->db,c->argv[1]);
+    char *name = "+none";
+
+    if (o != NULL) {
+        if (o->type == REDIS_STRING) name = "+string";
+        else if (o->type == REDIS_LIST) name = "+list";
+        else if (o->type == REDIS_SET) name = "+set";
+        else if (o->type == REDIS_ZSET) name = "+zset";
+        else if (o->type == REDIS_HASH) name = "+hash";
+        else name = "+unknown";
+    }
+    /* The status line is sent in two pieces: the name, then CRLF. */
+    addReplySds(c,sdsnew(name));
+    addReply(c,shared.crlf);
+}
+
+/* SAVE: save the dataset on disk in the foreground. Refused with an error
+ * while a background save child exists. */
+void saveCommand(redisClient *c) {
+    int saved;
+
+    if (server.bgsavechildpid != -1) {
+        addReplySds(c,sdsnew("-ERR background save in progress\r\n"));
+        return;
+    }
+    saved = (rdbSave(server.dbfilename) == REDIS_OK);
+    addReply(c, saved ? shared.ok : shared.err);
+}
+
+/* BGSAVE: start a background save. Refused with an error while another
+ * background save child exists; a generic error is sent when
+ * rdbSaveBackground() fails. */
+void bgsaveCommand(redisClient *c) {
+    if (server.bgsavechildpid != -1) {
+        addReplySds(c,sdsnew("-ERR background save already in progress\r\n"));
+        return;
+    }
+    if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
+        addReply(c,shared.err);
+        return;
+    }
+    addReplySds(c,sdsnew("+Background saving started\r\n"));
+}
+
+/* SHUTDOWN: exit the process with status 0 if prepareForShutdown()
+ * succeeds, otherwise tell the client that the shutdown failed. */
+void shutdownCommand(redisClient *c) {
+    if (prepareForShutdown() != REDIS_OK) {
+        addReplySds(c, sdsnew("-ERR Errors trying to SHUTDOWN. Check logs.\r\n"));
+        return;
+    }
+    exit(0);
+}
+
+/* Implementation shared by RENAME (nx == 0) and RENAMENX (nx != 0).
+ *
+ * argv[1] is the source key, argv[2] the destination. With 'nx' set an
+ * existing destination is left untouched and the reply is the shared "zero"
+ * object; otherwise the destination is overwritten. On success the source
+ * key is deleted and the reply is "one" for RENAMENX, OK for RENAME. */
+void renameGenericCommand(redisClient *c, int nx) {
+    robj *src = c->argv[1], *dst = c->argv[2];
+    robj *o;
+
+    /* Same name for source and destination is probably a mistake. */
+    if (sdscmp(src->ptr,dst->ptr) == 0) {
+        addReply(c,shared.sameobjecterr);
+        return;
+    }
+
+    o = lookupKeyWriteOrReply(c,src,shared.nokeyerr);
+    if (o == NULL) return;
+
+    /* The value is about to be referenced by the destination key too. */
+    incrRefCount(o);
+    deleteIfVolatile(c->db,dst);
+    if (dbAdd(c->db,dst,o) == REDIS_ERR) {
+        /* Destination already exists. */
+        if (nx) {
+            decrRefCount(o);
+            addReply(c,shared.czero);
+            return;
+        }
+        dbReplace(c->db,dst,o);
+    }
+    dbDelete(c->db,src);
+    touchWatchedKey(c->db,dst);
+    server.dirty++;
+    addReply(c,nx ? shared.cone : shared.ok);
+}
+
+/* RENAME src dst: the generic rename with the "nx" flag off, so an existing
+ * destination key is overwritten. */
+void renameCommand(redisClient *c) {
+    const int nx = 0;
+
+    renameGenericCommand(c,nx);
+}
+
+/* RENAMENX src dst: the generic rename with the "nx" flag on, so nothing
+ * happens when the destination key already exists. */
+void renamenxCommand(redisClient *c) {
+    const int nx = 1;
+
+    renameGenericCommand(c,nx);
+}
+
+/* MOVE key dbindex: move a key from the selected DB to another one.
+ *
+ * Replies with the shared "one" object when the key was moved, "zero" when
+ * the key is missing in the source DB or already present in the target DB,
+ * and with an error for an invalid target index or a target equal to the
+ * source. */
+void moveCommand(redisClient *c) {
+    redisDb *src = c->db, *dst;
+    int srcid = src->id;
+    robj *key = c->argv[1];
+    robj *o;
+
+    /* selectDb() is used to validate the index and resolve the target
+     * DB pointer; the client is switched back right after. */
+    if (selectDb(c,atoi(c->argv[2]->ptr)) == REDIS_ERR) {
+        addReply(c,shared.outofrangeerr);
+        return;
+    }
+    dst = c->db;
+    selectDb(c,srcid);
+
+    /* Moving a key into the DB it already lives in is probably an error. */
+    if (src == dst) {
+        addReply(c,shared.sameobjecterr);
+        return;
+    }
+
+    /* The key must exist in the source DB. */
+    o = lookupKeyWrite(c->db,key);
+    if (o == NULL) {
+        addReply(c,shared.czero);
+        return;
+    }
+
+    /* Add it to the target DB, unless the key is already there. */
+    deleteIfVolatile(dst,key);
+    if (dbAdd(dst,key,o) == REDIS_ERR) {
+        addReply(c,shared.czero);
+        return;
+    }
+    incrRefCount(o);
+
+    /* Moved: release the entry in the source DB. */
+    dbDelete(src,key);
+    server.dirty++;
+    addReply(c,shared.cone);
+}
+
+/*-----------------------------------------------------------------------------
+ * Expires API
+ *----------------------------------------------------------------------------*/
+
+/* Remove the expire associated with 'key'. Returns 1 if an expire entry
+ * was deleted, 0 if the key had none. */
+int removeExpire(redisDb *db, robj *key) {
+    /* An expire entry only makes sense for a key living in the main
+     * dict: without one the key would never be freed. */
+    redisAssert(dictFind(db->dict,key->ptr) != NULL);
+    return dictDelete(db->expires,key->ptr) == DICT_OK;
+}
+
+/* Associate the expire time 'when' with 'key'.
+ *
+ * The key must exist in the main dictionary (asserted). Returns 1 if the
+ * expire was set, 0 if dictAdd() refused the entry, which is what happens
+ * when the key already has an expire. */
+int setExpire(redisDb *db, robj *key, time_t when) {
+    /* The lookup is done outside of the assertion, so that 'de' is
+     * assigned regardless of how the assertion macro treats its argument. */
+    dictEntry *de = dictFind(db->dict,key->ptr);
+
+    redisAssert(de != NULL);
+    /* Reuse the sds from the main dict in the expire dict */
+    if (dictAdd(db->expires,dictGetEntryKey(de),(void*)when) == DICT_ERR) {
+        return 0;
+    } else {
+        return 1;
+    }
+}
+
+/* Return the expire time stored for 'key', or -1 when the key has no
+ * expire (it is not volatile). */
+time_t getExpire(redisDb *db, robj *key) {
+    dictEntry *de;
+
+    /* Fast path: no volatile keys at all in this DB. */
+    if (dictSize(db->expires) == 0) return -1;
+
+    de = dictFind(db->expires,key->ptr);
+    if (de == NULL) return -1;
+
+    /* Safety check: a key with an expire must also be present in the
+     * main dict. */
+    redisAssert(dictFind(db->dict,key->ptr) != NULL);
+    return (time_t) dictGetEntryVal(de);
+}
+
+/* Delete 'key' if it has an expire set and the expire time is in the past.
+ * Returns 0 when the key is not volatile or not yet expired, otherwise the
+ * result of dbDelete(). */
+int expireIfNeeded(redisDb *db, robj *key) {
+    time_t when = getExpire(db,key);
+
+    /* Not volatile, or still alive: nothing to do. */
+    if (when < 0 || time(NULL) <= when) return 0;
+
+    /* Account for the expiration, then delete the key. */
+    server.stat_expiredkeys++;
+    server.dirty++;
+    return dbDelete(db,key);
+}
+
+/* Delete 'key' if it has an expire set, no matter whether the expire time
+ * was reached or not. Returns 0 for a non volatile key, otherwise the
+ * result of dbDelete(). */
+int deleteIfVolatile(redisDb *db, robj *key) {
+    time_t when = getExpire(db,key);
+
+    if (when < 0) return 0;
+
+    /* Counted as an expired key, exactly like expireIfNeeded() does. */
+    server.stat_expiredkeys++;
+    server.dirty++;
+    return dbDelete(db,key);
+}
+
+/*-----------------------------------------------------------------------------
+ * Expires Commands
+ *----------------------------------------------------------------------------*/
+
+/* Implementation shared by EXPIRE and EXPIREAT.
+ *
+ * 'param' holds the numeric argument given by the user and 'offset' is
+ * subtracted from it to obtain a number of seconds from now: EXPIRE passes
+ * 0, EXPIREAT passes the current unix time.
+ *
+ * Replies: "zero" if the key does not exist or the expire could not be set
+ * (setExpire() failed), "one" if the expire was set or if the resulting
+ * number of seconds was <= 0, in which case the key is deleted at once. */
+void expireGenericCommand(redisClient *c, robj *key, robj *param, long offset) {
+    /* getLongFromObjectOrReply() stores its result through a long pointer,
+     * so the variable must really be a long: handing it the address of a
+     * time_t is wrong wherever the two types differ in size. */
+    long seconds;
+
+    if (getLongFromObjectOrReply(c, param, &seconds, NULL) != REDIS_OK) return;
+
+    seconds -= offset;
+
+    if (dictFind(c->db->dict,key->ptr) == NULL) {
+        addReply(c,shared.czero);
+        return;
+    }
+    if (seconds <= 0) {
+        /* Already in the past: the key is removed immediately. */
+        if (dbDelete(c->db,key)) server.dirty++;
+        addReply(c, shared.cone);
+        return;
+    } else {
+        time_t when = time(NULL)+seconds;
+        if (setExpire(c->db,key,when)) {
+            addReply(c,shared.cone);
+            server.dirty++;
+        } else {
+            addReply(c,shared.czero);
+        }
+        return;
+    }
+}
+
+/* EXPIRE key seconds: the argument is already relative to the current
+ * time, so the offset is zero. */
+void expireCommand(redisClient *c) {
+    robj *key = c->argv[1], *seconds = c->argv[2];
+
+    expireGenericCommand(c,key,seconds,0);
+}
+
+/* EXPIREAT key unixtime: the argument is an absolute time, so the current
+ * time is passed as the offset to turn it into a relative one. */
+void expireatCommand(redisClient *c) {
+    robj *key = c->argv[1], *unixtime = c->argv[2];
+
+    expireGenericCommand(c,key,unixtime,time(NULL));
+}
+
+/* TTL key: integer reply with the seconds left before the key expires.
+ * The reply is -1 when the key has no expire and also when the computed
+ * difference is negative. */
+void ttlCommand(redisClient *c) {
+    time_t expire = getExpire(c->db,c->argv[1]);
+    int ttl;
+
+    if (expire == -1) {
+        ttl = -1;
+    } else {
+        ttl = (int) (expire-time(NULL));
+        if (ttl < 0) ttl = -1;
+    }
+    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",ttl));
+}
+
+
diff --git a/src/debug.c b/src/debug.c
new file mode 100644
index 000000000..10b620d6f
--- /dev/null
+++ b/src/debug.c
@@ -0,0 +1,309 @@
+#include "redis.h"
+#include "sha1.h" /* SHA1 is used for DEBUG DIGEST */
+
+/* ================================= Debugging ============================== */
+
+/* Compute the SHA1 of the 'len' bytes at 'ptr' and xor it, byte by byte,
+ * into the 20 bytes pointed by 'digest'.
+ *
+ * Xor is commutative, so this is the way digests of unordered elements are
+ * "added" together: digest(a,b,c,d) is the same as digest(b,a,c,d). */
+void xorDigest(unsigned char *digest, void *ptr, size_t len) {
+    unsigned char sha[20];
+    SHA1_CTX ctx;
+    int i = 0;
+
+    SHA1Init(&ctx);
+    SHA1Update(&ctx,(unsigned char*)ptr,len);
+    SHA1Final(sha,&ctx);
+
+    while (i < 20) {
+        digest[i] ^= sha[i];
+        i++;
+    }
+}
+
+/* xorDigest() applied to the string obtained from getDecodedObject(o).
+ * The reference returned by getDecodedObject() is released before
+ * returning. */
+void xorObjectDigest(unsigned char *digest, robj *o) {
+    robj *decoded = getDecodedObject(o);
+
+    xorDigest(digest,decoded->ptr,sdslen(decoded->ptr));
+    decrRefCount(decoded);
+}
+
+/* Order-preserving variant of xorDigest(): after xoring the SHA1 of the
+ * data into 'digest', the digest itself is hashed and replaced by the
+ * result:
+ *
+ * digest = SHA1(digest xor SHA1(data))
+ *
+ * It is used whenever the order of the elements matters, so that
+ * digest(a,b,c,d) differs from digest(b,c,d,a).
+ *
+ * Also note that mixDigest("foo") followed by mixDigest("bar") leads to a
+ * different digest than mixDigest("fo") followed by mixDigest("obar").
+ */
+void mixDigest(unsigned char *digest, void *ptr, size_t len) {
+    SHA1_CTX ctx;
+
+    /* Step 1: digest ^= SHA1(data). */
+    xorDigest(digest,ptr,len);
+
+    /* Step 2: digest = SHA1(digest). */
+    SHA1Init(&ctx);
+    SHA1Update(&ctx,digest,20);
+    SHA1Final(digest,&ctx);
+}
+
+/* mixDigest() applied to the string obtained from getDecodedObject(o).
+ * The reference returned by getDecodedObject() is released before
+ * returning. */
+void mixObjectDigest(unsigned char *digest, robj *o) {
+    robj *decoded = getDecodedObject(o);
+
+    mixDigest(digest,decoded->ptr,sdslen(decoded->ptr));
+    decrRefCount(decoded);
+}
+
+/* Compute the digest of the whole dataset into the 20 bytes at 'final'.
+ *
+ * Keys, set elements and hash fields are not ordered, so every aggregate
+ * digest is the xor of the digests of its elements: this way the order does
+ * not change the result. For lists a feedback is used instead (the output
+ * digest enters the next step as input), so that the same elements in a
+ * different order produce a different digest.
+ *
+ * Empty databases are skipped. Keys that turn out to be expired when they
+ * are looked up do not contribute to the digest. */
+void computeDatasetDigest(unsigned char *final) {
+    unsigned char digest[20];
+    char buf[128];
+    dictIterator *di = NULL;
+    dictEntry *de;
+    int j;
+    uint32_t aux;
+
+    memset(final,0,20); /* Start with a clean result */
+
+    for (j = 0; j < server.dbnum; j++) {
+        redisDb *db = server.db+j;
+
+        if (dictSize(db->dict) == 0) continue;
+        di = dictGetIterator(db->dict);
+
+        /* hash the DB id, so the same dataset moved in a different
+         * DB will lead to a different digest */
+        aux = htonl(j);
+        mixDigest(final,&aux,sizeof(aux));
+
+        /* Iterate this DB writing every entry */
+        while((de = dictNext(di)) != NULL) {
+            sds key;
+            robj *keyobj, *o;
+            time_t expiretime;
+
+            memset(digest,0,20); /* This key-val digest */
+            key = dictGetEntryKey(de);
+            keyobj = createStringObject(key,sdslen(key));
+
+            mixDigest(digest,key,sdslen(key));
+
+            /* Make sure the key is loaded if VM is active */
+            o = lookupKeyRead(db,keyobj);
+            if (o == NULL) {
+                /* lookupKeyRead() expires keys on access, so a key seen
+                 * by the iterator may be gone by now: skip it instead of
+                 * dereferencing a NULL value. */
+                decrRefCount(keyobj);
+                continue;
+            }
+
+            aux = htonl(o->type);
+            mixDigest(digest,&aux,sizeof(aux));
+            expiretime = getExpire(db,keyobj);
+
+            /* Save the key and associated value */
+            if (o->type == REDIS_STRING) {
+                mixObjectDigest(digest,o);
+            } else if (o->type == REDIS_LIST) {
+                /* Ordered: every element is mixed into the digest. */
+                listTypeIterator *li = listTypeInitIterator(o,0,REDIS_TAIL);
+                listTypeEntry entry;
+                while(listTypeNext(li,&entry)) {
+                    robj *eleobj = listTypeGet(&entry);
+                    mixObjectDigest(digest,eleobj);
+                    decrRefCount(eleobj);
+                }
+                listTypeReleaseIterator(li);
+            } else if (o->type == REDIS_SET) {
+                /* Unordered: every member is xored into the digest. */
+                dict *set = o->ptr;
+                dictIterator *si = dictGetIterator(set);
+                dictEntry *se;
+
+                while((se = dictNext(si)) != NULL) {
+                    robj *eleobj = dictGetEntryKey(se);
+
+                    xorObjectDigest(digest,eleobj);
+                }
+                dictReleaseIterator(si);
+            } else if (o->type == REDIS_ZSET) {
+                /* Unordered iteration over the dict: member and score are
+                 * mixed in a per-element digest that is then xored in. */
+                zset *zs = o->ptr;
+                dictIterator *zi = dictGetIterator(zs->dict);
+                dictEntry *ze;
+
+                while((ze = dictNext(zi)) != NULL) {
+                    robj *eleobj = dictGetEntryKey(ze);
+                    double *score = dictGetEntryVal(ze);
+                    unsigned char eledigest[20];
+
+                    snprintf(buf,sizeof(buf),"%.17g",*score);
+                    memset(eledigest,0,20);
+                    mixObjectDigest(eledigest,eleobj);
+                    mixDigest(eledigest,buf,strlen(buf));
+                    xorDigest(digest,eledigest,20);
+                }
+                dictReleaseIterator(zi);
+            } else if (o->type == REDIS_HASH) {
+                /* Field and value are mixed in a per-pair digest that is
+                 * then xored in. */
+                hashTypeIterator *hi;
+                robj *obj;
+
+                hi = hashTypeInitIterator(o);
+                while (hashTypeNext(hi) != REDIS_ERR) {
+                    unsigned char eledigest[20];
+
+                    memset(eledigest,0,20);
+                    obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
+                    mixObjectDigest(eledigest,obj);
+                    decrRefCount(obj);
+                    obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
+                    mixObjectDigest(eledigest,obj);
+                    decrRefCount(obj);
+                    xorDigest(digest,eledigest,20);
+                }
+                hashTypeReleaseIterator(hi);
+            } else {
+                redisPanic("Unknown object type");
+            }
+            /* If the key has an expire, add it to the mix */
+            if (expiretime != -1) xorDigest(digest,"!!expire!!",10);
+            /* We can finally xor the key-val digest to the final digest */
+            xorDigest(final,digest,20);
+            decrRefCount(keyobj);
+        }
+        dictReleaseIterator(di);
+    }
+}
+
+/* DEBUG <subcommand> [<arg>]: a collection of debugging helpers selected,
+ * case insensitively, by argv[1]:
+ *
+ * SEGFAULT          write to an invalid address on purpose.
+ * RELOAD            save the dataset, empty it, and load it back.
+ * LOADAOF           empty the dataset and load the append only file.
+ * OBJECT <key>      describe the value stored at key.
+ * SWAPIN <key>      look the key up, so its value is loaded in memory.
+ * SWAPOUT <key>     swap the value of key out (requires the VM).
+ * POPULATE <count>  create the keys key:0 ... key:<count-1> when missing.
+ * DIGEST            reply with the hex digest of the whole dataset.
+ *
+ * Anything else, including a wrong number of arguments for the subcommands
+ * that check it, gets a syntax error reply. */
+void debugCommand(redisClient *c) {
+    if (!strcasecmp(c->argv[1]->ptr,"segfault")) {
+        *((char*)-1) = 'x';
+    } else if (!strcasecmp(c->argv[1]->ptr,"reload")) {
+        if (rdbSave(server.dbfilename) != REDIS_OK) {
+            addReply(c,shared.err);
+            return;
+        }
+        emptyDb();
+        if (rdbLoad(server.dbfilename) != REDIS_OK) {
+            addReply(c,shared.err);
+            return;
+        }
+        redisLog(REDIS_WARNING,"DB reloaded by DEBUG RELOAD");
+        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"loadaof")) {
+        emptyDb();
+        if (loadAppendOnlyFile(server.appendfilename) != REDIS_OK) {
+            addReply(c,shared.err);
+            return;
+        }
+        redisLog(REDIS_WARNING,"Append Only File loaded by DEBUG LOADAOF");
+        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) {
+        /* The dict is accessed directly, so a swapped out value is
+         * described without being loaded back. */
+        dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
+        robj *val;
+
+        if (!de) {
+            addReply(c,shared.nokeyerr);
+            return;
+        }
+        val = dictGetEntryVal(de);
+        if (!server.vm_enabled || (val->storage == REDIS_VM_MEMORY ||
+                                   val->storage == REDIS_VM_SWAPPING)) {
+            char *strenc;
+
+            strenc = strEncoding(val->encoding);
+            addReplySds(c,sdscatprintf(sdsempty(),
+                "+Value at:%p refcount:%d "
+                "encoding:%s serializedlength:%lld\r\n",
+                (void*)val, val->refcount,
+                strenc, (long long) rdbSavedObjectLen(val,NULL)));
+        } else {
+            vmpointer *vp = (vmpointer*) val;
+            addReplySds(c,sdscatprintf(sdsempty(),
+                "+Value swapped at: page %llu "
+                "using %llu pages\r\n",
+                (unsigned long long) vp->page,
+                (unsigned long long) vp->usedpages));
+        }
+    } else if (!strcasecmp(c->argv[1]->ptr,"swapin") && c->argc == 3) {
+        lookupKeyRead(c->db,c->argv[2]);
+        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"swapout") && c->argc == 3) {
+        dictEntry *de = dictFind(c->db->dict,c->argv[2]->ptr);
+        robj *val;
+        vmpointer *vp;
+
+        if (!server.vm_enabled) {
+            addReplySds(c,sdsnew("-ERR Virtual Memory is disabled\r\n"));
+            return;
+        }
+        if (!de) {
+            addReply(c,shared.nokeyerr);
+            return;
+        }
+        val = dictGetEntryVal(de);
+        /* Swap it */
+        if (val->storage != REDIS_VM_MEMORY) {
+            addReplySds(c,sdsnew("-ERR This key is not in memory\r\n"));
+        } else if (val->refcount != 1) {
+            addReplySds(c,sdsnew("-ERR Object is shared\r\n"));
+        } else if ((vp = vmSwapObjectBlocking(val)) != NULL) {
+            dictGetEntryVal(de) = vp;
+            addReply(c,shared.ok);
+        } else {
+            addReply(c,shared.err);
+        }
+    } else if (!strcasecmp(c->argv[1]->ptr,"populate") && c->argc == 3) {
+        long keys, j;
+        robj *key, *val;
+        char buf[128];
+
+        if (getLongFromObjectOrReply(c, c->argv[2], &keys, NULL) != REDIS_OK)
+            return;
+        for (j = 0; j < keys; j++) {
+            /* 'j' is a (never negative) long: print it with %ld. */
+            snprintf(buf,sizeof(buf),"key:%ld",j);
+            key = createStringObject(buf,strlen(buf));
+            if (lookupKeyRead(c->db,key) != NULL) {
+                /* Existing keys are left alone. */
+                decrRefCount(key);
+                continue;
+            }
+            snprintf(buf,sizeof(buf),"value:%ld",j);
+            val = createStringObject(buf,strlen(buf));
+            dbAdd(c->db,key,val);
+            decrRefCount(key);
+        }
+        addReply(c,shared.ok);
+    } else if (!strcasecmp(c->argv[1]->ptr,"digest") && c->argc == 2) {
+        unsigned char digest[20];
+        sds d = sdsnew("+");
+        int j;
+
+        computeDatasetDigest(digest);
+        for (j = 0; j < 20; j++)
+            d = sdscatprintf(d, "%02x",digest[j]);
+
+        d = sdscatlen(d,"\r\n",2);
+        addReplySds(c,d);
+    } else {
+        addReplySds(c,sdsnew(
+            "-ERR Syntax error, try DEBUG [SEGFAULT|OBJECT <key>|SWAPIN <key>|SWAPOUT <key>|RELOAD|LOADAOF|POPULATE <count>|DIGEST]\r\n"));
+    }
+}
+
+/* Log a failed assertion: 'estr' is the text of the expression, 'file' and
+ * 'line' the place where it was checked. When HAVE_BACKTRACE is defined an
+ * invalid write is then performed on purpose, so that a SIGSEGV is raised
+ * and the stack trace gets printed. */
+void _redisAssert(char *estr, char *file, int line) {
+    redisLog(REDIS_WARNING,"=== ASSERTION FAILED ===");
+    redisLog(REDIS_WARNING,"==> %s:%d '%s' is not true",file,line,estr);
+#if defined(HAVE_BACKTRACE)
+    redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
+    *((char*)-1) = 'x';
+#endif
+}
+
+/* Log an unrecoverable condition: 'msg' describes it, 'file' and 'line'
+ * tell where it was detected. When HAVE_BACKTRACE is defined an invalid
+ * write is then performed on purpose, so that a SIGSEGV is raised and the
+ * stack trace gets printed. */
+void _redisPanic(char *msg, char *file, int line) {
+    redisLog(REDIS_WARNING,"!!! Software Failure. Press left mouse button to continue");
+    redisLog(REDIS_WARNING,"Guru Meditation: %s #%s:%d",msg,file,line);
+#if defined(HAVE_BACKTRACE)
+    redisLog(REDIS_WARNING,"(forcing SIGSEGV in order to print the stack trace)");
+    *((char*)-1) = 'x';
+#endif
+}
diff --git a/src/dict.c b/src/dict.c
new file mode 100644
index 000000000..d5010708c
--- /dev/null
+++ b/src/dict.c
@@ -0,0 +1,727 @@
+/* Hash Tables Implementation.
+ *
+ * This file implements in memory hash tables with insert/del/replace/find/
+ * get-random-element operations. Hash tables will auto resize if needed
+ * tables of power of two in size are used, collisions are handled by
+ * chaining. See the source code for more information... :)
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fmacros.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <assert.h>
+#include <limits.h>
+#include <sys/time.h>
+
+#include "dict.h"
+#include "zmalloc.h"
+
+/* Global switch for hash table resizing: while it is 0, dictResize()
+ * refuses to operate and returns DICT_ERR. It is meant to be toggled with
+ * dictEnableResize() / dictDisableResize(). This is very important for
+ * Redis, as we use copy-on-write and don't want to move too much memory
+ * around when there is a child performing saving operations. */
+static int dict_can_resize = 1;
+
+/* ---------------------------- Utility functions --------------------------- */
+
+/* Print a printf-style formatted message on stderr, prefixed by the
+ * "DICT LIBRARY PANIC: " tag. The function only reports: it returns to the
+ * caller afterwards. */
+static void _dictPanic(const char *fmt, ...)
+{
+    va_list args;
+
+    fputs("\nDICT LIBRARY PANIC: ", stderr);
+    va_start(args, fmt);
+    vfprintf(stderr, fmt, args);
+    va_end(args);
+    fputs("\n\n", stderr);
+}
+
+/* ------------------------- Heap Management Wrappers------------------------ */
+
+/* Allocate 'size' bytes with zmalloc(). The returned pointer is never
+ * NULL: when the allocation fails the problem is reported with _dictPanic()
+ * and the process is aborted, since the callers use the memory without
+ * checking it. */
+static void *_dictAlloc(size_t size)
+{
+    void *p = zmalloc(size);
+    if (p == NULL) {
+        _dictPanic("Out of memory");
+        /* _dictPanic() returns: stop here instead of handing a NULL
+         * pointer to code that is going to dereference it. */
+        abort();
+    }
+    return p;
+}
+
+/* Release memory obtained with _dictAlloc(): a thin wrapper around
+ * zfree(). */
+static void _dictFree(void *ptr)
+{
+    zfree(ptr);
+}
+
+/* -------------------------- private prototypes ---------------------------- */
+
+static int _dictExpandIfNeeded(dict *ht);
+static unsigned long _dictNextPower(unsigned long size);
+static int _dictKeyIndex(dict *ht, const void *key);
+static int _dictInit(dict *ht, dictType *type, void *privDataPtr);
+
+/* -------------------------- hash functions -------------------------------- */
+
+/* Thomas Wang's 32 bit Mix Function: scramble the bits of an integer key
+ * through a fixed sequence of shift, add, xor and complement steps. */
+unsigned int dictIntHashFunction(unsigned int key)
+{
+    unsigned int h = key;
+
+    h = h + ~(h << 15);
+    h = h ^ (h >> 10);
+    h = h + (h << 3);
+    h = h ^ (h >> 6);
+    h = h + ~(h << 11);
+    h = h ^ (h >> 16);
+    return h;
+}
+
+/* Identity hash function for integer keys: the hash is the key itself. */
+unsigned int dictIdentityHashFunction(unsigned int key)
+{
+    unsigned int hash = key;
+
+    return hash;
+}
+
+/* Generic hash function (a popular one from Bernstein): starting from 5381,
+ * every byte of the buffer updates the hash as hash = hash * 33 + byte.
+ *
+ * 'buf' points to the data and 'len' is the number of bytes to hash. A
+ * 'len' of zero or less hashes nothing and returns the initial value: the
+ * buffer is never read past a negative length. */
+unsigned int dictGenHashFunction(const unsigned char *buf, int len) {
+    unsigned int hash = 5381;
+    int j;
+
+    for (j = 0; j < len; j++)
+        hash = ((hash << 5) + hash) + buf[j]; /* hash * 33 + c */
+    return hash;
+}
+
+/* ----------------------------- API implementation ------------------------- */
+
+/* Put a hash table structure in the "empty" state: no bucket array, zero
+ * size, zero size mask and zero stored elements.
+ * NOTE: nothing is freed here, the previous bucket array (if any) must be
+ * released by the caller. */
+static void _dictReset(dictht *ht)
+{
+    ht->used = 0;
+    ht->sizemask = 0;
+    ht->size = 0;
+    ht->table = NULL;
+}
+
+/* Allocate and initialize a new, empty dictionary using the given type
+ * (the set of callbacks) and private data pointer. */
+dict *dictCreate(dictType *type,
+        void *privDataPtr)
+{
+    dict *fresh;
+
+    fresh = _dictAlloc(sizeof(*fresh));
+    _dictInit(fresh,type,privDataPtr);
+    return fresh;
+}
+
+/* Initialize a dictionary structure: both hash tables are reset, type and
+ * private data are stored, no rehashing is in progress (rehashidx is -1)
+ * and no iterators are registered. Always returns DICT_OK. */
+int _dictInit(dict *d, dictType *type,
+        void *privDataPtr)
+{
+    d->type = type;
+    d->privdata = privDataPtr;
+    d->iterators = 0;
+    d->rehashidx = -1;
+    _dictReset(&d->ht[0]);
+    _dictReset(&d->ht[1]);
+    return DICT_OK;
+}
+
+/* Resize the table to the minimal size that contains all the elements,
+ * but with the invariant of a USED/BUCKETS ratio near to <= 1.
+ *
+ * Returns DICT_ERR without doing anything when resizing is disabled or a
+ * rehashing is in progress, otherwise the result of dictExpand(). */
+int dictResize(dict *d)
+{
+    /* Same type dictExpand() takes for its size, so that the element
+     * count is never truncated. */
+    unsigned long minimal;
+
+    if (!dict_can_resize || dictIsRehashing(d)) return DICT_ERR;
+    minimal = d->ht[0].used;
+    if (minimal < DICT_HT_INITIAL_SIZE)
+        minimal = DICT_HT_INITIAL_SIZE;
+    return dictExpand(d, minimal);
+}
+
+/* Expand or create the hash table so that it can hold 'size' elements:
+ * the real number of buckets is the one returned by _dictNextPower(size).
+ *
+ * Returns DICT_ERR if a rehashing is already in progress or if 'size' is
+ * smaller than the number of elements currently stored, DICT_OK otherwise. */
+int dictExpand(dict *d, unsigned long size)
+{
+    dictht fresh; /* the new hashtable */
+    unsigned long buckets = _dictNextPower(size);
+
+    if (dictIsRehashing(d) || d->ht[0].used > size)
+        return DICT_ERR;
+
+    /* Build the new table with all the buckets empty. */
+    fresh.table = _dictAlloc(buckets*sizeof(dictEntry*));
+    memset(fresh.table, 0, buckets*sizeof(dictEntry*));
+    fresh.size = buckets;
+    fresh.sizemask = buckets-1;
+    fresh.used = 0;
+
+    if (d->ht[0].table != NULL) {
+        /* There is a table in use: install the new one as the second
+         * table and start the incremental rehashing. */
+        d->ht[1] = fresh;
+        d->rehashidx = 0;
+    } else {
+        /* First initialization: not really a rehashing, the first table
+         * is simply set so that it can accept keys. */
+        d->ht[0] = fresh;
+    }
+    return DICT_OK;
+}
+
+/* Perform N steps of incremental rehashing. Returns 1 if there may still be
+ * keys to move from the old to the new hash table, 0 if no rehashing is in
+ * progress or if it was completed by this call.
+ *
+ * A rehashing step consists in moving one bucket (which may hold more than
+ * one key, since collisions are chained) from the old to the new table. */
+int dictRehash(dict *d, int n) {
+    if (!dictIsRehashing(d)) return 0;
+
+    for (; n != 0; n--) {
+        dictEntry *cur, *next;
+
+        /* Old table empty: rehashing is over, the new table takes the
+         * place of the old one. */
+        if (d->ht[0].used == 0) {
+            _dictFree(d->ht[0].table);
+            d->ht[0] = d->ht[1];
+            _dictReset(&d->ht[1]);
+            d->rehashidx = -1;
+            return 0;
+        }
+
+        /* Skip the empty buckets. The index can't run past the end of
+         * the table, as ht[0].used != 0 means elements are still there. */
+        while (d->ht[0].table[d->rehashidx] == NULL) d->rehashidx++;
+
+        /* Relink every key of this bucket at the head of its bucket in
+         * the new table. */
+        for (cur = d->ht[0].table[d->rehashidx]; cur != NULL; cur = next) {
+            unsigned int slot;
+
+            next = cur->next;
+            slot = dictHashKey(d, cur->key) & d->ht[1].sizemask;
+            cur->next = d->ht[1].table[slot];
+            d->ht[1].table[slot] = cur;
+            d->ht[0].used--;
+            d->ht[1].used++;
+        }
+        d->ht[0].table[d->rehashidx] = NULL;
+        d->rehashidx++;
+    }
+    return 1;
+}
+
+/* Return the time reported by gettimeofday() converted to milliseconds. */
+long long timeInMilliseconds(void) {
+    struct timeval now;
+    long long ms;
+
+    gettimeofday(&now,NULL);
+    ms = ((long long)now.tv_sec)*1000;
+    ms += now.tv_usec/1000;
+    return ms;
+}
+
+/* Rehash in batches of 100 steps until the rehashing is complete or more
+ * than 'ms' milliseconds have elapsed. Returns the number of steps
+ * accounted for, 100 for every batch that left work to do. */
+int dictRehashMilliseconds(dict *d, int ms) {
+    long long start = timeInMilliseconds();
+    int moved = 0;
+
+    for (;;) {
+        if (!dictRehash(d,100)) break;                  /* Nothing left. */
+        moved += 100;
+        if (timeInMilliseconds()-start > ms) break;     /* Out of time. */
+    }
+    return moved;
+}
+
+/* Perform a single step of rehashing, but only when no iterators are bound
+ * to the dictionary: with iterators in the middle of a rehashing, moving
+ * entries between the two tables could make some element be missed or
+ * returned twice.
+ *
+ * This is called by the common lookup and update operations, so that the
+ * hash table migrates from the old to the new table while it is in use. */
+static void _dictRehashStep(dict *d) {
+    if (d->iterators != 0) return;
+    dictRehash(d,1);
+}
+
+/* Add the 'key' -> 'val' pair to the dictionary. Returns DICT_OK on
+ * success, DICT_ERR if the key is already present (nothing is changed).
+ *
+ * While a rehashing is in progress a rehashing step is attempted first and
+ * the new entry goes into the second table. */
+int dictAdd(dict *d, void *key, void *val)
+{
+    dictht *target;
+    dictEntry *node;
+    int slot;
+
+    if (dictIsRehashing(d)) _dictRehashStep(d);
+
+    /* -1 means the key is already there. */
+    slot = _dictKeyIndex(d, key);
+    if (slot == -1) return DICT_ERR;
+
+    /* Link a new entry at the head of the bucket. */
+    target = dictIsRehashing(d) ? &d->ht[1] : &d->ht[0];
+    node = _dictAlloc(sizeof(*node));
+    node->next = target->table[slot];
+    target->table[slot] = node;
+    target->used++;
+
+    /* Store key and value in the entry. */
+    dictSetHashKey(d, node, key);
+    dictSetHashVal(d, node, val);
+    return DICT_OK;
+}
+
+/* Add an element, discarding the old value if the key already exists.
+ * Returns 1 if the key was added from scratch, 0 if an element with such
+ * key was already there and only its value was updated. */
+int dictReplace(dict *d, void *key, void *val)
+{
+    dictEntry *existing, saved;
+
+    /* dictAdd() succeeds only when the key is not there yet. */
+    if (dictAdd(d, key, val) == DICT_OK) return 1;
+
+    existing = dictFind(d, key);
+    /* Set the new value first and free the old one afterwards, working on
+     * a copy of the old entry. The order matters because the new value may
+     * be exactly the same as the previous one: think of reference
+     * counting, where you want to increment (set) and then decrement
+     * (free), not the reverse. */
+    saved = *existing;
+    dictSetHashVal(d, existing, val);
+    dictFreeEntryVal(d, &saved);
+    return 0;
+}
+
+/* Look up 'key' and unlink its entry from the dictionary.
+ *
+ * When 'nofree' is zero the key and value are released through the type
+ * destructors before the entry itself is freed; when it is non-zero only
+ * the entry structure is freed.
+ *
+ * Returns DICT_OK if an entry was removed, DICT_ERR if the key was not
+ * found (or the dictionary has no table yet). */
+static int dictGenericDelete(dict *d, const void *key, int nofree)
+{
+    unsigned int hash, slot;
+    dictEntry *cur, *prev;
+    int t;
+
+    if (d->ht[0].size == 0) return DICT_ERR; /* d->ht[0].table is NULL */
+    if (dictIsRehashing(d)) _dictRehashStep(d);
+    hash = dictHashKey(d, key);
+
+    for (t = 0; t <= 1; t++) {
+        slot = hash & d->ht[t].sizemask;
+        prev = NULL;
+        for (cur = d->ht[t].table[slot]; cur != NULL;
+             prev = cur, cur = cur->next)
+        {
+            if (!dictCompareHashKeys(d, key, cur->key)) continue;
+
+            /* Found: unlink the element from the chain. */
+            if (prev != NULL)
+                prev->next = cur->next;
+            else
+                d->ht[t].table[slot] = cur->next;
+            if (!nofree) {
+                dictFreeEntryKey(d, cur);
+                dictFreeEntryVal(d, cur);
+            }
+            _dictFree(cur);
+            d->ht[t].used--;
+            return DICT_OK;
+        }
+        /* The second table only matters while rehashing. */
+        if (!dictIsRehashing(d)) break;
+    }
+    return DICT_ERR; /* not found */
+}
+
+/* Remove the entry matching 'key'. The entry's key and value are released
+ * through the dictionary type destructors, when set (nofree = 0).
+ * Returns DICT_OK if an entry was removed, DICT_ERR if none matched. */
+int dictDelete(dict *ht, const void *key) {
+ return dictGenericDelete(ht,key,0);
+}
+
+/* Remove the entry matching 'key' without calling the key/value
+ * destructors (nofree = 1): only the entry structure itself is freed, so
+ * the caller keeps ownership of the key and value.
+ * Returns DICT_OK if an entry was removed, DICT_ERR if none matched. */
+int dictDeleteNoFree(dict *ht, const void *key) {
+ return dictGenericDelete(ht,key,1);
+}
+
+/* Release every entry stored in the hash table 'ht' of dictionary 'd',
+ * free the bucket array and reset the table to its initial state.
+ * Always returns DICT_OK. */
+int _dictClear(dict *d, dictht *ht)
+{
+    unsigned long slot = 0;
+
+    /* Walk the buckets, stopping early once no element is left. */
+    while (slot < ht->size && ht->used > 0) {
+        dictEntry *cur = ht->table[slot++];
+
+        while (cur != NULL) {
+            dictEntry *following = cur->next;
+
+            dictFreeEntryKey(d, cur);
+            dictFreeEntryVal(d, cur);
+            _dictFree(cur);
+            ht->used--;
+            cur = following;
+        }
+    }
+    /* Free the bucket array, then re-initialize the table. */
+    _dictFree(ht->table);
+    _dictReset(ht);
+    return DICT_OK; /* never fails */
+}
+
+/* Clear & Release the hash table: every entry of both internal tables is
+ * freed, then the dict structure itself. 'd' must not be used afterwards. */
+void dictRelease(dict *d)
+{
+ /* Both tables are cleared: ht[1] holds entries while a rehash is in
+ * progress. */
+ _dictClear(d,&d->ht[0]);
+ _dictClear(d,&d->ht[1]);
+ _dictFree(d);
+}
+
+/* Return the entry whose key matches 'key', or NULL if there is none.
+ * As a side effect a single rehashing step is attempted when a rehash
+ * is in progress. */
+dictEntry *dictFind(dict *d, const void *key)
+{
+    unsigned int hash, t;
+    dictEntry *cur;
+
+    if (d->ht[0].size == 0) return NULL; /* We don't have a table at all */
+    if (dictIsRehashing(d)) _dictRehashStep(d);
+    hash = dictHashKey(d, key);
+    for (t = 0; t <= 1; t++) {
+        cur = d->ht[t].table[hash & d->ht[t].sizemask];
+        for (; cur != NULL; cur = cur->next) {
+            if (dictCompareHashKeys(d, key, cur->key)) return cur;
+        }
+        /* The second table is searched only while rehashing. */
+        if (!dictIsRehashing(d)) break;
+    }
+    return NULL;
+}
+
+/* Return the value stored under 'key', or NULL when the key is missing.
+ * A stored NULL value cannot be told apart from a missing key. */
+void *dictFetchValue(dict *d, const void *key) {
+    dictEntry *found = dictFind(d,key);
+
+    if (found == NULL) return NULL;
+    return dictGetEntryVal(found);
+}
+
+/* Allocate an iterator over 'd', positioned before the first bucket of
+ * ht[0] (table 0, index -1). The dictionary 'iterators' counter is not
+ * touched here: it is incremented by the first dictNext() call. The
+ * iterator must be released with dictReleaseIterator(). */
+dictIterator *dictGetIterator(dict *d)
+{
+ dictIterator *iter = _dictAlloc(sizeof(*iter));
+
+ iter->d = d;
+ iter->table = 0;
+ iter->index = -1;
+ iter->entry = NULL;
+ iter->nextEntry = NULL;
+ return iter;
+}
+
+/* Advance the iterator and return the next entry, or NULL when every
+ * bucket has been visited. The first call registers the iterator in the
+ * dictionary 'iterators' counter, which makes _dictRehashStep() a no-op
+ * until dictReleaseIterator() is called. */
+dictEntry *dictNext(dictIterator *iter)
+{
+ while (1) {
+ if (iter->entry == NULL) {
+ /* Current chain exhausted (or first call): move to the next bucket. */
+ dictht *ht = &iter->d->ht[iter->table];
+ /* index == -1 on table 0 means this is the very first call. */
+ if (iter->index == -1 && iter->table == 0) iter->d->iterators++;
+ iter->index++;
+ if (iter->index >= (signed) ht->size) {
+ /* End of this table: continue on ht[1] only if a rehash is in
+ * progress and we were still on ht[0], otherwise we are done. */
+ if (dictIsRehashing(iter->d) && iter->table == 0) {
+ iter->table++;
+ iter->index = 0;
+ ht = &iter->d->ht[1];
+ } else {
+ break;
+ }
+ }
+ iter->entry = ht->table[iter->index];
+ } else {
+ /* Continue along the chain using the pointer saved last time. */
+ iter->entry = iter->nextEntry;
+ }
+ if (iter->entry) {
+ /* We need to save the 'next' here, the iterator user
+ * may delete the entry we are returning. */
+ iter->nextEntry = iter->entry->next;
+ return iter->entry;
+ }
+ }
+ return NULL;
+}
+
+/* Free an iterator. If dictNext() was called at least once on it (that
+ * is, it is no longer in its initial table 0 / index -1 state) the
+ * dictionary 'iterators' counter is decremented as well. */
+void dictReleaseIterator(dictIterator *iter)
+{
+    if (iter->index != -1 || iter->table != 0) iter->d->iterators--;
+    _dictFree(iter);
+}
+
+/* Return a random entry from the dictionary, or NULL if it is empty.
+ * Useful to implement randomized algorithms. */
+dictEntry *dictGetRandomKey(dict *d)
+{
+    dictEntry *bucket, *pick;
+    unsigned int r;
+    int chainlen, skip;
+
+    if (dictSize(d) == 0) return NULL;
+    if (dictIsRehashing(d)) _dictRehashStep(d);
+
+    /* Keep drawing bucket indexes until a non empty bucket is hit. While
+     * rehashing, the two tables are treated as one array of
+     * size0+size1 buckets. */
+    if (!dictIsRehashing(d)) {
+        do {
+            r = random() & d->ht[0].sizemask;
+            bucket = d->ht[0].table[r];
+        } while (bucket == NULL);
+    } else {
+        do {
+            r = random() % (d->ht[0].size+d->ht[1].size);
+            if (r >= d->ht[0].size)
+                bucket = d->ht[1].table[r - d->ht[0].size];
+            else
+                bucket = d->ht[0].table[r];
+        } while (bucket == NULL);
+    }
+
+    /* The bucket is a linked list: count its elements, then walk to a
+     * randomly selected position. */
+    chainlen = 0;
+    for (pick = bucket; pick != NULL; pick = pick->next) chainlen++;
+    skip = random() % chainlen;
+    for (pick = bucket; skip > 0; skip--) pick = pick->next;
+    return pick;
+}
+
+/* ------------------------- private functions ------------------------------ */
+
+/* Expand the hash table if needed.
+ *
+ * An empty table is expanded to the initial size; a "full" table (as many
+ * elements as buckets, or more) is doubled, provided resizing is enabled.
+ * Nothing is done while a rehash is already in progress.
+ * Returns whatever dictExpand() returns, or DICT_OK when no expansion was
+ * attempted. */
+static int _dictExpandIfNeeded(dict *d)
+{
+    unsigned long size, used, larger;
+
+    if (dictIsRehashing(d)) return DICT_OK;
+
+    size = d->ht[0].size;
+    used = d->ht[0].used;
+    if (size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);
+    if (used < size || !dict_can_resize) return DICT_OK;
+
+    larger = (size > used) ? size : used;
+    return dictExpand(d, larger*2);
+}
+
+/* Our hash table capacity is always a power of two: return the first
+ * power of two, starting from DICT_HT_INITIAL_SIZE, that is >= 'size'.
+ * Requests of LONG_MAX or more are clamped to LONG_MAX. */
+static unsigned long _dictNextPower(unsigned long size)
+{
+    unsigned long power;
+
+    if (size >= LONG_MAX) return LONG_MAX;
+    for (power = DICT_HT_INITIAL_SIZE; power < size; power *= 2)
+        ;
+    return power;
+}
+
+/* Return the index of the bucket where a new entry for 'key' can be
+ * linked. -1 is returned if the key already exists (or if the table could
+ * not be expanded).
+ *
+ * Note that while a rehash is in progress the returned index always
+ * refers to the second (new) hash table. */
+static int _dictKeyIndex(dict *d, const void *key)
+{
+    unsigned int hash, slot, t;
+    dictEntry *cur;
+
+    /* Expand the hashtable if needed */
+    if (_dictExpandIfNeeded(d) == DICT_ERR) return -1;
+
+    hash = dictHashKey(d, key);
+    t = 0;
+    do {
+        slot = hash & d->ht[t].sizemask;
+        /* Make sure the chain does not already contain the key. */
+        for (cur = d->ht[t].table[slot]; cur != NULL; cur = cur->next) {
+            if (dictCompareHashKeys(d, key, cur->key)) return -1;
+        }
+        t++;
+    } while (t <= 1 && dictIsRehashing(d));
+    return slot;
+}
+
+/* Remove every entry from both hash tables, then mark the dictionary as
+ * not rehashing and reset the iterators counter. Unlike dictRelease() the
+ * dict structure itself stays allocated and can be reused. */
+void dictEmpty(dict *d) {
+ _dictClear(d,&d->ht[0]);
+ _dictClear(d,&d->ht[1]);
+ d->rehashidx = -1;
+ d->iterators = 0;
+}
+
+/* Number of buckets in the chain length histogram: slot i counts chains of
+ * length i, the last slot counts every chain of DICT_STATS_VECTLEN-1
+ * elements or more. */
+#define DICT_STATS_VECTLEN 50
+
+/* Print to stdout size, load and chain length statistics for the single
+ * hash table 'ht'. Nothing but a notice is printed for an empty table. */
+static void _dictPrintStatsHt(dictht *ht) {
+    unsigned long i, slots = 0, chainlen, maxchainlen = 0;
+    unsigned long totchainlen = 0;
+    unsigned long clvector[DICT_STATS_VECTLEN];
+
+    if (ht->used == 0) {
+        printf("No stats available for empty dictionaries\n");
+        return;
+    }
+
+    for (i = 0; i < DICT_STATS_VECTLEN; i++) clvector[i] = 0;
+    for (i = 0; i < ht->size; i++) {
+        dictEntry *he;
+
+        if (ht->table[i] == NULL) {
+            clvector[0]++;
+            continue;
+        }
+        slots++;
+        /* For each hash entry on this slot... */
+        chainlen = 0;
+        he = ht->table[i];
+        while(he) {
+            chainlen++;
+            he = he->next;
+        }
+        /* Chains too long for the histogram all land in the last slot. */
+        clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++;
+        if (chainlen > maxchainlen) maxchainlen = chainlen;
+        totchainlen += chainlen;
+    }
+    /* All the counters are unsigned long, hence %lu. */
+    printf("Hash table stats:\n");
+    printf(" table size: %lu\n", ht->size);
+    printf(" number of elements: %lu\n", ht->used);
+    printf(" different slots: %lu\n", slots);
+    printf(" max chain length: %lu\n", maxchainlen);
+    printf(" avg chain length (counted): %.02f\n", (float)totchainlen/slots);
+    printf(" avg chain length (computed): %.02f\n", (float)ht->used/slots);
+    printf(" Chain length distribution:\n");
+    /* The loop must include the last slot, otherwise the ">= N" overflow
+     * bucket is never reported. */
+    for (i = 0; i < DICT_STATS_VECTLEN; i++) {
+        if (clvector[i] == 0) continue;
+        printf(" %s%lu: %lu (%.02f%%)\n",(i == DICT_STATS_VECTLEN-1)?">= ":"", i, clvector[i], ((float)clvector[i]/ht->size)*100);
+    }
+}
+
+/* Print statistics for the main table and, while a rehash is in progress,
+ * for the table being populated as well. Output goes to stdout. */
+void dictPrintStats(dict *d) {
+ _dictPrintStatsHt(&d->ht[0]);
+ if (dictIsRehashing(d)) {
+ printf("-- Rehashing into ht[1]:\n");
+ _dictPrintStatsHt(&d->ht[1]);
+ }
+}
+
+/* Set the file-wide 'dict_can_resize' flag, allowing _dictExpandIfNeeded()
+ * to grow tables that are full. The flag is shared by every dictionary. */
+void dictEnableResize(void) {
+ dict_can_resize = 1;
+}
+
+/* Clear the file-wide 'dict_can_resize' flag: _dictExpandIfNeeded() stops
+ * growing full tables. The initial allocation of an empty table is not
+ * affected, as that path does not check the flag. */
+void dictDisableResize(void) {
+ dict_can_resize = 0;
+}
+
+/* ----------------------- StringCopy Hash Table Type ------------------------*/
+
+/* Hash function for keys that are NUL-terminated C strings: the whole
+ * string, terminator excluded, is fed to dictGenHashFunction(). */
+static unsigned int _dictStringCopyHTHashFunction(const void *key)
+{
+ return dictGenHashFunction(key, strlen(key));
+}
+
+/* Key duplication callback: return a freshly allocated copy of the
+ * NUL-terminated string 'key'. 'privdata' is ignored. */
+static void *_dictStringCopyHTKeyDup(void *privdata, const void *key)
+{
+    int n;
+    char *dup;
+    DICT_NOTUSED(privdata);
+
+    n = strlen(key);
+    dup = _dictAlloc(n+1);
+    memcpy(dup, key, n);
+    dup[n] = '\0';
+    return dup;
+}
+
+/* Value duplication callback: return a freshly allocated copy of the
+ * NUL-terminated string 'val'. 'privdata' is ignored. */
+static void *_dictStringKeyValCopyHTValDup(void *privdata, const void *val)
+{
+    int n;
+    char *dup;
+    DICT_NOTUSED(privdata);
+
+    n = strlen(val);
+    dup = _dictAlloc(n+1);
+    memcpy(dup, val, n);
+    dup[n] = '\0';
+    return dup;
+}
+
+/* Key comparison callback for NUL-terminated strings: returns 1 when the
+ * two keys have the same content, 0 otherwise. 'privdata' is ignored. */
+static int _dictStringCopyHTKeyCompare(void *privdata, const void *key1,
+ const void *key2)
+{
+ DICT_NOTUSED(privdata);
+
+ return strcmp(key1, key2) == 0;
+}
+
+/* Key destructor callback: release a key through _dictFree().
+ * 'privdata' is ignored. */
+static void _dictStringCopyHTKeyDestructor(void *privdata, void *key)
+{
+ DICT_NOTUSED(privdata);
+
+ _dictFree((void*)key); /* the cast is a no-op: 'key' is already void* */
+}
+
+/* Value destructor callback: release a value through _dictFree().
+ * 'privdata' is ignored. */
+static void _dictStringKeyValCopyHTValDestructor(void *privdata, void *val)
+{
+ DICT_NOTUSED(privdata);
+
+ _dictFree((void*)val); /* the cast is a no-op: 'val' is already void* */
+}
+
+/* Dictionary type for C string keys: the key is duplicated on insertion
+ * and freed on removal, while values are stored as they are and never
+ * freed by the dictionary. */
+dictType dictTypeHeapStringCopyKey = {
+ _dictStringCopyHTHashFunction, /* hash function */
+ _dictStringCopyHTKeyDup, /* key dup */
+ NULL, /* val dup */
+ _dictStringCopyHTKeyCompare, /* key compare */
+ _dictStringCopyHTKeyDestructor, /* key destructor */
+ NULL /* val destructor */
+};
+
+/* This is like StringCopy but does not auto-duplicate the key: the
+ * dictionary stores the pointer it is given and, having a key destructor,
+ * frees it on removal, so the key must be heap allocated by the caller.
+ * It's used for the interpreter's shared strings. */
+dictType dictTypeHeapStrings = {
+ _dictStringCopyHTHashFunction, /* hash function */
+ NULL, /* key dup */
+ NULL, /* val dup */
+ _dictStringCopyHTKeyCompare, /* key compare */
+ _dictStringCopyHTKeyDestructor, /* key destructor */
+ NULL /* val destructor */
+};
+
+/* This is like StringCopy but also automatically handles dynamically
+ * allocated C strings as values: both key and value are duplicated on
+ * insertion and freed on removal. */
+dictType dictTypeHeapStringCopyKeyValue = {
+ _dictStringCopyHTHashFunction, /* hash function */
+ _dictStringCopyHTKeyDup, /* key dup */
+ _dictStringKeyValCopyHTValDup, /* val dup */
+ _dictStringCopyHTKeyCompare, /* key compare */
+ _dictStringCopyHTKeyDestructor, /* key destructor */
+ _dictStringKeyValCopyHTValDestructor, /* val destructor */
+};
diff --git a/src/dict.h b/src/dict.h
new file mode 100644
index 000000000..30ace4db7
--- /dev/null
+++ b/src/dict.h
@@ -0,0 +1,151 @@
+/* Hash Tables Implementation.
+ *
+ * This file implements in-memory hash tables with insert/del/replace/find/
+ * get-random-element operations. Hash tables auto-resize when needed;
+ * tables always have a power-of-two size, and collisions are handled by
+ * chaining. See the source code for more information... :)
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __DICT_H
+#define __DICT_H
+
+/* Status codes returned by the dict API. Note that success is 0, so the
+ * result must be compared against DICT_OK / DICT_ERR and not used as a
+ * boolean. */
+#define DICT_OK 0
+#define DICT_ERR 1
+
+/* Unused arguments generate annoying warnings... */
+#define DICT_NOTUSED(V) ((void) V)
+
+/* A single key/value pair. Entries whose keys fall in the same bucket are
+ * chained through 'next'. */
+typedef struct dictEntry {
+ void *key;
+ void *val;
+ struct dictEntry *next; /* next entry in the same bucket, or NULL */
+} dictEntry;
+
+/* Set of callbacks defining how a dictionary handles its keys and values.
+ * hashFunction is mandatory (dictHashKey() calls it unconditionally); every
+ * other callback may be NULL, in which case the macros below fall back to
+ * storing the pointer as-is, comparing keys by pointer, and not freeing
+ * anything. 'privdata' is the dict's privdata field. */
+typedef struct dictType {
+ unsigned int (*hashFunction)(const void *key);
+ void *(*keyDup)(void *privdata, const void *key);
+ void *(*valDup)(void *privdata, const void *obj);
+ int (*keyCompare)(void *privdata, const void *key1, const void *key2);
+ void (*keyDestructor)(void *privdata, void *key);
+ void (*valDestructor)(void *privdata, void *obj);
+} dictType;
+
+/* This is our hash table structure. Every dictionary has two of these, as
+ * we implement incremental rehashing from the old to the new table. */
+typedef struct dictht {
+ dictEntry **table; /* array of bucket heads; NULL when size is 0 */
+ unsigned long size; /* number of buckets */
+ unsigned long sizemask; /* ANDed with the hash to get the bucket index;
+ * presumably size-1 -- TODO confirm in dictExpand() */
+ unsigned long used; /* number of entries stored in this table */
+} dictht;
+
+/* The dictionary. ht[0] is the main table; ht[1] is only populated while
+ * an incremental rehash is in progress. */
+typedef struct dict {
+ dictType *type; /* callbacks for keys and values */
+ void *privdata; /* opaque pointer handed to the type callbacks */
+ dictht ht[2];
+ int rehashidx; /* rehashing not in progress if rehashidx == -1 */
+ int iterators; /* number of iterators currently running */
+} dict;
+
+/* Iteration state, see dictGetIterator() / dictNext(). */
+typedef struct dictIterator {
+ dict *d;
+ int table; /* table being visited: 0 or 1 */
+ int index; /* current bucket in that table; -1 before the first dictNext() */
+ dictEntry *entry, *nextEntry; /* entry last returned, and its saved
+ * successor so that the caller may delete
+ * the returned entry */
+} dictIterator;
+
+/* This is the initial size of every hash table */
+#define DICT_HT_INITIAL_SIZE 4
+
+/* ------------------------------- Macros ------------------------------------*/
+
+/* The statement-like macros are wrapped in do { ... } while(0) so that each
+ * one expands to a single statement: a bare 'if' in a macro body would
+ * capture the 'else' of an enclosing if/else written around the call.
+ * Macro arguments are parenthesized so that any expression can be passed
+ * as 'entry'. */
+
+/* Call the value destructor, if any, on the value of 'entry'. */
+#define dictFreeEntryVal(d, entry) do { \
+    if ((d)->type->valDestructor) \
+        (d)->type->valDestructor((d)->privdata, (entry)->val); \
+} while(0)
+
+/* Store '_val_' in 'entry', duplicating it when the type has a valDup. */
+#define dictSetHashVal(d, entry, _val_) do { \
+    if ((d)->type->valDup) \
+        (entry)->val = (d)->type->valDup((d)->privdata, _val_); \
+    else \
+        (entry)->val = (_val_); \
+} while(0)
+
+/* Call the key destructor, if any, on the key of 'entry'. */
+#define dictFreeEntryKey(d, entry) do { \
+    if ((d)->type->keyDestructor) \
+        (d)->type->keyDestructor((d)->privdata, (entry)->key); \
+} while(0)
+
+/* Store '_key_' in 'entry', duplicating it when the type has a keyDup. */
+#define dictSetHashKey(d, entry, _key_) do { \
+    if ((d)->type->keyDup) \
+        (entry)->key = (d)->type->keyDup((d)->privdata, _key_); \
+    else \
+        (entry)->key = (_key_); \
+} while(0)
+
+/* Non-zero when the two keys are equal: the type keyCompare callback is
+ * used when set, plain pointer comparison otherwise. */
+#define dictCompareHashKeys(d, key1, key2) \
+    (((d)->type->keyCompare) ? \
+        (d)->type->keyCompare((d)->privdata, key1, key2) : \
+        (key1) == (key2))
+
+#define dictHashKey(d, key) (d)->type->hashFunction(key)
+
+#define dictGetEntryKey(he) ((he)->key)
+#define dictGetEntryVal(he) ((he)->val)
+/* Total number of buckets / stored entries, both tables included. */
+#define dictSlots(d) ((d)->ht[0].size+(d)->ht[1].size)
+#define dictSize(d) ((d)->ht[0].used+(d)->ht[1].used)
+/* Despite the parameter name, the argument is a dict pointer. */
+#define dictIsRehashing(ht) ((ht)->rehashidx != -1)
+
+/* API
+ *
+ * Functions returning int report DICT_OK / DICT_ERR unless noted otherwise.
+ * dictReplace() returns 1 if the key was added and 0 if an existing value
+ * was replaced. */
+dict *dictCreate(dictType *type, void *privDataPtr);
+int dictExpand(dict *d, unsigned long size);
+int dictAdd(dict *d, void *key, void *val);
+int dictReplace(dict *d, void *key, void *val);
+int dictDelete(dict *d, const void *key);
+int dictDeleteNoFree(dict *d, const void *key);
+void dictRelease(dict *d);
+dictEntry * dictFind(dict *d, const void *key);
+void *dictFetchValue(dict *d, const void *key);
+int dictResize(dict *d);
+dictIterator *dictGetIterator(dict *d);
+dictEntry *dictNext(dictIterator *iter);
+void dictReleaseIterator(dictIterator *iter);
+dictEntry *dictGetRandomKey(dict *d);
+void dictPrintStats(dict *d);
+unsigned int dictGenHashFunction(const unsigned char *buf, int len);
+void dictEmpty(dict *d);
+/* The two functions below toggle a flag shared by all the dictionaries. */
+void dictEnableResize(void);
+void dictDisableResize(void);
+/* Both return a count of rehashing work done rather than DICT_OK/DICT_ERR
+ * -- NOTE(review): confirm the exact meaning in dict.c. */
+int dictRehash(dict *d, int n);
+int dictRehashMilliseconds(dict *d, int ms);
+
+/* Hash table types */
+extern dictType dictTypeHeapStringCopyKey;
+extern dictType dictTypeHeapStrings;
+extern dictType dictTypeHeapStringCopyKeyValue;
+
+#endif /* __DICT_H */
diff --git a/src/fmacros.h b/src/fmacros.h
new file mode 100644
index 000000000..38f46482a
--- /dev/null
+++ b/src/fmacros.h
@@ -0,0 +1,15 @@
+/* Feature test macros. They only take effect when defined before the first
+ * system header is included, so this file has to be included first. */
+#ifndef _REDIS_FMACRO_H
+#define _REDIS_FMACRO_H
+
+/* Ask the C library to expose BSD-derived declarations. */
+#define _BSD_SOURCE
+
+/* On Linux request the X/Open 7 (POSIX.1-2008) interfaces; elsewhere just
+ * define the macro without a specific revision. */
+#ifdef __linux__
+#define _XOPEN_SOURCE 700
+#else
+#define _XOPEN_SOURCE
+#endif
+
+/* Large file support: 64 bit file offsets even on 32 bit systems. */
+#define _LARGEFILE_SOURCE
+#define _FILE_OFFSET_BITS 64
+
+#endif
diff --git a/src/linenoise.c b/src/linenoise.c
new file mode 100644
index 000000000..0c04d03fb
--- /dev/null
+++ b/src/linenoise.c
@@ -0,0 +1,433 @@
+/* linenoise.c -- guerrilla line editing library against the idea that a
+ * line editing lib needs to be 20,000 lines of C code.
+ *
+ * You can find the latest source code at:
+ *
+ * http://github.com/antirez/linenoise
+ *
+ * Makes a number of crazy assumptions that happen to be true on 99.9999% of
+ * the 2010 UNIX computers around.
+ *
+ * Copyright (c) 2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * References:
+ * - http://invisible-island.net/xterm/ctlseqs/ctlseqs.html
+ * - http://www.3waylabs.com/nw/WWW/products/wizcon/vt220.html
+ *
+ * Todo list:
+ * - Switch to gets() if $TERM is something we can't support.
+ * - Filter bogus Ctrl+<char> combinations.
+ * - Win32 support
+ *
+ * Bloat:
+ * - Completion?
+ * - History search like Ctrl+r in readline?
+ *
+ * List of escape sequences used by this program: we do everything with
+ * just three sequences. Being this cheap may cause some flickering on
+ * slow terminals, but the fewer sequences we use, the more terminals we
+ * are compatible with.
+ *
+ * CHA (Cursor Horizontal Absolute)
+ * Sequence: ESC [ n G
+ * Effect: moves cursor to column n
+ *
+ * EL (Erase Line)
+ * Sequence: ESC [ n K
+ * Effect: if n is 0 or missing, clear from cursor to end of line
+ * Effect: if n is 1, clear from beginning of line to cursor
+ * Effect: if n is 2, clear entire line
+ *
+ * CUF (CUrsor Forward)
+ * Sequence: ESC [ n C
+ * Effect: moves cursor forward of n chars
+ *
+ */
+
+#include "fmacros.h"
+#include <termios.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+
+/* Size of the on-stack line buffer used by linenoise(). */
+#define LINENOISE_MAX_LINE 4096
+/* NULL-terminated list of $TERM values, matched case-insensitively, for
+ * which line editing is skipped in favour of a plain fgets() read. */
+static char *unsupported_term[] = {"dumb","cons25",NULL};
+
+static struct termios orig_termios; /* in order to restore at exit */
+static int rawmode = 0; /* for atexit() function to check if restore is needed*/
+static int atexit_registered = 0; /* register atexit just 1 time */
+static int history_max_len = 100; /* capacity of the 'history' array */
+static int history_len = 0; /* number of entries currently stored */
+/* History entries, oldest first, each one a heap allocated string.
+ * NOTE(review): not declared static, so the symbol has external linkage. */
+char **history = NULL;
+
+static void linenoiseAtExit(void);
+int linenoiseHistoryAdd(const char *line);
+
+/* Return 1 if $TERM names a terminal listed in unsupported_term
+ * (case-insensitive match), 0 otherwise or when $TERM is not set. */
+static int isUnsupportedTerm(void) {
+    char **entry;
+    char *term = getenv("TERM");
+
+    if (term != NULL) {
+        for (entry = unsupported_term; *entry != NULL; entry++) {
+            if (strcasecmp(term,*entry) == 0) return 1;
+        }
+    }
+    return 0;
+}
+
+/* Release every stored history line and the history array itself.
+ * The global pointer and counters are left untouched. */
+static void freeHistory(void) {
+    int j = 0;
+
+    if (history == NULL) return;
+    while (j < history_len) free(history[j++]);
+    free(history);
+}
+
+/* Put the terminal 'fd' in raw mode, saving the current settings in
+ * orig_termios so that disableRawMode() can restore them. The first call
+ * also registers linenoiseAtExit() with atexit().
+ * Returns 0 on success; on any failure returns -1 with errno = ENOTTY. */
+static int enableRawMode(int fd) {
+ struct termios raw;
+
+ /* NOTE(review): the tty check is done on STDIN_FILENO, not on 'fd'. */
+ if (!isatty(STDIN_FILENO)) goto fatal;
+ if (!atexit_registered) {
+ atexit(linenoiseAtExit);
+ atexit_registered = 1;
+ }
+ if (tcgetattr(fd,&orig_termios) == -1) goto fatal;
+
+ raw = orig_termios; /* modify the original mode */
+ /* input modes: no break, no CR to NL, no parity check, no strip char,
+ * no start/stop output control. */
+ raw.c_iflag &= ~(BRKINT | ICRNL | INPCK | ISTRIP | IXON);
+ /* output modes - disable post processing */
+ raw.c_oflag &= ~(OPOST);
+ /* control modes - set 8 bit chars */
+ raw.c_cflag |= (CS8);
+ /* local modes - echoing off, canonical off, no extended functions,
+ * no signal chars (^Z,^C) */
+ raw.c_lflag &= ~(ECHO | ICANON | IEXTEN | ISIG);
+ /* control chars - set return condition: min number of bytes and timer.
+ * We want read to return every single byte, without timeout. */
+ raw.c_cc[VMIN] = 1; raw.c_cc[VTIME] = 0; /* 1 byte, no timer */
+
+ /* put terminal in raw mode after flushing */
+ if (tcsetattr(fd,TCSAFLUSH,&raw) < 0) goto fatal;
+ rawmode = 1;
+ return 0;
+
+fatal:
+ /* Every failure is reported the same way, whatever its actual cause. */
+ errno = ENOTTY;
+ return -1;
+}
+
+/* Restore the terminal settings saved by enableRawMode(), if raw mode is
+ * active. */
+static void disableRawMode(int fd) {
+ /* A failure is not reported as it's too late to do anything about it:
+ * 'rawmode' is simply left set, so a later call will try again. */
+ if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1)
+ rawmode = 0;
+}
+
+/* At exit we'll try to fix the terminal to the initial conditions, and
+ * release the memory used by the history. Registered with atexit() by
+ * enableRawMode(). */
+static void linenoiseAtExit(void) {
+ disableRawMode(STDIN_FILENO);
+ freeHistory();
+}
+
+/* Return the width of the terminal in columns, as reported for file
+ * descriptor 1 (standard output). 80 is assumed when the size cannot be
+ * obtained, or when the terminal reports zero columns: a zero width would
+ * break the line trimming arithmetic in refreshLine(). */
+static int getColumns(void) {
+    struct winsize ws;
+
+    if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) return 80;
+    return ws.ws_col;
+}
+
+/* Redraw the whole edited line on the terminal 'fd'.
+ *
+ * prompt: text shown before the edited buffer.
+ * buf/len: the edited buffer and its length.
+ * pos: cursor position inside buf (0 <= pos <= len).
+ * cols: terminal width in columns.
+ *
+ * When prompt plus buffer do not fit in 'cols' columns, only the window of
+ * the buffer that keeps the cursor visible is written. A failed write
+ * aborts the refresh silently. */
+static void refreshLine(int fd, const char *prompt, char *buf, size_t len, size_t pos, size_t cols) {
+    char seq[64];
+    size_t plen = strlen(prompt);
+
+    /* Scroll the buffer to the left until the cursor fits on the line.
+     * The pos > 0 guard matters when the prompt alone is as wide as the
+     * terminal: without it pos and len (both unsigned) would wrap around
+     * and buf would be advanced past the end of the buffer. */
+    while (pos > 0 && (plen+pos) >= cols) {
+        buf++;
+        len--;
+        pos--;
+    }
+    /* Trim what does not fit on the right, again without underflowing. */
+    while (len > 0 && plen+len > cols) {
+        len--;
+    }
+
+    /* Cursor to left edge */
+    snprintf(seq,64,"\x1b[0G");
+    if (write(fd,seq,strlen(seq)) == -1) return;
+    /* Write the prompt and the current buffer content */
+    if (write(fd,prompt,strlen(prompt)) == -1) return;
+    if (write(fd,buf,len) == -1) return;
+    /* Erase to right */
+    snprintf(seq,64,"\x1b[0K");
+    if (write(fd,seq,strlen(seq)) == -1) return;
+    /* Move cursor to original position. */
+    snprintf(seq,64,"\x1b[0G\x1b[%dC", (int)(pos+plen));
+    if (write(fd,seq,strlen(seq)) == -1) return;
+}
+
+/* Core of the line editor: show 'prompt' on the terminal 'fd' (already in
+ * raw mode) and let the user edit a line, stored NUL-terminated in 'buf'
+ * (at most buflen-1 characters).
+ *
+ * Returns the length of the line when the user hits enter, or ctrl-d on a
+ * non empty line, and also when read() fails or reports end of file.
+ * Returns -1 on ctrl-d with an empty line, on ctrl-c (errno is set to
+ * EAGAIN), or when writing to the terminal fails.
+ *
+ * While editing, the last history slot holds the line being typed, so that
+ * the user can browse older entries and come back to it. That placeholder
+ * is removed and freed again before returning on enter / ctrl-d. */
+static int linenoisePrompt(int fd, char *buf, size_t buflen, const char *prompt) {
+    size_t plen = strlen(prompt);
+    size_t pos = 0;
+    size_t len = 0;
+    size_t cols = getColumns();
+    int history_index = 0;
+    int placeholder; /* true if the placeholder entry was really added */
+
+    buf[0] = '\0';
+    buflen--; /* Make sure there is always space for the nulterm */
+
+    /* The latest history entry is always our current buffer, that
+     * initially is just an empty string. */
+    placeholder = linenoiseHistoryAdd("");
+
+    if (write(fd,prompt,plen) == -1) return -1;
+    while(1) {
+        char c;
+        int nread;
+        char seq[2];
+
+        nread = read(fd,&c,1);
+        if (nread <= 0) return len;
+        switch(c) {
+        case 13:    /* enter */
+            /* Drop the placeholder and release its string: the slot is
+             * overwritten by the next linenoiseHistoryAdd(), so not
+             * freeing it here would leak it. Nothing to drop if the
+             * placeholder could not be added. */
+            if (placeholder) {
+                history_len--;
+                free(history[history_len]);
+            }
+            return len;
+        case 4:     /* ctrl-d */
+            if (placeholder) {
+                history_len--;
+                free(history[history_len]);
+            }
+            return (len == 0) ? -1 : (int)len;
+        case 3:     /* ctrl-c */
+            errno = EAGAIN;
+            return -1;
+        case 127:   /* backspace */
+        case 8:     /* ctrl-h */
+            if (pos > 0 && len > 0) {
+                memmove(buf+pos-1,buf+pos,len-pos);
+                pos--;
+                len--;
+                buf[len] = '\0';
+                refreshLine(fd,prompt,buf,len,pos,cols);
+            }
+            break;
+        case 20:    /* ctrl-t */
+            /* Swap the character under the cursor with the previous one. */
+            if (pos > 0 && pos < len) {
+                int aux = buf[pos-1];
+                buf[pos-1] = buf[pos];
+                buf[pos] = aux;
+                if (pos != len-1) pos++;
+                refreshLine(fd,prompt,buf,len,pos,cols);
+            }
+            break;
+        case 2:     /* ctrl-b */
+            goto left_arrow;
+        case 6:     /* ctrl-f */
+            goto right_arrow;
+        case 16:    /* ctrl-p */
+            seq[1] = 65;
+            goto up_down_arrow;
+        case 14:    /* ctrl-n */
+            seq[1] = 66;
+            goto up_down_arrow;
+            break;
+        case 27:    /* escape sequence */
+            if (read(fd,seq,2) == -1) break;
+            if (seq[0] == 91 && seq[1] == 68) {
+left_arrow:
+                /* left arrow */
+                if (pos > 0) {
+                    pos--;
+                    refreshLine(fd,prompt,buf,len,pos,cols);
+                }
+            } else if (seq[0] == 91 && seq[1] == 67) {
+right_arrow:
+                /* right arrow */
+                if (pos != len) {
+                    pos++;
+                    refreshLine(fd,prompt,buf,len,pos,cols);
+                }
+            } else if (seq[0] == 91 && (seq[1] == 65 || seq[1] == 66)) {
+up_down_arrow:
+                /* up and down arrow: history */
+                if (history_len > 1) {
+                    /* Update the current history entry before
+                     * overwriting it with the next one. */
+                    free(history[history_len-1-history_index]);
+                    history[history_len-1-history_index] = strdup(buf);
+                    /* Show the new entry */
+                    history_index += (seq[1] == 65) ? 1 : -1;
+                    if (history_index < 0) {
+                        history_index = 0;
+                        break;
+                    } else if (history_index >= history_len) {
+                        history_index = history_len-1;
+                        break;
+                    }
+                    strncpy(buf,history[history_len-1-history_index],buflen);
+                    buf[buflen] = '\0';
+                    len = pos = strlen(buf);
+                    refreshLine(fd,prompt,buf,len,pos,cols);
+                }
+            }
+            break;
+        default:
+            /* Any other byte is inserted at the cursor position, as long
+             * as there is room left in the buffer. */
+            if (len < buflen) {
+                if (len == pos) {
+                    buf[pos] = c;
+                    pos++;
+                    len++;
+                    buf[len] = '\0';
+                    if (plen+len < cols) {
+                        /* Avoid a full update of the line in the
+                         * trivial case. */
+                        if (write(fd,&c,1) == -1) return -1;
+                    } else {
+                        refreshLine(fd,prompt,buf,len,pos,cols);
+                    }
+                } else {
+                    memmove(buf+pos+1,buf+pos,len-pos);
+                    buf[pos] = c;
+                    len++;
+                    pos++;
+                    buf[len] = '\0';
+                    refreshLine(fd,prompt,buf,len,pos,cols);
+                }
+            }
+            break;
+        case 21:    /* Ctrl+u, delete the whole line. */
+            buf[0] = '\0';
+            pos = len = 0;
+            refreshLine(fd,prompt,buf,len,pos,cols);
+            break;
+        case 11:    /* Ctrl+k, delete from current to end of line. */
+            buf[pos] = '\0';
+            len = pos;
+            refreshLine(fd,prompt,buf,len,pos,cols);
+            break;
+        case 1:     /* Ctrl+a, go to the start of the line */
+            pos = 0;
+            refreshLine(fd,prompt,buf,len,pos,cols);
+            break;
+        case 5:     /* ctrl+e, go to the end of the line */
+            pos = len;
+            refreshLine(fd,prompt,buf,len,pos,cols);
+            break;
+        }
+    }
+    return len;
+}
+
+/* Read a line into 'buf' (at most buflen bytes, terminator included).
+ *
+ * When standard input is a terminal the line is edited interactively in
+ * raw mode; otherwise it is read with fgets() and a trailing newline is
+ * stripped. Returns the length of the line, or -1 on error or end of
+ * file (errno is EINVAL when buflen is zero). */
+static int linenoiseRaw(char *buf, size_t buflen, const char *prompt) {
+    int fd = STDIN_FILENO;
+    int n;
+
+    if (buflen == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+    if (isatty(STDIN_FILENO)) {
+        if (enableRawMode(fd) == -1) return -1;
+        n = linenoisePrompt(fd, buf, buflen, prompt);
+        disableRawMode(fd);
+        printf("\n");
+        return n;
+    }
+
+    /* Not a tty: plain buffered read, no prompt. */
+    if (fgets(buf, buflen, stdin) == NULL) return -1;
+    n = strlen(buf);
+    if (n != 0 && buf[n-1] == '\n') {
+        n--;
+        buf[n] = '\0';
+    }
+    return n;
+}
+
+/* Main entry point: show 'prompt' and return the line typed by the user
+ * as a newly allocated string that the caller must free(), or NULL on
+ * error / end of input.
+ *
+ * Terminals listed as unsupported get a plain fgets() based read, with
+ * trailing newline and carriage return characters removed. */
+char *linenoise(const char *prompt) {
+    char buf[LINENOISE_MAX_LINE];
+    size_t end;
+
+    if (!isUnsupportedTerm()) {
+        if (linenoiseRaw(buf,LINENOISE_MAX_LINE,prompt) == -1) return NULL;
+        return strdup(buf);
+    }
+
+    printf("%s",prompt);
+    fflush(stdout);
+    if (fgets(buf,LINENOISE_MAX_LINE,stdin) == NULL) return NULL;
+    for (end = strlen(buf); end > 0; end--) {
+        if (buf[end-1] != '\n' && buf[end-1] != '\r') break;
+        buf[end-1] = '\0';
+    }
+    return strdup(buf);
+}
+
+/* Append a copy of 'line' to the history. When the history is full the
+ * oldest entry is discarded to make room.
+ * Returns 1 on success, 0 if history is disabled or memory is exhausted.
+ *
+ * Using a circular buffer is smarter, but a bit more complex to handle. */
+int linenoiseHistoryAdd(const char *line) {
+    char *linecopy;
+
+    if (history_max_len == 0) return 0;
+    if (history == NULL) {
+        /* Lazily allocate the array the first time a line is added. */
+        history = malloc(sizeof(char*)*history_max_len);
+        if (history == NULL) return 0;
+        memset(history,0,(sizeof(char*)*history_max_len));
+    }
+    linecopy = strdup(line);
+    if (!linecopy) return 0;
+    if (history_len == history_max_len) {
+        /* Release the oldest line before shifting the others over it,
+         * otherwise its memory is leaked. */
+        free(history[0]);
+        memmove(history,history+1,sizeof(char*)*(history_max_len-1));
+        history_len--;
+    }
+    history[history_len] = linecopy;
+    history_len++;
+    return 1;
+}
+
+/* Change the maximum number of lines kept in the history to 'len'.
+ * When the history already holds more than 'len' lines only the most
+ * recent 'len' are retained, and the older ones are freed.
+ * Returns 1 on success, 0 if 'len' is less than 1 or memory is exhausted
+ * (in which case nothing is changed). */
+int linenoiseHistorySetMaxLen(int len) {
+    char **new;
+
+    if (len < 1) return 0;
+    if (history) {
+        int tocopy = history_len;
+
+        new = malloc(sizeof(char*)*len);
+        if (new == NULL) return 0;
+
+        /* If we can't keep everything, free the oldest lines. */
+        if (len < tocopy) {
+            int j;
+
+            for (j = 0; j < tocopy-len; j++) free(history[j]);
+            tocopy = len;
+        }
+        /* Unused slots are kept NULL, as in a freshly allocated history. */
+        memset(new,0,sizeof(char*)*len);
+        /* The valid entries are the first history_len slots of the old
+         * array: copy the last 'tocopy' of them. */
+        memcpy(new,history+(history_len-tocopy), sizeof(char*)*tocopy);
+        free(history);
+        history = new;
+    }
+    history_max_len = len;
+    if (history_len > history_max_len)
+        history_len = history_max_len;
+    return 1;
+}
diff --git a/src/linenoise.h b/src/linenoise.h
new file mode 100644
index 000000000..ff45e2c47
--- /dev/null
+++ b/src/linenoise.h
@@ -0,0 +1,41 @@
+/* linenoise.h -- guerrilla line editing library against the idea that a
+ * line editing lib needs to be 20,000 lines of C code.
+ *
+ * See linenoise.c for more information.
+ *
+ * Copyright (c) 2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __LINENOISE_H
+#define __LINENOISE_H
+
+/* Show 'prompt' and return the edited line as a malloc()ed string that the
+ * caller must free(), or NULL on error / end of input. */
+char *linenoise(const char *prompt);
+/* Append a copy of 'line' to the history; returns 1 on success, 0 on
+ * failure. The parameter is const to match the definition in linenoise.c,
+ * which does not modify the string. */
+int linenoiseHistoryAdd(const char *line);
+/* Set the maximum number of history lines; returns 1 on success, 0 on
+ * failure. */
+int linenoiseHistorySetMaxLen(int len);
+
+#endif /* __LINENOISE_H */
diff --git a/src/lzf.h b/src/lzf.h
new file mode 100644
index 000000000..919b6e6be
--- /dev/null
+++ b/src/lzf.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef LZF_H
+#define LZF_H
+
+/***********************************************************************
+**
+** lzf -- an extremely fast/free compression/decompression-method
+** http://liblzf.plan9.de/
+**
+** This algorithm is believed to be patent-free.
+**
+***********************************************************************/
+
+#define LZF_VERSION 0x0105 /* 1.5, API version */
+
+/*
+ * Compress in_len bytes stored at the memory block starting at
+ * in_data and write the result to out_data, up to a maximum length
+ * of out_len bytes.
+ *
+ * If the output buffer is not large enough or any error occurs, return 0;
+ * otherwise return the number of bytes used, which might be considerably
+ * more than in_len (but less than 104% of the original size), so it
+ * makes sense to always use out_len == in_len - 1, to ensure _some_
+ * compression, and store the data uncompressed otherwise (with a flag, of
+ * course).
+ *
+ * lzf_compress might use different algorithms on different systems and
+ * even different runs, thus might result in different compressed strings
+ * depending on the phase of the moon or similar factors. However, all
+ * these strings are architecture-independent and will result in the
+ * original data when decompressed using lzf_decompress.
+ *
+ * The buffers must not be overlapping.
+ *
+ * If the option LZF_STATE_ARG is enabled, an extra argument must be
+ * supplied which is not reflected in this header file. Refer to lzfP.h
+ * and lzf_c.c.
+ *
+ */
+unsigned int
+lzf_compress (const void *const in_data, unsigned int in_len,
+ void *out_data, unsigned int out_len);
+
+/*
+ * Decompress data compressed with some version of the lzf_compress
+ * function and stored at location in_data and length in_len. The result
+ * will be stored at out_data up to a maximum of out_len characters.
+ *
+ * If the output buffer is not large enough to hold the decompressed
+ * data, a 0 is returned and errno is set to E2BIG. Otherwise the number
+ * of decompressed bytes (i.e. the original length of the data) is
+ * returned.
+ *
+ * If an error in the compressed data is detected, a zero is returned and
+ * errno is set to EINVAL.
+ *
+ * This function is very fast, about as fast as a copying loop.
+ */
+unsigned int
+lzf_decompress (const void *const in_data, unsigned int in_len,
+ void *out_data, unsigned int out_len);
+
+#endif
+
diff --git a/src/lzfP.h b/src/lzfP.h
new file mode 100644
index 000000000..d533f1829
--- /dev/null
+++ b/src/lzfP.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2000-2007 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#ifndef LZFP_h
+#define LZFP_h
+
+#define STANDALONE 1 /* at the moment, this is ok. */
+
+#ifndef STANDALONE
+# include "lzf.h"
+#endif
+
+/*
+ * Size of hashtable is (1 << HLOG) * sizeof (char *)
+ * decompression is independent of the hash table size
+ * the difference between 15 and 14 is very small
+ * for small blocks (and 14 is usually a bit faster).
+ * For a low-memory/faster configuration, use HLOG == 13;
+ * For best compression, use 15 or 16 (or more, up to 23).
+ */
+#ifndef HLOG
+# define HLOG 16
+#endif
+
+/*
+ * Sacrifice very little compression quality in favour of compression speed.
+ * This gives almost the same compression as the default code, and is
+ * (very roughly) 15% faster. This is the preferred mode of operation.
+ */
+#ifndef VERY_FAST
+# define VERY_FAST 1
+#endif
+
+/*
+ * Sacrifice some more compression quality in favour of compression speed.
+ * (roughly 1-2% worse compression for large blocks and
+ * 9-10% for small, redundant, blocks and >>20% better speed in both cases)
+ * In short: when in need for speed, enable this for binary data,
+ * possibly disable this for text data.
+ */
+#ifndef ULTRA_FAST
+# define ULTRA_FAST 0
+#endif
+
+/*
+ * Unconditionally aligning does not cost very much, so do it if unsure
+ */
+#ifndef STRICT_ALIGN
+# define STRICT_ALIGN !(defined(__i386) || defined (__amd64))
+#endif
+
+/*
+ * You may choose to pre-set the hash table (might be faster on some
+ * modern cpus and large (>>64k) blocks, and also makes compression
+ * deterministic/repeatable when the configuration otherwise is the same).
+ */
+#ifndef INIT_HTAB
+# define INIT_HTAB 0
+#endif
+
+/*
+ * Avoid assigning values to the errno variable? For some embedding purposes
+ * (the Linux kernel, for example) this is necessary. NOTE: this breaks
+ * the documentation in lzf.h.
+ */
+#ifndef AVOID_ERRNO
+# define AVOID_ERRNO 0
+#endif
+
+/*
+ * Whether to pass the LZF_STATE variable as argument, or allocate it
+ * on the stack. For small-stack environments, define this to 1.
+ * NOTE: this breaks the prototype in lzf.h.
+ */
+#ifndef LZF_STATE_ARG
+# define LZF_STATE_ARG 0
+#endif
+
+/*
+ * Whether to add extra checks for input validity in lzf_decompress
+ * and return EINVAL if the input stream has been corrupted. This
+ * only shields against overflowing the input buffer and will not
+ * detect most corrupted streams.
+ * This check is not normally noticeable on modern hardware
+ * (<1% slowdown), but might slow down older cpus considerably.
+ */
+#ifndef CHECK_INPUT
+# define CHECK_INPUT 1
+#endif
+
+/*****************************************************************************/
+/* nothing should be changed below */
+
+typedef unsigned char u8;
+
+typedef const u8 *LZF_STATE[1 << (HLOG)];
+
+#if !STRICT_ALIGN
+/* for unaligned accesses we need a 16 bit datatype. */
+# include <limits.h>
+# if USHRT_MAX == 65535
+ typedef unsigned short u16;
+# elif UINT_MAX == 65535
+ typedef unsigned int u16;
+# else
+# undef STRICT_ALIGN
+# define STRICT_ALIGN 1
+# endif
+#endif
+
+#if ULTRA_FAST
+# if defined(VERY_FAST)
+# undef VERY_FAST
+# endif
+#endif
+
+#if INIT_HTAB
+# ifdef __cplusplus
+# include <cstring>
+# else
+# include <string.h>
+# endif
+#endif
+
+#endif
+
diff --git a/src/lzf_c.c b/src/lzf_c.c
new file mode 100644
index 000000000..99dab091c
--- /dev/null
+++ b/src/lzf_c.c
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#include "lzfP.h"
+
+#define HSIZE (1 << (HLOG))
+
+/*
+ * don't play with this unless you benchmark!
+ * decompression is not dependent on the hash function
+ * the hashing function might seem strange, just believe me
+ * it works ;)
+ */
+#ifndef FRST
+# define FRST(p) (((p[0]) << 8) | p[1])
+# define NEXT(v,p) (((v) << 8) | p[2])
+# if ULTRA_FAST
+# define IDX(h) ((( h >> (3*8 - HLOG)) - h ) & (HSIZE - 1))
+# elif VERY_FAST
+# define IDX(h) ((( h >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
+# else
+# define IDX(h) ((((h ^ (h << 5)) >> (3*8 - HLOG)) - h*5) & (HSIZE - 1))
+# endif
+#endif
+/*
+ * IDX works because it is very similar to a multiplicative hash, e.g.
+ * ((h * 57321 >> (3*8 - HLOG)) & (HSIZE - 1))
+ * the latter is also quite fast on newer CPUs, and compresses similarly.
+ *
+ * the next one is also quite good, albeit slow ;)
+ * (int)(cos(h & 0xffffff) * 1e6)
+ */
+
+#if 0
+/* original lzv-like hash function, much worse and thus slower */
+# define FRST(p) (p[0] << 5) ^ p[1]
+# define NEXT(v,p) ((v) << 5) ^ p[2]
+# define IDX(h) ((h) & (HSIZE - 1))
+#endif
+
+#define MAX_LIT (1 << 5)
+#define MAX_OFF (1 << 13)
+#define MAX_REF ((1 << 8) + (1 << 3))
+
+#if __GNUC__ >= 3
+# define expect(expr,value) __builtin_expect ((expr),(value))
+# define inline inline
+#else
+# define expect(expr,value) (expr)
+# define inline static
+#endif
+
+#define expect_false(expr) expect ((expr) != 0, 0)
+#define expect_true(expr) expect ((expr) != 0, 1)
+
+/*
+ * compressed format
+ *
+ * 000LLLLL <L+1> ; literal
+ * LLLooooo oooooooo ; backref L
+ * 111ooooo LLLLLLLL oooooooo ; backref L+7
+ *
+ */
+
+/*
+ * Compress in_len bytes at in_data into out_data, writing at most out_len
+ * bytes. Returns the number of bytes written, or 0 when in_len or out_len
+ * is zero or the output buffer turns out to be too small.
+ *
+ * The output is a sequence of literal runs and back references. The control
+ * byte of a literal run is reserved first ("start run") and filled in once
+ * the run length is known ("stop run"); 'lit' counts the literals of the
+ * run that is currently open.
+ */
+unsigned int
+lzf_compress (const void *const in_data, unsigned int in_len,
+ void *out_data, unsigned int out_len
+#if LZF_STATE_ARG
+ , LZF_STATE htab
+#endif
+ )
+{
+#if !LZF_STATE_ARG
+ LZF_STATE htab;
+#endif
+ const u8 **hslot;
+ const u8 *ip = (const u8 *)in_data;
+ u8 *op = (u8 *)out_data;
+ const u8 *in_end = ip + in_len;
+ u8 *out_end = op + out_len;
+ const u8 *ref;
+
+ /* off requires a type wide enough to hold a general pointer difference.
+ * ISO C doesn't have that (size_t might not be enough and ptrdiff_t only
+ * works for differences within a single object). We also assume that
+ * no bit pattern traps. Since the only platform that is both non-POSIX
+ * and fails to support both assumptions is windows 64 bit, we make a
+ * special workaround for it.
+ */
+#if defined (WIN32) && defined (_M_X64)
+ unsigned _int64 off; /* workaround for missing POSIX compliance */
+#else
+ unsigned long off;
+#endif
+ unsigned int hval;
+ int lit;
+
+ if (!in_len || !out_len)
+ return 0;
+
+#if INIT_HTAB
+ memset (htab, 0, sizeof (htab));
+# if 0
+ for (hslot = htab; hslot < htab + HSIZE; hslot++)
+ *hslot++ = ip;
+# endif
+#endif
+
+ lit = 0; op++; /* start run */
+
+ /* hval is a rolling hash: FRST seeds it from two input bytes and each
+ * NEXT shifts in the byte at ip[2]. */
+ hval = FRST (ip);
+ while (ip < in_end - 2)
+ {
+ hval = NEXT (hval, ip);
+ hslot = htab + IDX (hval);
+ /* fetch the previous position stored in this slot, then record ip */
+ ref = *hslot; *hslot = ip;
+
+ /* A candidate is accepted only if it is within MAX_OFF, lies after the
+ * start of the input, leaves more than four input bytes, and its first
+ * three bytes equal those at ip. */
+ if (1
+#if INIT_HTAB
+ && ref < ip /* the next test will actually take care of this, but this is faster */
+#endif
+ && (off = ip - ref - 1) < MAX_OFF
+ && ip + 4 < in_end
+ && ref > (u8 *)in_data
+#if STRICT_ALIGN
+ && ref[0] == ip[0]
+ && ref[1] == ip[1]
+ && ref[2] == ip[2]
+#else
+ && *(u16 *)ref == *(u16 *)ip
+ && ref[2] == ip[2]
+#endif
+ )
+ {
+ /* match found at *ref++ */
+ unsigned int len = 2;
+ unsigned int maxlen = in_end - ip - len;
+ maxlen = maxlen > MAX_REF ? MAX_REF : maxlen;
+
+ op [- lit - 1] = lit - 1; /* stop run */
+ op -= !lit; /* undo run if length is zero */
+
+ if (expect_false (op + 3 + 1 >= out_end))
+ return 0;
+
+ /* Extend the match: an unrolled comparison of up to 16 bytes when
+ * maxlen allows it, then a bounded byte by byte loop. The for (;;)
+ * is only a scope to break out of; it never iterates twice. */
+ for (;;)
+ {
+ if (expect_true (maxlen > 16))
+ {
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ len++; if (ref [len] != ip [len]) break;
+ }
+
+ do
+ len++;
+ while (len < maxlen && ref[len] == ip[len]);
+
+ break;
+ }
+
+ len -= 2; /* len is now #octets - 1 */
+ ip++;
+
+ /* Short form: the length fits in the top three bits of the control
+ * byte. Long form: those bits are all set and len - 7 follows in a
+ * byte of its own. */
+ if (len < 7)
+ {
+ *op++ = (off >> 8) + (len << 5);
+ }
+ else
+ {
+ *op++ = (off >> 8) + ( 7 << 5);
+ *op++ = len - 7;
+ }
+
+ *op++ = off; /* low byte of the offset */
+ lit = 0; op++; /* start run */
+
+ ip += len + 1;
+
+ if (expect_false (ip >= in_end - 2))
+ break;
+
+ /* Re-seed the hash after the match. The fast modes only enter the
+ * last one or two positions of the match into the table, the default
+ * mode enters every position. */
+#if ULTRA_FAST || VERY_FAST
+ --ip;
+# if VERY_FAST && !ULTRA_FAST
+ --ip;
+# endif
+ hval = FRST (ip);
+
+ hval = NEXT (hval, ip);
+ htab[IDX (hval)] = ip;
+ ip++;
+
+# if VERY_FAST && !ULTRA_FAST
+ hval = NEXT (hval, ip);
+ htab[IDX (hval)] = ip;
+ ip++;
+# endif
+#else
+ ip -= len + 1;
+
+ do
+ {
+ hval = NEXT (hval, ip);
+ htab[IDX (hval)] = ip;
+ ip++;
+ }
+ while (len--);
+#endif
+ }
+ else
+ {
+ /* one more literal byte we must copy */
+ if (expect_false (op >= out_end))
+ return 0;
+
+ lit++; *op++ = *ip++;
+
+ /* a run holds at most MAX_LIT literals: close it and open a new one */
+ if (expect_false (lit == MAX_LIT))
+ {
+ op [- lit - 1] = lit - 1; /* stop run */
+ lit = 0; op++; /* start run */
+ }
+ }
+ }
+
+ if (op + 3 > out_end) /* at most 3 bytes can be missing here */
+ return 0;
+
+ /* copy the input bytes the main loop did not consume as literals */
+ while (ip < in_end)
+ {
+ lit++; *op++ = *ip++;
+
+ if (expect_false (lit == MAX_LIT))
+ {
+ op [- lit - 1] = lit - 1; /* stop run */
+ lit = 0; op++; /* start run */
+ }
+ }
+
+ op [- lit - 1] = lit - 1; /* end run */
+ op -= !lit; /* undo run if length is zero */
+
+ return op - (u8 *)out_data;
+}
+
diff --git a/src/lzf_d.c b/src/lzf_d.c
new file mode 100644
index 000000000..e7e48c138
--- /dev/null
+++ b/src/lzf_d.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2000-2007 Marc Alexander Lehmann <schmorp@schmorp.de>
+ *
+ * Redistribution and use in source and binary forms, with or without modifica-
+ * tion, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MER-
+ * CHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+ * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
+ * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTH-
+ * ERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * the GNU General Public License ("GPL") version 2 or any later version,
+ * in which case the provisions of the GPL are applicable instead of
+ * the above. If you wish to allow the use of your version of this file
+ * only under the terms of the GPL and not to allow others to use your
+ * version of this file under the BSD license, indicate your decision
+ * by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL. If you do not delete the
+ * provisions above, a recipient may use your version of this file under
+ * either the BSD or the GPL.
+ */
+
+#include "lzfP.h"
+
+#if AVOID_ERRNO
+# define SET_ERRNO(n)
+#else
+# include <errno.h>
+# define SET_ERRNO(n) errno = (n)
+#endif
+
+/*
+#if (__i386 || __amd64) && __GNUC__ >= 3
+# define lzf_movsb(dst, src, len) \
+ asm ("rep movsb" \
+ : "=D" (dst), "=S" (src), "=c" (len) \
+ : "0" (dst), "1" (src), "2" (len));
+#endif
+*/
+
+/*
+ * Decompress in_len bytes at in_data into out_data, writing at most out_len
+ * bytes, and return the number of bytes written.
+ *
+ * On failure 0 is returned and SET_ERRNO is invoked with E2BIG when the
+ * output would not fit in out_len, or with EINVAL when a back reference
+ * points before out_data or (only with CHECK_INPUT) the input ends in the
+ * middle of a token.
+ */
+unsigned int
+lzf_decompress (const void *const in_data, unsigned int in_len,
+ void *out_data, unsigned int out_len)
+{
+ u8 const *ip = (const u8 *)in_data;
+ u8 *op = (u8 *)out_data;
+ u8 const *const in_end = ip + in_len;
+ u8 *const out_end = op + out_len;
+
+ /* NOTE(review): the first control byte is read before any bounds check,
+ * so in_len == 0 looks unsupported - confirm against the callers. */
+ do
+ {
+ unsigned int ctrl = *ip++;
+
+ if (ctrl < (1 << 5)) /* literal run */
+ {
+ ctrl++; /* the control byte stores the run length minus one */
+
+ if (op + ctrl > out_end)
+ {
+ SET_ERRNO (E2BIG);
+ return 0;
+ }
+
+#if CHECK_INPUT
+ if (ip + ctrl > in_end)
+ {
+ SET_ERRNO (EINVAL);
+ return 0;
+ }
+#endif
+
+#ifdef lzf_movsb
+ lzf_movsb (op, ip, ctrl);
+#else
+ do
+ *op++ = *ip++;
+ while (--ctrl);
+#endif
+ }
+ else /* back reference */
+ {
+ /* top three bits: length field (7 means an extra length byte follows) */
+ unsigned int len = ctrl >> 5;
+
+ /* low five bits: high part of the offset; the low byte is subtracted
+ * further down, once the optional length byte has been consumed */
+ u8 *ref = op - ((ctrl & 0x1f) << 8) - 1;
+
+#if CHECK_INPUT
+ if (ip >= in_end)
+ {
+ SET_ERRNO (EINVAL);
+ return 0;
+ }
+#endif
+ if (len == 7)
+ {
+ len += *ip++;
+#if CHECK_INPUT
+ if (ip >= in_end)
+ {
+ SET_ERRNO (EINVAL);
+ return 0;
+ }
+#endif
+ }
+
+ ref -= *ip++;
+
+ /* the copy below always emits len + 2 bytes */
+ if (op + len + 2 > out_end)
+ {
+ SET_ERRNO (E2BIG);
+ return 0;
+ }
+
+ if (ref < (u8 *)out_data)
+ {
+ SET_ERRNO (EINVAL);
+ return 0;
+ }
+
+#ifdef lzf_movsb
+ len += 2;
+ lzf_movsb (op, ref, len);
+#else
+ /* byte by byte on purpose: ref may be as close as one byte behind op,
+ * so the source can overlap the bytes being written */
+ *op++ = *ref++;
+ *op++ = *ref++;
+
+ do
+ *op++ = *ref++;
+ while (--len);
+#endif
+ }
+ }
+ while (ip < in_end);
+
+ return op - (u8 *)out_data;
+}
+
diff --git a/src/mkreleasehdr.sh b/src/mkreleasehdr.sh
new file mode 100755
index 000000000..30984160e
--- /dev/null
+++ b/src/mkreleasehdr.sh
@@ -0,0 +1,9 @@
+#!/bin/sh
+# Regenerate release.h with the current git SHA1 and the number of lines of
+# uncommitted diff, and touch release.c so that it is recompiled. Nothing is
+# rewritten when release.h already records both values.
+GIT_SHA1=`(git show-ref --head --hash=8 2> /dev/null || echo 00000000) | head -n1`
+# Some wc implementations pad the count with blanks: strip them so the value
+# compares and embeds cleanly.
+GIT_DIRTY=`git diff 2> /dev/null | wc -l | tr -d ' \t'`
+test -f release.h || touch release.h
+# Match the whole quoted value (-F: fixed string, -q: no output) so that a
+# substring hit, e.g. a dirty count of 1 against a recorded 12, is not
+# mistaken for an up to date header.
+grep SHA1 release.h | grep -q -F "\"$GIT_SHA1\"" && \
+grep DIRTY release.h | grep -q -F "\"$GIT_DIRTY\"" && exit 0 # Already up to date
+echo "#define REDIS_GIT_SHA1 \"$GIT_SHA1\"" > release.h
+echo "#define REDIS_GIT_DIRTY \"$GIT_DIRTY\"" >> release.h
+touch release.c # Force recompile of release.c
diff --git a/src/multi.c b/src/multi.c
new file mode 100644
index 000000000..def1dd673
--- /dev/null
+++ b/src/multi.c
@@ -0,0 +1,266 @@
+#include "redis.h"
+
+/* ================================ MULTI/EXEC ============================== */
+
+/* Put the MULTI/EXEC state of a client in its initial condition: no queued
+ * commands and no queue allocated. */
+void initClientMultiState(redisClient *c) {
+    c->mstate.count = 0;
+    c->mstate.commands = NULL;
+}
+
+/* Free the MULTI/EXEC queue of a client: for every queued command drop one
+ * reference on each argument and free the argument vector, then free the
+ * queue array itself. */
+void freeClientMultiState(redisClient *c) {
+    int cmdidx = 0;
+
+    while (cmdidx < c->mstate.count) {
+        multiCmd *queued = &c->mstate.commands[cmdidx];
+        int argidx;
+
+        for (argidx = 0; argidx < queued->argc; argidx++)
+            decrRefCount(queued->argv[argidx]);
+        zfree(queued->argv);
+        cmdidx++;
+    }
+    zfree(c->mstate.commands);
+}
+
+/* Append the command the client is issuing (c->argv / c->argc) to its MULTI
+ * queue. The queue grows by one slot; the new slot gets its own copy of the
+ * argument vector and takes one extra reference on every argument. */
+void queueMultiCommand(redisClient *c, struct redisCommand *cmd) {
+    int slot = c->mstate.count;
+    multiCmd *entry;
+    int i;
+
+    c->mstate.commands = zrealloc(c->mstate.commands,
+            sizeof(multiCmd)*(slot+1));
+    entry = &c->mstate.commands[slot];
+    entry->cmd = cmd;
+    entry->argc = c->argc;
+    entry->argv = zmalloc(sizeof(robj*)*c->argc);
+    for (i = 0; i < c->argc; i++) {
+        entry->argv[i] = c->argv[i];
+        incrRefCount(entry->argv[i]);
+    }
+    c->mstate.count = slot+1;
+}
+
+/* MULTI: flag the client as being inside a transaction and reply OK.
+ * A MULTI issued while the flag is already set gets an error reply. */
+void multiCommand(redisClient *c) {
+    if (!(c->flags & REDIS_MULTI)) {
+        c->flags |= REDIS_MULTI;
+        addReply(c,shared.ok);
+    } else {
+        addReplySds(c,sdsnew("-ERR MULTI calls can not be nested\r\n"));
+    }
+}
+
+/* DISCARD: abort the transaction opened by MULTI. The queued commands are
+ * released, every WATCHed key is unwatched and the client leaves the MULTI
+ * state. Replies with an error if the client is not inside a MULTI. */
+void discardCommand(redisClient *c) {
+    if (!(c->flags & REDIS_MULTI)) {
+        addReplySds(c,sdsnew("-ERR DISCARD without MULTI\r\n"));
+        return;
+    }
+
+    freeClientMultiState(c);
+    initClientMultiState(c);
+    /* unwatchAllKeys() leaves the dirty flag to the caller: clear it together
+     * with REDIS_MULTI, as EXEC does, otherwise a key touched before DISCARD
+     * would make the next, unrelated MULTI/EXEC of this client fail. */
+    c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
+    unwatchAllKeys(c);
+    addReply(c,shared.ok);
+}
+
+/* Send a MULTI command to all the slaves and AOF file. Check the execCommand
+ * implementation for more information. */
+void execCommandReplicateMulti(redisClient *c) {
+    robj *multiobj = createStringObject("MULTI",5);
+    struct redisCommand *multicmd = lookupCommand("multi");
+
+    /* Feed the AOF first, then the slaves, each only when in use */
+    if (server.appendonly)
+        feedAppendOnlyFile(multicmd,c->db->id,&multiobj,1);
+    if (listLength(server.slaves))
+        replicationFeedSlaves(server.slaves,c->db->id,&multiobj,1);
+    decrRefCount(multiobj);
+}
+
+/* EXEC: run every command queued since MULTI. The reply starts with a multi
+ * bulk header carrying the number of queued commands. Without a pending
+ * MULTI an error is returned; if the client is flagged REDIS_DIRTY_CAS the
+ * queue is discarded and a null multi bulk is returned instead. */
+void execCommand(redisClient *c) {
+ int j;
+ robj **orig_argv;
+ int orig_argc;
+
+ if (!(c->flags & REDIS_MULTI)) {
+ addReplySds(c,sdsnew("-ERR EXEC without MULTI\r\n"));
+ return;
+ }
+
+ /* Check if we need to abort the EXEC if some WATCHed key was touched.
+ * A failed EXEC will return a multi bulk nil object. */
+ if (c->flags & REDIS_DIRTY_CAS) {
+ freeClientMultiState(c);
+ initClientMultiState(c);
+ c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
+ unwatchAllKeys(c);
+ addReply(c,shared.nullmultibulk);
+ return;
+ }
+
+ /* Replicate a MULTI request now that we are sure the block is executed.
+ * This way we'll deliver the MULTI/..../EXEC block as a whole and
+ * both the AOF and the replication link will have the same consistency
+ * and atomicity guarantees. */
+ execCommandReplicateMulti(c);
+
+ /* Exec all the queued commands */
+ unwatchAllKeys(c); /* Unwatch ASAP otherwise we'll waste CPU cycles */
+ /* Save the arguments of EXEC itself: c->argv / c->argc are pointed at each
+ * queued command while it runs and restored once the loop is over. */
+ orig_argv = c->argv;
+ orig_argc = c->argc;
+ addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->mstate.count));
+ for (j = 0; j < c->mstate.count; j++) {
+ c->argc = c->mstate.commands[j].argc;
+ c->argv = c->mstate.commands[j].argv;
+ call(c,c->mstate.commands[j].cmd);
+ }
+ c->argv = orig_argv;
+ c->argc = orig_argc;
+ /* Release the queue and leave the MULTI state */
+ freeClientMultiState(c);
+ initClientMultiState(c);
+ c->flags &= ~(REDIS_MULTI|REDIS_DIRTY_CAS);
+ /* Make sure the EXEC command is always replicated / AOF, since we
+ * always send the MULTI command (we can't know beforehand if the
+ * next operations will contain at least a modification to the DB). */
+ server.dirty++;
+}
+
+/* ===================== WATCH (CAS alike for MULTI/EXEC) ===================
+ *
+ * The implementation uses a per-DB hash table mapping keys to list of clients
+ * WATCHing those keys, so that given a key that is going to be modified
+ * we can mark all the associated clients as dirty.
+ *
+ * Also every client contains a list of WATCHed keys so that it's possible to
+ * un-watch such keys when the client is freed or when UNWATCH is called. */
+
+/* In the client->watched_keys list we need to use watchedKey structures
+ * as in order to identify a key in Redis we need both the key name and the
+ * DB */
+typedef struct watchedKey {
+ robj *key; /* Watched key name; watchForKey() takes a reference on it */
+ redisDb *db; /* DB the client had selected when the key was watched */
+} watchedKey;
+
+/* Watch for the specified key on behalf of client 'c', in the DB the client
+ * has currently selected (c->db). Nothing is done if the client already
+ * watches an equal key in that DB. Otherwise the client is appended to the
+ * per-DB key -> clients list (created on first use) and a new watchedKey
+ * entry is appended to c->watched_keys. */
+void watchForKey(redisClient *c, robj *key) {
+ list *clients = NULL;
+ listIter li;
+ listNode *ln;
+ watchedKey *wk;
+
+ /* Check if we are already watching for this key */
+ listRewind(c->watched_keys,&li);
+ while((ln = listNext(&li))) {
+ wk = listNodeValue(ln);
+ if (wk->db == c->db && equalStringObjects(key,wk->key))
+ return; /* Key already watched */
+ }
+ /* This key is not already watched in this DB. Let's add it */
+ clients = dictFetchValue(c->db->watched_keys,key);
+ if (!clients) {
+ clients = listCreate();
+ dictAdd(c->db->watched_keys,key,clients);
+ incrRefCount(key); /* reference taken for the new dict entry */
+ }
+ listAddNodeTail(clients,c);
+ /* Add the new key to the list of keys watched by this client */
+ wk = zmalloc(sizeof(*wk));
+ wk->key = key;
+ wk->db = c->db;
+ incrRefCount(key); /* reference held by the watchedKey entry */
+ listAddNodeTail(c->watched_keys,wk);
+}
+
+/* Unwatch all the keys watched by this client. To clean the EXEC dirty
+ * flag is up to the caller.
+ *
+ * For every watchedKey entry the client is removed from the per-DB
+ * key -> clients list (the dict entry is deleted when that list becomes
+ * empty), then the entry is unlinked from c->watched_keys, its key
+ * reference is dropped and the entry is freed. */
+void unwatchAllKeys(redisClient *c) {
+ listIter li;
+ listNode *ln;
+
+ if (listLength(c->watched_keys) == 0) return;
+ listRewind(c->watched_keys,&li);
+ /* NOTE(review): nodes are deleted while iterating; this relies on
+ * listNext() having already saved the following node - confirm in
+ * adlist.c. */
+ while((ln = listNext(&li))) {
+ list *clients;
+ watchedKey *wk;
+
+ /* Lookup the watched key -> clients list and remove the client
+ * from the list */
+ wk = listNodeValue(ln);
+ clients = dictFetchValue(wk->db->watched_keys, wk->key);
+ redisAssert(clients != NULL);
+ listDelNode(clients,listSearchKey(clients,c));
+ /* Kill the entry at all if this was the only client */
+ if (listLength(clients) == 0)
+ dictDelete(wk->db->watched_keys, wk->key);
+ /* Remove this watched key from the client->watched list */
+ listDelNode(c->watched_keys,ln);
+ decrRefCount(wk->key);
+ zfree(wk);
+ }
+}
+
+/* "Touch" a key, so that if this key is being WATCHed by some client the
+ * next EXEC will fail. Returns immediately when no key is watched in 'db'
+ * or when nobody watches this particular key. */
+void touchWatchedKey(redisDb *db, robj *key) {
+ list *clients;
+ listIter li;
+ listNode *ln;
+
+ if (dictSize(db->watched_keys) == 0) return;
+ clients = dictFetchValue(db->watched_keys, key);
+ if (!clients) return;
+
+ /* Mark all the clients watching this key as REDIS_DIRTY_CAS */
+ /* The flag is checked by EXEC, which then aborts the transaction */
+ listRewind(clients,&li);
+ while((ln = listNext(&li))) {
+ redisClient *c = listNodeValue(ln);
+
+ c->flags |= REDIS_DIRTY_CAS;
+ }
+}
+
+/* On FLUSHDB or FLUSHALL every watched key that exists right before the
+ * flush is about to be deleted, so the clients watching it must be marked
+ * dirty. "dbid" is the DB being flushed, or -1 for FLUSHALL (all the DBs). */
+void touchWatchedKeysOnFlush(int dbid) {
+    listIter clients_it;
+    listNode *client_node;
+
+    /* For every client, check all the watched keys */
+    listRewind(server.clients,&clients_it);
+    while ((client_node = listNext(&clients_it)) != NULL) {
+        redisClient *c = listNodeValue(client_node);
+        listIter keys_it;
+        listNode *key_node;
+
+        listRewind(c->watched_keys,&keys_it);
+        while ((key_node = listNext(&keys_it)) != NULL) {
+            watchedKey *wk = listNodeValue(key_node);
+
+            /* Skip keys watched in a DB that is not being flushed */
+            if (dbid != -1 && wk->db->id != dbid) continue;
+            /* The key exists and is going to be removed: flag the client */
+            if (dictFind(wk->db->dict, wk->key->ptr) != NULL)
+                c->flags |= REDIS_DIRTY_CAS;
+        }
+    }
+}
+
+/* WATCH key [key ...]: watch every key given as argument and reply OK.
+ * Refused with an error when issued inside a MULTI. */
+void watchCommand(redisClient *c) {
+    int argidx = 1; /* argv[0] is the command name */
+
+    if (c->flags & REDIS_MULTI) {
+        addReplySds(c,sdsnew("-ERR WATCH inside MULTI is not allowed\r\n"));
+        return;
+    }
+    while (argidx < c->argc) {
+        watchForKey(c,c->argv[argidx]);
+        argidx++;
+    }
+    addReply(c,shared.ok);
+}
+
+/* UNWATCH: forget every watched key, clear the dirty flag that
+ * unwatchAllKeys() leaves to its caller, and reply OK. */
+void unwatchCommand(redisClient *c) {
+    unwatchAllKeys(c);
+    c->flags = c->flags & ~REDIS_DIRTY_CAS;
+    addReply(c,shared.ok);
+}
diff --git a/src/networking.c b/src/networking.c
new file mode 100644
index 000000000..31844a09f
--- /dev/null
+++ b/src/networking.c
@@ -0,0 +1,589 @@
+#include "redis.h"
+
+#include <sys/uio.h>
+
+/* Dup method installed on the client reply list: the "copy" is the very
+ * same object with its reference count incremented. */
+void *dupClientReplyValue(void *o) {
+    robj *reply = o;
+
+    incrRefCount(reply);
+    return reply;
+}
+
+/* List match method comparing two objects with equalStringObjects(): the
+ * return value is the one of that function. */
+int listMatchObjects(void *a, void *b) {
+ return equalStringObjects(a,b);
+}
+
+/* Allocate and initialize the client structure for the connected socket
+ * "fd" (set in non blocking mode with TCP_NODELAY), link the client into
+ * server.clients and register readQueryFromClient() as handler of the
+ * readable event.
+ *
+ * Returns the new client, or NULL on failure. When the readable event can't
+ * be created the client is released with freeClient(), which also closes
+ * "fd". */
+redisClient *createClient(int fd) {
+    redisClient *c = zmalloc(sizeof(*c));
+
+    anetNonBlock(NULL,fd);
+    anetTcpNoDelay(NULL,fd);
+    if (!c) return NULL;
+    selectDb(c,0);
+    c->fd = fd;
+    c->querybuf = sdsempty();
+    c->argc = 0;
+    c->argv = NULL;
+    c->bulklen = -1;
+    c->multibulk = 0;
+    c->mbargc = 0;
+    c->mbargv = NULL;
+    c->sentlen = 0;
+    c->flags = 0;
+    c->lastinteraction = time(NULL);
+    c->authenticated = 0;
+    c->replstate = REDIS_REPL_NONE;
+    c->reply = listCreate();
+    listSetFreeMethod(c->reply,decrRefCount);
+    listSetDupMethod(c->reply,dupClientReplyValue);
+    c->blocking_keys = NULL;
+    c->blocking_keys_num = 0;
+    c->io_keys = listCreate();
+    c->watched_keys = listCreate();
+    listSetFreeMethod(c->io_keys,decrRefCount);
+    c->pubsub_channels = dictCreate(&setDictType,NULL);
+    c->pubsub_patterns = listCreate();
+    listSetFreeMethod(c->pubsub_patterns,decrRefCount);
+    listSetMatchMethod(c->pubsub_patterns,listMatchObjects);
+    /* Complete the initialization and link the client *before* installing
+     * the file event: on failure freeClient() asserts that the client is
+     * found in server.clients and releases the MULTI state, so both must
+     * be valid by then. */
+    initClientMultiState(c);
+    listAddNodeTail(server.clients,c);
+    if (aeCreateFileEvent(server.el, c->fd, AE_READABLE,
+        readQueryFromClient, c) == AE_ERR) {
+        freeClient(c);
+        return NULL;
+    }
+    return c;
+}
+
+/* Queue "obj" at the tail of the client reply list.
+ *
+ * When the reply list is empty and the client replication state is
+ * REDIS_REPL_NONE or REDIS_REPL_ONLINE, the writable event handler
+ * (sendReplyToClient) is installed first: if that fails nothing is queued.
+ * What gets queued is the object returned by getDecodedObject(). */
+void addReply(redisClient *c, robj *obj) {
+    if (listLength(c->reply) == 0) {
+        int writable_state = (c->replstate == REDIS_REPL_NONE ||
+                              c->replstate == REDIS_REPL_ONLINE);
+
+        if (writable_state &&
+            aeCreateFileEvent(server.el, c->fd, AE_WRITABLE,
+                sendReplyToClient, c) == AE_ERR)
+        {
+            return;
+        }
+    }
+
+    if (server.vm_enabled && obj->storage != REDIS_VM_MEMORY) {
+        /* The object is not flagged as in memory: queue a duplicate. */
+        robj *copy = dupStringObject(obj);
+
+        copy->refcount = 0; /* getDecodedObject() will increment the refcount */
+        obj = copy;
+    }
+    listAddNodeTail(c->reply,getDecodedObject(obj));
+}
+
+/* Queue the sds string "s" as reply. The string is wrapped into a string
+ * object created here, whose reference is dropped after addReply(). */
+void addReplySds(redisClient *c, sds s) {
+    robj *wrapper = createObject(REDIS_STRING,s);
+
+    addReply(c,wrapper);
+    decrRefCount(wrapper);
+}
+
+/* Queue "d" as a bulk reply ("$<len>\r\n<number>\r\n"), the number being
+ * formatted with "%.17g". */
+void addReplyDouble(redisClient *c, double d) {
+    char digits[128];
+    sds reply;
+
+    snprintf(digits,sizeof(digits),"%.17g",d);
+    reply = sdscatprintf(sdsempty(),"$%lu\r\n%s\r\n",
+        (unsigned long) strlen(digits),digits);
+    addReplySds(c,reply);
+}
+
+/* Queue "ll" as an integer reply (":<number>\r\n"). The values 0 and 1 are
+ * served using the shared.czero and shared.cone objects. */
+void addReplyLongLong(redisClient *c, long long ll) {
+    char buf[128];
+    size_t digits;
+
+    if (ll == 0 || ll == 1) {
+        addReply(c, ll == 0 ? shared.czero : shared.cone);
+        return;
+    }
+    /* Layout of the reply: ':' + decimal digits + "\r\n" */
+    buf[0] = ':';
+    digits = ll2string(buf+1,sizeof(buf)-1,ll);
+    buf[digits+1] = '\r';
+    buf[digits+2] = '\n';
+    addReplySds(c,sdsnewlen(buf,digits+3));
+}
+
+/* Queue "ul" as an integer reply (":<number>\r\n"), using the shared.czero
+ * and shared.cone objects for the values 0 and 1. */
+void addReplyUlong(redisClient *c, unsigned long ul) {
+    char buf[128];
+    size_t len;
+
+    if (ul <= 1) {
+        addReply(c, ul ? shared.cone : shared.czero);
+        return;
+    }
+    len = snprintf(buf,sizeof(buf),":%lu\r\n",ul);
+    addReplySds(c,sdsnewlen(buf,len));
+}
+
+/* Queue the "$<len>\r\n" header of a bulk reply for "obj". <len> is
+ * sdslen() of the payload for REDIS_ENCODING_RAW objects; for the other
+ * objects obj->ptr is read as a long and <len> is the number of characters
+ * of its radix 10 representation, minus sign included. */
+void addReplyBulkLen(redisClient *c, robj *obj) {
+    size_t len, intlen;
+    char buf[128];
+
+    if (obj->encoding == REDIS_ENCODING_RAW) {
+        len = sdslen(obj->ptr);
+    } else {
+        long n = (long)obj->ptr;
+
+        /* Compute how many bytes will take this integer as a radix 10
+         * string: one for the first digit, one more for the minus sign.
+         * The value is never negated, as -LONG_MIN overflows: the division
+         * truncates toward zero, so the loop below counts the remaining
+         * digits of negative numbers as well. */
+        len = (n < 0) ? 2 : 1;
+        while((n = n/10) != 0) {
+            len++;
+        }
+    }
+    buf[0] = '$';
+    intlen = ll2string(buf+1,sizeof(buf)-1,(long long)len);
+    buf[intlen+1] = '\r';
+    buf[intlen+2] = '\n';
+    addReplySds(c,sdsnewlen(buf,intlen+3));
+}
+
+/* Queue "obj" as a complete bulk reply: the length header produced by
+ * addReplyBulkLen(), the object itself and the trailing shared.crlf. */
+void addReplyBulk(redisClient *c, robj *obj) {
+ addReplyBulkLen(c,obj);
+ addReply(c,obj);
+ addReply(c,shared.crlf);
+}
+
+/* Queue the C string "s" as a bulk reply, or shared.nullbulk when "s" is
+ * NULL. In the CONFIG command we need to add vanilla C strings as bulk
+ * replies. */
+void addReplyBulkCString(redisClient *c, char *s) {
+    robj *tmp;
+
+    if (s == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    tmp = createStringObject(s,strlen(s));
+    addReplyBulk(c,tmp);
+    decrRefCount(tmp);
+}
+
+/* Event handler of the listening socket "fd": accept one connection and
+ * create the client for it. When server.maxclients is set and the number
+ * of clients exceeds it, a best effort error is written on the socket and
+ * the client is freed. server.stat_numconnections is incremented only for
+ * the connections that are kept. */
+void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
+    char peer_ip[128];
+    int peer_port, connfd;
+    redisClient *client;
+    REDIS_NOTUSED(el);
+    REDIS_NOTUSED(mask);
+    REDIS_NOTUSED(privdata);
+
+    connfd = anetAccept(server.neterr, fd, peer_ip, &peer_port);
+    if (connfd == AE_ERR) {
+        redisLog(REDIS_VERBOSE,"Accepting client connection: %s", server.neterr);
+        return;
+    }
+    redisLog(REDIS_VERBOSE,"Accepted %s:%d", peer_ip, peer_port);
+
+    client = createClient(connfd);
+    if (client == NULL) {
+        redisLog(REDIS_WARNING,"Error allocating resoures for the client");
+        close(connfd); /* May be already closed, just ignore errors */
+        return;
+    }
+
+    /* If the maxclients directive is set and this is one client more...
+     * close the connection. Note that the client is created before checking
+     * for this condition, since now the socket is already set in
+     * nonblocking mode and we can send an error for free using the
+     * Kernel I/O */
+    if (server.maxclients && listLength(server.clients) > server.maxclients) {
+        char *err = "-ERR max number of clients reached\r\n";
+
+        /* That's a best effort error message, don't check write errors */
+        if (write(client->fd,err,strlen(err)) == -1) {
+            /* Nothing to do, just to avoid the warning... */
+        }
+        freeClient(client);
+        return;
+    }
+    server.stat_numconnections++;
+}
+
+/* Drop the references to all the arguments held in the argv and mbargv
+ * vectors of the client and reset the two counters to zero. The vectors
+ * themselves are not freed here. */
+static void freeClientArgv(redisClient *c) {
+    int i;
+
+    for (i = 0; i < c->argc; i++) decrRefCount(c->argv[i]);
+    c->argc = 0;
+
+    for (i = 0; i < c->mbargc; i++) decrRefCount(c->mbargv[i]);
+    c->mbargc = 0;
+}
+
+/* Release a client: free the query buffer, unblock it if blocked, drop its
+ * watched keys and Pub/Sub subscriptions, remove its file events, close the
+ * socket, unlink it from the server side lists it is registered in
+ * (server.clients, server.io_ready_clients, server.slaves or
+ * server.monitors) and finally free the client structure.
+ *
+ * The client must be linked into server.clients: this is asserted. */
+void freeClient(redisClient *c) {
+ listNode *ln;
+
+ /* Note that if the client we are freeing is blocked into a blocking
+ * call, we have to set querybuf to NULL *before* to call
+ * unblockClientWaitingData() to avoid processInputBuffer() will get
+ * called. Also it is important to remove the file events after
+ * this, because this call adds the READABLE event. */
+ sdsfree(c->querybuf);
+ c->querybuf = NULL;
+ if (c->flags & REDIS_BLOCKED)
+ unblockClientWaitingData(c);
+
+ /* UNWATCH all the keys */
+ unwatchAllKeys(c);
+ listRelease(c->watched_keys);
+ /* Unsubscribe from all the pubsub channels */
+ pubsubUnsubscribeAllChannels(c,0);
+ pubsubUnsubscribeAllPatterns(c,0);
+ dictRelease(c->pubsub_channels);
+ listRelease(c->pubsub_patterns);
+ /* Obvious cleanup */
+ aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
+ aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
+ listRelease(c->reply);
+ freeClientArgv(c);
+ close(c->fd);
+ /* Remove from the list of clients */
+ ln = listSearchKey(server.clients,c);
+ redisAssert(ln != NULL);
+ listDelNode(server.clients,ln);
+ /* Remove from the list of clients that are now ready to be restarted
+ * after waiting for swapped keys */
+ if (c->flags & REDIS_IO_WAIT && listLength(c->io_keys) == 0) {
+ ln = listSearchKey(server.io_ready_clients,c);
+ if (ln) {
+ listDelNode(server.io_ready_clients,ln);
+ server.vm_blocked_clients--;
+ }
+ }
+ /* Remove from the list of clients waiting for swapped keys: loop until
+ * c->io_keys is empty, always passing the value of the first node to
+ * dontWaitForSwappedKey(). */
+ while (server.vm_enabled && listLength(c->io_keys)) {
+ ln = listFirst(c->io_keys);
+ dontWaitForSwappedKey(c,ln->value);
+ }
+ listRelease(c->io_keys);
+ /* Master/slave cleanup */
+ if (c->flags & REDIS_SLAVE) {
+ if (c->replstate == REDIS_REPL_SEND_BULK && c->repldbfd != -1)
+ close(c->repldbfd);
+ /* A client with REDIS_SLAVE set lives either in the monitors list
+ * (REDIS_MONITOR also set) or in the slaves list. */
+ list *l = (c->flags & REDIS_MONITOR) ? server.monitors : server.slaves;
+ ln = listSearchKey(l,c);
+ redisAssert(ln != NULL);
+ listDelNode(l,ln);
+ }
+ /* The client being freed is flagged as master: forget it and move the
+ * server replication state to REDIS_REPL_CONNECT. */
+ if (c->flags & REDIS_MASTER) {
+ server.master = NULL;
+ server.replstate = REDIS_REPL_CONNECT;
+ }
+ /* Release memory */
+ zfree(c->argv);
+ zfree(c->mbargv);
+ freeClientMultiState(c);
+ zfree(c);
+}
+
+#define GLUEREPLY_UP_TO (1024)
+/* Coalesce the leading objects of the reply list into a single string
+ * object, for as long as their total length fits in GLUEREPLY_UP_TO bytes.
+ * The glued objects are removed from the list and the new object is added
+ * on the head. Nothing is done when not even the first object fits. */
+static void glueReplyBuffersIfNeeded(redisClient *c) {
+    char glued[GLUEREPLY_UP_TO];
+    int used = 0;
+    listIter iter;
+    listNode *node;
+    robj *merged;
+
+    listRewind(c->reply,&iter);
+    for (node = listNext(&iter); node != NULL; node = listNext(&iter)) {
+        robj *cur = node->value;
+        int curlen = sdslen(cur->ptr);
+
+        if (used + curlen > GLUEREPLY_UP_TO) {
+            /* This object does not fit: if it is the very first one there
+             * is nothing to glue at all. */
+            if (used == 0) return;
+            break;
+        }
+        /* Copy the payload before deleting the node. */
+        memcpy(glued+used,cur->ptr,curlen);
+        used += curlen;
+        listDelNode(c->reply,node);
+    }
+    /* Add the single glued object in place of the removed ones */
+    merged = createObject(REDIS_STRING,sdsnewlen(glued,used));
+    listAddNodeHead(c->reply,merged);
+}
+
+/* Writable event handler of a client socket: send the queued reply objects
+ * with write(), one object at a time. c->sentlen is the number of bytes of
+ * the object on the head of the list that were already sent.
+ *
+ * The work is delegated to sendReplyToClientWritev() when
+ * server.glueoutputbuf is off, more than REDIS_WRITEV_THRESHOLD objects are
+ * queued and the client is not flagged REDIS_MASTER. For REDIS_MASTER
+ * clients nothing is written: the replies are just consumed.
+ *
+ * The client is freed on write errors different than EAGAIN. The writable
+ * event is removed once the reply list is empty. */
+void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask) {
+ redisClient *c = privdata;
+ int nwritten = 0, totwritten = 0, objlen;
+ robj *o;
+ REDIS_NOTUSED(el);
+ REDIS_NOTUSED(mask);
+
+ /* Use writev() if we have enough buffers to send */
+ if (!server.glueoutputbuf &&
+ listLength(c->reply) > REDIS_WRITEV_THRESHOLD &&
+ !(c->flags & REDIS_MASTER))
+ {
+ sendReplyToClientWritev(el, fd, privdata, mask);
+ return;
+ }
+
+ while(listLength(c->reply)) {
+ if (server.glueoutputbuf && listLength(c->reply) > 1)
+ glueReplyBuffersIfNeeded(c);
+
+ o = listNodeValue(listFirst(c->reply));
+ objlen = sdslen(o->ptr);
+
+ /* Empty objects are simply dropped */
+ if (objlen == 0) {
+ listDelNode(c->reply,listFirst(c->reply));
+ continue;
+ }
+
+ if (c->flags & REDIS_MASTER) {
+ /* Don't reply to a master */
+ nwritten = objlen - c->sentlen;
+ } else {
+ nwritten = write(fd, ((char*)o->ptr)+c->sentlen, objlen - c->sentlen);
+ if (nwritten <= 0) break;
+ }
+ c->sentlen += nwritten;
+ totwritten += nwritten;
+ /* If we fully sent the object on head go to the next one */
+ if (c->sentlen == objlen) {
+ listDelNode(c->reply,listFirst(c->reply));
+ c->sentlen = 0;
+ }
+ /* Note that we avoid to send more than REDIS_MAX_WRITE_PER_EVENT
+ * bytes, in a single threaded server it's a good idea to serve
+ * other clients as well, even if a very large request comes from
+ * super fast link that is always able to accept data (in real world
+ * scenario think about 'KEYS *' against the loopback interface) */
+ if (totwritten > REDIS_MAX_WRITE_PER_EVENT) break;
+ }
+ /* nwritten is -1 only if the last write() of the loop failed */
+ if (nwritten == -1) {
+ if (errno == EAGAIN) {
+ nwritten = 0;
+ } else {
+ redisLog(REDIS_VERBOSE,
+ "Error writing to client: %s", strerror(errno));
+ freeClient(c);
+ return;
+ }
+ }
+ if (totwritten > 0) c->lastinteraction = time(NULL);
+ if (listLength(c->reply) == 0) {
+ c->sentlen = 0;
+ aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
+ }
+}
+
+/* writev() based version of the reply sender. At every iteration up to
+ * REDIS_WRITEV_IOVEC_COUNT queued objects are gathered into iov[], stopping
+ * before the object that would bring the total over
+ * REDIS_MAX_WRITE_PER_EVENT bytes, and sent with a single writev() call.
+ * The fully sent objects are then removed from the reply list, while
+ * c->sentlen records how much of the head object was sent by a partial
+ * write.
+ *
+ * The client is freed on write errors different than EAGAIN. The writable
+ * event is removed once the reply list is empty. */
+void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask)
+{
+ redisClient *c = privdata;
+ int nwritten = 0, totwritten = 0, objlen, willwrite;
+ robj *o;
+ struct iovec iov[REDIS_WRITEV_IOVEC_COUNT];
+ int offset, ion = 0;
+ REDIS_NOTUSED(el);
+ REDIS_NOTUSED(mask);
+
+ listNode *node;
+ while (listLength(c->reply)) {
+ /* Only the first object may have been partially sent already */
+ offset = c->sentlen;
+ ion = 0;
+ willwrite = 0;
+
+ /* fill-in the iov[] array */
+ for(node = listFirst(c->reply); node; node = listNextNode(node)) {
+ o = listNodeValue(node);
+ objlen = sdslen(o->ptr);
+
+ if (totwritten + objlen - offset > REDIS_MAX_WRITE_PER_EVENT)
+ break;
+
+ if(ion == REDIS_WRITEV_IOVEC_COUNT)
+ break; /* no more iovecs */
+
+ iov[ion].iov_base = ((char*)o->ptr) + offset;
+ iov[ion].iov_len = objlen - offset;
+ willwrite += objlen - offset;
+ offset = 0; /* just for the first item */
+ ion++;
+ }
+
+ /* Nothing was gathered: stop here */
+ if(willwrite == 0)
+ break;
+
+ /* write all collected blocks at once */
+ if((nwritten = writev(fd, iov, ion)) < 0) {
+ if (errno != EAGAIN) {
+ redisLog(REDIS_VERBOSE,
+ "Error writing to client: %s", strerror(errno));
+ freeClient(c);
+ return;
+ }
+ break;
+ }
+
+ totwritten += nwritten;
+ offset = c->sentlen;
+
+ /* remove written robjs from c->reply */
+ while (nwritten && listLength(c->reply)) {
+ o = listNodeValue(listFirst(c->reply));
+ objlen = sdslen(o->ptr);
+
+ if(nwritten >= objlen - offset) {
+ listDelNode(c->reply, listFirst(c->reply));
+ nwritten -= objlen - offset;
+ c->sentlen = 0;
+ } else {
+ /* partial write */
+ c->sentlen += nwritten;
+ break;
+ }
+ offset = 0;
+ }
+ }
+
+ if (totwritten > 0)
+ c->lastinteraction = time(NULL);
+
+ if (listLength(c->reply) == 0) {
+ c->sentlen = 0;
+ aeDeleteFileEvent(server.el,c->fd,AE_WRITABLE);
+ }
+}
+
+/* resetClient prepares the client to process the next command: the
+ * arguments of the previous one are released via freeClientArgv() and the
+ * parsing state (bulklen, multibulk) is reset. */
+void resetClient(redisClient *c) {
+ freeClientArgv(c);
+ c->bulklen = -1;
+ c->multibulk = 0;
+}
+
+/* Scan all the clients. The ones idle for more than server.maxidletime are
+ * freed (slaves, masters and clients with Pub/Sub subscriptions are never
+ * timed out). The blocked clients whose blocking timeout is expired get a
+ * null multi bulk reply and are unblocked. */
+void closeTimedoutClients(void) {
+    time_t now = time(NULL);
+    listIter iter;
+    listNode *node;
+
+    listRewind(server.clients,&iter);
+    while ((node = listNext(&iter)) != NULL) {
+        redisClient *c = listNodeValue(node);
+        int idle_expired =
+            server.maxidletime &&
+            /* no timeout for slaves and masters */
+            !(c->flags & (REDIS_SLAVE|REDIS_MASTER)) &&
+            /* no timeout for pubsub */
+            dictSize(c->pubsub_channels) == 0 &&
+            listLength(c->pubsub_patterns) == 0 &&
+            (now - c->lastinteraction > server.maxidletime);
+
+        if (idle_expired) {
+            redisLog(REDIS_VERBOSE,"Closing idle client");
+            freeClient(c);
+            continue;
+        }
+        if ((c->flags & REDIS_BLOCKED) &&
+            c->blockingto != 0 && c->blockingto < now)
+        {
+            addReply(c,shared.nullmultibulk);
+            unblockClientWaitingData(c);
+        }
+    }
+}
+
+/* Parse and execute as many commands as possible from the client query
+ * buffer.
+ *
+ * When c->bulklen is -1 a full line is expected: it is split into space
+ * separated arguments that populate c->argv, then processCommand() is
+ * called. Otherwise the function waits for c->bulklen bytes of bulk data
+ * (final CRLF included) that become the last argument of the command.
+ *
+ * The function returns when more data is needed, when the client is
+ * blocked or waiting for I/O, or when processCommand() returns zero. A
+ * client accumulating REDIS_REQUEST_MAX_SIZE bytes or more without a
+ * newline is freed. */
+void processInputBuffer(redisClient *c) {
+again:
+    /* Before to process the input buffer, make sure the client is not
+     * waiting for a blocking operation such as BLPOP. Note that the first
+     * iteration the client is never blocked, otherwise the processInputBuffer
+     * would not be called at all, but after the execution of the first commands
+     * in the input buffer the client may be blocked, and the "goto again"
+     * will try to reiterate. The following line will make it return asap. */
+    if (c->flags & REDIS_BLOCKED || c->flags & REDIS_IO_WAIT) return;
+    if (c->bulklen == -1) {
+        /* Read the first line of the query */
+        char *p = strchr(c->querybuf,'\n');
+        size_t querylen;
+
+        if (p) {
+            sds query, *argv;
+            int argc, j;
+
+            query = c->querybuf;
+            c->querybuf = sdsempty();
+            querylen = 1+(p-(query));
+            if (sdslen(query) > querylen) {
+                /* leave data after the first line of the query in the buffer */
+                c->querybuf = sdscatlen(c->querybuf,query+querylen,sdslen(query)-querylen);
+            }
+            *p = '\0'; /* remove "\n" */
+            /* and "\r" if any. When the newline is the very first byte of
+             * the buffer there is no previous character to inspect:
+             * looking at p-1 would access memory before the string. */
+            if (p != query && *(p-1) == '\r') *(p-1) = '\0';
+            sdsupdatelen(query);
+
+            /* Now we can split the query in arguments */
+            argv = sdssplitlen(query,sdslen(query)," ",1,&argc);
+            sdsfree(query);
+
+            if (c->argv) zfree(c->argv);
+            c->argv = zmalloc(sizeof(robj*)*argc);
+
+            /* Empty tokens (consecutive spaces) are discarded */
+            for (j = 0; j < argc; j++) {
+                if (sdslen(argv[j])) {
+                    c->argv[c->argc] = createObject(REDIS_STRING,argv[j]);
+                    c->argc++;
+                } else {
+                    sdsfree(argv[j]);
+                }
+            }
+            zfree(argv);
+            if (c->argc) {
+                /* Execute the command. If the client is still valid
+                 * after processCommand() return and there is something
+                 * on the query buffer try to process the next command. */
+                if (processCommand(c) && sdslen(c->querybuf)) goto again;
+            } else {
+                /* Nothing to process, argc == 0. Just process the query
+                 * buffer if it's not empty or return to the caller */
+                if (sdslen(c->querybuf)) goto again;
+            }
+            return;
+        } else if (sdslen(c->querybuf) >= REDIS_REQUEST_MAX_SIZE) {
+            redisLog(REDIS_VERBOSE, "Client protocol error");
+            freeClient(c);
+            return;
+        }
+    } else {
+        /* Bulk read handling. Note that if we are at this point
+           the client already sent a command terminated with a newline,
+           we are reading the bulk data that is actually the last
+           argument of the command. */
+        int qbl = sdslen(c->querybuf);
+
+        if (c->bulklen <= qbl) {
+            /* Copy everything but the final CRLF as final argument */
+            c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
+            c->argc++;
+            c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
+            /* Process the command. If the client is still valid after
+             * the processing and there is more data in the buffer
+             * try to parse it. */
+            if (processCommand(c) && sdslen(c->querybuf)) goto again;
+            return;
+        }
+    }
+}
+
+/* Readable event handler of a client socket: read up to REDIS_IOBUF_LEN
+ * bytes, append them to the query buffer and process it. The client is
+ * freed on read errors different than EAGAIN, and when read() returns zero
+ * (connection closed by the peer). */
+void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask) {
+    redisClient *c = (redisClient*) privdata;
+    char buf[REDIS_IOBUF_LEN];
+    int nread;
+    REDIS_NOTUSED(el);
+    REDIS_NOTUSED(mask);
+
+    nread = read(fd, buf, REDIS_IOBUF_LEN);
+    if (nread == 0) {
+        redisLog(REDIS_VERBOSE, "Client closed connection");
+        freeClient(c);
+        return;
+    }
+    if (nread == -1) {
+        /* Nothing to read right now: wait for the next event */
+        if (errno == EAGAIN) return;
+        redisLog(REDIS_VERBOSE, "Reading from client: %s",strerror(errno));
+        freeClient(c);
+        return;
+    }
+    c->querybuf = sdscatlen(c->querybuf, buf, nread);
+    c->lastinteraction = time(NULL);
+    processInputBuffer(c);
+}
diff --git a/src/object.c b/src/object.c
new file mode 100644
index 000000000..4854909e0
--- /dev/null
+++ b/src/object.c
@@ -0,0 +1,405 @@
+#include "redis.h"
+#include <pthread.h>
+
+/* Create a new object of the given type wrapping "ptr", with
+ * REDIS_ENCODING_RAW encoding and a reference count of 1. The structure is
+ * taken from server.objfreelist when the list is not empty, otherwise it is
+ * allocated with zmalloc(). When VM is enabled the free list is accessed
+ * holding server.obj_freelist_mutex. */
+robj *createObject(int type, void *ptr) {
+    robj *o = NULL;
+
+    if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
+    if (listLength(server.objfreelist)) {
+        listNode *head = listFirst(server.objfreelist);
+
+        o = listNodeValue(head);
+        listDelNode(server.objfreelist,head);
+    }
+    if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
+
+    /* Nothing to recycle: allocate, with the mutex already released */
+    if (o == NULL) o = zmalloc(sizeof(*o));
+
+    o->type = type;
+    o->encoding = REDIS_ENCODING_RAW;
+    o->ptr = ptr;
+    o->refcount = 1;
+    if (server.vm_enabled) {
+        /* Note that this code may run in the context of an I/O thread
+         * and accessing server.lruclock in theory is an error
+         * (no locks). But in practice this is safe, and even if we read
+         * garbage Redis will not fail. */
+        o->lru = server.lruclock;
+        o->storage = REDIS_VM_MEMORY;
+    }
+    return o;
+}
+
+/* Create a REDIS_STRING object holding a new sds string built from the
+ * first "len" bytes at "ptr". */
+robj *createStringObject(char *ptr, size_t len) {
+ return createObject(REDIS_STRING,sdsnewlen(ptr,len));
+}
+
+/* Return a string object representing "value":
+ * - one of the shared.integers objects (reference count incremented) for
+ *   values in the range [0, REDIS_SHARED_INTEGERS);
+ * - a REDIS_ENCODING_INT object when the value fits in a long;
+ * - an object holding the decimal string otherwise. */
+robj *createStringObjectFromLongLong(long long value) {
+    robj *o;
+
+    if (value >= 0 && value < REDIS_SHARED_INTEGERS) {
+        o = shared.integers[value];
+        incrRefCount(o);
+        return o;
+    }
+    if (value < LONG_MIN || value > LONG_MAX)
+        return createObject(REDIS_STRING,sdsfromlonglong(value));
+
+    /* The value is stored directly in the pointer field */
+    o = createObject(REDIS_STRING, NULL);
+    o->encoding = REDIS_ENCODING_INT;
+    o->ptr = (void*)((long)value);
+    return o;
+}
+
+/* Return a new string object holding a copy of the sds string of "o".
+ * The source object must be REDIS_ENCODING_RAW encoded (asserted). */
+robj *dupStringObject(robj *o) {
+ redisAssert(o->encoding == REDIS_ENCODING_RAW);
+ return createStringObject(o->ptr,sdslen(o->ptr));
+}
+
+/* Create an empty REDIS_LIST object backed by a linked list, whose values
+ * are released with decrRefCount(). */
+robj *createListObject(void) {
+    list *values = listCreate();
+    robj *o;
+
+    listSetFreeMethod(values,decrRefCount);
+    o = createObject(REDIS_LIST,values);
+    o->encoding = REDIS_ENCODING_LINKEDLIST;
+    return o;
+}
+
+/* Create an empty REDIS_LIST object backed by a new ziplist. */
+robj *createZiplistObject(void) {
+    robj *o = createObject(REDIS_LIST,ziplistNew());
+
+    o->encoding = REDIS_ENCODING_ZIPLIST;
+    return o;
+}
+
+/* Create an empty REDIS_SET object backed by a dict of type setDictType. */
+robj *createSetObject(void) {
+    return createObject(REDIS_SET,dictCreate(&setDictType,NULL));
+}
+
+/* Create an empty REDIS_HASH object. All the Hashes start as zipmaps: they
+ * will be automatically converted into hash tables if there are enough
+ * elements or big elements inside. */
+robj *createHashObject(void) {
+    robj *o = createObject(REDIS_HASH,zipmapNew());
+
+    o->encoding = REDIS_ENCODING_ZIPMAP;
+    return o;
+}
+
+/* Create an empty REDIS_ZSET object: a zset structure made of a dict of
+ * type zsetDictType and of a skiplist created with zslCreate(). */
+robj *createZsetObject(void) {
+    zset *sorted = zmalloc(sizeof(*sorted));
+
+    sorted->dict = dictCreate(&zsetDictType,NULL);
+    sorted->zsl = zslCreate();
+    return createObject(REDIS_ZSET,sorted);
+}
+
+/* Free the value of a string object: only REDIS_ENCODING_RAW objects own
+ * an sds string to release. */
+void freeStringObject(robj *o) {
+    if (o->encoding != REDIS_ENCODING_RAW) return;
+    sdsfree(o->ptr);
+}
+
+/* Free the value of a list object according to its encoding: linked list
+ * or ziplist. Any other encoding triggers a panic. */
+void freeListObject(robj *o) {
+    if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
+        listRelease((list*) o->ptr);
+    } else if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+        zfree(o->ptr);
+    } else {
+        redisPanic("Unknown list encoding type");
+    }
+}
+
+/* Free the value of a set object, releasing the dict stored in o->ptr. */
+void freeSetObject(robj *o) {
+ dictRelease((dict*) o->ptr);
+}
+
+/* Free the value of a sorted set object: the dict, the skiplist and the
+ * zset structure holding them. */
+void freeZsetObject(robj *o) {
+    zset *sorted = o->ptr;
+
+    dictRelease(sorted->dict);
+    zslFree(sorted->zsl);
+    zfree(sorted);
+}
+
+/* Free the value of a hash object according to its encoding: hash table
+ * or zipmap. Any other encoding triggers a panic. */
+void freeHashObject(robj *o) {
+    if (o->encoding == REDIS_ENCODING_HT) {
+        dictRelease((dict*) o->ptr);
+    } else if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+        zfree(o->ptr);
+    } else {
+        redisPanic("Unknown hash encoding type");
+    }
+}
+
+/* Take one more reference to the object, incrementing o->refcount. */
+void incrRefCount(robj *o) {
+ o->refcount++;
+}
+
+/* Drop one reference to the object. When the reference count reaches zero
+ * the value is freed according to the object type, and the object
+ * structure is either put into server.objfreelist or released with zfree()
+ * (when the free list holds more than REDIS_OBJFREELIST_MAX elements, or
+ * the insertion fails).
+ *
+ * With VM enabled, an object whose storage is REDIS_VM_SWAPPED or
+ * REDIS_VM_LOADING is handled as a vmpointer: its pages are marked free
+ * and the structure is released without looking at the reference count.
+ *
+ * The argument is a void pointer, so the function can be used directly as
+ * list free method. */
+void decrRefCount(void *obj) {
+ robj *o = obj;
+
+ /* Object is a swapped out value, or in the process of being loaded. */
+ if (server.vm_enabled &&
+ (o->storage == REDIS_VM_SWAPPED || o->storage == REDIS_VM_LOADING))
+ {
+ vmpointer *vp = obj;
+ if (o->storage == REDIS_VM_LOADING) vmCancelThreadedIOJob(o);
+ vmMarkPagesFree(vp->page,vp->usedpages);
+ server.vm_stats_swapped_objects--;
+ zfree(vp);
+ return;
+ }
+
+ if (o->refcount <= 0) redisPanic("decrRefCount against refcount <= 0");
+ /* Object is in memory, or in the process of being swapped out.
+ *
+ * If the object is being swapped out, abort the operation on
+ * decrRefCount even if the refcount does not drop to 0: the object
+ * is referenced at least two times, as value of the key AND as
+ * job->val in the iojob. So if we don't invalidate the iojob, when it is
+ * done but the relevant key was removed in the meantime, the
+ * complete jobs handler will not find the key about the job and the
+ * assert will fail. */
+ if (server.vm_enabled && o->storage == REDIS_VM_SWAPPING)
+ vmCancelThreadedIOJob(o);
+ if (--(o->refcount) == 0) {
+ switch(o->type) {
+ case REDIS_STRING: freeStringObject(o); break;
+ case REDIS_LIST: freeListObject(o); break;
+ case REDIS_SET: freeSetObject(o); break;
+ case REDIS_ZSET: freeZsetObject(o); break;
+ case REDIS_HASH: freeHashObject(o); break;
+ default: redisPanic("Unknown object type"); break;
+ }
+ /* Recycle the structure in the free list, accessed holding the mutex
+ * when VM is enabled. */
+ if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
+ if (listLength(server.objfreelist) > REDIS_OBJFREELIST_MAX ||
+ !listAddNodeHead(server.objfreelist,o))
+ zfree(o);
+ if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
+ }
+}
+
+/* Check that the object "o" is of the given type. Returns 0 when it is;
+ * otherwise shared.wrongtypeerr is sent to the client and 1 is returned. */
+int checkType(redisClient *c, robj *o, int type) {
+    if (o->type == type) return 0;
+
+    addReply(c,shared.wrongtypeerr);
+    return 1;
+}
+
+/* Try to encode a string object in order to save space.
+ *
+ * The object is returned untouched when it is already encoded, when it is
+ * referenced more than once, or when its string is not representable as a
+ * long. Otherwise the return value is either the shared integer object for
+ * that value (the reference to "o" is dropped) or "o" itself converted to
+ * REDIS_ENCODING_INT. */
+robj *tryObjectEncoding(robj *o) {
+    long value;
+    sds s = o->ptr;
+
+    /* Already encoded */
+    if (o->encoding != REDIS_ENCODING_RAW) return o;
+
+    /* It's not safe to encode shared objects: shared objects can be shared
+     * everywhere in the "object space" of Redis. Encoded objects can only
+     * appear as "values" (and not, for instance, as keys) */
+    if (o->refcount > 1) return o;
+
+    /* Currently we try to encode only strings */
+    redisAssert(o->type == REDIS_STRING);
+
+    /* Check if we can represent this string as a long integer */
+    if (isStringRepresentableAsLong(s,&value) == REDIS_ERR) return o;
+
+    /* Ok, this object can be encoded */
+    if (value < 0 || value >= REDIS_SHARED_INTEGERS) {
+        /* Outside of the shared range: store the long in the pointer */
+        o->encoding = REDIS_ENCODING_INT;
+        sdsfree(o->ptr);
+        o->ptr = (void*) value;
+        return o;
+    }
+    decrRefCount(o);
+    incrRefCount(shared.integers[value]);
+    return shared.integers[value];
+}
+
+/* Get a decoded version of an encoded object (returned as a new object).
+ * If the object is already raw-encoded just increment the ref count and
+ * return the object itself. Only integer encoded strings can be decoded:
+ * anything else triggers a panic. */
+robj *getDecodedObject(robj *o) {
+    if (o->encoding == REDIS_ENCODING_RAW) {
+        incrRefCount(o);
+        return o;
+    }
+    if (o->type != REDIS_STRING || o->encoding != REDIS_ENCODING_INT) {
+        redisPanic("Unknown encoding type");
+    } else {
+        char digits[32];
+
+        ll2string(digits,32,(long)o->ptr);
+        return createStringObject(digits,strlen(digits));
+    }
+}
+
+/* Compare two string objects via strcmp() or alike.
+ * Note that the objects may be integer-encoded. In such a case we
+ * use ll2string() to get a string representation of the numbers on the stack
+ * and compare the strings, it's much faster than calling getDecodedObject().
+ *
+ * Important note: if objects are not integer encoded, but binary-safe strings,
+ * sdscmp() from sds.c will apply memcmp() so this function can be considered
+ * binary safe. */
+int compareStringObjects(robj *a, robj *b) {
+    redisAssert(a->type == REDIS_STRING && b->type == REDIS_STRING);
+    char bufa[128], bufb[128], *astr, *bstr;
+    int a_is_raw = (a->encoding == REDIS_ENCODING_RAW);
+    int b_is_raw = (b->encoding == REDIS_ENCODING_RAW);
+
+    if (a == b) return 0;
+
+    if (a_is_raw) {
+        astr = a->ptr;
+    } else {
+        ll2string(bufa,sizeof(bufa),(long) a->ptr);
+        astr = bufa;
+    }
+    if (b_is_raw) {
+        bstr = b->ptr;
+    } else {
+        ll2string(bufb,sizeof(bufb),(long) b->ptr);
+        bstr = bufb;
+    }
+    /* sdscmp() can be used only if both the strings are sds strings */
+    if (a_is_raw && b_is_raw) return sdscmp(astr,bstr);
+    return strcmp(astr,bstr);
+}
+
+/* Equal string objects return 1 if the two objects are the same from the
+ * point of view of a string comparison, otherwise 0 is returned. Note that
+ * this function is faster than checking for (compareStringObjects(a,b) == 0)
+ * because when both the objects are encoded it just compares the two
+ * pointer fields. */
+int equalStringObjects(robj *a, robj *b) {
+    int both_encoded = (a->encoding != REDIS_ENCODING_RAW &&
+                        b->encoding != REDIS_ENCODING_RAW);
+
+    if (!both_encoded) return compareStringObjects(a,b) == 0;
+    return a->ptr == b->ptr;
+}
+
+/* Return the length of the string object: sdslen() of the string for
+ * REDIS_ENCODING_RAW objects, the value returned by ll2string() for the
+ * long stored in o->ptr otherwise. */
+size_t stringObjectLen(robj *o) {
+    char digits[32];
+
+    redisAssert(o->type == REDIS_STRING);
+    if (o->encoding != REDIS_ENCODING_RAW)
+        return ll2string(digits,32,(long)o->ptr);
+    return sdslen(o->ptr);
+}
+
+/* Store in "*target" the value of the string object "o" as a double.
+ * A NULL object is read as zero. Returns REDIS_OK on success, REDIS_ERR
+ * (leaving "*target" untouched) when strtod() does not consume the whole
+ * string. */
+int getDoubleFromObject(robj *o, double *target) {
+    double value = 0;
+
+    if (o != NULL) {
+        redisAssert(o->type == REDIS_STRING);
+        switch (o->encoding) {
+        case REDIS_ENCODING_RAW: {
+            char *end;
+
+            value = strtod(o->ptr, &end);
+            if (end[0] != '\0') return REDIS_ERR;
+            break;
+        }
+        case REDIS_ENCODING_INT:
+            value = (long)o->ptr;
+            break;
+        default:
+            redisPanic("Unknown string encoding");
+        }
+    }
+
+    *target = value;
+    return REDIS_OK;
+}
+
+int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg) {
+ double value;
+ if (getDoubleFromObject(o, &value) != REDIS_OK) {
+ if (msg != NULL) {
+ addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
+ } else {
+ addReplySds(c, sdsnew("-ERR value is not a double\r\n"));
+ }
+ return REDIS_ERR;
+ }
+
+ *target = value;
+ return REDIS_OK;
+}
+
+/* Store in "*target" the value of the string object "o" as a long long.
+ * A NULL object is read as zero.
+ *
+ * Returns REDIS_OK on success. REDIS_ERR is returned, leaving "*target"
+ * untouched, when the string is not entirely consumed by strtoll() or when
+ * the number is out of the long long range. */
+int getLongLongFromObject(robj *o, long long *target) {
+    long long value;
+    char *eptr;
+
+    if (o == NULL) {
+        value = 0;
+    } else {
+        redisAssert(o->type == REDIS_STRING);
+        if (o->encoding == REDIS_ENCODING_RAW) {
+            /* strtoll() signals overflows only via errno, returning the
+             * clamped LLONG_MIN / LLONG_MAX value: without this check an
+             * out of range number would be silently accepted. */
+            errno = 0;
+            value = strtoll(o->ptr, &eptr, 10);
+            if (eptr[0] != '\0' || errno == ERANGE) return REDIS_ERR;
+        } else if (o->encoding == REDIS_ENCODING_INT) {
+            value = (long)o->ptr;
+        } else {
+            redisPanic("Unknown string encoding");
+        }
+    }
+
+    *target = value;
+    return REDIS_OK;
+}
+
+int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg) {
+ long long value;
+ if (getLongLongFromObject(o, &value) != REDIS_OK) {
+ if (msg != NULL) {
+ addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
+ } else {
+ addReplySds(c, sdsnew("-ERR value is not an integer\r\n"));
+ }
+ return REDIS_ERR;
+ }
+
+ *target = value;
+ return REDIS_OK;
+}
+
+/* Like getLongLongFromObjectOrReply(), but the value must also fit in a
+ * long: when it does not an error is sent to the client ("-ERR <msg>" if
+ * "msg" is not NULL, a default message otherwise) and REDIS_ERR is
+ * returned. "*target" is set only on success. */
+int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg) {
+    long long wide;
+
+    if (getLongLongFromObjectOrReply(c, o, &wide, msg) != REDIS_OK) return REDIS_ERR;
+    if (wide >= LONG_MIN && wide <= LONG_MAX) {
+        *target = wide;
+        return REDIS_OK;
+    }
+    if (msg == NULL)
+        addReplySds(c, sdsnew("-ERR value is out of range\r\n"));
+    else
+        addReplySds(c, sdscatprintf(sdsempty(), "-ERR %s\r\n", msg));
+    return REDIS_ERR;
+}
+
+/* Return the name of the given encoding as a constant string, "unknown"
+ * when the encoding is not one of the handled ones. */
+char *strEncoding(int encoding) {
+    if (encoding == REDIS_ENCODING_RAW) return "raw";
+    if (encoding == REDIS_ENCODING_INT) return "int";
+    if (encoding == REDIS_ENCODING_HT) return "hashtable";
+    if (encoding == REDIS_ENCODING_ZIPMAP) return "zipmap";
+    if (encoding == REDIS_ENCODING_LINKEDLIST) return "linkedlist";
+    if (encoding == REDIS_ENCODING_ZIPLIST) return "ziplist";
+    return "unknown";
+}
diff --git a/src/pqsort.c b/src/pqsort.c
new file mode 100644
index 000000000..257756376
--- /dev/null
+++ b/src/pqsort.c
@@ -0,0 +1,197 @@
+/* The following is the NetBSD libc qsort implementation modified in order to
+ * support partial sorting of ranges for Redis.
+ *
+ * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved.
+ *
+ * The original copyright notice follows. */
+
+
+/* $NetBSD: qsort.c,v 1.19 2009/01/30 23:38:44 lukem Exp $ */
+
+/*-
+ * Copyright (c) 1992, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* Forward declarations of the helpers defined below. */
+static inline char *med3 (char *, char *, char *,
+ int (*)(const void *, const void *));
+static inline void swapfunc (char *, char *, size_t, int);
+
+/* Smaller of two values. NOTE(review): neither the expansion as a whole nor
+ * the arguments in the result branches are parenthesized, so the macro is
+ * safe only when used as the whole right-hand side of an assignment, as
+ * done in this file. */
+#define min(a, b) (a) < (b) ? a : b
+
+/*
+ * Qsort routine from Bentley & McIlroy's "Engineering a Sort Function".
+ */
+/* swapcode: exchange the n bytes at parmi and parmj, one TYPE sized item
+ * at a time. The loop is a do/while: at least one item is always swapped. */
+#define swapcode(TYPE, parmi, parmj, n) { \
+ size_t i = (n) / sizeof (TYPE); \
+ TYPE *pi = (TYPE *)(void *)(parmi); \
+ TYPE *pj = (TYPE *)(void *)(parmj); \
+ do { \
+ TYPE t = *pi; \
+ *pi++ = *pj; \
+ *pj++ = t; \
+ } while (--i > 0); \
+}
+
+/* SWAPINIT: set the local variable "swaptype" to 2 when the address "a" or
+ * the element size "es" is not a multiple of sizeof(long), to 0 when es is
+ * exactly sizeof(long), to 1 otherwise. */
+#define SWAPINIT(a, es) swaptype = ((char *)a - (char *)0) % sizeof(long) || \
+ es % sizeof(long) ? 2 : es == sizeof(long)? 0 : 1;
+
+/* Exchange the n bytes at "a" and "b": long by long when swaptype is 0 or
+ * 1, byte by byte otherwise. The swapcode() invocations carry no trailing
+ * semicolon on purpose: the macro expands to a braced block, and a
+ * semicolon after the first one would detach the "else". */
+static inline void
+swapfunc(char *a, char *b, size_t n, int swaptype)
+{
+
+ if (swaptype <= 1)
+ swapcode(long, a, b, n)
+ else
+ swapcode(char, a, b, n)
+}
+
+/* swap: exchange two elements. With swaptype 0 a single long is exchanged
+ * inline, otherwise swapfunc() is called on "es" bytes. The macro relies
+ * on the "swaptype" and "es" variables of the calling function. */
+#define swap(a, b) \
+ if (swaptype == 0) { \
+ long t = *(long *)(void *)(a); \
+ *(long *)(void *)(a) = *(long *)(void *)(b); \
+ *(long *)(void *)(b) = t; \
+ } else \
+ swapfunc(a, b, es, swaptype)
+
+/* vecswap: exchange n bytes between a and b, doing nothing when n is not
+ * greater than zero. */
+#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype)
+
+/* Return the pointer, among a, b and c, to the element that is the median
+ * of the three according to the comparison function "cmp". */
+static inline char *
+med3(char *a, char *b, char *c,
+    int (*cmp) (const void *, const void *))
+{
+    if (cmp(a, b) < 0) {
+        /* a < b */
+        if (cmp(b, c) < 0) return b;
+        return cmp(a, c) < 0 ? c : a;
+    }
+    /* a >= b */
+    if (cmp(b, c) > 0) return b;
+    return cmp(a, c) < 0 ? a : c;
+}
+
+/* Partial quicksort of the array "a", made of "n" elements of "es" bytes
+ * each, ordered by "cmp". "lrange" and "rrange" are pointers delimiting a
+ * byte range: after the partitioning step, a partition is further sorted
+ * only if it is not entirely on one side of that range. */
+static void
+_pqsort(void *a, size_t n, size_t es,
+ int (*cmp) (const void *, const void *), void *lrange, void *rrange)
+{
+ char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
+ size_t d, r;
+ int swaptype, swap_cnt, cmp_result;
+
+loop: SWAPINIT(a, es);
+ swap_cnt = 0;
+ /* Less than 7 elements: insertion sort */
+ if (n < 7) {
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+ for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+ /* Pivot selection: the middle element for n == 7, the median of first,
+ * middle and last element for bigger arrays, where each of the three
+ * candidates is in turn a median of three when n > 40. */
+ pm = (char *) a + (n / 2) * es;
+ if (n > 7) {
+ pl = (char *) a;
+ pn = (char *) a + (n - 1) * es;
+ if (n > 40) {
+ d = (n / 8) * es;
+ pl = med3(pl, pl + d, pl + 2 * d, cmp);
+ pm = med3(pm - d, pm, pm + d, cmp);
+ pn = med3(pn - 2 * d, pn - d, pn, cmp);
+ }
+ pm = med3(pl, pm, pn, cmp);
+ }
+ /* Move the pivot on the first position */
+ swap(a, pm);
+ pa = pb = (char *) a + es;
+
+ pc = pd = (char *) a + (n - 1) * es;
+ /* Partitioning: pb scans from the left, pc from the right. The elements
+ * equal to the pivot are moved at the two ends of the array (before pa
+ * and after pd). */
+ for (;;) {
+ while (pb <= pc && (cmp_result = cmp(pb, a)) <= 0) {
+ if (cmp_result == 0) {
+ swap_cnt = 1;
+ swap(pa, pb);
+ pa += es;
+ }
+ pb += es;
+ }
+ while (pb <= pc && (cmp_result = cmp(pc, a)) >= 0) {
+ if (cmp_result == 0) {
+ swap_cnt = 1;
+ swap(pc, pd);
+ pd -= es;
+ }
+ pc -= es;
+ }
+ if (pb > pc)
+ break;
+ swap(pb, pc);
+ swap_cnt = 1;
+ pb += es;
+ pc -= es;
+ }
+ if (swap_cnt == 0) { /* Switch to insertion sort */
+ for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+ for (pl = pm; pl > (char *) a && cmp(pl - es, pl) > 0;
+ pl -= es)
+ swap(pl, pl - es);
+ return;
+ }
+
+ /* Bring the elements equal to the pivot from the two ends to the middle
+ * of the array */
+ pn = (char *) a + n * es;
+ r = min(pa - (char *) a, pb - pa);
+ vecswap(a, pb - r, r);
+ r = min((size_t)(pd - pc), pn - pd - es);
+ vecswap(pb, pn - r, r);
+ /* Left partition: recurse unless the requested range is entirely before
+ * or entirely after it */
+ if ((r = pb - pa) > es) {
+ void *_l = a, *_r = ((unsigned char*)a)+r-1;
+ if (!((lrange < _l && rrange < _l) ||
+ (lrange > _r && rrange > _r)))
+ _pqsort(a, r / es, es, cmp, lrange, rrange);
+ }
+ /* Right partition: same range check as above */
+ if ((r = pd - pc) > es) {
+ void *_l, *_r;
+
+ /* Iterate rather than recurse to save stack space */
+ a = pn - r;
+ n = r / es;
+
+ _l = a;
+ _r = ((unsigned char*)a)+r-1;
+ if (!((lrange < _l && rrange < _l) ||
+ (lrange > _r && rrange > _r)))
+ goto loop;
+ }
+/* qsort(pn - r, r / es, es, cmp);*/
+}
+
+/* Partial sort entry point: "a" is an array of "n" elements of "es" bytes
+ * each, ordered by "cmp". "lrange" and "rrange" are element indexes: they
+ * are converted into the pointers to the first byte of element lrange and
+ * to the last byte of element rrange, and passed to _pqsort(). */
+void
+pqsort(void *a, size_t n, size_t es,
+    int (*cmp) (const void *, const void *), size_t lrange, size_t rrange)
+{
+    unsigned char *base = a;
+    unsigned char *first = base + (lrange*es);
+    unsigned char *last = base + ((rrange+1)*es) - 1;
+
+    _pqsort(a,n,es,cmp,first,last);
+}
diff --git a/src/pqsort.h b/src/pqsort.h
new file mode 100644
index 000000000..5054d5209
--- /dev/null
+++ b/src/pqsort.h
@@ -0,0 +1,15 @@
+/* The following is the NetBSD libc qsort implementation modified in order to
+ * support partial sorting of ranges for Redis.
+ *
+ * Copyright(C) 2009-2010 Salvatore Sanfilippo. All rights reserved.
+ *
+ * See the pqsort.c file for the original copyright notice. */
+
+#ifndef __PQSORT_H
+#define __PQSORT_H
+
+/* Partial qsort: 'a' is an array of 'n' elements, each 'es' bytes wide,
+ * compared with 'cmp' (same contract as the qsort() comparison callback).
+ * 'lrange' and 'rrange' are element indexes delimiting the range of
+ * interest; pqsort.c treats 'rrange' as inclusive. */
+void
+pqsort(void *a, size_t n, size_t es,
+ int (*cmp) (const void *, const void *), size_t lrange, size_t rrange);
+
+#endif
diff --git a/src/pubsub.c b/src/pubsub.c
new file mode 100644
index 000000000..c9f5f310e
--- /dev/null
+++ b/src/pubsub.c
@@ -0,0 +1,259 @@
+#include "redis.h"
+
+/* Dispose of a pubsubPattern structure: drop the reference held on its
+ * pattern object, then free the structure itself. The argument is a void
+ * pointer so the function can be used as a generic free callback. */
+void freePubsubPattern(void *p) {
+    pubsubPattern *entry = (pubsubPattern *) p;
+
+    decrRefCount(entry->pattern);
+    zfree(entry);
+}
+
+/* Compare two pubsubPattern structures. They match (1 is returned) only
+ * when they refer to the same client and their pattern objects hold equal
+ * strings; otherwise 0 is returned. */
+int listMatchPubsubPattern(void *a, void *b) {
+    pubsubPattern *lhs = a;
+    pubsubPattern *rhs = b;
+
+    if (lhs->client != rhs->client) return 0;
+    return equalStringObjects(lhs->pattern,rhs->pattern) != 0;
+}
+
+/* Subscribe client 'c' to 'channel'.
+ *
+ * The channel is recorded both in the per-client dictionary
+ * (c->pubsub_channels) and in the server-wide channel -> list of clients
+ * dictionary (server.pubsub_channels). A reference to the channel object
+ * is taken for every dictionary it is added to as a key.
+ *
+ * A "subscribe" reply carrying the total number of subscriptions of the
+ * client (channels plus patterns) is always sent, even if nothing changed.
+ *
+ * Returns 1 if the subscription was added, or 0 if the client was already
+ * subscribed to that channel. */
+int pubsubSubscribeChannel(redisClient *c, robj *channel) {
+    int added = 0;
+
+    /* Client side: channel -> NULL entry in the client's own dictionary. */
+    if (dictAdd(c->pubsub_channels,channel,NULL) == DICT_OK) {
+        struct dictEntry *entry;
+        list *subscribers;
+
+        added = 1;
+        incrRefCount(channel);
+
+        /* Server side: append the client to the list of subscribers,
+         * creating the list when this is the first one. */
+        entry = dictFind(server.pubsub_channels,channel);
+        if (entry != NULL) {
+            subscribers = dictGetEntryVal(entry);
+        } else {
+            subscribers = listCreate();
+            dictAdd(server.pubsub_channels,channel,subscribers);
+            incrRefCount(channel);
+        }
+        listAddNodeTail(subscribers,c);
+    }
+
+    /* Notify the client */
+    addReply(c,shared.mbulk3);
+    addReply(c,shared.subscribebulk);
+    addReplyBulk(c,channel);
+    addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
+    return added;
+}
+
+/* Unsubscribe client 'c' from 'channel'.
+ *
+ * The channel is removed from the client's dictionary and the client is
+ * removed from the server-wide list of subscribers of that channel. When
+ * 'notify' is non-zero an "unsubscribe" reply with the number of remaining
+ * subscriptions (channels plus patterns) is sent, whether or not the
+ * client was actually subscribed.
+ *
+ * Returns 1 if the client was subscribed to the channel, 0 otherwise. */
+int pubsubUnsubscribeChannel(redisClient *c, robj *channel, int notify) {
+    int removed = 0;
+
+    /* 'channel' may be the very same object used as key in the hash
+     * tables we are about to modify: hold a reference while we work. */
+    incrRefCount(channel);
+
+    if (dictDelete(c->pubsub_channels,channel) == DICT_OK) {
+        struct dictEntry *de;
+        list *subscribers;
+        listNode *ln;
+
+        removed = 1;
+        /* Drop the client from the channel -> clients list. */
+        de = dictFind(server.pubsub_channels,channel);
+        redisAssert(de != NULL);
+        subscribers = dictGetEntryVal(de);
+        ln = listSearchKey(subscribers,c);
+        redisAssert(ln != NULL);
+        listDelNode(subscribers,ln);
+
+        /* If this was the last subscriber remove the hash entry too, so
+         * that it is not possible to abuse Redis PUBSUB by creating
+         * millions of channels that are never released. */
+        if (listLength(subscribers) == 0)
+            dictDelete(server.pubsub_channels,channel);
+    }
+
+    /* Notify the client */
+    if (notify) {
+        addReply(c,shared.mbulk3);
+        addReply(c,shared.unsubscribebulk);
+        addReplyBulk(c,channel);
+        addReplyLongLong(c,dictSize(c->pubsub_channels)+
+                           listLength(c->pubsub_patterns));
+    }
+    decrRefCount(channel); /* it is finally safe to release it */
+    return removed;
+}
+
+/* Subscribe client 'c' to 'pattern'.
+ *
+ * The pattern object is appended to the client's own list of patterns
+ * (taking a reference), and a pubsubPattern structure pairing the client
+ * with a decoded copy of the pattern is appended to the server-wide list.
+ * A "psubscribe" reply with the total number of subscriptions is always
+ * sent.
+ *
+ * Returns 1 if the subscription was added, or 0 if the client was already
+ * subscribed to that pattern. */
+int pubsubSubscribePattern(redisClient *c, robj *pattern) {
+    int added = 0;
+
+    if (listSearchKey(c->pubsub_patterns,pattern) == NULL) {
+        pubsubPattern *entry;
+
+        added = 1;
+        listAddNodeTail(c->pubsub_patterns,pattern);
+        incrRefCount(pattern);
+
+        entry = zmalloc(sizeof(*entry));
+        entry->pattern = getDecodedObject(pattern);
+        entry->client = c;
+        listAddNodeTail(server.pubsub_patterns,entry);
+    }
+
+    /* Notify the client */
+    addReply(c,shared.mbulk3);
+    addReply(c,shared.psubscribebulk);
+    addReplyBulk(c,pattern);
+    addReplyLongLong(c,dictSize(c->pubsub_channels)+listLength(c->pubsub_patterns));
+    return added;
+}
+
+/* Unsubscribe client 'c' from 'pattern'.
+ *
+ * The pattern is removed from the client's list and the matching
+ * (client, pattern) node is removed from the server-wide list. When
+ * 'notify' is non-zero a "punsubscribe" reply with the number of remaining
+ * subscriptions is sent, whether or not the client was subscribed.
+ *
+ * Returns 1 if the client was subscribed to the pattern, 0 otherwise. */
+int pubsubUnsubscribePattern(redisClient *c, robj *pattern, int notify) {
+    listNode *node;
+    int removed = 0;
+
+    /* The object may be the same one stored in the list node we are going
+     * to delete: keep it alive until we are done. */
+    incrRefCount(pattern);
+
+    node = listSearchKey(c->pubsub_patterns,pattern);
+    if (node != NULL) {
+        pubsubPattern lookup;
+
+        removed = 1;
+        listDelNode(c->pubsub_patterns,node);
+
+        /* Stack-allocated key used only to locate the server-side node. */
+        lookup.client = c;
+        lookup.pattern = pattern;
+        node = listSearchKey(server.pubsub_patterns,&lookup);
+        listDelNode(server.pubsub_patterns,node);
+    }
+
+    /* Notify the client */
+    if (notify) {
+        addReply(c,shared.mbulk3);
+        addReply(c,shared.punsubscribebulk);
+        addReplyBulk(c,pattern);
+        addReplyLongLong(c,dictSize(c->pubsub_channels)+
+                           listLength(c->pubsub_patterns));
+    }
+    decrRefCount(pattern);
+    return removed;
+}
+
+/* Unsubscribe client 'c' from every channel found in its channels
+ * dictionary, forwarding 'notify' to pubsubUnsubscribeChannel() for each
+ * of them. Returns the number of channels the client was unsubscribed
+ * from. */
+int pubsubUnsubscribeAllChannels(redisClient *c, int notify) {
+    dictIterator *iter = dictGetIterator(c->pubsub_channels);
+    dictEntry *entry;
+    int unsubscribed = 0;
+
+    for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
+        robj *channel = dictGetEntryKey(entry);
+
+        unsubscribed += pubsubUnsubscribeChannel(c,channel,notify);
+    }
+    dictReleaseIterator(iter);
+    return unsubscribed;
+}
+
+/* Unsubscribe client 'c' from every pattern found in its patterns list,
+ * forwarding 'notify' to pubsubUnsubscribePattern() for each of them.
+ * Returns the number of patterns the client was unsubscribed from. */
+int pubsubUnsubscribeAllPatterns(redisClient *c, int notify) {
+    listIter iter;
+    listNode *node;
+    int unsubscribed = 0;
+
+    listRewind(c->pubsub_patterns,&iter);
+    for (node = listNext(&iter); node != NULL; node = listNext(&iter)) {
+        robj *pattern = node->value;
+
+        unsubscribed += pubsubUnsubscribePattern(c,pattern,notify);
+    }
+    return unsubscribed;
+}
+
+/* Publish 'message' on 'channel'.
+ *
+ * The message is delivered as a 3 elements "message" reply to every client
+ * subscribed to the channel, and as a 4 elements "pmessage" reply to every
+ * (client, pattern) pair whose pattern matches the channel name. A client
+ * matching more than once receives (and is counted for) each match.
+ *
+ * Returns the number of deliveries performed. */
+int pubsubPublishMessage(robj *channel, robj *message) {
+    struct dictEntry *entry;
+    listNode *node;
+    listIter iter;
+    int delivered = 0;
+
+    /* Send to clients listening for that channel */
+    entry = dictFind(server.pubsub_channels,channel);
+    if (entry != NULL) {
+        list *subscribers = dictGetEntryVal(entry);
+
+        listRewind(subscribers,&iter);
+        while ((node = listNext(&iter)) != NULL) {
+            redisClient *receiver = node->value;
+
+            addReply(receiver,shared.mbulk3);
+            addReply(receiver,shared.messagebulk);
+            addReplyBulk(receiver,channel);
+            addReplyBulk(receiver,message);
+            delivered++;
+        }
+    }
+
+    /* Send to clients listening to matching channels */
+    if (listLength(server.pubsub_patterns)) {
+        robj *decoded;
+
+        listRewind(server.pubsub_patterns,&iter);
+        /* stringmatchlen() needs the raw bytes of the channel name. */
+        decoded = getDecodedObject(channel);
+        while ((node = listNext(&iter)) != NULL) {
+            pubsubPattern *pat = node->value;
+            int matches = stringmatchlen((char*)pat->pattern->ptr,
+                                         sdslen(pat->pattern->ptr),
+                                         (char*)decoded->ptr,
+                                         sdslen(decoded->ptr),0);
+
+            if (!matches) continue;
+            addReply(pat->client,shared.mbulk4);
+            addReply(pat->client,shared.pmessagebulk);
+            addReplyBulk(pat->client,pat->pattern);
+            addReplyBulk(pat->client,decoded);
+            addReplyBulk(pat->client,message);
+            delivered++;
+        }
+        decrRefCount(decoded);
+    }
+    return delivered;
+}
+
+/* SUBSCRIBE: subscribe the client to every channel given as argument
+ * (argv[1] .. argv[argc-1]). One reply per channel is produced by
+ * pubsubSubscribeChannel(). */
+void subscribeCommand(redisClient *c) {
+    int idx = 1;
+
+    while (idx < c->argc) {
+        pubsubSubscribeChannel(c,c->argv[idx]);
+        idx++;
+    }
+}
+
+/* UNSUBSCRIBE: without arguments unsubscribe the client from all its
+ * channels, otherwise only from the channels listed as arguments. The
+ * client is notified for every channel processed. */
+void unsubscribeCommand(redisClient *c) {
+    int idx;
+
+    if (c->argc == 1) {
+        pubsubUnsubscribeAllChannels(c,1);
+        return;
+    }
+    for (idx = 1; idx < c->argc; idx++)
+        pubsubUnsubscribeChannel(c,c->argv[idx],1);
+}
+
+/* PSUBSCRIBE: subscribe the client to every pattern given as argument
+ * (argv[1] .. argv[argc-1]). One reply per pattern is produced by
+ * pubsubSubscribePattern(). */
+void psubscribeCommand(redisClient *c) {
+    int idx = 1;
+
+    while (idx < c->argc) {
+        pubsubSubscribePattern(c,c->argv[idx]);
+        idx++;
+    }
+}
+
+/* PUNSUBSCRIBE: without arguments unsubscribe the client from all its
+ * patterns, otherwise only from the patterns listed as arguments. The
+ * client is notified for every pattern processed. */
+void punsubscribeCommand(redisClient *c) {
+    int idx;
+
+    if (c->argc == 1) {
+        pubsubUnsubscribeAllPatterns(c,1);
+        return;
+    }
+    for (idx = 1; idx < c->argc; idx++)
+        pubsubUnsubscribePattern(c,c->argv[idx],1);
+}
+
+/* PUBLISH: deliver argv[2] as a message on channel argv[1] and reply with
+ * the number of deliveries reported by pubsubPublishMessage(). */
+void publishCommand(redisClient *c) {
+    robj *channel = c->argv[1];
+    robj *message = c->argv[2];
+
+    addReplyLongLong(c,pubsubPublishMessage(channel,message));
+}
diff --git a/src/rdb.c b/src/rdb.c
new file mode 100644
index 000000000..5bda5e565
--- /dev/null
+++ b/src/rdb.c
@@ -0,0 +1,886 @@
+#include "redis.h"
+#include "lzf.h" /* LZF compression library */
+
+#include <math.h>
+
+/* Write the single byte 'type' to 'fp'.
+ * Returns 0 on success, -1 if the byte could not be written. */
+int rdbSaveType(FILE *fp, unsigned char type) {
+    return (fwrite(&type,1,1,fp) == 0) ? -1 : 0;
+}
+
+/* Write the time 't' to 'fp' as a 4 byte value: the time is first
+ * converted to int32_t and those bytes are written as they are in memory.
+ * Returns 0 on success, -1 on write error. */
+int rdbSaveTime(FILE *fp, time_t t) {
+    int32_t stamp = (int32_t) t;
+
+    return (fwrite(&stamp,sizeof(stamp),1,fp) == 0) ? -1 : 0;
+}
+
+/* Write 'len' to 'fp' using the variable size length encoding (see also
+ * rdbLoadLen()). The two most significant bits of the first byte select
+ * the format:
+ *
+ *   REDIS_RDB_6BITLEN:  len < 2^6, stored in the low 6 bits of that byte.
+ *   REDIS_RDB_14BITLEN: len < 2^14, high 6 bits in the first byte and the
+ *                       low 8 bits in the following one.
+ *   REDIS_RDB_32BITLEN: anything else, the first byte is followed by the
+ *                       length as 4 bytes in network byte order.
+ *
+ * Returns 0 on success, -1 on write error. */
+int rdbSaveLen(FILE *fp, uint32_t len) {
+    unsigned char buf[2];
+    uint32_t netlen;
+
+    if (len < (1<<6)) {
+        /* 6 bit len */
+        buf[0] = (len&0xFF)|(REDIS_RDB_6BITLEN<<6);
+        return (fwrite(buf,1,1,fp) == 0) ? -1 : 0;
+    }
+    if (len < (1<<14)) {
+        /* 14 bit len */
+        buf[0] = ((len>>8)&0xFF)|(REDIS_RDB_14BITLEN<<6);
+        buf[1] = len&0xFF;
+        return (fwrite(buf,2,1,fp) == 0) ? -1 : 0;
+    }
+    /* 32 bit len */
+    buf[0] = (REDIS_RDB_32BITLEN<<6);
+    if (fwrite(buf,1,1,fp) == 0) return -1;
+    netlen = htonl(len);
+    if (fwrite(&netlen,4,1,fp) == 0) return -1;
+    return 0;
+}
+
+/* Try to encode 'value' in the compact integer format.
+ *
+ * If the value fits a signed 8, 16 or 32 bit integer, an encoding byte
+ * (REDIS_RDB_ENCVAL in the two top bits plus the integer type) followed by
+ * the value in little endian order is stored in 'enc', which must have
+ * room for at least 5 bytes, and the number of bytes used (2, 3 or 5) is
+ * returned. Values outside the 32 bit range are not encoded and 0 is
+ * returned. */
+int rdbEncodeInteger(long long value, unsigned char *enc) {
+    int j;
+
+    if (value >= -(1<<7) && value <= (1<<7)-1) {
+        enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT8;
+        enc[1] = value&0xFF;
+        return 2;
+    }
+    if (value >= -(1<<15) && value <= (1<<15)-1) {
+        enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT16;
+        enc[1] = value&0xFF;
+        enc[2] = (value>>8)&0xFF;
+        return 3;
+    }
+    if (value < -((long long)1<<31) || value > ((long long)1<<31)-1)
+        return 0; /* Out of the supported ranges */
+
+    enc[0] = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_INT32;
+    for (j = 0; j < 4; j++)
+        enc[1+j] = (value>>(8*j))&0xFF;
+    return 5;
+}
+
+/* String objects in the form "2391" "-100" without any space and with a
+ * range of values that can fit in an 8, 16 or 32 bit signed value can be
+ * encoded as integers to save space.
+ *
+ * 's' points to 'len' bytes that are NOT required to be NUL terminated
+ * (rdbSaveRawString() is also called with pointers into ziplist and zipmap
+ * payloads), so the bytes are copied into a local, terminated buffer
+ * before strtoll() is used on them.
+ *
+ * On success the encoded representation is written to 'enc' and its
+ * length is returned, otherwise 0 is returned. */
+int rdbTryIntegerEncoding(char *s, size_t len, unsigned char *enc) {
+    long long value;
+    char *endptr, buf[32], num[32];
+
+    /* The canonical representation of a long long never needs more than
+     * 20 characters, so empty or longer strings can't round trip. */
+    if (len == 0 || len >= sizeof(num)) return 0;
+    memcpy(num,s,len);
+    num[len] = '\0';
+
+    /* Check if it's possible to encode this value as a number */
+    value = strtoll(num, &endptr, 10);
+    if (endptr[0] != '\0') return 0;
+    ll2string(buf,32,value);
+
+    /* If the number converted back into a string is not identical
+     * then it's not possible to encode the string as integer */
+    if (strlen(buf) != len || memcmp(buf,s,len)) return 0;
+
+    return rdbEncodeInteger(value,enc);
+}
+
+/* Try to save the 'len' bytes at 's' in LZF compressed form.
+ *
+ * On disk the string is stored as: encoding byte, compressed length,
+ * original length, compressed payload.
+ *
+ * Returns the compressed length if the string was written, 0 if nothing
+ * was written because the data can't be compressed by at least four
+ * bytes, and -1 on write error. */
+int rdbSaveLzfStringObject(FILE *fp, unsigned char *s, size_t len) {
+    size_t comprlen, outlen;
+    unsigned char marker;
+    void *out;
+
+    /* We require at least four bytes compression for this to be worth it */
+    if (len <= 4) return 0;
+    outlen = len-4;
+    out = zmalloc(outlen+1);
+    if (out == NULL) return 0;
+
+    comprlen = lzf_compress(s, len, out, outlen);
+    if (comprlen == 0) {
+        zfree(out);
+        return 0;
+    }
+
+    /* Data compressed! Let's save it on disk */
+    marker = (REDIS_RDB_ENCVAL<<6)|REDIS_RDB_ENC_LZF;
+    if (fwrite(&marker,1,1,fp) == 0 ||
+        rdbSaveLen(fp,comprlen) == -1 ||
+        rdbSaveLen(fp,len) == -1 ||
+        fwrite(out,comprlen,1,fp) == 0)
+    {
+        zfree(out);
+        return -1;
+    }
+    zfree(out);
+    return comprlen;
+}
+
+/* Save the 'len' bytes at 's' as a string on disk. Three representations
+ * are tried in order:
+ *
+ * 1) Strings up to 11 bytes that look like an integer are saved in the
+ *    special integer encoded form.
+ * 2) When server.rdbcompression is set, strings longer than 20 bytes are
+ *    saved LZF compressed if compression is effective.
+ * 3) Otherwise the string is saved verbatim as [len][data].
+ *
+ * Returns 0 on success, -1 on write error. */
+int rdbSaveRawString(FILE *fp, unsigned char *s, size_t len) {
+    /* Try integer encoding */
+    if (len <= 11) {
+        unsigned char intenc[5];
+        int enclen = rdbTryIntegerEncoding((char*)s,len,intenc);
+
+        if (enclen > 0)
+            return (fwrite(intenc,enclen,1,fp) == 0) ? -1 : 0;
+    }
+
+    /* Try LZF compression - under 20 bytes it's unable to compress even
+     * aaaaaaaaaaaaaaaaaa so skip it */
+    if (server.rdbcompression && len > 20) {
+        int written = rdbSaveLzfStringObject(fp,s,len);
+
+        if (written == -1) return -1;
+        if (written > 0) return 0;
+        /* written == 0: data can't be compressed, save the old way */
+    }
+
+    /* Store verbatim */
+    if (rdbSaveLen(fp,len) == -1) return -1;
+    if (len && fwrite(s,len,1,fp) == 0) return -1;
+    return 0;
+}
+
+/* Save 'value' as a string: in the integer encoded form when
+ * rdbEncodeInteger() accepts it, otherwise as the [len][data] decimal
+ * representation of the number.
+ * Returns 0 on success, -1 on write error. */
+int rdbSaveLongLongAsStringObject(FILE *fp, long long value) {
+    unsigned char buf[32];
+    int enclen = rdbEncodeInteger(value,buf);
+
+    if (enclen > 0)
+        return (fwrite(buf,enclen,1,fp) == 0) ? -1 : 0;
+
+    /* Encode as string */
+    enclen = ll2string((char*)buf,32,value);
+    redisAssert(enclen < 32);
+    if (rdbSaveLen(fp,enclen) == -1) return -1;
+    if (fwrite(buf,enclen,1,fp) == 0) return -1;
+    return 0;
+}
+
+/* Like rdbSaveRawString() but takes a string object, that may be either
+ * raw or integer encoded. Integer encoded objects are saved directly from
+ * their numeric value, avoiding a decode / encode round trip.
+ * Returns 0 on success, -1 on write error. */
+int rdbSaveStringObject(FILE *fp, robj *obj) {
+    if (obj->encoding != REDIS_ENCODING_INT) {
+        redisAssert(obj->encoding == REDIS_ENCODING_RAW);
+        return rdbSaveRawString(fp,obj->ptr,sdslen(obj->ptr));
+    }
+    return rdbSaveLongLongAsStringObject(fp,(long)obj->ptr);
+}
+
+/* Save a double value. Doubles are saved as strings prefixed by an unsigned
+ * 8 bit integer specifying the length of the representation.
+ * This 8 bit integer has special values in order to specify the following
+ * conditions:
+ * 253: not a number
+ * 254: + inf
+ * 255: - inf
+ *
+ * Returns 0 on success, -1 on write error. */
+int rdbSaveDoubleValue(FILE *fp, double val) {
+    unsigned char buf[128];
+    int len;
+
+    if (isnan(val)) {
+        buf[0] = 253;
+        len = 1;
+    } else if (!isfinite(val)) {
+        len = 1;
+        buf[0] = (val < 0) ? 255 : 254;
+    } else {
+#if (DBL_MANT_DIG >= 52) && (LLONG_MAX == 0x7fffffffffffffffLL)
+        /* Check if the float is in a safe range to be casted into a
+         * long long. We are assuming that long long is 64 bit here.
+         * Also we are assuming that there are no implementations around where
+         * double has precision < 52 bit.
+         *
+         * Under this assumptions we test if a double is inside an interval
+         * where casting to long long is safe. Then using two castings we
+         * make sure the decimal part is zero. If all this is true we use
+         * integer printing function that is much faster. */
+        double min = -4503599627370495; /* -((2^52)-1) */
+        double max = 4503599627370496; /* 2^52 */
+        if (val > min && val < max && val == ((double)((long long)val)))
+            /* The string starts at buf+1 (buf[0] holds the length), so
+             * only sizeof(buf)-1 bytes are available. */
+            ll2string((char*)buf+1,sizeof(buf)-1,(long long)val);
+        else
+#endif
+            snprintf((char*)buf+1,sizeof(buf)-1,"%.17g",val);
+        buf[0] = strlen((char*)buf+1);
+        len = buf[0]+1;
+    }
+    if (fwrite(buf,len,1,fp) == 0) return -1;
+    return 0;
+}
+
+/* Save a Redis object on 'fp'.
+ *
+ * Strings are saved as a single string. Lists, sets, sorted sets and hashes
+ * are saved as a length (number of elements, or of field/value pairs for
+ * hashes) followed by the elements; sorted set elements are followed by
+ * their score saved as a double.
+ *
+ * Returns 0 on success, -1 on write error. Dictionary iterators are
+ * released on the error paths as well, so a failed save does not leave an
+ * iterator registered on the dictionary. */
+int rdbSaveObject(FILE *fp, robj *o) {
+    if (o->type == REDIS_STRING) {
+        /* Save a string value */
+        if (rdbSaveStringObject(fp,o) == -1) return -1;
+    } else if (o->type == REDIS_LIST) {
+        /* Save a list value */
+        if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+            unsigned char *p;
+            unsigned char *vstr;
+            unsigned int vlen;
+            long long vlong;
+
+            if (rdbSaveLen(fp,ziplistLen(o->ptr)) == -1) return -1;
+            p = ziplistIndex(o->ptr,0);
+            while(ziplistGet(p,&vstr,&vlen,&vlong)) {
+                if (vstr) {
+                    if (rdbSaveRawString(fp,vstr,vlen) == -1)
+                        return -1;
+                } else {
+                    if (rdbSaveLongLongAsStringObject(fp,vlong) == -1)
+                        return -1;
+                }
+                p = ziplistNext(o->ptr,p);
+            }
+        } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
+            list *list = o->ptr;
+            listIter li;
+            listNode *ln;
+
+            if (rdbSaveLen(fp,listLength(list)) == -1) return -1;
+            listRewind(list,&li);
+            while((ln = listNext(&li))) {
+                robj *eleobj = listNodeValue(ln);
+                if (rdbSaveStringObject(fp,eleobj) == -1) return -1;
+            }
+        } else {
+            redisPanic("Unknown list encoding");
+        }
+    } else if (o->type == REDIS_SET) {
+        /* Save a set value */
+        dict *set = o->ptr;
+        dictIterator *di = dictGetIterator(set);
+        dictEntry *de;
+
+        if (rdbSaveLen(fp,dictSize(set)) == -1) {
+            dictReleaseIterator(di);
+            return -1;
+        }
+        while((de = dictNext(di)) != NULL) {
+            robj *eleobj = dictGetEntryKey(de);
+
+            if (rdbSaveStringObject(fp,eleobj) == -1) {
+                dictReleaseIterator(di);
+                return -1;
+            }
+        }
+        dictReleaseIterator(di);
+    } else if (o->type == REDIS_ZSET) {
+        /* Save a sorted set value */
+        zset *zs = o->ptr;
+        dictIterator *di = dictGetIterator(zs->dict);
+        dictEntry *de;
+
+        if (rdbSaveLen(fp,dictSize(zs->dict)) == -1) {
+            dictReleaseIterator(di);
+            return -1;
+        }
+        while((de = dictNext(di)) != NULL) {
+            robj *eleobj = dictGetEntryKey(de);
+            double *score = dictGetEntryVal(de);
+
+            if (rdbSaveStringObject(fp,eleobj) == -1 ||
+                rdbSaveDoubleValue(fp,*score) == -1)
+            {
+                dictReleaseIterator(di);
+                return -1;
+            }
+        }
+        dictReleaseIterator(di);
+    } else if (o->type == REDIS_HASH) {
+        /* Save a hash value */
+        if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+            unsigned char *p = zipmapRewind(o->ptr);
+            unsigned int count = zipmapLen(o->ptr);
+            unsigned char *key, *val;
+            unsigned int klen, vlen;
+
+            if (rdbSaveLen(fp,count) == -1) return -1;
+            while((p = zipmapNext(p,&key,&klen,&val,&vlen)) != NULL) {
+                if (rdbSaveRawString(fp,key,klen) == -1) return -1;
+                if (rdbSaveRawString(fp,val,vlen) == -1) return -1;
+            }
+        } else {
+            dictIterator *di = dictGetIterator(o->ptr);
+            dictEntry *de;
+
+            if (rdbSaveLen(fp,dictSize((dict*)o->ptr)) == -1) {
+                dictReleaseIterator(di);
+                return -1;
+            }
+            while((de = dictNext(di)) != NULL) {
+                robj *key = dictGetEntryKey(de);
+                robj *val = dictGetEntryVal(de);
+
+                if (rdbSaveStringObject(fp,key) == -1 ||
+                    rdbSaveStringObject(fp,val) == -1)
+                {
+                    dictReleaseIterator(di);
+                    return -1;
+                }
+            }
+            dictReleaseIterator(di);
+        }
+    } else {
+        redisPanic("Unknown object type");
+    }
+    return 0;
+}
+
+/* Return the length the object will have on disk if saved with
+ * the rdbSaveObject() function. Currently we use a trick to get
+ * this length with very little changes to the code: the object is
+ * actually saved on 'fp' (server.devnull when 'fp' is NULL) after
+ * rewinding it, and the resulting file offset is returned. In the future
+ * we could switch to a faster solution.
+ *
+ * rdbSaveObject() reports failures with -1: a failed write would make the
+ * returned offset meaningless, so it is treated as a fatal condition. */
+off_t rdbSavedObjectLen(robj *o, FILE *fp) {
+    if (fp == NULL) fp = server.devnull;
+    rewind(fp);
+    redisAssert(rdbSaveObject(fp,o) != -1);
+    return ftello(fp);
+}
+
+/* Return the number of swap file pages needed to store the object, that
+ * is its serialized length divided by server.vm_page_size, rounded up. */
+off_t rdbSavedObjectPages(robj *o, FILE *fp) {
+    off_t nbytes = rdbSavedObjectLen(o,fp);
+    off_t rounded = nbytes+(server.vm_page_size-1);
+
+    return rounded/server.vm_page_size;
+}
+
+/* Save the DB on disk. Return REDIS_ERR on error, REDIS_OK on success */
+int rdbSave(char *filename) {
+ dictIterator *di = NULL;
+ dictEntry *de;
+ FILE *fp;
+ char tmpfile[256];
+ int j;
+ time_t now = time(NULL);
+
+ /* Wait for I/O therads to terminate, just in case this is a
+ * foreground-saving, to avoid seeking the swap file descriptor at the
+ * same time. */
+ if (server.vm_enabled)
+ waitEmptyIOJobsQueue();
+
+ snprintf(tmpfile,256,"temp-%d.rdb", (int) getpid());
+ fp = fopen(tmpfile,"w");
+ if (!fp) {
+ redisLog(REDIS_WARNING, "Failed saving the DB: %s", strerror(errno));
+ return REDIS_ERR;
+ }
+ if (fwrite("REDIS0001",9,1,fp) == 0) goto werr;
+ for (j = 0; j < server.dbnum; j++) {
+ redisDb *db = server.db+j;
+ dict *d = db->dict;
+ if (dictSize(d) == 0) continue;
+ di = dictGetIterator(d);
+ if (!di) {
+ fclose(fp);
+ return REDIS_ERR;
+ }
+
+ /* Write the SELECT DB opcode */
+ if (rdbSaveType(fp,REDIS_SELECTDB) == -1) goto werr;
+ if (rdbSaveLen(fp,j) == -1) goto werr;
+
+ /* Iterate this DB writing every entry */
+ while((de = dictNext(di)) != NULL) {
+ sds keystr = dictGetEntryKey(de);
+ robj key, *o = dictGetEntryVal(de);
+ time_t expiretime;
+
+ initStaticStringObject(key,keystr);
+ expiretime = getExpire(db,&key);
+
+ /* Save the expire time */
+ if (expiretime != -1) {
+ /* If this key is already expired skip it */
+ if (expiretime < now) continue;
+ if (rdbSaveType(fp,REDIS_EXPIRETIME) == -1) goto werr;
+ if (rdbSaveTime(fp,expiretime) == -1) goto werr;
+ }
+ /* Save the key and associated value. This requires special
+ * handling if the value is swapped out. */
+ if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
+ o->storage == REDIS_VM_SWAPPING) {
+ /* Save type, key, value */
+ if (rdbSaveType(fp,o->type) == -1) goto werr;
+ if (rdbSaveStringObject(fp,&key) == -1) goto werr;
+ if (rdbSaveObject(fp,o) == -1) goto werr;
+ } else {
+ /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
+ robj *po;
+ /* Get a preview of the object in memory */
+ po = vmPreviewObject(o);
+ /* Save type, key, value */
+ if (rdbSaveType(fp,po->type) == -1) goto werr;
+ if (rdbSaveStringObject(fp,&key) == -1) goto werr;
+ if (rdbSaveObject(fp,po) == -1) goto werr;
+ /* Remove the loaded object from memory */
+ decrRefCount(po);
+ }
+ }
+ dictReleaseIterator(di);
+ }
+ /* EOF opcode */
+ if (rdbSaveType(fp,REDIS_EOF) == -1) goto werr;
+
+ /* Make sure data will not remain on the OS's output buffers */
+ fflush(fp);
+ fsync(fileno(fp));
+ fclose(fp);
+
+ /* Use RENAME to make sure the DB file is changed atomically only
+ * if the generate DB file is ok. */
+ if (rename(tmpfile,filename) == -1) {
+ redisLog(REDIS_WARNING,"Error moving temp DB file on the final destination: %s", strerror(errno));
+ unlink(tmpfile);
+ return REDIS_ERR;
+ }
+ redisLog(REDIS_NOTICE,"DB saved on disk");
+ server.dirty = 0;
+ server.lastsave = time(NULL);
+ return REDIS_OK;
+
+werr:
+ fclose(fp);
+ unlink(tmpfile);
+ redisLog(REDIS_WARNING,"Write error saving DB on disk: %s", strerror(errno));
+ if (di) dictReleaseIterator(di);
+ return REDIS_ERR;
+}
+
+/* Start saving the DB to 'filename' in a child process.
+ *
+ * Nothing is done, and REDIS_ERR is returned, if a background save is
+ * already in progress or if fork() fails. The child closes the listening
+ * socket, calls rdbSave() and exits with status 0 on success or 1 on
+ * error. The parent records the child pid in server.bgsavechildpid and
+ * returns REDIS_OK without waiting for the child. */
+int rdbSaveBackground(char *filename) {
+    pid_t childpid;
+
+    if (server.bgsavechildpid != -1) return REDIS_ERR;
+    if (server.vm_enabled) waitEmptyIOJobsQueue();
+
+    childpid = fork();
+    if (childpid == 0) {
+        /* Child */
+        if (server.vm_enabled) vmReopenSwapFile();
+        close(server.fd);
+        _exit(rdbSave(filename) == REDIS_OK ? 0 : 1);
+    }
+
+    /* Parent */
+    if (childpid == -1) {
+        redisLog(REDIS_WARNING,"Can't save in background: fork: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    redisLog(REDIS_NOTICE,"Background saving started by pid %d",childpid);
+    server.bgsavechildpid = childpid;
+    updateDictResizePolicy();
+    return REDIS_OK;
+}
+
+/* Remove the temporary dump file "temp-<pid>.rdb" of the saving child
+ * with pid 'childpid'. The result of unlink() is ignored. */
+void rdbRemoveTempFile(pid_t childpid) {
+    char path[256];
+
+    snprintf(path,sizeof(path),"temp-%d.rdb", (int) childpid);
+    unlink(path);
+}
+
+/* Read a single byte type from 'fp'.
+ * Returns the byte as a non negative integer, or -1 on short read. */
+int rdbLoadType(FILE *fp) {
+    unsigned char byte;
+
+    return (fread(&byte,1,1,fp) == 0) ? -1 : byte;
+}
+
+/* Read a time saved by rdbSaveTime(): 4 bytes holding an int32_t.
+ * Returns the time, or -1 on short read. */
+time_t rdbLoadTime(FILE *fp) {
+    int32_t stamp;
+
+    if (fread(&stamp,sizeof(stamp),1,fp) == 0) return -1;
+    return (time_t) stamp;
+}
+
+/* Load a length written by rdbSaveLen() from 'fp'.
+ *
+ * The two most significant bits of the first byte select the format: a
+ * 6 bit length stored in the same byte, a 14 bit length using one more
+ * byte, or a 32 bit length stored in the next 4 bytes in network byte
+ * order. A fourth format, REDIS_RDB_ENCVAL, means that the low 6 bits are
+ * not a length but an "encoding type": in this case *isencoded is set to
+ * 1 (when 'isencoded' is not NULL), otherwise it is set to 0.
+ *
+ * Returns the length (or encoding type), REDIS_RDB_LENERR on short read. */
+uint32_t rdbLoadLen(FILE *fp, int *isencoded) {
+    unsigned char buf[2];
+    uint32_t netlen;
+    int type;
+
+    if (isencoded) *isencoded = 0;
+    if (fread(buf,1,1,fp) == 0) return REDIS_RDB_LENERR;
+    type = (buf[0]&0xC0)>>6;
+
+    if (type == REDIS_RDB_6BITLEN) {
+        /* 6 bit len */
+        return buf[0]&0x3F;
+    }
+    if (type == REDIS_RDB_ENCVAL) {
+        /* 6 bit encoding type */
+        if (isencoded) *isencoded = 1;
+        return buf[0]&0x3F;
+    }
+    if (type == REDIS_RDB_14BITLEN) {
+        /* 14 bit len */
+        if (fread(buf+1,1,1,fp) == 0) return REDIS_RDB_LENERR;
+        return ((buf[0]&0x3F)<<8)|buf[1];
+    }
+    /* 32 bit len */
+    if (fread(&netlen,4,1,fp) == 0) return REDIS_RDB_LENERR;
+    return ntohl(netlen);
+}
+
+/* Load an integer-encoded object from file 'fp', with the specified
+ * encoding type 'enctype' (one of REDIS_RDB_ENC_INT8, INT16 or INT32; the
+ * value is stored in little endian order). If 'encode' is true the
+ * function may return an integer-encoded object as reply, otherwise the
+ * returned object will always be encoded as a raw string.
+ *
+ * Returns NULL on short read. */
+robj *rdbLoadIntegerObject(FILE *fp, int enctype, int encode) {
+    unsigned char enc[4];
+    long long val;
+
+    if (enctype == REDIS_RDB_ENC_INT8) {
+        if (fread(enc,1,1,fp) == 0) return NULL;
+        val = (signed char)enc[0];
+    } else if (enctype == REDIS_RDB_ENC_INT16) {
+        uint16_t v;
+        if (fread(enc,2,1,fp) == 0) return NULL;
+        v = enc[0]|(enc[1]<<8);
+        val = (int16_t)v;
+    } else if (enctype == REDIS_RDB_ENC_INT32) {
+        uint32_t v;
+        if (fread(enc,4,1,fp) == 0) return NULL;
+        /* Compose the value using unsigned arithmetic: shifting the
+         * promoted (signed int) byte left by 24 would overflow for
+         * bytes >= 0x80. */
+        v = (uint32_t)enc[0]|
+            ((uint32_t)enc[1]<<8)|
+            ((uint32_t)enc[2]<<16)|
+            ((uint32_t)enc[3]<<24);
+        val = (int32_t)v;
+    } else {
+        val = 0; /* anti-warning */
+        redisPanic("Unknown RDB integer encoding type");
+    }
+    if (encode)
+        return createStringObjectFromLongLong(val);
+    else
+        return createObject(REDIS_STRING,sdsfromlonglong(val));
+}
+
+/* Load a string saved by rdbSaveLzfStringObject(): the compressed length
+ * and the original length are read, then the compressed payload is read
+ * and decompressed into a newly created string object.
+ * Returns NULL on short read, allocation or decompression failure. */
+robj *rdbLoadLzfStringObject(FILE*fp) {
+    unsigned int len, clen;
+    unsigned char *compressed = NULL;
+    sds plain = NULL;
+
+    clen = rdbLoadLen(fp,NULL);
+    if (clen == REDIS_RDB_LENERR) return NULL;
+    len = rdbLoadLen(fp,NULL);
+    if (len == REDIS_RDB_LENERR) return NULL;
+
+    compressed = zmalloc(clen);
+    if (compressed == NULL) goto cleanup;
+    plain = sdsnewlen(NULL,len);
+    if (plain == NULL) goto cleanup;
+    if (fread(compressed,clen,1,fp) == 0) goto cleanup;
+    if (lzf_decompress(compressed,clen,plain,len) == 0) goto cleanup;
+
+    zfree(compressed);
+    return createObject(REDIS_STRING,plain);
+
+cleanup:
+    zfree(compressed);
+    sdsfree(plain);
+    return NULL;
+}
+
+/* Load a string object from 'fp', whatever representation was used to
+ * save it: integer encoded, LZF compressed or verbatim [len][data].
+ * 'encode' is passed to rdbLoadIntegerObject() and tells if integer
+ * encoded strings may be returned as integer encoded objects.
+ * Returns NULL on short read. */
+robj *rdbGenericLoadStringObject(FILE*fp, int encode) {
+    int isencoded;
+    uint32_t len;
+    sds raw;
+
+    len = rdbLoadLen(fp,&isencoded);
+    if (isencoded) {
+        /* 'len' is actually the encoding type here */
+        if (len == REDIS_RDB_ENC_INT8 ||
+            len == REDIS_RDB_ENC_INT16 ||
+            len == REDIS_RDB_ENC_INT32)
+        {
+            return rdbLoadIntegerObject(fp,len,encode);
+        } else if (len == REDIS_RDB_ENC_LZF) {
+            return rdbLoadLzfStringObject(fp);
+        } else {
+            redisPanic("Unknown RDB encoding type");
+        }
+    }
+
+    if (len == REDIS_RDB_LENERR) return NULL;
+    raw = sdsnewlen(NULL,len);
+    if (len && fread(raw,len,1,fp) == 0) {
+        sdsfree(raw);
+        return NULL;
+    }
+    return createObject(REDIS_STRING,raw);
+}
+
+/* Load a string object asking rdbGenericLoadStringObject() not to return
+ * integer encoded objects (encode = 0). */
+robj *rdbLoadStringObject(FILE *fp) {
+    robj *o = rdbGenericLoadStringObject(fp,0);
+
+    return o;
+}
+
+/* Load a string object allowing rdbGenericLoadStringObject() to return
+ * integer encoded objects (encode = 1). */
+robj *rdbLoadEncodedStringObject(FILE *fp) {
+    robj *o = rdbGenericLoadStringObject(fp,1);
+
+    return o;
+}
+
+/* Load a double saved by rdbSaveDoubleValue() into '*val'.
+ *
+ * The first byte is either one of the special values 253 (NaN),
+ * 254 (+inf), 255 (-inf), or the length of the textual representation
+ * that follows. That length comes from the file and can be as large as
+ * 252, so the buffer must be able to hold 252 bytes plus the terminator:
+ * a smaller one could be overflowed by a corrupted file.
+ *
+ * Returns 0 on success, -1 on short read. */
+int rdbLoadDoubleValue(FILE *fp, double *val) {
+    char buf[256];
+    unsigned char len;
+
+    if (fread(&len,1,1,fp) == 0) return -1;
+    switch(len) {
+    case 255: *val = R_NegInf; return 0;
+    case 254: *val = R_PosInf; return 0;
+    case 253: *val = R_Nan; return 0;
+    default:
+        if (fread(buf,len,1,fp) == 0) return -1;
+        buf[len] = '\0';
+        sscanf(buf, "%lg", val);
+        return 0;
+    }
+}
+
+/* Load a Redis object of the specified type from the specified file.
+ * On success a newly allocated object is returned, otherwise NULL.
+ *
+ * Lists and hashes are created in their compact encoding (ziplist, zipmap)
+ * and converted to the general one when the number of elements or the
+ * size of an element exceeds the configured limits.
+ *
+ * NOTE: when NULL is returned in the middle of an aggregate value, what
+ * was loaded so far is not released. The only caller visible in this file,
+ * rdbLoad(), terminates the process in that case. */
+robj *rdbLoadObject(int type, FILE *fp) {
+    robj *o, *ele, *dec;
+    size_t len;
+
+    /* ftell() returns a long: print it as such. */
+    redisLog(REDIS_DEBUG,"LOADING OBJECT %d (at %ld)\n",type,(long)ftell(fp));
+    if (type == REDIS_STRING) {
+        /* Read string value */
+        if ((o = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
+        o = tryObjectEncoding(o);
+    } else if (type == REDIS_LIST) {
+        /* Read list value */
+        if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+
+        /* Use a real list when there are too many entries */
+        if (len > server.list_max_ziplist_entries) {
+            o = createListObject();
+        } else {
+            o = createZiplistObject();
+        }
+
+        /* Load every single element of the list */
+        while(len--) {
+            if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
+
+            /* If we are using a ziplist and the value is too big, convert
+             * the object to a real list. */
+            if (o->encoding == REDIS_ENCODING_ZIPLIST &&
+                ele->encoding == REDIS_ENCODING_RAW &&
+                sdslen(ele->ptr) > server.list_max_ziplist_value)
+                    listTypeConvert(o,REDIS_ENCODING_LINKEDLIST);
+
+            if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+                dec = getDecodedObject(ele);
+                o->ptr = ziplistPush(o->ptr,dec->ptr,sdslen(dec->ptr),REDIS_TAIL);
+                decrRefCount(dec);
+                decrRefCount(ele);
+            } else {
+                ele = tryObjectEncoding(ele);
+                listAddNodeTail(o->ptr,ele);
+            }
+        }
+    } else if (type == REDIS_SET) {
+        /* Read set value */
+        if ((len = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+        o = createSetObject();
+        /* It's faster to expand the dict to the right size asap in order
+         * to avoid rehashing */
+        if (len > DICT_HT_INITIAL_SIZE)
+            dictExpand(o->ptr,len);
+        /* Load every single element of the set */
+        while(len--) {
+            if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
+            ele = tryObjectEncoding(ele);
+            dictAdd((dict*)o->ptr,ele,NULL);
+        }
+    } else if (type == REDIS_ZSET) {
+        /* Read sorted set value */
+        size_t zsetlen;
+        zset *zs;
+
+        if ((zsetlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+        o = createZsetObject();
+        zs = o->ptr;
+        /* Load every single element of the sorted set */
+        while(zsetlen--) {
+            robj *ele;
+            double *score = zmalloc(sizeof(double));
+
+            if ((ele = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
+            ele = tryObjectEncoding(ele);
+            if (rdbLoadDoubleValue(fp,score) == -1) return NULL;
+            dictAdd(zs->dict,ele,score);
+            zslInsert(zs->zsl,*score,ele);
+            incrRefCount(ele); /* added to skiplist */
+        }
+    } else if (type == REDIS_HASH) {
+        size_t hashlen;
+
+        if ((hashlen = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR) return NULL;
+        o = createHashObject();
+        /* Too many entries? Use an hash table. */
+        if (hashlen > server.hash_max_zipmap_entries)
+            convertToRealHash(o);
+        /* Load every key/value, then set it into the zipmap or hash
+         * table, as needed. */
+        while(hashlen--) {
+            robj *key, *val;
+
+            if ((key = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
+            if ((val = rdbLoadEncodedStringObject(fp)) == NULL) return NULL;
+            /* If we are using a zipmap and there are too big values
+             * the object is converted to real hash table encoding. */
+            if (o->encoding != REDIS_ENCODING_HT &&
+               ((key->encoding == REDIS_ENCODING_RAW &&
+                sdslen(key->ptr) > server.hash_max_zipmap_value) ||
+                (val->encoding == REDIS_ENCODING_RAW &&
+                sdslen(val->ptr) > server.hash_max_zipmap_value)))
+            {
+                    convertToRealHash(o);
+            }
+
+            if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+                unsigned char *zm = o->ptr;
+                robj *deckey, *decval;
+
+                /* We need raw string objects to add them to the zipmap */
+                deckey = getDecodedObject(key);
+                decval = getDecodedObject(val);
+                zm = zipmapSet(zm,deckey->ptr,sdslen(deckey->ptr),
+                                  decval->ptr,sdslen(decval->ptr),NULL);
+                o->ptr = zm;
+                decrRefCount(deckey);
+                decrRefCount(decval);
+                decrRefCount(key);
+                decrRefCount(val);
+            } else {
+                key = tryObjectEncoding(key);
+                val = tryObjectEncoding(val);
+                dictAdd((dict*)o->ptr,key,val);
+            }
+        }
+    } else {
+        redisPanic("Unknown object type");
+    }
+    return o;
+}
+
+/* Load the dataset stored in the RDB file 'filename' into memory.
+ *
+ * Returns REDIS_OK on success. REDIS_ERR is returned if the file can't be
+ * opened, if it does not start with the "REDIS" signature, or if its
+ * format version is not 1. Any other problem terminates the process with
+ * exit(1): a short read, a SELECT DB opcode with an index not below
+ * server.dbnum, or a duplicated key.
+ *
+ * Keys whose expire time is already in the past are discarded. When the
+ * virtual memory is enabled, values are swapped out while loading in
+ * order to keep memory usage near server.vm_max_memory. */
+int rdbLoad(char *filename) {
+ FILE *fp;
+ uint32_t dbid;
+ int type, retval, rdbver;
+ int swap_all_values = 0;
+ redisDb *db = server.db+0;
+ char buf[1024];
+ time_t expiretime, now = time(NULL);
+
+ fp = fopen(filename,"r");
+ if (!fp) return REDIS_ERR;
+ /* The header is 9 bytes: "REDIS" followed by a 4 digits version. */
+ if (fread(buf,9,1,fp) == 0) goto eoferr;
+ buf[9] = '\0';
+ if (memcmp(buf,"REDIS",5) != 0) {
+ fclose(fp);
+ redisLog(REDIS_WARNING,"Wrong signature trying to load DB from file");
+ return REDIS_ERR;
+ }
+ rdbver = atoi(buf+5);
+ if (rdbver != 1) {
+ fclose(fp);
+ redisLog(REDIS_WARNING,"Can't handle RDB format version %d",rdbver);
+ return REDIS_ERR;
+ }
+ while(1) {
+ robj *key, *val;
+ int force_swapout;
+
+ expiretime = -1;
+ /* Read type. */
+ if ((type = rdbLoadType(fp)) == -1) goto eoferr;
+ if (type == REDIS_EXPIRETIME) {
+ if ((expiretime = rdbLoadTime(fp)) == -1) goto eoferr;
+ /* We read the time so we need to read the object type again */
+ if ((type = rdbLoadType(fp)) == -1) goto eoferr;
+ }
+ if (type == REDIS_EOF) break;
+ /* Handle SELECT DB opcode as a special case */
+ if (type == REDIS_SELECTDB) {
+ if ((dbid = rdbLoadLen(fp,NULL)) == REDIS_RDB_LENERR)
+ goto eoferr;
+ if (dbid >= (unsigned)server.dbnum) {
+ redisLog(REDIS_WARNING,"FATAL: Data file was created with a Redis server configured to handle more than %d databases. Exiting\n", server.dbnum);
+ exit(1);
+ }
+ db = server.db+dbid;
+ continue;
+ }
+ /* Read key */
+ if ((key = rdbLoadStringObject(fp)) == NULL) goto eoferr;
+ /* Read value */
+ if ((val = rdbLoadObject(type,fp)) == NULL) goto eoferr;
+ /* Check if the key already expired */
+ if (expiretime != -1 && expiretime < now) {
+ decrRefCount(key);
+ decrRefCount(val);
+ continue;
+ }
+ /* Add the new object in the hash table */
+ retval = dbAdd(db,key,val);
+ if (retval == REDIS_ERR) {
+ redisLog(REDIS_WARNING,"Loading DB, duplicated key (%s) found! Unrecoverable error, exiting now.", key->ptr);
+ exit(1);
+ }
+ /* Set the expire time if needed */
+ if (expiretime != -1) setExpire(db,key,expiretime);
+
+ /* Handle swapping while loading big datasets when VM is on */
+
+ /* If we detect we are hopeless about fitting something in memory
+ * we just swap every new key on disk. Directly...
+ * Note that's important to check for this condition before resorting
+ * to random sampling, otherwise we may try to swap already
+ * swapped keys. */
+ if (swap_all_values) {
+ dictEntry *de = dictFind(db->dict,key->ptr);
+
+ /* de may be NULL since the key already expired */
+ if (de) {
+ vmpointer *vp;
+ val = dictGetEntryVal(de);
+
+ /* Swap the value only if nobody else holds a reference to it,
+ * replacing it in the dictionary with the returned pointer. */
+ if (val->refcount == 1 &&
+ (vp = vmSwapObjectBlocking(val)) != NULL)
+ dictGetEntryVal(de) = vp;
+ }
+ decrRefCount(key);
+ continue;
+ }
+ /* The key object was only needed for dbAdd() / setExpire(). */
+ decrRefCount(key);
+
+ /* Flush data on disk once 32 MB of additional RAM are used... */
+ force_swapout = 0;
+ if ((zmalloc_used_memory() - server.vm_max_memory) > 1024*1024*32)
+ force_swapout = 1;
+
+ /* If we have still some hope of having some value fitting memory
+ * then we try random sampling. */
+ if (!swap_all_values && server.vm_enabled && force_swapout) {
+ while (zmalloc_used_memory() > server.vm_max_memory) {
+ if (vmSwapOneObjectBlocking() == REDIS_ERR) break;
+ }
+ if (zmalloc_used_memory() > server.vm_max_memory)
+ swap_all_values = 1; /* We are already using too much mem */
+ }
+ }
+ fclose(fp);
+ return REDIS_OK;
+
+eoferr: /* unexpected end of file is handled here with a fatal exit */
+ redisLog(REDIS_WARNING,"Short read or OOM loading DB. Unrecoverable error, aborting now.");
+ exit(1);
+ return REDIS_ERR; /* Just to avoid warning */
+}
+
+/* A background saving child (BGSAVE) terminated its work. Handle this.
+ *
+ * 'statloc' is the child status as returned by the wait family of calls.
+ * The save is considered successful only if the child exited normally
+ * with status 0: the exit status is meaningless for a child terminated
+ * by a signal, so it must not be used alone to detect success. */
+void backgroundSaveDoneHandler(int statloc) {
+    int exitcode = WEXITSTATUS(statloc);
+    int bysignal = WIFSIGNALED(statloc);
+    int success = (!bysignal && exitcode == 0);
+
+    if (success) {
+        redisLog(REDIS_NOTICE,
+            "Background saving terminated with success");
+        server.dirty = 0;
+        server.lastsave = time(NULL);
+    } else if (!bysignal && exitcode != 0) {
+        redisLog(REDIS_WARNING, "Background saving error");
+    } else {
+        redisLog(REDIS_WARNING,
+            "Background saving terminated by signal %d", WTERMSIG(statloc));
+        /* The child had no chance to clean up its temp file */
+        rdbRemoveTempFile(server.bgsavechildpid);
+    }
+    server.bgsavechildpid = -1;
+    /* Possibly there are slaves waiting for a BGSAVE in order to be served
+     * (the first stage of SYNC is a bulk transfer of dump.rdb) */
+    updateSlavesWaitingBgsave(success ? REDIS_OK : REDIS_ERR);
+}
diff --git a/src/redis-benchmark.c b/src/redis-benchmark.c
new file mode 100644
index 000000000..123d81180
--- /dev/null
+++ b/src/redis-benchmark.c
@@ -0,0 +1,665 @@
+/* Redis benchmark utility.
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fmacros.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <assert.h>
+
+#include "ae.h"
+#include "anet.h"
+#include "sds.h"
+#include "adlist.h"
+#include "zmalloc.h"
+
+/* Kind of reply expected for the benchmarked command (client->replytype).
+ * INT and RETCODE replies are complete once a full line was read; BULK
+ * replies are a "$<count>" line followed by the payload; MBULK replies are a
+ * "*<count>" line followed by that many bulk replies. */
+#define REPLY_INT 0
+#define REPLY_RETCODE 1
+#define REPLY_BULK 2
+#define REPLY_MBULK 3
+
+/* Client life cycle states (client->state). */
+#define CLIENT_CONNECTING 0
+#define CLIENT_SENDQUERY 1
+#define CLIENT_READREPLY 2
+
+/* Latencies are clamped to this many milliseconds; the latency histogram
+ * has MAX_LATENCY+1 buckets. */
+#define MAX_LATENCY 5000
+
+/* Evaluate V as a void expression; used on handler arguments that are not
+ * otherwise referenced. */
+#define REDIS_NOTUSED(V) ((void) V)
+
+/* Global benchmark state: command line settings plus run-time counters. */
+static struct config {
+ int debug; /* -D: print the received byte total when it changes */
+ int numclients; /* -c: number of parallel connections to keep alive */
+ int requests; /* -n: requests after which the event loop is stopped */
+ int liveclients; /* clients currently allocated (see createClient/freeClient) */
+ int donerequests; /* replies completed in the current benchmark */
+ int keysize; /* NOTE(review): not referenced in the visible code */
+ int datasize; /* -d: SET payload size in bytes, clamped to 1..1MB */
+ int randomkeys; /* -r: non-zero when keys must be randomized */
+ int randomkeys_keyspacelen; /* -r argument: modulus of the random key number */
+ aeEventLoop *el; /* event loop driving all the clients */
+ char *hostip; /* -h: server address */
+ int hostport; /* -p: server port */
+ int keepalive; /* -k: 1 = reuse the connection, 0 = reconnect per request */
+ long long start; /* mstime() taken when the current benchmark started */
+ long long totlatency; /* elapsed milliseconds of the finished benchmark */
+ int *latency; /* histogram of MAX_LATENCY+1 buckets, indexed by milliseconds */
+ list *clients; /* list of all the allocated clients */
+ int quiet; /* -q: only print the requests per second figure */
+ int loop; /* -l: repeat the whole test suite forever */
+ int idlemode; /* -I: just open idle connections and wait */
+} config;
+
+/* State of a single benchmark connection. */
+typedef struct _client {
+ int state; /* one of the CLIENT_* states */
+ int fd; /* socket connected to the server */
+ sds obuf; /* query to send */
+ sds ibuf; /* reply bytes received and not yet consumed */
+ int mbulk; /* Number of elements in an mbulk reply */
+ int readlen; /* readlen == -1 means read a single line */
+ int totreceived; /* bytes read from the socket for the current request */
+ unsigned int written; /* bytes of 'obuf' already written */
+ int replytype; /* one of the REPLY_* kinds */
+ long long start; /* start time in milliseconds */
+} *client;
+
+/* Prototypes */
+static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask);
+static void createMissingClients(client c);
+
+/* Implementation */
+/* Return the current time of day, as reported by gettimeofday(), expressed
+ * in milliseconds. */
+static long long mstime(void) {
+    struct timeval tv;
+    long long mst;
+
+    gettimeofday(&tv, NULL);
+    /* Widen to long long *before* multiplying: where 'long' is 32 bits wide
+     * seconds*1000 does not fit and the multiplication would overflow. */
+    mst = ((long long)tv.tv_sec)*1000;
+    mst += tv.tv_usec/1000;
+    return mst;
+}
+
+/* Release a client: unregister its file events, unlink it from the global
+ * clients list, update the live clients counter, then free its buffers,
+ * socket and memory. */
+static void freeClient(client c) {
+    listNode *ln;
+
+    aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE);
+    aeDeleteFileEvent(config.el,c->fd,AE_READABLE);
+
+    /* Unlink from the list while 'c' is still a valid pointer: the lookup
+     * uses the pointer value as the search key, so it must not be performed
+     * after the memory was released. */
+    ln = listSearchKey(config.clients,c);
+    assert(ln != NULL);
+    listDelNode(config.clients,ln);
+    config.liveclients--;
+
+    sdsfree(c->ibuf);
+    sdsfree(c->obuf);
+    close(c->fd);
+    zfree(c);
+}
+
+/* Free every client in the global list. The successor is saved before each
+ * release because freeClient() removes the current node from the list. */
+static void freeAllClients(void) {
+    listNode *node, *following;
+
+    for (node = config.clients->head; node != NULL; node = following) {
+        following = node->next;
+        freeClient(node->value);
+    }
+}
+
+/* Prepare a kept-alive client for its next request: re-arm the write
+ * handler, drop the received reply, reset the parsing state, restart the
+ * latency clock and finally create any missing client. */
+static void resetClient(client c) {
+    int counted_reply = (c->replytype == REPLY_BULK ||
+                         c->replytype == REPLY_MBULK);
+
+    aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE);
+    aeDeleteFileEvent(config.el,c->fd,AE_READABLE);
+    aeCreateFileEvent(config.el,c->fd, AE_WRITABLE,writeHandler,c);
+
+    sdsfree(c->ibuf);
+    c->ibuf = sdsempty();
+
+    /* Bulk and multi bulk replies start with a count line (readlen -1). */
+    if (counted_reply)
+        c->readlen = -1;
+    else
+        c->readlen = 0;
+    c->mbulk = -1;
+    c->written = 0;
+    c->totreceived = 0;
+    c->state = CLIENT_SENDQUERY;
+    c->start = mstime();
+    createMissingClients(c);
+}
+
+/* Replace the run of digits following "_rand" in the client query with a
+ * random number in the range 0 .. randomkeys_keyspacelen-1.
+ *
+ * The number is written zero padded over the whole digit field, so digits
+ * left by a previous (longer) number never leak into the new key and the
+ * key always stays inside the requested key space. Nothing outside of the
+ * digit field is ever touched. A key space of zero always selects key 0
+ * instead of dividing by zero. */
+static void randomizeClientKey(client c) {
+    char *p;
+    char buf[32];
+    size_t field, len;
+    long r = 0;
+
+    p = strstr(c->obuf, "_rand");
+    if (!p) return;
+    p += 5;
+
+    /* Width of the placeholder to overwrite. */
+    field = strspn(p,"0123456789");
+    if (field == 0) return;
+    if (field > sizeof(buf)-1) field = sizeof(buf)-1;
+
+    if (config.randomkeys_keyspacelen > 0)
+        r = random() % config.randomkeys_keyspacelen;
+    snprintf(buf,sizeof(buf),"%0*ld",(int)field,r);
+
+    /* If the number is wider than the field keep its least significant
+     * digits, otherwise 'len' equals 'field' thanks to the zero padding. */
+    len = strlen(buf);
+    memcpy(p,buf+(len-field),field);
+}
+
+/* Record the kind of reply the client has to parse and set up the initial
+ * parsing state accordingly. */
+static void prepareClientForReply(client c, int type) {
+    switch(type) {
+    case REPLY_MBULK:
+        c->mbulk = -1;
+        /* fall through: a multi bulk reply starts with a count line too */
+    case REPLY_BULK:
+        c->readlen = -1;
+        break;
+    default:
+        c->readlen = 0;
+        break;
+    }
+    c->replytype = type;
+}
+
+/* Called when the reply of a request was fully received: account the
+ * latency sample, stop the event loop once all the requests are done,
+ * otherwise either recycle the client (keepalive) or replace it with a
+ * fresh connection. */
+static void clientDone(client c) {
+    static int last_tot_received = 1;
+    long long latency;
+
+    config.donerequests ++;
+    latency = mstime() - c->start;
+    /* The time of day may be stepped backward while a request is in flight:
+     * clamp on both sides so the histogram index is always valid. */
+    if (latency < 0) latency = 0;
+    if (latency > MAX_LATENCY) latency = MAX_LATENCY;
+    config.latency[latency]++;
+
+    if (config.debug && last_tot_received != c->totreceived) {
+        printf("Tot bytes received: %d\n", c->totreceived);
+        last_tot_received = c->totreceived;
+    }
+    if (config.donerequests == config.requests) {
+        freeClient(c);
+        aeStop(config.el);
+        return;
+    }
+    if (config.keepalive) {
+        resetClient(c);
+        if (config.randomkeys) randomizeClientKey(c);
+    } else {
+        /* Pretend the client is already gone so that a replacement gets
+         * created while 'c' can still be used as template, then restore the
+         * counter because freeClient() decrements it again. */
+        config.liveclients--;
+        createMissingClients(c);
+        config.liveclients++;
+        freeClient(c);
+    }
+}
+
+/* Readable event handler: append the bytes available on the socket to the
+ * client input buffer, then parse as much of the reply as possible.
+ *
+ * 'privdata' is the client; 'el', 'fd' and 'mask' are not used. A read error
+ * or an end of file frees the client. Single line replies (REPLY_INT,
+ * REPLY_RETCODE) are complete as soon as a newline is found. For bulk and
+ * multi bulk replies the count line is parsed first, then the handler checks
+ * whether the whole payload is in the buffer. clientDone() is called when
+ * the reply is complete. */
+static void readHandler(aeEventLoop *el, int fd, void *privdata, int mask)
+{
+ char buf[1024];
+ int nread;
+ client c = privdata;
+ REDIS_NOTUSED(el);
+ REDIS_NOTUSED(fd);
+ REDIS_NOTUSED(mask);
+
+ nread = read(c->fd, buf, 1024);
+ if (nread == -1) {
+ fprintf(stderr, "Reading from socket: %s\n", strerror(errno));
+ freeClient(c);
+ return;
+ }
+ if (nread == 0) {
+ fprintf(stderr, "EOF from client\n");
+ freeClient(c);
+ return;
+ }
+ c->totreceived += nread;
+ c->ibuf = sdscatlen(c->ibuf,buf,nread);
+
+processdata:
+ /* Are we waiting for the first line of the reply, or for the
+ * count line of a bulk or multi bulk reply? */
+ if (c->replytype == REPLY_INT ||
+ c->replytype == REPLY_RETCODE ||
+ (c->replytype == REPLY_BULK && c->readlen == -1) ||
+ (c->replytype == REPLY_MBULK && c->readlen == -1) ||
+ (c->replytype == REPLY_MBULK && c->mbulk == -1)) {
+ char *p;
+
+ /* Check if the first line is complete. This is only true if
+ * there is a newline inside the buffer. */
+ if ((p = strchr(c->ibuf,'\n')) != NULL) {
+ if (c->replytype == REPLY_BULK ||
+ (c->replytype == REPLY_MBULK && c->mbulk != -1))
+ {
+ /* Read the count of a bulk reply (being it a single bulk or
+ * a multi bulk reply). "$<count>" for the protocol spec. */
+ /* NOTE(review): *(p-1) assumes at least one byte (the '\r')
+ * precedes the newline in the buffer -- confirm. */
+ *p = '\0';
+ *(p-1) = '\0';
+ /* +2 accounts for the "\r\n" trailing the payload */
+ c->readlen = atoi(c->ibuf+1)+2;
+ // printf("BULK ATOI: %s\n", c->ibuf+1);
+ /* Handle null bulk reply "$-1" */
+ if (c->readlen-2 == -1) {
+ clientDone(c);
+ return;
+ }
+ /* Leave all the rest in the input buffer */
+ c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1);
+ /* fall through to reach the point where the code will try
+ * to check if the bulk reply is complete. */
+ } else if (c->replytype == REPLY_MBULK && c->mbulk == -1) {
+ /* Read the count of a multi bulk reply. That is, how many
+ * bulk replies we have to read next. "*<count>" protocol. */
+ *p = '\0';
+ *(p-1) = '\0';
+ c->mbulk = atoi(c->ibuf+1);
+ /* Handle null bulk reply "*-1" */
+ if (c->mbulk == -1) {
+ clientDone(c);
+ return;
+ }
+ // printf("%p) %d elements list\n", c, c->mbulk);
+ /* Leave all the rest in the input buffer */
+ c->ibuf = sdsrange(c->ibuf,(p-c->ibuf)+1,-1);
+ goto processdata;
+ } else {
+ /* Single line reply: nothing else to wait for. */
+ c->ibuf = sdstrim(c->ibuf,"\r\n");
+ clientDone(c);
+ return;
+ }
+ }
+ }
+ /* bulk read, did we read everything? */
+ if (((c->replytype == REPLY_MBULK && c->mbulk != -1) ||
+ (c->replytype == REPLY_BULK)) && c->readlen != -1 &&
+ (unsigned)c->readlen <= sdslen(c->ibuf))
+ {
+ // printf("BULKSTATUS mbulk:%d readlen:%d sdslen:%d\n",
+ // c->mbulk,c->readlen,sdslen(c->ibuf));
+ if (c->replytype == REPLY_BULK) {
+ clientDone(c);
+ } else if (c->replytype == REPLY_MBULK) {
+ // printf("%p) %d (%d)) ",c, c->mbulk, c->readlen);
+ // fwrite(c->ibuf,c->readlen,1,stdout);
+ // printf("\n");
+ if (--c->mbulk == 0) {
+ clientDone(c);
+ } else {
+ /* Discard the element just completed and go parse the
+ * count line of the next one. */
+ c->ibuf = sdsrange(c->ibuf,c->readlen,-1);
+ c->readlen = -1;
+ goto processdata;
+ }
+ }
+ }
+}
+
+/* Writable event handler: send the part of the query not yet written. Once
+ * the whole query is on the wire the client is switched to the readable
+ * handler in order to wait for the reply. A write error frees the client. */
+static void writeHandler(aeEventLoop *el, int fd, void *privdata, int mask)
+{
+    client c = privdata;
+    int pending, sent;
+    REDIS_NOTUSED(el);
+    REDIS_NOTUSED(fd);
+    REDIS_NOTUSED(mask);
+
+    /* First writable event: the latency clock starts now. */
+    if (c->state == CLIENT_CONNECTING) {
+        c->state = CLIENT_SENDQUERY;
+        c->start = mstime();
+    }
+
+    /* Nothing left to send. */
+    if (!(sdslen(c->obuf) > c->written)) return;
+
+    pending = sdslen(c->obuf) - c->written;
+    sent = write(c->fd, c->obuf+c->written, pending);
+    if (sent == -1) {
+        if (errno != EPIPE)
+            fprintf(stderr, "Writing to socket: %s\n", strerror(errno));
+        freeClient(c);
+        return;
+    }
+    c->written += sent;
+
+    /* Partial write: wait for the next writable event. */
+    if (sdslen(c->obuf) != c->written) return;
+
+    aeDeleteFileEvent(config.el,c->fd,AE_WRITABLE);
+    aeCreateFileEvent(config.el,c->fd,AE_READABLE,readHandler,c);
+    c->state = CLIENT_READREPLY;
+}
+
+/* Allocate a client, start a non blocking connection to the configured
+ * server and register the client for writable events. Returns NULL, after
+ * printing the error, when the connection can't be started. */
+static client createClient(void) {
+    char err[ANET_ERR_LEN];
+    client c;
+
+    c = zmalloc(sizeof(*c));
+    c->fd = anetTcpNonBlockConnect(err,config.hostip,config.hostport);
+    if (c->fd == ANET_ERR) {
+        zfree(c);
+        fprintf(stderr,"Connect: %s\n",err);
+        return NULL;
+    }
+    anetTcpNoDelay(NULL,c->fd);
+
+    /* Initial state: empty buffers, nothing written nor received yet. */
+    c->state = CLIENT_CONNECTING;
+    c->obuf = sdsempty();
+    c->ibuf = sdsempty();
+    c->written = 0;
+    c->totreceived = 0;
+    c->readlen = 0;
+    c->mbulk = -1;
+
+    aeCreateFileEvent(config.el, c->fd, AE_WRITABLE, writeHandler, c);
+    config.liveclients++;
+    listAddNodeTail(config.clients,c);
+    return c;
+}
+
+/* Create clients until 'numclients' of them are alive. Every new client
+ * gets a copy of the query of the template client 'c' and the same reply
+ * type. With random keys enabled the template key is randomized after each
+ * copy. A failed creation is simply retried. */
+static void createMissingClients(client c) {
+    client fresh;
+
+    while(config.liveclients < config.numclients) {
+        fresh = createClient();
+        if (fresh != NULL) {
+            sdsfree(fresh->obuf);
+            fresh->obuf = sdsdup(c->obuf);
+            if (config.randomkeys) randomizeClientKey(c);
+            prepareClientForReply(fresh,c->replytype);
+        }
+    }
+}
+
+/* Print the outcome of the benchmark called 'title': a single line with the
+ * requests per second in quiet mode, otherwise the full report including the
+ * cumulative latency distribution. */
+static void showLatencyReport(char *title) {
+    int ms, cumulative = 0;
+    float percentage, reqpersec;
+
+    reqpersec = (float)config.donerequests/((float)config.totlatency/1000);
+
+    if (config.quiet) {
+        printf("%s: %.2f requests per second\n", title, reqpersec);
+        return;
+    }
+
+    printf("====== %s ======\n", title);
+    printf(" %d requests completed in %.2f seconds\n", config.donerequests,
+        (float)config.totlatency/1000);
+    printf(" %d parallel clients\n", config.numclients);
+    printf(" %d bytes payload\n", config.datasize);
+    printf(" keep alive: %d\n", config.keepalive);
+    printf("\n");
+    /* Only the buckets holding at least one sample are printed. */
+    for (ms = 0; ms <= MAX_LATENCY; ms++) {
+        if (!config.latency[ms]) continue;
+        cumulative += config.latency[ms];
+        percentage = ((float)cumulative*100)/config.donerequests;
+        printf("%.2f%% <= %d milliseconds\n", percentage, ms);
+    }
+    printf("%.2f requests per second\n\n", reqpersec);
+}
+
+/* Reset the per-benchmark state: completed requests counter, latency
+ * histogram and start time. */
+static void prepareForBenchmark(void)
+{
+    config.donerequests = 0;
+    memset(config.latency,0,(MAX_LATENCY+1)*sizeof(int));
+    config.start = mstime();
+}
+
+/* Close the benchmark called 'title': compute the elapsed time, print the
+ * report and release all the clients. */
+static void endBenchmark(char *title) {
+    long long now = mstime();
+
+    config.totlatency = now - config.start;
+    showLatencyReport(title);
+    freeAllClients();
+}
+
+/* Parse the command line into the global 'config'.
+ *
+ * Options taking an argument are only recognized when an argument follows.
+ * An unknown option, or an option whose argument is missing, prints the
+ * usage and exits with status 1; the same happens when the host name given
+ * with -h can't be resolved. */
+void parseOptions(int argc, char **argv) {
+    int i;
+
+    for (i = 1; i < argc; i++) {
+        int lastarg = i==argc-1;
+
+        if (!strcmp(argv[i],"-c") && !lastarg) {
+            config.numclients = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-n") && !lastarg) {
+            config.requests = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-k") && !lastarg) {
+            config.keepalive = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-h") && !lastarg) {
+            char *ip = zmalloc(32);
+            if (anetResolve(NULL,argv[i+1],ip) == ANET_ERR) {
+                /* Report the host name, not the "-h" option itself. */
+                printf("Can't resolve %s\n", argv[i+1]);
+                exit(1);
+            }
+            config.hostip = ip;
+            i++;
+        } else if (!strcmp(argv[i],"-p") && !lastarg) {
+            config.hostport = atoi(argv[i+1]);
+            i++;
+        } else if (!strcmp(argv[i],"-d") && !lastarg) {
+            config.datasize = atoi(argv[i+1]);
+            i++;
+            if (config.datasize < 1) config.datasize=1;
+            if (config.datasize > 1024*1024) config.datasize = 1024*1024;
+        } else if (!strcmp(argv[i],"-r") && !lastarg) {
+            config.randomkeys = 1;
+            config.randomkeys_keyspacelen = atoi(argv[i+1]);
+            if (config.randomkeys_keyspacelen < 0)
+                config.randomkeys_keyspacelen = 0;
+            i++;
+        } else if (!strcmp(argv[i],"-q")) {
+            config.quiet = 1;
+        } else if (!strcmp(argv[i],"-l")) {
+            config.loop = 1;
+        } else if (!strcmp(argv[i],"-D")) {
+            config.debug = 1;
+        } else if (!strcmp(argv[i],"-I")) {
+            config.idlemode = 1;
+        } else {
+            printf("Wrong option '%s' or option argument missing\n\n",argv[i]);
+            printf("Usage: redis-benchmark [-h <host>] [-p <port>] [-c <clients>] [-n <requests>] [-k <boolean>]\n\n");
+            printf(" -h <hostname> Server hostname (default 127.0.0.1)\n");
+            printf(" -p <port> Server port (default 6379)\n");
+            printf(" -c <clients> Number of parallel connections (default 50)\n");
+            printf(" -n <requests> Total number of requests (default 10000)\n");
+            printf(" -d <size> Data size of SET/GET value in bytes (default 3)\n");
+            printf(" -k <boolean> 1=keep alive 0=reconnect (default 1)\n");
+            printf(" -r <keyspacelen> Use random keys for SET/GET/INCR, random values for SADD\n");
+            printf(" Using this option the benchmark will get/set keys\n");
+            printf(" in the form mykey_rand000000012456 instead of constant\n");
+            printf(" keys, the <keyspacelen> argument determines the max\n");
+            printf(" number of values for the random number. For instance\n");
+            printf(" if set to 10 only rand000000000000 - rand000000000009\n");
+            printf(" range will be allowed.\n");
+            printf(" -q Quiet. Just show query/sec values\n");
+            printf(" -l Loop. Run the tests forever\n");
+            printf(" -I Idle mode. Just open N idle connections and wait.\n");
+            printf(" -D Debug mode. more verbose.\n");
+            exit(1);
+        }
+    }
+}
+
+/* Run one benchmark: create a template client whose query is the NUL
+ * terminated string 'cmd' and that expects a reply of kind 'replytype',
+ * create all the other clients as copies of it, run the event loop until
+ * the configured number of requests is done and print the report under
+ * 'title'. Exits with status 1 if the first client can't be created. */
+static void benchmarkCommand(char *title, char *cmd, int replytype) {
+    client c;
+
+    prepareForBenchmark();
+    c = createClient();
+    if (!c) exit(1);
+    c->obuf = sdscat(c->obuf,cmd);
+    prepareClientForReply(c,replytype);
+    createMissingClients(c);
+    aeMain(config.el);
+    endBenchmark(title);
+}
+
+/* Entry point: set the defaults, parse the options, then either open idle
+ * connections (-I) or run the whole benchmark suite, forever when -l was
+ * given. */
+int main(int argc, char **argv) {
+    client c;
+
+    signal(SIGHUP, SIG_IGN);
+    signal(SIGPIPE, SIG_IGN);
+
+    config.debug = 0;
+    config.numclients = 50;
+    config.requests = 10000;
+    config.liveclients = 0;
+    config.el = aeCreateEventLoop();
+    config.keepalive = 1;
+    config.donerequests = 0;
+    config.datasize = 3;
+    config.randomkeys = 0;
+    config.randomkeys_keyspacelen = 0;
+    config.quiet = 0;
+    config.loop = 0;
+    config.idlemode = 0;
+    config.clients = listCreate();
+    config.latency = zmalloc(sizeof(int)*(MAX_LATENCY+1));
+
+    config.hostip = "127.0.0.1";
+    config.hostport = 6379;
+
+    parseOptions(argc,argv);
+
+    if (config.keepalive == 0) {
+        printf("WARNING: keepalive disabled, you probably need 'echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse' for Linux and 'sudo sysctl -w net.inet.tcp.msl=1000' for Mac OS X in order to use a lot of clients/requests\n");
+    }
+
+    if (config.idlemode) {
+        printf("Creating %d idle connections and waiting forever (Ctrl+C when done)\n", config.numclients);
+        prepareForBenchmark();
+        c = createClient();
+        if (!c) exit(1);
+        /* The query is left empty as created by createClient(): assigning
+         * a new empty string here would only leak the existing one. */
+        prepareClientForReply(c,REPLY_RETCODE); /* will never receive it */
+        createMissingClients(c);
+        aeMain(config.el);
+        /* and will wait for every */
+    }
+
+    do {
+        benchmarkCommand("PING","PING\r\n",REPLY_RETCODE);
+        benchmarkCommand("PING (multi bulk)","*1\r\n$4\r\nPING\r\n",
+            REPLY_RETCODE);
+
+        {
+            /* SET carries a payload of 'datasize' bytes, so the query is
+             * built at run time and released once the benchmark is over. */
+            sds cmd = sdscatprintf(sdsempty(),
+                "SET foo_rand000000000000 %d\r\n",config.datasize);
+            char *data = zmalloc(config.datasize+2);
+
+            memset(data,'x',config.datasize);
+            data[config.datasize] = '\r';
+            data[config.datasize+1] = '\n';
+            cmd = sdscatlen(cmd,data,config.datasize+2);
+            zfree(data);
+            benchmarkCommand("SET",cmd,REPLY_RETCODE);
+            sdsfree(cmd);
+        }
+
+        benchmarkCommand("GET","GET foo_rand000000000000\r\n",REPLY_BULK);
+        benchmarkCommand("INCR","INCR counter_rand000000000000\r\n",
+            REPLY_INT);
+        benchmarkCommand("LPUSH","LPUSH mylist 3\r\nbar\r\n",REPLY_INT);
+        benchmarkCommand("LPOP","LPOP mylist\r\n",REPLY_BULK);
+        benchmarkCommand("SADD",
+            "SADD myset 24\r\ncounter_rand000000000000\r\n",REPLY_RETCODE);
+        benchmarkCommand("SPOP","SPOP myset\r\n",REPLY_BULK);
+        benchmarkCommand("LPUSH (again, in order to bench LRANGE)",
+            "LPUSH mylist 3\r\nbar\r\n",REPLY_RETCODE);
+        benchmarkCommand("LRANGE (first 100 elements)",
+            "LRANGE mylist 0 99\r\n",REPLY_MBULK);
+        benchmarkCommand("LRANGE (first 300 elements)",
+            "LRANGE mylist 0 299\r\n",REPLY_MBULK);
+        benchmarkCommand("LRANGE (first 450 elements)",
+            "LRANGE mylist 0 449\r\n",REPLY_MBULK);
+        benchmarkCommand("LRANGE (first 600 elements)",
+            "LRANGE mylist 0 599\r\n",REPLY_MBULK);
+
+        printf("\n");
+    } while(config.loop);
+
+    return 0;
+}
diff --git a/src/redis-check-aof.c b/src/redis-check-aof.c
new file mode 100644
index 000000000..ff0d1f82c
--- /dev/null
+++ b/src/redis-check-aof.c
@@ -0,0 +1,185 @@
+#include "fmacros.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include "config.h"
+
+#define ERROR(...) { \
+ char __buf[1024]; \
+ sprintf(__buf, __VA_ARGS__); \
+ sprintf(error, "0x%08lx: %s", epos, __buf); \
+}
+
+static char error[1024];
+static long epos;
+
+/* Check that 'buf' starts with "\r\n". Returns 1 if so, otherwise records
+ * an error showing the two bytes found and returns 0. */
+int consumeNewline(char *buf) {
+    if (strncmp(buf,"\r\n",2) != 0) {
+        /* Print the bytes as unsigned: a plain char may be signed, and a
+         * byte >= 0x80 would be sign extended into "ffffffXX". */
+        ERROR("Expected \\r\\n, got: %02x%02x",
+            (unsigned char)buf[0],(unsigned char)buf[1]);
+        return 0;
+    }
+    return 1;
+}
+
+/* Read a line of the form "<prefix><number>\r\n" from 'fp' and store the
+ * number into '*target'. Returns 1 on success. Returns 0 when no line can
+ * be read (no error is recorded in this case), when the first character is
+ * not 'prefix' or when the number is not followed by "\r\n". */
+int readLong(FILE *fp, char prefix, long *target) {
+    char buf[128], *eptr;
+    epos = ftell(fp);
+    if (fgets(buf,sizeof(buf),fp) == NULL) {
+        return 0;
+    }
+    if (buf[0] != prefix) {
+        /* 'prefix' is what was expected, buf[0] is what the file holds. */
+        ERROR("Expected prefix '%c', got: '%c'",prefix,buf[0]);
+        return 0;
+    }
+    *target = strtol(buf+1,&eptr,10);
+    return consumeNewline(eptr);
+}
+
+/* Read exactly 'length' bytes from 'fp' into 'target'. Returns 1 on
+ * success, 0 (recording an error) on a short read. */
+int readBytes(FILE *fp, char *target, long length) {
+    long got;
+
+    epos = ftell(fp);
+    got = fread(target,1,length,fp);
+    if (got == length) return 1;
+
+    ERROR("Expected to read %ld bytes, got %ld bytes",length,got);
+    return 0;
+}
+
+/* Read a bulk string ("$<len>\r\n<payload>\r\n") from 'fp'.
+ *
+ * On success returns 1 and '*target' points to a malloc()ed, NUL terminated
+ * copy of the payload that the caller must free. On failure returns 0 and
+ * '*target' is either NULL or a buffer that the caller still has to free. */
+int readString(FILE *fp, char** target) {
+    long len;
+    *target = NULL;
+    if (!readLong(fp,'$',&len)) {
+        return 0;
+    }
+
+    /* The length comes from the file: a negative value can't be a valid
+     * payload size and must not reach malloc() or fread(). */
+    if (len < 0) {
+        ERROR("Invalid bulk length: %ld",len);
+        return 0;
+    }
+
+    /* Increase length to also consume \r\n */
+    len += 2;
+    *target = (char*)malloc(len);
+    if (*target == NULL) {
+        ERROR("Out of memory allocating %ld bytes",len);
+        return 0;
+    }
+    if (!readBytes(fp,*target,len)) {
+        return 0;
+    }
+    if (!consumeNewline(*target+len-2)) {
+        return 0;
+    }
+    (*target)[len-2] = '\0';
+    return 1;
+}
+
+/* Read the "*<count>" line opening a command and store the count into
+ * '*target'. Returns 1 on success, 0 otherwise. */
+int readArgc(FILE *fp, long *target) {
+    int ok = readLong(fp,'*',target);
+
+    return ok;
+}
+
+/* Scan the append only file 'fp' command by command and return the offset
+ * up to which the file is valid. The offset is only advanced while outside
+ * of a MULTI/EXEC block. The recorded error message, if any, is printed
+ * before returning. */
+long process(FILE *fp) {
+ long argc, pos = 0;
+ int i, multi = 0;
+ char *str;
+
+ while(1) {
+ /* Remember where this command starts, unless inside a transaction */
+ if (!multi) pos = ftell(fp);
+ if (!readArgc(fp, &argc)) break;
+
+ for (i = 0; i < argc; i++) {
+ if (!readString(fp,&str)) break;
+ /* The first argument is the command name */
+ if (i == 0) {
+ if (strcasecmp(str, "multi") == 0) {
+ /* MULTI while already inside a MULTI block */
+ if (multi++) {
+ ERROR("Unexpected MULTI");
+ break;
+ }
+ } else if (strcasecmp(str, "exec") == 0) {
+ /* EXEC must bring the counter back to exactly zero */
+ if (--multi) {
+ ERROR("Unexpected EXEC");
+ break;
+ }
+ }
+ }
+ free(str);
+ }
+
+ /* Stop if the loop did not finish */
+ if (i < argc) {
+ /* 'str' was not released by the iteration that broke out */
+ if (str) free(str);
+ break;
+ }
+ }
+
+ if (feof(fp) && multi && strlen(error) == 0) {
+ ERROR("Reached EOF before reading EXEC for MULTI");
+ }
+ if (strlen(error) > 0) {
+ printf("%s\n", error);
+ }
+ return pos;
+}
+
+/* Entry point: check the append only file given as argument and, with
+ * --fix, offer to truncate it to its last valid offset. */
+int main(int argc, char **argv) {
+    char *filename;
+    int fix = 0;
+
+    switch(argc) {
+    case 2:
+        filename = argv[1];
+        break;
+    case 3:
+        if (strcmp(argv[1],"--fix") != 0) {
+            printf("Invalid argument: %s\n", argv[1]);
+            exit(1);
+        }
+        filename = argv[2];
+        fix = 1;
+        break;
+    default:
+        if (argc < 2)
+            printf("Usage: %s [--fix] <file.aof>\n", argv[0]);
+        else
+            printf("Invalid arguments\n");
+        exit(1);
+    }
+
+    FILE *fp = fopen(filename,"r+");
+    if (fp == NULL) {
+        printf("Cannot open file: %s\n", filename);
+        exit(1);
+    }
+
+    struct redis_stat sb;
+    if (redis_fstat(fileno(fp),&sb) == -1) {
+        printf("Cannot stat file: %s\n", filename);
+        exit(1);
+    }
+
+    long size = sb.st_size;
+    if (size == 0) {
+        printf("Empty file: %s\n", filename);
+        exit(1);
+    }
+
+    /* Everything after the offset returned by process() is not valid. */
+    long pos = process(fp);
+    long diff = size-pos;
+
+    if (diff <= 0) {
+        printf("AOF is valid\n");
+    } else if (!fix) {
+        printf("AOF is not valid\n");
+        exit(1);
+    } else {
+        char buf[2];
+
+        printf("This will shrink the AOF from %ld bytes, with %ld bytes, to %ld bytes\n",size,diff,pos);
+        printf("Continue? [y/N]: ");
+        if (fgets(buf,sizeof(buf),stdin) == NULL ||
+            strncasecmp(buf,"y",1) != 0) {
+            printf("Aborting...\n");
+            exit(1);
+        }
+        if (ftruncate(fileno(fp), pos) == -1) {
+            printf("Failed to truncate AOF\n");
+            exit(1);
+        }
+        printf("Successfully truncated AOF\n");
+    }
+
+    fclose(fp);
+    return 0;
+}
diff --git a/src/redis-check-dump.c b/src/redis-check-dump.c
new file mode 100644
index 000000000..0b002790d
--- /dev/null
+++ b/src/redis-check-dump.c
@@ -0,0 +1,671 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <limits.h>
+#include "lzf.h"
+
+/* Object types */
+#define REDIS_STRING 0
+#define REDIS_LIST 1
+#define REDIS_SET 2
+#define REDIS_ZSET 3
+#define REDIS_HASH 4
+
+/* Objects encoding. Some kind of objects like Strings and Hashes can be
+ * internally represented in multiple ways. The 'encoding' field of the object
+ * is set to one of this fields for this object. */
+#define REDIS_ENCODING_RAW 0 /* Raw representation */
+#define REDIS_ENCODING_INT 1 /* Encoded as integer */
+#define REDIS_ENCODING_ZIPMAP 2 /* Encoded as zipmap */
+#define REDIS_ENCODING_HT 3 /* Encoded as an hash table */
+
+/* Object types only used for dumping to disk */
+#define REDIS_EXPIRETIME 253
+#define REDIS_SELECTDB 254
+#define REDIS_EOF 255
+
+/* Defines related to the dump file format. Storing 32 bit lengths for short
+ * keys requires a lot of space, so we check the most significant 2 bits of
+ * the first byte to interpret the length:
+ *
+ * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
+ * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
+ * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
+ * 11|000000 this means: specially encoded object will follow. The six bits
+ * number specify the kind of object that follows.
+ * See the REDIS_RDB_ENC_* defines.
+ *
+ * Lengths up to 63 are stored using a single byte, most DB keys, and many
+ * values, will fit inside. */
+#define REDIS_RDB_6BITLEN 0
+#define REDIS_RDB_14BITLEN 1
+#define REDIS_RDB_32BITLEN 2
+#define REDIS_RDB_ENCVAL 3
+#define REDIS_RDB_LENERR UINT_MAX
+
+/* When a length of a string object stored on disk has the first two bits
+ * set, the remaining six bits specify a special encoding for the object
+ * according to the following defines: */
+#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
+#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
+#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
+#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
+
+/* Print a printf-style message on standard output and terminate the
+ * program with exit status 1. */
+#define ERROR(...) { \
+ printf(__VA_ARGS__); \
+ exit(1); \
+}
+
+/* data type to hold offset in file and size */
+typedef struct {
+ void *data; /* start of the buffer being read */
+ unsigned long size; /* total number of bytes available in 'data' */
+ unsigned long offset; /* read cursor: bytes of 'data' consumed so far */
+} pos;
+
+/* Index of the entry of 'positions' currently used for reading. */
+static unsigned char level = 0;
+static pos positions[16];
+
+/* Read cursor of the position currently in use. */
+#define CURR_OFFSET (positions[level].offset)
+
+/* Hold a stack of errors */
+typedef struct {
+ char error[16][1024];
+ unsigned long offset[16];
+ unsigned int level;
+} errors_t;
+static errors_t errors;
+
+#define SHIFT_ERROR(provided_offset, ...) { \
+ sprintf(errors.error[errors.level], __VA_ARGS__); \
+ errors.offset[errors.level] = provided_offset; \
+ errors.level++; \
+}
+
+/* Data type to hold an opcode with an optional key name and a success
+ * status */
+typedef struct {
+ char* key; /* key name, NULL when not available */
+ int type; /* type/opcode byte read from the file, -1 when not read */
+ char success; /* set to 1 once the whole entry was processed */
+} entry;
+
+/* Global vars that are actually used as constants. The following double
+ * values are used for double on-disk serialization, and are initialized
+ * at runtime to avoid strange compiler optimizations. */
+static double R_Zero, R_PosInf, R_NegInf, R_Nan;
+
+/* store string types for output: one name of up to 15 characters for each
+ * possible type byte */
+static char types[256][16];
+
+/* Copy 'num' bytes from the current position into 'target'. A negative
+ * 'num' means peek: the same amount of bytes is copied but the read cursor
+ * is not advanced. Returns 1 on success, 0 when not enough bytes are left. */
+int readBytes(void *target, long num) {
+    pos *cur = &positions[level];
+    int peek = 0;
+
+    if (num < 0) {
+        peek = 1;
+        num = -num;
+    }
+
+    if (cur->offset + num > cur->size) return 0;
+
+    memcpy(target, (void*)((unsigned long)cur->data + cur->offset), num);
+    if (!peek) cur->offset += num;
+    return 1;
+}
+
+/* Validate the 9 bytes header: the "REDIS" signature followed by the
+ * format version as decimal digits. Only version 1 is accepted; any problem
+ * terminates the program through ERROR(). Returns 1 otherwise. */
+int processHeader() {
+    char header[10] = "_________";
+    int version;
+
+    if (!readBytes(header, 9)) {
+        ERROR("Cannot read header\n");
+    }
+
+    /* expect the first 5 bytes to equal REDIS */
+    if (memcmp(header,"REDIS",5) != 0) {
+        ERROR("Wrong signature in header\n");
+    }
+
+    /* the digits following the signature are the format version */
+    version = (int)strtol(header + 5, NULL, 10);
+    if (version != 1) {
+        ERROR("Unknown RDB format version: %d\n", version);
+    }
+    return 1;
+}
+
+/* Read the type byte of the next entry into 'e->type'. Valid values are
+ * 0..4 and 253..255. Returns 1 on success; on failure an error is pushed
+ * on the error stack and 0 is returned. */
+int loadType(entry *e) {
+    uint32_t offset = CURR_OFFSET;
+    unsigned char t;
+
+    if (!readBytes(&t, 1)) {
+        SHIFT_ERROR(offset, "Could not read type");
+        return 0;
+    }
+
+    /* this byte needs to qualify as type */
+    if (t > 4 && t < 253) {
+        SHIFT_ERROR(offset, "Unknown type (0x%02x)", t);
+        return 0;
+    }
+
+    e->type = t;
+    return 1;
+}
+
+/* Return the type byte at the current position without consuming it, or -1
+ * if it can't be read or is not a valid type. */
+int peekType() {
+    unsigned char t;
+
+    if (!readBytes(&t, -1)) return -1;
+    if (t > 4 && t < 253) return -1;
+    return t;
+}
+
+/* Skip the 4 bytes of a time value. Returns 1 on success; when the bytes
+ * are not available an error is pushed and 0 is returned. */
+int processTime() {
+    uint32_t offset = CURR_OFFSET;
+    unsigned char discarded[4];
+
+    if (!readBytes(discarded, 4)) {
+        SHIFT_ERROR(offset, "Could not read time");
+        return 0;
+    }
+    return 1;
+}
+
+/* Decode a length. The two most significant bits of the first byte select
+ * the layout. When they flag a specially encoded value the low six bits
+ * are returned and '*isencoded' (if not NULL) is set to 1, otherwise it is
+ * set to 0. Returns REDIS_RDB_LENERR when the bytes can't be read. */
+uint32_t loadLength(int *isencoded) {
+    unsigned char buf[2];
+    uint32_t len;
+
+    if (isencoded) *isencoded = 0;
+    if (!readBytes(buf, 1)) return REDIS_RDB_LENERR;
+
+    switch((buf[0] & 0xC0) >> 6) {
+    case REDIS_RDB_6BITLEN:
+        /* Read a 6 bit len */
+        return buf[0] & 0x3F;
+    case REDIS_RDB_ENCVAL:
+        /* Read a 6 bit len encoding type */
+        if (isencoded) *isencoded = 1;
+        return buf[0] & 0x3F;
+    case REDIS_RDB_14BITLEN:
+        /* Read a 14 bit len */
+        if (!readBytes(buf+1,1)) return REDIS_RDB_LENERR;
+        return ((buf[0] & 0x3F) << 8) | buf[1];
+    default:
+        /* Read a 32 bit len */
+        if (!readBytes(&len, 4)) return REDIS_RDB_LENERR;
+        return (unsigned int)ntohl(len);
+    }
+}
+
+/* Load an integer stored with one of the REDIS_RDB_ENC_INT* encodings (a
+ * little endian signed integer of 8, 16 or 32 bits) and return it as a
+ * malloc()ed decimal string that the caller must free. Returns NULL when
+ * the bytes can't be read, when memory can't be allocated or, after pushing
+ * an error, when the encoding is unknown. */
+char *loadIntegerObject(int enctype) {
+    uint32_t offset = CURR_OFFSET;
+    unsigned char enc[4];
+    long long val;
+    char *buf;
+
+    if (enctype == REDIS_RDB_ENC_INT8) {
+        uint8_t v;
+        if (!readBytes(enc, 1)) return NULL;
+        v = enc[0];
+        val = (int8_t)v;
+    } else if (enctype == REDIS_RDB_ENC_INT16) {
+        uint16_t v;
+        if (!readBytes(enc, 2)) return NULL;
+        v = enc[0]|(enc[1]<<8);
+        val = (int16_t)v;
+    } else if (enctype == REDIS_RDB_ENC_INT32) {
+        uint32_t v;
+        if (!readBytes(enc, 4)) return NULL;
+        /* Widen every byte to uint32_t before shifting: an unsigned char is
+         * promoted to int, and shifting a byte >= 0x80 left by 24 would
+         * overflow a signed int. */
+        v = (uint32_t)enc[0]|((uint32_t)enc[1]<<8)|
+            ((uint32_t)enc[2]<<16)|((uint32_t)enc[3]<<24);
+        val = (int32_t)v;
+    } else {
+        SHIFT_ERROR(offset, "Unknown integer encoding (0x%02x)", enctype);
+        return NULL;
+    }
+
+    /* convert val into string */
+    buf = malloc(sizeof(char) * 128);
+    if (buf == NULL) return NULL;
+    snprintf(buf, 128, "%lld", val);
+    return buf;
+}
+
+/* Load an LZF compressed string: the compressed length, the uncompressed
+ * length and the compressed payload. Returns a malloc()ed, NUL terminated
+ * buffer with the uncompressed data that the caller must free, or NULL when
+ * the data can't be read, allocated or decompressed. */
+char* loadLzfStringObject() {
+    unsigned int slen, clen;
+    char *c, *s;
+
+    if ((clen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL;
+    if ((slen = loadLength(NULL)) == REDIS_RDB_LENERR) return NULL;
+
+    c = malloc(clen);
+    if (c == NULL) return NULL;
+    if (!readBytes(c, clen)) {
+        free(c);
+        return NULL;
+    }
+
+    s = malloc(slen+1);
+    if (s == NULL) {
+        free(c);
+        return NULL;
+    }
+    if (lzf_decompress(c,clen,s,slen) == 0) {
+        free(c); free(s);
+        return NULL;
+    }
+    /* The extra byte allocated above holds the terminator, so the result
+     * can be used as a C string like the other string loaders' results. */
+    s[slen] = '\0';
+
+    free(c);
+    return s;
+}
+
+/* Load a string object, whatever its encoding is (raw, integer encoded or
+ * LZF compressed). Returns a malloc()ed, NUL terminated string that the
+ * caller must free, or NULL when not processable. */
+char* loadStringObject() {
+    uint32_t offset = CURR_OFFSET;
+    int isencoded;
+    uint32_t len;
+    char *buf;
+
+    len = loadLength(&isencoded);
+    if (isencoded) {
+        switch(len) {
+        case REDIS_RDB_ENC_INT8:
+        case REDIS_RDB_ENC_INT16:
+        case REDIS_RDB_ENC_INT32:
+            return loadIntegerObject(len);
+        case REDIS_RDB_ENC_LZF:
+            return loadLzfStringObject();
+        default:
+            /* unknown encoding */
+            SHIFT_ERROR(offset, "Unknown string encoding (0x%02x)", len);
+            return NULL;
+        }
+    }
+
+    if (len == REDIS_RDB_LENERR) return NULL;
+
+    /* The length comes from a possibly corrupted file: make sure that many
+     * bytes are really available before allocating memory for them. This
+     * is the same check readBytes() performs. */
+    if (positions[level].offset + len > positions[level].size) return NULL;
+
+    buf = malloc(sizeof(char) * ((size_t)len+1));
+    if (buf == NULL) return NULL;
+    buf[len] = '\0';
+    if (!readBytes(buf, len)) {
+        free(buf);
+        return NULL;
+    }
+    return buf;
+}
+
+/* Load a string object. When 'store' is not NULL the string is handed over
+ * to the caller through '*store', otherwise it is discarded. Returns 1 on
+ * success; on failure an error is pushed and 0 is returned. */
+int processStringObject(char** store) {
+    unsigned long offset = CURR_OFFSET;
+    char *str = loadStringObject();
+
+    if (!str) {
+        SHIFT_ERROR(offset, "Error reading string object");
+        return 0;
+    }
+
+    if (store == NULL)
+        free(str);
+    else
+        *store = str;
+    return 1;
+}
+
+/* Load a double: a length byte followed by that many characters holding
+ * the textual representation of the number. The lengths 255, 254 and 253
+ * stand for negative infinity, positive infinity and NaN. Returns a
+ * malloc()ed double that the caller must free, or NULL when the value can't
+ * be read, allocated or parsed. */
+double* loadDoubleValue() {
+    char buf[256];
+    unsigned char len;
+    double* val;
+
+    if (!readBytes(&len,1)) return NULL;
+
+    val = malloc(sizeof(double));
+    if (val == NULL) return NULL;
+    switch(len) {
+    case 255: *val = R_NegInf; return val;
+    case 254: *val = R_PosInf; return val;
+    case 253: *val = R_Nan; return val;
+    default:
+        if (!readBytes(buf, len)) {
+            free(val);
+            return NULL;
+        }
+        buf[len] = '\0';
+        /* Text that is not a number would leave '*val' uninitialized:
+         * report it as an unreadable value instead. */
+        if (sscanf(buf, "%lg", val) != 1) {
+            free(val);
+            return NULL;
+        }
+        return val;
+    }
+}
+
+/* Load a double value. When 'store' is not NULL the value is handed over to
+ * the caller through '*store', otherwise it is discarded. Returns 1 on
+ * success; on failure an error is pushed and 0 is returned. */
+int processDoubleValue(double** store) {
+    unsigned long offset = CURR_OFFSET;
+    double *value = loadDoubleValue();
+
+    if (!value) {
+        SHIFT_ERROR(offset, "Error reading double value");
+        return 0;
+    }
+
+    if (store == NULL)
+        free(value);
+    else
+        *store = value;
+    return 1;
+}
+
+/* Load the key and the value of an entry whose type was already stored in
+ * 'e->type'. The key is stored into 'e->key'; the value is only walked in
+ * order to validate it. Returns 1, setting 'e->success', when the whole
+ * pair was read; on failure an error is pushed and 0 is returned. */
+int loadPair(entry *e) {
+    uint32_t offset = CURR_OFFSET;
+    uint32_t i;
+
+    /* read key first */
+    char *key;
+    if (processStringObject(&key)) {
+        e->key = key;
+    } else {
+        SHIFT_ERROR(offset, "Error reading entry key");
+        return 0;
+    }
+
+    /* aggregate types are prefixed with the number of their elements */
+    uint32_t length = 0;
+    if (e->type == REDIS_LIST ||
+        e->type == REDIS_SET ||
+        e->type == REDIS_ZSET ||
+        e->type == REDIS_HASH) {
+        if ((length = loadLength(NULL)) == REDIS_RDB_LENERR) {
+            SHIFT_ERROR(offset, "Error reading %s length", types[e->type]);
+            return 0;
+        }
+    }
+
+    /* Note: index and length are unsigned, hence %u in the messages. */
+    switch(e->type) {
+    case REDIS_STRING:
+        if (!processStringObject(NULL)) {
+            SHIFT_ERROR(offset, "Error reading entry value");
+            return 0;
+        }
+        break;
+    case REDIS_LIST:
+    case REDIS_SET:
+        for (i = 0; i < length; i++) {
+            offset = CURR_OFFSET;
+            if (!processStringObject(NULL)) {
+                SHIFT_ERROR(offset, "Error reading element at index %u (length: %u)", (unsigned int)i, (unsigned int)length);
+                return 0;
+            }
+        }
+        break;
+    case REDIS_ZSET:
+        /* every element is a string followed by its score */
+        for (i = 0; i < length; i++) {
+            offset = CURR_OFFSET;
+            if (!processStringObject(NULL)) {
+                SHIFT_ERROR(offset, "Error reading element key at index %u (length: %u)", (unsigned int)i, (unsigned int)length);
+                return 0;
+            }
+            offset = CURR_OFFSET;
+            if (!processDoubleValue(NULL)) {
+                SHIFT_ERROR(offset, "Error reading element value at index %u (length: %u)", (unsigned int)i, (unsigned int)length);
+                return 0;
+            }
+        }
+        break;
+    case REDIS_HASH:
+        /* every element is a field string followed by a value string */
+        for (i = 0; i < length; i++) {
+            offset = CURR_OFFSET;
+            if (!processStringObject(NULL)) {
+                SHIFT_ERROR(offset, "Error reading element key at index %u (length: %u)", (unsigned int)i, (unsigned int)length);
+                return 0;
+            }
+            offset = CURR_OFFSET;
+            if (!processStringObject(NULL)) {
+                SHIFT_ERROR(offset, "Error reading element value at index %u (length: %u)", (unsigned int)i, (unsigned int)length);
+                return 0;
+            }
+        }
+        break;
+    default:
+        SHIFT_ERROR(offset, "Type not implemented");
+        return 0;
+    }
+    /* because we're done, we assume success */
+    e->success = 1;
+    return 1;
+}
+
+ /* Load one complete entry (SELECTDB, EOF, or an optionally expiring
+  * key/value pair) from the current position and return it by value.
+  *
+  * The error stack is reset first. The returned entry has success == 1 only
+  * if the entry itself was read correctly and, except for EOF, is followed
+  * by a byte that peekType() accepts as a type. */
+ entry loadEntry() {
+     entry e = { NULL, -1, 0 };
+     uint32_t length, offset[4];
+
+     /* reset error container */
+     errors.level = 0;
+
+     offset[0] = CURR_OFFSET;
+     if (!loadType(&e)) return e;
+
+     offset[1] = CURR_OFFSET;
+
+     /* EOF is only valid as the very last byte and has nothing after it */
+     if (e.type == REDIS_EOF) {
+         if (positions[level].offset < positions[level].size) {
+             SHIFT_ERROR(offset[0], "Unexpected EOF");
+         } else {
+             e.success = 1;
+         }
+         return e;
+     }
+
+     if (e.type == REDIS_SELECTDB) {
+         length = loadLength(NULL);
+         if (length == REDIS_RDB_LENERR) {
+             SHIFT_ERROR(offset[1], "Error reading database number");
+             return e;
+         }
+         if (length > 63) {
+             SHIFT_ERROR(offset[1], "Database number out of range (%d)", length);
+             return e;
+         }
+     } else {
+         /* optionally consume expire, then the type of the actual entry */
+         if (e.type == REDIS_EXPIRETIME) {
+             if (!processTime()) return e;
+             if (!loadType(&e)) return e;
+         }
+
+         offset[1] = CURR_OFFSET;
+         if (!loadPair(&e)) {
+             SHIFT_ERROR(offset[1], "Error for type %s", types[e.type]);
+             return e;
+         }
+     }
+
+     /* all entries are followed by a valid type:
+      * e.g. a new entry, SELECTDB, EXPIRE, EOF */
+     offset[2] = CURR_OFFSET;
+     if (peekType() != -1) {
+         e.success = 1;
+     } else {
+         SHIFT_ERROR(offset[2], "Followed by invalid type");
+         SHIFT_ERROR(offset[0], "Error for type %s", types[e.type]);
+         e.success = 0;
+     }
+
+     return e;
+ }
+
+ /* Print `body` on one line, preceded by `indent` '=' characters and
+  * followed by enough '=' characters to fill the line up to `width` columns
+  * (one space separates body from each run).
+  *
+  * Both runs are clamped to what the local 256-byte buffers can hold, so an
+  * oversized indent or a body longer than the line can no longer make the
+  * length go negative/huge and overrun the buffers. */
+ void printCentered(int indent, int width, char* body) {
+     char head[256], tail[256];
+     int headlen, taillen;
+
+     headlen = indent;
+     if (headlen < 0) headlen = 0;
+     if (headlen > 255) headlen = 255;
+
+     taillen = width - 2 - indent - (int)strlen(body);
+     if (taillen < 0) taillen = 0;
+     if (taillen > 255) taillen = 255;
+
+     memset(head, '=', headlen);
+     head[headlen] = '\0';
+     memset(tail, '=', taillen);
+     tail[taillen] = '\0';
+
+     printf("%s %s %s\n", head, body, tail);
+ }
+
+ /* Print the banner reporting how many valid opcodes (and bytes) were
+  * processed since the last error. */
+ void printValid(int ops, int bytes) {
+     char body[80];
+     /* bounded formatting: never write past the local buffer */
+     snprintf(body, sizeof(body), "Processed %d valid opcodes (in %d bytes)", ops, bytes);
+     printCentered(4, 80, body);
+ }
+
+ /* Print the banner reporting how many bytes were skipped while searching
+  * for the next valid entry, and the offset at which processing resumes. */
+ void printSkipped(int bytes, int offset) {
+     char body[80];
+     /* bounded formatting: never write past the local buffer */
+     snprintf(body, sizeof(body), "Skipped %d bytes (resuming at 0x%08x)", bytes, offset);
+     printCentered(4, 80, body);
+ }
+
+ /* Print a banner identifying the failed entry (its type and, when known, a
+  * sanitized and truncated copy of its key) followed by every error
+  * currently recorded in the global `errors` stack. */
+ void printErrorStack(entry *e) {
+     unsigned int i;
+     char body[64];
+
+     if (e->type == -1) {
+         snprintf(body, sizeof(body), "Error trace");
+     } else if (e->type >= 253) {
+         snprintf(body, sizeof(body), "Error trace (%s)", types[e->type]);
+     } else if (!e->key) {
+         snprintf(body, sizeof(body), "Error trace (%s: (unknown))", types[e->type]);
+     } else {
+         char tmp[41];
+
+         /* strncpy() does not terminate the copy when the key has 40 or
+          * more bytes, so terminate it explicitly. */
+         strncpy(tmp, e->key, 40);
+         tmp[40] = '\0';
+
+         /* display truncation at the last 3 chars */
+         if (strlen(e->key) > 40) {
+             memset(&tmp[37], '.', 3);
+         }
+
+         /* display unprintable characters as ? */
+         for (i = 0; tmp[i] != '\0'; i++) {
+             if (tmp[i] <= 32) tmp[i] = '?';
+         }
+         snprintf(body, sizeof(body), "Error trace (%s: %s)", types[e->type], tmp);
+     }
+
+     printCentered(4, 80, body);
+
+     /* display error stack */
+     for (i = 0; i < errors.level; i++) {
+         printf("0x%08lx - %s\n", errors.offset[i], errors.error[i]);
+     }
+ }
+
+ /* Walk the whole dump: validate the header, then load entries one after
+  * the other. When an entry is invalid, print the error trace and scan
+  * forward byte by byte until three consecutive valid entries are found,
+  * then resume from there. Finally check that the last entry was an EOF
+  * and print a summary of the errors found. */
+ void process() {
+     int i, num_errors = 0, num_valid_ops = 0, num_valid_bytes = 0;
+     /* Start from the same "nothing loaded" state loadEntry() uses, so the
+      * EOF check after the loop is well defined even if the loop body never
+      * runs (no data after the header). */
+     entry entry = { NULL, -1, 0 };
+     processHeader();
+
+     level = 1;
+     while(positions[0].offset < positions[0].size) {
+         positions[1] = positions[0];
+
+         /* the key of the previously loaded entry is no longer needed */
+         free(entry.key);
+         entry = loadEntry();
+         if (!entry.success) {
+             printValid(num_valid_ops, num_valid_bytes);
+             printErrorStack(&entry);
+             num_errors++;
+             num_valid_ops = 0;
+             num_valid_bytes = 0;
+
+             /* search for next valid entry */
+             unsigned long offset = positions[0].offset + 1;
+             while (!entry.success && offset < positions[0].size) {
+                 positions[1].offset = offset;
+
+                 /* find 3 consecutive valid entries */
+                 for (i = 0; i < 3; i++) {
+                     free(entry.key);
+                     entry = loadEntry();
+                     if (!entry.success) break;
+                 }
+                 /* check if we found 3 consecutive valid entries */
+                 if (i < 3) {
+                     offset++;
+                 }
+             }
+
+             /* print how many bytes we have skipped to find a new valid opcode */
+             if (offset < positions[0].size) {
+                 printSkipped(offset - positions[0].offset, offset);
+             }
+
+             positions[0].offset = offset;
+         } else {
+             num_valid_ops++;
+             num_valid_bytes += positions[1].offset - positions[0].offset;
+
+             /* advance position */
+             positions[0] = positions[1];
+         }
+     }
+
+     /* because there is another potential error,
+      * print how many valid ops we have processed */
+     printValid(num_valid_ops, num_valid_bytes);
+
+     /* expect an eof */
+     if (entry.type != REDIS_EOF) {
+         /* last byte should be EOF, add error */
+         errors.level = 0;
+         /* the type is -1 when the last load failed before reading a type
+          * (or nothing was loaded): do not index types[] with it */
+         SHIFT_ERROR(positions[0].offset, "Expected EOF, got %s",
+             entry.type >= 0 ? types[entry.type] : "(none)");
+
+         /* this is an EOF error so reset type */
+         entry.type = -1;
+         printErrorStack(&entry);
+
+         num_errors++;
+     }
+
+     free(entry.key);
+
+     /* print summary on errors */
+     if (num_errors > 0) {
+         printf("\n");
+         printf("Total unprocessable opcodes: %d\n", num_errors);
+     }
+ }
+
+ /* Entry point of the dump checker: map the dump file given as the first
+  * argument read-only into memory, initialize the static tables, and run
+  * process() over it. */
+ int main(int argc, char **argv) {
+     /* expect the first argument to be the dump file */
+     if (argc <= 1) {
+         printf("Usage: %s <dump.rdb>\n", argv[0]);
+         exit(0);
+     }
+
+     int fd;
+     unsigned long size;
+     struct stat stat;
+     void *data;
+
+     fd = open(argv[1], O_RDONLY);
+     /* open() reports failure with -1; descriptor 0 is a valid result */
+     if (fd < 0) {
+         ERROR("Cannot open file: %s\n", argv[1]);
+     }
+     if (fstat(fd, &stat) == -1) {
+         ERROR("Cannot stat: %s\n", argv[1]);
+     } else {
+         size = stat.st_size;
+     }
+
+     data = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
+     if (data == MAP_FAILED) {
+         ERROR("Cannot mmap: %s\n", argv[1]);
+     }
+
+     /* Initialize static vars */
+     positions[0].data = data;
+     positions[0].size = size;
+     positions[0].offset = 0;
+     errors.level = 0;
+
+     /* Object types */
+     sprintf(types[REDIS_STRING], "STRING");
+     sprintf(types[REDIS_LIST], "LIST");
+     sprintf(types[REDIS_SET], "SET");
+     sprintf(types[REDIS_ZSET], "ZSET");
+     sprintf(types[REDIS_HASH], "HASH");
+
+     /* Object types only used for dumping to disk */
+     sprintf(types[REDIS_EXPIRETIME], "EXPIRETIME");
+     sprintf(types[REDIS_SELECTDB], "SELECTDB");
+     sprintf(types[REDIS_EOF], "EOF");
+
+     /* Double constants initialization (computed at runtime from R_Zero) */
+     R_Zero = 0.0;
+     R_PosInf = 1.0/R_Zero;
+     R_NegInf = -1.0/R_Zero;
+     R_Nan = R_Zero/R_Zero;
+
+     process();
+
+     munmap(data, size);
+     close(fd);
+     return 0;
+ }
diff --git a/src/redis-cli.c b/src/redis-cli.c
new file mode 100644
index 000000000..2daa7c461
--- /dev/null
+++ b/src/redis-cli.c
@@ -0,0 +1,493 @@
+/* Redis CLI (command line interface)
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "fmacros.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#include "anet.h"
+#include "sds.h"
+#include "adlist.h"
+#include "zmalloc.h"
+#include "linenoise.h"
+
+#define REDIS_CMD_INLINE 1
+#define REDIS_CMD_BULK 2
+#define REDIS_CMD_MULTIBULK 4
+
+#define REDIS_NOTUSED(V) ((void) V)
+
+ /* Global settings of the client: defaults are assigned in main() and then
+  * overridden by parseOptions() and cliSendCommand(). */
+ static struct config {
+ char *hostip; /* address passed to anetTcpConnect() (-h, resolved) */
+ int hostport; /* TCP port of the server (-p) */
+ long repeat; /* how many times the command is sent (-r) */
+ int dbnum; /* database selected with SELECT when non-zero (-n) */
+ int argn_from_stdin; /* read the last argument from standard input (-c) */
+ int interactive; /* run the interactive prompt (-i) */
+ int shutdown; /* set once a "shutdown" command has been sent */
+ int monitor_mode; /* set once a "monitor" command has been sent */
+ int pubsub_mode; /* set once "subscribe"/"psubscribe" has been sent */
+ int raw_output; /* print bulk replies verbatim (set for "info") */
+ char *auth; /* password sent with AUTH before anything else (-a) */
+ } config;
+
+static int cliReadReply(int fd);
+static void usage();
+
+ /* Return the socket connected to the configured server, connecting on the
+  * first call and reusing the same descriptor afterwards. Returns -1 (after
+  * printing a message on stderr) if the connection cannot be established. */
+ static int cliConnect(void) {
+     char err[ANET_ERR_LEN];
+     static int fd = ANET_ERR;
+
+     /* already connected by a previous call */
+     if (fd != ANET_ERR) return fd;
+
+     fd = anetTcpConnect(err,config.hostip,config.hostport);
+     if (fd == ANET_ERR) {
+         fprintf(stderr, "Could not connect to Redis at %s:%d: %s", config.hostip, config.hostport, err);
+         return -1;
+     }
+     anetTcpNoDelay(NULL,fd);
+     return fd;
+ }
+
+ /* Read bytes from `fd` one at a time up to a newline or end of stream and
+  * return them as a new sds string with leading/trailing '\r' and '\n'
+  * trimmed. Returns NULL if read() fails. */
+ static sds cliReadLine(int fd) {
+     sds buf = sdsempty();
+     char ch;
+     ssize_t nread;
+
+     for (;;) {
+         nread = read(fd,&ch,1);
+         if (nread == -1) {
+             sdsfree(buf);
+             return NULL;
+         }
+         if (nread == 0 || ch == '\n') break;
+         buf = sdscatlen(buf,&ch,1);
+     }
+     return sdstrim(buf,"\r\n");
+ }
+
+ /* Read a single-line reply and, unless `quiet` is set, print it followed
+  * by a newline. Returns 0 on success, 1 if the line could not be read. */
+ static int cliReadSingleLineReply(int fd, int quiet) {
+     sds line = cliReadLine(fd);
+
+     if (line != NULL) {
+         if (!quiet) {
+             printf("%s\n", line);
+         }
+         sdsfree(line);
+         return 0;
+     }
+     return 1;
+ }
+
+ /* Print the `len` bytes at `s` as a double-quoted string literal followed
+  * by a newline: backslash and double quote are escaped, common control
+  * characters use their C escapes, and any other non-printable byte is
+  * shown as \xHH. */
+ static void printStringRepr(char *s, int len) {
+     printf("\"");
+     while(len--) {
+         switch(*s) {
+         case '\\':
+         case '"':
+             printf("\\%c",*s);
+             break;
+         case '\n': printf("\\n"); break;
+         case '\r': printf("\\r"); break;
+         case '\t': printf("\\t"); break;
+         case '\a': printf("\\a"); break;
+         case '\b': printf("\\b"); break;
+         default:
+             /* isprint() is only defined for unsigned char values and EOF:
+              * a plain char may be negative for bytes >= 0x80 */
+             if (isprint((unsigned char)*s))
+                 printf("%c",*s);
+             else
+                 printf("\\x%02x",(unsigned char)*s);
+             break;
+         }
+         s++;
+     }
+     printf("\"\n");
+ }
+
+ /* Read a bulk reply: a line with the payload length, then the payload and
+  * a trailing two-byte terminator. A length of -1 is printed as "(nil)".
+  *
+  * The payload is written verbatim when raw output is requested or stdout
+  * is not a terminal, and as a quoted string otherwise. Returns 0 on
+  * success, 1 on error. */
+ static int cliReadBulkReply(int fd) {
+     sds replylen = cliReadLine(fd);
+     char *reply, crlf[2];
+     int bulklen;
+
+     if (replylen == NULL) return 1;
+     bulklen = atoi(replylen);
+     /* the length line is not needed anymore: release it on every path */
+     sdsfree(replylen);
+
+     if (bulklen == -1) {
+         printf("(nil)\n");
+         return 0;
+     }
+     if (bulklen < 0) {
+         /* any other negative length would be turned into a huge allocation */
+         printf("protocol error, invalid bulk length %d\n", bulklen);
+         return 1;
+     }
+
+     reply = zmalloc(bulklen);
+     /* NOTE(review): assumes anetRead() returns the number of bytes read
+      * (the requested count on success) -- confirm against anet.c */
+     if (anetRead(fd,reply,bulklen) != bulklen ||
+         anetRead(fd,crlf,2) != 2) {
+         zfree(reply);
+         return 1;
+     }
+     if (config.raw_output || !isatty(fileno(stdout))) {
+         if (bulklen && fwrite(reply,bulklen,1,stdout) == 0) {
+             zfree(reply);
+             return 1;
+         }
+     } else {
+         /* If you are producing output for the standard output we want
+          * a more interesting output with quoted characters and so forth */
+         printStringRepr(reply,bulklen);
+     }
+     zfree(reply);
+     return 0;
+ }
+
+ /* Read a multi bulk reply: a line with the number of elements followed by
+  * that many replies, each printed with a 1-based index. A count of -1 is
+  * printed as "(nil)". Returns 0 on success, 1 on error. */
+ static int cliReadMultiBulkReply(int fd) {
+     sds replylen = cliReadLine(fd);
+     int elements, c = 1;
+
+     if (replylen == NULL) return 1;
+     elements = atoi(replylen);
+     /* the count line is not needed anymore: release it on every path */
+     sdsfree(replylen);
+
+     if (elements == -1) {
+         printf("(nil)\n");
+         return 0;
+     }
+     if (elements < 0) {
+         /* any other negative count would make the loop below run until
+          * the counter wraps around */
+         printf("protocol error, invalid multi bulk length %d\n", elements);
+         return 1;
+     }
+     if (elements == 0) {
+         printf("(empty list or set)\n");
+     }
+     while(elements--) {
+         printf("%d. ", c);
+         if (cliReadReply(fd)) return 1;
+         c++;
+     }
+     return 0;
+ }
+
+ /* Read one reply of any kind: the first byte selects the reply type and
+  * the matching reader prints it. Returns 0 on success and 1 on error (an
+  * error reply from the server counts as an error).
+  *
+  * If the type byte cannot be read the process exits with status 1, unless
+  * a shutdown command was sent, in which case 0 is returned. */
+ static int cliReadReply(int fd) {
+     char type;
+
+     if (anetRead(fd,&type,1) <= 0) {
+         if (config.shutdown) return 0;
+         exit(1);
+     }
+
+     if (type == '-') {
+         printf("(error) ");
+         cliReadSingleLineReply(fd,0);
+         return 1;
+     }
+     if (type == '+') {
+         return cliReadSingleLineReply(fd,0);
+     }
+     if (type == ':') {
+         printf("(integer) ");
+         return cliReadSingleLineReply(fd,0);
+     }
+     if (type == '$') {
+         return cliReadBulkReply(fd);
+     }
+     if (type == '*') {
+         return cliReadMultiBulkReply(fd);
+     }
+
+     printf("protocol error, got '%c' as reply type byte\n", type);
+     return 1;
+ }
+
+ /* Send "SELECT <config.dbnum>" to the server and consume the reply.
+  * Nothing is sent when the configured database is 0. Returns 0 on success
+  * and non-zero if the reply cannot be read or is not a '+' status line. */
+ static int selectDb(int fd) {
+     sds cmd;
+     char type;
+
+     if (config.dbnum == 0)
+         return 0;
+
+     cmd = sdscatprintf(sdsempty(),"SELECT %d\r\n",config.dbnum);
+     anetWrite(fd,cmd,sdslen(cmd));
+     sdsfree(cmd);
+
+     /* check the read itself: on failure `type` was never written */
+     if (anetRead(fd,&type,1) <= 0 || type != '+') return 1;
+     return cliReadSingleLineReply(fd,1);
+ }
+
+ /* Send the command described by argc/argv (sds strings) `repeat` times and
+  * print each reply.
+  *
+  * "monitor" and "subscribe"/"psubscribe" switch the client into a mode
+  * where replies are printed in a loop instead of returning after one.
+  * Returns 0 on success and non-zero on connection, SELECT or reply
+  * errors. */
+ static int cliSendCommand(int argc, char **argv, int repeat) {
+     char *command = argv[0];
+     int fd, j, retval = 0;
+     sds cmd;
+
+     config.raw_output = !strcasecmp(command,"info");
+     if (!strcasecmp(command,"shutdown")) config.shutdown = 1;
+     if (!strcasecmp(command,"monitor")) config.monitor_mode = 1;
+     if (!strcasecmp(command,"subscribe") ||
+         !strcasecmp(command,"psubscribe")) config.pubsub_mode = 1;
+     if ((fd = cliConnect()) == -1) return 1;
+
+     /* Select db number */
+     retval = selectDb(fd);
+     if (retval) {
+         fprintf(stderr,"Error setting DB num\n");
+         return 1;
+     }
+
+     /* Build the command to send: argument count, then every argument
+      * prefixed by its length */
+     cmd = sdscatprintf(sdsempty(),"*%d\r\n",argc);
+     for (j = 0; j < argc; j++) {
+         cmd = sdscatprintf(cmd,"$%lu\r\n",
+             (unsigned long)sdslen(argv[j]));
+         cmd = sdscatlen(cmd,argv[j],sdslen(argv[j]));
+         cmd = sdscatlen(cmd,"\r\n",2);
+     }
+
+     while(repeat--) {
+         anetWrite(fd,cmd,sdslen(cmd));
+         while (config.monitor_mode) {
+             /* stop instead of spinning forever once reading fails */
+             if (cliReadSingleLineReply(fd,0)) {
+                 sdsfree(cmd);
+                 return 1;
+             }
+         }
+
+         if (config.pubsub_mode) {
+             printf("Reading messages... (press Ctrl-c to quit)\n");
+             while (1) {
+                 cliReadReply(fd);
+                 printf("\n");
+             }
+         }
+
+         retval = cliReadReply(fd);
+         if (retval) break;
+     }
+
+     /* the serialized command is released on every way out */
+     sdsfree(cmd);
+     return retval;
+ }
+
+ /* Parse the leading command line options into `config` and return the
+  * index of the first argument that is not an option (the command).
+  *
+  * Options taking a value (-h -p -r -n -a) are only recognized when a value
+  * follows; "-h" as the very last argument prints the usage and exits. */
+ static int parseOptions(int argc, char **argv) {
+     int i;
+
+     for (i = 1; i < argc; i++) {
+         int lastarg = i==argc-1;
+
+         if (!strcmp(argv[i],"-h") && !lastarg) {
+             char *ip = zmalloc(32);
+             if (anetResolve(NULL,argv[i+1],ip) == ANET_ERR) {
+                 /* report the host name, not the "-h" option itself */
+                 printf("Can't resolve %s\n", argv[i+1]);
+                 exit(1);
+             }
+             config.hostip = ip;
+             i++;
+         } else if (!strcmp(argv[i],"-h") && lastarg) {
+             usage();
+         } else if (!strcmp(argv[i],"-p") && !lastarg) {
+             config.hostport = atoi(argv[i+1]);
+             i++;
+         } else if (!strcmp(argv[i],"-r") && !lastarg) {
+             config.repeat = strtoll(argv[i+1],NULL,10);
+             i++;
+         } else if (!strcmp(argv[i],"-n") && !lastarg) {
+             config.dbnum = atoi(argv[i+1]);
+             i++;
+         } else if (!strcmp(argv[i],"-a") && !lastarg) {
+             config.auth = argv[i+1];
+             i++;
+         } else if (!strcmp(argv[i],"-i")) {
+             config.interactive = 1;
+         } else if (!strcmp(argv[i],"-c")) {
+             config.argn_from_stdin = 1;
+         } else {
+             /* first non-option argument: the command starts here */
+             break;
+         }
+     }
+     return i;
+ }
+
+ /* Read standard input until end of file and return everything read as a
+  * single sds string. Exits with status 1 if reading fails. */
+ static sds readArgFromStdin(void) {
+     sds data = sdsempty();
+     char chunk[1024];
+     int n;
+
+     while ((n = read(fileno(stdin),chunk,1024)) != 0) {
+         if (n == -1) {
+             perror("Reading from standard input");
+             exit(1);
+         }
+         data = sdscatlen(data,chunk,n);
+     }
+     return data;
+ }
+
+ /* Print the command line help on stderr and exit with status 1. */
+ static void usage() {
+     static const char *text[] = {
+         "usage: redis-cli [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] [-i] cmd arg1 arg2 arg3 ... argN\n",
+         "usage: echo \"argN\" | redis-cli -c [-h host] [-p port] [-a authpw] [-r repeat_times] [-n db_num] cmd arg1 arg2 ... arg(N-1)\n",
+         "\nIf a pipe from standard input is detected this data is used as last argument.\n\n",
+         "example: cat /etc/passwd | redis-cli set my_passwd\n",
+         "example: redis-cli get my_passwd\n",
+         "example: redis-cli -r 100 lpush mylist x\n",
+         "\nRun in interactive mode: redis-cli -i or just don't pass any command\n"
+     };
+     size_t i;
+
+     /* none of the lines contains a conversion, so they are written as is */
+     for (i = 0; i < sizeof(text)/sizeof(text[0]); i++)
+         fputs(text[i], stderr);
+     exit(1);
+ }
+
+ /* Build a newly allocated vector of `count` sds strings, one for each of
+  * the plain C strings in `args`. */
+ static char **convertToSds(int count, char** args) {
+     char **out = zmalloc(sizeof(char*)*count);
+     int idx = 0;
+
+     while (idx < count) {
+         out[idx] = sdsnew(args[idx]);
+         idx++;
+     }
+     return out;
+ }
+
+ /* Split `line` into arguments, returning a zrealloc-ed vector of sds
+  * strings and storing their number in *argc (the vector is NULL when the
+  * line holds no argument).
+  *
+  * Arguments are separated by blanks. Inside "double quotes" blanks are
+  * kept and the escapes \n \r \t \b \a are translated; any other escaped
+  * character stands for itself. A quote that is still open when the line
+  * ends terminates the argument there. */
+ static char **splitArguments(char *line, int *argc) {
+     char *p = line;
+     char *current = NULL;
+     char **vector = NULL;
+
+     *argc = 0;
+     while(1) {
+         /* skip blanks (the cast keeps isspace() defined for bytes >= 0x80) */
+         while(*p && isspace((unsigned char)*p)) p++;
+         if (*p) {
+             /* get a token */
+             int inq=0; /* set to 1 if we are in "quotes" */
+             int done = 0;
+
+             if (current == NULL) current = sdsempty();
+             while(!done) {
+                 if (inq) {
+                     if (*p == '\\' && *(p+1)) {
+                         char c;
+
+                         p++;
+                         switch(*p) {
+                         case 'n': c = '\n'; break;
+                         case 'r': c = '\r'; break;
+                         case 't': c = '\t'; break;
+                         case 'b': c = '\b'; break;
+                         case 'a': c = '\a'; break;
+                         default: c = *p; break;
+                         }
+                         current = sdscatlen(current,&c,1);
+                     } else if (*p == '"' || *p == '\0') {
+                         /* closing quote, or end of the line with the quote
+                          * still open: without the second test the loop
+                          * would never end, appending NUL bytes forever */
+                         done = 1;
+                     } else {
+                         current = sdscatlen(current,p,1);
+                     }
+                 } else {
+                     switch(*p) {
+                     case ' ':
+                     case '\n':
+                     case '\r':
+                     case '\t':
+                     case '\0':
+                         done=1;
+                         break;
+                     case '"':
+                         inq=1;
+                         break;
+                     default:
+                         current = sdscatlen(current,p,1);
+                         break;
+                     }
+                 }
+                 /* never step past the terminator */
+                 if (*p) p++;
+             }
+             /* add the token to the vector */
+             vector = zrealloc(vector,((*argc)+1)*sizeof(char*));
+             vector[*argc] = current;
+             (*argc)++;
+             current = NULL;
+         } else {
+             return vector;
+         }
+     }
+ }
+
+#define LINE_BUFLEN 4096
+ /* Interactive prompt: read lines with linenoise(), split each non-empty
+  * one into arguments and send it as a command. "quit" and "exit" (in any
+  * letter case) leave the program, as does the end of input. This function
+  * never returns. */
+ static void repl() {
+     int argc, j;
+     char *line, **argv;
+
+     for (;;) {
+         line = linenoise("redis> ");
+         if (line == NULL) break;
+
+         if (line[0] != '\0') {
+             argv = splitArguments(line,&argc);
+             linenoiseHistoryAdd(line);
+             if (argc > 0) {
+                 int leave = !strcasecmp(argv[0],"quit") ||
+                             !strcasecmp(argv[0],"exit");
+
+                 if (leave) exit(0);
+                 cliSendCommand(argc, argv, 1);
+             }
+             /* Free the argument vector */
+             j = 0;
+             while (j < argc) {
+                 sdsfree(argv[j]);
+                 j++;
+             }
+             zfree(argv);
+         }
+         /* linenoise() returns malloc-ed lines like readline() */
+         free(line);
+     }
+     exit(0);
+ }
+
+ /* Entry point of the client: set the defaults, parse the options,
+  * authenticate when a password was given, then either start the
+  * interactive prompt (no command given, or -i) or send the command found
+  * on the command line and return its status. */
+ int main(int argc, char **argv) {
+     int firstarg, j;
+     char **argvcopy;
+
+     config.hostip = "127.0.0.1";
+     config.hostport = 6379;
+     config.repeat = 1;
+     config.dbnum = 0;
+     config.argn_from_stdin = 0;
+     config.shutdown = 0;
+     config.interactive = 0;
+     config.monitor_mode = 0;
+     config.pubsub_mode = 0;
+     config.raw_output = 0;
+     config.auth = NULL;
+
+     /* skip the program name and the options: argv[0] is now the command */
+     firstarg = parseOptions(argc,argv);
+     argc -= firstarg;
+     argv += firstarg;
+
+     if (config.auth != NULL) {
+         char *authargv[2];
+
+         authargv[0] = "AUTH";
+         authargv[1] = config.auth;
+         cliSendCommand(2, convertToSds(2, authargv), 1);
+     }
+
+     /* repl() does not return */
+     if (argc == 0 || config.interactive == 1) repl();
+
+     /* Reserve one extra slot for the argument optionally read from
+      * standard input, but convert only the argc real arguments:
+      * argv[argc] is the NULL terminator of the vector, not a string. */
+     argvcopy = zmalloc(sizeof(char*)*(argc+1));
+     for (j = 0; j < argc; j++)
+         argvcopy[j] = sdsnew(argv[j]);
+     argvcopy[argc] = NULL;
+
+     if (config.argn_from_stdin) {
+         sds lastarg = readArgFromStdin();
+         argvcopy[argc] = lastarg;
+         argc++;
+     }
+     return cliSendCommand(argc, argvcopy, config.repeat);
+ }
diff --git a/src/redis.c b/src/redis.c
new file mode 100644
index 000000000..5f539216f
--- /dev/null
+++ b/src/redis.c
@@ -0,0 +1,1516 @@
+/*
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "redis.h"
+
+#ifdef HAVE_BACKTRACE
+#include <execinfo.h>
+#include <ucontext.h>
+#endif /* HAVE_BACKTRACE */
+
+#include <time.h>
+#include <signal.h>
+#include <sys/wait.h>
+#include <errno.h>
+#include <assert.h>
+#include <ctype.h>
+#include <stdarg.h>
+#include <inttypes.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/uio.h>
+#include <limits.h>
+#include <float.h>
+#include <math.h>
+#include <pthread.h>
+
+/* Our shared "common" objects */
+
+struct sharedObjectsStruct shared;
+
+ /* Global vars that are actually used as constants. The following double
+ * values are used for double on-disk serialization, and are initialized
+ * at runtime to avoid strange compiler optimizations. */
+
+double R_Zero, R_PosInf, R_NegInf, R_Nan;
+
+/*================================= Globals ================================= */
+
+/* Global vars */
+struct redisServer server; /* server global state */
+struct redisCommand *commandTable;
+ /* Static table of the commands the server knows, one struct redisCommand
+ * initializer per row. As written here each row holds: the command name,
+ * the function implementing it, an arity, a mask of REDIS_CMD_* flags, an
+ * optional hook (NULL for every command except zunionstore, zinterstore
+ * and exec, which supply a *BlockClientOnSwappedKeys function) and three
+ * integers.
+ *
+ * NOTE(review): a negative arity presumably means "at least N arguments"
+ * and the three trailing integers look like the first key position, the
+ * last key position and the key step -- confirm against the definition of
+ * struct redisCommand in redis.h.
+ *
+ * Note that "smembers" is served by sinterCommand. */
+ struct redisCommand readonlyCommandTable[] = {
+ {"get",getCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"set",setCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
+ {"setnx",setnxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
+ {"setex",setexCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,0,0,0},
+ {"append",appendCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"substr",substrCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"del",delCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"exists",existsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"incr",incrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"decr",decrCommand,2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"mget",mgetCommand,-2,REDIS_CMD_INLINE,NULL,1,-1,1},
+ {"rpush",rpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"lpush",lpushCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"rpushx",rpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"lpushx",lpushxCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"linsert",linsertCommand,5,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"rpop",rpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"lpop",lpopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"brpop",brpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"blpop",blpopCommand,-3,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"llen",llenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"lindex",lindexCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"lset",lsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"lrange",lrangeCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"ltrim",ltrimCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"lrem",lremCommand,4,REDIS_CMD_BULK,NULL,1,1,1},
+ {"rpoplpush",rpoplpushcommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,2,1},
+ {"sadd",saddCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"srem",sremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"smove",smoveCommand,4,REDIS_CMD_BULK,NULL,1,2,1},
+ {"sismember",sismemberCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"scard",scardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"spop",spopCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"srandmember",srandmemberCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"sinter",sinterCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
+ {"sinterstore",sinterstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
+ {"sunion",sunionCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
+ {"sunionstore",sunionstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
+ {"sdiff",sdiffCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,-1,1},
+ {"sdiffstore",sdiffstoreCommand,-3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,2,-1,1},
+ {"smembers",sinterCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zadd",zaddCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"zincrby",zincrbyCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"zrem",zremCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"zremrangebyscore",zremrangebyscoreCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zremrangebyrank",zremrangebyrankCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zunionstore",zunionstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
+ {"zinterstore",zinterstoreCommand,-4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,zunionInterBlockClientOnSwappedKeys,0,0,0},
+ {"zrange",zrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zrangebyscore",zrangebyscoreCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zcount",zcountCommand,4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zrevrange",zrevrangeCommand,-4,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zcard",zcardCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"zscore",zscoreCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"zrank",zrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"zrevrank",zrevrankCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"hset",hsetCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"hsetnx",hsetnxCommand,4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"hget",hgetCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"hmset",hmsetCommand,-4,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"hmget",hmgetCommand,-3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"hincrby",hincrbyCommand,4,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"hdel",hdelCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"hlen",hlenCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"hkeys",hkeysCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"hvals",hvalsCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"hgetall",hgetallCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"hexists",hexistsCommand,3,REDIS_CMD_BULK,NULL,1,1,1},
+ {"incrby",incrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"decrby",decrbyCommand,3,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"getset",getsetCommand,3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"mset",msetCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
+ {"msetnx",msetnxCommand,-3,REDIS_CMD_BULK|REDIS_CMD_DENYOOM,NULL,1,-1,2},
+ {"randomkey",randomkeyCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"select",selectCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"move",moveCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"rename",renameCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"renamenx",renamenxCommand,3,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"expire",expireCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"expireat",expireatCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"keys",keysCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"dbsize",dbsizeCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"auth",authCommand,2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"ping",pingCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"echo",echoCommand,2,REDIS_CMD_BULK,NULL,0,0,0},
+ {"save",saveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"bgsave",bgsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"bgrewriteaof",bgrewriteaofCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"shutdown",shutdownCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"lastsave",lastsaveCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"type",typeCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"multi",multiCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"exec",execCommand,1,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,execBlockClientOnSwappedKeys,0,0,0},
+ {"discard",discardCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"sync",syncCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"flushdb",flushdbCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"flushall",flushallCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"sort",sortCommand,-2,REDIS_CMD_INLINE|REDIS_CMD_DENYOOM,NULL,1,1,1},
+ {"info",infoCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"monitor",monitorCommand,1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"ttl",ttlCommand,2,REDIS_CMD_INLINE,NULL,1,1,1},
+ {"slaveof",slaveofCommand,3,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"debug",debugCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"config",configCommand,-2,REDIS_CMD_BULK,NULL,0,0,0},
+ {"subscribe",subscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"unsubscribe",unsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"psubscribe",psubscribeCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"punsubscribe",punsubscribeCommand,-1,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"publish",publishCommand,3,REDIS_CMD_BULK|REDIS_CMD_FORCE_REPLICATION,NULL,0,0,0},
+ {"watch",watchCommand,-2,REDIS_CMD_INLINE,NULL,0,0,0},
+ {"unwatch",unwatchCommand,1,REDIS_CMD_INLINE,NULL,0,0,0}
+ };
+
+/*============================ Utility functions ============================ */
+
+ /* Write a printf-style message to the server log: the file named by
+  * server.logfile (opened in append mode for this message only), or
+  * standard output when no log file is configured.
+  *
+  * Messages whose level is below server.verbosity are dropped. Every line
+  * is prefixed with the process id, a timestamp and one character taken
+  * from ".-*#" according to the level, and a newline is appended. */
+ void redisLog(int level, const char *fmt, ...) {
+     va_list ap;
+     FILE *fp;
+     char *c = ".-*#";
+     char buf[64];
+     time_t now;
+
+     /* Decide first: a filtered message must not cost an open/close of
+      * the log file. */
+     if (level < server.verbosity) return;
+
+     fp = (server.logfile == NULL) ? stdout : fopen(server.logfile,"a");
+     if (!fp) return;
+
+     now = time(NULL);
+     strftime(buf,64,"%d %b %H:%M:%S",localtime(&now));
+     fprintf(fp,"[%d] %s %c ",(int)getpid(),buf,c[level]);
+
+     va_start(ap, fmt);
+     vfprintf(fp, fmt, ap);
+     va_end(ap);
+
+     fprintf(fp,"\n");
+     fflush(fp);
+
+     if (server.logfile) fclose(fp);
+ }
+
+/* Redis generally does not try to recover from out of memory conditions
+ * when allocating objects or strings, it is not clear if it will be possible
+ * to report this condition to the client since the networking layer itself
+ * is based on heap allocation for send buffers, so we simply abort.
+ * At least the code will be simpler to read... */
+ /* Log an out of memory condition as a warning, tagged with `msg`, wait one
+  * second and abort the process. Never returns. */
+ void oom(const char *msg) {
+     /* no trailing newline here: redisLog() appends one itself */
+     redisLog(REDIS_WARNING, "%s: Out of memory",msg);
+     sleep(1);
+     abort();
+ }
+
+/*====================== Hash table type implementation ==================== */
+
+ /* This is a hash table type that uses the SDS dynamic strings library as
+ * keys and Redis objects as values (objects can hold SDS strings,
+ * lists, sets). */
+
+ /* Destructor for dict entries that are plain zmalloc-ed blocks: release
+  * `val` with zfree(). `privdata` is not used. */
+ void dictVanillaFree(void *privdata, void *val) {
+     zfree(val);
+     DICT_NOTUSED(privdata);
+ }
+
+ /* Destructor for dict entries that are lists: release `val` with
+  * listRelease(). `privdata` is not used. */
+ void dictListDestructor(void *privdata, void *val) {
+     list *l = (list*)val;
+
+     DICT_NOTUSED(privdata);
+     listRelease(l);
+ }
+
+ /* Compare two sds keys: returns 1 when they have the same length and the
+  * same bytes, 0 otherwise. `privdata` is not used. */
+ int dictSdsKeyCompare(void *privdata, const void *key1,
+         const void *key2) {
+     int len1 = sdslen((sds)key1);
+     int len2 = sdslen((sds)key2);
+
+     DICT_NOTUSED(privdata);
+
+     /* different lengths can never match: skip the byte comparison */
+     return len1 == len2 && memcmp(key1, key2, len1) == 0;
+ }
+
+ /* Destructor for dict entries that are Redis objects: drop one reference
+  * with decrRefCount(). A NULL `val` is ignored, because values of swapped
+  * out keys are set to NULL. `privdata` is not used. */
+ void dictRedisObjectDestructor(void *privdata, void *val) {
+     DICT_NOTUSED(privdata);
+
+     if (val != NULL) {
+         decrRefCount(val);
+     }
+ }
+
+ /* Destructor for dict entries that are sds strings: release `val` with
+  * sdsfree(). `privdata` is not used. */
+ void dictSdsDestructor(void *privdata, void *val) {
+     sdsfree(val);
+     DICT_NOTUSED(privdata);
+ }
+
+ /* Compare two keys that are Redis objects by comparing the sds strings
+  * their `ptr` fields point to (see dictSdsKeyCompare()). */
+ int dictObjKeyCompare(void *privdata, const void *key1,
+         const void *key2) {
+     const robj *a = key1;
+     const robj *b = key2;
+
+     return dictSdsKeyCompare(privdata,a->ptr,b->ptr);
+ }
+
+ /* Hash a key that is a Redis object by hashing the sds string its `ptr`
+  * field points to. */
+ unsigned int dictObjHash(const void *key) {
+     const robj *obj = key;
+     void *data = obj->ptr;
+
+     return dictGenHashFunction(data, sdslen((sds)data));
+ }
+
+ /* Hash a key that is an sds string, over its whole length. */
+ unsigned int dictSdsHash(const void *key) {
+     char *s = (char*)key;
+
+     return dictGenHashFunction((unsigned char*)s, sdslen(s));
+ }
+
+ /* Compare two keys that are Redis objects of any encoding.
+  *
+  * Two integer encoded objects are equal when their `ptr` fields are
+  * equal. In every other case both objects are decoded and their strings
+  * are compared; the decoded objects are released before returning. */
+ int dictEncObjKeyCompare(void *privdata, const void *key1,
+         const void *key2) {
+     robj *a = (robj*) key1, *b = (robj*) key2;
+     int equal;
+
+     if (a->encoding != REDIS_ENCODING_INT ||
+         b->encoding != REDIS_ENCODING_INT) {
+         a = getDecodedObject(a);
+         b = getDecodedObject(b);
+         equal = dictSdsKeyCompare(privdata,a->ptr,b->ptr);
+         decrRefCount(a);
+         decrRefCount(b);
+         return equal;
+     }
+
+     /* both integer encoded: the value is stored in the pointer itself */
+     return a->ptr == b->ptr;
+ }
+
+ /* Hash a key that is a Redis object of any encoding.
+  *
+  * Raw objects are hashed directly, integer encoded objects through their
+  * decimal representation produced by ll2string(), and any other encoding
+  * through a temporary decoded copy. */
+ unsigned int dictEncObjHash(const void *key) {
+     robj *obj = (robj*) key;
+     unsigned int hash;
+
+     if (obj->encoding == REDIS_ENCODING_RAW) {
+         return dictGenHashFunction(obj->ptr, sdslen((sds)obj->ptr));
+     }
+
+     if (obj->encoding == REDIS_ENCODING_INT) {
+         char digits[32];
+         int len = ll2string(digits,32,(long)obj->ptr);
+
+         return dictGenHashFunction((unsigned char*)digits, len);
+     }
+
+     obj = getDecodedObject(obj);
+     hash = dictGenHashFunction(obj->ptr, sdslen((sds)obj->ptr));
+     decrRefCount(obj);
+     return hash;
+ }
+
+/* Sets type: keys are Redis objects hashed and compared in an encoding-aware
+ * way (see dictEncObjHash() / dictEncObjKeyCompare()). No value destructor
+ * is installed. */
+dictType setDictType = {
+ dictEncObjHash, /* hash function (handles encoded objects) */
+ NULL, /* key dup: none */
+ NULL, /* val dup: none */
+ dictEncObjKeyCompare, /* key compare (handles encoded objects) */
+ dictRedisObjectDestructor, /* key destructor: decrRefCount() */
+ NULL /* val destructor: none */
+};
+
+/* Sorted sets hash (note: a skiplist is used in addition to the hash table).
+ * Keys are Redis objects; values are released with zfree(). */
+dictType zsetDictType = {
+ dictEncObjHash, /* hash function (handles encoded objects) */
+ NULL, /* key dup: none */
+ NULL, /* val dup: none */
+ dictEncObjKeyCompare, /* key compare (handles encoded objects) */
+ dictRedisObjectDestructor, /* key destructor: decrRefCount() */
+ dictVanillaFree /* val destructor of malloc(sizeof(double)) */
+};
+
+/* Db->dict, keys are sds strings, vals are Redis objects. Both are released
+ * by the destructors installed here. */
+dictType dbDictType = {
+ dictSdsHash, /* hash function */
+ NULL, /* key dup: none */
+ NULL, /* val dup: none */
+ dictSdsKeyCompare, /* key compare */
+ dictSdsDestructor, /* key destructor: sdsfree() */
+ dictRedisObjectDestructor /* val destructor: decrRefCount(), NULL is ignored */
+};
+
+/* Db->expires: keys are sds strings. No destructor is installed for keys or
+ * values, so this table never frees what it references.
+ * NOTE(review): presumably the key strings are the ones owned by Db->dict
+ * and the values are expire times stored in the pointer itself - confirm
+ * against the callers. */
+dictType keyptrDictType = {
+ dictSdsHash, /* hash function */
+ NULL, /* key dup: none */
+ NULL, /* val dup: none */
+ dictSdsKeyCompare, /* key compare */
+ NULL, /* key destructor: none */
+ NULL /* val destructor: none */
+};
+
+/* Hash type hash table (note that small hashes are represented with zipmaps).
+ * Both fields (keys) and values are Redis objects. */
+dictType hashDictType = {
+ dictEncObjHash, /* hash function (handles encoded objects) */
+ NULL, /* key dup: none */
+ NULL, /* val dup: none */
+ dictEncObjKeyCompare, /* key compare (handles encoded objects) */
+ dictRedisObjectDestructor, /* key destructor: decrRefCount() */
+ dictRedisObjectDestructor /* val destructor: decrRefCount() */
+};
+
+/* Keylist hash table type has unencoded redis objects as keys and
+ * lists as values. It's used for blocking operations (BLPOP) and to
+ * map swapped keys to a list of clients waiting for these keys to be loaded. */
+dictType keylistDictType = {
+ dictObjHash, /* hash function (keys must not be encoded) */
+ NULL, /* key dup: none */
+ NULL, /* val dup: none */
+ dictObjKeyCompare, /* key compare (keys must not be encoded) */
+ dictRedisObjectDestructor, /* key destructor: decrRefCount() */
+ dictListDestructor /* val destructor: listRelease() */
+};
+
+/* Return non-zero when the hash table is a candidate for resizing: it has
+ * at least one slot and one element, more than DICT_HT_INITIAL_SIZE slots,
+ * and a fill percentage (used*100/size) below REDIS_HT_MINFILL. */
+int htNeedsResize(dict *dict) {
+    long long slots = dictSlots(dict);
+    long long entries = dictSize(dict);
+
+    if (!slots || !entries) return 0;
+    if (slots <= DICT_HT_INITIAL_SIZE) return 0;
+    return entries*100/slots < REDIS_HT_MINFILL;
+}
+
+/* For every database, call dictResize() on the main dictionary and on the
+ * expires dictionary when htNeedsResize() reports that the percentage of
+ * used slots dropped under REDIS_HT_MINFILL, in order to save memory. */
+void tryResizeHashTables(void) {
+    int dbid;
+
+    for (dbid = 0; dbid < server.dbnum; dbid++) {
+        redisDb *db = server.db+dbid;
+
+        if (htNeedsResize(db->dict)) dictResize(db->dict);
+        if (htNeedsResize(db->expires)) dictResize(db->expires);
+    }
+}
+
+/* The hash table implementation rehashes incrementally while the table is
+ * read or written, so on an idle server a table could stay split in two for
+ * a long time. To avoid that, every serverCron() iteration spends one
+ * millisecond rehashing the first database whose main dictionary is in the
+ * middle of a rehash. At most one database is processed per call. */
+void incrementallyRehash(void) {
+    int dbid = 0;
+
+    /* Look for the first database that is currently rehashing. */
+    while (dbid < server.dbnum && !dictIsRehashing(server.db[dbid].dict))
+        dbid++;
+
+    /* The whole millisecond of this loop goes to that database, if any. */
+    if (dbid < server.dbnum)
+        dictRehashMilliseconds(server.db[dbid].dict,1);
+}
+
+/* Called once a background process of some kind terminates. Hash tables are
+ * not resized while a child exists, in order to play well with copy-on-write
+ * (a resize would cause lots of memory pages to be copied). This function
+ * tells dict.c whether resizing is allowed, depending on whether a
+ * background save or AOF rewrite child is currently running. */
+void updateDictResizePolicy(void) {
+    int child_running = server.bgsavechildpid != -1 ||
+                        server.bgrewritechildpid != -1;
+
+    if (child_running)
+        dictDisableResize();
+    else
+        dictEnableResize();
+}
+
+/* ======================= Cron: called every 100 ms ======================== */
+
+int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { /*
+ * Periodic housekeeping callback, registered as a time event by initServer().
+ * In order it: refreshes the cached clocks, honours a pending shutdown
+ * request, logs keyspace and client statistics, resizes/rehashes the hash
+ * tables, closes timed out clients, reaps a finished background child or
+ * starts a background save, expires volatile keys, swaps objects out when
+ * VM is over its memory limit, and tries to connect to the master.
+ * The three arguments are unused. Always returns 100 (the section banner
+ * documents the cron as being called every 100 ms). */
+ int j, loops = server.cronloops++; /* 'loops' is the iteration count before the increment */
+ REDIS_NOTUSED(eventLoop);
+ REDIS_NOTUSED(id);
+ REDIS_NOTUSED(clientData);
+
+ /* We take a cached value of the unix time in the global state because
+ * with virtual memory and aging we need to store the current time
+ * in objects at every object access, and accuracy is not needed.
+ * To access a global var is faster than calling time(NULL) */
+ server.unixtime = time(NULL);
+ /* We have just 21 bits per object for LRU information.
+ * So we use an (eventually wrapping) LRU clock with minutes resolution.
+ *
+ * When we need to select what object to swap, we compute the minimum
+ * time distance between the current lruclock and the object last access
+ * lruclock info. Even if clocks will wrap on overflow, there is
+ * the interesting property that we are sure that at least
+ * ABS(A-B) minutes passed between current time and timestamp B.
+ *
+ * This is not precise but we don't need at all precision, but just
+ * something statistically reasonable.
+ */
+ server.lruclock = (time(NULL)/60)&((1<<21)-1);
+
+ /* We received a SIGTERM, shutting down here in a safe way, as it is
+ * not ok doing so inside the signal handler. */
+ if (server.shutdown_asap) {
+ if (prepareForShutdown() == REDIS_OK) exit(0);
+ redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
+ }
+
+ /* Show some info about non-empty databases */
+ for (j = 0; j < server.dbnum; j++) {
+ long long size, used, vkeys;
+
+ size = dictSlots(server.db[j].dict);
+ used = dictSize(server.db[j].dict);
+ vkeys = dictSize(server.db[j].expires);
+ if (!(loops % 50) && (used || vkeys)) { /* once every 50 iterations */
+ redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
+ /* dictPrintStats(server.dict); */
+ }
+ }
+
+ /* We don't want to resize the hash tables while a background saving
+ * is in progress: the saving child is created using fork() that is
+ * implemented with a copy-on-write semantic in most modern systems, so
+ * if we resize the HT while there is the saving child at work actually
+ * a lot of memory movements in the parent will cause a lot of pages
+ * copied. */
+ if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
+ if (!(loops % 10)) tryResizeHashTables(); /* once every 10 iterations */
+ if (server.activerehashing) incrementallyRehash();
+ }
+
+ /* Show information about connected clients */
+ if (!(loops % 50)) {
+ redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
+ listLength(server.clients)-listLength(server.slaves),
+ listLength(server.slaves),
+ zmalloc_used_memory());
+ }
+
+ /* Close connections of timedout clients. The check runs once every 100
+ * iterations when a max idle time is set, and on every iteration while
+ * there are clients blocked in BLPOP. */
+ if ((server.maxidletime && !(loops % 100)) || server.blpop_blocked_clients)
+ closeTimedoutClients();
+
+ /* Check if a background saving or AOF rewrite in progress terminated */
+ if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
+ int statloc;
+ pid_t pid;
+
+ if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) { /* NOTE(review): every
+ * non-zero return that is not bgsavechildpid, including -1 on
+ * error (statloc not set), is handled as a finished AOF rewrite.
+ * Confirm that this is intended. */
+ if (pid == server.bgsavechildpid) {
+ backgroundSaveDoneHandler(statloc);
+ } else {
+ backgroundRewriteDoneHandler(statloc);
+ }
+ updateDictResizePolicy();
+ }
+ } else {
+ /* If there is not a background saving in progress check if
+ * we have to save now */
+ time_t now = time(NULL);
+ for (j = 0; j < server.saveparamslen; j++) {
+ struct saveparam *sp = server.saveparams+j;
+
+ /* The first save point that is satisfied triggers the save. */
+ if (server.dirty >= sp->changes &&
+ now-server.lastsave > sp->seconds) {
+ redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
+ sp->changes, sp->seconds);
+ rdbSaveBackground(server.dbfilename);
+ break;
+ }
+ }
+ }
+
+ /* Try to expire a few timed out keys. The algorithm used is adaptive and
+ * will use few CPU cycles if there are few expiring keys, otherwise
+ * it will get more aggressive to avoid that too much memory is used by
+ * keys that can be removed from the keyspace. */
+ for (j = 0; j < server.dbnum; j++) {
+ int expired;
+ redisDb *db = server.db+j;
+
+ /* Continue to expire if at the end of the cycle more than 25%
+ * of the keys were expired. */
+ do {
+ long num = dictSize(db->expires);
+ time_t now = time(NULL);
+
+ expired = 0;
+ /* Sample at most REDIS_EXPIRELOOKUPS_PER_CRON random keys. */
+ if (num > REDIS_EXPIRELOOKUPS_PER_CRON)
+ num = REDIS_EXPIRELOOKUPS_PER_CRON;
+ while (num--) {
+ dictEntry *de;
+ time_t t;
+
+ if ((de = dictGetRandomKey(db->expires)) == NULL) break;
+ t = (time_t) dictGetEntryVal(de); /* the expire time is stored in the value pointer */
+ if (now > t) {
+ sds key = dictGetEntryKey(de);
+ robj *keyobj = createStringObject(key,sdslen(key));
+
+ dbDelete(db,keyobj);
+ decrRefCount(keyobj);
+ expired++;
+ server.stat_expiredkeys++;
+ }
+ }
+ } while (expired > REDIS_EXPIRELOOKUPS_PER_CRON/4);
+ }
+
+ /* Swap a few keys on disk if we are over the memory limit and VM
+ * is enabled. Try to free objects from the free list first. */
+ if (vmCanSwapOut()) {
+ while (server.vm_enabled && zmalloc_used_memory() >
+ server.vm_max_memory)
+ {
+ int retval;
+
+ if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
+ retval = (server.vm_max_threads == 0) ?
+ vmSwapOneObjectBlocking() :
+ vmSwapOneObjectThreaded();
+ /* Warn, at most once every 300 iterations, when nothing could
+ * be swapped and the limit is exceeded by more than 10%. */
+ if (retval == REDIS_ERR && !(loops % 300) &&
+ zmalloc_used_memory() >
+ (server.vm_max_memory+server.vm_max_memory/10))
+ {
+ redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
+ }
+ /* Note that when using threaded I/O we free just one object,
+ * because anyway when the I/O thread in charge to swap this
+ * object out will finish, the handler of completed jobs
+ * will try to swap more objects if we are still out of memory. */
+ if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
+ }
+ }
+
+ /* Check if we should connect to a MASTER (tried once every 10 iterations) */
+ if (server.replstate == REDIS_REPL_CONNECT && !(loops % 10)) {
+ redisLog(REDIS_NOTICE,"Connecting to MASTER...");
+ if (syncWithMaster() == REDIS_OK) {
+ redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync succeeded");
+ if (server.appendonly) rewriteAppendOnlyFileBackground();
+ }
+ }
+ return 100;
+}
+
+/* Called every time Redis is about to enter the main loop of the event
+ * driven library, that is, right before sleeping waiting for ready file
+ * descriptors.
+ *
+ * When VM is enabled, every client found in server.io_ready_clients is
+ * resumed: it is removed from the list, its read handler is installed
+ * again and the command it was blocked on is executed. After that the
+ * AOF buffer is flushed on disk. */
+void beforeSleep(struct aeEventLoop *eventLoop) {
+    listIter iter;
+    listNode *node;
+
+    REDIS_NOTUSED(eventLoop);
+
+    /* Awake clients that got all the swapped keys they requested */
+    if (server.vm_enabled && listLength(server.io_ready_clients)) {
+        listRewind(server.io_ready_clients,&iter);
+        for (node = listNext(&iter); node; node = listNext(&iter)) {
+            redisClient *c = node->value;
+            struct redisCommand *cmd;
+
+            /* The client is no longer waiting for I/O: unlink it from the
+             * ready list and start reading from its socket again. */
+            listDelNode(server.io_ready_clients,node);
+            c->flags &= (~REDIS_IO_WAIT);
+            server.vm_blocked_clients--;
+            aeCreateFileEvent(server.el, c->fd, AE_READABLE,
+                readQueryFromClient, c);
+
+            /* Run the command the client was blocked on. */
+            cmd = lookupCommand(c->argv[0]->ptr);
+            redisAssert(cmd != NULL);
+            call(c,cmd);
+            resetClient(c);
+
+            /* More commands may already be waiting in the input buffer. */
+            if (c->querybuf && sdslen(c->querybuf) > 0)
+                processInputBuffer(c);
+        }
+    }
+
+    /* Write the AOF buffer on disk */
+    flushAppendOnlyFile();
+}
+
+/* =========================== Server initialization ======================== */
+
+void createSharedObjects(void) { /*
+ * Fill the global 'shared' structure with the objects that are created once
+ * and then used for common replies: protocol fragments, error messages,
+ * "select N" commands, Pub/Sub bulk headers and the first
+ * REDIS_SHARED_INTEGERS integers. */
+ int j;
+
+ shared.crlf = createObject(REDIS_STRING,sdsnew("\r\n"));
+ shared.ok = createObject(REDIS_STRING,sdsnew("+OK\r\n"));
+ shared.err = createObject(REDIS_STRING,sdsnew("-ERR\r\n"));
+ shared.emptybulk = createObject(REDIS_STRING,sdsnew("$0\r\n\r\n"));
+ shared.czero = createObject(REDIS_STRING,sdsnew(":0\r\n"));
+ shared.cone = createObject(REDIS_STRING,sdsnew(":1\r\n"));
+ shared.cnegone = createObject(REDIS_STRING,sdsnew(":-1\r\n"));
+ shared.nullbulk = createObject(REDIS_STRING,sdsnew("$-1\r\n"));
+ shared.nullmultibulk = createObject(REDIS_STRING,sdsnew("*-1\r\n"));
+ shared.emptymultibulk = createObject(REDIS_STRING,sdsnew("*0\r\n"));
+ shared.pong = createObject(REDIS_STRING,sdsnew("+PONG\r\n"));
+ shared.queued = createObject(REDIS_STRING,sdsnew("+QUEUED\r\n"));
+ shared.wrongtypeerr = createObject(REDIS_STRING,sdsnew(
+ "-ERR Operation against a key holding the wrong kind of value\r\n"));
+ shared.nokeyerr = createObject(REDIS_STRING,sdsnew(
+ "-ERR no such key\r\n"));
+ shared.syntaxerr = createObject(REDIS_STRING,sdsnew(
+ "-ERR syntax error\r\n"));
+ shared.sameobjecterr = createObject(REDIS_STRING,sdsnew(
+ "-ERR source and destination objects are the same\r\n"));
+ shared.outofrangeerr = createObject(REDIS_STRING,sdsnew(
+ "-ERR index out of range\r\n"));
+ shared.space = createObject(REDIS_STRING,sdsnew(" "));
+ shared.colon = createObject(REDIS_STRING,sdsnew(":"));
+ shared.plus = createObject(REDIS_STRING,sdsnew("+"));
+ shared.select0 = createStringObject("select 0\r\n",10); /* the second argument is the literal length in bytes */
+ shared.select1 = createStringObject("select 1\r\n",10);
+ shared.select2 = createStringObject("select 2\r\n",10);
+ shared.select3 = createStringObject("select 3\r\n",10);
+ shared.select4 = createStringObject("select 4\r\n",10);
+ shared.select5 = createStringObject("select 5\r\n",10);
+ shared.select6 = createStringObject("select 6\r\n",10);
+ shared.select7 = createStringObject("select 7\r\n",10);
+ shared.select8 = createStringObject("select 8\r\n",10);
+ shared.select9 = createStringObject("select 9\r\n",10);
+ shared.messagebulk = createStringObject("$7\r\nmessage\r\n",13); /* bulk header + payload, e.g. $7 = strlen("message") */
+ shared.pmessagebulk = createStringObject("$8\r\npmessage\r\n",14);
+ shared.subscribebulk = createStringObject("$9\r\nsubscribe\r\n",15);
+ shared.unsubscribebulk = createStringObject("$11\r\nunsubscribe\r\n",18);
+ shared.psubscribebulk = createStringObject("$10\r\npsubscribe\r\n",17);
+ shared.punsubscribebulk = createStringObject("$12\r\npunsubscribe\r\n",19);
+ shared.mbulk3 = createStringObject("*3\r\n",4);
+ shared.mbulk4 = createStringObject("*4\r\n",4);
+ for (j = 0; j < REDIS_SHARED_INTEGERS; j++) {
+ shared.integers[j] = createObject(REDIS_STRING,(void*)(long)j); /* the integer itself is stored in the pointer field */
+ shared.integers[j]->encoding = REDIS_ENCODING_INT;
+ }
+}
+
+void initServerConfig() { /*
+ * Load the compiled-in defaults into the global 'server' structure: network,
+ * logging, persistence (RDB save points and AOF), virtual memory, data
+ * structure encoding limits and replication, plus the R_* double constants. */
+ server.dbnum = REDIS_DEFAULT_DBNUM;
+ server.port = REDIS_SERVERPORT;
+ server.verbosity = REDIS_VERBOSE;
+ server.maxidletime = REDIS_MAXIDLETIME;
+ server.saveparams = NULL;
+ server.logfile = NULL; /* NULL = log on standard output */
+ server.bindaddr = NULL;
+ server.glueoutputbuf = 1;
+ server.daemonize = 0;
+ server.appendonly = 0;
+ server.appendfsync = APPENDFSYNC_EVERYSEC;
+ server.no_appendfsync_on_rewrite = 0;
+ server.lastfsync = time(NULL);
+ server.appendfd = -1;
+ server.appendseldb = -1; /* Make sure the first time will not match */
+ server.pidfile = zstrdup("/var/run/redis.pid");
+ server.dbfilename = zstrdup("dump.rdb");
+ server.appendfilename = zstrdup("appendonly.aof");
+ server.requirepass = NULL;
+ server.rdbcompression = 1;
+ server.activerehashing = 1;
+ server.maxclients = 0;
+ server.blpop_blocked_clients = 0;
+ server.maxmemory = 0;
+ server.vm_enabled = 0;
+ server.vm_swap_file = zstrdup("/tmp/redis-%p.vm");
+ server.vm_page_size = 256; /* 256 bytes per page */
+ server.vm_pages = 1024*1024*100; /* 104 millions of pages */
+ server.vm_max_memory = 1024LL*1024*1024*1; /* 1 GB of RAM */
+ server.vm_max_threads = 4;
+ server.vm_blocked_clients = 0;
+ server.hash_max_zipmap_entries = REDIS_HASH_MAX_ZIPMAP_ENTRIES;
+ server.hash_max_zipmap_value = REDIS_HASH_MAX_ZIPMAP_VALUE;
+ server.list_max_ziplist_entries = REDIS_LIST_MAX_ZIPLIST_ENTRIES;
+ server.list_max_ziplist_value = REDIS_LIST_MAX_ZIPLIST_VALUE;
+ server.shutdown_asap = 0;
+
+ resetServerSaveParams(); /* start from an empty list of save points */
+
+ appendServerSaveParams(60*60,1); /* save after 1 hour and 1 change */
+ appendServerSaveParams(300,100); /* save after 5 minutes and 100 changes */
+ appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */
+ /* Replication related */
+ server.isslave = 0;
+ server.masterauth = NULL;
+ server.masterhost = NULL;
+ server.masterport = 6379;
+ server.master = NULL;
+ server.replstate = REDIS_REPL_NONE;
+
+ /* Double constants initialization. The infinities and the NaN are
+ * obtained dividing by a zero stored in a variable (this assumes IEEE 754
+ * style floating point division - TODO confirm for the target platforms). */
+ R_Zero = 0.0;
+ R_PosInf = 1.0/R_Zero;
+ R_NegInf = -1.0/R_Zero;
+ R_Nan = R_Zero/R_Zero;
+}
+
+/* Create the runtime state of the server once the configuration is known:
+ * signal dispositions, client lists, shared objects, the event loop, the
+ * listening socket, the databases, Pub/Sub state and statistics.
+ * The serverCron() time event and the accept handler are registered here.
+ * When enabled, the append only file is opened and VM is initialized.
+ *
+ * Any fatal error (can't open /dev/null, can't listen on the TCP port,
+ * can't open the append only file) is logged and terminates the process
+ * with exit(1). */
+void initServer() {
+    int j;
+
+    signal(SIGHUP, SIG_IGN);
+    signal(SIGPIPE, SIG_IGN);
+    setupSigSegvAction();
+
+    server.devnull = fopen("/dev/null","w");
+    if (server.devnull == NULL) {
+        /* fopen() reports the reason of the failure in errno. The
+         * server.neterr buffer is only passed to anetTcpServer() below, so
+         * it holds no meaningful message at this point. */
+        redisLog(REDIS_WARNING, "Can't open /dev/null: %s", strerror(errno));
+        exit(1);
+    }
+    server.clients = listCreate();
+    server.slaves = listCreate();
+    server.monitors = listCreate();
+    server.objfreelist = listCreate();
+    createSharedObjects();
+    server.el = aeCreateEventLoop();
+    server.db = zmalloc(sizeof(redisDb)*server.dbnum);
+    server.fd = anetTcpServer(server.neterr, server.port, server.bindaddr);
+    if (server.fd == -1) {
+        redisLog(REDIS_WARNING, "Opening TCP port: %s", server.neterr);
+        exit(1);
+    }
+    for (j = 0; j < server.dbnum; j++) {
+        server.db[j].dict = dictCreate(&dbDictType,NULL);
+        server.db[j].expires = dictCreate(&keyptrDictType,NULL);
+        server.db[j].blocking_keys = dictCreate(&keylistDictType,NULL);
+        server.db[j].watched_keys = dictCreate(&keylistDictType,NULL);
+        /* The table of keys being loaded is only needed with VM. */
+        if (server.vm_enabled)
+            server.db[j].io_keys = dictCreate(&keylistDictType,NULL);
+        server.db[j].id = j;
+    }
+    server.pubsub_channels = dictCreate(&keylistDictType,NULL);
+    server.pubsub_patterns = listCreate();
+    listSetFreeMethod(server.pubsub_patterns,freePubsubPattern);
+    listSetMatchMethod(server.pubsub_patterns,listMatchPubsubPattern);
+    server.cronloops = 0;
+    server.bgsavechildpid = -1;
+    server.bgrewritechildpid = -1;
+    server.bgrewritebuf = sdsempty();
+    server.aofbuf = sdsempty();
+    server.lastsave = time(NULL);
+    server.dirty = 0;
+    server.stat_numcommands = 0;
+    server.stat_numconnections = 0;
+    server.stat_expiredkeys = 0;
+    server.stat_starttime = time(NULL);
+    server.unixtime = time(NULL);
+    aeCreateTimeEvent(server.el, 1, serverCron, NULL, NULL);
+    if (aeCreateFileEvent(server.el, server.fd, AE_READABLE,
+        acceptHandler, NULL) == AE_ERR) oom("creating file event");
+
+    if (server.appendonly) {
+        server.appendfd = open(server.appendfilename,O_WRONLY|O_APPEND|O_CREAT,0644);
+        if (server.appendfd == -1) {
+            redisLog(REDIS_WARNING, "Can't open the append-only file: %s",
+                strerror(errno));
+            exit(1);
+        }
+    }
+
+    if (server.vm_enabled) vmInit();
+}
+
+/* Comparison callback for qsort() and bsearch(): orders two
+ * struct redisCommand entries by name, ignoring case. */
+int qsortRedisCommands(const void *r1, const void *r2) {
+    struct redisCommand *a = (struct redisCommand*)r1;
+    struct redisCommand *b = (struct redisCommand*)r2;
+
+    return strcasecmp(a->name, b->name);
+}
+
+/* Build the sorted command table used by lookupCommand(): the read-only
+ * table is copied into freshly allocated memory and the copy is sorted by
+ * command name (case insensitive) so that it can be searched with
+ * bsearch(). The process is aborted via oom() if the copy can't be
+ * allocated. */
+void sortCommandTable() {
+    size_t numcommands =
+        sizeof(readonlyCommandTable)/sizeof(struct redisCommand);
+
+    /* Copy and sort the read-only version of the command table */
+    commandTable = (struct redisCommand*)malloc(sizeof(readonlyCommandTable));
+    /* malloc() may fail: don't let memcpy() write through a NULL pointer. */
+    if (commandTable == NULL) oom("allocating the command table");
+    memcpy(commandTable,readonlyCommandTable,sizeof(readonlyCommandTable));
+    qsort(commandTable,numcommands,
+        sizeof(struct redisCommand),qsortRedisCommands);
+}
+
+/* ====================== Commands lookup and execution ===================== */
+
+/* Look up a command by name (case insensitive) in the sorted command table
+ * with a binary search. Returns the matching entry, or NULL when the name
+ * is unknown. */
+struct redisCommand *lookupCommand(char *name) {
+    /* Only the name field of the search key is used by the comparison. */
+    struct redisCommand key = {name,NULL,0,0,NULL,0,0,0};
+    size_t numcommands =
+        sizeof(readonlyCommandTable)/sizeof(struct redisCommand);
+
+    return bsearch(&key,commandTable,numcommands,
+        sizeof(struct redisCommand),qsortRedisCommands);
+}
+
+/* call() is the core of Redis execution of a command: it runs the command
+ * implementation and then propagates the command where needed.
+ *
+ * The difference of server.dirty before and after the execution tells
+ * whether the command changed the dataset. A command that did so is
+ * appended to the AOF (when enabled). It is sent to the slaves when it
+ * changed the dataset or has the REDIS_CMD_FORCE_REPLICATION flag.
+ * Monitors, if any, receive every command. */
+void call(redisClient *c, struct redisCommand *cmd) {
+    long long dirty_before = server.dirty;
+    long long changes;
+
+    cmd->proc(c);
+    changes = server.dirty-dirty_before;
+
+    if (server.appendonly && changes) {
+        feedAppendOnlyFile(cmd,c->db->id,c->argv,c->argc);
+    }
+    if ((changes || cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
+        listLength(server.slaves))
+    {
+        replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
+    }
+    if (listLength(server.monitors)) {
+        replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);
+    }
+    server.stat_numcommands++;
+}
+
+/* If this function gets called we already read a whole
+ * command, arguments are in the client argv/argc fields.
+ * processCommand() executes the command or prepares the
+ * server for a bulk read from the client.
+ *
+ * If 1 is returned the client is still alive and valid and
+ * other operations can be performed by the caller. Otherwise
+ * if 0 is returned the client was destroyed (i.e. after QUIT). */
+int processCommand(redisClient *c) {
+ struct redisCommand *cmd;
+
+ /* Free some memory if needed (maxmemory setting) */
+ if (server.maxmemory) freeMemoryIfNeeded();
+
+ /* Handle the multi bulk command type. This is an alternative protocol
+ * supported by Redis in order to receive commands that are composed of
+ * multiple binary-safe "bulk" arguments. The latency of processing is
+ * a bit higher but this allows things like multi-sets, so if this
+ * protocol is used only for MSET and similar commands this is a big win. */
+ if (c->multibulk == 0 && c->argc == 1 && ((char*)(c->argv[0]->ptr))[0] == '*') {
+ /* "*<count>" line: start of a multi bulk request. */
+ c->multibulk = atoi(((char*)c->argv[0]->ptr)+1);
+ if (c->multibulk <= 0) {
+ resetClient(c);
+ return 1;
+ } else {
+ decrRefCount(c->argv[c->argc-1]);
+ c->argc--;
+ return 1;
+ }
+ } else if (c->multibulk) {
+ if (c->bulklen == -1) {
+ /* We expect the "$<len>" line announcing the next argument. */
+ if (((char*)c->argv[0]->ptr)[0] != '$') {
+ addReplySds(c,sdsnew("-ERR multi bulk protocol error\r\n"));
+ resetClient(c);
+ return 1;
+ } else {
+ int bulklen = atoi(((char*)c->argv[0]->ptr)+1);
+ decrRefCount(c->argv[0]);
+ if (bulklen < 0 || bulklen > 1024*1024*1024) { /* arguments are limited to 1 GB */
+ c->argc--;
+ addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
+ resetClient(c);
+ return 1;
+ }
+ c->argc--;
+ c->bulklen = bulklen+2; /* add two bytes for CR+LF */
+ return 1;
+ }
+ } else {
+ /* A complete argument was read: move it from argv to the
+ * multi bulk argument vector. */
+ c->mbargv = zrealloc(c->mbargv,(sizeof(robj*))*(c->mbargc+1));
+ c->mbargv[c->mbargc] = c->argv[0];
+ c->mbargc++;
+ c->argc--;
+ c->multibulk--;
+ if (c->multibulk == 0) {
+ robj **auxargv;
+ int auxargc;
+
+ /* Here we need to swap the multi-bulk argc/argv with the
+ * normal argc/argv of the client structure. */
+ auxargv = c->argv;
+ c->argv = c->mbargv;
+ c->mbargv = auxargv;
+
+ auxargc = c->argc;
+ c->argc = c->mbargc;
+ c->mbargc = auxargc;
+
+ /* We need to set bulklen to something different than -1
+ * in order for the code below to process the command without
+ * to try to read the last argument of a bulk command as
+ * a special argument. */
+ c->bulklen = 0;
+ /* continue below and process the command */
+ } else {
+ /* More arguments to read: wait for the next "$<len>" line. */
+ c->bulklen = -1;
+ return 1;
+ }
+ }
+ }
+ /* -- end of multi bulk commands processing -- */
+
+ /* The QUIT command is handled as a special case. Normal command
+ * procs are unable to close the client connection safely */
+ if (!strcasecmp(c->argv[0]->ptr,"quit")) {
+ freeClient(c);
+ return 0;
+ }
+
+ /* Now lookup the command and check ASAP about trivial error conditions
+ * such wrong arity, bad command name and so forth. */
+ cmd = lookupCommand(c->argv[0]->ptr);
+ if (!cmd) {
+ addReplySds(c,
+ sdscatprintf(sdsempty(), "-ERR unknown command '%s'\r\n",
+ (char*)c->argv[0]->ptr));
+ resetClient(c);
+ return 1;
+ } else if ((cmd->arity > 0 && cmd->arity != c->argc) ||
+ (c->argc < -cmd->arity)) { /* a positive arity is an exact count, a negative one a minimum */
+ addReplySds(c,
+ sdscatprintf(sdsempty(),
+ "-ERR wrong number of arguments for '%s' command\r\n",
+ cmd->name));
+ resetClient(c);
+ return 1;
+ } else if (cmd->flags & REDIS_CMD_BULK && c->bulklen == -1) {
+ /* This is a bulk command, we have to read the last argument yet. */
+ int bulklen = atoi(c->argv[c->argc-1]->ptr);
+
+ decrRefCount(c->argv[c->argc-1]);
+ if (bulklen < 0 || bulklen > 1024*1024*1024) {
+ c->argc--;
+ addReplySds(c,sdsnew("-ERR invalid bulk write count\r\n"));
+ resetClient(c);
+ return 1;
+ }
+ c->argc--;
+ c->bulklen = bulklen+2; /* add two bytes for CR+LF */
+ /* It is possible that the bulk read is already in the
+ * buffer. Check this condition and handle it accordingly.
+ * This is just a fast path, alternative to call processInputBuffer().
+ * It's a good idea since the code is small and this condition
+ * happens most of the times. */
+ if ((signed)sdslen(c->querybuf) >= c->bulklen) {
+ c->argv[c->argc] = createStringObject(c->querybuf,c->bulklen-2);
+ c->argc++;
+ c->querybuf = sdsrange(c->querybuf,c->bulklen,-1);
+ } else {
+ /* Otherwise return... we still have to read the last argument
+ * from the socket. */
+ return 1;
+ }
+ }
+ /* Let's try to encode the bulk object to save space. */
+ if (cmd->flags & REDIS_CMD_BULK)
+ c->argv[c->argc-1] = tryObjectEncoding(c->argv[c->argc-1]);
+
+ /* Check if the user is authenticated */
+ if (server.requirepass && !c->authenticated && cmd->proc != authCommand) {
+ addReplySds(c,sdsnew("-ERR operation not permitted\r\n"));
+ resetClient(c);
+ return 1;
+ }
+
+ /* Handle the maxmemory directive */
+ if (server.maxmemory && (cmd->flags & REDIS_CMD_DENYOOM) &&
+ zmalloc_used_memory() > server.maxmemory)
+ {
+ addReplySds(c,sdsnew("-ERR command not allowed when used memory > 'maxmemory'\r\n"));
+ resetClient(c);
+ return 1;
+ }
+
+ /* Only allow SUBSCRIBE and UNSUBSCRIBE in the context of Pub/Sub */
+ if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
+ &&
+ cmd->proc != subscribeCommand && cmd->proc != unsubscribeCommand &&
+ cmd->proc != psubscribeCommand && cmd->proc != punsubscribeCommand) {
+ addReplySds(c,sdsnew("-ERR only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context\r\n"));
+ resetClient(c);
+ return 1;
+ }
+
+ /* Exec the command. Inside a MULTI block everything but EXEC, DISCARD,
+ * MULTI and WATCH is queued instead of being executed. */
+ if (c->flags & REDIS_MULTI &&
+ cmd->proc != execCommand && cmd->proc != discardCommand &&
+ cmd->proc != multiCommand && cmd->proc != watchCommand)
+ {
+ queueMultiCommand(c,cmd);
+ addReply(c,shared.queued);
+ } else {
+ /* With threaded VM the client may have to wait for swapped keys:
+ * in that case we return without resetting the client. */
+ if (server.vm_enabled && server.vm_max_threads > 0 &&
+ blockClientOnSwappedKeys(c,cmd)) return 1;
+ call(c,cmd);
+ }
+
+ /* Prepare the client for the next command */
+ resetClient(c);
+ return 1;
+}
+
+/*================================== Shutdown =============================== */
+
+/* Perform everything that has to be done before the process can exit.
+ *
+ * A live background saving child is killed and its temp file removed.
+ * Then the dataset is made durable: with the append only file enabled the
+ * AOF is fsync()ed, otherwise a synchronous RDB save is performed.
+ * Finally the VM swap file (when VM is enabled) and the pid file (when
+ * running as a daemon) are removed, whatever the persistence mode is.
+ *
+ * Returns REDIS_OK if the caller can exit, or REDIS_ERR if the RDB save
+ * failed: in that case no file is removed and the server should continue
+ * to operate. */
+int prepareForShutdown() {
+    redisLog(REDIS_WARNING,"User requested shutdown, saving DB...");
+    /* Kill the saving child if there is a background saving in progress.
+       We want to avoid race conditions, for instance our saving child may
+       overwrite the synchronous saving did by SHUTDOWN. */
+    if (server.bgsavechildpid != -1) {
+        redisLog(REDIS_WARNING,"There is a live saving child. Killing it!");
+        kill(server.bgsavechildpid,SIGKILL);
+        rdbRemoveTempFile(server.bgsavechildpid);
+    }
+    if (server.appendonly) {
+        /* Append only file: fsync() the AOF and exit */
+        aof_fsync(server.appendfd);
+    } else {
+        /* Snapshotting. Perform a SYNC SAVE and exit */
+        if (rdbSave(server.dbfilename) != REDIS_OK) {
+            /* Ooops.. error saving! The best we can do is to continue
+             * operating. Note that if there was a background saving process,
+             * in the next cron() Redis will be notified that the background
+             * saving aborted, handling special stuff like slaves pending for
+             * synchronization... */
+            redisLog(REDIS_WARNING,"Error trying to save the DB, can't exit");
+            return REDIS_ERR;
+        }
+        redisLog(REDIS_WARNING,"%zu bytes used at exit",zmalloc_used_memory());
+    }
+    /* Clean up the files that only make sense for a running instance. This
+     * is done after the save because we only get here when we are really
+     * going to exit, and it is done for both the persistence modes. */
+    if (server.vm_enabled) unlink(server.vm_swap_file);
+    if (server.daemonize) unlink(server.pidfile);
+    redisLog(REDIS_WARNING,"Server exit now, bye bye...");
+    return REDIS_OK;
+}
+
+/*================================== Commands =============================== */
+
+/* AUTH <password>: mark the client as authenticated when no password is
+ * configured or when the argument matches server.requirepass exactly.
+ * Replies +OK on success, an error otherwise (and the client is marked
+ * as not authenticated). */
+void authCommand(redisClient *c) {
+    int granted = !server.requirepass ||
+                  !strcmp(c->argv[1]->ptr, server.requirepass);
+
+    if (granted) {
+        c->authenticated = 1;
+        addReply(c,shared.ok);
+        return;
+    }
+    c->authenticated = 0;
+    addReplySds(c,sdscatprintf(sdsempty(),"-ERR invalid password\r\n"));
+}
+
+/* PING: reply with the shared "+PONG" object. */
+void pingCommand(redisClient *c)
+{
+    robj *reply = shared.pong;
+
+    addReply(c,reply);
+}
+
+/* ECHO <message>: send the first argument back to the client as a bulk
+ * reply. */
+void echoCommand(redisClient *c)
+{
+    robj *message = c->argv[1];
+
+    addReplyBulk(c,message);
+}
+
+/* Convert an amount of bytes into a human readable string in the form
+ * of 100B, 2G, 100M, 4K, and so forth.
+ *
+ * 's' is the destination buffer: the longest possible output is a 20 digit
+ * number followed by 'B' and the nul terminator, so 32 bytes are enough.
+ * 'n' is the amount of bytes to render.
+ *
+ * Amounts below 1024 are printed as plain bytes; larger ones are scaled by
+ * powers of 1024 and printed with two decimals and a K, M, G, T or P
+ * suffix. Amounts of 1024 P or more fall back to plain bytes, so 's' is
+ * always written whatever 'n' is. */
+void bytesToHuman(char *s, unsigned long long n) {
+    double d;
+
+    if (n < 1024) {
+        /* Bytes */
+        sprintf(s,"%lluB",n);
+    } else if (n < (1024*1024)) {
+        d = (double)n/(1024);
+        sprintf(s,"%.2fK",d);
+    } else if (n < (1024LL*1024*1024)) {
+        d = (double)n/(1024*1024);
+        sprintf(s,"%.2fM",d);
+    } else if (n < (1024LL*1024*1024*1024)) {
+        d = (double)n/(1024LL*1024*1024);
+        sprintf(s,"%.2fG",d);
+    } else if (n < (1024LL*1024*1024*1024*1024)) {
+        d = (double)n/(1024LL*1024*1024*1024);
+        sprintf(s,"%.2fT",d);
+    } else if (n < (1024LL*1024*1024*1024*1024*1024)) {
+        d = (double)n/(1024LL*1024*1024*1024*1024);
+        sprintf(s,"%.2fP",d);
+    } else {
+        /* Too big for our units: never leave the buffer untouched. */
+        sprintf(s,"%lluB",n);
+    }
+}
+
+/* Create the string returned by the INFO command. This is decoupled
+ * from the INFO command itself as we need to report the same information
+ * on memory corruption problems.
+ *
+ * The result is a newly created sds string made of "field:value\r\n" lines:
+ * a general section, a replication section when this instance has a master,
+ * a VM section when VM is enabled, and one "dbN" line for every database
+ * that is not empty. */
+sds genRedisInfoString(void) {
+ sds info;
+ time_t uptime = time(NULL)-server.stat_starttime;
+ int j;
+ char hmem[64];
+
+ bytesToHuman(hmem,zmalloc_used_memory());
+ /* General section. The arguments below must follow the exact order of
+ * the conversion specifiers in the format string. */
+ info = sdscatprintf(sdsempty(),
+ "redis_version:%s\r\n"
+ "redis_git_sha1:%s\r\n"
+ "redis_git_dirty:%d\r\n"
+ "arch_bits:%s\r\n"
+ "multiplexing_api:%s\r\n"
+ "process_id:%ld\r\n"
+ "uptime_in_seconds:%ld\r\n"
+ "uptime_in_days:%ld\r\n"
+ "connected_clients:%d\r\n"
+ "connected_slaves:%d\r\n"
+ "blocked_clients:%d\r\n"
+ "used_memory:%zu\r\n"
+ "used_memory_human:%s\r\n"
+ "changes_since_last_save:%lld\r\n"
+ "bgsave_in_progress:%d\r\n"
+ "last_save_time:%ld\r\n"
+ "bgrewriteaof_in_progress:%d\r\n"
+ "total_connections_received:%lld\r\n"
+ "total_commands_processed:%lld\r\n"
+ "expired_keys:%lld\r\n"
+ "hash_max_zipmap_entries:%zu\r\n"
+ "hash_max_zipmap_value:%zu\r\n"
+ "pubsub_channels:%ld\r\n"
+ "pubsub_patterns:%u\r\n"
+ "vm_enabled:%d\r\n"
+ "role:%s\r\n"
+ ,REDIS_VERSION,
+ redisGitSHA1(),
+ strtol(redisGitDirty(),NULL,10) > 0,
+ (sizeof(long) == 8) ? "64" : "32",
+ aeGetApiName(),
+ (long) getpid(),
+ uptime,
+ uptime/(3600*24),
+ listLength(server.clients)-listLength(server.slaves), /* slaves are not counted as clients */
+ listLength(server.slaves),
+ server.blpop_blocked_clients,
+ zmalloc_used_memory(),
+ hmem,
+ server.dirty,
+ server.bgsavechildpid != -1,
+ server.lastsave,
+ server.bgrewritechildpid != -1,
+ server.stat_numconnections,
+ server.stat_numcommands,
+ server.stat_expiredkeys,
+ server.hash_max_zipmap_entries,
+ server.hash_max_zipmap_value,
+ dictSize(server.pubsub_channels),
+ listLength(server.pubsub_patterns),
+ server.vm_enabled != 0,
+ server.masterhost == NULL ? "master" : "slave"
+ );
+ /* Replication section, only when a master host is configured. */
+ if (server.masterhost) {
+ info = sdscatprintf(info,
+ "master_host:%s\r\n"
+ "master_port:%d\r\n"
+ "master_link_status:%s\r\n"
+ "master_last_io_seconds_ago:%d\r\n"
+ ,server.masterhost,
+ server.masterport,
+ (server.replstate == REDIS_REPL_CONNECTED) ?
+ "up" : "down",
+ server.master ? ((int)(time(NULL)-server.master->lastinteraction)) : -1 /* -1 when there is no master client */
+ );
+ }
+ /* VM section. The values are read between lockThreadedIO() and
+ * unlockThreadedIO(). */
+ if (server.vm_enabled) {
+ lockThreadedIO();
+ info = sdscatprintf(info,
+ "vm_conf_max_memory:%llu\r\n"
+ "vm_conf_page_size:%llu\r\n"
+ "vm_conf_pages:%llu\r\n"
+ "vm_stats_used_pages:%llu\r\n"
+ "vm_stats_swapped_objects:%llu\r\n"
+ "vm_stats_swappin_count:%llu\r\n"
+ "vm_stats_swappout_count:%llu\r\n"
+ "vm_stats_io_newjobs_len:%lu\r\n"
+ "vm_stats_io_processing_len:%lu\r\n"
+ "vm_stats_io_processed_len:%lu\r\n"
+ "vm_stats_io_active_threads:%lu\r\n"
+ "vm_stats_blocked_clients:%lu\r\n"
+ ,(unsigned long long) server.vm_max_memory,
+ (unsigned long long) server.vm_page_size,
+ (unsigned long long) server.vm_pages,
+ (unsigned long long) server.vm_stats_used_pages,
+ (unsigned long long) server.vm_stats_swapped_objects,
+ (unsigned long long) server.vm_stats_swapins,
+ (unsigned long long) server.vm_stats_swapouts,
+ (unsigned long) listLength(server.io_newjobs),
+ (unsigned long) listLength(server.io_processing),
+ (unsigned long) listLength(server.io_processed),
+ (unsigned long) server.io_active_threads,
+ (unsigned long) server.vm_blocked_clients
+ );
+ unlockThreadedIO();
+ }
+ /* Keyspace section: databases with no keys and no expires are skipped. */
+ for (j = 0; j < server.dbnum; j++) {
+ long long keys, vkeys;
+
+ keys = dictSize(server.db[j].dict);
+ vkeys = dictSize(server.db[j].expires);
+ if (keys || vkeys) {
+ info = sdscatprintf(info, "db%d:keys=%lld,expires=%lld\r\n",
+ j, keys, vkeys);
+ }
+ }
+ return info;
+}
+
+/* INFO: send the string built by genRedisInfoString() to the client as a
+ * bulk reply, that is "$<length>\r\n", the payload, and a final CRLF. */
+void infoCommand(redisClient *c) {
+    sds info = genRedisInfoString();
+    /* The length is taken before the string is handed to addReplySds(). */
+    unsigned long infolen = (unsigned long)sdslen(info);
+    sds header = sdscatprintf(sdsempty(),"$%lu\r\n",infolen);
+
+    addReplySds(c,header);
+    addReplySds(c,info);
+    addReply(c,shared.crlf);
+}
+
+/* MONITOR command implementation.
+ *
+ * Flags the client as REDIS_SLAVE|REDIS_MONITOR, resets its selected slave
+ * DB to 0, appends it to server.monitors and replies with shared.ok. */
+void monitorCommand(redisClient *c) {
+    /* Ignore MONITOR if the client is already flagged as a slave; clients
+     * in monitor mode carry that flag too (see below). */
+    if ((c->flags & REDIS_SLAVE) != 0) return;
+
+    c->slaveseldb = 0;
+    c->flags |= REDIS_SLAVE;
+    c->flags |= REDIS_MONITOR;
+    listAddNodeTail(server.monitors,c);
+    addReply(c,shared.ok);
+}
+
+/* ============================ Maxmemory directive ======================== */
+
+/* Release one object from the pre-allocated objects free list, if there is
+ * any. This is useful under low memory conditions, since by default up to
+ * one million free objects are kept allocated.
+ *
+ * The free list is accessed while holding server.obj_freelist_mutex, but
+ * only when server.vm_enabled is set. The object itself is released with
+ * zfree() after the mutex has been dropped.
+ *
+ * Returns REDIS_OK if an object was freed, REDIS_ERR if the list was empty. */
+int tryFreeOneObjectFromFreelist(void) {
+    robj *victim = NULL;
+    int found = 0;
+
+    if (server.vm_enabled) pthread_mutex_lock(&server.obj_freelist_mutex);
+    if (listLength(server.objfreelist)) {
+        listNode *first = listFirst(server.objfreelist);
+
+        victim = listNodeValue(first);
+        listDelNode(server.objfreelist,first);
+        found = 1;
+    }
+    if (server.vm_enabled) pthread_mutex_unlock(&server.obj_freelist_mutex);
+
+    if (!found) return REDIS_ERR;
+    zfree(victim);
+    return REDIS_OK;
+}
+
+/* This function gets called when 'maxmemory' is set on the config file to limit
+ * the max memory used by the server, and we are out of memory.
+ * This function will try to, in order:
+ *
+ * - Free objects from the free list
+ * - Try to remove keys with an EXPIRE set
+ *
+ * If it is not possible to free enough memory to reach used-memory < maxmemory
+ * the function simply returns: the server will start refusing commands that
+ * will enlarge even more the memory usage.
+ *
+ * Each iteration of the outer loop either frees one object from the free
+ * list, or removes one key from every database that has keys with an expire
+ * set. The loop stops when memory is back under the limit or when a full
+ * pass could not find anything to remove.
+ */
+void freeMemoryIfNeeded(void) {
+    while (server.maxmemory && zmalloc_used_memory() > server.maxmemory) {
+        int j, k, freed = 0;
+
+        if (tryFreeOneObjectFromFreelist() == REDIS_OK) continue;
+        for (j = 0; j < server.dbnum; j++) {
+            /* The smallest expire seen so far is kept as a time_t, so the
+             * value read from the dictionary is never narrowed to an int.
+             * 'minkey == NULL' means "no sample taken yet". */
+            time_t minttl = 0;
+            robj *minkey = NULL;
+            struct dictEntry *de;
+
+            if (dictSize(server.db[j].expires) == 0) continue;
+            freed = 1;
+            /* From a sample of three keys drop the one nearest to
+             * the natural expire */
+            for (k = 0; k < 3; k++) {
+                time_t t;
+
+                de = dictGetRandomKey(server.db[j].expires);
+                /* The expire time is stored directly in the entry value. */
+                t = (time_t) dictGetEntryVal(de);
+                if (minkey == NULL || t < minttl) {
+                    minkey = dictGetEntryKey(de);
+                    minttl = t;
+                }
+            }
+            dbDelete(server.db+j,minkey);
+        }
+        if (!freed) return; /* nothing to free... */
+    }
+}
+
+/* =================================== Main! ================================ */
+
+#ifdef __linux__
+/* Read /proc/sys/vm/overcommit_memory and return its first line parsed with
+ * atoi(). Returns -1 if the file can't be opened or no line can be read. */
+int linuxOvercommitMemoryValue(void) {
+    char line[64];
+    char *got;
+    FILE *procfile;
+
+    procfile = fopen("/proc/sys/vm/overcommit_memory","r");
+    if (procfile == NULL) return -1;
+
+    got = fgets(line,sizeof(line),procfile);
+    fclose(procfile);
+    return (got != NULL) ? atoi(line) : -1;
+}
+
+/* Log a warning when linuxOvercommitMemoryValue() reports 0. Any other
+ * value, including the -1 error value, produces no output. */
+void linuxOvercommitMemoryWarning(void) {
+    int overcommit = linuxOvercommitMemoryValue();
+
+    if (overcommit != 0) return;
+    redisLog(REDIS_WARNING,"WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.");
+}
+#endif /* __linux__ */
+
+/* Detach the process from the terminal and run it in the background.
+ *
+ * Steps, in order:
+ * - fork(): the parent exits with status 0, the child goes on. If fork()
+ *   fails the error is logged and the process exits with status 1 instead
+ *   of silently exiting with success.
+ * - setsid() to create a new session.
+ * - stdin, stdout and stderr are redirected to /dev/null (skipped if
+ *   /dev/null can't be opened).
+ * - the pid is written to server.pidfile. This is best effort: a pid file
+ *   that can't be opened is ignored. */
+void daemonize(void) {
+    int fd;
+    FILE *fp;
+    pid_t pid;
+
+    pid = fork();
+    if (pid == -1) {
+        /* No child was created: report it, don't pretend success. */
+        redisLog(REDIS_WARNING,"Can't daemonize, fork() failed: %s",
+            strerror(errno));
+        exit(1);
+    }
+    if (pid != 0) exit(0); /* parent exits */
+    setsid(); /* create a new session */
+
+    /* Every output goes to /dev/null. If Redis is daemonized but
+     * the 'logfile' is set to 'stdout' in the configuration file
+     * it will not log at all. */
+    if ((fd = open("/dev/null", O_RDWR, 0)) != -1) {
+        dup2(fd, STDIN_FILENO);
+        dup2(fd, STDOUT_FILENO);
+        dup2(fd, STDERR_FILENO);
+        /* Don't close fd if it is itself one of the standard descriptors. */
+        if (fd > STDERR_FILENO) close(fd);
+    }
+    /* Try to write the pid file */
+    fp = fopen(server.pidfile,"w");
+    if (fp) {
+        fprintf(fp,"%d\n",getpid());
+        fclose(fp);
+    }
+}
+
+/* Print the server version, the git SHA1 and a 0/1 "dirty" flag to stdout,
+ * then terminate the process with exit status 0. Never returns. */
+void version() {
+    int dirty = atoi(redisGitDirty()) > 0;
+
+    printf("Redis server version %s (%s:%d)\n", REDIS_VERSION,
+        redisGitSHA1(), dirty);
+    exit(0);
+}
+
+/* Print the command line synopsis to stderr and terminate the process with
+ * exit status 1. Never returns. */
+void usage() {
+    fputs("Usage: ./redis-server [/path/to/redis.conf]\n",stderr);
+    fputs(" ./redis-server - (read config from stdin)\n",stderr);
+    exit(1);
+}
+
+/* Server entry point.
+ *
+ * Command line handling:
+ * - exactly one argument: "-v" / "--version" prints the version, "--help"
+ *   prints the usage (both helpers call exit() and do not return); any
+ *   other value is passed to loadServerConfig() after the save parameters
+ *   have been reset.
+ * - more than one argument: usage error.
+ * - no arguments: the default configuration is used and a warning logged.
+ *
+ * After the optional daemonization and initServer(), the dataset is loaded
+ * from the append only file when server.appendonly is set, otherwise from
+ * the RDB file, and the event loop runs until aeMain() returns. */
+int main(int argc, char **argv) {
+    time_t start;
+
+    initServerConfig();
+    sortCommandTable();
+    if (argc == 2) {
+        if (strcmp(argv[1], "-v") == 0 ||
+            strcmp(argv[1], "--version") == 0) version();
+        if (strcmp(argv[1], "--help") == 0) usage();
+        resetServerSaveParams();
+        loadServerConfig(argv[1]);
+    } else if (argc > 2) {
+        usage();
+    } else {
+        redisLog(REDIS_WARNING,"Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'");
+    }
+    if (server.daemonize) daemonize();
+    initServer();
+    redisLog(REDIS_NOTICE,"Server started, Redis version " REDIS_VERSION);
+#ifdef __linux__
+    linuxOvercommitMemoryWarning();
+#endif
+    start = time(NULL);
+    /* The elapsed time is a time_t difference: cast it explicitly so that
+     * it always matches the %ld conversion, whatever the width of time_t. */
+    if (server.appendonly) {
+        if (loadAppendOnlyFile(server.appendfilename) == REDIS_OK)
+            redisLog(REDIS_NOTICE,"DB loaded from append only file: %ld seconds",(long)(time(NULL)-start));
+    } else {
+        if (rdbLoad(server.dbfilename) == REDIS_OK)
+            redisLog(REDIS_NOTICE,"DB loaded from disk: %ld seconds",(long)(time(NULL)-start));
+    }
+    redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port);
+    aeSetBeforeSleepProc(server.el,beforeSleep);
+    aeMain(server.el);
+    aeDeleteEventLoop(server.el);
+    return 0;
+}
+
+/* ============================= Backtrace support ========================= */
+
+#ifdef HAVE_BACKTRACE
+/* Return the program counter value saved in the signal context 'uc'.
+ *
+ * The field holding it differs per platform, so the right accessor is
+ * selected at compile time by the preprocessor ladder below. On platforms
+ * not listed here NULL is returned. segvHandler() uses the result to patch
+ * the backtrace and treats NULL as "not available". */
+void *getMcontextEip(ucontext_t *uc) {
+#if defined(__FreeBSD__)
+ return (void*) uc->uc_mcontext.mc_eip;
+#elif defined(__dietlibc__)
+ return (void*) uc->uc_mcontext.eip;
+#elif defined(__APPLE__) && !defined(MAC_OS_X_VERSION_10_6)
+ /* Apple, MAC_OS_X_VERSION_10_6 not defined */
+ #if __x86_64__
+ return (void*) uc->uc_mcontext->__ss.__rip;
+ #else
+ return (void*) uc->uc_mcontext->__ss.__eip;
+ #endif
+#elif defined(__APPLE__) && defined(MAC_OS_X_VERSION_10_6)
+ /* Apple, MAC_OS_X_VERSION_10_6 defined */
+ #if defined(_STRUCT_X86_THREAD_STATE64) && !defined(__i386__)
+ return (void*) uc->uc_mcontext->__ss.__rip;
+ #else
+ return (void*) uc->uc_mcontext->__ss.__eip;
+ #endif
+#elif defined(__i386__) || defined(__X86_64__) || defined(__x86_64__)
+ /* NOTE(review): REG_EIP is used for 64 bit targets as well; confirm it
+ * is defined by the system headers on x86_64 builds. */
+ return (void*) uc->uc_mcontext.gregs[REG_EIP]; /* Linux 32/64 bit */
+#elif defined(__ia64__) /* Linux IA64 */
+ return (void*) uc->uc_mcontext.sc_ip;
+#else
+ return NULL;
+#endif
+}
+
+/* Handler installed by setupSigSegvAction() for the crash signals.
+ *
+ * Logs the signal number, the output of genRedisInfoString() and a symbolic
+ * backtrace, then terminates the process with _exit(0).
+ *
+ * 'sig' is the signal number, 'info' is unused, 'secret' is the ucontext_t
+ * passed by the kernel to SA_SIGINFO handlers.
+ *
+ * Memory obtained here is never released on purpose: the heap may be
+ * corrupted and the process is about to exit anyway. */
+void segvHandler(int sig, siginfo_t *info, void *secret) {
+    void *trace[100];
+    char **messages = NULL;
+    int i, trace_size = 0;
+    ucontext_t *uc = (ucontext_t*) secret;
+    void *eip;
+    sds infostring;
+    REDIS_NOTUSED(info);
+
+    redisLog(REDIS_WARNING,
+        "======= Ooops! Redis %s got signal: -%d- =======", REDIS_VERSION, sig);
+    infostring = genRedisInfoString();
+    redisLog(REDIS_WARNING, "%s",infostring);
+    /* It's not safe to sdsfree() the returned string under memory
+     * corruption conditions. Let it leak as we are going to abort */
+
+    trace_size = backtrace(trace, 100);
+    /* overwrite sigaction with caller's address */
+    eip = getMcontextEip(uc);
+    if (eip != NULL) {
+        trace[1] = eip;
+    }
+    messages = backtrace_symbols(trace, trace_size);
+
+    /* backtrace_symbols() returns NULL when it can't allocate the array of
+     * strings: in that case there is nothing to print. Entry 0 is skipped
+     * as in the original loop. */
+    if (messages != NULL) {
+        for (i=1; i<trace_size; ++i)
+            redisLog(REDIS_WARNING,"%s", messages[i]);
+    } else {
+        redisLog(REDIS_WARNING,"(backtrace symbols not available)");
+    }
+
+    /* free(messages); Don't call free() with possibly corrupted memory. */
+    _exit(0);
+}
+
+#endif /* HAVE_BACKTRACE */
+
+/* SIGTERM handler: log the event and set server.shutdown_asap so that the
+ * shutdown is performed later, outside of the signal handler.
+ *
+ * This function and setupSigSegvAction() are compiled regardless of
+ * HAVE_BACKTRACE, so that SIGTERM is handled in the same way also in builds
+ * without backtrace support. */
+void sigtermHandler(int sig) {
+    REDIS_NOTUSED(sig);
+
+    redisLog(REDIS_WARNING,"SIGTERM received, scheduling shutting down...");
+    server.shutdown_asap = 1;
+}
+
+/* Install the process signal handlers:
+ *
+ * - SIGSEGV, SIGBUS, SIGFPE, SIGILL -> segvHandler(), only when
+ *   HAVE_BACKTRACE is defined (the handler does not exist otherwise).
+ * - SIGTERM -> sigtermHandler(), always.
+ *
+ * All the handlers are installed with SA_RESETHAND, so each one runs at most
+ * once before the default action of the signal is restored. */
+void setupSigSegvAction(void) {
+    struct sigaction act;
+
+    sigemptyset(&act.sa_mask);
+#ifdef HAVE_BACKTRACE
+    /* When the SA_SIGINFO flag is set in sa_flags then sa_sigaction
+     * is used. Otherwise, sa_handler is used */
+    act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND | SA_SIGINFO;
+    act.sa_sigaction = segvHandler;
+    sigaction(SIGSEGV, &act, NULL);
+    sigaction(SIGBUS, &act, NULL);
+    sigaction(SIGFPE, &act, NULL);
+    sigaction(SIGILL, &act, NULL);
+#endif /* HAVE_BACKTRACE */
+
+    /* No SA_SIGINFO here: the plain sa_handler prototype is used. */
+    act.sa_flags = SA_NODEFER | SA_ONSTACK | SA_RESETHAND;
+    act.sa_handler = sigtermHandler;
+    sigaction(SIGTERM, &act, NULL);
+}
+
+/* The End */
diff --git a/src/redis.h b/src/redis.h
new file mode 100644
index 000000000..e54caa2a2
--- /dev/null
+++ b/src/redis.h
@@ -0,0 +1,885 @@
+#ifndef __REDIS_H
+#define __REDIS_H
+
+#include "fmacros.h"
+#include "config.h"
+
+#if defined(__sun)
+#include "solarisfixes.h"
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "ae.h" /* Event driven programming library */
+#include "sds.h" /* Dynamic safe strings */
+#include "dict.h" /* Hash tables */
+#include "adlist.h" /* Linked lists */
+#include "zmalloc.h" /* total memory usage aware version of malloc/free */
+#include "anet.h" /* Networking the easy way */
+#include "zipmap.h" /* Compact string -> string data structure */
+#include "ziplist.h" /* Compact list data structure */
+#include "version.h"
+
+/* Error codes */
+#define REDIS_OK 0
+#define REDIS_ERR -1
+
+/* Static server configuration */
+#define REDIS_SERVERPORT 6379 /* TCP port */
+#define REDIS_MAXIDLETIME (60*5) /* default client timeout */
+#define REDIS_IOBUF_LEN 1024
+#define REDIS_LOADBUF_LEN 1024
+#define REDIS_STATIC_ARGS 8
+#define REDIS_DEFAULT_DBNUM 16
+#define REDIS_CONFIGLINE_MAX 1024
+#define REDIS_OBJFREELIST_MAX 1000000 /* Max number of objects to cache */
+#define REDIS_MAX_SYNC_TIME 60 /* Slave can't take more to sync */
+#define REDIS_EXPIRELOOKUPS_PER_CRON 10 /* lookup 10 expires per loop */
+#define REDIS_MAX_WRITE_PER_EVENT (1024*64)
+#define REDIS_REQUEST_MAX_SIZE (1024*1024*256) /* max bytes in inline command */
+#define REDIS_SHARED_INTEGERS 10000
+
+/* If more than REDIS_WRITEV_THRESHOLD write packets are pending use writev */
+#define REDIS_WRITEV_THRESHOLD 3
+/* Max number of iovecs used for each writev call */
+#define REDIS_WRITEV_IOVEC_COUNT 256
+
+/* Hash table parameters */
+#define REDIS_HT_MINFILL 10 /* Minimal hash table fill 10% */
+
+/* Command flags */
+#define REDIS_CMD_BULK 1 /* Bulk write command */
+#define REDIS_CMD_INLINE 2 /* Inline command */
+/* REDIS_CMD_DENYOOM deserves a longer comment: all the commands marked with
+ this flag will return an error when the 'maxmemory' option is set in the
+ config file and the server is using more than maxmemory bytes of memory.
+ In short these commands are denied on low memory conditions. */
+#define REDIS_CMD_DENYOOM 4
+#define REDIS_CMD_FORCE_REPLICATION 8 /* Force replication even if dirty is 0 */
+
+/* Object types */
+#define REDIS_STRING 0
+#define REDIS_LIST 1
+#define REDIS_SET 2
+#define REDIS_ZSET 3
+#define REDIS_HASH 4
+#define REDIS_VMPOINTER 8
+
+/* Objects encoding. Some kinds of objects like Strings and Hashes can be
+ * internally represented in multiple ways. The 'encoding' field of the object
+ * is set to one of these values. */
+#define REDIS_ENCODING_RAW 0 /* Raw representation */
+#define REDIS_ENCODING_INT 1 /* Encoded as integer */
+#define REDIS_ENCODING_HT 2 /* Encoded as hash table */
+#define REDIS_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
+#define REDIS_ENCODING_LINKEDLIST 4 /* Encoded as regular linked list */
+#define REDIS_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
+
+/* Object types only used for dumping to disk */
+#define REDIS_EXPIRETIME 253
+#define REDIS_SELECTDB 254
+#define REDIS_EOF 255
+
+/* Defines related to the dump file format. Storing 32 bit lengths for short
+ * keys requires a lot of space, so we check the most significant 2 bits of
+ * the first byte to interpret the length:
+ *
+ * 00|000000 => if the two MSB are 00 the len is the 6 bits of this byte
+ * 01|000000 00000000 => 01, the len is 14 bits, 6 bits + 8 bits of next byte
+ * 10|000000 [32 bit integer] => if it's 10, a full 32 bit len will follow
+ * 11|000000 this means: specially encoded object will follow. The six bits
+ * number specifies the kind of object that follows.
+ * See the REDIS_RDB_ENC_* defines.
+ *
+ * Lengths up to 63 are stored using a single byte; most DB keys, and many
+ * values, will fit inside. */
+#define REDIS_RDB_6BITLEN 0
+#define REDIS_RDB_14BITLEN 1
+#define REDIS_RDB_32BITLEN 2
+#define REDIS_RDB_ENCVAL 3
+#define REDIS_RDB_LENERR UINT_MAX
+
+/* When the length of a string object stored on disk has the first two bits
+ * set, the remaining six bits specify a special encoding for the object
+ * according to the following defines: */
+#define REDIS_RDB_ENC_INT8 0 /* 8 bit signed integer */
+#define REDIS_RDB_ENC_INT16 1 /* 16 bit signed integer */
+#define REDIS_RDB_ENC_INT32 2 /* 32 bit signed integer */
+#define REDIS_RDB_ENC_LZF 3 /* string compressed with FASTLZ */
+
+/* Virtual memory object->where field. */
+#define REDIS_VM_MEMORY 0 /* The object is on memory */
+#define REDIS_VM_SWAPPED 1 /* The object is on disk */
+#define REDIS_VM_SWAPPING 2 /* Redis is swapping this object on disk */
+#define REDIS_VM_LOADING 3 /* Redis is loading this object from disk */
+
+/* Virtual memory static configuration stuff.
+ * Check vmFindContiguousPages() to know more about these magic numbers. */
+#define REDIS_VM_MAX_NEAR_PAGES 65536
+#define REDIS_VM_MAX_RANDOM_JUMP 4096
+#define REDIS_VM_MAX_THREADS 32
+#define REDIS_THREAD_STACK_SIZE (1024*1024*4)
+/* The following is the *percentage* of completed I/O jobs to process when the
+ * handler is called. While Virtual Memory I/O operations are performed by
+ * threads, these operations must be processed by the main thread when
+ * completed in order to take effect. */
+#define REDIS_MAX_COMPLETED_JOBS_PROCESSED 1
+
+/* Client flags */
+#define REDIS_SLAVE 1 /* This client is a slave server */
+#define REDIS_MASTER 2 /* This client is a master server */
+#define REDIS_MONITOR 4 /* This client is a slave monitor, see MONITOR */
+#define REDIS_MULTI 8 /* This client is in a MULTI context */
+#define REDIS_BLOCKED 16 /* The client is waiting in a blocking operation */
+#define REDIS_IO_WAIT 32 /* The client is waiting for Virtual Memory I/O */
+#define REDIS_DIRTY_CAS 64 /* Watched keys modified. EXEC will fail. */
+
+/* Slave replication state - slave side */
+#define REDIS_REPL_NONE 0 /* No active replication */
+#define REDIS_REPL_CONNECT 1 /* Must connect to master */
+#define REDIS_REPL_CONNECTED 2 /* Connected to master */
+
+/* Slave replication state - from the point of view of master
+ * Note that in SEND_BULK and ONLINE state the slave receives new updates
+ * in its output queue. In the WAIT_BGSAVE state instead the server is waiting
+ * to start the next background saving in order to send updates to it. */
+#define REDIS_REPL_WAIT_BGSAVE_START 3 /* master waits bgsave to start feeding it */
+#define REDIS_REPL_WAIT_BGSAVE_END 4 /* master waits bgsave to start bulk DB transmission */
+#define REDIS_REPL_SEND_BULK 5 /* master is sending the bulk DB */
+#define REDIS_REPL_ONLINE 6 /* bulk DB already transmitted, receive updates */
+
+/* List related stuff */
+#define REDIS_HEAD 0
+#define REDIS_TAIL 1
+
+/* Sort operations */
+#define REDIS_SORT_GET 0
+#define REDIS_SORT_ASC 1
+#define REDIS_SORT_DESC 2
+#define REDIS_SORTKEY_MAX 1024
+
+/* Log levels */
+#define REDIS_DEBUG 0
+#define REDIS_VERBOSE 1
+#define REDIS_NOTICE 2
+#define REDIS_WARNING 3
+
+/* Anti-warning macro... */
+#define REDIS_NOTUSED(V) ((void) V)
+
+#define ZSKIPLIST_MAXLEVEL 32 /* Should be enough for 2^32 elements */
+#define ZSKIPLIST_P 0.25 /* Skiplist P = 1/4 */
+
+/* Append only defines */
+#define APPENDFSYNC_NO 0
+#define APPENDFSYNC_ALWAYS 1
+#define APPENDFSYNC_EVERYSEC 2
+
+/* Zip structure related defaults */
+#define REDIS_HASH_MAX_ZIPMAP_ENTRIES 64
+#define REDIS_HASH_MAX_ZIPMAP_VALUE 512
+#define REDIS_LIST_MAX_ZIPLIST_ENTRIES 1024
+#define REDIS_LIST_MAX_ZIPLIST_VALUE 32
+
+/* Sets operations codes */
+#define REDIS_OP_UNION 0
+#define REDIS_OP_DIFF 1
+#define REDIS_OP_INTER 2
+
+/* We can print the stacktrace, so our assert is defined this way.
+ *
+ * redisAssert(e): if 'e' is false, call _redisAssert() with the text of the
+ * expression, the file name and the line number, then _exit(1).
+ * redisPanic(msg): unconditionally call _redisPanic() with the stringified
+ * argument, the file name and the line number, then _exit(1).
+ *
+ * Both macros expand to a single fully parenthesized expression, so they
+ * can be used safely as operands of larger expressions. */
+#define redisAssert(_e) ((_e)?(void)0 : (_redisAssert(#_e,__FILE__,__LINE__),_exit(1)))
+#define redisPanic(_e) (_redisPanic(#_e,__FILE__,__LINE__),_exit(1))
+void _redisAssert(char *estr, char *file, int line);
+void _redisPanic(char *msg, char *file, int line);
+
+/*-----------------------------------------------------------------------------
+ * Data types
+ *----------------------------------------------------------------------------*/
+
+/* A redis object, that is a type able to hold a string / list / set */
+
+/* The actual Redis Object.
+ *
+ * The four bit-fields add up to 32 bits (4 + 2 + 4 + 22). The first two
+ * (type, storage) have the same width and position as the first two fields
+ * of struct vmPointer. */
+typedef struct redisObject {
+ unsigned type:4; /* Object type, e.g. REDIS_STRING (see "Object types") */
+ unsigned storage:2; /* REDIS_VM_MEMORY or REDIS_VM_SWAPPING */
+ unsigned encoding:4; /* One of the REDIS_ENCODING_* values */
+ unsigned lru:22; /* lru time (relative to server.lruclock) */
+ int refcount; /* Set to 1 by initStaticStringObject() */
+ void *ptr; /* Payload pointer; presumably its meaning depends on
+ * type/encoding -- TODO confirm */
+ /* VM fields are only allocated if VM is active, otherwise the
+ * object allocation function will just allocate
+ * sizeof(redisObject) minus sizeof(redisObjectVM), so using
+ * Redis without VM active will not have any overhead.
+ * NOTE(review): no VM fields are declared in this structure and no
+ * redisObjectVM type is visible here; this comment looks stale. */
+} robj;
+
+/* The VM pointer structure - identifies an object in the swap file.
+ *
+ * This object is stored in place of the value
+ * object in the main key->value hash table representing a database.
+ * Note that the first fields (type, storage) are the same as the redisObject
+ * structure so that vmPointer structures can be accessed even when cast
+ * as redisObject structures.
+ *
+ * This is useful as we don't know if a value object is or not on disk, but we
+ * are always able to read obj->storage to check this. For vmPointer
+ * structures "type" is set to REDIS_VMPOINTER (even if without this field
+ * it is still possible to check the kind of object from the value of
+ * 'storage'). */
+typedef struct vmPointer {
+ unsigned type:4;
+ unsigned storage:2; /* REDIS_VM_SWAPPED or REDIS_VM_LOADING */
+ unsigned notused:26; /* Pads the bit-fields to 32 bits (4 + 2 + 26) */
+ unsigned int vtype; /* type of the object stored in the swap file */
+ off_t page; /* the page at which the object is stored on disk */
+ off_t usedpages; /* number of pages used on disk */
+} vmpointer;
+
+/* Macro used to initialize a Redis object allocated on the stack.
+ * Note that this macro is kept near the structure definition to make sure
+ * we'll update it when the structure is changed, to avoid bugs like
+ * bug #85 introduced exactly in this way.
+ *
+ * '_var' is the robj lvalue to initialize (not a pointer), '_ptr' the value
+ * stored in its 'ptr' field. The object is set up as a raw encoded string
+ * with refcount 1 and storage REDIS_VM_MEMORY; the 'lru' field is left
+ * untouched. Both arguments are parenthesized in the expansion, so
+ * arbitrary expressions can be passed.
+ *
+ * NOTE(review): the expansion ends with "while(0);" -- the trailing
+ * semicolon makes "if (x) initStaticStringObject(a,b); else ..." a syntax
+ * error. It is kept because callers written without their own semicolon
+ * would stop compiling; drop it once all the call sites are checked. */
+#define initStaticStringObject(_var,_ptr) do { \
+    (_var).refcount = 1; \
+    (_var).type = REDIS_STRING; \
+    (_var).encoding = REDIS_ENCODING_RAW; \
+    (_var).ptr = (_ptr); \
+    (_var).storage = REDIS_VM_MEMORY; \
+} while(0);
+
+/* Per-database state: one dictionary per kind of information attached to
+ * the keys of the database. */
+typedef struct redisDb {
+ dict *dict; /* The keyspace for this DB */
+ dict *expires; /* Timeout of keys with a timeout set */
+ dict *blocking_keys; /* Keys with clients waiting for data (BLPOP) */
+ dict *io_keys; /* Keys with clients waiting for VM I/O */
+ dict *watched_keys; /* WATCHED keys for MULTI/EXEC CAS */
+ int id; /* presumably the index of this DB -- TODO confirm */
+} redisDb;
+
+/* Client MULTI/EXEC state */
+
+/* A single command recorded in a multiState: its argument vector and the
+ * command table entry it refers to. */
+typedef struct multiCmd {
+ robj **argv;
+ int argc;
+ struct redisCommand *cmd;
+} multiCmd;
+
+/* The list of commands recorded for a client (see redisClient.mstate). */
+typedef struct multiState {
+ multiCmd *commands; /* Array of MULTI commands */
+ int count; /* Total number of MULTI commands */
+} multiState;
+
+/* With multiplexing we need to keep per-client state.
+ * Clients are kept in a linked list. */
+typedef struct redisClient {
+ int fd;
+ redisDb *db;
+ int dictid;
+ sds querybuf;
+ robj **argv, **mbargv;
+ int argc, mbargc;
+ int bulklen; /* bulk read len. -1 if not in bulk read mode */
+ int multibulk; /* multi bulk command format active */
+ list *reply;
+ int sentlen;
+ time_t lastinteraction; /* time of the last interaction, used for timeout */
+ int flags; /* REDIS_SLAVE | REDIS_MONITOR | REDIS_MULTI ... */
+ int slaveseldb; /* slave selected db, if this client is a slave */
+ int authenticated; /* when requirepass is non-NULL */
+ int replstate; /* replication state if this is a slave */
+ int repldbfd; /* replication DB file descriptor */
+ long repldboff; /* replication DB file offset */
+ off_t repldbsize; /* replication DB file size */
+ multiState mstate; /* MULTI/EXEC state */
+ robj **blocking_keys; /* The keys we are waiting for in order to terminate
+ * a blocking operation such as BLPOP. Otherwise NULL. */
+ int blocking_keys_num; /* Number of blocking keys */
+ time_t blockingto; /* Blocking operation timeout. If UNIX current time
+ * is >= blockingto then the operation timed out. */
+ list *io_keys; /* Keys this client is waiting to be loaded from the
+ * swap file in order to continue. */
+ list *watched_keys; /* Keys WATCHED for MULTI/EXEC CAS */
+ dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */
+ list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */
+} redisClient;
+
+/* One element of the server.saveparams array.
+ * NOTE(review): presumably a "save after <seconds> if at least <changes>
+ * changes" rule -- confirm against the configuration loading code. */
+struct saveparam {
+ time_t seconds;
+ int changes;
+};
+
+/* Pre-built objects reachable through the global 'shared' variable, e.g.
+ * shared.ok and shared.crlf are passed to addReply() by command
+ * implementations. 'integers' holds REDIS_SHARED_INTEGERS entries. */
+struct sharedObjectsStruct {
+ robj *crlf, *ok, *err, *emptybulk, *czero, *cone, *cnegone, *pong, *space,
+ *colon, *nullbulk, *nullmultibulk, *queued,
+ *emptymultibulk, *wrongtypeerr, *nokeyerr, *syntaxerr, *sameobjecterr,
+ *outofrangeerr, *plus,
+ *select0, *select1, *select2, *select3, *select4,
+ *select5, *select6, *select7, *select8, *select9,
+ *messagebulk, *pmessagebulk, *subscribebulk, *unsubscribebulk, *mbulk3,
+ *mbulk4, *psubscribebulk, *punsubscribebulk,
+ *integers[REDIS_SHARED_INTEGERS];
+};
+
+/* Global server state structure. A single instance named 'server' is
+ * declared extern in this header. */
+struct redisServer {
+ int port;
+ int fd;
+ redisDb *db; /* Array of databases, indexed from 0 to dbnum-1 */
+ long long dirty; /* changes to DB from the last save */
+ list *clients;
+ list *slaves, *monitors;
+ char neterr[ANET_ERR_LEN];
+ aeEventLoop *el;
+ int cronloops; /* number of times the cron function run */
+ list *objfreelist; /* A list of freed objects to avoid malloc() */
+ time_t lastsave; /* Unix time of last successful save */
+ /* Fields used only for stats */
+ time_t stat_starttime; /* server start time */
+ long long stat_numcommands; /* number of processed commands */
+ long long stat_numconnections; /* number of connections received */
+ long long stat_expiredkeys; /* number of expired keys */
+ /* Configuration */
+ int verbosity;
+ int glueoutputbuf;
+ int maxidletime;
+ int dbnum;
+ int daemonize;
+ int appendonly;
+ int appendfsync;
+ int no_appendfsync_on_rewrite;
+ int shutdown_asap; /* Set to 1 by the SIGTERM handler */
+ time_t lastfsync;
+ int appendfd;
+ int appendseldb;
+ char *pidfile;
+ pid_t bgsavechildpid;
+ pid_t bgrewritechildpid;
+ sds bgrewritebuf; /* buffer taken by parent during append only rewrite */
+ sds aofbuf; /* AOF buffer, written before entering the event loop */
+ struct saveparam *saveparams;
+ int saveparamslen;
+ char *logfile;
+ char *bindaddr;
+ char *dbfilename;
+ char *appendfilename;
+ char *requirepass;
+ int rdbcompression;
+ int activerehashing;
+ /* Replication related */
+ int isslave;
+ char *masterauth;
+ char *masterhost; /* NULL when this instance is a master */
+ int masterport;
+ redisClient *master; /* client that is master for this slave */
+ int replstate;
+ unsigned int maxclients;
+ unsigned long long maxmemory;
+ unsigned int blpop_blocked_clients;
+ unsigned int vm_blocked_clients;
+ /* Sort parameters - qsort_r() is only available under BSD so we
+ * have to take this state global, in order to pass it to sortCompare() */
+ int sort_desc;
+ int sort_alpha;
+ int sort_bypattern;
+ /* Virtual memory configuration */
+ int vm_enabled;
+ char *vm_swap_file;
+ off_t vm_page_size;
+ off_t vm_pages;
+ unsigned long long vm_max_memory;
+ /* Zip structure config */
+ size_t hash_max_zipmap_entries;
+ size_t hash_max_zipmap_value;
+ size_t list_max_ziplist_entries;
+ size_t list_max_ziplist_value;
+ /* Virtual memory state */
+ FILE *vm_fp;
+ int vm_fd;
+ off_t vm_next_page; /* Next probably empty page */
+ off_t vm_near_pages; /* Number of pages allocated sequentially */
+ unsigned char *vm_bitmap; /* Bitmap of free/used pages */
+ time_t unixtime; /* Unix time sampled every second. */
+ /* Virtual memory I/O threads stuff */
+ /* An I/O thread processes an element taken from the io_jobs queue and
+ * puts the result of the operation in the io_done list. While the
+ * job is being processed, it's put on io_processing queue.
+ * NOTE(review): io_jobs / io_done do not match the field names below;
+ * presumably io_newjobs and io_processed are meant -- TODO confirm. */
+ list *io_newjobs; /* List of VM I/O jobs yet to be processed */
+ list *io_processing; /* List of VM I/O jobs being processed */
+ list *io_processed; /* List of VM I/O jobs already processed */
+ list *io_ready_clients; /* Clients ready to be unblocked. All keys loaded */
+ pthread_mutex_t io_mutex; /* lock to access io_jobs/io_done/io_thread_job */
+ pthread_mutex_t obj_freelist_mutex; /* safe redis objects creation/free */
+ pthread_mutex_t io_swapfile_mutex; /* So we can lseek + write */
+ pthread_attr_t io_threads_attr; /* attributes for threads creation */
+ int io_active_threads; /* Number of running I/O threads */
+ int vm_max_threads; /* Max number of I/O threads running at the same time */
+ /* Our main thread is blocked on the event loop, looking for sockets ready
+ * to be read or written, so when a threaded I/O operation is ready to be
+ * processed by the main thread, the I/O thread will use a unix pipe to
+ * awake the main thread. The following are the two pipe FDs. */
+ int io_ready_pipe_read;
+ int io_ready_pipe_write;
+ /* Virtual memory stats */
+ unsigned long long vm_stats_used_pages;
+ unsigned long long vm_stats_swapped_objects;
+ unsigned long long vm_stats_swapouts;
+ unsigned long long vm_stats_swapins;
+ /* Pubsub */
+ dict *pubsub_channels; /* Map channels to list of subscribed clients */
+ list *pubsub_patterns; /* A list of pubsub_patterns */
+ /* Misc */
+ FILE *devnull;
+ unsigned lruclock:22; /* clock incrementing every minute, for LRU */
+ unsigned lruclock_padding:10; /* Pads lruclock to 32 bits (22 + 10) */
+};
+
+/* Association between a client and a pattern.
+ * NOTE(review): presumably the element type of server.pubsub_patterns --
+ * confirm in the pubsub implementation. */
+typedef struct pubsubPattern {
+ redisClient *client;
+ robj *pattern;
+} pubsubPattern;
+
+/* Prototype of a command implementation, e.g. infoCommand(). */
+typedef void redisCommandProc(redisClient *c);
+/* Prototype of the optional per-command VM preload function. */
+typedef void redisVmPreloadProc(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
+/* Description of a command. */
+struct redisCommand {
+ char *name;
+ redisCommandProc *proc;
+ int arity;
+ int flags; /* presumably a mask of REDIS_CMD_* flags -- TODO confirm */
+ /* Use a function to determine which keys need to be loaded
+ * in the background prior to executing this command. Takes precedence
+ * over vm_firstkey and others, ignored when NULL */
+ redisVmPreloadProc *vm_preload_proc;
+ /* What keys should be loaded in background when calling this command? */
+ int vm_firstkey; /* The first argument that's a key (0 = no keys) */
+ int vm_lastkey; /* The last argument that's a key */
+ int vm_keystep; /* The step between first and last key */
+};
+
+/* A (name, address) pair describing a function symbol.
+ * NOTE(review): no user of this structure is visible in this part of the
+ * file -- confirm what it is used for. */
+struct redisFunctionSym {
+ char *name;
+ unsigned long pointer;
+};
+
+/* Element handled by the sort code: the object itself plus either a
+ * numeric score or an object to compare by.
+ * NOTE(review): which union member is valid is not visible here; presumably
+ * it depends on the sort_alpha / sort_bypattern settings -- TODO confirm. */
+typedef struct _redisSortObject {
+ robj *obj;
+ union {
+ double score;
+ robj *cmpobj;
+ } u;
+} redisSortObject;
+
+/* A sort operation: 'type' is presumably one of the REDIS_SORT_* values
+ * (TODO confirm) and 'pattern' the pattern it applies to. */
+typedef struct _redisSortOperation {
+ int type;
+ robj *pattern;
+} redisSortOperation;
+
+/* ZSETs use a specialized version of Skiplists */
+
+/* A skiplist node: an object with its score, a backward pointer, and the
+ * 'forward' / 'span' arrays.
+ * NOTE(review): presumably 'forward' and 'span' hold one entry per level
+ * of the node -- confirm in the sorted set implementation. */
+typedef struct zskiplistNode {
+ struct zskiplistNode **forward;
+ struct zskiplistNode *backward;
+ unsigned int *span;
+ double score;
+ robj *obj;
+} zskiplistNode;
+
+/* The skiplist itself: header and tail nodes, length and level. */
+typedef struct zskiplist {
+ struct zskiplistNode *header, *tail;
+ unsigned long length;
+ int level;
+} zskiplist;
+
+/* A sorted set is made of a dictionary plus a skiplist. */
+typedef struct zset {
+ dict *dict;
+ zskiplist *zsl;
+} zset;
+
+/* VM threaded I/O request message */
+#define REDIS_IOJOB_LOAD 0 /* Load from disk to memory */
+#define REDIS_IOJOB_PREPARE_SWAP 1 /* Compute needed pages */
+#define REDIS_IOJOB_DO_SWAP 2 /* Swap from memory to disk */
+/* A single VM I/O job. */
+typedef struct iojob {
+ int type; /* Request type, REDIS_IOJOB_* */
+ redisDb *db;/* Redis database */
+ robj *key; /* This I/O request is about swapping this key */
+ robj *id; /* Unique identifier of this job:
+ this is the object to swap for REDIS_IOJOB_*_SWAP, or the
+ vmpointer object for REDIS_IOJOB_LOAD. */
+ robj *val; /* the value to swap for REDIS_IOJOB_*_SWAP, otherwise this
+ * field is populated by the I/O thread for REDIS_IOJOB_LOAD. */
+ off_t page; /* Swap page where to read/write the object */
+ off_t pages; /* Swap pages needed to save object. PREPARE_SWAP return val */
+ int canceled; /* True if this command was canceled by blocking side of VM */
+ pthread_t thread; /* ID of the thread processing this entry */
+} iojob;
+
+/* Structure to hold list iteration abstraction.
+ * NOTE(review): presumably 'zi' is used with the ziplist encoding and 'ln'
+ * with the linked list encoding, as in listTypeEntry -- TODO confirm. */
+typedef struct {
+ robj *subject;
+ unsigned char encoding;
+ unsigned char direction; /* Iteration direction */
+ unsigned char *zi;
+ listNode *ln;
+} listTypeIterator;
+
+/* Structure for an entry while iterating over a list. */
+typedef struct {
+ listTypeIterator *li; /* The iterator this entry belongs to */
+ unsigned char *zi; /* Entry in ziplist */
+ listNode *ln; /* Entry in linked list */
+} listTypeEntry;
+
+/* Structure to hold hash iteration abstraction. Note that iteration over
+ * hashes involves both fields and values. Because it is possible that
+ * not both are required, store pointers in the iterator to avoid
+ * unnecessary memory allocation for fields/values.
+ * NOTE(review): presumably the z* members are used with the zipmap
+ * encoding and di/de with the hash table encoding -- TODO confirm. */
+typedef struct {
+ int encoding;
+ unsigned char *zi;
+ unsigned char *zk, *zv;
+ unsigned int zklen, zvlen;
+
+ dictIterator *di;
+ dictEntry *de;
+} hashTypeIterator;
+
+#define REDIS_HASH_KEY 1
+#define REDIS_HASH_VALUE 2
+
+/*-----------------------------------------------------------------------------
+ * Extern declarations
+ *----------------------------------------------------------------------------*/
+
+/* Globals shared by all the source files including this header. Every
+ * entry is a declaration only: the objects are defined elsewhere, exactly
+ * once. Without 'extern' each including file would get its own tentative
+ * definition, which fails to link when the compiler does not merge common
+ * symbols (-fno-common). */
+extern struct redisServer server;
+extern struct sharedObjectsStruct shared;
+extern dictType setDictType;
+extern dictType zsetDictType;
+extern double R_Zero, R_PosInf, R_NegInf, R_Nan;
+extern dictType hashDictType;
+
+/*-----------------------------------------------------------------------------
+ * Functions prototypes
+ *----------------------------------------------------------------------------*/
+
+/* networking.c -- Networking and Client related operations.
+ * Each function is declared exactly once. */
+redisClient *createClient(int fd);
+void closeTimedoutClients(void);
+void freeClient(redisClient *c);
+void resetClient(redisClient *c);
+void sendReplyToClient(aeEventLoop *el, int fd, void *privdata, int mask);
+void sendReplyToClientWritev(aeEventLoop *el, int fd, void *privdata, int mask);
+void addReply(redisClient *c, robj *obj);
+void addReplySds(redisClient *c, sds s);
+void processInputBuffer(redisClient *c);
+void acceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
+void readQueryFromClient(aeEventLoop *el, int fd, void *privdata, int mask);
+void addReplyBulk(redisClient *c, robj *obj);
+void addReplyBulkCString(redisClient *c, char *s);
+void addReplyDouble(redisClient *c, double d);
+void addReplyLongLong(redisClient *c, long long ll);
+void addReplyUlong(redisClient *c, unsigned long ul);
+void *dupClientReplyValue(void *o);
+
+/* List data type */
+void listTypeTryConversion(robj *subject, robj *value);
+void listTypePush(robj *subject, robj *value, int where);
+robj *listTypePop(robj *subject, int where);
+unsigned long listTypeLength(robj *subject);
+listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction);
+void listTypeReleaseIterator(listTypeIterator *li);
+int listTypeNext(listTypeIterator *li, listTypeEntry *entry);
+robj *listTypeGet(listTypeEntry *entry);
+void listTypeInsert(listTypeEntry *entry, robj *value, int where);
+int listTypeEqual(listTypeEntry *entry, robj *o);
+void listTypeDelete(listTypeEntry *entry);
+void listTypeConvert(robj *subject, int enc);
+void unblockClientWaitingData(redisClient *c);
+int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele);
+void popGenericCommand(redisClient *c, int where);
+
+/* MULTI/EXEC/WATCH... */
+void unwatchAllKeys(redisClient *c);
+void initClientMultiState(redisClient *c);
+void freeClientMultiState(redisClient *c);
+void queueMultiCommand(redisClient *c, struct redisCommand *cmd);
+void touchWatchedKey(redisDb *db, robj *key);
+void touchWatchedKeysOnFlush(int dbid);
+
+/* Redis object implementation */
+void decrRefCount(void *o);
+void incrRefCount(robj *o);
+void freeStringObject(robj *o);
+void freeListObject(robj *o);
+void freeSetObject(robj *o);
+void freeZsetObject(robj *o);
+void freeHashObject(robj *o);
+robj *createObject(int type, void *ptr);
+robj *createStringObject(char *ptr, size_t len);
+robj *dupStringObject(robj *o);
+robj *tryObjectEncoding(robj *o);
+robj *getDecodedObject(robj *o);
+size_t stringObjectLen(robj *o);
+int tryFreeOneObjectFromFreelist(void);
+robj *createStringObjectFromLongLong(long long value);
+robj *createListObject(void);
+robj *createZiplistObject(void);
+robj *createSetObject(void);
+robj *createHashObject(void);
+robj *createZsetObject(void);
+int getLongFromObjectOrReply(redisClient *c, robj *o, long *target, const char *msg);
+int checkType(redisClient *c, robj *o, int type);
+int getLongLongFromObjectOrReply(redisClient *c, robj *o, long long *target, const char *msg);
+int getDoubleFromObjectOrReply(redisClient *c, robj *o, double *target, const char *msg);
+int getLongLongFromObject(robj *o, long long *target);
+char *strEncoding(int encoding);
+int compareStringObjects(robj *a, robj *b);
+int equalStringObjects(robj *a, robj *b);
+
+/* Replication */
+void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
+void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc);
+int syncWithMaster(void);
+void updateSlavesWaitingBgsave(int bgsaveerr);
+
+/* RDB persistence */
+int rdbLoad(char *filename);
+int rdbSaveBackground(char *filename);
+void rdbRemoveTempFile(pid_t childpid);
+int rdbSave(char *filename);
+int rdbSaveObject(FILE *fp, robj *o);
+off_t rdbSavedObjectPages(robj *o, FILE *fp);
+off_t rdbSavedObjectLen(robj *o, FILE *fp);
+robj *rdbLoadObject(int type, FILE *fp);
+void backgroundSaveDoneHandler(int statloc);
+
+/* AOF persistence */
+void flushAppendOnlyFile(void);
+void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc);
+void aofRemoveTempFile(pid_t childpid);
+int rewriteAppendOnlyFileBackground(void);
+int loadAppendOnlyFile(char *filename);
+void stopAppendOnly(void);
+int startAppendOnly(void);
+void backgroundRewriteDoneHandler(int statloc);
+
+/* Sorted sets data type */
+zskiplist *zslCreate(void);
+void zslFree(zskiplist *zsl);
+void zslInsert(zskiplist *zsl, double score, robj *obj);
+
+/* Core functions */
+void freeMemoryIfNeeded(void);
+int processCommand(redisClient *c);
+void setupSigSegvAction(void);
+struct redisCommand *lookupCommand(char *name);
+void call(redisClient *c, struct redisCommand *cmd);
+int prepareForShutdown();
+void redisLog(int level, const char *fmt, ...);
+void usage();
+void updateDictResizePolicy(void);
+int htNeedsResize(dict *dict);
+void oom(const char *msg);
+
+/* Virtual Memory */
+void vmInit(void);
+void vmMarkPagesFree(off_t page, off_t count);
+robj *vmLoadObject(robj *o);
+robj *vmPreviewObject(robj *o);
+int vmSwapOneObjectBlocking(void);
+int vmSwapOneObjectThreaded(void);
+int vmCanSwapOut(void);
+void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata, int mask);
+void vmCancelThreadedIOJob(robj *o);
+void lockThreadedIO(void);
+void unlockThreadedIO(void);
+int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db);
+void freeIOJob(iojob *j);
+void queueIOJob(iojob *j);
+int vmWriteObjectOnSwap(robj *o, off_t page);
+robj *vmReadObjectFromSwap(off_t page, int type);
+void waitEmptyIOJobsQueue(void);
+void vmReopenSwapFile(void);
+int vmFreePage(off_t page);
+void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
+void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv);
+int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd);
+int dontWaitForSwappedKey(redisClient *c, robj *key);
+void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key);
+vmpointer *vmSwapObjectBlocking(robj *val);
+
+/* Hash data type */
+void convertToRealHash(robj *o);
+void hashTypeTryConversion(robj *subject, robj **argv, int start, int end);
+void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2);
+robj *hashTypeGet(robj *o, robj *key);
+int hashTypeExists(robj *o, robj *key);
+int hashTypeSet(robj *o, robj *key, robj *value);
+int hashTypeDelete(robj *o, robj *key);
+unsigned long hashTypeLength(robj *o);
+hashTypeIterator *hashTypeInitIterator(robj *subject);
+void hashTypeReleaseIterator(hashTypeIterator *hi);
+int hashTypeNext(hashTypeIterator *hi);
+robj *hashTypeCurrent(hashTypeIterator *hi, int what);
+robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key);
+
+/* Pub / Sub */
+int pubsubUnsubscribeAllChannels(redisClient *c, int notify);
+int pubsubUnsubscribeAllPatterns(redisClient *c, int notify);
+void freePubsubPattern(void *p);
+int listMatchPubsubPattern(void *a, void *b);
+
+/* Utility functions */
+int stringmatchlen(const char *pattern, int patternLen,
+ const char *string, int stringLen, int nocase);
+int stringmatch(const char *pattern, const char *string, int nocase);
+long long memtoll(const char *p, int *err);
+int ll2string(char *s, size_t len, long long value);
+int isStringRepresentableAsLong(sds s, long *longval);
+
+/* Configuration */
+void loadServerConfig(char *filename);
+void appendServerSaveParams(time_t seconds, int changes);
+void resetServerSaveParams();
+
+/* db.c -- Keyspace access API */
+int removeExpire(redisDb *db, robj *key);
+int expireIfNeeded(redisDb *db, robj *key);
+int deleteIfVolatile(redisDb *db, robj *key);
+time_t getExpire(redisDb *db, robj *key);
+int setExpire(redisDb *db, robj *key, time_t when);
+robj *lookupKey(redisDb *db, robj *key);
+robj *lookupKeyRead(redisDb *db, robj *key);
+robj *lookupKeyWrite(redisDb *db, robj *key);
+robj *lookupKeyReadOrReply(redisClient *c, robj *key, robj *reply);
+robj *lookupKeyWriteOrReply(redisClient *c, robj *key, robj *reply);
+int dbAdd(redisDb *db, robj *key, robj *val);
+int dbReplace(redisDb *db, robj *key, robj *val);
+int dbExists(redisDb *db, robj *key);
+robj *dbRandomKey(redisDb *db);
+int dbDelete(redisDb *db, robj *key);
+long long emptyDb();
+int selectDb(redisClient *c, int id);
+
+/* Git SHA1 */
+char *redisGitSHA1(void);
+char *redisGitDirty(void);
+
+/* Commands prototypes */
+void authCommand(redisClient *c);
+void pingCommand(redisClient *c);
+void echoCommand(redisClient *c);
+void setCommand(redisClient *c);
+void setnxCommand(redisClient *c);
+void setexCommand(redisClient *c);
+void getCommand(redisClient *c);
+void delCommand(redisClient *c);
+void existsCommand(redisClient *c);
+void incrCommand(redisClient *c);
+void decrCommand(redisClient *c);
+void incrbyCommand(redisClient *c);
+void decrbyCommand(redisClient *c);
+void selectCommand(redisClient *c);
+void randomkeyCommand(redisClient *c);
+void keysCommand(redisClient *c);
+void dbsizeCommand(redisClient *c);
+void lastsaveCommand(redisClient *c);
+void saveCommand(redisClient *c);
+void bgsaveCommand(redisClient *c);
+void bgrewriteaofCommand(redisClient *c);
+void shutdownCommand(redisClient *c);
+void moveCommand(redisClient *c);
+void renameCommand(redisClient *c);
+void renamenxCommand(redisClient *c);
+void lpushCommand(redisClient *c);
+void rpushCommand(redisClient *c);
+void lpushxCommand(redisClient *c);
+void rpushxCommand(redisClient *c);
+void linsertCommand(redisClient *c);
+void lpopCommand(redisClient *c);
+void rpopCommand(redisClient *c);
+void llenCommand(redisClient *c);
+void lindexCommand(redisClient *c);
+void lrangeCommand(redisClient *c);
+void ltrimCommand(redisClient *c);
+void typeCommand(redisClient *c);
+void lsetCommand(redisClient *c);
+void saddCommand(redisClient *c);
+void sremCommand(redisClient *c);
+void smoveCommand(redisClient *c);
+void sismemberCommand(redisClient *c);
+void scardCommand(redisClient *c);
+void spopCommand(redisClient *c);
+void srandmemberCommand(redisClient *c);
+void sinterCommand(redisClient *c);
+void sinterstoreCommand(redisClient *c);
+void sunionCommand(redisClient *c);
+void sunionstoreCommand(redisClient *c);
+void sdiffCommand(redisClient *c);
+void sdiffstoreCommand(redisClient *c);
+void syncCommand(redisClient *c);
+void flushdbCommand(redisClient *c);
+void flushallCommand(redisClient *c);
+void sortCommand(redisClient *c);
+void lremCommand(redisClient *c);
+void rpoplpushcommand(redisClient *c);
+void infoCommand(redisClient *c);
+void mgetCommand(redisClient *c);
+void monitorCommand(redisClient *c);
+void expireCommand(redisClient *c);
+void expireatCommand(redisClient *c);
+void getsetCommand(redisClient *c);
+void ttlCommand(redisClient *c);
+void slaveofCommand(redisClient *c);
+void debugCommand(redisClient *c);
+void msetCommand(redisClient *c);
+void msetnxCommand(redisClient *c);
+void zaddCommand(redisClient *c);
+void zincrbyCommand(redisClient *c);
+void zrangeCommand(redisClient *c);
+void zrangebyscoreCommand(redisClient *c);
+void zcountCommand(redisClient *c);
+void zrevrangeCommand(redisClient *c);
+void zcardCommand(redisClient *c);
+void zremCommand(redisClient *c);
+void zscoreCommand(redisClient *c);
+void zremrangebyscoreCommand(redisClient *c);
+void multiCommand(redisClient *c);
+void execCommand(redisClient *c);
+void discardCommand(redisClient *c);
+void blpopCommand(redisClient *c);
+void brpopCommand(redisClient *c);
+void appendCommand(redisClient *c);
+void substrCommand(redisClient *c);
+void zrankCommand(redisClient *c);
+void zrevrankCommand(redisClient *c);
+void hsetCommand(redisClient *c);
+void hsetnxCommand(redisClient *c);
+void hgetCommand(redisClient *c);
+void hmsetCommand(redisClient *c);
+void hmgetCommand(redisClient *c);
+void hdelCommand(redisClient *c);
+void hlenCommand(redisClient *c);
+void zremrangebyrankCommand(redisClient *c);
+void zunionstoreCommand(redisClient *c);
+void zinterstoreCommand(redisClient *c);
+void hkeysCommand(redisClient *c);
+void hvalsCommand(redisClient *c);
+void hgetallCommand(redisClient *c);
+void hexistsCommand(redisClient *c);
+void configCommand(redisClient *c);
+void hincrbyCommand(redisClient *c);
+void subscribeCommand(redisClient *c);
+void unsubscribeCommand(redisClient *c);
+void psubscribeCommand(redisClient *c);
+void punsubscribeCommand(redisClient *c);
+void publishCommand(redisClient *c);
+void watchCommand(redisClient *c);
+void unwatchCommand(redisClient *c);
+
+#endif
diff --git a/src/release.c b/src/release.c
new file mode 100644
index 000000000..64186ec4e
--- /dev/null
+++ b/src/release.c
@@ -0,0 +1,13 @@
+/* Every time the Redis Git SHA1 or Dirty status changes only this
+ * small file is recompiled, as we access this information in all the other
+ * files using these functions. */
+
+#include "release.h"
+
+/* Return the REDIS_GIT_SHA1 value made available by release.h. */
+char *redisGitSHA1(void) {
+    char *sha1 = REDIS_GIT_SHA1;
+
+    return sha1;
+}
+
+/* Return the REDIS_GIT_DIRTY value made available by release.h. */
+char *redisGitDirty(void) {
+    char *dirty = REDIS_GIT_DIRTY;
+
+    return dirty;
+}
diff --git a/src/replication.c b/src/replication.c
new file mode 100644
index 000000000..ecb04ce1a
--- /dev/null
+++ b/src/replication.c
@@ -0,0 +1,475 @@
+#include "redis.h"
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+/* Feed every client in 'slaves' with the command argv[0..argc-1], re-encoded
+ * as a multi bulk request. A SELECT command is sent first to any slave whose
+ * 'slaveseldb' differs from 'dictid'. Slaves still in the
+ * REDIS_REPL_WAIT_BGSAVE_START state are skipped. */
+void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
+ listNode *ln;
+ listIter li;
+ int outc = 0, j;
+ robj **outv;
+ /* We need 1+(ARGS*3) objects since commands are using the new protocol:
+ * we need 1 object for the first "*<count>\r\n" multibulk count, then
+ * for every additional object we have "$<count>\r\n" + object + "\r\n". */
+ robj *static_outv[REDIS_STATIC_ARGS*3+1];
+ robj *lenobj;
+
+ /* Use the on-stack vector when it is large enough, otherwise allocate
+ * one on the heap (released at the end of the function). */
+ if (argc <= REDIS_STATIC_ARGS) {
+ outv = static_outv;
+ } else {
+ outv = zmalloc(sizeof(robj*)*(argc*3+1));
+ }
+
+ lenobj = createObject(REDIS_STRING,
+ sdscatprintf(sdsempty(), "*%d\r\n", argc));
+ /* Start from a zero refcount: the incr/decr pair below is what finally
+ * releases the object when no slave retained it.
+ * NOTE(review): presumably createObject() returns refcount 1 - confirm. */
+ lenobj->refcount = 0;
+ outv[outc++] = lenobj;
+ for (j = 0; j < argc; j++) {
+ lenobj = createObject(REDIS_STRING,
+ sdscatprintf(sdsempty(),"$%lu\r\n",
+ (unsigned long) stringObjectLen(argv[j])));
+ lenobj->refcount = 0;
+ outv[outc++] = lenobj;
+ outv[outc++] = argv[j];
+ outv[outc++] = shared.crlf;
+ }
+
+ /* Increment all the refcounts at start and decrement at end in order to
+ * be sure to free objects if there is no slave in a replication state
+ * able to be fed with commands */
+ for (j = 0; j < outc; j++) incrRefCount(outv[j]);
+ listRewind(slaves,&li);
+ while((ln = listNext(&li))) {
+ redisClient *slave = ln->value;
+
+ /* Don't feed slaves that are still waiting for BGSAVE to start */
+ if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue;
+
+ /* Feed all the other slaves, MONITORs and so on */
+ if (slave->slaveseldb != dictid) {
+ robj *selectcmd;
+
+ /* DBs 0-9 have a preallocated shared SELECT object, any other DB
+ * index needs a temporary one built on the fly. */
+ switch(dictid) {
+ case 0: selectcmd = shared.select0; break;
+ case 1: selectcmd = shared.select1; break;
+ case 2: selectcmd = shared.select2; break;
+ case 3: selectcmd = shared.select3; break;
+ case 4: selectcmd = shared.select4; break;
+ case 5: selectcmd = shared.select5; break;
+ case 6: selectcmd = shared.select6; break;
+ case 7: selectcmd = shared.select7; break;
+ case 8: selectcmd = shared.select8; break;
+ case 9: selectcmd = shared.select9; break;
+ default:
+ selectcmd = createObject(REDIS_STRING,
+ sdscatprintf(sdsempty(),"select %d\r\n",dictid));
+ selectcmd->refcount = 0;
+ break;
+ }
+ addReply(slave,selectcmd);
+ slave->slaveseldb = dictid;
+ }
+ for (j = 0; j < outc; j++) addReply(slave,outv[j]);
+ }
+ for (j = 0; j < outc; j++) decrRefCount(outv[j]);
+ if (outv != static_outv) zfree(outv);
+}
+
+/* Send a human readable representation of the command argv[0..argc-1] to
+ * every client in 'monitors'.
+ *
+ * The line has the form "+<sec>.<usec> [(db <id>) ]<arg> <arg> ...\r\n".
+ * The "(db N)" part is only emitted when 'dictid' is not zero. Integer
+ * encoded arguments are printed as numbers, all the others are quoted and
+ * escaped via sdscatrepr(). */
+void replicationFeedMonitors(list *monitors, int dictid, robj **argv, int argc) {
+    listNode *ln;
+    listIter li;
+    int j;
+    sds cmdrepr = sdsnew("+");
+    robj *cmdobj;
+    struct timeval tv;
+
+    gettimeofday(&tv,NULL);
+    /* The microseconds must be zero padded to six digits, otherwise 5 usec
+     * would be printed as ".5" and read as half a second. */
+    cmdrepr = sdscatprintf(cmdrepr,"%ld.%06ld ",(long)tv.tv_sec,(long)tv.tv_usec);
+    if (dictid != 0) cmdrepr = sdscatprintf(cmdrepr,"(db %d) ", dictid);
+
+    for (j = 0; j < argc; j++) {
+        if (argv[j]->encoding == REDIS_ENCODING_INT) {
+            /* With this encoding the pointer itself holds the value. */
+            cmdrepr = sdscatprintf(cmdrepr, "%ld", (long)argv[j]->ptr);
+        } else {
+            cmdrepr = sdscatrepr(cmdrepr,(char*)argv[j]->ptr,
+                        sdslen(argv[j]->ptr));
+        }
+        if (j != argc-1)
+            cmdrepr = sdscatlen(cmdrepr," ",1);
+    }
+    cmdrepr = sdscatlen(cmdrepr,"\r\n",2);
+    cmdobj = createObject(REDIS_STRING,cmdrepr);
+
+    listRewind(monitors,&li);
+    while((ln = listNext(&li))) {
+        redisClient *monitor = ln->value;
+        addReply(monitor,cmdobj);
+    }
+    /* Drop our own reference to the object. */
+    decrRefCount(cmdobj);
+}
+
+/* Write exactly 'size' bytes from 'ptr' to 'fd', waiting for the descriptor
+ * to become writable in slices of one second.
+ *
+ * Returns 'size' on success. Returns -1 if write(2) fails, or with errno set
+ * to ETIMEDOUT once more than 'timeout'+1 seconds have elapsed. */
+int syncWrite(int fd, char *ptr, ssize_t size, int timeout) {
+    const ssize_t total = size;
+    const time_t began = time(NULL);
+    ssize_t n;
+
+    timeout++;
+    while (size != 0) {
+        if (aeWait(fd,AE_WRITABLE,1000) & AE_WRITABLE) {
+            n = write(fd,ptr,size);
+            if (n == -1) return -1;
+            ptr += n;
+            size -= n;
+        }
+        if ((time(NULL)-began) > timeout) {
+            errno = ETIMEDOUT;
+            return -1;
+        }
+    }
+    return total;
+}
+
+/* Read exactly 'size' bytes from 'fd' into 'ptr', waiting for the descriptor
+ * to become readable in slices of one second.
+ *
+ * Returns the number of bytes read (i.e. 'size') on success. Returns -1 when
+ * read(2) fails, when the peer closes the connection before all the bytes
+ * are received (errno set to ECONNRESET), or when more than 'timeout'+1
+ * seconds have elapsed (errno set to ETIMEDOUT). */
+int syncRead(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nread, totread = 0;
+    time_t start = time(NULL);
+
+    timeout++;
+    while(size) {
+        if (aeWait(fd,AE_READABLE,1000) & AE_READABLE) {
+            nread = read(fd,ptr,size);
+            if (nread == -1) return -1;
+            if (nread == 0) {
+                /* End of file: no more data will ever arrive. Without this
+                 * check the descriptor stays readable and the loop would
+                 * spin at full speed until the timeout is reached. */
+                errno = ECONNRESET;
+                return -1;
+            }
+            ptr += nread;
+            size -= nread;
+            totread += nread;
+        }
+        if ((time(NULL)-start) > timeout) {
+            errno = ETIMEDOUT;
+            return -1;
+        }
+    }
+    return totread;
+}
+
+/* Read a line from 'fd' one byte at a time, storing at most 'size'-1
+ * characters plus the nul terminator in 'ptr'.
+ *
+ * The trailing "\n" (and the "\r" before it, if any) is not stored.
+ * Returns the number of characters read before the newline, or the number
+ * of characters stored if the buffer filled up before a newline was seen.
+ * Returns -1 if syncRead() fails; 'timeout' is applied to every single
+ * byte read. */
+int syncReadLine(int fd, char *ptr, ssize_t size, int timeout) {
+    ssize_t nread = 0;
+
+    size--; /* Reserve room for the nul terminator. */
+    while(size) {
+        char c;
+
+        if (syncRead(fd,&c,1,timeout) == -1) return -1;
+        if (c == '\n') {
+            *ptr = '\0';
+            if (nread && *(ptr-1) == '\r') *(ptr-1) = '\0';
+            return nread;
+        } else {
+            *ptr++ = c;
+            *ptr = '\0';
+            nread++;
+        }
+        /* Account for the byte just stored, otherwise a line longer than
+         * the buffer would be written past its end. */
+        size--;
+    }
+    return nread;
+}
+
+/* SYNC command: turn the client 'c' into a slave.
+ *
+ * The client is attached to a BGSAVE that is already in progress when
+ * another slave is accumulating differences for it, it is queued for the
+ * next BGSAVE when the one in progress can't be used, or a new BGSAVE is
+ * started. On success the client is flagged REDIS_SLAVE and appended to
+ * server.slaves; no reply is sent by this function in that case. */
+void syncCommand(redisClient *c) {
+    /* ignore SYNC if already slave or in monitor mode */
+    if (c->flags & REDIS_SLAVE) return;
+
+    /* SYNC can't be issued when the server has pending data to send to
+     * the client about already issued commands. We need a fresh reply
+     * buffer registering the differences between the BGSAVE and the current
+     * dataset, so that we can copy to other slaves if needed. */
+    if (listLength(c->reply) != 0) {
+        addReplySds(c,sdsnew("-ERR SYNC is invalid with pending input\r\n"));
+        return;
+    }
+
+    redisLog(REDIS_NOTICE,"Slave ask for synchronization");
+    /* Here we need to check if there is a background saving operation
+     * in progress, or if it is required to start one */
+    if (server.bgsavechildpid != -1) {
+        /* Ok a background save is in progress. Let's check if it is a good
+         * one for replication, i.e. if there is another slave that is
+         * registering differences since the server forked to save */
+        redisClient *slave;
+        listNode *ln;
+        listIter li;
+
+        listRewind(server.slaves,&li);
+        while((ln = listNext(&li))) {
+            slave = ln->value;
+            if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break;
+        }
+        if (ln) {
+            /* Perfect, the server is already registering differences for
+             * another slave. Set the right state, and copy the buffer. */
+            listRelease(c->reply);
+            c->reply = listDup(slave->reply);
+            c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+            redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC");
+        } else {
+            /* No way, we need to wait for the next BGSAVE in order to
+             * register differences */
+            c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
+            redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC");
+        }
+    } else {
+        /* Ok we don't have a BGSAVE in progress, let's start one */
+        redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
+        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
+            redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
+            addReplySds(c,sdsnew("-ERR Unable to perform background save\r\n"));
+            return;
+        }
+        c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+    }
+    c->repldbfd = -1;
+    c->flags |= REDIS_SLAVE;
+    c->slaveseldb = 0;
+    listAddNodeTail(server.slaves,c);
+    return;
+}
+
+/* Writable-event handler that transfers the DB file to a slave, at most
+ * REDIS_IOBUF_LEN bytes per call. 'privdata' is the slave client.
+ *
+ * On the first call (repldboff == 0) the "$<size>\r\n" bulk header is sent.
+ * Once the whole file has been written the slave is switched to the
+ * REDIS_REPL_ONLINE state and sendReplyToClient is installed as its write
+ * handler. On any read/write error the slave is freed. */
+void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
+ redisClient *slave = privdata;
+ REDIS_NOTUSED(el);
+ REDIS_NOTUSED(mask);
+ char buf[REDIS_IOBUF_LEN];
+ ssize_t nwritten, buflen;
+
+ if (slave->repldboff == 0) {
+ /* Write the bulk write count before to transfer the DB. In theory here
+ * we don't know how much room there is in the output buffer of the
+ * socket, but in practice SO_SNDLOWAT (the minimum count for output
+ * operations) will never be smaller than the few bytes we need. */
+ sds bulkcount;
+
+ /* NOTE(review): "%lld" is paired with an (unsigned long long) cast;
+ * the conversion specifier and the cast should probably agree. */
+ bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long)
+ slave->repldbsize);
+ if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount))
+ {
+ sdsfree(bulkcount);
+ freeClient(slave);
+ return;
+ }
+ sdsfree(bulkcount);
+ }
+ /* Reposition on every call: the amount sent so far is tracked in
+ * repldboff and a short write() only advances it by 'nwritten'. */
+ lseek(slave->repldbfd,slave->repldboff,SEEK_SET);
+ buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN);
+ if (buflen <= 0) {
+ redisLog(REDIS_WARNING,"Read error sending DB to slave: %s",
+ (buflen == 0) ? "premature EOF" : strerror(errno));
+ freeClient(slave);
+ return;
+ }
+ if ((nwritten = write(fd,buf,buflen)) == -1) {
+ redisLog(REDIS_VERBOSE,"Write error sending DB to slave: %s",
+ strerror(errno));
+ freeClient(slave);
+ return;
+ }
+ slave->repldboff += nwritten;
+ if (slave->repldboff == slave->repldbsize) {
+ /* Transfer completed: release the file and go back to the normal
+ * reply handler. */
+ close(slave->repldbfd);
+ slave->repldbfd = -1;
+ aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+ slave->replstate = REDIS_REPL_ONLINE;
+ if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE,
+ sendReplyToClient, slave) == AE_ERR) {
+ freeClient(slave);
+ return;
+ }
+ addReplySds(slave,sdsempty());
+ redisLog(REDIS_NOTICE,"Synchronization with slave succeeded");
+ }
+}
+
+/* This function is called at the end of every background saving.
+ * The argument bgsaveerr is REDIS_OK if the background saving succeeded
+ * otherwise REDIS_ERR is passed to the function.
+ *
+ * The goal of this function is to handle slaves waiting for a successful
+ * background saving in order to perform non-blocking synchronization.
+ *
+ * Slaves in the WAIT_BGSAVE_END state either start receiving the DB file
+ * (state SEND_BULK) or are freed on error. Slaves in the WAIT_BGSAVE_START
+ * state are moved to WAIT_BGSAVE_END and a new BGSAVE is started for them. */
+void updateSlavesWaitingBgsave(int bgsaveerr) {
+    listNode *ln;
+    int startbgsave = 0;
+    listIter li;
+
+    listRewind(server.slaves,&li);
+    while((ln = listNext(&li))) {
+        redisClient *slave = ln->value;
+
+        if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) {
+            startbgsave = 1;
+            slave->replstate = REDIS_REPL_WAIT_BGSAVE_END;
+        } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) {
+            struct redis_stat buf;
+
+            if (bgsaveerr != REDIS_OK) {
+                freeClient(slave);
+                redisLog(REDIS_WARNING,"SYNC failed. BGSAVE child returned an error");
+                continue;
+            }
+            if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 ||
+                redis_fstat(slave->repldbfd,&buf) == -1) {
+                /* Log before releasing the client: freeClient() may perform
+                 * system calls that overwrite errno. */
+                redisLog(REDIS_WARNING,"SYNC failed. Can't open/stat DB after BGSAVE: %s", strerror(errno));
+                /* NOTE(review): if open() succeeded but fstat failed the
+                 * descriptor stays in slave->repldbfd; verify freeClient()
+                 * closes it in this state. */
+                freeClient(slave);
+                continue;
+            }
+            slave->repldboff = 0;
+            slave->repldbsize = buf.st_size;
+            slave->replstate = REDIS_REPL_SEND_BULK;
+            aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE);
+            if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) {
+                freeClient(slave);
+                continue;
+            }
+        }
+    }
+    if (startbgsave) {
+        if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
+            listRewind(server.slaves,&li);
+            redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed");
+            while((ln = listNext(&li))) {
+                redisClient *slave = ln->value;
+
+                /* The slaves that asked for this BGSAVE were moved from
+                 * WAIT_BGSAVE_START to WAIT_BGSAVE_END by the loop above,
+                 * while every slave that was already in WAIT_BGSAVE_END is
+                 * now either freed or in SEND_BULK. So WAIT_BGSAVE_END is
+                 * the state identifying the slaves to drop here. */
+                if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END)
+                    freeClient(slave);
+            }
+        }
+    }
+}
+
+/* Perform a blocking synchronization with the master configured in
+ * server.masterhost / server.masterport.
+ *
+ * Steps: connect, optionally AUTH, send SYNC, read the "$<size>" bulk
+ * header, store the payload in a temp file, rename it to server.dbfilename,
+ * empty the current dataset and load the new one. On success the master
+ * connection is turned into the server.master client and REDIS_OK is
+ * returned. On any error the connection is closed, the temp file (if any)
+ * is removed and REDIS_ERR is returned. */
+int syncWithMaster(void) {
+    char buf[1024], tmpfile[256], authcmd[1024];
+    long dumpsize;
+    int fd = anetTcpConnect(NULL,server.masterhost,server.masterport);
+    int dfd, maxtries = 5;
+
+    if (fd == -1) {
+        redisLog(REDIS_WARNING,"Unable to connect to MASTER: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+
+    /* AUTH with the master if required. */
+    if(server.masterauth) {
+        snprintf(authcmd, sizeof(authcmd), "AUTH %s\r\n", server.masterauth);
+        /* Use the length of what was actually formatted: with a very long
+         * password snprintf() truncates, and a length computed from the
+         * password would make syncWrite() read past the buffer. */
+        if (syncWrite(fd, authcmd, strlen(authcmd), 5) == -1) {
+            close(fd);
+            redisLog(REDIS_WARNING,"Unable to AUTH to MASTER: %s",
+                strerror(errno));
+            return REDIS_ERR;
+        }
+        /* Read the AUTH result. */
+        if (syncReadLine(fd,buf,1024,3600) == -1) {
+            close(fd);
+            redisLog(REDIS_WARNING,"I/O error reading auth result from MASTER: %s",
+                strerror(errno));
+            return REDIS_ERR;
+        }
+        if (buf[0] != '+') {
+            close(fd);
+            redisLog(REDIS_WARNING,"Cannot AUTH to MASTER, is the masterauth password correct?");
+            return REDIS_ERR;
+        }
+    }
+
+    /* Issue the SYNC command */
+    if (syncWrite(fd,"SYNC \r\n",7,5) == -1) {
+        close(fd);
+        redisLog(REDIS_WARNING,"I/O error writing to MASTER: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    /* Read the bulk write count */
+    if (syncReadLine(fd,buf,1024,3600) == -1) {
+        close(fd);
+        redisLog(REDIS_WARNING,"I/O error reading bulk count from MASTER: %s",
+            strerror(errno));
+        return REDIS_ERR;
+    }
+    if (buf[0] != '$') {
+        close(fd);
+        redisLog(REDIS_WARNING,"Bad protocol from MASTER, the first byte is not '$', are you sure the host and port are right?");
+        return REDIS_ERR;
+    }
+    dumpsize = strtol(buf+1,NULL,10);
+    redisLog(REDIS_NOTICE,"Receiving %ld bytes data dump from MASTER",dumpsize);
+    /* Read the bulk write data on a temp file */
+    while(maxtries--) {
+        snprintf(tmpfile,256,
+            "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid());
+        dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644);
+        if (dfd != -1) break;
+        sleep(1);
+    }
+    if (dfd == -1) {
+        close(fd);
+        redisLog(REDIS_WARNING,"Opening the temp file needed for MASTER <-> SLAVE synchronization: %s",strerror(errno));
+        return REDIS_ERR;
+    }
+    while(dumpsize) {
+        int nread, nwritten;
+
+        nread = read(fd,buf,(dumpsize < 1024)?dumpsize:1024);
+        /* A return value of zero means the master closed the connection:
+         * it must be handled as an error, otherwise dumpsize never
+         * decreases and this loop never terminates. */
+        if (nread <= 0) {
+            redisLog(REDIS_WARNING,"I/O error trying to sync with MASTER: %s",
+                (nread == -1) ? strerror(errno) : "connection lost");
+            close(fd);
+            close(dfd);
+            unlink(tmpfile); /* Don't leave a partial dump around. */
+            return REDIS_ERR;
+        }
+        nwritten = write(dfd,buf,nread);
+        if (nwritten == -1) {
+            redisLog(REDIS_WARNING,"Write error writing to the DB dump file needed for MASTER <-> SLAVE synchrnonization: %s", strerror(errno));
+            close(fd);
+            close(dfd);
+            unlink(tmpfile); /* Don't leave a partial dump around. */
+            return REDIS_ERR;
+        }
+        dumpsize -= nread;
+    }
+    close(dfd);
+    if (rename(tmpfile,server.dbfilename) == -1) {
+        redisLog(REDIS_WARNING,"Failed trying to rename the temp DB into dump.rdb in MASTER <-> SLAVE synchronization: %s", strerror(errno));
+        unlink(tmpfile);
+        close(fd);
+        return REDIS_ERR;
+    }
+    emptyDb();
+    if (rdbLoad(server.dbfilename) != REDIS_OK) {
+        redisLog(REDIS_WARNING,"Failed trying to load the MASTER synchronization DB from disk");
+        close(fd);
+        return REDIS_ERR;
+    }
+    server.master = createClient(fd);
+    server.master->flags |= REDIS_MASTER;
+    server.master->authenticated = 1;
+    server.replstate = REDIS_REPL_CONNECTED;
+    return REDIS_OK;
+}
+
+/* SLAVEOF command.
+ *
+ * "SLAVEOF no one" (case insensitive) detaches the server from its master,
+ * if it has one. Any other pair of arguments is taken as <host> <port> of
+ * the new master: the current master link is dropped and the replication
+ * state is set to REDIS_REPL_CONNECT. The reply is always +OK. */
+void slaveofCommand(redisClient *c) {
+    int promote = !strcasecmp(c->argv[1]->ptr,"no") &&
+                  !strcasecmp(c->argv[2]->ptr,"one");
+
+    if (!promote) {
+        sdsfree(server.masterhost);
+        server.masterhost = sdsdup(c->argv[1]->ptr);
+        server.masterport = atoi(c->argv[2]->ptr);
+        if (server.master) freeClient(server.master);
+        server.replstate = REDIS_REPL_CONNECT;
+        redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)",
+            server.masterhost, server.masterport);
+    } else if (server.masterhost) {
+        sdsfree(server.masterhost);
+        server.masterhost = NULL;
+        if (server.master) freeClient(server.master);
+        server.replstate = REDIS_REPL_NONE;
+        redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)");
+    }
+    addReply(c,shared.ok);
+}
diff --git a/src/sds.c b/src/sds.c
new file mode 100644
index 000000000..5e67f0443
--- /dev/null
+++ b/src/sds.c
@@ -0,0 +1,384 @@
+/* SDSLib, A C dynamic strings library
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define SDS_ABORT_ON_OOM
+
+#include "sds.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+#include "zmalloc.h"
+
+/* Report the out of memory condition on stderr and abort the process. */
+static void sdsOomAbort(void) {
+    fputs("SDS: Out Of Memory (SDS_ABORT_ON_OOM defined)\n",stderr);
+    abort();
+}
+
+/* Create a new sds string of 'initlen' bytes. The content is copied from
+ * 'init', or zero filled when 'init' is NULL. The string is always nul
+ * terminated. On out of memory the process aborts if SDS_ABORT_ON_OOM is
+ * defined, otherwise NULL is returned. */
+sds sdsnewlen(const void *init, size_t initlen) {
+    struct sdshdr *hdr = zmalloc(sizeof(struct sdshdr)+initlen+1);
+
+#ifdef SDS_ABORT_ON_OOM
+    if (hdr == NULL) sdsOomAbort();
+#else
+    if (hdr == NULL) return NULL;
+#endif
+    hdr->len = initlen;
+    hdr->free = 0;
+    if (initlen != 0) {
+        if (init != NULL)
+            memcpy(hdr->buf, init, initlen);
+        else
+            memset(hdr->buf, 0, initlen);
+    }
+    hdr->buf[initlen] = '\0';
+    return (char*)hdr->buf;
+}
+
+/* Create a new zero length sds string. */
+sds sdsempty(void) {
+    sds empty = sdsnewlen("",0);
+
+    return empty;
+}
+
+/* Create a new sds string from the nul terminated C string 'init'.
+ * A NULL 'init' yields an empty string. */
+sds sdsnew(const char *init) {
+    size_t initlen = 0;
+
+    if (init != NULL) initlen = strlen(init);
+    return sdsnewlen(init, initlen);
+}
+
+/* Return the length of 's' as stored in the header that precedes it. */
+size_t sdslen(const sds s) {
+    return ((struct sdshdr*)(void*)(s-sizeof(struct sdshdr)))->len;
+}
+
+/* Return a newly allocated copy of 's'. */
+sds sdsdup(const sds s) {
+    size_t len = sdslen(s);
+
+    return sdsnewlen(s, len);
+}
+
+/* Release the memory of 's' (header included). NULL is a no-op. */
+void sdsfree(sds s) {
+    if (s != NULL) zfree(s-sizeof(struct sdshdr));
+}
+
+/* Return the 'free' field stored in the header that precedes 's'. */
+size_t sdsavail(sds s) {
+    return ((struct sdshdr*)(void*)(s-sizeof(struct sdshdr)))->free;
+}
+
+/* Set the length of 's' to the value returned by strlen(), moving the
+ * difference with the previous length into the free space counter. */
+void sdsupdatelen(sds s) {
+    struct sdshdr *hdr = (void*)(s-sizeof(struct sdshdr));
+    int reallen = strlen(s);
+    int delta = hdr->len-reallen;
+
+    hdr->len = reallen;
+    hdr->free += delta;
+}
+
+/* Make sure there are at least 'addlen' free bytes after the end of 's'.
+ * When a reallocation is needed the buffer grows to twice the required
+ * length. The (possibly moved) string is returned; its length field is not
+ * changed. On out of memory the process aborts if SDS_ABORT_ON_OOM is
+ * defined, otherwise NULL is returned. */
+static sds sdsMakeRoomFor(sds s, size_t addlen) {
+    struct sdshdr *oldhdr, *newhdr;
+    size_t curlen, newlen;
+
+    if (sdsavail(s) >= addlen) return s;
+    curlen = sdslen(s);
+    newlen = (curlen+addlen)*2;
+    oldhdr = (void*)(s-sizeof(struct sdshdr));
+    newhdr = zrealloc(oldhdr, sizeof(struct sdshdr)+newlen+1);
+#ifdef SDS_ABORT_ON_OOM
+    if (newhdr == NULL) sdsOomAbort();
+#else
+    if (newhdr == NULL) return NULL;
+#endif
+
+    newhdr->free = newlen - curlen;
+    return newhdr->buf;
+}
+
+/* Append the 'len' bytes pointed to by 't' to 's' and nul terminate the
+ * result. The returned pointer must be used in place of 's', which may
+ * have been reallocated. NULL is returned if room can't be made. */
+sds sdscatlen(sds s, void *t, size_t len) {
+    const size_t oldlen = sdslen(s);
+    struct sdshdr *hdr;
+
+    if ((s = sdsMakeRoomFor(s,len)) == NULL) return NULL;
+    memcpy(s+oldlen, t, len);
+    s[oldlen+len] = '\0';
+    hdr = (void*)(s-sizeof(struct sdshdr));
+    hdr->len = oldlen+len;
+    hdr->free = hdr->free-len;
+    return s;
+}
+
+/* Append the nul terminated C string 't' to 's'. */
+sds sdscat(sds s, char *t) {
+    size_t tlen = strlen(t);
+
+    return sdscatlen(s, t, tlen);
+}
+
+/* Overwrite the content of 's' with the 'len' bytes pointed to by 't',
+ * growing the buffer when its total capacity (used + free) is too small.
+ * The returned pointer must be used in place of 's'. NULL is returned if
+ * room can't be made. */
+sds sdscpylen(sds s, char *t, size_t len) {
+    struct sdshdr *hdr = (void*)(s-sizeof(struct sdshdr));
+    size_t capacity = hdr->free+hdr->len;
+
+    if (capacity < len) {
+        s = sdsMakeRoomFor(s,len-hdr->len);
+        if (s == NULL) return NULL;
+        /* The header may have been moved by the reallocation. */
+        hdr = (void*)(s-sizeof(struct sdshdr));
+        capacity = hdr->free+hdr->len;
+    }
+    memcpy(s, t, len);
+    s[len] = '\0';
+    hdr->free = capacity-len;
+    hdr->len = len;
+    return s;
+}
+
+/* Overwrite the content of 's' with the nul terminated C string 't'. */
+sds sdscpy(sds s, char *t) {
+    size_t tlen = strlen(t);
+
+    return sdscpylen(s, t, tlen);
+}
+
+/* Append to 's' a string built with a printf-like format.
+ *
+ * The output is formatted into a temporary buffer that starts at 16 bytes
+ * and is doubled until the output fits. Truncation is detected with a
+ * sentinel: the second to last byte is set to nul before formatting, and
+ * if it is not nul afterwards the buffer was filled up to its end. */
+sds sdscatprintf(sds s, const char *fmt, ...) {
+    va_list args;
+    char *scratch;
+    sds result;
+    size_t cap;
+
+    for (cap = 16; ; cap *= 2) {
+        scratch = zmalloc(cap);
+#ifdef SDS_ABORT_ON_OOM
+        if (scratch == NULL) sdsOomAbort();
+#else
+        if (scratch == NULL) return NULL;
+#endif
+        scratch[cap-2] = '\0';
+        va_start(args, fmt);
+        vsnprintf(scratch, cap, fmt, args);
+        va_end(args);
+        if (scratch[cap-2] == '\0') break;
+        zfree(scratch);
+    }
+    result = sdscat(s, scratch);
+    zfree(scratch);
+    return result;
+}
+
+/* Remove from both ends of 's' every leading and trailing character that
+ * is found in the nul terminated set 'cset'. The string is modified in
+ * place (no reallocation) and returned. */
+sds sdstrim(sds s, const char *cset) {
+    struct sdshdr *hdr = (void*)(s-sizeof(struct sdshdr));
+    char *first = s;
+    char *last = s+sdslen(s)-1;
+    char *left = first, *right = last;
+    size_t keep;
+
+    while (left <= last && strchr(cset, *left)) left++;
+    while (right > first && strchr(cset, *right)) right--;
+    if (left > right)
+        keep = 0;
+    else
+        keep = (right-left)+1;
+    if (hdr->buf != left) memmove(hdr->buf, left, keep);
+    hdr->buf[keep] = '\0';
+    hdr->free = hdr->free+(hdr->len-keep);
+    hdr->len = keep;
+    return s;
+}
+
+/* Turn 's' into the substring delimited by the 'start' and 'end' indexes
+ * (both inclusive). The string is modified in place and returned.
+ *
+ * Negative indexes count from the end of the string (-1 is the last
+ * character) and are clamped to 0 when still negative after the
+ * conversion. An 'end' past the end of the string is clamped to the last
+ * character. A 'start' past the end of the string, or greater than 'end',
+ * selects the empty string. */
+sds sdsrange(sds s, int start, int end) {
+    struct sdshdr *sh = (void*) (s-(sizeof(struct sdshdr)));
+    size_t newlen, len = sdslen(s);
+
+    if (len == 0) return s;
+    if (start < 0) {
+        start = len+start;
+        if (start < 0) start = 0;
+    }
+    if (end < 0) {
+        end = len+end;
+        if (end < 0) end = 0;
+    }
+    newlen = (start > end) ? 0 : (end-start)+1;
+    if (newlen != 0) {
+        if (start >= (signed)len) {
+            /* The range begins after the last character: nothing is
+             * selected. Clamping 'start' to len-1 here would wrongly
+             * return the last character instead. */
+            newlen = 0;
+        } else if (end >= (signed)len) {
+            end = len-1;
+            newlen = (start > end) ? 0 : (end-start)+1;
+        }
+    } else {
+        start = 0;
+    }
+    if (start != 0 && newlen != 0) memmove(sh->buf, sh->buf+start, newlen);
+    sh->buf[newlen] = 0;
+    sh->free = sh->free+(sh->len-newlen);
+    sh->len = newlen;
+    return s;
+}
+
+/* Convert every character of 's' to lowercase, in place. */
+void sdstolower(sds s) {
+    int len = sdslen(s), j;
+
+    /* tolower() is only defined for EOF and values representable as
+     * unsigned char: a plain (possibly signed) char holding a byte >= 0x80
+     * must be converted first. */
+    for (j = 0; j < len; j++) s[j] = tolower((unsigned char)s[j]);
+}
+
+/* Convert every character of 's' to uppercase, in place. */
+void sdstoupper(sds s) {
+    int len = sdslen(s), j;
+
+    /* toupper() is only defined for EOF and values representable as
+     * unsigned char: a plain (possibly signed) char holding a byte >= 0x80
+     * must be converted first. */
+    for (j = 0; j < len; j++) s[j] = toupper((unsigned char)s[j]);
+}
+
+/* Compare two sds strings with memcmp() over the length of the shorter
+ * one. If that common prefix is identical the result is the difference
+ * between the two lengths, so the longer string compares greater.
+ * Returns a value less than, equal to, or greater than zero. */
+int sdscmp(sds s1, sds s2) {
+    size_t len1 = sdslen(s1), len2 = sdslen(s2);
+    size_t shortest = len1;
+    int diff;
+
+    if (len2 < len1) shortest = len2;
+    diff = memcmp(s1,s2,shortest);
+    return (diff != 0) ? diff : (int)(len1-len2);
+}
+
+/* Split 's' with separator in 'sep'. An array
+ * of sds strings is returned. *count will be set
+ * by reference to the number of tokens returned.
+ *
+ * NULL is returned on zero length separator, on negative
+ * 'len', and on out of memory (unless SDS_ABORT_ON_OOM is
+ * defined, in which case the process aborts). *count is
+ * not modified when NULL is returned.
+ *
+ * A zero length string is not an error: an allocated
+ * array with no elements is returned and *count is set
+ * to 0. The caller still has to release the array, for
+ * instance with sdsfreesplitres().
+ *
+ * Note that 'sep' is able to split a string using
+ * a multi-character separator. For example
+ * sdssplit("foo_-_bar","_-_"); will return two
+ * elements "foo" and "bar".
+ *
+ * This version of the function is binary-safe but
+ * requires length arguments. sdssplit() is just the
+ * same function but for zero-terminated strings.
+ */
+sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count) {
+    int elements = 0, slots = 5, start = 0, j;
+    sds *tokens;
+
+    /* Validate the arguments before allocating, so that the early return
+     * can't leak the tokens array. */
+    if (seplen < 1 || len < 0) return NULL;
+
+    tokens = zmalloc(sizeof(sds)*slots);
+#ifdef SDS_ABORT_ON_OOM
+    if (tokens == NULL) sdsOomAbort();
+#else
+    if (tokens == NULL) return NULL;
+#endif
+    if (len == 0) {
+        *count = 0;
+        return tokens;
+    }
+    for (j = 0; j < (len-(seplen-1)); j++) {
+        /* make sure there is room for the next element and the final one */
+        if (slots < elements+2) {
+            sds *newtokens;
+
+            slots *= 2;
+            newtokens = zrealloc(tokens,sizeof(sds)*slots);
+            if (newtokens == NULL) {
+#ifdef SDS_ABORT_ON_OOM
+                sdsOomAbort();
+#else
+                goto cleanup;
+#endif
+            }
+            tokens = newtokens;
+        }
+        /* search the separator */
+        if ((seplen == 1 && *(s+j) == sep[0]) || (memcmp(s+j,sep,seplen) == 0)) {
+            tokens[elements] = sdsnewlen(s+start,j-start);
+            if (tokens[elements] == NULL) {
+#ifdef SDS_ABORT_ON_OOM
+                sdsOomAbort();
+#else
+                goto cleanup;
+#endif
+            }
+            elements++;
+            start = j+seplen;
+            j = j+seplen-1; /* skip the separator */
+        }
+    }
+    /* Add the final element. We are sure there is room in the tokens array. */
+    tokens[elements] = sdsnewlen(s+start,len-start);
+    if (tokens[elements] == NULL) {
+#ifdef SDS_ABORT_ON_OOM
+        sdsOomAbort();
+#else
+        goto cleanup;
+#endif
+    }
+    elements++;
+    *count = elements;
+    return tokens;
+
+#ifndef SDS_ABORT_ON_OOM
+cleanup:
+    {
+        int i;
+        for (i = 0; i < elements; i++) sdsfree(tokens[i]);
+        zfree(tokens);
+        return NULL;
+    }
+#endif
+}
+
+/* Free the 'count' strings in 'tokens' (from the last to the first) and
+ * then the array itself. A NULL 'tokens' is a no-op. */
+void sdsfreesplitres(sds *tokens, int count) {
+    if (tokens == NULL) return;
+    while (count != 0) sdsfree(tokens[--count]);
+    zfree(tokens);
+}
+
+/* Create an sds string holding the decimal representation of 'value'.
+ *
+ * The digits are produced from the least significant one, filling a local
+ * buffer backward from its last byte. The buffer is not nul terminated:
+ * the exact length is passed to sdsnewlen(). */
+sds sdsfromlonglong(long long value) {
+    char buf[32], *p;
+    unsigned long long v;
+
+    /* Negate in the unsigned domain: -value overflows (undefined behavior)
+     * when value is LLONG_MIN, while the unsigned negation is well defined
+     * and yields the expected magnitude. */
+    v = (value < 0) ? -(unsigned long long)value : (unsigned long long)value;
+    p = buf+31; /* point to the last character */
+    do {
+        *p-- = '0'+(v%10);
+        v /= 10;
+    } while(v);
+    if (value < 0) *p-- = '-';
+    p++;
+    return sdsnewlen(p,32-(p-buf));
+}
+
+/* Append to 's' a quoted, escaped representation of the 'len' bytes
+ * pointed to by 'p'.
+ *
+ * The output is enclosed in double quotes. Backslash and double quote are
+ * prefixed with a backslash, the \n \r \t \a \b characters are emitted as
+ * their two-character escapes, other printable characters are copied
+ * verbatim and everything else is emitted as \xHH. The returned pointer
+ * must be used in place of 's'. */
+sds sdscatrepr(sds s, char *p, size_t len) {
+    s = sdscatlen(s,"\"",1);
+    while(len--) {
+        switch(*p) {
+        case '\\':
+        case '"':
+            s = sdscatprintf(s,"\\%c",*p);
+            break;
+        /* Every escape below is two characters long (backslash + letter):
+         * a length of 1 would append the backslash alone. */
+        case '\n': s = sdscatlen(s,"\\n",2); break;
+        case '\r': s = sdscatlen(s,"\\r",2); break;
+        case '\t': s = sdscatlen(s,"\\t",2); break;
+        case '\a': s = sdscatlen(s,"\\a",2); break;
+        case '\b': s = sdscatlen(s,"\\b",2); break;
+        default:
+            /* isprint() requires a value representable as unsigned char. */
+            if (isprint((unsigned char)*p))
+                s = sdscatprintf(s,"%c",*p);
+            else
+                s = sdscatprintf(s,"\\x%02x",(unsigned char)*p);
+            break;
+        }
+        p++;
+    }
+    return sdscatlen(s,"\"",1);
+}
diff --git a/src/sds.h b/src/sds.h
new file mode 100644
index 000000000..ef3a418f2
--- /dev/null
+++ b/src/sds.h
@@ -0,0 +1,74 @@
+/* SDSLib, A C dynamic strings library
+ *
+ * Copyright (c) 2006-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __SDS_H
+#define __SDS_H
+
+#include <sys/types.h>
+
+/* An sds string is handled by its users as a plain char pointer. */
+typedef char *sds;
+
+/* Header stored together with the string bytes: two int fields followed by
+ * the character data in the flexible array member 'buf'.
+ * NOTE(review): 'len' is presumably the number of bytes in use and 'free'
+ * the spare bytes available after them -- confirm against sds.c. */
+struct sdshdr {
+ int len;
+ int free;
+ char buf[];
+};
+
+/* Creation, duplication, release and basic concatenation / copy helpers. */
+sds sdsnewlen(const void *init, size_t initlen);
+sds sdsnew(const char *init);
+sds sdsempty();
+size_t sdslen(const sds s);
+sds sdsdup(const sds s);
+void sdsfree(sds s);
+size_t sdsavail(sds s);
+sds sdscatlen(sds s, void *t, size_t len);
+sds sdscat(sds s, char *t);
+sds sdscpylen(sds s, char *t, size_t len);
+sds sdscpy(sds s, char *t);
+
+/* With GCC the printf-style format attribute lets the compiler check the
+ * variadic arguments against 'fmt' (argument 2, values from argument 3). */
+#ifdef __GNUC__
+sds sdscatprintf(sds s, const char *fmt, ...)
+ __attribute__((format(printf, 2, 3)));
+#else
+sds sdscatprintf(sds s, const char *fmt, ...);
+#endif
+
+sds sdstrim(sds s, const char *cset);
+sds sdsrange(sds s, int start, int end);
+void sdsupdatelen(sds s);
+int sdscmp(sds s1, sds s2);
+/* Split 's' on 'sep'; returns the token array and stores the number of
+ * tokens in '*count'. Release the result with sdsfreesplitres(). */
+sds *sdssplitlen(char *s, int len, char *sep, int seplen, int *count);
+/* Free 'count' tokens and the array holding them; NULL is accepted. */
+void sdsfreesplitres(sds *tokens, int count);
+void sdstolower(sds s);
+void sdstoupper(sds s);
+/* Build a new string with the decimal representation of 'value'. */
+sds sdsfromlonglong(long long value);
+/* Append a double-quoted, escaped representation of 'len' bytes at 'p'. */
+sds sdscatrepr(sds s, char *p, size_t len);
+
+#endif
diff --git a/src/sha1.c b/src/sha1.c
new file mode 100644
index 000000000..2c50433e8
--- /dev/null
+++ b/src/sha1.c
@@ -0,0 +1,276 @@
+
+/* from valgrind tests */
+
+/* ================ sha1.c ================ */
+/*
+SHA-1 in C
+By Steve Reid <steve@edmweb.com>
+100% Public Domain
+
+Test Vectors (from FIPS PUB 180-1)
+"abc"
+ A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
+"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
+ 84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
+A million repetitions of "a"
+ 34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
+*/
+
+/* #define LITTLE_ENDIAN * This should be #define'd already, if true. */
+/* #define SHA1HANDSOFF * Copies data before messing with it. */
+
+#define SHA1HANDSOFF
+
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h> /* for u_int*_t */
+#if defined(__sun)
+#include "solarisfixes.h"
+#endif
+#include "sha1.h"
+
+/* Byte order detection. If BYTE_ORDER is not already defined, try the
+ * system headers (BSD, Linux) and, failing that, guess from well known
+ * architecture macros. */
+#ifndef BYTE_ORDER
+#if (BSD >= 199103)
+# include <machine/endian.h>
+#else
+#if defined(linux) || defined(__linux__)
+# include <endian.h>
+#else
+#define LITTLE_ENDIAN 1234 /* least-significant byte first (vax, pc) */
+#define BIG_ENDIAN 4321 /* most-significant byte first (IBM, net) */
+#define PDP_ENDIAN 3412 /* LSB first in word, MSW first in long (pdp)*/
+
+#if defined(vax) || defined(ns32000) || defined(sun386) || defined(__i386__) || \
+ defined(MIPSEL) || defined(_MIPSEL) || defined(BIT_ZERO_ON_RIGHT) || \
+ defined(__alpha__) || defined(__alpha)
+#define BYTE_ORDER LITTLE_ENDIAN
+#endif
+
+#if defined(sel) || defined(pyr) || defined(mc68000) || defined(sparc) || \
+ defined(is68k) || defined(tahoe) || defined(ibm032) || defined(ibm370) || \
+ defined(MIPSEB) || defined(_MIPSEB) || defined(_IBMR2) || defined(DGUX) ||\
+ defined(apollo) || defined(__convex__) || defined(_CRAY) || \
+ defined(__hppa) || defined(__hp9000) || \
+ defined(__hp9000s300) || defined(__hp9000s700) || \
+ defined (BIT_ZERO_ON_LEFT) || defined(m68k) || defined(__sparc)
+#define BYTE_ORDER BIG_ENDIAN
+#endif
+#endif /* linux */
+#endif /* BSD */
+#endif /* BYTE_ORDER */
+
+/* Some headers only provide the double-underscore variant: map it onto
+ * BYTE_ORDER when the latter is still missing. */
+#if defined(__BYTE_ORDER) && !defined(BYTE_ORDER)
+#if (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define BYTE_ORDER LITTLE_ENDIAN
+#else
+#define BYTE_ORDER BIG_ENDIAN
+#endif
+#endif
+
+/* Refuse to compile when the byte order is still unknown or invalid. */
+#if !defined(BYTE_ORDER) || \
+ (BYTE_ORDER != BIG_ENDIAN && BYTE_ORDER != LITTLE_ENDIAN && \
+ BYTE_ORDER != PDP_ENDIAN)
+ /* you must determine what the correct bit order is for
+ * your compiler - the next line is an intentional error
+ * which will force your compiles to bomb until you fix
+ * the above macros.
+ */
+#error "Undefined or invalid BYTE_ORDER"
+#endif
+
+/* Rotate the 32 bit 'value' left by 'bits' positions. */
+#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
+
+/* blk0() and blk() perform the initial expand. */
+/* I got the idea of expanding during the round function from SSLeay */
+/* On little endian hosts blk0() reverses the four bytes of word 'i' in
+ * place (and yields the result); on big endian hosts the word is used
+ * unchanged. Both macros expect a variable named 'block' in scope. */
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
+ |(rol(block->l[i],8)&0x00FF00FF))
+#elif BYTE_ORDER == BIG_ENDIAN
+#define blk0(i) block->l[i]
+#else
+#error "Endianness not defined!"
+#endif
+/* blk() overwrites slot i&15 of the 16-word buffer with the XOR of four
+ * other slots rotated left by one, and yields the new word. */
+#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
+ ^block->l[(i+2)&15]^block->l[i&15],1))
+
+/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
+/* Each macro adds to z a boolean function of (w,x,y), the expanded word,
+ * a round constant and rol(v,5), then rotates w by 30. The expansions
+ * already end with a semicolon. */
+#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
+#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
+#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
+#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
+
+
+/* Hash a single 512-bit block. This is the core of the algorithm.
+ *
+ * 'state' holds the five 32 bit chaining values and is updated in place;
+ * 'buffer' is the 64 byte block to process. With SHA1HANDSOFF defined the
+ * block is copied to a local union first, so the caller's data is never
+ * modified by the in-place message expansion. */
+
+void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64])
+{
+u_int32_t a, b, c, d, e;
+/* View of the block both as 64 bytes and as 16 32-bit words. */
+typedef union {
+ unsigned char c[64];
+ u_int32_t l[16];
+} CHAR64LONG16;
+#ifdef SHA1HANDSOFF
+CHAR64LONG16 block[1]; /* use array to appear as a pointer */
+ memcpy(block, buffer, 64);
+#else
+ /* The following had better never be used because it causes the
+ * pointer-to-const buffer to be cast into a pointer to non-const.
+ * And the result is written through. I threw a "const" in, hoping
+ * this will cause a diagnostic.
+ */
+CHAR64LONG16* block = (const CHAR64LONG16*)buffer;
+#endif
+ /* Copy context->state[] to working vars */
+ a = state[0];
+ b = state[1];
+ c = state[2];
+ d = state[3];
+ e = state[4];
+ /* 4 rounds of 20 operations each. Loop unrolled. */
+ /* The argument order rotates by one position at every step, so the
+ * five working variables exchange roles without being copied. */
+ R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
+ R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
+ R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
+ R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
+ R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
+ R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
+ R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
+ R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
+ R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
+ R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
+ R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
+ R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
+ R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
+ R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
+ R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
+ R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
+ R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
+ R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
+ R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
+ R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
+ /* Add the working vars back into context.state[] */
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ /* Wipe variables */
+ a = b = c = d = e = 0;
+#ifdef SHA1HANDSOFF
+ /* 'block' is the local copy here, so sizeof covers the whole union. */
+ memset(block, '\0', sizeof(block));
+#endif
+}
+
+
+/* SHA1Init - Prepare 'context' for a new digest computation: load the
+ * five initial chaining values and reset the 64 bit message length. */
+
+void SHA1Init(SHA1_CTX* context)
+{
+    /* SHA1 initialization constants */
+    static const u_int32_t initial_state[5] = {
+        0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0
+    };
+    int word;
+
+    for (word = 0; word < 5; word++)
+        context->state[word] = initial_state[word];
+    context->count[1] = 0;
+    context->count[0] = 0;
+}
+
+
+/* Feed 'len' bytes of 'data' into the digest computation. Full 64 byte
+ * blocks are hashed right away, the remaining tail is kept in
+ * context->buffer for the next call. */
+
+void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len)
+{
+    u_int32_t consumed = 0;                 /* bytes of 'data' already used */
+    u_int32_t prevbits = context->count[0]; /* low word of the bit counter */
+    u_int32_t fill = (prevbits >> 3) & 63;  /* bytes pending in the buffer */
+
+    /* Advance the 64 bit message length (in bits), propagating the carry
+     * of the low word and the bits of 'len' shifted out by the <<3. */
+    context->count[0] = prevbits + (len << 3);
+    if (context->count[0] < prevbits)
+        context->count[1]++;
+    context->count[1] += (len >> 29);
+
+    if (fill + len >= 64) {
+        /* Complete the pending block, then hash directly from 'data'
+         * for as long as whole blocks are available. */
+        consumed = 64 - fill;
+        memcpy(context->buffer + fill, data, consumed);
+        SHA1Transform(context->state, context->buffer);
+        while (consumed + 63 < len) {
+            SHA1Transform(context->state, data + consumed);
+            consumed += 64;
+        }
+        fill = 0;
+    }
+    /* Save whatever is left for the next round. */
+    memcpy(context->buffer + fill, data + consumed, len - consumed);
+}
+
+
+/* Add padding and return the message digest.
+ *
+ * The 20 byte result is written to 'digest'; 'context' is zeroed before
+ * returning, so it must be initialized again with SHA1Init() before any
+ * further use. */
+
+void SHA1Final(unsigned char digest[20], SHA1_CTX* context)
+{
+unsigned i;
+unsigned char finalcount[8];
+unsigned char c;
+
+#if 0 /* untested "improvement" by DHR */
+ /* Convert context->count to a sequence of bytes
+ * in finalcount. Second element first, but
+ * big-endian order within element.
+ * But we do it all backwards.
+ */
+ unsigned char *fcp = &finalcount[8];
+
+ for (i = 0; i < 2; i++)
+ {
+ u_int32_t t = context->count[i];
+ int j;
+
+ for (j = 0; j < 4; t >>= 8, j++)
+ *--fcp = (unsigned char) t
+ }
+#else
+ /* Serialize the 64 bit bit-count most significant byte first:
+ * bytes 0-3 come from count[1], bytes 4-7 from count[0]. */
+ for (i = 0; i < 8; i++) {
+ finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 0 : 1)]
+ >> ((3-(i & 3)) * 8) ) & 255); /* Endian independent */
+ }
+#endif
+ /* Padding: a single 0x80 byte (octal 0200)... */
+ c = 0200;
+ SHA1Update(context, &c, 1);
+ /* ...followed by zero bytes until the length is 448 bits modulo 512
+ * (504 masks the bits of the counter selecting the byte in the block). */
+ while ((context->count[0] & 504) != 448) {
+ c = 0000;
+ SHA1Update(context, &c, 1);
+ }
+ SHA1Update(context, finalcount, 8); /* Should cause a SHA1Transform() */
+ /* Emit the five state words most significant byte first. */
+ for (i = 0; i < 20; i++) {
+ digest[i] = (unsigned char)
+ ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255);
+ }
+ /* Wipe variables */
+ memset(context, '\0', sizeof(*context));
+ memset(&finalcount, '\0', sizeof(finalcount));
+}
+/* ================ end of sha1.c ================ */
+
+/* Disabled self-test: hashes 1000 times a 4096 byte buffer filled with the
+ * byte pattern 0,1,2,... and prints the digest in hexadecimal. Change the
+ * '#if 0' to '#if 1' to build it as a standalone program. */
+#if 0
+#define BUFSIZE 4096
+
+int
+main(int argc, char **argv)
+{
+ SHA1_CTX ctx;
+ unsigned char hash[20], buf[BUFSIZE];
+ int i;
+
+ for(i=0;i<BUFSIZE;i++)
+ buf[i] = i;
+
+ SHA1Init(&ctx);
+ for(i=0;i<1000;i++)
+ SHA1Update(&ctx, buf, BUFSIZE);
+ SHA1Final(hash, &ctx);
+
+ printf("SHA1=");
+ for(i=0;i<20;i++)
+ printf("%02x", hash[i]);
+ printf("\n");
+ return 0;
+}
+
+#endif
diff --git a/src/sha1.h b/src/sha1.h
new file mode 100644
index 000000000..9d6f12965
--- /dev/null
+++ b/src/sha1.h
@@ -0,0 +1,17 @@
+/* ================ sha1.h ================ */
+/*
+SHA-1 in C
+By Steve Reid <steve@edmweb.com>
+100% Public Domain
+*/
+
+/* State of an incremental SHA-1 computation. */
+typedef struct {
+ u_int32_t state[5]; /* the five 32 bit chaining values */
+ u_int32_t count[2]; /* message length in bits: [0] low word, [1] high */
+ unsigned char buffer[64]; /* bytes of the block not yet hashed */
+} SHA1_CTX;
+
+/* Process one 64 byte block, updating 'state' in place. */
+void SHA1Transform(u_int32_t state[5], const unsigned char buffer[64]);
+/* Initialize 'context' for a new digest. */
+void SHA1Init(SHA1_CTX* context);
+/* Add 'len' bytes of 'data' to the digest being computed. */
+void SHA1Update(SHA1_CTX* context, const unsigned char* data, u_int32_t len);
+/* Pad the message, store the 20 byte digest and wipe 'context'. */
+void SHA1Final(unsigned char digest[20], SHA1_CTX* context);
diff --git a/src/solarisfixes.h b/src/solarisfixes.h
new file mode 100644
index 000000000..ce8e7b6fd
--- /dev/null
+++ b/src/solarisfixes.h
@@ -0,0 +1,21 @@
+/* Solaris specific fixes */
+
+/* The replacements below rely on GCC extensions (statement expressions,
+ * __typeof, __builtin_expect), hence the guard. */
+#if defined(__GNUC__)
+/* A value is NaN when it does not compare equal to itself. */
+#undef isnan
+#define isnan(x) \
+ __extension__({ __typeof (x) __x_a = (x); \
+ __builtin_expect(__x_a != __x_a, 0); })
+
+/* A value is finite when subtracting it from itself does not give NaN. */
+#undef isfinite
+#define isfinite(x) \
+ __extension__ ({ __typeof (x) __x_f = (x); \
+ __builtin_expect(!isnan(__x_f - __x_f), 1); })
+
+/* A value is infinite when it is neither NaN nor finite. */
+#undef isinf
+#define isinf(x) \
+ __extension__ ({ __typeof (x) __x_i = (x); \
+ __builtin_expect(!isnan(__x_i) && !isfinite(__x_i), 0); })
+
+/* Map the BSD style integer type names onto other type names. */
+#define u_int uint
+#define u_int32_t uint32_t
+#endif /* __GNUC__ */
diff --git a/src/sort.c b/src/sort.c
new file mode 100644
index 000000000..0bc86b474
--- /dev/null
+++ b/src/sort.c
@@ -0,0 +1,383 @@
+#include "redis.h"
+#include "pqsort.h" /* Partial qsort for SORT+LIMIT */
+
+/* Allocate a sort operation of the given 'type' acting on 'pattern'.
+ * The pattern object is stored as is: its reference count is untouched. */
+redisSortOperation *createSortOperation(int type, robj *pattern) {
+    redisSortOperation *op;
+
+    op = zmalloc(sizeof(redisSortOperation));
+    op->pattern = pattern;
+    op->type = type;
+    return op;
+}
+
+/* Return the value associated to the key with a name obtained
+ * substituting the first occurrence of '*' in 'pattern' with 'subst'.
+ * If the part of the pattern after the '*' contains "->", what follows is
+ * used as the name of a field to fetch from the hash stored at the key.
+ *
+ * The returned object will always have its refcount increased by 1
+ * when it is non-NULL. NULL is returned when the resulting key name would
+ * be too long, the pattern has no '*', the key does not exist, the value
+ * has the wrong type, or the hash field is missing. */
+robj *lookupKeyByPattern(redisDb *db, robj *pattern, robj *subst) {
+    char *p, *f;
+    sds spat, ssub;
+    robj keyobj, fieldobj, *o;
+    int prefixlen, sublen, postfixlen, fieldlen;
+    /* Exploit the internal sds representation to create a sds string
+     * allocated on the stack in order to make this function faster */
+    struct {
+        int len;
+        int free;
+        char buf[REDIS_SORTKEY_MAX+1];
+    } keyname, fieldname;
+
+    /* If the pattern is "#" return the substitution object itself in order
+     * to implement the "SORT ... GET #" feature. */
+    spat = pattern->ptr;
+    if (spat[0] == '#' && spat[1] == '\0') {
+        incrRefCount(subst);
+        return subst;
+    }
+
+    /* The substitution object may be specially encoded. If so we create
+     * a decoded object on the fly. Otherwise getDecodedObject will just
+     * increment the ref count, that we'll decrement later. */
+    subst = getDecodedObject(subst);
+
+    ssub = subst->ptr;
+    if (sdslen(spat)+sdslen(ssub)-1 > REDIS_SORTKEY_MAX) {
+        /* Drop the reference obtained from getDecodedObject() on this
+         * exit path too, otherwise the object is leaked. */
+        decrRefCount(subst);
+        return NULL;
+    }
+    p = strchr(spat,'*');
+    if (!p) {
+        decrRefCount(subst);
+        return NULL;
+    }
+
+    /* Find out if we're dealing with a hash dereference. */
+    if ((f = strstr(p+1, "->")) != NULL) {
+        fieldlen = sdslen(spat)-(f-spat);
+        /* this also copies \0 character */
+        memcpy(fieldname.buf,f+2,fieldlen-1);
+        fieldname.len = fieldlen-2;
+    } else {
+        fieldlen = 0;
+    }
+
+    /* Build the key name: prefix + substitution + what follows the '*'
+     * (excluding the "->field" part, if any). */
+    prefixlen = p-spat;
+    sublen = sdslen(ssub);
+    postfixlen = sdslen(spat)-(prefixlen+1)-fieldlen;
+    memcpy(keyname.buf,spat,prefixlen);
+    memcpy(keyname.buf+prefixlen,ssub,sublen);
+    memcpy(keyname.buf+prefixlen+sublen,p+1,postfixlen);
+    keyname.buf[prefixlen+sublen+postfixlen] = '\0';
+    keyname.len = prefixlen+sublen+postfixlen;
+    decrRefCount(subst);
+
+    /* Lookup substituted key */
+    initStaticStringObject(keyobj,((char*)&keyname)+(sizeof(struct sdshdr)));
+    o = lookupKeyRead(db,&keyobj);
+    if (o == NULL) return NULL;
+
+    if (fieldlen > 0) {
+        if (o->type != REDIS_HASH || fieldname.len < 1) return NULL;
+
+        /* Retrieve value from hash by the field name. This operation
+         * already increases the refcount of the returned object. */
+        initStaticStringObject(fieldobj,((char*)&fieldname)+(sizeof(struct sdshdr)));
+        o = hashTypeGet(o, &fieldobj);
+    } else {
+        if (o->type != REDIS_STRING) return NULL;
+
+        /* Every object that this function returns needs to have its refcount
+         * increased. sortCommand decreases it again. */
+        incrRefCount(o);
+    }
+
+    return o;
+}
+
+/* Comparison callback handed to qsort/pqsort by sortCommand(). The sort
+ * parameters (alpha, by-pattern, descending) cannot be passed as extra
+ * arguments without the non standard qsort_r, so they are read from the
+ * global 'server' structure. */
+int sortCompare(const void *s1, const void *s2) {
+    const redisSortObject *a = s1, *b = s2;
+    int result;
+
+    if (server.sort_alpha) {
+        /* Alphanumeric sorting */
+        if (!server.sort_bypattern) {
+            /* Compare elements directly. */
+            result = compareStringObjects(a->obj,b->obj);
+        } else if (a->u.cmpobj && b->u.cmpobj) {
+            /* We have both the objects, use strcoll */
+            result = strcoll(a->u.cmpobj->ptr,b->u.cmpobj->ptr);
+        } else if (a->u.cmpobj == b->u.cmpobj) {
+            /* Both compare objects are NULL */
+            result = 0;
+        } else {
+            /* Exactly one is NULL: the missing one sorts first */
+            result = (a->u.cmpobj == NULL) ? -1 : 1;
+        }
+    } else {
+        /* Numeric sorting. Here it's trivial as we precomputed scores */
+        if (a->u.score > b->u.score)
+            result = 1;
+        else if (a->u.score < b->u.score)
+            result = -1;
+        else
+            result = 0;
+    }
+    if (server.sort_desc) return -result;
+    return result;
+}
+
+/* The SORT command is the most complex command in Redis. Warning: this code
+ * is optimized for speed and a bit less for readability.
+ *
+ * Outline: look up the list/set/sorted set at argv[1], parse the options
+ * (ASC, DESC, ALPHA, LIMIT, STORE, BY, GET), load the elements in a vector,
+ * compute the sorting weights, sort, then either send the result to the
+ * client or store it as a list at the STORE key. */
+void sortCommand(redisClient *c) {
+ list *operations;
+ unsigned int outputlen = 0;
+ int desc = 0, alpha = 0;
+ int limit_start = 0, limit_count = -1, start, end;
+ int j, dontsort = 0, vectorlen;
+ int getop = 0; /* GET operation counter */
+ robj *sortval, *sortby = NULL, *storekey = NULL;
+ redisSortObject *vector; /* Resulting vector to sort */
+
+ /* Lookup the key to sort. It must be of the right types */
+ sortval = lookupKeyRead(c->db,c->argv[1]);
+ if (sortval == NULL) {
+ addReply(c,shared.emptymultibulk);
+ return;
+ }
+ if (sortval->type != REDIS_SET && sortval->type != REDIS_LIST &&
+ sortval->type != REDIS_ZSET)
+ {
+ addReply(c,shared.wrongtypeerr);
+ return;
+ }
+
+ /* Create a list of operations to perform for every sorted element.
+ * Only GET operations are created by the option parser below. */
+ operations = listCreate();
+ listSetFreeMethod(operations,zfree);
+ j = 2;
+
+ /* Now we need to protect sortval incrementing its count, in the future
+ * SORT may have options able to overwrite/delete keys during the sorting
+ * and the sorted key itself may get destroyed */
+ incrRefCount(sortval);
+
+ /* The SORT command has an SQL-alike syntax, parse it */
+ while(j < c->argc) {
+ int leftargs = c->argc-j-1;
+ if (!strcasecmp(c->argv[j]->ptr,"asc")) {
+ desc = 0;
+ } else if (!strcasecmp(c->argv[j]->ptr,"desc")) {
+ desc = 1;
+ } else if (!strcasecmp(c->argv[j]->ptr,"alpha")) {
+ alpha = 1;
+ } else if (!strcasecmp(c->argv[j]->ptr,"limit") && leftargs >= 2) {
+ limit_start = atoi(c->argv[j+1]->ptr);
+ limit_count = atoi(c->argv[j+2]->ptr);
+ j+=2;
+ } else if (!strcasecmp(c->argv[j]->ptr,"store") && leftargs >= 1) {
+ storekey = c->argv[j+1];
+ j++;
+ } else if (!strcasecmp(c->argv[j]->ptr,"by") && leftargs >= 1) {
+ sortby = c->argv[j+1];
+ /* If the BY pattern does not contain '*', i.e. it is constant,
+ * we don't need to sort nor to lookup the weight keys. */
+ if (strchr(c->argv[j+1]->ptr,'*') == NULL) dontsort = 1;
+ j++;
+ } else if (!strcasecmp(c->argv[j]->ptr,"get") && leftargs >= 1) {
+ listAddNodeTail(operations,createSortOperation(
+ REDIS_SORT_GET,c->argv[j+1]));
+ getop++;
+ j++;
+ } else {
+ /* Unknown option, or option without enough arguments. */
+ decrRefCount(sortval);
+ listRelease(operations);
+ addReply(c,shared.syntaxerr);
+ return;
+ }
+ j++;
+ }
+
+ /* Load the sorting vector with all the objects to sort */
+ switch(sortval->type) {
+ case REDIS_LIST: vectorlen = listTypeLength(sortval); break;
+ case REDIS_SET: vectorlen = dictSize((dict*)sortval->ptr); break;
+ case REDIS_ZSET: vectorlen = dictSize(((zset*)sortval->ptr)->dict); break;
+ default: vectorlen = 0; redisPanic("Bad SORT type"); /* Avoid GCC warning */
+ }
+ vector = zmalloc(sizeof(redisSortObject)*vectorlen);
+ j = 0;
+
+ if (sortval->type == REDIS_LIST) {
+ listTypeIterator *li = listTypeInitIterator(sortval,0,REDIS_TAIL);
+ listTypeEntry entry;
+ while(listTypeNext(li,&entry)) {
+ vector[j].obj = listTypeGet(&entry);
+ vector[j].u.score = 0;
+ vector[j].u.cmpobj = NULL;
+ j++;
+ }
+ listTypeReleaseIterator(li);
+ } else {
+ dict *set;
+ dictIterator *di;
+ dictEntry *setele;
+
+ /* Sets and sorted sets are both iterated through a dict. */
+ if (sortval->type == REDIS_SET) {
+ set = sortval->ptr;
+ } else {
+ zset *zs = sortval->ptr;
+ set = zs->dict;
+ }
+
+ di = dictGetIterator(set);
+ while((setele = dictNext(di)) != NULL) {
+ vector[j].obj = dictGetEntryKey(setele);
+ vector[j].u.score = 0;
+ vector[j].u.cmpobj = NULL;
+ j++;
+ }
+ dictReleaseIterator(di);
+ }
+ redisAssert(j == vectorlen);
+
+ /* Now it's time to load the right scores in the sorting vector */
+ if (dontsort == 0) {
+ for (j = 0; j < vectorlen; j++) {
+ robj *byval;
+ if (sortby) {
+ /* lookup value to sort by */
+ byval = lookupKeyByPattern(c->db,sortby,vector[j].obj);
+ /* No weight found: the element keeps score 0 / NULL cmpobj. */
+ if (!byval) continue;
+ } else {
+ /* use object itself to sort by */
+ byval = vector[j].obj;
+ }
+
+ if (alpha) {
+ if (sortby) vector[j].u.cmpobj = getDecodedObject(byval);
+ } else {
+ if (byval->encoding == REDIS_ENCODING_RAW) {
+ vector[j].u.score = strtod(byval->ptr,NULL);
+ } else if (byval->encoding == REDIS_ENCODING_INT) {
+ /* Don't need to decode the object if it's
+ * integer-encoded (the only encoding supported) so
+ * far. We can just cast it */
+ vector[j].u.score = (long)byval->ptr;
+ } else {
+ redisAssert(1 != 1);
+ }
+ }
+
+ /* when the object was retrieved using lookupKeyByPattern,
+ * its refcount needs to be decreased. */
+ if (sortby) {
+ decrRefCount(byval);
+ }
+ }
+ }
+
+ /* We are ready to sort the vector... perform a bit of sanity check
+ * on the LIMIT option too. We'll use a partial version of quicksort. */
+ start = (limit_start < 0) ? 0 : limit_start;
+ end = (limit_count < 0) ? vectorlen-1 : start+limit_count-1;
+ /* A start past the end selects an empty range (end == start-1). */
+ if (start >= vectorlen) {
+ start = vectorlen-1;
+ end = vectorlen-2;
+ }
+ if (end >= vectorlen) end = vectorlen-1;
+
+ if (dontsort == 0) {
+ /* sortCompare() reads its parameters from the server structure. */
+ server.sort_desc = desc;
+ server.sort_alpha = alpha;
+ server.sort_bypattern = sortby ? 1 : 0;
+ /* pqsort is used only with BY when just a sub-range is requested. */
+ if (sortby && (start != 0 || end != vectorlen-1))
+ pqsort(vector,vectorlen,sizeof(redisSortObject),sortCompare, start,end);
+ else
+ qsort(vector,vectorlen,sizeof(redisSortObject),sortCompare);
+ }
+
+ /* Send command output to the output buffer, performing the specified
+ * GET operations if any. With GET options every selected element
+ * produces one output item per GET. */
+ outputlen = getop ? getop*(end-start+1) : end-start+1;
+ if (storekey == NULL) {
+ /* STORE option not specified, send the sorting result to client */
+ addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",outputlen));
+ for (j = start; j <= end; j++) {
+ listNode *ln;
+ listIter li;
+
+ if (!getop) addReplyBulk(c,vector[j].obj);
+ listRewind(operations,&li);
+ while((ln = listNext(&li))) {
+ redisSortOperation *sop = ln->value;
+ robj *val = lookupKeyByPattern(c->db,sop->pattern,
+ vector[j].obj);
+
+ if (sop->type == REDIS_SORT_GET) {
+ if (!val) {
+ addReply(c,shared.nullbulk);
+ } else {
+ addReplyBulk(c,val);
+ decrRefCount(val);
+ }
+ } else {
+ redisAssert(sop->type == REDIS_SORT_GET); /* always fails */
+ }
+ }
+ }
+ } else {
+ robj *sobj = createZiplistObject();
+
+ /* STORE option specified, set the sorting result as a List object */
+ for (j = start; j <= end; j++) {
+ listNode *ln;
+ listIter li;
+
+ if (!getop) {
+ listTypePush(sobj,vector[j].obj,REDIS_TAIL);
+ } else {
+ listRewind(operations,&li);
+ while((ln = listNext(&li))) {
+ redisSortOperation *sop = ln->value;
+ robj *val = lookupKeyByPattern(c->db,sop->pattern,
+ vector[j].obj);
+
+ if (sop->type == REDIS_SORT_GET) {
+ /* A missing value is stored as an empty string. */
+ if (!val) val = createStringObject("",0);
+
+ /* listTypePush does an incrRefCount, so we should take
+ * care of the incremented refcount caused by either
+ * lookupKeyByPattern or createStringObject("",0) */
+ listTypePush(sobj,val,REDIS_TAIL);
+ decrRefCount(val);
+ } else {
+ /* always fails */
+ redisAssert(sop->type == REDIS_SORT_GET);
+ }
+ }
+ }
+ }
+ dbReplace(c->db,storekey,sobj);
+ /* Note: we add 1 because the DB is dirty anyway since even if the
+ * SORT result is empty a new key is set and maybe the old content
+ * replaced. */
+ server.dirty += 1+outputlen;
+ addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",outputlen));
+ }
+
+ /* Cleanup */
+ /* NOTE(review): only list elements are released here, presumably
+ * because listTypeGet() returns a new reference while dict keys are
+ * borrowed -- confirm against listTypeGet(). */
+ if (sortval->type == REDIS_LIST)
+ for (j = 0; j < vectorlen; j++)
+ decrRefCount(vector[j].obj);
+ decrRefCount(sortval);
+ listRelease(operations);
+ /* Release the decoded weights created for ALPHA + BY sorting. */
+ for (j = 0; j < vectorlen; j++) {
+ if (alpha && vector[j].u.cmpobj)
+ decrRefCount(vector[j].u.cmpobj);
+ }
+ zfree(vector);
+}
+
+
diff --git a/src/t_hash.c b/src/t_hash.c
new file mode 100644
index 000000000..3f5fd6e16
--- /dev/null
+++ b/src/t_hash.c
@@ -0,0 +1,397 @@
+#include "redis.h"
+
+#include <math.h>
+
+/*-----------------------------------------------------------------------------
+ * Hash type API
+ *----------------------------------------------------------------------------*/
+
+/* Convert a zipmap encoded hash into a real hash table as soon as one of
+ * the arguments argv[start..end] (both included) is a raw string longer
+ * than server.hash_max_zipmap_value. Only raw encoded strings are checked,
+ * as their length can be queried in constant time. */
+void hashTypeTryConversion(robj *subject, robj **argv, int start, int end) {
+    int idx = start;
+
+    if (subject->encoding != REDIS_ENCODING_ZIPMAP) return;
+
+    while (idx <= end) {
+        robj *arg = argv[idx++];
+
+        if (arg->encoding != REDIS_ENCODING_RAW) continue;
+        if (sdslen(arg->ptr) > server.hash_max_zipmap_value) {
+            convertToRealHash(subject);
+            break;
+        }
+    }
+}
+
+/* When the hash is backed by a dict, replace the objects pointed by 'o1'
+ * and 'o2' with their encoded version. Either pointer may be NULL. */
+void hashTypeTryObjectEncoding(robj *subject, robj **o1, robj **o2) {
+    if (subject->encoding != REDIS_ENCODING_HT) return;
+
+    if (o1 != NULL) *o1 = tryObjectEncoding(*o1);
+    if (o2 != NULL) *o2 = tryObjectEncoding(*o2);
+}
+
+/* Fetch the value stored at field 'key' of the hash 'o'. Returns NULL when
+ * the field does not exist, otherwise a string object the caller has to
+ * release: a freshly created one for zipmaps, the stored one with its
+ * refcount incremented for hash tables. */
+robj *hashTypeGet(robj *o, robj *key) {
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        dictEntry *entry = dictFind(o->ptr,key);
+        robj *found;
+
+        if (entry == NULL) return NULL;
+        found = dictGetEntryVal(entry);
+        incrRefCount(found);
+        return found;
+    } else {
+        robj *result = NULL;
+        robj *rawkey = getDecodedObject(key);
+        unsigned char *vptr;
+        unsigned int vsize;
+
+        if (zipmapGet(o->ptr,rawkey->ptr,sdslen(rawkey->ptr),&vptr,&vsize))
+            result = createStringObject((char*)vptr,vsize);
+        decrRefCount(rawkey);
+        return result;
+    }
+}
+
+/* Return 1 if the field 'key' exists in the hash 'o', 0 otherwise. */
+int hashTypeExists(robj *o, robj *key) {
+    int found;
+
+    if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+        robj *rawkey = getDecodedObject(key);
+
+        found = zipmapExists(o->ptr,rawkey->ptr,sdslen(rawkey->ptr)) ? 1 : 0;
+        decrRefCount(rawkey);
+    } else {
+        found = (dictFind(o->ptr,key) != NULL);
+    }
+    return found;
+}
+
+/* Set field 'key' of the hash 'o' to 'value', replacing the old value if
+ * the field already exists. Return 0 on insert and 1 on update.
+ *
+ * A zipmap that grows past server.hash_max_zipmap_entries is converted
+ * to a real hash table. */
+int hashTypeSet(robj *o, robj *key, robj *value) {
+    int updated = 0;
+    robj *rawkey, *rawval;
+
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        /* The dict keeps the objects: take a reference on the value, and
+         * on the key too when a new entry was created. */
+        if (dictReplace(o->ptr,key,value))
+            incrRefCount(key);
+        else
+            updated = 1;
+        incrRefCount(value);
+        return updated;
+    }
+
+    rawkey = getDecodedObject(key);
+    rawval = getDecodedObject(value);
+    o->ptr = zipmapSet(o->ptr,
+        rawkey->ptr,sdslen(rawkey->ptr),
+        rawval->ptr,sdslen(rawval->ptr), &updated);
+    decrRefCount(rawkey);
+    decrRefCount(rawval);
+
+    /* Check if the zipmap needs to be upgraded to a real hash table */
+    if (zipmapLen(o->ptr) > server.hash_max_zipmap_entries)
+        convertToRealHash(o);
+    return updated;
+}
+
+/* Remove field 'key' from the hash 'o'.
+ * Return 1 if the field was deleted and 0 if it was not found. */
+int hashTypeDelete(robj *o, robj *key) {
+    int removed = 0;
+    robj *rawkey;
+
+    if (o->encoding != REDIS_ENCODING_ZIPMAP) {
+        if (dictDelete((dict*)o->ptr,key) == DICT_OK) {
+            removed = 1;
+            /* Always check if the dictionary needs a resize after a
+             * delete. */
+            if (htNeedsResize(o->ptr)) dictResize(o->ptr);
+        }
+        return removed;
+    }
+
+    rawkey = getDecodedObject(key);
+    o->ptr = zipmapDel(o->ptr,rawkey->ptr,sdslen(rawkey->ptr), &removed);
+    decrRefCount(rawkey);
+    return removed;
+}
+
+/* Return how many fields the hash 'o' contains, whatever its encoding. */
+unsigned long hashTypeLength(robj *o) {
+    if (o->encoding == REDIS_ENCODING_ZIPMAP)
+        return zipmapLen((unsigned char*)o->ptr);
+    return dictSize((dict*)o->ptr);
+}
+
+/* Allocate an iterator over the fields of the hash 'subject'. The iterator
+ * remembers the encoding found at creation time; it must be released with
+ * hashTypeReleaseIterator(). */
+hashTypeIterator *hashTypeInitIterator(robj *subject) {
+    hashTypeIterator *it = zmalloc(sizeof(*it));
+
+    it->encoding = subject->encoding;
+    if (it->encoding == REDIS_ENCODING_HT) {
+        it->di = dictGetIterator(subject->ptr);
+    } else if (it->encoding == REDIS_ENCODING_ZIPMAP) {
+        it->zi = zipmapRewind(subject->ptr);
+    } else {
+        /* Unknown encoding */
+        redisAssert(NULL);
+    }
+    return it;
+}
+
+/* Free a hash iterator, including the underlying dict iterator when the
+ * hash is encoded as a hash table. */
+void hashTypeReleaseIterator(hashTypeIterator *hi) {
+    int uses_dict = (hi->encoding == REDIS_ENCODING_HT);
+
+    if (uses_dict) dictReleaseIterator(hi->di);
+    zfree(hi);
+}
+
+/* Advance the iterator to the next field. Return REDIS_OK if there is one
+ * and REDIS_ERR once the end of the hash has been reached. */
+int hashTypeNext(hashTypeIterator *hi) {
+    if (hi->encoding != REDIS_ENCODING_ZIPMAP) {
+        hi->de = dictNext(hi->di);
+        return (hi->de == NULL) ? REDIS_ERR : REDIS_OK;
+    }
+
+    hi->zi = zipmapNext(hi->zi, &hi->zk, &hi->zklen, &hi->zv, &hi->zvlen);
+    return (hi->zi == NULL) ? REDIS_ERR : REDIS_OK;
+}
+
+/* Return the field name (when 'what' has REDIS_HASH_KEY set) or the value
+ * at the current position of the iterator. The caller owns one reference
+ * to the returned object: it is a new object for zipmaps, the stored one
+ * with its refcount incremented for hash tables. */
+robj *hashTypeCurrent(hashTypeIterator *hi, int what) {
+    int wantkey = what & REDIS_HASH_KEY;
+    robj *o;
+
+    if (hi->encoding == REDIS_ENCODING_ZIPMAP) {
+        if (wantkey)
+            return createStringObject((char*)hi->zk,hi->zklen);
+        return createStringObject((char*)hi->zv,hi->zvlen);
+    }
+
+    if (wantkey)
+        o = dictGetEntryKey(hi->de);
+    else
+        o = dictGetEntryVal(hi->de);
+    incrRefCount(o);
+    return o;
+}
+
+/* Look up 'key' for writing and return the hash stored there, creating an
+ * empty hash if the key does not exist. When the key holds a value of a
+ * different type a wrong type error is sent to the client and NULL is
+ * returned. */
+robj *hashTypeLookupWriteOrCreate(redisClient *c, robj *key) {
+    robj *o = lookupKeyWrite(c->db,key);
+
+    if (o != NULL) {
+        if (o->type == REDIS_HASH) return o;
+        addReply(c,shared.wrongtypeerr);
+        return NULL;
+    }
+
+    o = createHashObject();
+    dbAdd(c->db,key,o);
+    return o;
+}
+
+/* Turn the zipmap encoded hash 'o' into a hash table: every field/value
+ * pair is copied into a new dict as (possibly encoded) string objects,
+ * then the zipmap is freed and the object switched to the HT encoding. */
+void convertToRealHash(robj *o) {
+    unsigned char *zm = o->ptr;
+    unsigned char *cursor, *kptr, *vptr;
+    unsigned int ksize, vsize;
+    dict *ht = dictCreate(&hashDictType,NULL);
+
+    redisAssert(o->type == REDIS_HASH && o->encoding != REDIS_ENCODING_HT);
+    for (cursor = zipmapNext(zipmapRewind(zm),&kptr,&ksize,&vptr,&vsize);
+         cursor != NULL;
+         cursor = zipmapNext(cursor,&kptr,&ksize,&vptr,&vsize))
+    {
+        robj *field = createStringObject((char*)kptr,ksize);
+        robj *value = createStringObject((char*)vptr,vsize);
+
+        field = tryObjectEncoding(field);
+        value = tryObjectEncoding(value);
+        dictAdd(ht,field,value);
+    }
+    o->encoding = REDIS_ENCODING_HT;
+    o->ptr = ht;
+    zfree(zm);
+}
+
+/*-----------------------------------------------------------------------------
+ * Hash type commands
+ *----------------------------------------------------------------------------*/
+
+/* HSET key field value: replies 1 when a new field was created and 0 when
+ * an existing field was updated. */
+void hsetCommand(redisClient *c) {
+    robj *o = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+
+    if (o == NULL) return;
+    hashTypeTryConversion(o,c->argv,2,3);
+    hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
+    if (hashTypeSet(o,c->argv[2],c->argv[3]))
+        addReply(c, shared.czero);
+    else
+        addReply(c, shared.cone);
+    server.dirty++;
+}
+
+/* HSETNX key field value: set the field only if it does not exist yet.
+ * Replies 1 when the field was set and 0 when it was already there. */
+void hsetnxCommand(redisClient *c) {
+    robj *o = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+
+    if (o == NULL) return;
+    hashTypeTryConversion(o,c->argv,2,3);
+
+    if (hashTypeExists(o, c->argv[2])) {
+        addReply(c, shared.czero);
+        return;
+    }
+
+    hashTypeTryObjectEncoding(o,&c->argv[2], &c->argv[3]);
+    hashTypeSet(o,c->argv[2],c->argv[3]);
+    addReply(c, shared.cone);
+    server.dirty++;
+}
+
+/* HMSET key field value [field value ...]: set all the given fields and
+ * reply with OK. An odd total number of arguments means that a value is
+ * missing and is reported as an error. */
+void hmsetCommand(redisClient *c) {
+    robj *o;
+    int pos = 2;
+
+    if ((c->argc % 2) == 1) {
+        addReplySds(c,sdsnew("-ERR wrong number of arguments for HMSET\r\n"));
+        return;
+    }
+
+    o = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+    if (o == NULL) return;
+
+    hashTypeTryConversion(o,c->argv,2,c->argc-1);
+    while (pos < c->argc) {
+        hashTypeTryObjectEncoding(o,&c->argv[pos], &c->argv[pos+1]);
+        hashTypeSet(o,c->argv[pos],c->argv[pos+1]);
+        pos += 2;
+    }
+    addReply(c, shared.ok);
+    server.dirty++;
+}
+
+/* HINCRBY key field increment: add 'increment' to the integer stored at
+ * 'field' (a missing field counts as 0) and reply with the new value. */
+void hincrbyCommand(redisClient *c) {
+    long long value = 0, incr;
+    robj *o, *current, *updated;
+
+    if (getLongLongFromObjectOrReply(c,c->argv[3],&incr,NULL) != REDIS_OK) return;
+
+    o = hashTypeLookupWriteOrCreate(c,c->argv[1]);
+    if (o == NULL) return;
+
+    current = hashTypeGet(o,c->argv[2]);
+    if (current != NULL) {
+        int status = getLongLongFromObjectOrReply(c,current,&value,
+            "hash value is not an integer");
+
+        /* hashTypeGet() gave us a reference: drop it in any case. */
+        decrRefCount(current);
+        if (status != REDIS_OK) return;
+    }
+
+    value += incr;
+    updated = createStringObjectFromLongLong(value);
+    hashTypeTryObjectEncoding(o,&c->argv[2],NULL);
+    hashTypeSet(o,c->argv[2],updated);
+    decrRefCount(updated);
+    addReplyLongLong(c,value);
+    server.dirty++;
+}
+
+/* HGET key field: reply with the value of the field, or with a null bulk
+ * when either the key or the field does not exist. */
+void hgetCommand(redisClient *c) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    robj *value;
+
+    if (o == NULL || checkType(c,o,REDIS_HASH)) return;
+
+    value = hashTypeGet(o,c->argv[2]);
+    if (value == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    addReplyBulk(c,value);
+    decrRefCount(value);
+}
+
+/* HMGET key field [field ...]: reply with a multi bulk holding the value
+ * of every requested field, using a null bulk for the fields that do not
+ * exist. A key of the wrong type produces an error and nothing else. */
+void hmgetCommand(redisClient *c) {
+    int i;
+    robj *o, *value;
+    o = lookupKeyRead(c->db,c->argv[1]);
+    if (o != NULL && o->type != REDIS_HASH) {
+        addReply(c,shared.wrongtypeerr);
+        /* Stop here: going on would append a multi bulk reply after the
+         * error and run hashTypeGet() against an object that is not a
+         * hash. */
+        return;
+    }
+
+    /* Note the check for o != NULL happens inside the loop. This is
+     * done because objects that cannot be found are considered to be
+     * an empty hash. The reply should then be a series of NULLs. */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-2));
+    for (i = 2; i < c->argc; i++) {
+        if (o != NULL && (value = hashTypeGet(o,c->argv[i])) != NULL) {
+            addReplyBulk(c,value);
+            decrRefCount(value);
+        } else {
+            addReply(c,shared.nullbulk);
+        }
+    }
+}
+
+/* HDEL key field: reply 1 if the field was removed, 0 otherwise. The key
+ * itself is deleted when its last field goes away. */
+void hdelCommand(redisClient *c) {
+    robj *o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
+
+    if (o == NULL || checkType(c,o,REDIS_HASH)) return;
+
+    if (!hashTypeDelete(o,c->argv[2])) {
+        addReply(c,shared.czero);
+        return;
+    }
+    if (hashTypeLength(o) == 0) dbDelete(c->db,c->argv[1]);
+    addReply(c,shared.cone);
+    server.dirty++;
+}
+
+/* HLEN key: reply with the number of fields of the hash, 0 if the key
+ * does not exist. */
+void hlenCommand(redisClient *c) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+
+    if (o == NULL) return;
+    if (checkType(c,o,REDIS_HASH)) return;
+
+    addReplyUlong(c,hashTypeLength(o));
+}
+
+/* Shared implementation of HKEYS, HVALS and HGETALL: 'flags' selects, via
+ * REDIS_HASH_KEY and REDIS_HASH_VALUE, what is emitted for every entry of
+ * the hash stored at argv[1]. */
+void genericHgetallCommand(redisClient *c, int flags) {
+ robj *o, *lenobj, *obj;
+ unsigned long count = 0;
+ hashTypeIterator *hi;
+
+ if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
+ || checkType(c,o,REDIS_HASH)) return;
+
+ /* The number of items is known only after the iteration, so an object
+ * with a NULL payload is queued now and filled in at the end.
+ * NOTE(review): this presumably relies on addReply() keeping its own
+ * reference, so that the object outlives the decrRefCount() below --
+ * confirm against addReply(). */
+ lenobj = createObject(REDIS_STRING,NULL);
+ addReply(c,lenobj);
+ decrRefCount(lenobj);
+
+ hi = hashTypeInitIterator(o);
+ while (hashTypeNext(hi) != REDIS_ERR) {
+ if (flags & REDIS_HASH_KEY) {
+ obj = hashTypeCurrent(hi,REDIS_HASH_KEY);
+ addReplyBulk(c,obj);
+ decrRefCount(obj);
+ count++;
+ }
+ if (flags & REDIS_HASH_VALUE) {
+ obj = hashTypeCurrent(hi,REDIS_HASH_VALUE);
+ addReplyBulk(c,obj);
+ decrRefCount(obj);
+ count++;
+ }
+ }
+ hashTypeReleaseIterator(hi);
+
+ /* Now that the count is known, set the multi bulk header. */
+ lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",count);
+}
+
+/* HKEYS key: reply with the field names of the hash. */
+void hkeysCommand(redisClient *c) {
+ genericHgetallCommand(c,REDIS_HASH_KEY);
+}
+
+/* HVALS key: reply with the values of the hash. */
+void hvalsCommand(redisClient *c) {
+ genericHgetallCommand(c,REDIS_HASH_VALUE);
+}
+
+/* HGETALL key: reply with every field name followed by its value. */
+void hgetallCommand(redisClient *c) {
+ genericHgetallCommand(c,REDIS_HASH_KEY|REDIS_HASH_VALUE);
+}
+
+/* HEXISTS key field: reply 1 if the field exists in the hash, 0 if the
+ * field or the key is missing. */
+void hexistsCommand(redisClient *c) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+
+    if (o == NULL || checkType(c,o,REDIS_HASH)) return;
+
+    if (hashTypeExists(o,c->argv[2]))
+        addReply(c, shared.cone);
+    else
+        addReply(c, shared.czero);
+}
diff --git a/src/t_list.c b/src/t_list.c
new file mode 100644
index 000000000..ec8b30c3f
--- /dev/null
+++ b/src/t_list.c
@@ -0,0 +1,829 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * List API
+ *----------------------------------------------------------------------------*/
+
+/* Check the argument length to see if it requires us to convert the ziplist
+ * to a real list. Only check raw-encoded objects because integer encoded
+ * objects are never too long. */
+void listTypeTryConversion(robj *subject, robj *value) {
+    int too_long;
+
+    /* Only ziplist-encoded lists can be converted. */
+    if (subject->encoding != REDIS_ENCODING_ZIPLIST) return;
+
+    /* The length is inspected for raw-encoded values only. */
+    too_long = value->encoding == REDIS_ENCODING_RAW &&
+               sdslen(value->ptr) > server.list_max_ziplist_value;
+    if (too_long) listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+}
+
+/* Push 'value' at the head (REDIS_HEAD) or tail of the list 'subject',
+ * converting a ziplist to a linked list first when either the value length
+ * or the entry count limit calls for it. */
+void listTypePush(robj *subject, robj *value, int where) {
+    /* Value-length based conversion. */
+    listTypeTryConversion(subject,value);
+
+    /* Entry-count based conversion. */
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
+        ziplistLen(subject->ptr) >= server.list_max_ziplist_entries)
+    {
+        listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+    }
+
+    if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        if (where == REDIS_HEAD)
+            listAddNodeHead(subject->ptr,value);
+        else
+            listAddNodeTail(subject->ptr,value);
+        /* The list node now references the object as well. */
+        incrRefCount(value);
+    } else if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        int zlwhere = (where == REDIS_HEAD) ? ZIPLIST_HEAD : ZIPLIST_TAIL;
+        robj *decoded = getDecodedObject(value);
+
+        subject->ptr = ziplistPush(subject->ptr,decoded->ptr,
+                                   sdslen(decoded->ptr),zlwhere);
+        decrRefCount(decoded);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Remove the element at the head (REDIS_HEAD) or tail of the list and
+ * return it, or return NULL when no element could be fetched. The returned
+ * object is either freshly created (ziplist) or has had incrRefCount
+ * applied to it (linked list). */
+robj *listTypePop(robj *subject, int where) {
+    robj *popped = NULL;
+
+    if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        list *l = subject->ptr;
+        listNode *node;
+
+        if (where == REDIS_HEAD)
+            node = listFirst(l);
+        else
+            node = listLast(l);
+
+        if (node != NULL) {
+            popped = listNodeValue(node);
+            incrRefCount(popped);
+            listDelNode(l,node);
+        }
+    } else if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *pos, *sval;
+        unsigned int slen;
+        long long lval;
+
+        pos = ziplistIndex(subject->ptr,(where == REDIS_HEAD) ? 0 : -1);
+        if (ziplistGet(pos,&sval,&slen,&lval)) {
+            if (sval)
+                popped = createStringObject((char*)sval,slen);
+            else
+                popped = createStringObjectFromLongLong(lval);
+            /* We only need to delete an element when it exists */
+            subject->ptr = ziplistDelete(subject->ptr,&pos);
+        }
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    return popped;
+}
+
+/* Return the number of elements of the list, whatever its encoding. */
+unsigned long listTypeLength(robj *subject) {
+    if (subject->encoding == REDIS_ENCODING_LINKEDLIST) {
+        return listLength((list*)subject->ptr);
+    } else if (subject->encoding == REDIS_ENCODING_ZIPLIST) {
+        return ziplistLen(subject->ptr);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Initialize an iterator at the specified index. */
+listTypeIterator *listTypeInitIterator(robj *subject, int index, unsigned char direction) {
+    listTypeIterator *iter = zmalloc(sizeof(*iter));
+
+    /* The encoding is recorded at creation time; listTypeNext asserts that
+     * it still matches the subject. */
+    iter->subject = subject;
+    iter->encoding = subject->encoding;
+    iter->direction = direction;
+
+    if (iter->encoding == REDIS_ENCODING_LINKEDLIST) {
+        iter->ln = listIndex(subject->ptr,index);
+    } else if (iter->encoding == REDIS_ENCODING_ZIPLIST) {
+        iter->zi = ziplistIndex(subject->ptr,index);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    return iter;
+}
+
+/* Clean up the iterator. */
+void listTypeReleaseIterator(listTypeIterator *li) {
+ /* Frees only the iterator structure itself (allocated with zmalloc in
+ * listTypeInitIterator); the list it points at is left untouched. */
+ zfree(li);
+}
+
+/* Stores a pointer to the current entry in the provided entry structure
+ * and advances the position of the iterator. Returns 1 when the current
+ * entry is in fact an entry, 0 otherwise. */
+int listTypeNext(listTypeIterator *li, listTypeEntry *entry) {
+    int forward;
+
+    /* Protect from converting when iterating */
+    redisAssert(li->subject->encoding == li->encoding);
+
+    forward = (li->direction == REDIS_TAIL);
+    entry->li = li;
+
+    if (li->encoding == REDIS_ENCODING_ZIPLIST) {
+        entry->zi = li->zi;
+        if (entry->zi == NULL) return 0;
+
+        /* Step the iterator past the entry that was just handed out. */
+        if (forward)
+            li->zi = ziplistNext(li->subject->ptr,li->zi);
+        else
+            li->zi = ziplistPrev(li->subject->ptr,li->zi);
+        return 1;
+    } else if (li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        entry->ln = li->ln;
+        if (entry->ln == NULL) return 0;
+
+        li->ln = forward ? li->ln->next : li->ln->prev;
+        return 1;
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    return 0;
+}
+
+/* Return entry or NULL at the current position of the iterator. */
+robj *listTypeGet(listTypeEntry *entry) {
+    listTypeIterator *iter = entry->li;
+    robj *result = NULL;
+
+    if (iter->encoding == REDIS_ENCODING_LINKEDLIST) {
+        redisAssert(entry->ln != NULL);
+        /* Hand out the stored object itself, with one more reference. */
+        result = listNodeValue(entry->ln);
+        incrRefCount(result);
+    } else if (iter->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *sval;
+        unsigned int slen;
+        long long lval;
+
+        redisAssert(entry->zi != NULL);
+        /* ziplistGet yields either a string (sval set) or an integer; a new
+         * object is built from whichever one it is. */
+        if (ziplistGet(entry->zi,&sval,&slen,&lval)) {
+            if (sval)
+                result = createStringObject((char*)sval,slen);
+            else
+                result = createStringObjectFromLongLong(lval);
+        }
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+    return result;
+}
+
+/* Insert 'value' next to the element referenced by 'entry': after it when
+ * 'where' is REDIS_TAIL, before it otherwise. The list encoding recorded in
+ * the entry's iterator selects the implementation. */
+void listTypeInsert(listTypeEntry *entry, robj *value, int where) {
+    robj *subject = entry->li->subject;
+
+    if (entry->li->encoding == REDIS_ENCODING_ZIPLIST) {
+        /* The ziplist calls below are given the string bytes and length. */
+        value = getDecodedObject(value);
+        if (where == REDIS_TAIL) {
+            unsigned char *next = ziplistNext(subject->ptr,entry->zi);
+
+            /* When we insert after the current element, but the current element
+             * is the tail of the list, we need to do a push. The position is
+             * given with the ziplist constant, as listTypePush does, rather
+             * than with the list-level REDIS_TAIL. */
+            if (next == NULL) {
+                subject->ptr = ziplistPush(subject->ptr,value->ptr,sdslen(value->ptr),ZIPLIST_TAIL);
+            } else {
+                subject->ptr = ziplistInsert(subject->ptr,next,value->ptr,sdslen(value->ptr));
+            }
+        } else {
+            subject->ptr = ziplistInsert(subject->ptr,entry->zi,value->ptr,sdslen(value->ptr));
+        }
+        /* Drop the reference obtained from getDecodedObject. */
+        decrRefCount(value);
+    } else if (entry->li->encoding == REDIS_ENCODING_LINKEDLIST) {
+        if (where == REDIS_TAIL) {
+            listInsertNode(subject->ptr,entry->ln,value,AL_START_TAIL);
+        } else {
+            listInsertNode(subject->ptr,entry->ln,value,AL_START_HEAD);
+        }
+        /* The new node references the object. */
+        incrRefCount(value);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Compare the given object with the entry at the current position. */
+int listTypeEqual(listTypeEntry *entry, robj *o) {
+    listTypeIterator *iter = entry->li;
+
+    if (iter->encoding == REDIS_ENCODING_LINKEDLIST) {
+        return equalStringObjects(o,listNodeValue(entry->ln));
+    } else if (iter->encoding == REDIS_ENCODING_ZIPLIST) {
+        /* The ziplist comparison works on the raw string bytes, so callers
+         * must pass a raw-encoded object. */
+        redisAssert(o->encoding == REDIS_ENCODING_RAW);
+        return ziplistCompare(entry->zi,o->ptr,sdslen(o->ptr));
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Delete the element pointed to. */
+void listTypeDelete(listTypeEntry *entry) {
+    listTypeIterator *iter = entry->li;
+    int forward = (iter->direction == REDIS_TAIL);
+
+    if (iter->encoding == REDIS_ENCODING_LINKEDLIST) {
+        /* Pick the node to resume from before the current one goes away. */
+        listNode *resume = forward ? entry->ln->next : entry->ln->prev;
+
+        listDelNode(iter->subject->ptr,entry->ln);
+        iter->ln = resume;
+    } else if (iter->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *pos = entry->zi;
+
+        iter->subject->ptr = ziplistDelete(iter->subject->ptr,&pos);
+
+        /* Update position of the iterator depending on the direction: the
+         * pointer left in 'pos' is used as is when moving towards the tail,
+         * its predecessor when moving towards the head. */
+        if (forward)
+            iter->zi = pos;
+        else
+            iter->zi = ziplistPrev(iter->subject->ptr,pos);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Convert the list 'subject' to the encoding 'enc'. The only conversion
+ * implemented is towards REDIS_ENCODING_LINKEDLIST; anything else panics. */
+void listTypeConvert(robj *subject, int enc) {
+    listTypeIterator *iter;
+    listTypeEntry cur;
+
+    redisAssert(subject->type == REDIS_LIST);
+
+    if (enc != REDIS_ENCODING_LINKEDLIST) {
+        redisPanic("Unsupported list conversion");
+    } else {
+        list *converted = listCreate();
+
+        listSetFreeMethod(converted,decrRefCount);
+
+        /* listTypeGet returns a robj with incremented refcount, so every
+         * object can be appended to the new list as is. */
+        iter = listTypeInitIterator(subject,0,REDIS_TAIL);
+        while (listTypeNext(iter,&cur)) {
+            listAddNodeTail(converted,listTypeGet(&cur));
+        }
+        listTypeReleaseIterator(iter);
+
+        /* Swap the old payload for the new list. */
+        subject->encoding = REDIS_ENCODING_LINKEDLIST;
+        zfree(subject->ptr);
+        subject->ptr = converted;
+    }
+}
+
+/*-----------------------------------------------------------------------------
+ * List Commands
+ *----------------------------------------------------------------------------*/
+
+/* Shared body of the list push commands: pushes argv[2] onto the list at
+ * key argv[1], at the end selected by 'where'. Replies with the resulting
+ * list length, or with shared.cone when the element was handed directly to
+ * a client waiting on the key. */
+void pushGenericCommand(redisClient *c, int where) {
+    robj *lobj = lookupKeyWrite(c->db,c->argv[1]);
+
+    /* An existing key must hold a list. */
+    if (lobj != NULL && lobj->type != REDIS_LIST) {
+        addReply(c,shared.wrongtypeerr);
+        return;
+    }
+
+    /* A return value of 1 means the element went to a waiting client, so
+     * nothing has to be stored. */
+    if (handleClientsWaitingListPush(c,c->argv[1],c->argv[2])) {
+        addReply(c,shared.cone);
+        return;
+    }
+
+    /* Create the list on first use. */
+    if (lobj == NULL) {
+        lobj = createZiplistObject();
+        dbAdd(c->db,c->argv[1],lobj);
+    }
+
+    listTypePush(lobj,c->argv[2],where);
+    addReplyLongLong(c,listTypeLength(lobj));
+    server.dirty++;
+}
+
+/* Command handler: push at the head of the list (REDIS_HEAD) through
+ * pushGenericCommand. */
+void lpushCommand(redisClient *c) {
+ pushGenericCommand(c,REDIS_HEAD);
+}
+
+/* Command handler: push at the tail of the list (REDIS_TAIL) through
+ * pushGenericCommand. */
+void rpushCommand(redisClient *c) {
+ pushGenericCommand(c,REDIS_TAIL);
+}
+
+/* Shared body of the "push only if the list exists" commands and of
+ * LINSERT.
+ *
+ * With refval == NULL, 'val' is pushed at the end selected by 'where'.
+ * With refval != NULL, the list is scanned from head to tail for the first
+ * element equal to refval and 'val' is inserted after it (REDIS_TAIL) or
+ * before it (REDIS_HEAD).
+ *
+ * Replies with the new list length on success and with shared.cnegone when
+ * refval was not found. Nothing is created when the key is missing. */
+void pushxGenericCommand(redisClient *c, robj *refval, robj *val, int where) {
+    robj *subject;
+    listTypeIterator *iter;
+    listTypeEntry entry;
+    int inserted = 0;
+
+    /* This command modifies the list, so the key is looked up through the
+     * write path like the other list-mutating commands in this file. */
+    if ((subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero)) == NULL ||
+        checkType(c,subject,REDIS_LIST)) return;
+
+    if (refval != NULL) {
+        /* Note: we expect refval to be string-encoded because it is *not* the
+         * last argument of the multi-bulk LINSERT. */
+        redisAssert(refval->encoding == REDIS_ENCODING_RAW);
+
+        /* We're not sure if this value can be inserted yet, but we cannot
+         * convert the list inside the iterator. We don't want to loop over
+         * the list twice (once to see if the value can be inserted and once
+         * to do the actual insert), so we assume this value can be inserted
+         * and convert the ziplist to a regular list if necessary. */
+        listTypeTryConversion(subject,val);
+
+        /* Seek refval from head to tail */
+        iter = listTypeInitIterator(subject,0,REDIS_TAIL);
+        while (listTypeNext(iter,&entry)) {
+            if (listTypeEqual(&entry,refval)) {
+                listTypeInsert(&entry,val,where);
+                inserted = 1;
+                break;
+            }
+        }
+        listTypeReleaseIterator(iter);
+
+        if (!inserted) {
+            /* Notify client of a failed insert */
+            addReply(c,shared.cnegone);
+            return;
+        }
+
+        /* Check if the length exceeds the ziplist length threshold. */
+        if (subject->encoding == REDIS_ENCODING_ZIPLIST &&
+            ziplistLen(subject->ptr) > server.list_max_ziplist_entries)
+            listTypeConvert(subject,REDIS_ENCODING_LINKEDLIST);
+        server.dirty++;
+    } else {
+        listTypePush(subject,val,where);
+        server.dirty++;
+    }
+
+    addReplyUlong(c,listTypeLength(subject));
+}
+
+/* Command handler: head push of argv[2] through pushxGenericCommand, with
+ * no reference value (plain push rather than an insert). */
+void lpushxCommand(redisClient *c) {
+ pushxGenericCommand(c,NULL,c->argv[2],REDIS_HEAD);
+}
+
+/* Command handler: tail push of argv[2] through pushxGenericCommand, with
+ * no reference value (plain push rather than an insert). */
+void rpushxCommand(redisClient *c) {
+ pushxGenericCommand(c,NULL,c->argv[2],REDIS_TAIL);
+}
+
+/* Command handler: argv[2] selects the side ("after" or "before", matched
+ * case-insensitively), argv[3] is the reference element and argv[4] the
+ * value to insert. Any other argv[2] gets shared.syntaxerr. */
+void linsertCommand(redisClient *c) {
+    const char *side = c->argv[2]->ptr;
+    int where;
+
+    if (strcasecmp(side,"after") == 0) {
+        where = REDIS_TAIL;
+    } else if (strcasecmp(side,"before") == 0) {
+        where = REDIS_HEAD;
+    } else {
+        addReply(c,shared.syntaxerr);
+        return;
+    }
+    pushxGenericCommand(c,c->argv[3],c->argv[4],where);
+}
+
+/* Command handler: reply with the length of the list stored at argv[1]. */
+void llenCommand(redisClient *c) {
+    robj *lobj = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+
+    if (lobj == NULL) return;
+    if (checkType(c,lobj,REDIS_LIST)) return;
+    addReplyUlong(c,listTypeLength(lobj));
+}
+
+/* Command handler: reply with the element at index argv[2] (parsed with
+ * atoi) of the list at argv[1], or with shared.nullbulk when there is no
+ * element at that index. */
+void lindexCommand(redisClient *c) {
+    robj *lobj = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    robj *item = NULL;
+    int index;
+
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+    index = atoi(c->argv[2]->ptr);
+
+    if (lobj->encoding == REDIS_ENCODING_LINKEDLIST) {
+        listNode *node = listIndex(lobj->ptr,index);
+
+        if (node == NULL) {
+            addReply(c,shared.nullbulk);
+        } else {
+            item = listNodeValue(node);
+            addReplyBulk(c,item);
+        }
+    } else if (lobj->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *pos, *sval;
+        unsigned int slen;
+        long long lval;
+
+        pos = ziplistIndex(lobj->ptr,index);
+        if (!ziplistGet(pos,&sval,&slen,&lval)) {
+            addReply(c,shared.nullbulk);
+        } else {
+            /* A temporary object is built for the reply and released
+             * right after it has been queued. */
+            if (sval)
+                item = createStringObject((char*)sval,slen);
+            else
+                item = createStringObjectFromLongLong(lval);
+            addReplyBulk(c,item);
+            decrRefCount(item);
+        }
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Command handler: replace the element at index argv[2] (parsed with atoi)
+ * of the list at argv[1] with argv[3]. Replies shared.ok on success and
+ * shared.outofrangeerr when there is no element at that index. */
+void lsetCommand(redisClient *c) {
+    robj *lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nokeyerr);
+    robj *value;
+    int index;
+
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+    index = atoi(c->argv[2]->ptr);
+    value = c->argv[3];
+
+    /* Done before the index is validated, so the list may be converted even
+     * when the command then fails with an out of range error. */
+    listTypeTryConversion(lobj,value);
+
+    if (lobj->encoding == REDIS_ENCODING_LINKEDLIST) {
+        listNode *node = listIndex(lobj->ptr,index);
+
+        if (node == NULL) {
+            addReply(c,shared.outofrangeerr);
+            return;
+        }
+        /* Release the old object and store the new one in the same node. */
+        decrRefCount((robj*)listNodeValue(node));
+        listNodeValue(node) = value;
+        incrRefCount(value);
+        addReply(c,shared.ok);
+        server.dirty++;
+    } else if (lobj->encoding == REDIS_ENCODING_ZIPLIST) {
+        unsigned char *pos = ziplistIndex(lobj->ptr,index);
+
+        if (pos == NULL) {
+            addReply(c,shared.outofrangeerr);
+            return;
+        }
+        /* Delete the old entry, then insert the new value at the position
+         * ziplistDelete left in 'pos'. */
+        lobj->ptr = ziplistDelete(lobj->ptr,&pos);
+        value = getDecodedObject(value);
+        lobj->ptr = ziplistInsert(lobj->ptr,pos,value->ptr,sdslen(value->ptr));
+        decrRefCount(value);
+        addReply(c,shared.ok);
+        server.dirty++;
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+}
+
+/* Shared body of the pop commands: pops one element from the end selected
+ * by 'where' of the list at argv[1] and sends it as a bulk reply
+ * (shared.nullbulk when nothing was popped). The key is removed once the
+ * list is empty. */
+void popGenericCommand(redisClient *c, int where) {
+    robj *lobj, *popped;
+
+    lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+
+    popped = listTypePop(lobj,where);
+    if (popped == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    addReplyBulk(c,popped);
+    /* listTypePop hands the element out with a reference owned by us. */
+    decrRefCount(popped);
+    if (listTypeLength(lobj) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+}
+
+/* Command handler: pop from the head of the list (REDIS_HEAD) through
+ * popGenericCommand. */
+void lpopCommand(redisClient *c) {
+ popGenericCommand(c,REDIS_HEAD);
+}
+
+/* Command handler: pop from the tail of the list (REDIS_TAIL) through
+ * popGenericCommand. */
+void rpopCommand(redisClient *c) {
+ popGenericCommand(c,REDIS_TAIL);
+}
+
+/* Command handler: reply with the elements of the list at argv[1] between
+ * the indexes argv[2] and argv[3] (both inclusive, parsed with atoi).
+ * Negative indexes count from the end of the list. An out of range start,
+ * or start > end, yields shared.emptymultibulk. */
+void lrangeCommand(redisClient *c) {
+    robj *o, *value;
+    int start = atoi(c->argv[2]->ptr);
+    int end = atoi(c->argv[3]->ptr);
+    int llen;
+    int rangelen, j;
+    listTypeEntry entry;
+    listTypeIterator *li;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL
+        || checkType(c,o,REDIS_LIST)) return;
+    llen = listTypeLength(o);
+
+    /* convert negative indexes */
+    if (start < 0) start = llen+start;
+    if (end < 0) end = llen+end;
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || start >= llen) {
+        /* Out of range start or start > end result in empty list */
+        addReply(c,shared.emptymultibulk);
+        return;
+    }
+    if (end >= llen) end = llen-1;
+    rangelen = (end-start)+1;
+
+    /* Return the result in form of a multi-bulk reply */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",rangelen));
+    li = listTypeInitIterator(o,start,REDIS_TAIL);
+    for (j = 0; j < rangelen; j++) {
+        /* Advance the iterator outside of the assertion, so that the
+         * iteration never depends on the assertion macro evaluating its
+         * argument. The range was clamped to the list length above, hence
+         * an entry is always expected here. */
+        int has_entry = listTypeNext(li,&entry);
+
+        redisAssert(has_entry);
+        value = listTypeGet(&entry);
+        addReplyBulk(c,value);
+        decrRefCount(value);
+    }
+    listTypeReleaseIterator(li);
+}
+
+/* Command handler: trim the list at argv[1] so that only the elements
+ * between the indexes argv[2] and argv[3] (inclusive, parsed with atoi,
+ * negative values counting from the end) are left. Always replies shared.ok
+ * once the key was found to hold a list. */
+void ltrimCommand(redisClient *c) {
+    robj *lobj;
+    int start = atoi(c->argv[2]->ptr);
+    int end = atoi(c->argv[3]->ptr);
+    int llen, ltrim, rtrim;
+
+    lobj = lookupKeyWriteOrReply(c,c->argv[1],shared.ok);
+    if (lobj == NULL || checkType(c,lobj,REDIS_LIST)) return;
+    llen = listTypeLength(lobj);
+
+    /* Turn negative indexes into offsets from the head, clamping at 0. */
+    if (start < 0) {
+        start = llen+start;
+        if (start < 0) start = 0;
+    }
+    if (end < 0) {
+        end = llen+end;
+        if (end < 0) end = 0;
+    }
+
+    /* Work out how many elements go away on each side. */
+    if (start > end || start >= llen) {
+        /* Out of range start or start > end result in empty list */
+        ltrim = llen;
+        rtrim = 0;
+    } else {
+        if (end >= llen) end = llen-1;
+        ltrim = start;
+        rtrim = llen-end-1;
+    }
+
+    /* Remove list elements to perform the trim */
+    if (lobj->encoding == REDIS_ENCODING_LINKEDLIST) {
+        list *l = lobj->ptr;
+        int n;
+
+        for (n = ltrim; n > 0; n--) listDelNode(l,listFirst(l));
+        for (n = rtrim; n > 0; n--) listDelNode(l,listLast(l));
+    } else if (lobj->encoding == REDIS_ENCODING_ZIPLIST) {
+        lobj->ptr = ziplistDeleteRange(lobj->ptr,0,ltrim);
+        lobj->ptr = ziplistDeleteRange(lobj->ptr,-rtrim,rtrim);
+    } else {
+        redisPanic("Unknown list encoding");
+    }
+
+    if (listTypeLength(lobj) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+    addReply(c,shared.ok);
+}
+
+/* Command handler: remove elements equal to argv[3] from the list at
+ * argv[1]. argv[2] (parsed with atoi) is the count: a negative count scans
+ * from the tail towards the head, zero removes every match. Replies with
+ * the number of removed elements. */
+void lremCommand(redisClient *c) {
+    robj *subject, *target = c->argv[3];
+    int limit = atoi(c->argv[2]->ptr);
+    int removed = 0;
+    listTypeEntry cur;
+    listTypeIterator *iter;
+
+    subject = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
+    if (subject == NULL || checkType(c,subject,REDIS_LIST)) return;
+
+    /* Make sure the target is raw when we're dealing with a ziplist */
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST)
+        target = getDecodedObject(target);
+
+    if (limit >= 0) {
+        iter = listTypeInitIterator(subject,0,REDIS_TAIL);
+    } else {
+        limit = -limit;
+        iter = listTypeInitIterator(subject,-1,REDIS_HEAD);
+    }
+
+    while (listTypeNext(iter,&cur)) {
+        if (!listTypeEqual(&cur,target)) continue;
+
+        listTypeDelete(&cur);
+        server.dirty++;
+        removed++;
+        /* A limit of zero never stops the scan. */
+        if (limit && removed == limit) break;
+    }
+    listTypeReleaseIterator(iter);
+
+    /* Clean up raw encoded object */
+    if (subject->encoding == REDIS_ENCODING_ZIPLIST)
+        decrRefCount(target);
+
+    if (listTypeLength(subject) == 0) dbDelete(c->db,c->argv[1]);
+    addReplySds(c,sdscatprintf(sdsempty(),":%d\r\n",removed));
+}
+
+/* This is the semantic of this command:
+ * RPOPLPUSH srclist dstlist:
+ * IF LLEN(srclist) > 0
+ * element = RPOP srclist
+ * LPUSH dstlist element
+ * RETURN element
+ * ELSE
+ * RETURN nil
+ * END
+ * END
+ *
+ * The idea is to be able to get an element from a list in a reliable way
+ * since the element is not just returned but pushed against another list
+ * as well. This command was originally proposed by Ezra Zygmuntowicz.
+ */
+void rpoplpushcommand(redisClient *c) {
+    robj *src, *dst, *moved;
+
+    src = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
+    if (src == NULL || checkType(c,src,REDIS_LIST)) return;
+
+    /* Nothing to move out of an empty source list. */
+    if (listTypeLength(src) == 0) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    /* The destination is validated before anything is popped. */
+    dst = lookupKeyWrite(c->db,c->argv[2]);
+    if (dst && checkType(c,dst,REDIS_LIST)) return;
+    moved = listTypePop(src,REDIS_TAIL);
+
+    /* Add the element to the target list, unless it's directly passed to
+     * some client waiting on the destination key. */
+    if (!handleClientsWaitingListPush(c,c->argv[2],moved)) {
+        /* Create the list if the key does not exist */
+        if (!dst) {
+            dst = createZiplistObject();
+            dbAdd(c->db,c->argv[2],dst);
+        }
+        listTypePush(dst,moved,REDIS_HEAD);
+    }
+
+    /* Send the element to the client as reply as well */
+    addReplyBulk(c,moved);
+
+    /* listTypePop returns an object with its refcount incremented */
+    decrRefCount(moved);
+
+    /* Delete the source list when it is empty */
+    if (listTypeLength(src) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+}
+
+/*-----------------------------------------------------------------------------
+ * Blocking POP operations
+ *----------------------------------------------------------------------------*/
+
+/* Currently Redis blocking operations support is limited to list POP ops,
+ * so the current implementation is not fully generic, but it is also not
+ * completely specific so it will not require a rewrite to support new
+ * kind of blocking operations in the future.
+ *
+ * Still it's important to note that list blocking operations can be already
+ * used as a notification mechanism in order to implement other blocking
+ * operations at application level, so there must be a very strong evidence
+ * of usefulness and generality before new blocking operations are implemented.
+ *
+ * This is how the current blocking POP works, using BLPOP as an example:
+ * - If the user calls BLPOP and the key exists and contains a non-empty list
+ * then LPOP is called instead. So BLPOP is semantically the same as LPOP
+ * if there is no need to block.
+ * - If instead BLPOP is called and the key does not exist or the list is
+ * empty we need to block. In order to do so we remove the notification for
+ * new data to read in the client socket (so that we'll not serve new
+ * requests if the blocking request is not served). Also we put the client
+ * in a dictionary (db->blocking_keys) mapping keys to a list of clients
+ * blocking for these keys.
+ * - If a PUSH operation against a key with blocked clients waiting is
+ * performed, we serve the first in the list: basically instead of pushing
+ * the new element inside the list we return it to the (first / oldest)
+ * blocking client, unblock the client, and remove it from the list.
+ *
+ * The above comment and the source code should be enough in order to understand
+ * the implementation and modify / fix it later.
+ */
+
+/* Set a client in blocking mode for the specified key, with the specified
+ * timeout */
+void blockForKeys(redisClient *c, robj **keys, int numkeys, time_t timeout) {
+    int i;
+
+    c->blocking_keys = zmalloc(sizeof(robj*)*numkeys);
+    c->blocking_keys_num = numkeys;
+    c->blockingto = timeout;
+
+    for (i = 0; i < numkeys; i++) {
+        robj *key = keys[i];
+        dictEntry *de;
+        list *waiters;
+
+        /* Client side of the mapping: client -> keys. */
+        c->blocking_keys[i] = key;
+        incrRefCount(key);
+
+        /* Database side of the mapping: key -> list of waiting clients. */
+        de = dictFind(c->db->blocking_keys,key);
+        if (de != NULL) {
+            waiters = dictGetEntryVal(de);
+        } else {
+            int retval;
+
+            /* First client waiting for this key: create its list. The
+             * dictionary takes its own reference to the key. */
+            waiters = listCreate();
+            retval = dictAdd(c->db->blocking_keys,key,waiters);
+            incrRefCount(key);
+            redisAssert(retval == DICT_OK);
+        }
+        listAddNodeTail(waiters,c);
+    }
+
+    /* Mark the client as a blocked client */
+    c->flags |= REDIS_BLOCKED;
+    server.blpop_blocked_clients++;
+}
+
+/* Unblock a client that's waiting in a blocking operation such as BLPOP */
+void unblockClientWaitingData(redisClient *c) {
+ dictEntry *de;
+ list *l;
+ int j;
+
+ /* Only a client that went through blockForKeys has this array set. */
+ redisAssert(c->blocking_keys != NULL);
+ /* The client may wait for multiple keys, so unblock it for every key. */
+ for (j = 0; j < c->blocking_keys_num; j++) {
+ /* Remove this client from the list of clients waiting for this key. */
+ de = dictFind(c->db->blocking_keys,c->blocking_keys[j]);
+ redisAssert(de != NULL);
+ l = dictGetEntryVal(de);
+ listDelNode(l,listSearchKey(l,c));
+ /* If the list is empty we need to remove it to avoid wasting memory */
+ if (listLength(l) == 0)
+ dictDelete(c->db->blocking_keys,c->blocking_keys[j]);
+ /* Drop the reference blockForKeys took for the client-side array. */
+ decrRefCount(c->blocking_keys[j]);
+ }
+ /* Cleanup the client structure */
+ zfree(c->blocking_keys);
+ c->blocking_keys = NULL;
+ c->flags &= (~REDIS_BLOCKED);
+ server.blpop_blocked_clients--;
+ /* We want to process data if there is some command waiting
+ * in the input buffer. Note that this is safe even if
+ * unblockClientWaitingData() gets called from freeClient() because
+ * freeClient() will be smart enough to call this function
+ * *after* c->querybuf was set to NULL. */
+ if (c->querybuf && sdslen(c->querybuf) > 0) processInputBuffer(c);
+}
+
+/* This should be called from any function PUSHing into lists.
+ * 'c' is the "pushing client", 'key' is the key it is pushing data against,
+ * 'ele' is the element pushed.
+ *
+ * If the function returns 0 there was no client waiting for a list push
+ * against this key.
+ *
+ * If the function returns 1 there was a client waiting for a list push
+ * against this key, the element was passed to this client thus it's not
+ * needed to actually add it to the list and the caller should return asap. */
+int handleClientsWaitingListPush(redisClient *c, robj *key, robj *ele) {
+    struct dictEntry *de = dictFind(c->db->blocking_keys,key);
+    list *waiters;
+    listNode *first;
+    redisClient *receiver;
+
+    /* Nobody is waiting for this key. */
+    if (de == NULL) return 0;
+
+    /* Serve the client at the front of the waiting list. */
+    waiters = dictGetEntryVal(de);
+    first = listFirst(waiters);
+    redisAssert(first != NULL);
+    receiver = first->value;
+
+    /* The receiver gets a two element multi-bulk: the key and the element. */
+    addReplySds(receiver,sdsnew("*2\r\n"));
+    addReplyBulk(receiver,key);
+    addReplyBulk(receiver,ele);
+    unblockClientWaitingData(receiver);
+    return 1;
+}
+
+/* Blocking RPOP/LPOP */
+void blockingPopGenericCommand(redisClient *c, int where) {
+    robj *o;
+    robj *argv[2], **orig_argv;
+    int orig_argc;
+    time_t timeout;
+    int j;
+
+    /* argv[1] .. argv[argc-2] are the keys, the last argument is the
+     * timeout. The first key holding a non-empty list is served at once. */
+    for (j = 1; j < c->argc-1; j++) {
+        o = lookupKeyWrite(c->db,c->argv[j]);
+        if (o == NULL) continue;
+
+        if (o->type != REDIS_LIST) {
+            addReply(c,shared.wrongtypeerr);
+            return;
+        }
+        if (listTypeLength(o) == 0) continue;
+
+        /* The list contains elements: fall back to the usual non-blocking
+         * POP operation. We need to alter the command arguments before
+         * calling popGenericCommand() as the command takes a single key.
+         * The command name slot is filled in too, so the temporary vector
+         * never exposes an uninitialized pointer through c->argv. */
+        orig_argv = c->argv;
+        orig_argc = c->argc;
+        argv[0] = c->argv[0];
+        argv[1] = c->argv[j];
+        c->argv = argv;
+        c->argc = 2;
+
+        /* Also the return value is different, we need to output the multi
+         * bulk reply header and the key name. The "real" command will add
+         * the last element (the value) for us. If this sounds like a hack
+         * to you it's just because it is... */
+        addReplySds(c,sdsnew("*2\r\n"));
+        addReplyBulk(c,argv[1]);
+        popGenericCommand(c,where);
+
+        /* Fix the client structure with the original stuff */
+        c->argv = orig_argv;
+        c->argc = orig_argc;
+        return;
+    }
+
+    /* If the list is empty or the key does not exist we must block. A
+     * positive timeout is relative and is turned into an absolute time;
+     * other values are passed to blockForKeys as they are. */
+    timeout = strtol(c->argv[c->argc-1]->ptr,NULL,10);
+    if (timeout > 0) timeout += time(NULL);
+    blockForKeys(c,c->argv+1,c->argc-2,timeout);
+}
+
+/* Command handler: blocking pop from the head of the list (REDIS_HEAD)
+ * through blockingPopGenericCommand. */
+void blpopCommand(redisClient *c) {
+ blockingPopGenericCommand(c,REDIS_HEAD);
+}
+
+/* Command handler: blocking pop from the tail of the list (REDIS_TAIL)
+ * through blockingPopGenericCommand. */
+void brpopCommand(redisClient *c) {
+ blockingPopGenericCommand(c,REDIS_TAIL);
+}
diff --git a/src/t_set.c b/src/t_set.c
new file mode 100644
index 000000000..808ef268e
--- /dev/null
+++ b/src/t_set.c
@@ -0,0 +1,349 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * Set Commands
+ *----------------------------------------------------------------------------*/
+
+/* Command handler: add argv[2] to the set at key argv[1], creating the set
+ * when the key does not exist. Replies shared.cone when dictAdd stored the
+ * member and shared.czero when it did not. */
+void saddCommand(redisClient *c) {
+    robj *set = lookupKeyWrite(c->db,c->argv[1]);
+
+    if (set == NULL) {
+        set = createSetObject();
+        dbAdd(c->db,c->argv[1],set);
+    } else if (set->type != REDIS_SET) {
+        addReply(c,shared.wrongtypeerr);
+        return;
+    }
+
+    if (dictAdd(set->ptr,c->argv[2],NULL) != DICT_OK) {
+        addReply(c,shared.czero);
+        return;
+    }
+
+    /* The set now references the member object. */
+    incrRefCount(c->argv[2]);
+    server.dirty++;
+    addReply(c,shared.cone);
+}
+
+/* Command handler: remove argv[2] from the set at key argv[1]. Replies
+ * shared.cone when the member was deleted, shared.czero otherwise. The key
+ * is removed once the set is empty. */
+void sremCommand(redisClient *c) {
+    robj *set = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
+
+    if (set == NULL || checkType(c,set,REDIS_SET)) return;
+
+    if (dictDelete(set->ptr,c->argv[2]) != DICT_OK) {
+        addReply(c,shared.czero);
+        return;
+    }
+
+    server.dirty++;
+    if (htNeedsResize(set->ptr)) dictResize(set->ptr);
+    if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
+    addReply(c,shared.cone);
+}
+
+/* Command handler: move the member argv[3] from the set at argv[1] to the
+ * set at argv[2], creating the destination when needed. Replies shared.cone
+ * when the member was found in the source, shared.czero otherwise. */
+void smoveCommand(redisClient *c) {
+    robj *src = lookupKeyWrite(c->db,c->argv[1]);
+    robj *dst = lookupKeyWrite(c->db,c->argv[2]);
+
+    /* A missing source yields 0, a source of the wrong type an error. */
+    if (src == NULL) {
+        addReply(c,shared.czero);
+        return;
+    }
+    if (src->type != REDIS_SET) {
+        addReply(c,shared.wrongtypeerr);
+        return;
+    }
+
+    /* Error if the destination key is not a set as well */
+    if (dst && dst->type != REDIS_SET) {
+        addReply(c,shared.wrongtypeerr);
+        return;
+    }
+
+    /* Remove the element from the source set */
+    if (dictDelete(src->ptr,c->argv[3]) == DICT_ERR) {
+        /* Key not found in the src set! return zero */
+        addReply(c,shared.czero);
+        return;
+    }
+
+    /* An emptied source goes away, unless it is also the destination and
+     * is about to get the member back. */
+    if (dictSize((dict*)src->ptr) == 0 && src != dst)
+        dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+
+    /* Add the element to the destination set */
+    if (!dst) {
+        dst = createSetObject();
+        dbAdd(c->db,c->argv[2],dst);
+    }
+    if (dictAdd(dst->ptr,c->argv[3],NULL) == DICT_OK)
+        incrRefCount(c->argv[3]);
+    addReply(c,shared.cone);
+}
+
+/* Command handler: reply shared.cone when argv[2] is found in the set at
+ * key argv[1], shared.czero otherwise. */
+void sismemberCommand(redisClient *c) {
+    robj *set = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+
+    if (set == NULL) return;
+    if (checkType(c,set,REDIS_SET)) return;
+
+    addReply(c, dictFind(set->ptr,c->argv[2]) ? shared.cone : shared.czero);
+}
+
+/* Command handler: reply with the number of members (dictSize) of the set
+ * at key argv[1]. */
+void scardCommand(redisClient *c) {
+    robj *set = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+    dict *members;
+
+    if (set == NULL || checkType(c,set,REDIS_SET)) return;
+
+    members = set->ptr;
+    addReplyUlong(c,dictSize(members));
+}
+
+/* Command handler: remove the member picked by dictGetRandomKey from the
+ * set at key argv[1] and reply with it (shared.nullbulk when no entry was
+ * returned). The key is removed once the set is empty. */
+void spopCommand(redisClient *c) {
+    robj *set, *member;
+    dictEntry *picked;
+
+    set = lookupKeyWriteOrReply(c,c->argv[1],shared.nullbulk);
+    if (set == NULL || checkType(c,set,REDIS_SET)) return;
+
+    picked = dictGetRandomKey(set->ptr);
+    if (picked == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+
+    /* The reply is queued before the member is deleted from the set. */
+    member = dictGetEntryKey(picked);
+    addReplyBulk(c,member);
+    dictDelete(set->ptr,member);
+    if (htNeedsResize(set->ptr)) dictResize(set->ptr);
+    if (dictSize((dict*)set->ptr) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty++;
+}
+
+/* Command handler: reply with the member picked by dictGetRandomKey from
+ * the set at key argv[1], leaving the set unchanged. Replies
+ * shared.nullbulk when no entry was returned. */
+void srandmemberCommand(redisClient *c) {
+    robj *set = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    dictEntry *picked;
+
+    if (set == NULL || checkType(c,set,REDIS_SET)) return;
+
+    picked = dictGetRandomKey(set->ptr);
+    if (picked == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    addReplyBulk(c,dictGetEntryKey(picked));
+}
+
+/* qsort() comparator ordering an array of dict pointers by ascending
+ * dictSize. Returns a negative value, zero or a positive value when the
+ * first set is respectively smaller than, as large as, or larger than the
+ * second one.
+ *
+ * The sizes are compared explicitly instead of being subtracted: narrowing
+ * the difference of two sizes to int can give a result with the wrong sign
+ * when the sizes are far apart. */
+int qsortCompareSetsByCardinality(const void *s1, const void *s2) {
+    dict **d1 = (void*) s1, **d2 = (void*) s2;
+    unsigned long n1 = dictSize(*d1);
+    unsigned long n2 = dictSize(*d2);
+
+    return (n1 > n2) - (n1 < n2);
+}
+
+/* Shared body of the set intersection commands. 'setskeys' holds the
+ * 'setsnum' keys to intersect. With dstkey == NULL the members of the
+ * intersection are sent to the client as a multi-bulk reply; otherwise the
+ * result is stored at 'dstkey' and its cardinality is the reply. */
+void sinterGenericCommand(redisClient *c, robj **setskeys, unsigned long setsnum, robj *dstkey) {
+ dict **dv = zmalloc(sizeof(dict*)*setsnum);
+ dictIterator *di;
+ dictEntry *de;
+ robj *lenobj = NULL, *dstset = NULL;
+ unsigned long j, cardinality = 0;
+
+ /* Collect the dict of every input set in dv[], bailing out early on a
+ * missing key or on a key that is not a set. */
+ for (j = 0; j < setsnum; j++) {
+ robj *setobj;
+
+ setobj = dstkey ?
+ lookupKeyWrite(c->db,setskeys[j]) :
+ lookupKeyRead(c->db,setskeys[j]);
+ if (!setobj) {
+ /* A missing key makes the whole intersection empty: in store mode
+ * the destination key is deleted as well. */
+ zfree(dv);
+ if (dstkey) {
+ if (dbDelete(c->db,dstkey))
+ server.dirty++;
+ addReply(c,shared.czero);
+ } else {
+ addReply(c,shared.emptymultibulk);
+ }
+ return;
+ }
+ if (setobj->type != REDIS_SET) {
+ zfree(dv);
+ addReply(c,shared.wrongtypeerr);
+ return;
+ }
+ dv[j] = setobj->ptr;
+ }
+ /* Sort sets from the smallest to largest, this will improve our
+ * algorithm's performance */
+ qsort(dv,setsnum,sizeof(dict*),qsortCompareSetsByCardinality);
+
+ /* The first thing we should output is the total number of elements...
+ * since this is a multi-bulk write, but at this stage we don't know
+ * the intersection set size, so we use a trick, append an empty object
+ * to the output list and save the pointer to later modify it with the
+ * right length */
+ if (!dstkey) {
+ lenobj = createObject(REDIS_STRING,NULL);
+ addReply(c,lenobj);
+ /* NOTE(review): lenobj is written to again at the end of the function,
+ * so addReply presumably keeps its own reference - confirm. */
+ decrRefCount(lenobj);
+ } else {
+ /* If we have a target key where to store the resulting set
+ * create this key with an empty set inside */
+ dstset = createSetObject();
+ }
+
+ /* Iterate all the elements of the first (smallest) set, and test
+ * the element against all the other sets, if at least one set does
+ * not include the element it is discarded */
+ di = dictGetIterator(dv[0]);
+
+ while((de = dictNext(di)) != NULL) {
+ robj *ele;
+
+ for (j = 1; j < setsnum; j++)
+ if (dictFind(dv[j],dictGetEntryKey(de)) == NULL) break;
+ if (j != setsnum)
+ continue; /* at least one set does not contain the member */
+ ele = dictGetEntryKey(de);
+ if (!dstkey) {
+ addReplyBulk(c,ele);
+ cardinality++;
+ } else {
+ /* The destination set shares the member object. */
+ dictAdd(dstset->ptr,ele,NULL);
+ incrRefCount(ele);
+ }
+ }
+ dictReleaseIterator(di);
+
+ if (dstkey) {
+ /* Store the resulting set into the target, if the intersection
+ * is not an empty set. */
+ dbDelete(c->db,dstkey);
+ if (dictSize((dict*)dstset->ptr) > 0) {
+ dbAdd(c->db,dstkey,dstset);
+ addReplyLongLong(c,dictSize((dict*)dstset->ptr));
+ } else {
+ decrRefCount(dstset);
+ addReply(c,shared.czero);
+ }
+ server.dirty++;
+ } else {
+ /* Fill the placeholder queued earlier with the multi-bulk header. */
+ lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",cardinality);
+ }
+ zfree(dv);
+}
+
+/* Command handler: intersect the sets named by every argument after the
+ * command name and send the result to the client (no destination key). */
+void sinterCommand(redisClient *c) {
+ sinterGenericCommand(c,c->argv+1,c->argc-1,NULL);
+}
+
+/* Command handler: intersect the sets named by argv[2] onwards and store
+ * the result at the destination key argv[1]. */
+void sinterstoreCommand(redisClient *c) {
+ sinterGenericCommand(c,c->argv+2,c->argc-2,c->argv[1]);
+}
+
+/* Shared body of the set union and difference commands. 'setskeys' holds
+ * the 'setsnum' input keys and 'op' is REDIS_OP_UNION or REDIS_OP_DIFF.
+ * With dstkey == NULL the resulting members are sent to the client;
+ * otherwise the result is stored at 'dstkey' and its cardinality is the
+ * reply. Missing input keys are treated as empty sets. */
+void sunionDiffGenericCommand(redisClient *c, robj **setskeys, int setsnum, robj *dstkey, int op) {
+ dict **dv = zmalloc(sizeof(dict*)*setsnum);
+ dictIterator *di;
+ dictEntry *de;
+ robj *dstset = NULL;
+ int j, cardinality = 0;
+
+ /* Collect the dict of every input set in dv[]; a missing key is recorded
+ * as NULL, a key of the wrong type aborts the command. */
+ for (j = 0; j < setsnum; j++) {
+ robj *setobj;
+
+ setobj = dstkey ?
+ lookupKeyWrite(c->db,setskeys[j]) :
+ lookupKeyRead(c->db,setskeys[j]);
+ if (!setobj) {
+ dv[j] = NULL;
+ continue;
+ }
+ if (setobj->type != REDIS_SET) {
+ zfree(dv);
+ addReply(c,shared.wrongtypeerr);
+ return;
+ }
+ dv[j] = setobj->ptr;
+ }
+
+ /* We need a temp set object to store our union. If the dstkey
+ * is not NULL (that is, we are inside an SUNIONSTORE operation) then
+ * this set object will be the resulting object to set into the target key*/
+ dstset = createSetObject();
+
+ /* Iterate all the elements of all the sets, add every element a single
+ * time to the result set */
+ for (j = 0; j < setsnum; j++) {
+ if (op == REDIS_OP_DIFF && j == 0 && !dv[j]) break; /* result set is empty */
+ if (!dv[j]) continue; /* non existing keys are like empty sets */
+
+ di = dictGetIterator(dv[j]);
+
+ while((de = dictNext(di)) != NULL) {
+ robj *ele;
+
+ /* dictAdd will not add the same element multiple times */
+ ele = dictGetEntryKey(de);
+ /* For a difference the first set seeds the result and the members
+ * of every following set are removed from it. */
+ if (op == REDIS_OP_UNION || j == 0) {
+ if (dictAdd(dstset->ptr,ele,NULL) == DICT_OK) {
+ incrRefCount(ele);
+ cardinality++;
+ }
+ } else if (op == REDIS_OP_DIFF) {
+ if (dictDelete(dstset->ptr,ele) == DICT_OK) {
+ cardinality--;
+ }
+ }
+ }
+ dictReleaseIterator(di);
+
+ /* result set is empty? Exit asap. */
+ if (op == REDIS_OP_DIFF && cardinality == 0) break;
+ }
+
+ /* Output the content of the resulting set, if not in STORE mode */
+ if (!dstkey) {
+ addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",cardinality));
+ di = dictGetIterator(dstset->ptr);
+ while((de = dictNext(di)) != NULL) {
+ robj *ele;
+
+ ele = dictGetEntryKey(de);
+ addReplyBulk(c,ele);
+ }
+ dictReleaseIterator(di);
+ /* The temporary set is not needed once the reply has been queued. */
+ decrRefCount(dstset);
+ } else {
+ /* If we have a target key where to store the resulting set
+ * create this key with the result set inside */
+ dbDelete(c->db,dstkey);
+ if (dictSize((dict*)dstset->ptr) > 0) {
+ dbAdd(c->db,dstkey,dstset);
+ addReplyLongLong(c,dictSize((dict*)dstset->ptr));
+ } else {
+ decrRefCount(dstset);
+ addReply(c,shared.czero);
+ }
+ server.dirty++;
+ }
+ zfree(dv);
+}
+
+/* Command handler: union of the sets named by every argument after the
+ * command name, sent to the client (no destination key). */
+void sunionCommand(redisClient *c) {
+ sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_UNION);
+}
+
+/* Command handler: union of the sets named by argv[2] onwards, stored at
+ * the destination key argv[1]. */
+void sunionstoreCommand(redisClient *c) {
+ sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_UNION);
+}
+
+/* Command handler: difference between the first named set and all the
+ * following ones, sent to the client (no destination key). */
+void sdiffCommand(redisClient *c) {
+ sunionDiffGenericCommand(c,c->argv+1,c->argc-1,NULL,REDIS_OP_DIFF);
+}
+
+/* Command handler: difference between the set at argv[2] and all the
+ * following ones, stored at the destination key argv[1]. */
+void sdiffstoreCommand(redisClient *c) {
+ sunionDiffGenericCommand(c,c->argv+2,c->argc-2,c->argv[1],REDIS_OP_DIFF);
+}
diff --git a/src/t_string.c b/src/t_string.c
new file mode 100644
index 000000000..eaaec05be
--- /dev/null
+++ b/src/t_string.c
@@ -0,0 +1,251 @@
+#include "redis.h"
+
+/*-----------------------------------------------------------------------------
+ * String Commands
+ *----------------------------------------------------------------------------*/
+
+/* Common implementation behind setCommand, setnxCommand and setexCommand.
+ *
+ * nx     - when non-zero the key is written only if it does not exist yet;
+ *          the reply is then shared.cone on success, shared.czero otherwise.
+ * key    - key to write.
+ * val    - value to store (its refcount is incremented when stored).
+ * expire - optional object holding a time to live in seconds, or NULL. */
+void setGenericCommand(redisClient *c, int nx, robj *key, robj *val, robj *expire) {
+    long ttl = 0; /* only meaningful when 'expire' is given */
+
+    if (expire != NULL) {
+        if (getLongFromObjectOrReply(c, expire, &ttl, NULL) != REDIS_OK)
+            return;
+        if (ttl <= 0) {
+            addReplySds(c,sdsnew("-ERR invalid expire time in SETEX\r\n"));
+            return;
+        }
+    }
+
+    touchWatchedKey(c->db,key);
+    if (nx) deleteIfVolatile(c->db,key);
+
+    if (dbAdd(c->db,key,val) == REDIS_ERR) {
+        /* The key is already there: NX refuses, plain SET overwrites. */
+        if (nx) {
+            addReply(c,shared.czero);
+            return;
+        }
+        dbReplace(c->db,key,val);
+    }
+    incrRefCount(val); /* the value is now referenced by the database too */
+    server.dirty++;
+    removeExpire(c->db,key);
+    if (expire != NULL) setExpire(c->db,key,time(NULL)+ttl);
+    addReply(c, nx ? shared.cone : shared.ok);
+}
+
+/* Unconditional set of argv[1] to argv[2], without expire. */
+void setCommand(redisClient *c) {
+    robj *key = c->argv[1], *val = c->argv[2];
+
+    setGenericCommand(c, 0, key, val, NULL);
+}
+
+/* Set argv[1] to argv[2] only if the key does not exist, without expire. */
+void setnxCommand(redisClient *c) {
+    robj *key = c->argv[1], *val = c->argv[2];
+
+    setGenericCommand(c, 1, key, val, NULL);
+}
+
+/* Set with expire: argv[1] is the key, argv[2] the time to live and
+ * argv[3] the value. */
+void setexCommand(redisClient *c) {
+    robj *key = c->argv[1], *ttl = c->argv[2], *val = c->argv[3];
+
+    setGenericCommand(c, 0, key, val, ttl);
+}
+
+/* Reply with the string stored at argv[1].
+ *
+ * Returns REDIS_OK when the key is missing (lookupKeyReadOrReply is given
+ * shared.nullbulk as the reply for that case) or when the value was sent,
+ * and REDIS_ERR after replying with a wrong type error. */
+int getGenericCommand(redisClient *c) {
+    robj *val = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+
+    if (val == NULL) return REDIS_OK;
+    if (val->type == REDIS_STRING) {
+        addReplyBulk(c,val);
+        return REDIS_OK;
+    }
+    addReply(c,shared.wrongtypeerr);
+    return REDIS_ERR;
+}
+
+/* Reply with the value of argv[1]; the status code returned by the generic
+ * helper is not needed here. */
+void getCommand(redisClient *c) {
+    (void) getGenericCommand(c);
+}
+
+/* Reply with the old value of argv[1] and store argv[2] in its place.
+ *
+ * Nothing is written when the existing value is not a string (the generic
+ * GET helper already replied with an error in that case). Any expire set
+ * on the key is removed. */
+void getsetCommand(redisClient *c) {
+    if (getGenericCommand(c) == REDIS_ERR) return;
+    /* The old value was fetched through the read path only, so explicitly
+     * flag the key as modified for clients watching it, the same way
+     * setGenericCommand() does before writing. */
+    touchWatchedKey(c->db,c->argv[1]);
+    dbReplace(c->db,c->argv[1],c->argv[2]);
+    incrRefCount(c->argv[2]); /* now referenced by the database too */
+    server.dirty++;
+    removeExpire(c->db,c->argv[1]);
+}
+
+/* Reply with a multi bulk holding one entry per requested key (argv[1]
+ * onwards). Missing keys and keys not holding a string both produce a
+ * null bulk entry. */
+void mgetCommand(redisClient *c) {
+    int idx;
+
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",c->argc-1));
+    for (idx = 1; idx < c->argc; idx++) {
+        robj *val = lookupKeyRead(c->db,c->argv[idx]);
+
+        if (val != NULL && val->type == REDIS_STRING)
+            addReplyBulk(c,val);
+        else
+            addReply(c,shared.nullbulk);
+    }
+}
+
+/* Common implementation of msetCommand and msetnxCommand.
+ *
+ * The arguments after the command name are key/value pairs. With nx set,
+ * nothing is written and shared.czero is replied if at least one of the
+ * keys already exists; otherwise every pair is stored, expires on the keys
+ * are removed, and the reply is shared.cone (nx) or shared.ok. */
+void msetGenericCommand(redisClient *c, int nx) {
+    int j, busykeys = 0;
+
+    /* command name + pairs means argc must be odd */
+    if ((c->argc % 2) == 0) {
+        addReplySds(c,sdsnew("-ERR wrong number of arguments for MSET\r\n"));
+        return;
+    }
+    /* Handle the NX flag. The MSETNX semantic is to return zero and set
+     * nothing at all if at least one key already exists. Every key is
+     * looked up (no early exit) so the lookup side effects stay the same
+     * for all of them. */
+    if (nx) {
+        for (j = 1; j < c->argc; j += 2) {
+            if (lookupKeyWrite(c->db,c->argv[j]) != NULL) {
+                busykeys++;
+            }
+        }
+    }
+    if (busykeys) {
+        addReply(c, shared.czero);
+        return;
+    }
+
+    for (j = 1; j < c->argc; j += 2) {
+        c->argv[j+1] = tryObjectEncoding(c->argv[j+1]);
+        /* Plain MSET never goes through a write lookup, so flag each key
+         * as modified for watching clients explicitly, as
+         * setGenericCommand() does. */
+        touchWatchedKey(c->db,c->argv[j]);
+        dbReplace(c->db,c->argv[j],c->argv[j+1]);
+        incrRefCount(c->argv[j+1]);
+        removeExpire(c->db,c->argv[j]);
+    }
+    server.dirty += (c->argc-1)/2; /* one change per key/value pair */
+    addReply(c, nx ? shared.cone : shared.ok);
+}
+
+/* Multi-set that always overwrites (nx flag off). */
+void msetCommand(redisClient *c) {
+    const int nx = 0;
+
+    msetGenericCommand(c, nx);
+}
+
+/* Multi-set that writes only if none of the keys exists (nx flag on). */
+void msetnxCommand(redisClient *c) {
+    const int nx = 1;
+
+    msetGenericCommand(c, nx);
+}
+
+/* Add 'incr' (which may be negative) to the integer stored at argv[1],
+ * store the result as a new string object and reply with it as an integer.
+ *
+ * An existing value of the wrong type, or one that cannot be read as a
+ * long long, results in an error reply produced by the helpers. A result
+ * that does not fit in a long long is rejected with an error instead of
+ * being computed with an overflowing signed addition (undefined behavior
+ * in C). */
+void incrDecrCommand(redisClient *c, long long incr) {
+    long long value, result;
+    robj *o;
+
+    o = lookupKeyWrite(c->db,c->argv[1]);
+    if (o != NULL && checkType(c,o,REDIS_STRING)) return;
+    if (getLongLongFromObjectOrReply(c,o,&value,NULL) != REDIS_OK) return;
+
+    /* Do the sum with unsigned operands, where wrap-around is well defined,
+     * then detect overflow: the result must have moved in the direction of
+     * the increment. */
+    result = (long long)((unsigned long long)value + (unsigned long long)incr);
+    if ((incr > 0 && result < value) || (incr < 0 && result > value)) {
+        addReplySds(c,sdsnew("-ERR increment or decrement would overflow\r\n"));
+        return;
+    }
+
+    o = createStringObjectFromLongLong(result);
+    dbReplace(c->db,c->argv[1],o);
+    server.dirty++;
+    /* integer reply: ":" <value> CRLF */
+    addReply(c,shared.colon);
+    addReply(c,o);
+    addReply(c,shared.crlf);
+}
+
+/* Increment the integer at argv[1] by one. */
+void incrCommand(redisClient *c) {
+    const long long delta = 1;
+
+    incrDecrCommand(c, delta);
+}
+
+/* Decrement the integer at argv[1] by one. */
+void decrCommand(redisClient *c) {
+    const long long delta = -1;
+
+    incrDecrCommand(c, delta);
+}
+
+/* Increment the integer at argv[1] by the amount given in argv[2]. The
+ * parsing helper replies with an error when argv[2] is not an integer. */
+void incrbyCommand(redisClient *c) {
+    long long delta;
+    int parsed = getLongLongFromObjectOrReply(c, c->argv[2], &delta, NULL);
+
+    if (parsed == REDIS_OK)
+        incrDecrCommand(c, delta);
+}
+
+/* Decrement the integer at argv[1] by the amount given in argv[2]. The
+ * parsing helper replies with an error when argv[2] is not an integer. */
+void decrbyCommand(redisClient *c) {
+    long long delta;
+    int parsed = getLongLongFromObjectOrReply(c, c->argv[2], &delta, NULL);
+
+    if (parsed == REDIS_OK)
+        incrDecrCommand(c, -delta);
+}
+
+/* Append argv[2] to the string stored at argv[1], creating the key when it
+ * does not exist, and reply with the resulting length as an integer.
+ * A wrong type error is replied if the key holds a non string value. */
+void appendCommand(redisClient *c) {
+    size_t totlen;
+    robj *o;
+
+    o = lookupKeyWrite(c->db,c->argv[1]);
+    if (o == NULL) {
+        /* Create the key: the argument object itself becomes the value */
+        dbAdd(c->db,c->argv[1],c->argv[2]);
+        incrRefCount(c->argv[2]);
+        totlen = stringObjectLen(c->argv[2]);
+    } else {
+        if (o->type != REDIS_STRING) {
+            addReply(c,shared.wrongtypeerr);
+            return;
+        }
+        /* If the object is specially encoded or shared we have to make
+         * a private, raw encoded copy before modifying it in place */
+        if (o->refcount != 1 || o->encoding != REDIS_ENCODING_RAW) {
+            robj *decoded = getDecodedObject(o);
+
+            o = createStringObject(decoded->ptr, sdslen(decoded->ptr));
+            decrRefCount(decoded);
+            dbReplace(c->db,c->argv[1],o);
+        }
+        /* APPEND! */
+        if (c->argv[2]->encoding == REDIS_ENCODING_RAW) {
+            o->ptr = sdscatlen(o->ptr,
+                c->argv[2]->ptr, sdslen(c->argv[2]->ptr));
+        } else {
+            /* Non raw argument: the pointer field itself carries the
+             * integer value. It is printed as a signed long, so the
+             * argument is cast to the type "%ld" expects. */
+            o->ptr = sdscatprintf(o->ptr, "%ld",
+                (long) c->argv[2]->ptr);
+        }
+        totlen = sdslen(o->ptr);
+    }
+    server.dirty++;
+    addReplySds(c,sdscatprintf(sdsempty(),":%lu\r\n",(unsigned long)totlen));
+}
+
+/* Reply with the part of the string at argv[1] between the indexes argv[2]
+ * (start) and argv[3] (end), both inclusive. Negative indexes count from
+ * the end of the string. A null bulk is replied when the key is missing or
+ * the range is empty / out of range; non integer indexes produce an error
+ * reply from the parsing helper, like in the other range commands of this
+ * file. */
+void substrCommand(redisClient *c) {
+    robj *o;
+    long start, end;
+    size_t rangelen, len;
+    sds range;
+
+    if ((getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) ||
+        (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK)) return;
+
+    if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk)) == NULL ||
+        checkType(c,o,REDIS_STRING)) return;
+
+    /* work on a decoded object; released before every return below */
+    o = getDecodedObject(o);
+    len = sdslen(o->ptr);
+
+    /* convert negative indexes */
+    if (start < 0) start += (long)len;
+    if (end < 0) end += (long)len;
+    if (start < 0) start = 0;
+    if (end < 0) end = 0;
+
+    /* indexes sanity checks */
+    if (start > end || (size_t)start >= len) {
+        /* Out of range start or start > end result in null reply */
+        addReply(c,shared.nullbulk);
+        decrRefCount(o);
+        return;
+    }
+    if ((size_t)end >= len) end = len-1;
+    rangelen = (end-start)+1;
+
+    /* Return the result as a bulk reply: "$<len>" CRLF <bytes> CRLF */
+    addReplySds(c,sdscatprintf(sdsempty(),"$%zu\r\n",rangelen));
+    range = sdsnewlen((char*)o->ptr+start,rangelen);
+    addReplySds(c,range);
+    addReply(c,shared.crlf);
+    decrRefCount(o);
+}
+
+
diff --git a/src/t_zset.c b/src/t_zset.c
new file mode 100644
index 000000000..de32a8eed
--- /dev/null
+++ b/src/t_zset.c
@@ -0,0 +1,985 @@
+#include "redis.h"
+
+#include <math.h>
+
+/*-----------------------------------------------------------------------------
+ * Sorted set API
+ *----------------------------------------------------------------------------*/
+
+/* ZSETs are ordered sets using two data structures to hold the same elements
+ * in order to get O(log(N)) INSERT and REMOVE operations into a sorted
+ * data structure.
+ *
+ * The elements are added to a hash table mapping Redis objects to scores.
+ * At the same time the elements are added to a skip list mapping scores
+ * to Redis objects (so objects are sorted by scores in this "view"). */
+
+/* This skiplist implementation is almost a C translation of the original
+ * algorithm described by William Pugh in "Skip Lists: A Probabilistic
+ * Alternative to Balanced Trees", modified in three ways:
+ * a) this implementation allows for repeated values.
+ * b) the comparison is not just by key (our 'score') but by satellite data.
+ * c) there is a back pointer, so it's a doubly linked list with the back
+ * pointers being only at "level 1". This allows to traverse the list
+ * from tail to head, useful for ZREVRANGE. */
+
+/* Allocate a skiplist node with 'level' forward pointers holding the given
+ * score and object. The forward pointers, the spans and the backward
+ * pointer are left uninitialized; the object refcount is not touched. */
+zskiplistNode *zslCreateNode(int level, double score, robj *obj) {
+    zskiplistNode *node = zmalloc(sizeof(zskiplistNode));
+
+    node->forward = zmalloc(level * sizeof(zskiplistNode*));
+    /* span[i-1] belongs to level i, so a single level node has no spans */
+    node->span = (level > 1) ?
+        zmalloc((level - 1) * sizeof(unsigned int)) : NULL;
+    node->score = score;
+    node->obj = obj;
+    return node;
+}
+
+/* Create an empty skiplist: level 1, length 0, no tail, and a header node
+ * of ZSKIPLIST_MAXLEVEL levels with all links NULL and all spans zero. */
+zskiplist *zslCreate(void) {
+    zskiplist *list = zmalloc(sizeof(zskiplist));
+    zskiplistNode *head;
+    int lvl;
+
+    list->level = 1;
+    list->length = 0;
+    head = zslCreateNode(ZSKIPLIST_MAXLEVEL,0,NULL);
+
+    for (lvl = 0; lvl < ZSKIPLIST_MAXLEVEL; lvl++)
+        head->forward[lvl] = NULL;
+    /* span has space for ZSKIPLIST_MAXLEVEL-1 elements only */
+    for (lvl = 0; lvl < ZSKIPLIST_MAXLEVEL-1; lvl++)
+        head->span[lvl] = 0;
+    head->backward = NULL;
+
+    list->header = head;
+    list->tail = NULL;
+    return list;
+}
+
+/* Release a skiplist node: drop the reference it holds on its object and
+ * free the forward/span arrays together with the node itself. */
+void zslFreeNode(zskiplistNode *node) {
+    robj *payload = node->obj;
+
+    decrRefCount(payload);
+    zfree(node->span);
+    zfree(node->forward);
+    zfree(node);
+}
+
+/* Free a whole skiplist: the header (which holds no object), then every
+ * element node following the level 0 links, then the list structure. */
+void zslFree(zskiplist *zsl) {
+    zskiplistNode *head = zsl->header;
+    zskiplistNode *cur = head->forward[0]; /* grab it before freeing head */
+
+    zfree(head->forward);
+    zfree(head->span);
+    zfree(head);
+
+    while (cur != NULL) {
+        zskiplistNode *following = cur->forward[0];
+
+        zslFreeNode(cur);
+        cur = following;
+    }
+    zfree(zsl);
+}
+
+/* Pick a random level for a new node: start from 1 and keep growing while
+ * the low 16 bits of random() fall below ZSKIPLIST_P * 0xFFFF. The result
+ * is capped at ZSKIPLIST_MAXLEVEL. */
+int zslRandomLevel(void) {
+    int lvl;
+
+    for (lvl = 1; (random()&0xFFFF) < (ZSKIPLIST_P * 0xFFFF); lvl++)
+        ; /* each successful draw adds one level */
+    if (lvl >= ZSKIPLIST_MAXLEVEL) return ZSKIPLIST_MAXLEVEL;
+    return lvl;
+}
+
+/* Insert a new node holding 'score' and 'obj' into the skiplist, keeping
+ * the per-level spans, the level 0 backward pointers, the tail pointer and
+ * the length up to date. The refcount of 'obj' is not modified here.
+ * Note that the span of level i (i > 0) is stored in span[i-1]; level 0
+ * has an implicit span of 1. */
+void zslInsert(zskiplist *zsl, double score, robj *obj) {
+ /* update[i]: last node at level i that sorts before the new element.
+ * rank[i]: number of elements crossed to reach update[i]. */
+ zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
+ unsigned int rank[ZSKIPLIST_MAXLEVEL];
+ int i, level;
+
+ x = zsl->header;
+ for (i = zsl->level-1; i >= 0; i--) {
+ /* store rank that is crossed to reach the insert position */
+ rank[i] = i == (zsl->level-1) ? 0 : rank[i+1];
+
+ /* advance while the next node sorts before (score, obj) */
+ while (x->forward[i] &&
+ (x->forward[i]->score < score ||
+ (x->forward[i]->score == score &&
+ compareStringObjects(x->forward[i]->obj,obj) < 0))) {
+ rank[i] += i > 0 ? x->span[i-1] : 1;
+ x = x->forward[i];
+ }
+ update[i] = x;
+ }
+ /* we assume the key is not already inside, since we allow duplicated
+ * scores, and the re-insertion of score and redis object should never
+ * happen since the caller of zslInsert() should test in the hash table
+ * if the element is already inside or not. */
+ level = zslRandomLevel();
+ if (level > zsl->level) {
+ /* levels not used so far start from the header, whose span is set to
+ * the whole current length */
+ for (i = zsl->level; i < level; i++) {
+ rank[i] = 0;
+ update[i] = zsl->header;
+ update[i]->span[i-1] = zsl->length;
+ }
+ zsl->level = level;
+ }
+ x = zslCreateNode(level,score,obj);
+ for (i = 0; i < level; i++) {
+ x->forward[i] = update[i]->forward[i];
+ update[i]->forward[i] = x;
+
+ /* update span covered by update[i] as x is inserted here */
+ if (i > 0) {
+ x->span[i-1] = update[i]->span[i-1] - (rank[0] - rank[i]);
+ update[i]->span[i-1] = (rank[0] - rank[i]) + 1;
+ }
+ }
+
+ /* increment span for untouched levels */
+ for (i = level; i < zsl->level; i++) {
+ update[i]->span[i-1]++;
+ }
+
+ /* the header is never exposed as a backward link */
+ x->backward = (update[0] == zsl->header) ? NULL : update[0];
+ if (x->forward[0])
+ x->forward[0]->backward = x;
+ else
+ zsl->tail = x;
+ zsl->length++;
+}
+
+/* Internal function used by zslDelete, zslDeleteByScore and zslDeleteByRank.
+ * Unlinks node 'x' from the skiplist; 'update' must hold, for every level
+ * in use, the last node preceding 'x' at that level. Spans, the backward
+ * link of the following node (or the tail), the list level and the length
+ * are adjusted. The node itself is not freed here. */
+void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) {
+ int i;
+ for (i = 0; i < zsl->level; i++) {
+ if (update[i]->forward[i] == x) {
+ /* x is linked at this level: merge its span into the predecessor */
+ if (i > 0) {
+ update[i]->span[i-1] += x->span[i-1] - 1;
+ }
+ update[i]->forward[i] = x->forward[i];
+ } else {
+ /* invariant: i > 0, because update[0]->forward[0]
+ * is always equal to x */
+ update[i]->span[i-1] -= 1;
+ }
+ }
+ if (x->forward[0]) {
+ x->forward[0]->backward = x->backward;
+ } else {
+ zsl->tail = x->backward;
+ }
+ /* drop the top levels left without any node */
+ while(zsl->level > 1 && zsl->header->forward[zsl->level-1] == NULL)
+ zsl->level--;
+ zsl->length--;
+}
+
+/* Delete an element with matching score/object from the skiplist.
+ * Returns 1 if the node was found, unlinked and freed, 0 otherwise. */
+int zslDelete(zskiplist *zsl, double score, robj *obj) {
+    zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x;
+    int i;
+
+    /* collect, per level, the last node sorting before (score, obj) */
+    x = zsl->header;
+    for (i = zsl->level-1; i >= 0; i--) {
+        while (x->forward[i] &&
+            (x->forward[i]->score < score ||
+                (x->forward[i]->score == score &&
+                compareStringObjects(x->forward[i]->obj,obj) < 0)))
+            x = x->forward[i];
+        update[i] = x;
+    }
+    /* We may have multiple elements with the same score, what we need
+     * is to find the element with both the right score and object. */
+    x = x->forward[0];
+    if (x && score == x->score && equalStringObjects(x->obj,obj)) {
+        zslDeleteNode(zsl, x, update);
+        zslFreeNode(x);
+        return 1;
+    }
+    return 0; /* not found */
+}
+
+/* Delete all the elements with min <= score <= max (both bounds are
+ * inclusive) from the skiplist, removing each of them from the hash table
+ * 'dict' as well. Returns the number of elements removed. */
+unsigned long zslDeleteRangeByScore(zskiplist *zsl, double min, double max, dict *dict) {
+    zskiplistNode *update[ZSKIPLIST_MAXLEVEL];
+    zskiplistNode *cur = zsl->header, *following;
+    unsigned long count = 0;
+    int lvl;
+
+    /* per level, find the last node with a score smaller than min */
+    for (lvl = zsl->level-1; lvl >= 0; lvl--) {
+        while (cur->forward[lvl] && cur->forward[lvl]->score < min)
+            cur = cur->forward[lvl];
+        update[lvl] = cur;
+    }
+
+    /* From the first node with score >= min, delete while still in range.
+     * The update vector stays valid because every deleted node directly
+     * follows update[0]. */
+    for (cur = cur->forward[0]; cur && cur->score <= max; cur = following) {
+        following = cur->forward[0];
+        zslDeleteNode(zsl, cur, update);
+        dictDelete(dict,cur->obj);
+        zslFreeNode(cur);
+        count++;
+    }
+    return count;
+}
+
+/* Delete all the elements whose rank is between start and end (inclusive,
+ * 1-based) from the skiplist, removing each of them from the hash table
+ * 'dict' as well. Returns the number of elements removed. */
+unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned int end, dict *dict) {
+    zskiplistNode *update[ZSKIPLIST_MAXLEVEL];
+    zskiplistNode *cur = zsl->header;
+    unsigned long traversed = 0, count = 0;
+    int lvl;
+
+    /* per level, stop at the last node whose rank is smaller than start */
+    for (lvl = zsl->level-1; lvl >= 0; lvl--) {
+        while (cur->forward[lvl]) {
+            /* level 0 has an implicit span of 1 */
+            unsigned long step = lvl > 0 ? cur->span[lvl-1] : 1;
+
+            if (traversed + step >= start) break;
+            traversed += step;
+            cur = cur->forward[lvl];
+        }
+        update[lvl] = cur;
+    }
+
+    /* 'traversed' now tracks the rank of the node being examined */
+    cur = cur->forward[0];
+    for (traversed++; cur && traversed <= end; traversed++) {
+        zskiplistNode *following = cur->forward[0];
+
+        zslDeleteNode(zsl, cur, update);
+        dictDelete(dict,cur->obj);
+        zslFreeNode(cur);
+        count++;
+        cur = following;
+    }
+    return count;
+}
+
+/* Find the first node having a score equal or greater than the specified
+ * one. Returns NULL if there is no such node. */
+zskiplistNode *zslFirstWithScore(zskiplist *zsl, double score) {
+    zskiplistNode *cur = zsl->header;
+    int lvl = zsl->level;
+
+    /* descend level by level, always staying on the last node whose
+     * score is smaller than the requested one */
+    while (lvl-- > 0) {
+        zskiplistNode *ahead;
+
+        while ((ahead = cur->forward[lvl]) != NULL && ahead->score < score)
+            cur = ahead;
+    }
+    /* the level 0 successor is the first node with score >= 'score' */
+    return cur->forward[0];
+}
+
+/* Find the rank for an element by both score and key.
+ * Returns 0 when the element cannot be found, rank otherwise.
+ * Note that the rank is 1-based due to the span of zsl->header to the
+ * first element. */
+unsigned long zslistTypeGetRank(zskiplist *zsl, double score, robj *o) {
+    zskiplistNode *cur = zsl->header;
+    unsigned long crossed = 0;
+    int lvl = zsl->level;
+
+    while (lvl-- > 0) {
+        for (;;) {
+            zskiplistNode *ahead = cur->forward[lvl];
+
+            if (ahead == NULL) break;
+            /* stop as soon as the next node sorts after (score, o) */
+            if (!(ahead->score < score) &&
+                !(ahead->score == score &&
+                  compareStringObjects(ahead->obj,o) <= 0)) break;
+            crossed += lvl > 0 ? cur->span[lvl-1] : 1;
+            cur = ahead;
+        }
+
+        /* cur might be the header, whose obj is NULL, so test it first */
+        if (cur->obj && equalStringObjects(cur->obj,o))
+            return crossed;
+    }
+    return 0;
+}
+
+/* Finds an element by its rank. The rank argument needs to be 1-based.
+ * Returns NULL when no node is found at that rank. */
+zskiplistNode* zslistTypeGetElementByRank(zskiplist *zsl, unsigned long rank) {
+    zskiplistNode *cur = zsl->header;
+    unsigned long traversed = 0;
+    int lvl = zsl->level;
+
+    while (lvl-- > 0) {
+        while (cur->forward[lvl]) {
+            /* level 0 has an implicit span of 1 */
+            unsigned long step = lvl > 0 ? cur->span[lvl-1] : 1;
+
+            if (traversed + step > rank) break;
+            traversed += step;
+            cur = cur->forward[lvl];
+        }
+        if (traversed == rank)
+            return cur;
+    }
+    return NULL;
+}
+
+/*-----------------------------------------------------------------------------
+ * Sorted set commands
+ *----------------------------------------------------------------------------*/
+
+/* This generic command implements both ZADD and ZINCRBY.
+ * scoreval is the score if the operation is a ZADD (doincrement == 0) or
+ * the increment if the operation is a ZINCRBY (doincrement == 1).
+ *
+ * The sorted set at 'key' is created when missing. The reply is the new
+ * score for ZINCRBY, and shared.cone / shared.czero for ZADD depending on
+ * whether the element was added or was already a member. */
+void zaddGenericCommand(redisClient *c, robj *key, robj *ele, double scoreval, int doincrement) {
+    robj *zsetobj;
+    zset *zs;
+    double *score;
+    double newscore; /* by-value copy of *score: stays readable after the
+                      * allocation is freed or handed over to the dict */
+
+    if (isnan(scoreval)) {
+        addReplySds(c,sdsnew("-ERR provide score is Not A Number (nan)\r\n"));
+        return;
+    }
+
+    zsetobj = lookupKeyWrite(c->db,key);
+    if (zsetobj == NULL) {
+        zsetobj = createZsetObject();
+        dbAdd(c->db,key,zsetobj);
+    } else {
+        if (zsetobj->type != REDIS_ZSET) {
+            addReply(c,shared.wrongtypeerr);
+            return;
+        }
+    }
+    zs = zsetobj->ptr;
+
+    /* Ok now since we implement both ZADD and ZINCRBY here the code
+     * needs to handle the two different conditions. It's all about setting
+     * '*score', that is, the new score to set, to the right value. */
+    score = zmalloc(sizeof(double));
+    if (doincrement) {
+        dictEntry *de;
+
+        /* Read the old score. If the element was not present starts from 0 */
+        de = dictFind(zs->dict,ele);
+        if (de) {
+            double *oldscore = dictGetEntryVal(de);
+            *score = *oldscore + scoreval;
+        } else {
+            *score = scoreval;
+        }
+        if (isnan(*score)) {
+            addReplySds(c,
+                sdsnew("-ERR resulting score is Not A Number (nan)\r\n"));
+            zfree(score);
+            /* Note that we don't need to check if the zset may be empty and
+             * should be removed here, as we can only obtain Nan as score if
+             * there was already an element in the sorted set. */
+            return;
+        }
+    } else {
+        *score = scoreval;
+    }
+    newscore = *score;
+
+    /* What follows is a simple remove and re-insert operation that is common
+     * to both ZADD and ZINCRBY... */
+    if (dictAdd(zs->dict,ele,score) == DICT_OK) {
+        /* case 1: New element */
+        incrRefCount(ele); /* added to hash */
+        zslInsert(zs->zsl,newscore,ele);
+        incrRefCount(ele); /* added to skiplist */
+        server.dirty++;
+        if (doincrement)
+            addReplyDouble(c,newscore);
+        else
+            addReply(c,shared.cone);
+    } else {
+        dictEntry *de;
+        double *oldscore;
+
+        /* case 2: Score update operation */
+        de = dictFind(zs->dict,ele);
+        redisAssert(de != NULL);
+        oldscore = dictGetEntryVal(de);
+        if (newscore != *oldscore) {
+            int deleted;
+
+            /* Remove and insert the element in the skip list with new score */
+            deleted = zslDelete(zs->zsl,*oldscore,ele);
+            redisAssert(deleted != 0);
+            zslInsert(zs->zsl,newscore,ele);
+            incrRefCount(ele);
+            /* Update the score in the hash table */
+            dictReplace(zs->dict,ele,score);
+            server.dirty++;
+        } else {
+            /* Score unchanged: the new allocation is not needed. From
+             * here on only 'newscore' may be used, never '*score'. */
+            zfree(score);
+        }
+        if (doincrement)
+            addReplyDouble(c,newscore);
+        else
+            addReply(c,shared.czero);
+    }
+}
+
+/* Add/update: argv[1] is the key, argv[2] the score, argv[3] the element.
+ * The parsing helper replies with an error if the score is not a double. */
+void zaddCommand(redisClient *c) {
+    double parsed;
+    robj *key = c->argv[1], *member = c->argv[3];
+
+    if (getDoubleFromObjectOrReply(c, c->argv[2], &parsed, NULL) == REDIS_OK)
+        zaddGenericCommand(c, key, member, parsed, 0);
+}
+
+/* Increment: argv[1] is the key, argv[2] the increment, argv[3] the
+ * element. The parsing helper replies with an error if the increment is
+ * not a double. */
+void zincrbyCommand(redisClient *c) {
+    double parsed;
+    robj *key = c->argv[1], *member = c->argv[3];
+
+    if (getDoubleFromObjectOrReply(c, c->argv[2], &parsed, NULL) == REDIS_OK)
+        zaddGenericCommand(c, key, member, parsed, 1);
+}
+
+/* Remove the element argv[2] from the sorted set at argv[1]. Replies with
+ * shared.cone when the element was removed and shared.czero when the key
+ * or the element does not exist. The key is deleted when the sorted set
+ * becomes empty. */
+void zremCommand(redisClient *c) {
+    robj *key = c->argv[1], *member = c->argv[2];
+    robj *o;
+    zset *zs;
+    dictEntry *entry;
+    double *stored;
+    int removed;
+
+    o = lookupKeyWriteOrReply(c,key,shared.czero);
+    if (o == NULL || checkType(c,o,REDIS_ZSET)) return;
+    zs = o->ptr;
+
+    entry = dictFind(zs->dict,member);
+    if (entry == NULL) {
+        addReply(c,shared.czero);
+        return;
+    }
+
+    /* The skiplist node is located by score, so remove it first while the
+     * score stored in the hash table is still readable. */
+    stored = dictGetEntryVal(entry);
+    removed = zslDelete(zs->zsl,*stored,member);
+    redisAssert(removed != 0);
+
+    /* Then drop the hash table entry */
+    dictDelete(zs->dict,member);
+    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
+    if (dictSize(zs->dict) == 0) dbDelete(c->db,key);
+    server.dirty++;
+    addReply(c,shared.cone);
+}
+
+/* Remove from the sorted set at argv[1] every element with a score between
+ * argv[2] and argv[3] (inclusive) and reply with the number of elements
+ * removed. The key is deleted when the sorted set becomes empty. */
+void zremrangebyscoreCommand(redisClient *c) {
+    double lo, hi;
+    long removed;
+    robj *o;
+    zset *zs;
+
+    if (getDoubleFromObjectOrReply(c, c->argv[2], &lo, NULL) != REDIS_OK) return;
+    if (getDoubleFromObjectOrReply(c, c->argv[3], &hi, NULL) != REDIS_OK) return;
+
+    o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
+    if (o == NULL || checkType(c,o,REDIS_ZSET)) return;
+    zs = o->ptr;
+
+    removed = zslDeleteRangeByScore(zs->zsl,lo,hi,zs->dict);
+    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
+    if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty += removed;
+    addReplyLongLong(c,removed);
+}
+
+/* Remove from the sorted set at argv[1] every element whose 0-based rank
+ * is between argv[2] and argv[3] (inclusive; negative values count from
+ * the end) and reply with the number of elements removed. The key is
+ * deleted when the sorted set becomes empty. */
+void zremrangebyrankCommand(redisClient *c) {
+    long first, last, removed;
+    int card;
+    robj *o;
+    zset *zs;
+
+    if (getLongFromObjectOrReply(c, c->argv[2], &first, NULL) != REDIS_OK) return;
+    if (getLongFromObjectOrReply(c, c->argv[3], &last, NULL) != REDIS_OK) return;
+
+    o = lookupKeyWriteOrReply(c,c->argv[1],shared.czero);
+    if (o == NULL || checkType(c,o,REDIS_ZSET)) return;
+    zs = o->ptr;
+    card = zs->zsl->length;
+
+    /* negative indexes are relative to the end, clamped at zero */
+    if (first < 0) {
+        first += card;
+        if (first < 0) first = 0;
+    }
+    if (last < 0) {
+        last += card;
+        if (last < 0) last = 0;
+    }
+
+    /* empty or out of range interval: nothing to remove */
+    if (first > last || first >= card) {
+        addReply(c,shared.czero);
+        return;
+    }
+    if (last >= card) last = card-1;
+
+    /* the zsl*Rank functions use 1-based ranks, hence the +1 */
+    removed = zslDeleteRangeByRank(zs->zsl,first+1,last+1,zs->dict);
+    if (htNeedsResize(zs->dict)) dictResize(zs->dict);
+    if (dictSize(zs->dict) == 0) dbDelete(c->db,c->argv[1]);
+    server.dirty += removed;
+    addReplyLongLong(c, removed);
+}
+
+/* One input of the ZUNIONSTORE/ZINTERSTORE implementation below. */
+typedef struct {
+ dict *dict; /* hash table of the source key, NULL if the key is missing */
+ double weight; /* factor every score of this source is multiplied by */
+} zsetopsrc;
+
+/* qsort() comparison function ordering zsetopsrc entries by ascending
+ * cardinality; an entry with a NULL dict counts as empty.
+ *
+ * The result is computed with comparisons instead of a subtraction: the
+ * difference of two unsigned long values truncated to int can get the
+ * wrong sign (or become zero) when the sizes are far apart. */
+int qsortCompareZsetopsrcByCardinality(const void *s1, const void *s2) {
+    const zsetopsrc *d1 = s1, *d2 = s2;
+    unsigned long size1, size2;
+
+    size1 = d1->dict ? dictSize(d1->dict) : 0;
+    size2 = d2->dict ? dictSize(d2->dict) : 0;
+    return (size1 > size2) - (size1 < size2);
+}
+
+/* Ways of combining the scores an element has in different sources */
+#define REDIS_AGGR_SUM 1
+#define REDIS_AGGR_MIN 2
+#define REDIS_AGGR_MAX 3
+/* Score of a dict entry: entries with a NULL value (no score attached)
+ * count as 1.0, otherwise the value is read as a pointer to double. */
+#define zunionInterDictValue(_e) (dictGetEntryVal(_e) == NULL ? 1.0 : *(double*)dictGetEntryVal(_e))
+
+/* Fold 'val' into '*target' according to 'aggregate' (one of the
+ * REDIS_AGGR_* constants): sum, minimum or maximum. Any other value of
+ * 'aggregate' is a programming error and triggers a panic. */
+inline static void zunionInterAggregate(double *target, double val, int aggregate) {
+    switch (aggregate) {
+    case REDIS_AGGR_SUM:
+        *target += val;
+        break;
+    case REDIS_AGGR_MIN:
+        if (val < *target) *target = val;
+        break;
+    case REDIS_AGGR_MAX:
+        if (val > *target) *target = val;
+        break;
+    default:
+        /* safety net */
+        redisPanic("Unknown ZUNION/INTER aggregate type");
+    }
+}
+
+/* Common implementation of ZUNIONSTORE and ZINTERSTORE.
+ *
+ * dstkey - key where the resulting sorted set is stored.
+ * op     - REDIS_OP_UNION or REDIS_OP_INTER.
+ *
+ * argv[2] holds the number of input keys, which follow from argv[3]; both
+ * sorted sets and plain sets are accepted as inputs. The optional WEIGHTS
+ * and AGGREGATE arguments come after the keys. The reply is the number of
+ * elements in the result; when the result is empty nothing is stored. */
+void zunionInterGenericCommand(redisClient *c, robj *dstkey, int op) {
+    int i, j, setnum;
+    int aggregate = REDIS_AGGR_SUM;
+    zsetopsrc *src;
+    robj *dstobj;
+    zset *dstzset;
+    dictIterator *di;
+    dictEntry *de;
+
+    /* expect setnum input keys to be given */
+    setnum = atoi(c->argv[2]->ptr);
+    if (setnum < 1) {
+        addReplySds(c,sdsnew("-ERR at least 1 input key is needed for ZUNIONSTORE/ZINTERSTORE\r\n"));
+        return;
+    }
+
+    /* Test if the expected number of keys exceeds the arguments given.
+     * Written as a subtraction so that a huge setnum cannot overflow the
+     * int arithmetic. */
+    if (setnum > c->argc-3) {
+        addReply(c,shared.syntaxerr);
+        return;
+    }
+
+    /* read keys to be used for input */
+    src = zmalloc(sizeof(zsetopsrc) * setnum);
+    for (i = 0, j = 3; i < setnum; i++, j++) {
+        robj *obj = lookupKeyWrite(c->db,c->argv[j]);
+        if (!obj) {
+            src[i].dict = NULL;
+        } else {
+            if (obj->type == REDIS_ZSET) {
+                src[i].dict = ((zset*)obj->ptr)->dict;
+            } else if (obj->type == REDIS_SET) {
+                src[i].dict = (obj->ptr);
+            } else {
+                zfree(src);
+                addReply(c,shared.wrongtypeerr);
+                return;
+            }
+        }
+
+        /* default all weights to 1 */
+        src[i].weight = 1.0;
+    }
+
+    /* parse optional extra arguments */
+    if (j < c->argc) {
+        int remaining = c->argc - j;
+
+        while (remaining) {
+            if (remaining >= (setnum + 1) && !strcasecmp(c->argv[j]->ptr,"weights")) {
+                j++; remaining--;
+                for (i = 0; i < setnum; i++, j++, remaining--) {
+                    if (getDoubleFromObjectOrReply(c, c->argv[j], &src[i].weight, NULL) != REDIS_OK) {
+                        /* the error reply was already sent: just release
+                         * the sources array before leaving */
+                        zfree(src);
+                        return;
+                    }
+                }
+            } else if (remaining >= 2 && !strcasecmp(c->argv[j]->ptr,"aggregate")) {
+                j++; remaining--;
+                if (!strcasecmp(c->argv[j]->ptr,"sum")) {
+                    aggregate = REDIS_AGGR_SUM;
+                } else if (!strcasecmp(c->argv[j]->ptr,"min")) {
+                    aggregate = REDIS_AGGR_MIN;
+                } else if (!strcasecmp(c->argv[j]->ptr,"max")) {
+                    aggregate = REDIS_AGGR_MAX;
+                } else {
+                    zfree(src);
+                    addReply(c,shared.syntaxerr);
+                    return;
+                }
+                j++; remaining--;
+            } else {
+                zfree(src);
+                addReply(c,shared.syntaxerr);
+                return;
+            }
+        }
+    }
+
+    /* sort sets from the smallest to largest, this will improve our
+     * algorithm's performance */
+    qsort(src,setnum,sizeof(zsetopsrc),qsortCompareZsetopsrcByCardinality);
+
+    dstobj = createZsetObject();
+    dstzset = dstobj->ptr;
+
+    if (op == REDIS_OP_INTER) {
+        /* skip going over all entries if the smallest zset is NULL or empty */
+        if (src[0].dict && dictSize(src[0].dict) > 0) {
+            /* precondition: as src[0].dict is non-empty and the zsets are ordered
+             * from small to large, all src[i > 0].dict are non-empty too */
+            di = dictGetIterator(src[0].dict);
+            while((de = dictNext(di)) != NULL) {
+                double *score = zmalloc(sizeof(double)), value;
+                *score = src[0].weight * zunionInterDictValue(de);
+
+                for (j = 1; j < setnum; j++) {
+                    dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
+                    if (other) {
+                        value = src[j].weight * zunionInterDictValue(other);
+                        zunionInterAggregate(score, value, aggregate);
+                    } else {
+                        break;
+                    }
+                }
+
+                /* skip entry when not present in every source dict */
+                if (j != setnum) {
+                    zfree(score);
+                } else {
+                    robj *o = dictGetEntryKey(de);
+                    dictAdd(dstzset->dict,o,score);
+                    incrRefCount(o); /* added to dictionary */
+                    zslInsert(dstzset->zsl,*score,o);
+                    incrRefCount(o); /* added to skiplist */
+                }
+            }
+            dictReleaseIterator(di);
+        }
+    } else if (op == REDIS_OP_UNION) {
+        for (i = 0; i < setnum; i++) {
+            if (!src[i].dict) continue;
+
+            di = dictGetIterator(src[i].dict);
+            while((de = dictNext(di)) != NULL) {
+                /* skip key when already processed */
+                if (dictFind(dstzset->dict,dictGetEntryKey(de)) != NULL) continue;
+
+                double *score = zmalloc(sizeof(double)), value;
+                *score = src[i].weight * zunionInterDictValue(de);
+
+                /* because the zsets are sorted by size, its only possible
+                 * for sets at larger indices to hold this entry */
+                for (j = (i+1); j < setnum; j++) {
+                    dictEntry *other = dictFind(src[j].dict,dictGetEntryKey(de));
+                    if (other) {
+                        value = src[j].weight * zunionInterDictValue(other);
+                        zunionInterAggregate(score, value, aggregate);
+                    }
+                }
+
+                robj *o = dictGetEntryKey(de);
+                dictAdd(dstzset->dict,o,score);
+                incrRefCount(o); /* added to dictionary */
+                zslInsert(dstzset->zsl,*score,o);
+                incrRefCount(o); /* added to skiplist */
+            }
+            dictReleaseIterator(di);
+        }
+    } else {
+        /* unknown operator */
+        redisAssert(op == REDIS_OP_INTER || op == REDIS_OP_UNION);
+    }
+
+    /* the destination is replaced only after the sources were read, so it
+     * is fine for it to be one of the input keys */
+    dbDelete(c->db,dstkey);
+    if (dstzset->zsl->length) {
+        dbAdd(c->db,dstkey,dstobj);
+        addReplyLongLong(c, dstzset->zsl->length);
+        server.dirty++;
+    } else {
+        decrRefCount(dstobj);
+        addReply(c, shared.czero);
+    }
+    zfree(src);
+}
+
+/* Sorted set union stored into argv[1]. */
+void zunionstoreCommand(redisClient *c) {
+    robj *dstkey = c->argv[1];
+
+    zunionInterGenericCommand(c, dstkey, REDIS_OP_UNION);
+}
+
+/* Sorted set intersection stored into argv[1]. */
+void zinterstoreCommand(redisClient *c) {
+    robj *dstkey = c->argv[1];
+
+    zunionInterGenericCommand(c, dstkey, REDIS_OP_INTER);
+}
+
+/* Common implementation of zrangeCommand and zrevrangeCommand.
+ *
+ * Replies with the elements of the sorted set at argv[1] whose 0-based
+ * rank is between argv[2] and argv[3] (inclusive; negative values count
+ * from the end). With 'reverse' set, ranks are counted from the tail and
+ * the list is walked backward. An optional fifth argument "withscores"
+ * makes every element be followed by its score. */
+void zrangeGenericCommand(redisClient *c, int reverse) {
+    robj *o;
+    long start, end;
+    int withscores = 0;
+    int llen, rangelen;
+    zskiplist *zsl;
+    zskiplistNode *ln;
+
+    if (getLongFromObjectOrReply(c, c->argv[2], &start, NULL) != REDIS_OK) return;
+    if (getLongFromObjectOrReply(c, c->argv[3], &end, NULL) != REDIS_OK) return;
+
+    /* the only extra argument accepted is a single "withscores" */
+    if (c->argc >= 5) {
+        if (c->argc == 5 && !strcasecmp(c->argv[4]->ptr,"withscores")) {
+            withscores = 1;
+        } else {
+            addReply(c,shared.syntaxerr);
+            return;
+        }
+    }
+
+    o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk);
+    if (o == NULL || checkType(c,o,REDIS_ZSET)) return;
+    zsl = ((zset*)o->ptr)->zsl;
+    llen = zsl->length;
+
+    /* negative indexes are relative to the end, clamped at zero */
+    if (start < 0) {
+        start += llen;
+        if (start < 0) start = 0;
+    }
+    if (end < 0) {
+        end += llen;
+        if (end < 0) end = 0;
+    }
+
+    /* Out of range start or start > end result in empty list */
+    if (start > end || start >= llen) {
+        addReply(c,shared.emptymultibulk);
+        return;
+    }
+    if (end >= llen) end = llen-1;
+    rangelen = (end-start)+1;
+
+    /* The first node is trivially known when start is 0, otherwise it is
+     * looked up by its (1-based) rank in log(N) time. */
+    if (start == 0) {
+        ln = reverse ? zsl->tail : zsl->header->forward[0];
+    } else if (reverse) {
+        ln = zslistTypeGetElementByRank(zsl, llen-start);
+    } else {
+        ln = zslistTypeGetElementByRank(zsl, start+1);
+    }
+
+    /* Return the result in form of a multi-bulk reply */
+    addReplySds(c,sdscatprintf(sdsempty(),"*%d\r\n",
+        withscores ? (rangelen*2) : rangelen));
+    while (rangelen--) {
+        addReplyBulk(c,ln->obj);
+        if (withscores)
+            addReplyDouble(c,ln->score);
+        ln = reverse ? ln->backward : ln->forward[0];
+    }
+}
+
+/* Range by rank, from the lowest to the highest score. */
+void zrangeCommand(redisClient *c) {
+    const int reverse = 0;
+
+    zrangeGenericCommand(c, reverse);
+}
+
+/* Range by rank, from the highest to the lowest score. */
+void zrevrangeCommand(redisClient *c) {
+    const int reverse = 1;
+
+    zrangeGenericCommand(c, reverse);
+}
+
+/* This command implements both ZRANGEBYSCORE and ZCOUNT.
+ * If justcount is non-zero, just the count is returned.
+ * argv[1] is the key, argv[2] and argv[3] the min and max scores; the
+ * optional "LIMIT offset count" and trailing "WITHSCORES" arguments are
+ * parsed below. */
+void genericZrangebyscoreCommand(redisClient *c, int justcount) {
+ robj *o;
+ double min, max;
+ int minex = 0, maxex = 0; /* are min or max exclusive? */
+ int offset = 0, limit = -1; /* limit < 0 means no limit */
+ int withscores = 0;
+ int badsyntax = 0;
+
+ /* Parse the min-max interval. If one of the values is prefixed
+ * by the "(" character, it's considered "open". For instance
+ * ZRANGEBYSCORE zset (1.5 (2.5 will match min < x < max
+ * ZRANGEBYSCORE zset 1.5 2.5 will instead match min <= x <= max */
+ /* NOTE(review): strtod() is called with a NULL end pointer, so text that
+ * is not a number is not rejected here. */
+ if (((char*)c->argv[2]->ptr)[0] == '(') {
+ min = strtod((char*)c->argv[2]->ptr+1,NULL);
+ minex = 1;
+ } else {
+ min = strtod(c->argv[2]->ptr,NULL);
+ }
+ if (((char*)c->argv[3]->ptr)[0] == '(') {
+ max = strtod((char*)c->argv[3]->ptr+1,NULL);
+ maxex = 1;
+ } else {
+ max = strtod(c->argv[3]->ptr,NULL);
+ }
+
+ /* Parse "WITHSCORES": note that if the command was called with
+ * the name ZCOUNT then we are sure that c->argc == 4, so we'll never
+ * enter the following paths to parse WITHSCORES and LIMIT. */
+ if (c->argc == 5 || c->argc == 8) {
+ if (strcasecmp(c->argv[c->argc-1]->ptr,"withscores") == 0)
+ withscores = 1;
+ else
+ badsyntax = 1;
+ }
+ /* valid forms have 4 or 7 arguments, plus one when WITHSCORES is given */
+ if (c->argc != (4 + withscores) && c->argc != (7 + withscores))
+ badsyntax = 1;
+ if (badsyntax) {
+ addReplySds(c,
+ sdsnew("-ERR wrong number of arguments for ZRANGEBYSCORE\r\n"));
+ return;
+ }
+
+ /* Parse "LIMIT" */
+ if (c->argc == (7 + withscores) && strcasecmp(c->argv[4]->ptr,"limit")) {
+ addReply(c,shared.syntaxerr);
+ return;
+ } else if (c->argc == (7 + withscores)) {
+ offset = atoi(c->argv[5]->ptr);
+ limit = atoi(c->argv[6]->ptr);
+ if (offset < 0) offset = 0;
+ }
+
+ /* Ok, lookup the key and get the range */
+ o = lookupKeyRead(c->db,c->argv[1]);
+ if (o == NULL) {
+ addReply(c,justcount ? shared.czero : shared.emptymultibulk);
+ } else {
+ if (o->type != REDIS_ZSET) {
+ addReply(c,shared.wrongtypeerr);
+ } else {
+ zset *zsetobj = o->ptr;
+ zskiplist *zsl = zsetobj->zsl;
+ zskiplistNode *ln;
+ robj *ele, *lenobj = NULL;
+ unsigned long rangelen = 0;
+
+ /* Get the first node with the score >= min, or with
+ * score > min if 'minex' is true. */
+ ln = zslFirstWithScore(zsl,min);
+ while (minex && ln && ln->score == min) ln = ln->forward[0];
+
+ if (ln == NULL) {
+ /* No element matching the specified interval */
+ addReply(c,justcount ? shared.czero : shared.emptymultibulk);
+ return;
+ }
+
+ /* We don't know in advance how many matching elements there
+ * are in the list, so we push this object that will represent
+ * the multi-bulk length in the output buffer, and will "fix"
+ * it later */
+ if (!justcount) {
+ lenobj = createObject(REDIS_STRING,NULL);
+ addReply(c,lenobj);
+ decrRefCount(lenobj);
+ }
+
+ while(ln && (maxex ? (ln->score < max) : (ln->score <= max))) {
+ /* skip the first 'offset' matching elements */
+ if (offset) {
+ offset--;
+ ln = ln->forward[0];
+ continue;
+ }
+ if (limit == 0) break;
+ if (!justcount) {
+ ele = ln->obj;
+ addReplyBulk(c,ele);
+ if (withscores)
+ addReplyDouble(c,ln->score);
+ }
+ ln = ln->forward[0];
+ rangelen++;
+ if (limit > 0) limit--;
+ }
+ if (justcount) {
+ addReplyLongLong(c,(long)rangelen);
+ } else {
+ /* fill in the multi-bulk length placeholder queued above */
+ lenobj->ptr = sdscatprintf(sdsempty(),"*%lu\r\n",
+ withscores ? (rangelen*2) : rangelen);
+ }
+ }
+ }
+}
+
+/* Range by score returning the matching elements. */
+void zrangebyscoreCommand(redisClient *c) {
+    const int justcount = 0;
+
+    genericZrangebyscoreCommand(c, justcount);
+}
+
+/* Range by score returning only the number of matching elements. */
+void zcountCommand(redisClient *c) {
+    const int justcount = 1;
+
+    genericZrangebyscoreCommand(c, justcount);
+}
+
+/* Reply with the number of elements of the sorted set at argv[1], read
+ * from the skiplist length; a missing key yields shared.czero. */
+void zcardCommand(redisClient *c) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.czero);
+    zset *zs;
+
+    if (o == NULL) return;
+    if (checkType(c,o,REDIS_ZSET)) return;
+
+    zs = o->ptr;
+    addReplyUlong(c,zs->zsl->length);
+}
+
+/* Reply with the score of element argv[2] in the sorted set at argv[1],
+ * or with a null bulk when the key or the element does not exist. */
+void zscoreCommand(redisClient *c) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    zset *zs;
+    dictEntry *entry;
+    double *stored;
+
+    if (o == NULL || checkType(c,o,REDIS_ZSET)) return;
+    zs = o->ptr;
+
+    entry = dictFind(zs->dict,c->argv[2]);
+    if (entry == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    stored = dictGetEntryVal(entry);
+    addReplyDouble(c,*stored);
+}
+
+/* Common implementation of zrankCommand and zrevrankCommand: reply with
+ * the 0-based rank of element argv[2] in the sorted set at argv[1],
+ * counted from the head or, with 'reverse' set, from the tail. A null
+ * bulk is replied when the key or the element does not exist. */
+void zrankGenericCommand(redisClient *c, int reverse) {
+    robj *o = lookupKeyReadOrReply(c,c->argv[1],shared.nullbulk);
+    zset *zs;
+    zskiplist *zsl;
+    dictEntry *entry;
+    double *stored;
+    unsigned long rank;
+
+    if (o == NULL || checkType(c,o,REDIS_ZSET)) return;
+    zs = o->ptr;
+    zsl = zs->zsl;
+
+    /* the score is needed to locate the element in the skiplist */
+    entry = dictFind(zs->dict,c->argv[2]);
+    if (entry == NULL) {
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    stored = dictGetEntryVal(entry);
+
+    rank = zslistTypeGetRank(zsl, *stored, c->argv[2]);
+    if (rank == 0) {
+        /* zero means "not found": valid ranks are 1-based */
+        addReply(c,shared.nullbulk);
+        return;
+    }
+    if (reverse) {
+        addReplyLongLong(c, zsl->length - rank);
+    } else {
+        addReplyLongLong(c, rank-1);
+    }
+}
+
+/* Rank of an element counted from the lowest score. */
+void zrankCommand(redisClient *c) {
+    const int reverse = 0;
+
+    zrankGenericCommand(c, reverse);
+}
+
+/* Rank of an element counted from the highest score. */
+void zrevrankCommand(redisClient *c) {
+    const int reverse = 1;
+
+    zrankGenericCommand(c, reverse);
+}
diff --git a/src/util.c b/src/util.c
new file mode 100644
index 000000000..cc2794f6a
--- /dev/null
+++ b/src/util.c
@@ -0,0 +1,223 @@
+#include "redis.h"
+#include <ctype.h>
+#include <limits.h>
+
+/* Glob-style pattern matching.
+ *
+ * Matches the first 'stringLen' bytes of 'string' against the first
+ * 'patternLen' bytes of 'pattern'. Supported syntax:
+ *
+ *   *        any sequence of characters (including the empty one)
+ *   ?        exactly one character
+ *   [abc]    one character among the listed ones
+ *   [^abc]   one character not among the listed ones
+ *   [a-z]    one character in the (inclusive) range
+ *   \x       the character x taken literally
+ *
+ * If 'nocase' is non zero the comparison is case insensitive.
+ * Returns 1 on match, 0 otherwise.
+ *
+ * Neither buffer needs to be nul terminated: every read is bounded by the
+ * corresponding length, and constructs that consume a character never
+ * match against an exhausted string. */
+int stringmatchlen(const char *pattern, int patternLen,
+        const char *string, int stringLen, int nocase)
+{
+    while(patternLen) {
+        switch(pattern[0]) {
+        case '*':
+            /* Collapse runs of '*', never looking past the pattern end. */
+            while (patternLen > 1 && pattern[1] == '*') {
+                pattern++;
+                patternLen--;
+            }
+            if (patternLen == 1)
+                return 1; /* match */
+            while(stringLen) {
+                if (stringmatchlen(pattern+1, patternLen-1,
+                            string, stringLen, nocase))
+                    return 1; /* match */
+                string++;
+                stringLen--;
+            }
+            return 0; /* no match */
+            break;
+        case '?':
+            if (stringLen == 0)
+                return 0; /* no match */
+            string++;
+            stringLen--;
+            break;
+        case '[':
+        {
+            int not, match;
+
+            /* A character class always consumes one character: with an
+             * empty string there is nothing to test (and string[0] must
+             * not be read). */
+            if (stringLen == 0)
+                return 0; /* no match */
+            pattern++;
+            patternLen--;
+            not = patternLen && pattern[0] == '^';
+            if (not) {
+                pattern++;
+                patternLen--;
+            }
+            match = 0;
+            while(1) {
+                if (patternLen == 0) {
+                    /* Unterminated class: step back so that the common
+                     * advance after the switch lands on the pattern end. */
+                    pattern--;
+                    patternLen++;
+                    break;
+                } else if (patternLen >= 2 && pattern[0] == '\\') {
+                    /* Escaped character. A trailing backslash (no
+                     * following byte) is handled as a literal below. */
+                    pattern++;
+                    patternLen--;
+                    if (pattern[0] == string[0])
+                        match = 1;
+                } else if (pattern[0] == ']') {
+                    break;
+                } else if (patternLen >= 3 && pattern[1] == '-') {
+                    int start = pattern[0];
+                    int end = pattern[2];
+                    int c = string[0];
+                    if (start > end) {
+                        int t = start;
+                        start = end;
+                        end = t;
+                    }
+                    if (nocase) {
+                        start = tolower(start);
+                        end = tolower(end);
+                        c = tolower(c);
+                    }
+                    pattern += 2;
+                    patternLen -= 2;
+                    if (c >= start && c <= end)
+                        match = 1;
+                } else {
+                    if (!nocase) {
+                        if (pattern[0] == string[0])
+                            match = 1;
+                    } else {
+                        if (tolower((int)pattern[0]) == tolower((int)string[0]))
+                            match = 1;
+                    }
+                }
+                pattern++;
+                patternLen--;
+            }
+            if (not)
+                match = !match;
+            if (!match)
+                return 0; /* no match */
+            string++;
+            stringLen--;
+            break;
+        }
+        case '\\':
+            if (patternLen >= 2) {
+                pattern++;
+                patternLen--;
+            }
+            /* fall through */
+        default:
+            /* A literal needs one character to compare against. */
+            if (stringLen == 0)
+                return 0; /* no match */
+            if (!nocase) {
+                if (pattern[0] != string[0])
+                    return 0; /* no match */
+            } else {
+                if (tolower((int)pattern[0]) != tolower((int)string[0]))
+                    return 0; /* no match */
+            }
+            string++;
+            stringLen--;
+            break;
+        }
+        pattern++;
+        patternLen--;
+        if (stringLen == 0) {
+            /* String exhausted: only trailing '*' may remain. */
+            while(patternLen && *pattern == '*') {
+                pattern++;
+                patternLen--;
+            }
+            break;
+        }
+    }
+    if (patternLen == 0 && stringLen == 0)
+        return 1;
+    return 0;
+}
+
+/* Convenience wrapper around stringmatchlen() for nul-terminated pattern
+ * and string: lengths are computed with strlen(). */
+int stringmatch(const char *pattern, const char *string, int nocase) {
+ return stringmatchlen(pattern,strlen(pattern),string,strlen(string),nocase);
+}
+
+/* Convert a string representing an amount of memory into the number of
+ * bytes, so for instance memtoll("1gb") will return 1073741824 that is
+ * (1024*1024*1024).
+ *
+ * The string is an optional '-' sign followed by decimal digits and an
+ * optional, case insensitive unit:
+ *
+ *   (none) or b -> 1
+ *   k  -> 1000          kb -> 1024
+ *   m  -> 1000*1000     mb -> 1024*1024
+ *   g  -> 1000^3        gb -> 1024^3
+ *
+ * If 'err' is not NULL, *err is set to 1 on parsing error and to 0
+ * otherwise. With an unknown unit the numeric prefix is returned as is
+ * (multiplier 1); when the numeric part does not fit the internal buffer
+ * LLONG_MAX is returned. */
+long long memtoll(const char *p, int *err) {
+    const char *u;
+    char buf[128];
+    long mul; /* unit multiplier */
+    long long val;
+    unsigned int digits;
+
+    if (err) *err = 0;
+    /* Search the first non digit character. */
+    u = p;
+    if (*u == '-') u++;
+    /* isdigit() is only defined for values representable as unsigned char
+     * (or EOF), so never pass a possibly negative plain char. */
+    while(*u && isdigit((unsigned char)*u)) u++;
+    if (*u == '\0' || !strcasecmp(u,"b")) {
+        mul = 1;
+    } else if (!strcasecmp(u,"k")) {
+        mul = 1000;
+    } else if (!strcasecmp(u,"kb")) {
+        mul = 1024;
+    } else if (!strcasecmp(u,"m")) {
+        mul = 1000*1000;
+    } else if (!strcasecmp(u,"mb")) {
+        mul = 1024*1024;
+    } else if (!strcasecmp(u,"g")) {
+        mul = 1000L*1000*1000;
+    } else if (!strcasecmp(u,"gb")) {
+        mul = 1024L*1024*1024;
+    } else {
+        if (err) *err = 1;
+        mul = 1;
+    }
+    /* Length of the numeric part, sign included. */
+    digits = u-p;
+    if (digits >= sizeof(buf)) {
+        if (err) *err = 1;
+        return LLONG_MAX;
+    }
+    memcpy(buf,p,digits);
+    buf[digits] = '\0';
+    val = strtoll(buf,NULL,10);
+    return val*mul;
+}
+
+/* Convert a long long into a string.
+ *
+ * At most len-1 characters plus a nul terminator are written to 's'; if
+ * the buffer is too small the leading characters of the representation
+ * are kept. Returns the number of characters actually stored (terminator
+ * excluded), so the result is shorter than the full representation when
+ * the output was truncated. With len == 0 nothing is written and 0 is
+ * returned. */
+int ll2string(char *s, size_t len, long long value) {
+    char buf[32], *p;
+    unsigned long long v;
+    size_t l;
+
+    if (len == 0) return 0;
+    /* Negate in the unsigned domain: -value would overflow (undefined
+     * behavior) when value is LLONG_MIN. */
+    v = (value < 0) ? 0ULL - (unsigned long long)value
+                    : (unsigned long long)value;
+    p = buf+31; /* point to the last character */
+    do {
+        *p-- = '0'+(v%10);
+        v /= 10;
+    } while(v);
+    if (value < 0) *p-- = '-';
+    p++;
+    l = 32-(p-buf);
+    if (l+1 > len) l = len-1; /* Make sure it fits, including the nul term */
+    memcpy(s,p,l);
+    s[l] = '\0';
+    return l;
+}
+
+/* Tell whether the sds string 's' is exactly the decimal representation
+ * of a long: no leading or trailing characters, no alternative spellings.
+ *
+ * The check is done by parsing the string with strtol() and converting
+ * the result back with ll2string(): the string qualifies only if the
+ * round trip reproduces it byte for byte.
+ *
+ * Returns REDIS_OK and stores the number in *longval (when 'longval' is
+ * not NULL) on success, REDIS_ERR otherwise. */
+int isStringRepresentableAsLong(sds s, long *longval) {
+    char roundtrip[32], *end;
+    long parsed;
+    int rtlen;
+
+    parsed = strtol(s, &end, 10);
+    if (*end != '\0') return REDIS_ERR;
+
+    /* Convert back and compare with the original string. */
+    rtlen = ll2string(roundtrip,32,parsed);
+    if (sdslen(s) != (unsigned)rtlen) return REDIS_ERR;
+    if (memcmp(roundtrip,s,rtlen) != 0) return REDIS_ERR;
+
+    if (longval != NULL) *longval = parsed;
+    return REDIS_OK;
+}
diff --git a/src/version.h b/src/version.h
new file mode 100644
index 000000000..86d422474
--- /dev/null
+++ b/src/version.h
@@ -0,0 +1 @@
+/* Redis version string. */
+#define REDIS_VERSION "2.1.1"
diff --git a/src/vm.c b/src/vm.c
new file mode 100644
index 000000000..1aaa57eb5
--- /dev/null
+++ b/src/vm.c
@@ -0,0 +1,1126 @@
+#include "redis.h"
+
+#include <fcntl.h>
+#include <pthread.h>
+#include <math.h>
+#include <signal.h>
+
+/* Virtual Memory is composed mainly of two subsystems:
+ * - Blocking Virtual Memory
+ * - Threaded Virtual Memory I/O
+ * The two parts are not fully decoupled, but functions are split among two
+ * different sections of the source code (delimited by comments) in order to
+ * make more clear what functionality is about the blocking VM and what about
+ * the threaded (not blocking) VM.
+ *
+ * Redis VM design:
+ *
+ * Redis VM is a blocking VM (one that blocks reading swapped values from
+ * disk into memory when a value swapped out is needed in memory) that is made
+ * unblocking by trying to examine the command argument vector in order to
+ * load in background values that will likely be needed in order to exec
+ * the command. The command is executed only once all the relevant keys
+ * are loaded into memory.
+ *
+ * This design is almost as simple as a blocking VM, yet almost as parallel
+ * as a fully non-blocking VM.
+ */
+
+/* =================== Virtual Memory - Blocking Side ====================== */
+
+/* Allocate a VM pointer object, the placeholder stored in the key -> value
+ * hash table in place of a swapped out value.
+ *
+ * The new object has type REDIS_VMPOINTER, storage REDIS_VM_SWAPPED and
+ * remembers in 'vtype' the type of the value it replaces. The other fields
+ * are not set here. */
+vmpointer *createVmPointer(int vtype) {
+    vmpointer *ptr;
+
+    ptr = zmalloc(sizeof(*ptr));
+    ptr->vtype = vtype;
+    ptr->storage = REDIS_VM_SWAPPED;
+    ptr->type = REDIS_VMPOINTER;
+    return ptr;
+}
+
+/* Initialize the Virtual Memory subsystem.
+ *
+ * Steps, in order:
+ *  - enable thread safe zmalloc() if I/O threads are configured;
+ *  - open (or create) the swap file and take a write lock on it;
+ *  - reset the VM counters and size the swap file with ftruncate();
+ *  - allocate and clear the page bitmap (one bit per page);
+ *  - create the threaded I/O queues, mutexes and the notification pipe;
+ *  - prepare the thread attributes with a large enough stack;
+ *  - register the read side of the pipe in the event loop.
+ *
+ * Failures opening, locking or truncating the swap file, or creating the
+ * pipe, are logged and terminate the process with exit(1). */
+void vmInit(void) {
+    off_t totsize;
+    int pipefds[2];
+    size_t stacksize;
+    struct flock fl;
+
+    if (server.vm_max_threads != 0)
+        zmalloc_enable_thread_safeness(); /* we need thread safe zmalloc() */
+
+    redisLog(REDIS_NOTICE,"Using '%s' as swap file",server.vm_swap_file);
+    /* Try to open the old swap file, otherwise create it */
+    if ((server.vm_fp = fopen(server.vm_swap_file,"r+b")) == NULL) {
+        server.vm_fp = fopen(server.vm_swap_file,"w+b");
+    }
+    if (server.vm_fp == NULL) {
+        redisLog(REDIS_WARNING,
+            "Can't open the swap file: %s. Exiting.",
+            strerror(errno));
+        exit(1);
+    }
+    server.vm_fd = fileno(server.vm_fp);
+    /* Lock the swap file for writing, this is useful in order to avoid
+     * another instance to use the same swap file for a config error. */
+    fl.l_type = F_WRLCK;
+    fl.l_whence = SEEK_SET;
+    fl.l_start = fl.l_len = 0;
+    if (fcntl(server.vm_fd,F_SETLK,&fl) == -1) {
+        redisLog(REDIS_WARNING,
+            "Can't lock the swap file at '%s': %s. Make sure it is not used by another Redis instance.", server.vm_swap_file, strerror(errno));
+        exit(1);
+    }
+    /* Initialize */
+    server.vm_next_page = 0;
+    server.vm_near_pages = 0;
+    server.vm_stats_used_pages = 0;
+    server.vm_stats_swapped_objects = 0;
+    server.vm_stats_swapouts = 0;
+    server.vm_stats_swapins = 0;
+    totsize = server.vm_pages*server.vm_page_size;
+    /* off_t is not guaranteed to be a long long: cast explicitly for %lld. */
+    redisLog(REDIS_NOTICE,"Allocating %lld bytes of swap file",
+        (long long) totsize);
+    if (ftruncate(server.vm_fd,totsize) == -1) {
+        redisLog(REDIS_WARNING,"Can't ftruncate swap file: %s. Exiting.",
+            strerror(errno));
+        exit(1);
+    } else {
+        redisLog(REDIS_NOTICE,"Swap file allocated with success");
+    }
+    server.vm_bitmap = zmalloc((server.vm_pages+7)/8);
+    redisLog(REDIS_VERBOSE,"Allocated %lld bytes page table for %lld pages",
+        (long long) (server.vm_pages+7)/8, (long long) server.vm_pages);
+    memset(server.vm_bitmap,0,(server.vm_pages+7)/8);
+
+    /* Initialize threaded I/O (used by Virtual Memory) */
+    server.io_newjobs = listCreate();
+    server.io_processing = listCreate();
+    server.io_processed = listCreate();
+    server.io_ready_clients = listCreate();
+    pthread_mutex_init(&server.io_mutex,NULL);
+    pthread_mutex_init(&server.obj_freelist_mutex,NULL);
+    pthread_mutex_init(&server.io_swapfile_mutex,NULL);
+    server.io_active_threads = 0;
+    if (pipe(pipefds) == -1) {
+        redisLog(REDIS_WARNING,"Unable to intialized VM: pipe(2): %s. Exiting."
+            ,strerror(errno));
+        exit(1);
+    }
+    server.io_ready_pipe_read = pipefds[0];
+    server.io_ready_pipe_write = pipefds[1];
+    redisAssert(anetNonBlock(NULL,server.io_ready_pipe_read) != ANET_ERR);
+    /* LZF requires a lot of stack */
+    pthread_attr_init(&server.io_threads_attr);
+    pthread_attr_getstacksize(&server.io_threads_attr, &stacksize);
+    /* If the reported default is zero the doubling loop below would never
+     * terminate: start from a non zero value. */
+    if (stacksize == 0) stacksize = 1;
+    while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize *= 2;
+    pthread_attr_setstacksize(&server.io_threads_attr, stacksize);
+    /* Listen for events in the threaded I/O pipe */
+    if (aeCreateFileEvent(server.el, server.io_ready_pipe_read, AE_READABLE,
+        vmThreadedIOCompletedJob, NULL) == AE_ERR)
+        oom("creating file event");
+}
+
+/* Set the bit of 'page' in the swap file bitmap, flagging it as used.
+ * The page must currently be free (asserted). */
+void vmMarkPageUsed(off_t page) {
+    off_t idx = page/8;   /* byte of the bitmap holding the page bit */
+    int shift = page&7;   /* position of the bit inside that byte */
+
+    redisAssert(vmFreePage(page) == 1);
+    server.vm_bitmap[idx] |= 1<<shift;
+}
+
+/* Flag as used 'count' contiguous pages starting at 'page', and add them
+ * to the used pages counter. */
+void vmMarkPagesUsed(off_t page, off_t count) {
+    off_t cur = page, end = page+count;
+
+    while (cur < end) vmMarkPageUsed(cur++);
+    server.vm_stats_used_pages += count;
+    redisLog(REDIS_DEBUG,"Mark USED pages: %lld pages at %lld\n",
+        (long long)count, (long long)page);
+}
+
+/* Clear the bit of 'page' in the swap file bitmap, flagging it as free.
+ * The page must currently be in use (asserted). */
+void vmMarkPageFree(off_t page) {
+    off_t idx = page/8;   /* byte of the bitmap holding the page bit */
+    int shift = page&7;   /* position of the bit inside that byte */
+
+    redisAssert(vmFreePage(page) == 0);
+    server.vm_bitmap[idx] &= ~(1<<shift);
+}
+
+/* Flag as free 'count' contiguous pages starting at 'page', and subtract
+ * them from the used pages counter. */
+void vmMarkPagesFree(off_t page, off_t count) {
+    off_t cur = page, end = page+count;
+
+    while (cur < end) vmMarkPageFree(cur++);
+    server.vm_stats_used_pages -= count;
+    redisLog(REDIS_DEBUG,"Mark FREE pages: %lld pages at %lld\n",
+        (long long)count, (long long)page);
+}
+
+/* Return 1 if the bit of 'page' is clear in the swap file bitmap (page
+ * free), 0 if it is set (page in use). */
+int vmFreePage(off_t page) {
+    int used = server.vm_bitmap[page/8] & (1<<(page&7));
+
+    return used == 0;
+}
+
+/* Find N contiguous free pages storing the first page of the cluster in *first.
+ * Returns REDIS_OK if it was able to find N contiguous pages, otherwise
+ * REDIS_ERR is returned.
+ *
+ * On success server.vm_next_page is set to the page following the cluster,
+ * so that the next search starts from there. The pages are NOT marked as
+ * used by this function.
+ *
+ * This function uses a simple algorithm: we try to allocate
+ * REDIS_VM_MAX_NEAR_PAGES sequentially, when we reach this limit we start
+ * again from the start of the swap file searching for free spaces.
+ *
+ * If it looks pretty clear that there are no free pages near our offset
+ * we try to find less populated places doing a forward jump of
+ * REDIS_VM_MAX_RANDOM_JUMP, then we start scanning again a few pages
+ * without hurry, and then we jump again and so forth...
+ *
+ * This function can be improved using a free list to avoid to guess
+ * too much, since we could collect data about freed pages.
+ *
+ * note: I implemented this function just after watching an episode of
+ * Battlestar Galactica, where the hybrid was continuing to say "JUMP!"
+ */
+int vmFindContiguousPages(off_t *first, off_t n) {
+ off_t base, offset = 0, since_jump = 0, numfree = 0;
+
+ /* After REDIS_VM_MAX_NEAR_PAGES consecutive searches restart from the
+ * beginning of the swap file. */
+ if (server.vm_near_pages == REDIS_VM_MAX_NEAR_PAGES) {
+ server.vm_near_pages = 0;
+ server.vm_next_page = 0;
+ }
+ server.vm_near_pages++; /* Yet another try for pages near to the old ones */
+ base = server.vm_next_page;
+
+ /* 'offset' is the distance from 'base': the scan gives up once it has
+ * advanced by the whole swap file size. */
+ while(offset < server.vm_pages) {
+ off_t this = base+offset;
+
+ /* If we overflow, restart from page zero */
+ if (this >= server.vm_pages) {
+ this -= server.vm_pages;
+ if (this == 0) {
+ /* Just overflowed, what we found on tail is no longer
+ * interesting, as it's no longer contiguous. */
+ numfree = 0;
+ }
+ }
+ if (vmFreePage(this)) {
+ /* This is a free page */
+ numfree++;
+ /* Already got N free pages? Return to the caller, with success */
+ if (numfree == n) {
+ *first = this-(n-1);
+ server.vm_next_page = this+1;
+ redisLog(REDIS_DEBUG, "FOUND CONTIGUOUS PAGES: %lld pages at %lld\n", (long long) n, (long long) *first);
+ return REDIS_OK;
+ }
+ } else {
+ /* The current one is not a free page */
+ numfree = 0;
+ }
+
+ /* Fast-forward if the current page is not free and we already
+ * searched enough near this place. */
+ since_jump++;
+ if (!numfree && since_jump >= REDIS_VM_MAX_RANDOM_JUMP/4) {
+ offset += random() % REDIS_VM_MAX_RANDOM_JUMP;
+ since_jump = 0;
+ /* Note that even if we rewind after the jump, we don't need
+ * to make sure numfree is set to zero as we only jump *if* it
+ * is set to zero. */
+ } else {
+ /* Otherwise just check the next page */
+ offset++;
+ }
+ }
+ return REDIS_ERR;
+}
+
+/* Serialize object 'o' into the swap file starting at page 'page'.
+ *
+ * When VM is enabled the swap file mutex is held while seeking and
+ * writing. Returns REDIS_ERR (after logging) if the seek fails, REDIS_OK
+ * otherwise.
+ *
+ * NOTE(review): the results of rdbSaveObject() and fflush() are not
+ * checked here -- confirm whether write errors should be reported. */
+int vmWriteObjectOnSwap(robj *o, off_t page) {
+    int seekres;
+
+    if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
+    seekres = fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET);
+    if (seekres != -1) {
+        rdbSaveObject(server.vm_fp,o);
+        fflush(server.vm_fp);
+        if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+        return REDIS_OK;
+    }
+
+    /* Seek failed: release the lock, then report. */
+    if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+    redisLog(REDIS_WARNING,
+        "Critical VM problem in vmWriteObjectOnSwap(): can't seek: %s",
+        strerror(errno));
+    return REDIS_ERR;
+}
+
+/* Synchronously move the object 'val' from memory to the swap file.
+ *
+ * The object must be in memory (storage REDIS_VM_MEMORY) and not shared
+ * (refcount 1). On success the object reference is dropped, the pages are
+ * marked as used, and a newly created 'vmpointer' holding what is needed
+ * to load the object back (first page, number of pages, value type) is
+ * returned.
+ *
+ * NULL is returned, leaving 'val' untouched, if there are not enough
+ * contiguous free pages or the write on the swap file fails. */
+vmpointer *vmSwapObjectBlocking(robj *val) {
+    off_t npages, firstpage;
+    vmpointer *ptr;
+
+    npages = rdbSavedObjectPages(val,NULL);
+    redisAssert(val->storage == REDIS_VM_MEMORY);
+    redisAssert(val->refcount == 1);
+    if (vmFindContiguousPages(&firstpage,npages) == REDIS_ERR)
+        return NULL;
+    if (vmWriteObjectOnSwap(val,firstpage) == REDIS_ERR)
+        return NULL;
+
+    ptr = createVmPointer(val->type);
+    ptr->page = firstpage;
+    ptr->usedpages = npages;
+    decrRefCount(val); /* Deallocate the object from memory. */
+    vmMarkPagesUsed(firstpage,npages);
+    redisLog(REDIS_DEBUG,"VM: object %p swapped out at %lld (%lld pages)",
+        (void*) val,
+        (unsigned long long) firstpage, (unsigned long long) npages);
+    server.vm_stats_swapped_objects++;
+    server.vm_stats_swapouts++;
+    return ptr;
+}
+
+/* Read from the swap file the object of type 'type' stored starting at
+ * page 'page', and return it.
+ *
+ * When VM is enabled the swap file mutex is held for the whole operation.
+ * Both a failed seek and a failed load are considered unrecoverable: the
+ * problem is logged and the process terminated with _exit(1). */
+robj *vmReadObjectFromSwap(off_t page, int type) {
+    robj *loaded;
+
+    if (server.vm_enabled) pthread_mutex_lock(&server.io_swapfile_mutex);
+
+    if (fseeko(server.vm_fp,page*server.vm_page_size,SEEK_SET) == -1) {
+        redisLog(REDIS_WARNING,
+            "Unrecoverable VM problem in vmReadObjectFromSwap(): can't seek: %s",
+            strerror(errno));
+        _exit(1);
+    }
+
+    loaded = rdbLoadObject(type,server.vm_fp);
+    if (!loaded) {
+        redisLog(REDIS_WARNING, "Unrecoverable VM problem in vmReadObjectFromSwap(): can't load object from swap file: %s", strerror(errno));
+        _exit(1);
+    }
+
+    if (server.vm_enabled) pthread_mutex_unlock(&server.io_swapfile_mutex);
+    return loaded;
+}
+
+/* Read back into memory the object referenced by the VM pointer 'vp' and
+ * return the newly allocated object.
+ *
+ * With 'preview' false this is a real swap in: the pages are marked as
+ * free, 'vp' is released and the swapped objects counter is decremented.
+ * With 'preview' true the object is only unserialized for the caller:
+ * pages and 'vp' are left untouched.
+ *
+ * The swap ins counter is incremented in both cases. */
+robj *vmGenericLoadObject(vmpointer *vp, int preview) {
+    robj *loaded;
+
+    redisAssert(vp->type == REDIS_VMPOINTER &&
+        (vp->storage == REDIS_VM_SWAPPED || vp->storage == REDIS_VM_LOADING));
+    loaded = vmReadObjectFromSwap(vp->page,vp->vtype);
+    if (preview) {
+        redisLog(REDIS_DEBUG, "VM: object %p previewed from disk", (void*)vp);
+    } else {
+        redisLog(REDIS_DEBUG, "VM: object %p loaded from disk", (void*)vp);
+        vmMarkPagesFree(vp->page,vp->usedpages);
+        zfree(vp);
+        server.vm_stats_swapped_objects--;
+    }
+    server.vm_stats_swapins++;
+    return loaded;
+}
+
+/* Synchronous swap in of an object.
+ *
+ * 'o' is actually a VM pointer: it is released by the call and the loaded
+ * object is returned. If a background load of the same object is in
+ * progress (storage REDIS_VM_LOADING) the threaded job is canceled first,
+ * as the object is needed right now. */
+robj *vmLoadObject(robj *o) {
+    vmpointer *vp = (vmpointer*)o;
+
+    if (o->storage == REDIS_VM_LOADING) {
+        vmCancelThreadedIOJob(o);
+    }
+    return vmGenericLoadObject(vp,0);
+}
+
+/* Just load the value from disk, without modifying the key.
+ * This is useful when we want to perform some operation on the value
+ * without really bringing it from swap to memory, like while saving the
+ * dataset or rewriting the append only log.
+ *
+ * 'o' is cast to a vmpointer and passed to vmGenericLoadObject() with
+ * preview = 1; the unserialized object is returned. */
+robj *vmPreviewObject(robj *o) {
+ return vmGenericLoadObject((vmpointer*)o,1);
+}
+
+/* How good a candidate is this object for swapping?
+ * The better candidate it is, the greater the returned value.
+ *
+ * Currently we try to perform a fast estimation of the object size in
+ * memory, and combine it with aging information:
+ *
+ * swappability = idle-time * log(1 + estimated size)
+ *
+ * Bigger objects are preferred over smaller objects, but not
+ * proportionally, this is why we use the logarithm. This algorithm is
+ * just a first try and will probably be tuned later.
+ *
+ * For aggregate types the size is extrapolated from a single element
+ * (the first list node, a random dictionary entry, the first zipmap
+ * pair) multiplied by the number of elements. Zero is returned when the
+ * computed age is zero. */
+double computeObjectSwappability(robj *o) {
+    /* actual age can be >= minage, but not < minage. As we use wrapping
+     * 21 bit clocks with minutes resolution for the LRU. */
+    time_t minage = abs(server.lruclock - o->lru);
+    long asize = 0, elesize;
+    robj *ele;
+    list *l;
+    listNode *ln;
+    dict *d;
+    struct dictEntry *de;
+    int z;
+
+    if (minage <= 0) return 0;
+    switch(o->type) {
+    case REDIS_STRING:
+        if (o->encoding != REDIS_ENCODING_RAW) {
+            asize = sizeof(*o);
+        } else {
+            asize = sdslen(o->ptr)+sizeof(*o)+sizeof(long)*2;
+        }
+        break;
+    case REDIS_LIST:
+        if (o->encoding == REDIS_ENCODING_ZIPLIST) {
+            asize = sizeof(*o)+ziplistSize(o->ptr);
+        } else {
+            l = o->ptr;
+            ln = listFirst(l);
+            asize = sizeof(list);
+            if (ln) {
+                ele = ln->value;
+                elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                    (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+                asize += (sizeof(listNode)+elesize)*listLength(l);
+            }
+        }
+        break;
+    case REDIS_SET:
+    case REDIS_ZSET:
+        z = (o->type == REDIS_ZSET);
+        d = z ? ((zset*)o->ptr)->dict : o->ptr;
+
+        asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+        if (z) asize += sizeof(zset)-sizeof(dict);
+        if (dictSize(d)) {
+            de = dictGetRandomKey(d);
+            ele = dictGetEntryKey(de);
+            elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+            asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
+            if (z) asize += sizeof(zskiplistNode)*dictSize(d);
+        }
+        break;
+    case REDIS_HASH:
+        if (o->encoding == REDIS_ENCODING_ZIPMAP) {
+            unsigned char *p = zipmapRewind((unsigned char*)o->ptr);
+            unsigned int len = zipmapLen((unsigned char*)o->ptr);
+            unsigned int klen, vlen;
+            unsigned char *key, *val;
+
+            if ((p = zipmapNext(p,&key,&klen,&val,&vlen)) == NULL) {
+                klen = 0;
+                vlen = 0;
+            }
+            asize = len*(klen+vlen+3);
+        } else if (o->encoding == REDIS_ENCODING_HT) {
+            d = o->ptr;
+            asize = sizeof(dict)+(sizeof(struct dictEntry*)*dictSlots(d));
+            if (dictSize(d)) {
+                de = dictGetRandomKey(d);
+                /* Every entry holds both a field and a value object:
+                 * the estimation must sum the two sizes. */
+                ele = dictGetEntryKey(de);
+                elesize = (ele->encoding == REDIS_ENCODING_RAW) ?
+                    (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+                ele = dictGetEntryVal(de);
+                elesize += (ele->encoding == REDIS_ENCODING_RAW) ?
+                    (sizeof(*o)+sdslen(ele->ptr)) : sizeof(*o);
+                asize += (sizeof(struct dictEntry)+elesize)*dictSize(d);
+            }
+        }
+        break;
+    }
+    return (double)minage*log(1+asize);
+}
+
+/* Try to swap an object that's a good candidate for swapping.
+ * Returns REDIS_OK if the object was swapped, REDIS_ERR if it's not possible
+ * to swap any object at all.
+ *
+ * If 'usethreads' is true, Redis will try to swap the object in background
+ * using I/O threads (in this case REDIS_OK only means the job was
+ * submitted via vmSwapObjectThreaded()); otherwise the object is swapped
+ * synchronously and replaced in the dictionary by its VM pointer.
+ *
+ * Candidates are picked sampling random keys from every non empty
+ * database, keeping the one with the highest swappability. */
+int vmSwapOneObject(int usethreads) {
+ int j, i;
+ struct dictEntry *best = NULL;
+ double best_swappability = 0;
+ redisDb *best_db = NULL;
+ robj *val;
+ sds key;
+
+ for (j = 0; j < server.dbnum; j++) {
+ redisDb *db = server.db+j;
+ /* Why maxtries is set to 100?
+ * Because this way (usually) we'll find 1 object even if just 1% - 2%
+ * are swappable objects */
+ int maxtries = 100;
+
+ if (dictSize(db->dict) == 0) continue;
+ /* Five valid samples per database; samples that can't be swapped
+ * are retried for free while 'maxtries' is not exhausted. */
+ for (i = 0; i < 5; i++) {
+ dictEntry *de;
+ double swappability;
+
+ if (maxtries) maxtries--;
+ de = dictGetRandomKey(db->dict);
+ val = dictGetEntryVal(de);
+ /* Only swap objects that are currently in memory.
+ *
+ * Also don't swap shared objects: not a good idea in general and
+ * we need to ensure that the main thread does not touch the
+ * object while the I/O thread is using it, but we can't
+ * control other keys without adding additional mutex. */
+ if (val->storage != REDIS_VM_MEMORY || val->refcount != 1) {
+ if (maxtries) i--; /* don't count this try */
+ continue;
+ }
+ swappability = computeObjectSwappability(val);
+ if (!best || swappability > best_swappability) {
+ best = de;
+ best_swappability = swappability;
+ best_db = db;
+ }
+ }
+ }
+ if (best == NULL) return REDIS_ERR;
+ key = dictGetEntryKey(best);
+ val = dictGetEntryVal(best);
+
+ redisLog(REDIS_DEBUG,"Key with best swappability: %s, %f",
+ key, best_swappability);
+
+ /* Swap it */
+ if (usethreads) {
+ /* The threaded API takes the key as an object: build a temporary
+ * one and drop our reference once the job is submitted. */
+ robj *keyobj = createStringObject(key,sdslen(key));
+ vmSwapObjectThreaded(keyobj,val,best_db);
+ decrRefCount(keyobj);
+ return REDIS_OK;
+ } else {
+ vmpointer *vp;
+
+ if ((vp = vmSwapObjectBlocking(val)) != NULL) {
+ dictGetEntryVal(best) = vp;
+ return REDIS_OK;
+ } else {
+ return REDIS_ERR;
+ }
+ }
+}
+
+/* Swap one object synchronously: vmSwapOneObject() without I/O threads.
+ * Returns what vmSwapOneObject() returns. */
+int vmSwapOneObjectBlocking(void) {
+    return vmSwapOneObject(0);
+}
+
+/* Swap one object in background: vmSwapOneObject() using I/O threads.
+ * Returns what vmSwapOneObject() returns. */
+int vmSwapOneObjectThreaded(void) {
+    return vmSwapOneObject(1);
+}
+
+/* Return true if it's safe to swap out objects right now, that is, when
+ * neither a BGSAVE child nor a background AOF rewrite child is running
+ * (both child pids are -1). */
+int vmCanSwapOut(void) {
+    if (server.bgsavechildpid != -1) return 0;
+    if (server.bgrewritechildpid != -1) return 0;
+    return 1;
+}
+
+/* =================== Virtual Memory - Threaded I/O ======================= */
+
+/* Release an I/O job: drop the reference to the value (for job types that
+ * carry one and when it is set), drop the reference to the key and free
+ * the job structure itself. */
+void freeIOJob(iojob *j) {
+    switch (j->type) {
+    case REDIS_IOJOB_PREPARE_SWAP:
+    case REDIS_IOJOB_DO_SWAP:
+    case REDIS_IOJOB_LOAD:
+        if (j->val == NULL) break;
+        /* we fix the storage type, otherwise decrRefCount() will try to
+         * kill the I/O thread Job (that no longer exists). */
+        if (j->val->storage == REDIS_VM_SWAPPING)
+            j->val->storage = REDIS_VM_MEMORY;
+        decrRefCount(j->val);
+        break;
+    default:
+        break;
+    }
+    decrRefCount(j->key);
+    zfree(j);
+}
+
+/* Every time a thread finished a Job, it writes a byte into the write side
+ * of an unix pipe in order to "awake" the main thread, and this function
+ * is called.
+ *
+ * 'fd' is the read side of the pipe; 'el', 'privdata' and 'mask' are not
+ * used. For every byte read one job is removed from the head of
+ * server.io_processed and post-processed according to its type:
+ *
+ *  - REDIS_IOJOB_LOAD: the loaded value replaces the VM pointer in the
+ *    dictionary and clients blocked on the key are handled.
+ *  - REDIS_IOJOB_PREPARE_SWAP: pages are reserved and the job is queued
+ *    again as REDIS_IOJOB_DO_SWAP (or undone if swapping is not possible).
+ *  - REDIS_IOJOB_DO_SWAP: the value is replaced by a VM pointer and, if
+ *    memory is still over the limit, more swap jobs are submitted.
+ *
+ * At most a percentage (REDIS_MAX_COMPLETED_JOBS_PROCESSED) of the jobs
+ * found in the processed queue at the first iteration, and at least one,
+ * is handled per call. */
+void vmThreadedIOCompletedJob(aeEventLoop *el, int fd, void *privdata,
+ int mask)
+{
+ char buf[1];
+ int retval, processed = 0, toprocess = -1, trytoswap = 1;
+ REDIS_NOTUSED(el);
+ REDIS_NOTUSED(mask);
+ REDIS_NOTUSED(privdata);
+
+ /* For every byte we read in the read side of the pipe, there is one
+ * I/O job completed to process. */
+ while((retval = read(fd,buf,1)) == 1) {
+ iojob *j;
+ listNode *ln;
+ struct dictEntry *de;
+
+ redisLog(REDIS_DEBUG,"Processing I/O completed job");
+
+ /* Get the processed element (the oldest one) */
+ lockThreadedIO();
+ redisAssert(listLength(server.io_processed) != 0);
+ /* Computed only once per call, at the first job. */
+ if (toprocess == -1) {
+ toprocess = (listLength(server.io_processed)*REDIS_MAX_COMPLETED_JOBS_PROCESSED)/100;
+ if (toprocess <= 0) toprocess = 1;
+ }
+ ln = listFirst(server.io_processed);
+ j = ln->value;
+ listDelNode(server.io_processed,ln);
+ unlockThreadedIO();
+ /* If this job is marked as canceled, just ignore it */
+ if (j->canceled) {
+ freeIOJob(j);
+ continue;
+ }
+ /* Post process it in the main thread, as there are things we
+ * can do just here to avoid race conditions and/or invasive locks */
+ redisLog(REDIS_DEBUG,"COMPLETED Job type: %d, ID %p, key: %s", j->type, (void*)j->id, (unsigned char*)j->key->ptr);
+ de = dictFind(j->db->dict,j->key->ptr);
+ redisAssert(de != NULL);
+ if (j->type == REDIS_IOJOB_LOAD) {
+ redisDb *db;
+ vmpointer *vp = dictGetEntryVal(de);
+
+ /* Key loaded, bring it at home */
+ vmMarkPagesFree(vp->page,vp->usedpages);
+ redisLog(REDIS_DEBUG, "VM: object %s loaded from disk (threaded)",
+ (unsigned char*) j->key->ptr);
+ server.vm_stats_swapped_objects--;
+ server.vm_stats_swapins++;
+ dictGetEntryVal(de) = j->val;
+ /* freeIOJob() below drops one reference to j->val: take one more
+ * for the dictionary. */
+ incrRefCount(j->val);
+ db = j->db;
+ /* Handle clients waiting for this key to be loaded. */
+ handleClientsBlockedOnSwappedKey(db,j->key);
+ freeIOJob(j);
+ zfree(vp);
+ } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
+ /* Now we know the amount of pages required to swap this object.
+ * Let's find some space for it, and queue this task again
+ * rebranded as REDIS_IOJOB_DO_SWAP. */
+ if (!vmCanSwapOut() ||
+ vmFindContiguousPages(&j->page,j->pages) == REDIS_ERR)
+ {
+ /* Ooops... no space or we can't swap as there is
+ * a fork()ed Redis trying to save stuff on disk. */
+ j->val->storage = REDIS_VM_MEMORY; /* undo operation */
+ freeIOJob(j);
+ } else {
+ /* Note that we need to mark this pages as used now,
+ * if the job will be canceled, we'll mark them as freed
+ * again. */
+ vmMarkPagesUsed(j->page,j->pages);
+ j->type = REDIS_IOJOB_DO_SWAP;
+ lockThreadedIO();
+ queueIOJob(j);
+ unlockThreadedIO();
+ }
+ } else if (j->type == REDIS_IOJOB_DO_SWAP) {
+ vmpointer *vp;
+
+ /* Key swapped. We can finally free some memory. */
+ /* Debugging aid: dump the job state on stdout right before the
+ * assertion below fails. Note the inner 'vp' shadows the outer one. */
+ if (j->val->storage != REDIS_VM_SWAPPING) {
+ vmpointer *vp = (vmpointer*) j->id;
+ printf("storage: %d\n",vp->storage);
+ printf("key->name: %s\n",(char*)j->key->ptr);
+ printf("val: %p\n",(void*)j->val);
+ printf("val->type: %d\n",j->val->type);
+ printf("val->ptr: %s\n",(char*)j->val->ptr);
+ }
+ redisAssert(j->val->storage == REDIS_VM_SWAPPING);
+ vp = createVmPointer(j->val->type);
+ vp->page = j->page;
+ vp->usedpages = j->pages;
+ dictGetEntryVal(de) = vp;
+ /* Fix the storage otherwise decrRefCount will attempt to
+ * remove the associated I/O job */
+ j->val->storage = REDIS_VM_MEMORY;
+ decrRefCount(j->val);
+ redisLog(REDIS_DEBUG,
+ "VM: object %s swapped out at %lld (%lld pages) (threaded)",
+ (unsigned char*) j->key->ptr,
+ (unsigned long long) j->page, (unsigned long long) j->pages);
+ server.vm_stats_swapped_objects++;
+ server.vm_stats_swapouts++;
+ freeIOJob(j);
+ /* Put a few more swap requests in queue if we are still
+ * out of memory */
+ if (trytoswap && vmCanSwapOut() &&
+ zmalloc_used_memory() > server.vm_max_memory)
+ {
+ int more = 1;
+ while(more) {
+ lockThreadedIO();
+ more = listLength(server.io_newjobs) <
+ (unsigned) server.vm_max_threads;
+ unlockThreadedIO();
+ /* Don't waste CPU time if swappable objects are rare. */
+ if (vmSwapOneObjectThreaded() == REDIS_ERR) {
+ trytoswap = 0;
+ break;
+ }
+ }
+ }
+ }
+ processed++;
+ if (processed == toprocess) return;
+ }
+ /* The loop ends when read(2) does not return one byte: anything but
+ * EAGAIN is reported. */
+ if (retval < 0 && errno != EAGAIN) {
+ redisLog(REDIS_WARNING,
+ "WARNING: read(2) error in vmThreadedIOCompletedJob() %s",
+ strerror(errno));
+ }
+}
+
+/* Acquire server.io_mutex. */
+void lockThreadedIO(void) {
+ pthread_mutex_lock(&server.io_mutex);
+}
+
+/* Release server.io_mutex. */
+void unlockThreadedIO(void) {
+ pthread_mutex_unlock(&server.io_mutex);
+}
+
+/* Remove the specified object from the threaded I/O queue if still not
+ * processed, otherwise make sure to flag it as canceled.
+ *
+ * 'o' must be in REDIS_VM_LOADING or REDIS_VM_SWAPPING state (asserted).
+ * The three job queues are searched, under the I/O mutex, for a non
+ * canceled job whose id is 'o'. Once the job is handled the storage of
+ * 'o' is reverted (LOADING -> SWAPPED, SWAPPING -> MEMORY). Not finding
+ * the job is considered a bug and triggers an assertion. */
+void vmCancelThreadedIOJob(robj *o) {
+ list *lists[3] = {
+ server.io_newjobs, /* 0 */
+ server.io_processing, /* 1 */
+ server.io_processed /* 2 */
+ };
+ int i;
+
+ redisAssert(o->storage == REDIS_VM_LOADING || o->storage == REDIS_VM_SWAPPING);
+again:
+ lockThreadedIO();
+ /* Search for a matching object in one of the queues */
+ for (i = 0; i < 3; i++) {
+ listNode *ln;
+ listIter li;
+
+ listRewind(lists[i],&li);
+ while ((ln = listNext(&li)) != NULL) {
+ iojob *job = ln->value;
+
+ if (job->canceled) continue; /* Skip this, already canceled. */
+ if (job->id == o) {
+ redisLog(REDIS_DEBUG,"*** CANCELED %p (key %s) (type %d) (LIST ID %d)\n",
+ (void*)job, (char*)job->key->ptr, job->type, i);
+ /* Mark the pages as free since the swap didn't happen
+ * or happened but is now discarded. */
+ if (i != 1 && job->type == REDIS_IOJOB_DO_SWAP)
+ vmMarkPagesFree(job->page,job->pages);
+ /* Cancel the job. It depends on the list the job is
+ * living in. */
+ switch(i) {
+ case 0: /* io_newjobs */
+ /* If the job was not yet processed the best thing to do
+ * is to remove it from the queue altogether */
+ freeIOJob(job);
+ listDelNode(lists[i],ln);
+ break;
+ case 1: /* io_processing */
+ /* Oh Shi- the thread is messing with the Job:
+ *
+ * Probably it's accessing the object if this is a
+ * PREPARE_SWAP or DO_SWAP job.
+ * If it's a LOAD job it may be reading from disk and
+ * if we don't wait for the job to terminate before to
+ * cancel it, maybe in a few microseconds data can be
+ * corrupted in this pages. So the short story is:
+ *
+ * Better to wait for the job to move into the
+ * next queue (processed)... */
+
+ /* We try again and again until the job is completed. */
+ unlockThreadedIO();
+ /* But let's wait some time for the I/O thread
+ * to finish with this job. After all this condition
+ * should be very rare. */
+ usleep(1);
+ goto again;
+ case 2: /* io_processed */
+ /* The job was already processed, that's easy...
+ * just mark it as canceled so that we'll ignore it
+ * when processing completed jobs. */
+ job->canceled = 1;
+ break;
+ }
+ /* Finally we have to adjust the storage type of the object
+ * in order to "UNDO" the operation. */
+ if (o->storage == REDIS_VM_LOADING)
+ o->storage = REDIS_VM_SWAPPED;
+ else if (o->storage == REDIS_VM_SWAPPING)
+ o->storage = REDIS_VM_MEMORY;
+ unlockThreadedIO();
+ redisLog(REDIS_DEBUG,"*** DONE");
+ return;
+ }
+ }
+ }
+ unlockThreadedIO();
+ printf("Not found: %p\n", (void*)o);
+ redisAssert(1 != 1); /* We should never reach this */
+}
+
+/* Body of every I/O thread ('arg' is not used).
+ *
+ * The thread detaches itself, then loops: it takes the oldest job from
+ * server.io_newjobs, moves it into server.io_processing while working on
+ * it, then into server.io_processed once done, and finally writes one
+ * byte on the notification pipe to wake up the main thread.
+ *
+ * Job types:
+ *  - REDIS_IOJOB_LOAD: read the value from the swap file.
+ *  - REDIS_IOJOB_PREPARE_SWAP: compute the number of pages needed.
+ *  - REDIS_IOJOB_DO_SWAP: write the value on the swap file; on error the
+ *    job is flagged as canceled.
+ *
+ * When no new job is available the thread decrements
+ * server.io_active_threads and exits returning NULL. */
+void *IOThreadEntryPoint(void *arg) {
+    iojob *j;
+    listNode *ln;
+    REDIS_NOTUSED(arg);
+
+    pthread_detach(pthread_self());
+    while(1) {
+        /* Get a new job to process */
+        lockThreadedIO();
+        if (listLength(server.io_newjobs) == 0) {
+            /* No new jobs in queue, exit. */
+            redisLog(REDIS_DEBUG,"Thread %ld exiting, nothing to do",
+                (long) pthread_self());
+            server.io_active_threads--;
+            unlockThreadedIO();
+            return NULL;
+        }
+        ln = listFirst(server.io_newjobs);
+        j = ln->value;
+        listDelNode(server.io_newjobs,ln);
+        /* Add the job in the processing queue */
+        j->thread = pthread_self();
+        listAddNodeTail(server.io_processing,j);
+        ln = listLast(server.io_processing); /* We use ln later to remove it */
+        unlockThreadedIO();
+        redisLog(REDIS_DEBUG,"Thread %ld got a new job (type %d): %p about key '%s'",
+            (long) pthread_self(), j->type, (void*)j, (char*)j->key->ptr);
+
+        /* Process the Job */
+        if (j->type == REDIS_IOJOB_LOAD) {
+            vmpointer *vp = (vmpointer*)j->id;
+            j->val = vmReadObjectFromSwap(j->page,vp->vtype);
+        } else if (j->type == REDIS_IOJOB_PREPARE_SWAP) {
+            FILE *fp = fopen("/dev/null","w+");
+            j->pages = rdbSavedObjectPages(j->val,fp);
+            /* fopen() may have failed: rdbSavedObjectPages() is also
+             * called with a NULL stream elsewhere in this file, but
+             * fclose(NULL) must never be called. */
+            if (fp != NULL) fclose(fp);
+        } else if (j->type == REDIS_IOJOB_DO_SWAP) {
+            if (vmWriteObjectOnSwap(j->val,j->page) == REDIS_ERR)
+                j->canceled = 1;
+        }
+
+        /* Done: insert the job into the processed queue */
+        redisLog(REDIS_DEBUG,"Thread %ld completed the job: %p (key %s)",
+            (long) pthread_self(), (void*)j, (char*)j->key->ptr);
+        lockThreadedIO();
+        listDelNode(server.io_processing,ln);
+        listAddNodeTail(server.io_processed,j);
+        unlockThreadedIO();
+
+        /* Signal the main thread there is new stuff to process */
+        redisAssert(write(server.io_ready_pipe_write,"x",1) == 1);
+    }
+    return NULL; /* never reached */
+}
+
+ /* Start one more I/O worker thread running IOThreadEntryPoint().
+ *
+ * While the thread is being created the signal mask is set to contain only
+ * SIGCHLD, SIGHUP and SIGPIPE; the previous mask is restored afterwards.
+ * Creation is retried once per second until it succeeds, after which
+ * server.io_active_threads is incremented. */
+ void spawnIOThread(void) {
+     static const int masked_signals[] = { SIGCHLD, SIGHUP, SIGPIPE };
+     pthread_t tid;
+     sigset_t masked, saved;
+     unsigned int i;
+     int rc;
+
+     sigemptyset(&masked);
+     for (i = 0; i < sizeof(masked_signals)/sizeof(masked_signals[0]); i++)
+         sigaddset(&masked,masked_signals[i]);
+     pthread_sigmask(SIG_SETMASK, &masked, &saved);
+
+     for (;;) {
+         rc = pthread_create(&tid,&server.io_threads_attr,IOThreadEntryPoint,NULL);
+         if (rc == 0) break;
+         redisLog(REDIS_WARNING,"Unable to spawn an I/O thread: %s",
+             strerror(rc));
+         usleep(1000000);
+     }
+
+     pthread_sigmask(SIG_SETMASK, &saved, NULL);
+     server.io_active_threads++;
+ }
+
+ /* Block until there is no queued job, no job being processed and no active
+ * I/O thread. This is needed before we are able to fork() in order to
+ * BGSAVE or BGREWRITEAOF. */
+ void waitEmptyIOJobsQueue(void) {
+     for (;;) {
+         int all_idle, completed;
+
+         lockThreadedIO();
+         all_idle = listLength(server.io_newjobs) == 0 &&
+                    listLength(server.io_processing) == 0 &&
+                    server.io_active_threads == 0;
+         completed = listLength(server.io_processed);
+         unlockThreadedIO();
+         if (all_idle) return;
+
+         /* While waiting we post-process finished jobs: I/O threads may be
+          * hanging on a write against the io_ready_pipe_write FD because
+          * there are so many pending jobs that the pipe is blocking. */
+         if (completed) {
+             vmThreadedIOCompletedJob(NULL,server.io_ready_pipe_read,NULL,0);
+             usleep(1000); /* 1 millisecond */
+         } else {
+             usleep(10000); /* 10 milliseconds */
+         }
+     }
+ }
+
+ /* Open server.vm_swap_file again and store the new stream and descriptor in
+ * server.vm_fp / server.vm_fd. The process exits with status 1 when the
+ * file cannot be opened.
+ *
+ * Note: we don't close the old one as we are in the child process
+ * and don't want to mess at all with the original file object. */
+ void vmReopenSwapFile(void) {
+     FILE *reopened = fopen(server.vm_swap_file,"r+b");
+
+     server.vm_fp = reopened;
+     if (!reopened) {
+         redisLog(REDIS_WARNING,"Can't re-open the VM swap file: %s. Exiting.",
+             server.vm_swap_file);
+         _exit(1);
+     }
+     server.vm_fd = fileno(reopened);
+ }
+
+ /* Append job 'j' to server.io_newjobs and, when fewer than
+ * server.vm_max_threads I/O threads are active, spawn one more.
+ *
+ * This function must be called with threaded IO locked. */
+ void queueIOJob(iojob *j) {
+     /* No trailing newline in the format: none of the other redisLog()
+      * calls in this file pass one. */
+     redisLog(REDIS_DEBUG,"Queued IO Job %p type %d about key '%s'",
+         (void*)j, j->type, (char*)j->key->ptr);
+     listAddNodeTail(server.io_newjobs,j);
+     if (server.io_active_threads < server.vm_max_threads)
+         spawnIOThread();
+ }
+
+ /* Queue a REDIS_IOJOB_PREPARE_SWAP job for the pair key/val of 'db'.
+ * The job holds a reference to both objects and 'val' is marked as
+ * REDIS_VM_SWAPPING. Always returns REDIS_OK. */
+ int vmSwapObjectThreaded(robj *key, robj *val, redisDb *db) {
+     iojob *job = zmalloc(sizeof(*job));
+
+     job->type = REDIS_IOJOB_PREPARE_SWAP;
+     job->canceled = 0;
+     job->thread = (pthread_t) -1;
+     job->db = db;
+
+     job->key = key;
+     incrRefCount(key);
+
+     job->val = val;
+     job->id = val;
+     incrRefCount(val);
+
+     val->storage = REDIS_VM_SWAPPING;
+
+     lockThreadedIO();
+     queueIOJob(job);
+     unlockThreadedIO();
+     return REDIS_OK;
+ }
+
+/* ============ Virtual Memory - Blocking clients on missing keys =========== */
+
+ /* This function makes the client 'c' wait for the key 'key' to be loaded.
+ * If there is not already a job loading the key, it is created.
+ * The key is added to the io_keys list in the client structure, and also
+ * in the hash table mapping swapped keys to waiting clients, that is,
+ * c->db->io_keys.
+ *
+ * Returns 0 when the client does not need to wait (the key does not exist,
+ * is in memory, or was being swapped out and the swap was canceled), and 1
+ * when the client was registered as waiting for the key. */
+ int waitForSwappedKey(redisClient *c, robj *key) {
+ struct dictEntry *de;
+ robj *o;
+ list *l;
+
+ /* If the key does not exist or is already in RAM we don't need to
+ * block the client at all. */
+ de = dictFind(c->db->dict,key->ptr);
+ if (de == NULL) return 0;
+ o = dictGetEntryVal(de);
+ if (o->storage == REDIS_VM_MEMORY) {
+ return 0;
+ } else if (o->storage == REDIS_VM_SWAPPING) {
+ /* We were swapping the key, undo it! */
+ vmCancelThreadedIOJob(o);
+ return 0;
+ }
+
+ /* OK: the key is either swapped, or being loaded just now. */
+
+ /* Add the key to the list of keys this client is waiting for.
+ * This maps clients to keys they are waiting for. */
+ listAddNodeTail(c->io_keys,key);
+ incrRefCount(key);
+
+ /* Add the client to the swapped keys => clients waiting map. */
+ de = dictFind(c->db->io_keys,key);
+ if (de == NULL) {
+ int retval;
+
+ /* For every key we take a list of clients blocked for it */
+ l = listCreate();
+ retval = dictAdd(c->db->io_keys,key,l);
+ /* The io_keys dictionary keeps its own reference to the key. */
+ incrRefCount(key);
+ redisAssert(retval == DICT_OK);
+ } else {
+ l = dictGetEntryVal(de);
+ }
+ listAddNodeTail(l,c);
+
+ /* Are we already loading the key from disk? If not create a job */
+ if (o->storage == REDIS_VM_SWAPPED) {
+ iojob *j;
+ /* The stored object is accessed as a vmpointer to get the swap page. */
+ vmpointer *vp = (vmpointer*)o;
+
+ o->storage = REDIS_VM_LOADING;
+ j = zmalloc(sizeof(*j));
+ j->type = REDIS_IOJOB_LOAD;
+ j->db = c->db;
+ j->id = (robj*)vp;
+ j->key = key;
+ /* The job keeps a reference to the key as well. */
+ incrRefCount(key);
+ j->page = vp->page;
+ j->val = NULL;
+ j->canceled = 0;
+ j->thread = (pthread_t) -1;
+ lockThreadedIO();
+ queueIOJob(j);
+ unlockThreadedIO();
+ }
+ return 1;
+ }
+
+ /* Call waitForSwappedKey() for every key argument of a command whose keys
+ * are described by the first, last and step values of the command table
+ * entry 'cmd'. A negative last key is relative to 'argc'. Commands with
+ * vm_firstkey == 0 are ignored. */
+ void waitForMultipleSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
+     int idx, lastkey;
+
+     if (cmd->vm_firstkey == 0) return;
+
+     lastkey = cmd->vm_lastkey;
+     if (lastkey < 0) lastkey += argc;
+
+     idx = cmd->vm_firstkey;
+     while (idx <= lastkey) {
+         redisAssert(idx < argc);
+         waitForSwappedKey(c,argv[idx]);
+         idx += cmd->vm_keystep;
+     }
+ }
+
+ /* Preload keys needed for the ZUNIONSTORE and ZINTERSTORE commands.
+ * The number of keys is read from argv[2] and the keys start at argv[3].
+ * Since that number is user-defined, nothing is done when it exceeds the
+ * number of arguments actually available. */
+ void zunionInterBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
+     int numkeys, k;
+     REDIS_NOTUSED(cmd);
+
+     numkeys = atoi(argv[2]->ptr);
+     if (numkeys > argc-3) return;
+
+     for (k = 3; k < 3+numkeys; k++)
+         waitForSwappedKey(c,argv[k]);
+ }
+
+ /* Preload keys needed to execute the entire MULTI/EXEC block.
+ *
+ * For every command queued in c->mstate the command specific preload
+ * procedure is used when there is one, otherwise the generic
+ * waitForMultipleSwappedKeys() is used. Nothing is done when the client is
+ * not in MULTI state. 'cmd', 'argc' and 'argv' are unused. */
+ void execBlockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd, int argc, robj **argv) {
+     int n;
+     REDIS_NOTUSED(cmd);
+     REDIS_NOTUSED(argc);
+     REDIS_NOTUSED(argv);
+
+     if ((c->flags & REDIS_MULTI) == 0) return;
+
+     for (n = 0; n < c->mstate.count; n++) {
+         struct redisCommand *queued = c->mstate.commands[n].cmd;
+         int qargc = c->mstate.commands[n].argc;
+         robj **qargv = c->mstate.commands[n].argv;
+
+         if (queued->vm_preload_proc == NULL) {
+             waitForMultipleSwappedKeys(c,queued,qargc,qargv);
+             continue;
+         }
+         queued->vm_preload_proc(c,queued,qargc,qargv);
+     }
+ }
+
+ /* Is this client attempting to run a command against swapped keys?
+ * If so, block it ASAP, load the keys in background, then resume it.
+ *
+ * The important idea about this function is that it can fail! If keys are
+ * still swapped when the client is resumed, the key lookups will just
+ * block loading keys from disk. In practical terms this should only
+ * happen with SORT BY command or if there is a bug in this function.
+ *
+ * Return 1 if the client is marked as blocked, 0 if the client can
+ * continue as the keys it is going to access appear to be in memory. */
+ int blockClientOnSwappedKeys(redisClient *c, struct redisCommand *cmd) {
+     if (cmd->vm_preload_proc == NULL)
+         waitForMultipleSwappedKeys(c,cmd,c->argc,c->argv);
+     else
+         cmd->vm_preload_proc(c,cmd,c->argc,c->argv);
+
+     /* Not waiting for any key: the client can go on. */
+     if (listLength(c->io_keys) == 0) return 0;
+
+     /* The client waits for at least one key: flag it as blocked and stop
+      * reading from its socket. */
+     c->flags |= REDIS_IO_WAIT;
+     aeDeleteFileEvent(server.el,c->fd,AE_READABLE);
+     server.vm_blocked_clients++;
+     return 1;
+ }
+
+ /* Remove 'key' from the list of keys the client 'c' is blocked on, and
+ * remove the client from the list of clients waiting for that key.
+ *
+ * The function returns 1 when the client is no longer waiting for any key
+ * after the removal (and can be unblocked), 0 otherwise. */
+ int dontWaitForSwappedKey(redisClient *c, robj *key) {
+     struct dictEntry *entry;
+     list *waiters;
+     listNode *node;
+     listIter iter;
+
+     /* Client side: drop the first matching key from c->io_keys. */
+     listRewind(c->io_keys,&iter);
+     for (node = listNext(&iter); node != NULL; node = listNext(&iter)) {
+         if (!equalStringObjects(node->value,key)) continue;
+         listDelNode(c->io_keys,node);
+         break;
+     }
+     redisAssert(node != NULL);
+
+     /* Key side: drop the client from the key => waiting clients map. */
+     entry = dictFind(c->db->io_keys,key);
+     redisAssert(entry != NULL);
+     waiters = dictGetEntryVal(entry);
+     node = listSearchKey(waiters,c);
+     redisAssert(node != NULL);
+     listDelNode(waiters,node);
+     if (listLength(waiters) == 0)
+         dictDelete(c->db->io_keys,key);
+
+     return listLength(c->io_keys) == 0;
+ }
+
+ /* Every time we know a key was loaded back in memory, we handle the
+ * clients waiting for this key, if any: each one stops waiting for the
+ * key, and the ones no longer waiting for any key are appended to
+ * server.io_ready_clients. */
+ void handleClientsBlockedOnSwappedKey(redisDb *db, robj *key) {
+     struct dictEntry *entry = dictFind(db->io_keys,key);
+     list *waiters;
+     int remaining;
+
+     if (entry == NULL) return;
+     waiters = dictGetEntryVal(entry);
+
+     /* The number of iterations is fixed up front: we can't test the list
+      * length inside the loop as the list can be freed by the called
+      * function when the last element is removed. */
+     for (remaining = listLength(waiters); remaining > 0; remaining--) {
+         listNode *head = listFirst(waiters);
+         redisClient *c = head->value;
+
+         /* All the keys of this client are loaded: it is ready to go. */
+         if (dontWaitForSwappedKey(c,key))
+             listAddNodeTail(server.io_ready_clients,c);
+     }
+ }
diff --git a/src/ziplist.c b/src/ziplist.c
new file mode 100644
index 000000000..4b9d0fadc
--- /dev/null
+++ b/src/ziplist.c
@@ -0,0 +1,959 @@
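+ /* Memory layout of a ziplist, containing "foo", "bar", "quux":
+ * <zlbytes><zltail><zllen><entry "foo"><entry "bar"><entry "quux"><zlend>
+ *
+ * <zlbytes> is an unsigned 32 bit integer to hold the number of bytes that
+ * the ziplist occupies. This is stored to not have to traverse the ziplist
+ * to know the new length when pushing.
+ *
+ * <zltail> is an unsigned 32 bit integer to hold the offset of the last
+ * entry, so that the tail can be reached without traversing the list.
+ *
+ * <zllen> is an unsigned 16 bit integer to hold the number of items in the
+ * ziplist. Once this value reaches UINT16_MAX it is no longer updated, and
+ * we need to traverse the entire list to know how many items it holds.
+ *
+ * Every entry starts with the length in bytes of the previous entry. When
+ * this number is greater than 253, the length will occupy 5 bytes, where
+ * the extra bytes contain an unsigned integer to hold the length. It is
+ * followed by the encoding/length header of the entry and by its payload.
+ *
+ * <zlend> is a single byte with value 255 marking the end of the ziplist.
+ */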
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <assert.h>
+#include <limits.h>
+#include "zmalloc.h"
+#include "ziplist.h"
+
+ /* Important note: the ZIP_END value is used to depict the end of the
+ * ziplist structure. When a pointer contains an entry, the first byte
+ * belongs to the encoded length of the previous entry: it is either a
+ * length smaller than ZIP_BIGLEN or the ZIP_BIGLEN marker itself, so
+ * the byte will therefore never have a value of 255. */
+ #define ZIP_END 255
+ #define ZIP_BIGLEN 254
+
+ /* Entry encoding: stored in the two most significant bits of the first
+ * byte of the entry header (see ZIP_ENCODING). */
+ #define ZIP_ENC_RAW 0
+ #define ZIP_ENC_INT16 1
+ #define ZIP_ENC_INT32 2
+ #define ZIP_ENC_INT64 3
+ #define ZIP_ENCODING(p) ((p)[0] >> 6)
+
+ /* Length encoding for raw entries: stored in bits 4-5 of the first byte
+ * of the entry header. */
+ #define ZIP_LEN_INLINE 0
+ #define ZIP_LEN_UINT16 1
+ #define ZIP_LEN_UINT32 2
+
+ /* Utility macros. The header is made of two uint32_t fields (total bytes,
+ * offset of the tail entry) followed by one uint16_t field (number of
+ * entries). ZIPLIST_ENTRY_END points to the last byte of the ziplist. */
+ #define ZIPLIST_BYTES(zl) (*((uint32_t*)(zl)))
+ #define ZIPLIST_TAIL_OFFSET(zl) (*((uint32_t*)((zl)+sizeof(uint32_t))))
+ #define ZIPLIST_LENGTH(zl) (*((uint16_t*)((zl)+sizeof(uint32_t)*2)))
+ #define ZIPLIST_HEADER_SIZE (sizeof(uint32_t)*2+sizeof(uint16_t))
+ #define ZIPLIST_ENTRY_HEAD(zl) ((zl)+ZIPLIST_HEADER_SIZE)
+ #define ZIPLIST_ENTRY_TAIL(zl) ((zl)+ZIPLIST_TAIL_OFFSET(zl))
+ #define ZIPLIST_ENTRY_END(zl) ((zl)+ZIPLIST_BYTES(zl)-1)
+
+/* We know a positive increment can only be 1 because entries can only be
+ * pushed one at a time. */
+#define ZIPLIST_INCR_LENGTH(zl,incr) { \
+ if (ZIPLIST_LENGTH(zl) < UINT16_MAX) ZIPLIST_LENGTH(zl)+=incr; }
+
+ /* Decoded information about a single ziplist entry, filled by zipEntry(). */
+ typedef struct zlentry {
+ /* Bytes used to encode the previous entry length, and that length. */
+ unsigned int prevrawlensize, prevrawlen;
+ /* Bytes used by the encoding/length header, and the payload length. */
+ unsigned int lensize, len;
+ /* prevrawlensize + lensize: offset of the payload from 'p'. */
+ unsigned int headersize;
+ /* One of the ZIP_ENC_* values. */
+ unsigned char encoding;
+ /* Pointer to the first byte of the entry. */
+ unsigned char *p;
+ } zlentry;
+
+ /* Return the number of bytes needed to store an integer encoded with
+ * 'encoding' (ZIP_ENC_INT16, ZIP_ENC_INT32 or ZIP_ENC_INT64).
+ * Any other encoding is a programming error: it triggers an assertion and,
+ * when assertions are compiled out, 0 is returned instead of falling off
+ * the end of the function. */
+ static unsigned int zipEncodingSize(unsigned char encoding) {
+     if (encoding == ZIP_ENC_INT16) {
+         return sizeof(int16_t);
+     } else if (encoding == ZIP_ENC_INT32) {
+         return sizeof(int32_t);
+     } else if (encoding == ZIP_ENC_INT64) {
+         return sizeof(int64_t);
+     }
+     assert(0);
+     return 0;
+ }
+
+ /* Decode the entry header pointed by 'p' and return the payload length in
+ * bytes. For raw entries the length is read from the header; for integer
+ * encoded entries it is the size of the integer type. If 'lensize' is not
+ * NULL it is set to the number of bytes used by the header (1, 3 or 5). */
+ static unsigned int zipDecodeLength(unsigned char *p, unsigned int *lensize) {
+     unsigned char encoding = ZIP_ENCODING(p), lenenc;
+     unsigned int len;
+
+     if (encoding == ZIP_ENC_RAW) {
+         lenenc = (p[0] >> 4) & 0x3;
+         if (lenenc == ZIP_LEN_INLINE) {
+             len = p[0] & 0xf;
+             if (lensize) *lensize = 1;
+         } else if (lenenc == ZIP_LEN_UINT16) {
+             len = (unsigned int)p[1] | ((unsigned int)p[2] << 8);
+             if (lensize) *lensize = 3;
+         } else {
+             /* The bytes are widened to unsigned before shifting: shifting
+              * a promoted (signed) int by 24 would overflow for values
+              * greater than 127. */
+             len = (unsigned int)p[1] |
+                   ((unsigned int)p[2] << 8) |
+                   ((unsigned int)p[3] << 16) |
+                   ((unsigned int)p[4] << 24);
+             if (lensize) *lensize = 5;
+         }
+     } else {
+         len = zipEncodingSize(encoding);
+         if (lensize) *lensize = 1;
+     }
+     return len;
+ }
+
+ /* Encode the header of an entry with the given 'encoding' and payload
+ * length 'rawlen', writing it in 'p'. If 'p' is NULL nothing is written.
+ * Returns the number of bytes required to encode the header: 1, 3 or 5 for
+ * raw entries depending on 'rawlen', always 1 for integer encodings. */
+ static unsigned int zipEncodeLength(unsigned char *p, char encoding, unsigned int rawlen) {
+     /* The buffer is zeroed so that the header bits that carry no
+      * information are written as 0 instead of uninitialized stack data. */
+     unsigned char len = 1, lenenc, buf[5] = {0};
+     if (encoding == ZIP_ENC_RAW) {
+         if (rawlen <= 0xf) {
+             if (!p) return len;
+             lenenc = ZIP_LEN_INLINE;
+             buf[0] = rawlen;
+         } else if (rawlen <= 0xffff) {
+             len += 2;
+             if (!p) return len;
+             lenenc = ZIP_LEN_UINT16;
+             buf[1] = (rawlen     ) & 0xff;
+             buf[2] = (rawlen >> 8) & 0xff;
+         } else {
+             len += 4;
+             if (!p) return len;
+             lenenc = ZIP_LEN_UINT32;
+             buf[1] = (rawlen      ) & 0xff;
+             buf[2] = (rawlen >>  8) & 0xff;
+             buf[3] = (rawlen >> 16) & 0xff;
+             buf[4] = (rawlen >> 24) & 0xff;
+         }
+         /* Bits 4-5: length encoding. Bits 0-3: inline length, if any. */
+         buf[0] = (lenenc << 4) | (buf[0] & 0xf);
+     }
+     if (!p) return len;
+
+     /* Apparently we need to store the length in 'p'.
+      * Bits 6-7 hold the entry encoding. */
+     buf[0] = (encoding << 6) | (buf[0] & 0x3f);
+     memcpy(p,buf,len);
+     return len;
+ }
+
+ /* Return the length of the previous entry as stored at "p". If 'lensize'
+ * is not NULL it is set to the number of bytes used to store that length:
+ * 1, or 1+sizeof(unsigned int) when the first byte is >= ZIP_BIGLEN. */
+ static unsigned int zipPrevDecodeLength(unsigned char *p, unsigned int *lensize) {
+     unsigned int prevlen = p[0];
+
+     if (prevlen >= ZIP_BIGLEN) {
+         /* The real length follows the marker byte. */
+         if (lensize) *lensize = 1+sizeof(prevlen);
+         memcpy(&prevlen,p+1,sizeof(prevlen));
+         return prevlen;
+     }
+     if (lensize) *lensize = 1;
+     return prevlen;
+ }
+
+ /* Write the length 'len' of the previous entry at "p" and return the
+ * number of bytes used. If "p" is NULL nothing is written and only the
+ * number of bytes needed to encode this length is returned. */
+ static unsigned int zipPrevEncodeLength(unsigned char *p, unsigned int len) {
+     int small = len < ZIP_BIGLEN;
+
+     if (small) {
+         if (p != NULL) p[0] = len;
+         return 1;
+     }
+     if (p != NULL) {
+         /* Marker byte followed by the length itself. */
+         p[0] = ZIP_BIGLEN;
+         memcpy(p+1,&len,sizeof(len));
+     }
+     return 1+sizeof(len);
+ }
+
+ /* Return how many more (positive) or fewer (negative) bytes are needed to
+ * store the previous entry length "len" in the entry pointed to by "p",
+ * compared to the bytes that entry currently uses for it. */
+ static int zipPrevLenByteDiff(unsigned char *p, unsigned int len) {
+     unsigned int current, needed;
+
+     zipPrevDecodeLength(p,&current);
+     needed = zipPrevEncodeLength(NULL,len);
+     return needed-current;
+ }
+
+ /* Check if the string pointed to by 'entry' can be encoded as an integer.
+ * On success the integer value is stored in 'v', the smallest fitting
+ * encoding (ZIP_ENC_INT16/INT32/INT64) in 'encoding', and 1 is returned.
+ * Otherwise 0 is returned and 'v' and 'encoding' are left untouched.
+ *
+ * Only strings that are the canonical decimal representation of the parsed
+ * value are accepted: strings such as "007" or "-0", or numbers that do
+ * not fit in a long long, would not read back identical to what was
+ * stored, so they are kept as raw strings.
+ *
+ * Warning: this function requires a NULL-terminated string! */
+ static int zipTryEncoding(unsigned char *entry, long long *v, unsigned char *encoding) {
+     long long value;
+     char *eptr;
+     char canonical[32]; /* Large enough for any 64 bit integer. */
+
+     if (entry[0] == '-' || (entry[0] >= '0' && entry[0] <= '9')) {
+         value = strtoll((char*)entry,&eptr,10);
+         if (eptr[0] != '\0') return 0;
+
+         /* Round trip check: printing the value must give back exactly the
+          * input. This also rejects out of range input, for which
+          * strtoll() returns a clamped value. */
+         snprintf(canonical,sizeof(canonical),"%lld",value);
+         if (strcmp(canonical,(char*)entry) != 0) return 0;
+
+         if (value >= INT16_MIN && value <= INT16_MAX) {
+             *encoding = ZIP_ENC_INT16;
+         } else if (value >= INT32_MIN && value <= INT32_MAX) {
+             *encoding = ZIP_ENC_INT32;
+         } else {
+             *encoding = ZIP_ENC_INT64;
+         }
+         *v = value;
+         return 1;
+     }
+     return 0;
+ }
+
+ /* Store integer 'value' at 'p' using the width selected by 'encoding'.
+ * The value is converted to the target type and copied byte by byte.
+ * An encoding that is not an integer encoding triggers an assertion. */
+ static void zipSaveInteger(unsigned char *p, int64_t value, unsigned char encoding) {
+     switch (encoding) {
+     case ZIP_ENC_INT16: {
+         int16_t narrow = value;
+         memcpy(p,&narrow,sizeof(narrow));
+         break;
+     }
+     case ZIP_ENC_INT32: {
+         int32_t medium = value;
+         memcpy(p,&medium,sizeof(medium));
+         break;
+     }
+     case ZIP_ENC_INT64: {
+         int64_t wide = value;
+         memcpy(p,&wide,sizeof(wide));
+         break;
+     }
+     default:
+         assert(NULL);
+     }
+ }
+
+ /* Read the integer stored at 'p' with the width selected by 'encoding' and
+ * return it widened to 64 bits. An encoding that is not an integer
+ * encoding triggers an assertion; when assertions are compiled out 0 is
+ * returned instead of an uninitialized value. */
+ static int64_t zipLoadInteger(unsigned char *p, unsigned char encoding) {
+     int16_t i16;
+     int32_t i32;
+     int64_t i64, ret = 0;
+     if (encoding == ZIP_ENC_INT16) {
+         memcpy(&i16,p,sizeof(i16));
+         ret = i16;
+     } else if (encoding == ZIP_ENC_INT32) {
+         memcpy(&i32,p,sizeof(i32));
+         ret = i32;
+     } else if (encoding == ZIP_ENC_INT64) {
+         memcpy(&i64,p,sizeof(i64));
+         ret = i64;
+     } else {
+         assert(0);
+     }
+     return ret;
+ }
+
+ /* Decode the entry starting at 'p' and return all the information about
+ * it: previous entry length, header sizes, payload length and encoding. */
+ static zlentry zipEntry(unsigned char *p) {
+     zlentry info;
+     unsigned char *header;
+
+     info.p = p;
+     info.prevrawlen = zipPrevDecodeLength(p,&info.prevrawlensize);
+
+     /* The encoding/length header follows the previous entry length. */
+     header = p+info.prevrawlensize;
+     info.len = zipDecodeLength(header,&info.lensize);
+     info.encoding = ZIP_ENCODING(header);
+     info.headersize = info.prevrawlensize+info.lensize;
+     return info;
+ }
+
+ /* Return the total number of bytes (header plus payload) used by the
+ * entry at "p". */
+ static unsigned int zipRawEntryLength(unsigned char *p) {
+     unsigned int prevsize, lensize, payload;
+
+     zipPrevDecodeLength(p,&prevsize);
+     payload = zipDecodeLength(p+prevsize,&lensize);
+     return prevsize+lensize+payload;
+ }
+
+ /* Allocate and return a new empty ziplist: just the header followed by
+ * the end marker. */
+ unsigned char *ziplistNew(void) {
+     unsigned int size = ZIPLIST_HEADER_SIZE+1;
+     unsigned char *list = zmalloc(size);
+
+     ZIPLIST_BYTES(list) = size;
+     ZIPLIST_TAIL_OFFSET(list) = ZIPLIST_HEADER_SIZE;
+     ZIPLIST_LENGTH(list) = 0;
+     /* The end marker is the last byte, right after the header. */
+     list[size-1] = ZIP_END;
+     return list;
+ }
+
+ /* Reallocate the ziplist to 'len' bytes, store the new size in the header
+ * and put the end marker in the last byte. Returns the (possibly moved)
+ * ziplist. */
+ static unsigned char *ziplistResize(unsigned char *zl, unsigned int len) {
+     unsigned char *resized = zrealloc(zl,len);
+
+     ZIPLIST_BYTES(resized) = len;
+     resized[len-1] = ZIP_END;
+     return resized;
+ }
+
+ /* Delete up to "num" consecutive entries, starting at "p" (deletion stops
+ * at the end of the list). Returns the pointer to the ziplist, which may
+ * have been moved by the reallocation. */
+ static unsigned char *__ziplistDelete(unsigned char *zl, unsigned char *p, unsigned int num) {
+     unsigned int i, totlen, deleted = 0;
+     int nextdiff = 0;
+     zlentry first = zipEntry(p);
+     for (i = 0; p[0] != ZIP_END && i < num; i++) {
+         p += zipRawEntryLength(p);
+         deleted++;
+     }
+
+     totlen = p-first.p;
+     if (totlen > 0) {
+         if (p[0] != ZIP_END) {
+             /* Is the first surviving entry the current tail? Checked
+              * before the header is modified. */
+             int survivor_is_tail =
+                 ((unsigned int)(p-zl) == ZIPLIST_TAIL_OFFSET(zl));
+
+             /* Tricky: storing the prevlen in this entry might reduce or
+              * increase the number of bytes needed, compared to the current
+              * prevlen. Note that we can always store this length because
+              * it was previously stored by an entry that is being deleted. */
+             nextdiff = zipPrevLenByteDiff(p,first.prevrawlen);
+             zipPrevEncodeLength(p-nextdiff,first.prevrawlen);
+
+             /* Update offset for tail: everything after the deleted range
+              * moves back by totlen bytes. The change in size of the
+              * survivor's prevlen field shifts only the entries that come
+              * after the survivor, so it affects the tail offset only when
+              * the survivor is not the tail itself. */
+             ZIPLIST_TAIL_OFFSET(zl) -= totlen;
+             if (!survivor_is_tail) ZIPLIST_TAIL_OFFSET(zl) += nextdiff;
+
+             /* Move tail to the front of the ziplist */
+             memmove(first.p,p-nextdiff,ZIPLIST_BYTES(zl)-(p-zl)-1+nextdiff);
+         } else {
+             /* The entire tail was deleted. No need to move memory. */
+             ZIPLIST_TAIL_OFFSET(zl) = (first.p-zl)-first.prevrawlen;
+         }
+
+         /* Resize and update length */
+         zl = ziplistResize(zl, ZIPLIST_BYTES(zl)-totlen+nextdiff);
+         ZIPLIST_INCR_LENGTH(zl,-deleted);
+     }
+     return zl;
+ }
+
+ /* Insert the string 's' of length 'slen' at "p": the new entry is placed
+ * before the entry currently at "p", or appended when "p" points to the
+ * end marker. The string is stored as an integer when zipTryEncoding()
+ * accepts it (which requires 's' to be NULL-terminated). Returns the
+ * pointer to the ziplist, which may have been moved by the reallocation.
+ *
+ * NOTE(review): when the prevlen field of the following entry changes
+ * size, the prevlen stored in the entry after it is not updated here. */
+ static unsigned char *__ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
+     unsigned int curlen = ZIPLIST_BYTES(zl), reqlen, prevlen = 0;
+     unsigned int offset;
+     /* Signed: the prevlen field of the next entry may also shrink. */
+     int nextdiff = 0;
+     int forcelarge = 0;
+     unsigned char *tail;
+     unsigned char encoding = ZIP_ENC_RAW;
+     long long value;
+     zlentry entry;
+
+     /* Find out prevlen for the entry that is inserted. */
+     if (p[0] != ZIP_END) {
+         entry = zipEntry(p);
+         prevlen = entry.prevrawlen;
+     } else {
+         tail = ZIPLIST_ENTRY_TAIL(zl);
+         if (tail[0] != ZIP_END) {
+             prevlen = zipRawEntryLength(tail);
+         }
+     }
+
+     /* See if the entry can be encoded */
+     if (zipTryEncoding(s,&value,&encoding)) {
+         reqlen = zipEncodingSize(encoding);
+     } else {
+         reqlen = slen;
+     }
+
+     /* We need space for both the length of the previous entry and
+      * the length of the payload. */
+     reqlen += zipPrevEncodeLength(NULL,prevlen);
+     reqlen += zipEncodeLength(NULL,encoding,slen);
+
+     /* When the insert position is not equal to the tail, we need to
+      * make sure that the next entry can hold this entry's length in
+      * its prevlen field. */
+     if (p[0] != ZIP_END) {
+         nextdiff = zipPrevLenByteDiff(p,reqlen);
+         if (nextdiff < 0 && reqlen < (unsigned int)(-nextdiff)) {
+             /* Shrinking the next entry's prevlen field would make the
+              * whole list smaller than it is now, so the reallocation
+              * below would truncate data that still has to be moved.
+              * Keep the large field and store the small length in it. */
+             nextdiff = 0;
+             forcelarge = 1;
+         }
+     }
+
+     /* Store offset because a realloc may change the address of zl. */
+     offset = p-zl;
+     zl = ziplistResize(zl,curlen+reqlen+nextdiff);
+     p = zl+offset;
+
+     /* Apply memory move when necessary and update tail offset. */
+     if (p[0] != ZIP_END) {
+         /* Is the entry we insert in front of the current tail? */
+         int next_is_tail = (offset == ZIPLIST_TAIL_OFFSET(zl));
+
+         /* Subtract one because of the ZIP_END bytes */
+         memmove(p+reqlen,p-nextdiff,curlen-offset-1+nextdiff);
+         /* Encode this entry's raw length in the next entry. */
+         if (forcelarge) {
+             p[reqlen] = ZIP_BIGLEN;
+             memcpy(p+reqlen+1,&reqlen,sizeof(reqlen));
+         } else {
+             zipPrevEncodeLength(p+reqlen,reqlen);
+         }
+         /* Update offset for tail: it moves forward by the size of the new
+          * entry. The change in size of the next entry's prevlen field
+          * shifts only the entries after it, so it counts only when the
+          * next entry is not the tail itself. */
+         ZIPLIST_TAIL_OFFSET(zl) += reqlen;
+         if (!next_is_tail) ZIPLIST_TAIL_OFFSET(zl) += nextdiff;
+     } else {
+         /* This element will be the new tail. */
+         ZIPLIST_TAIL_OFFSET(zl) = p-zl;
+     }
+
+     /* Write the entry */
+     p += zipPrevEncodeLength(p,prevlen);
+     p += zipEncodeLength(p,encoding,slen);
+     if (encoding != ZIP_ENC_RAW) {
+         zipSaveInteger(p,value,encoding);
+     } else {
+         memcpy(p,s,slen);
+     }
+     ZIPLIST_INCR_LENGTH(zl,1);
+     return zl;
+ }
+
+ /* Add the string 's' of length 'slen' at the head of the ziplist when
+ * 'where' is ZIPLIST_HEAD, at the tail otherwise. Returns the (possibly
+ * moved) ziplist. */
+ unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where) {
+     unsigned char *pos;
+
+     if (where == ZIPLIST_HEAD)
+         pos = ZIPLIST_ENTRY_HEAD(zl);
+     else
+         pos = ZIPLIST_ENTRY_END(zl);
+     return __ziplistInsert(zl,pos,s,slen);
+ }
+
+ /* Return a pointer to the entry at position 'index', usable for iterating
+ * with ziplistNext. A non negative index counts from the head (0 is the
+ * first entry); a negative index counts from the tail (-1 is the last
+ * entry) and the list is traversed back to front. When the list doesn't
+ * contain an element at the provided index, NULL is returned. */
+ unsigned char *ziplistIndex(unsigned char *zl, int index) {
+     unsigned char *cur;
+     zlentry info;
+     int steps;
+
+     if (index >= 0) {
+         /* Walk forward from the head. */
+         steps = index;
+         cur = ZIPLIST_ENTRY_HEAD(zl);
+         while (cur[0] != ZIP_END && steps > 0) {
+             cur += zipRawEntryLength(cur);
+             steps--;
+         }
+     } else {
+         /* Walk backward from the tail, stopping at the head, which is
+          * the only entry with a previous entry length of 0. */
+         steps = (-index)-1;
+         cur = ZIPLIST_ENTRY_TAIL(zl);
+         if (cur[0] != ZIP_END) {
+             info = zipEntry(cur);
+             while (info.prevrawlen > 0 && steps > 0) {
+                 steps--;
+                 cur -= info.prevrawlen;
+                 info = zipEntry(cur);
+             }
+         }
+     }
+
+     /* Out of range: we hit the end, or steps are left to do. */
+     if (cur[0] == ZIP_END || steps > 0) return NULL;
+     return cur;
+ }
+
+ /* Return a pointer to the entry following "p", or NULL when there is no
+ * next entry. 'zl' is unused. */
+ unsigned char *ziplistNext(unsigned char *zl, unsigned char *p) {
+     unsigned char *next;
+     ((void) zl);
+
+     /* "p" could be equal to ZIP_END, caused by ziplistDelete:
+      * there is nothing after the end. */
+     if (p[0] == ZIP_END) return NULL;
+
+     /* "p" is the last entry when what follows it is ZIP_END. */
+     next = p+zipRawEntryLength(p);
+     if (next[0] == ZIP_END) return NULL;
+     return next;
+ }
+
+ /* Return a pointer to the entry preceding "p", or NULL when there is no
+ * previous entry. */
+ unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p) {
+     /* Iterating backwards from ZIP_END should return the tail, unless
+      * the list is empty. */
+     if (p[0] == ZIP_END) {
+         unsigned char *tail = ZIPLIST_ENTRY_TAIL(zl);
+         if (tail[0] == ZIP_END) return NULL;
+         return tail;
+     }
+
+     /* When "p" is the first element of the list we're already at the
+      * head. */
+     if (p == ZIPLIST_ENTRY_HEAD(zl)) return NULL;
+
+     return p-zipEntry(p).prevrawlen;
+ }
+
+ /* Get the entry pointed to by 'p' and store it in either 'sstr'/'slen' or
+ * 'sval' depending on the encoding of the entry: raw entries set '*sstr'
+ * to the payload and '*slen' to its length, integer encoded entries set
+ * '*sval'. When 'sstr' is not NULL '*sstr' is set to NULL for integer
+ * entries, to be able to find out whether the string pointer or the
+ * integer value was set. 'sstr' and 'sval' may be NULL.
+ * Return 0 if 'p' is NULL or points to the end of the ziplist (nothing is
+ * stored in that case), 1 otherwise. */
+ unsigned int ziplistGet(unsigned char *p, unsigned char **sstr, unsigned int *slen, long long *sval) {
+     zlentry info;
+
+     if (p == NULL) return 0;
+     if (p[0] == ZIP_END) return 0;
+     if (sstr) *sstr = NULL;
+
+     info = zipEntry(p);
+     if (info.encoding != ZIP_ENC_RAW) {
+         if (sval) *sval = zipLoadInteger(p+info.headersize,info.encoding);
+         return 1;
+     }
+     if (sstr) {
+         *slen = info.len;
+         *sstr = p+info.headersize;
+     }
+     return 1;
+ }
+
+ /* Insert the string 's' of length 'slen' as a new entry at "p".
+ * Public wrapper around __ziplistInsert(); returns the ziplist pointer
+ * returned by it, which must be used in place of 'zl' afterwards. */
+ unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen) {
+ return __ziplistInsert(zl,p,s,slen);
+ }
+
+ /* Delete a single entry from the ziplist, pointed to by *p.
+ * Also update *p in place, to be able to iterate over the
+ * ziplist, while deleting entries. Returns the ziplist pointer to use
+ * in place of 'zl' afterwards. */
+ unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p) {
+ /* Remember the position as an offset: the pointer itself may be
+ * invalidated by the deletion. */
+ unsigned int offset = *p-zl;
+ zl = __ziplistDelete(zl,*p,1);
+
+ /* Store pointer to current element in p, because ziplistDelete will
+ * do a realloc which might result in a different "zl"-pointer.
+ * When the delete direction is back to front, we might delete the last
+ * entry and end up with "p" pointing to ZIP_END, so check this. */
+ *p = zl+offset;
+ return zl;
+ }
+
+ /* Delete up to 'num' entries starting from the entry at position 'index'.
+ * The ziplist is returned untouched when there is no entry at 'index'. */
+ unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num) {
+     unsigned char *start = ziplistIndex(zl,index);
+
+     if (start == NULL) return zl;
+     return __ziplistDelete(zl,start,num);
+ }
+
+ /* Compare the entry pointed to by 'p' with the string 'sstr' of length
+ * 'slen'. Return 1 if equal, 0 otherwise (also when 'p' points to the
+ * end of the ziplist). Raw entries are compared byte by byte; integer
+ * encoded entries are equal only when 'sstr' encodes to the same
+ * encoding and value. */
+ unsigned int ziplistCompare(unsigned char *p, unsigned char *sstr, unsigned int slen) {
+     zlentry info;
+     unsigned char strenc;
+     long long stored, given;
+
+     if (p[0] == ZIP_END) return 0;
+     info = zipEntry(p);
+
+     /* Raw compare */
+     if (info.encoding == ZIP_ENC_RAW) {
+         if (info.len != slen) return 0;
+         return memcmp(p+info.headersize,sstr,slen) == 0;
+     }
+
+     /* Try to compare encoded values */
+     if (!zipTryEncoding(sstr,&given,&strenc)) return 0;
+     if (info.encoding != strenc) return 0;
+     stored = zipLoadInteger(p+info.headersize,info.encoding);
+     return stored == given;
+ }
+
+ /* Return the number of entries in the ziplist. The count stored in the
+ * header is used when it is below UINT16_MAX, otherwise the list is
+ * traversed (and the header is refreshed if the count is small enough). */
+ unsigned int ziplistLen(unsigned char *zl) {
+     unsigned int count;
+     unsigned char *cur;
+
+     if (ZIPLIST_LENGTH(zl) < UINT16_MAX) return ZIPLIST_LENGTH(zl);
+
+     count = 0;
+     for (cur = zl+ZIPLIST_HEADER_SIZE; *cur != ZIP_END; cur += zipRawEntryLength(cur))
+         count++;
+
+     /* Re-store length if small enough */
+     if (count < UINT16_MAX) ZIPLIST_LENGTH(zl) = count;
+     return count;
+ }
+
+ /* Return size in bytes of ziplist, as stored in the first header field
+ * (header and end marker included, see ziplistNew()). */
+ unsigned int ziplistSize(unsigned char *zl) {
+ return ZIPLIST_BYTES(zl);
+ }
+
+ /* Print a human readable representation of the ziplist on standard
+ * output: the header information followed by one line per entry with its
+ * offset, header size, payload size and content. */
+ void ziplistRepr(unsigned char *zl) {
+     unsigned char *p;
+     zlentry entry;
+
+     /* The arguments are cast so that they match the conversion
+      * specifiers whatever the width of the underlying types. */
+     printf("{total bytes %u} {length %u}\n",
+         (unsigned int)ZIPLIST_BYTES(zl), (unsigned int)ZIPLIST_LENGTH(zl));
+     p = ZIPLIST_ENTRY_HEAD(zl);
+     while(*p != ZIP_END) {
+         entry = zipEntry(p);
+         printf("{offset %ld, header %u, payload %u} ",
+             (long)(p-zl),entry.headersize,entry.len);
+         p += entry.headersize;
+         if (entry.encoding == ZIP_ENC_RAW) {
+             fwrite(p,entry.len,1,stdout);
+         } else {
+             printf("%lld", (long long)zipLoadInteger(p,entry.encoding));
+         }
+         printf("\n");
+         p += entry.len;
+     }
+     printf("{end}\n\n");
+ }
+
+#ifdef ZIPLIST_TEST_MAIN
+#include <sys/time.h>
+
+ /* Build the test list "hello", "foo", "quux", "1024" (head to tail). */
+ unsigned char *createList() {
+     static const struct {
+         const char *str;
+         unsigned int len;
+         int where;
+     } items[] = {
+         { "foo",   3, ZIPLIST_TAIL },
+         { "quux",  4, ZIPLIST_TAIL },
+         { "hello", 5, ZIPLIST_HEAD },
+         { "1024",  4, ZIPLIST_TAIL }
+     };
+     unsigned char *list = ziplistNew();
+     unsigned int n;
+
+     for (n = 0; n < sizeof(items)/sizeof(items[0]); n++)
+         list = ziplistPush(list,(unsigned char*)items[n].str,items[n].len,items[n].where);
+     return list;
+ }
+
+ /* Build a test list mixing strings that look like integers of different
+ * sizes with strings that are not integers. */
+ unsigned char *createIntList() {
+     static const struct {
+         const char *str;
+         int where;
+     } items[] = {
+         { "100",                          ZIPLIST_TAIL },
+         { "128000",                       ZIPLIST_TAIL },
+         { "-100",                         ZIPLIST_HEAD },
+         { "4294967296",                   ZIPLIST_HEAD },
+         { "non integer",                  ZIPLIST_TAIL },
+         { "much much longer non integer", ZIPLIST_TAIL }
+     };
+     unsigned char *list = ziplistNew();
+     unsigned int n;
+
+     for (n = 0; n < sizeof(items)/sizeof(items[0]); n++) {
+         const char *str = items[n].str;
+         list = ziplistPush(list,(unsigned char*)str,strlen(str),items[n].where);
+     }
+     return list;
+ }
+
+ /* Return the current time of day in microseconds. */
+ long long usec(void) {
+     struct timeval now;
+     long long micros;
+
+     gettimeofday(&now,NULL);
+     micros = ((long long)now.tv_sec)*1000000;
+     micros += now.tv_usec;
+     return micros;
+ }
+
+ /* Time 'num' push+delete pairs on lists of growing size: the sizes go
+ * from 0 up to (but not including) 'maxsize' in steps of 'dnum' entries.
+ * Every push is done at 'pos' (ZIPLIST_HEAD or ZIPLIST_TAIL) and is
+ * followed by the deletion of the first entry. One line with the timing
+ * is printed per list size. */
+ void stress(int pos, int num, int maxsize, int dnum) {
+     const char *posname[2] = { "HEAD", "TAIL" };
+     int size;
+
+     for (size = 0; size < maxsize; size += dnum) {
+         unsigned char *list = ziplistNew();
+         long long began;
+         int filled, round;
+
+         for (filled = 0; filled < size; filled++)
+             list = ziplistPush(list,(unsigned char*)"quux",4,ZIPLIST_TAIL);
+
+         /* Do num times a push+pop from pos */
+         began = usec();
+         for (round = 0; round < num; round++) {
+             list = ziplistPush(list,(unsigned char*)"quux",4,pos);
+             list = ziplistDeleteRange(list,0,1);
+         }
+         printf("List size: %8d, bytes: %8d, %dx push+pop (%s): %6lld usec\n",
+             size,ZIPLIST_BYTES(list),num,posname[pos],usec()-began);
+         zfree(list);
+     }
+ }
+
+ /* Print and remove the first entry of the list when 'where' is
+ * ZIPLIST_HEAD, the last entry otherwise. The process exits with status 1
+ * when the list is empty.
+ *
+ * The entry that is deleted is the one that was printed (index 0 for the
+ * head, -1 for the tail).
+ *
+ * NOTE(review): 'zl' is received by value, so the pointer returned by
+ * ziplistDeleteRange() cannot be handed back to the caller; the caller
+ * keeps using the old pointer. */
+ void pop(unsigned char *zl, int where) {
+     unsigned char *p, *vstr;
+     unsigned int vlen;
+     long long vlong;
+     int index = (where == ZIPLIST_HEAD) ? 0 : -1;
+
+     p = ziplistIndex(zl,index);
+     if (ziplistGet(p,&vstr,&vlen,&vlong)) {
+         if (where == ZIPLIST_HEAD)
+             printf("Pop head: ");
+         else
+             printf("Pop tail: ");
+
+         if (vstr)
+             fwrite(vstr,vlen,1,stdout);
+         else
+             printf("%lld", vlong);
+
+         printf("\n");
+         ziplistDeleteRange(zl,index,1);
+     } else {
+         printf("ERROR: Could not pop\n");
+         exit(1);
+     }
+ }
+
+ /* Test driver, compiled only when ZIPLIST_TEST_MAIN is defined.
+ * It prints the representation of some test lists and runs a sequence of
+ * checks (indexing, iteration in both directions, deletion, comparison)
+ * followed by the stress test. Returns 1 as soon as one of the checks
+ * that are verified programmatically fails, 0 at the end.
+ * 'argc' and 'argv' are not used, and the lists created by every section
+ * are not released. */
+ int main(int argc, char **argv) {
+ unsigned char *zl, *p;
+ unsigned char *entry;
+ unsigned int elen;
+ long long value;
+
+ zl = createIntList();
+ ziplistRepr(zl);
+
+ zl = createList();
+ ziplistRepr(zl);
+
+ /* Pop the four entries of the list one by one. */
+ pop(zl,ZIPLIST_TAIL);
+ ziplistRepr(zl);
+
+ pop(zl,ZIPLIST_HEAD);
+ ziplistRepr(zl);
+
+ pop(zl,ZIPLIST_TAIL);
+ ziplistRepr(zl);
+
+ pop(zl,ZIPLIST_TAIL);
+ ziplistRepr(zl);
+
+ printf("Get element at index 3:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, 3);
+ if (!ziplistGet(p, &entry, &elen, &value)) {
+ printf("ERROR: Could not access index 3\n");
+ return 1;
+ }
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ printf("\n");
+ } else {
+ printf("%lld\n", value);
+ }
+ printf("\n");
+ }
+
+ printf("Get element at index 4 (out of range):\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, 4);
+ if (p == NULL) {
+ printf("No entry\n");
+ } else {
+ printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl);
+ return 1;
+ }
+ printf("\n");
+ }
+
+ printf("Get element at index -1 (last element):\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, -1);
+ if (!ziplistGet(p, &entry, &elen, &value)) {
+ printf("ERROR: Could not access index -1\n");
+ return 1;
+ }
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ printf("\n");
+ } else {
+ printf("%lld\n", value);
+ }
+ printf("\n");
+ }
+
+ printf("Get element at index -4 (first element):\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, -4);
+ if (!ziplistGet(p, &entry, &elen, &value)) {
+ printf("ERROR: Could not access index -4\n");
+ return 1;
+ }
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ printf("\n");
+ } else {
+ printf("%lld\n", value);
+ }
+ printf("\n");
+ }
+
+ printf("Get element at index -5 (reverse out of range):\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, -5);
+ if (p == NULL) {
+ printf("No entry\n");
+ } else {
+ printf("ERROR: Out of range index should return NULL, returned offset: %ld\n", p-zl);
+ return 1;
+ }
+ printf("\n");
+ }
+
+ printf("Iterate list from 0 to end:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, 0);
+ while (ziplistGet(p, &entry, &elen, &value)) {
+ printf("Entry: ");
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ } else {
+ printf("%lld", value);
+ }
+ p = ziplistNext(zl,p);
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+ printf("Iterate list from 1 to end:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, 1);
+ while (ziplistGet(p, &entry, &elen, &value)) {
+ printf("Entry: ");
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ } else {
+ printf("%lld", value);
+ }
+ p = ziplistNext(zl,p);
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+ printf("Iterate list from 2 to end:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, 2);
+ while (ziplistGet(p, &entry, &elen, &value)) {
+ printf("Entry: ");
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ } else {
+ printf("%lld", value);
+ }
+ p = ziplistNext(zl,p);
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+ printf("Iterate starting out of range:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, 4);
+ if (!ziplistGet(p, &entry, &elen, &value)) {
+ printf("No entry\n");
+ } else {
+ /* Unlike the other checks, this failure does not stop the test. */
+ printf("ERROR\n");
+ }
+ printf("\n");
+ }
+
+ printf("Iterate from back to front:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, -1);
+ while (ziplistGet(p, &entry, &elen, &value)) {
+ printf("Entry: ");
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ } else {
+ printf("%lld", value);
+ }
+ p = ziplistPrev(zl,p);
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+ printf("Iterate from back to front, deleting all items:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl, -1);
+ while (ziplistGet(p, &entry, &elen, &value)) {
+ printf("Entry: ");
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ } else {
+ printf("%lld", value);
+ }
+ /* The entry is deleted after being printed: 'entry' points inside
+ * the ziplist. */
+ zl = ziplistDelete(zl,&p);
+ p = ziplistPrev(zl,p);
+ printf("\n");
+ }
+ printf("\n");
+ }
+
+ printf("Delete inclusive range 0,0:\n");
+ {
+ zl = createList();
+ zl = ziplistDeleteRange(zl, 0, 1);
+ ziplistRepr(zl);
+ }
+
+ printf("Delete inclusive range 0,1:\n");
+ {
+ zl = createList();
+ zl = ziplistDeleteRange(zl, 0, 2);
+ ziplistRepr(zl);
+ }
+
+ printf("Delete inclusive range 1,2:\n");
+ {
+ zl = createList();
+ zl = ziplistDeleteRange(zl, 1, 2);
+ ziplistRepr(zl);
+ }
+
+ printf("Delete with start index out of range:\n");
+ {
+ zl = createList();
+ zl = ziplistDeleteRange(zl, 5, 1);
+ ziplistRepr(zl);
+ }
+
+ printf("Delete with num overflow:\n");
+ {
+ zl = createList();
+ zl = ziplistDeleteRange(zl, 1, 5);
+ ziplistRepr(zl);
+ }
+
+ printf("Delete foo while iterating:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl,0);
+ while (ziplistGet(p,&entry,&elen,&value)) {
+ if (entry && strncmp("foo",(char*)entry,elen) == 0) {
+ printf("Delete foo\n");
+ /* After the deletion 'p' refers to the entry that followed the
+ * deleted one, so ziplistNext() is not called in this branch. */
+ zl = ziplistDelete(zl,&p);
+ } else {
+ printf("Entry: ");
+ if (entry) {
+ fwrite(entry,elen,1,stdout);
+ } else {
+ printf("%lld",value);
+ }
+ p = ziplistNext(zl,p);
+ printf("\n");
+ }
+ }
+ printf("\n");
+ ziplistRepr(zl);
+ }
+
+ printf("Create long list and check indices:\n");
+ {
+ zl = ziplistNew();
+ char buf[32];
+ int i,len;
+ for (i = 0; i < 1000; i++) {
+ len = sprintf(buf,"%d",i);
+ zl = ziplistPush(zl,(unsigned char*)buf,len,ZIPLIST_TAIL);
+ }
+ for (i = 0; i < 1000; i++) {
+ /* Every value must be found both counting from the head and
+ * counting from the tail. */
+ p = ziplistIndex(zl,i);
+ assert(ziplistGet(p,NULL,NULL,&value));
+ assert(i == value);
+
+ p = ziplistIndex(zl,-i-1);
+ assert(ziplistGet(p,NULL,NULL,&value));
+ assert(999-i == value);
+ }
+ printf("SUCCESS\n\n");
+ }
+
+ printf("Compare strings with ziplist entries:\n");
+ {
+ zl = createList();
+ p = ziplistIndex(zl,0);
+ if (!ziplistCompare(p,(unsigned char*)"hello",5)) {
+ printf("ERROR: not \"hello\"\n");
+ return 1;
+ }
+ if (ziplistCompare(p,(unsigned char*)"hella",5)) {
+ printf("ERROR: \"hella\"\n");
+ return 1;
+ }
+
+ p = ziplistIndex(zl,3);
+ if (!ziplistCompare(p,(unsigned char*)"1024",4)) {
+ printf("ERROR: not \"1024\"\n");
+ return 1;
+ }
+ if (ziplistCompare(p,(unsigned char*)"1025",4)) {
+ printf("ERROR: \"1025\"\n");
+ return 1;
+ }
+ printf("SUCCESS\n");
+ }
+
+ printf("Stress with variable ziplist size:\n");
+ {
+ stress(ZIPLIST_HEAD,100000,16384,256);
+ stress(ZIPLIST_TAIL,100000,16384,256);
+ }
+
+ return 0;
+ }
+
+#endif
diff --git a/src/ziplist.h b/src/ziplist.h
new file mode 100644
index 000000000..311257256
--- /dev/null
+++ b/src/ziplist.h
@@ -0,0 +1,15 @@
+/* Public interface of the ziplist. A ziplist is handled through a plain
+ * 'unsigned char *'. Functions returning 'unsigned char *zl'-style handles
+ * are used by callers as "zl = f(zl, ...)", i.e. the returned pointer
+ * replaces the one passed in. */
+
+/* Values for the 'where' argument of ziplistPush(). */
+#define ZIPLIST_HEAD 0
+#define ZIPLIST_TAIL 1
+
+/* Create a new ziplist and return its handle. */
+unsigned char *ziplistNew(void);
+/* Push the 'slen' bytes at 's' at the end of the list selected by 'where'
+ * (ZIPLIST_HEAD or ZIPLIST_TAIL). Returns the handle to use afterwards. */
+unsigned char *ziplistPush(unsigned char *zl, unsigned char *s, unsigned int slen, int where);
+/* Return a pointer to the entry at 'index'. The test code in ziplist.c also
+ * uses negative indexes, where -1 addresses the last entry. */
+unsigned char *ziplistIndex(unsigned char *zl, int index);
+/* Step from entry 'p' to the following / preceding entry.
+ * NOTE(review): end-of-list return value is not visible from this header. */
+unsigned char *ziplistNext(unsigned char *zl, unsigned char *p);
+unsigned char *ziplistPrev(unsigned char *zl, unsigned char *p);
+/* Read the entry at 'p'. Callers treat a non-zero return as "entry read".
+ * As used by the test code, a non-NULL *sval (with *slen bytes) denotes a
+ * string entry, otherwise the value is an integer stored in *lval. */
+unsigned int ziplistGet(unsigned char *p, unsigned char **sval, unsigned int *slen, long long *lval);
+/* Insert the 'slen' bytes at 's' at position 'p'.
+ * NOTE(review): presumably inserts before 'p' -- confirm in ziplist.c. */
+unsigned char *ziplistInsert(unsigned char *zl, unsigned char *p, unsigned char *s, unsigned int slen);
+/* Delete the entry referenced by *p. '*p' is passed by reference so that
+ * iteration can continue from it after the deletion. */
+unsigned char *ziplistDelete(unsigned char *zl, unsigned char **p);
+/* Delete up to 'num' entries starting at 'index'. */
+unsigned char *ziplistDeleteRange(unsigned char *zl, unsigned int index, unsigned int num);
+/* Compare the entry at 'p' with the 'slen' bytes at 's'; callers treat a
+ * non-zero return as "equal". */
+unsigned int ziplistCompare(unsigned char *p, unsigned char *s, unsigned int slen);
+/* Number of entries in the list. */
+unsigned int ziplistLen(unsigned char *zl);
+/* NOTE(review): presumably the size in bytes of the whole ziplist -- confirm. */
+unsigned int ziplistSize(unsigned char *zl);
diff --git a/src/zipmap.c b/src/zipmap.c
new file mode 100644
index 000000000..35faeabef
--- /dev/null
+++ b/src/zipmap.c
@@ -0,0 +1,455 @@
+/* String -> String Map data structure optimized for size.
+ * This file implements a data structure mapping strings to other strings
+ * implementing an O(n) lookup data structure designed to be very memory
+ * efficient.
+ *
+ * The Redis Hash type uses this data structure for hashes composed of a small
+ * number of elements, to switch to an hash table once a given number of
+ * elements is reached.
+ *
+ * Given that many times Redis Hashes are used to represent objects composed
+ * of few fields, this is a very big win in terms of used memory.
+ *
+ * --------------------------------------------------------------------------
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Memory layout of a zipmap, for the map "foo" => "bar", "hello" => "world":
+ *
+ * <zmlen><len>"foo"<len><free>"bar"<len>"hello"<len><free>"world"
+ *
+ * <zmlen> is 1 byte length that holds the current size of the zipmap.
+ * When the zipmap length is greater than or equal to 254, this value
+ * is not used and the zipmap needs to be traversed to find out the length.
+ *
+ * <len> is the length of the following string (key or value).
+ * <len> lengths are encoded in a single value or in a 5 bytes value.
+ * If the first byte value (as an unsigned 8 bit value) is between 0 and
+ * 253, it's a single-byte length. If it is 254 then a four bytes unsigned
+ * integer follows (in the host byte ordering). A value of 255 is used to
+ * signal the end of the hash. Unused space is not marked by a special
+ * <len> value: it is tracked by the <free> byte described below.
+ *
+ * <free> is the number of free unused bytes
+ * after the string, resulting from modification of values associated to a
+ * key (for instance if "foo" is set to "bar", and later "foo" will be set to
+ * "hi", I'll have a free byte to use if the value will enlarge again later,
+ * or even in order to add a key/value pair if it fits).
+ *
+ * <free> is always an unsigned 8 bit number, because if after an
+ * update operation there are more than a few free bytes, the zipmap will be
+ * reallocated to make sure it is as small as possible.
+ *
+ * The most compact representation of the above two elements hash is actually:
+ *
+ * "\x02\x03foo\x03\x00bar\x05hello\x05\x00world\xff"
+ *
+ * Note that because keys and values are prefixed length "objects",
+ * the lookup will take O(N) where N is the number of elements
+ * in the zipmap and *not* the number of bytes needed to represent the zipmap.
+ * This lowers the constant times considerably.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "zmalloc.h"
+
+/* First-byte values with a special meaning in an encoded <len>:
+ * ZIPMAP_BIGLEN means that the real length follows as an unsigned int,
+ * ZIPMAP_END marks the end of the zipmap. Any smaller first byte is the
+ * length itself. ZIPMAP_BIGLEN is also the saturation point of the
+ * <zmlen> entry counter stored in the first byte of the zipmap. */
+#define ZIPMAP_BIGLEN 254
+#define ZIPMAP_END 255
+
+/* The following defines the max value for the <free> field described in the
+ * comments above, that is, the max number of trailing bytes in a value.
+ * zipmapSet() shrinks the zipmap as soon as the unused space of an entry is
+ * greater than or equal to this value, so the stored <free> count is always
+ * smaller than it. */
+#define ZIPMAP_VALUE_MAX_FREE 4
+
+/* The following macro returns the number of bytes needed to encode the length
+ * for the integer value _l, that is, 1 byte for lengths < ZIPMAP_BIGLEN and
+ * 1+sizeof(unsigned int) bytes (5 on common platforms) for all the other
+ * lengths. */
+#define ZIPMAP_LEN_BYTES(_l) (((_l) < ZIPMAP_BIGLEN) ? 1 : sizeof(unsigned int)+1)
+
+/* Allocate and return an empty zipmap: a zero entry counter immediately
+ * followed by the end-of-map marker. */
+unsigned char *zipmapNew(void) {
+    unsigned char *map = zmalloc(2);
+
+    map[1] = ZIPMAP_END;
+    map[0] = 0; /* No entries yet */
+    return map;
+}
+
+/* Return the length encoded at 'p'. A first byte smaller than
+ * ZIPMAP_BIGLEN is the length itself, otherwise the length is the
+ * unsigned int stored (in host byte order) right after the first byte. */
+static unsigned int zipmapDecodeLength(unsigned char *p) {
+    unsigned int decoded;
+
+    if (p[0] >= ZIPMAP_BIGLEN) {
+        memcpy(&decoded,p+1,sizeof(decoded));
+        return decoded;
+    }
+    return p[0];
+}
+
+/* Write the encoding of 'len' at 'p' and return the number of bytes used.
+ * When 'p' is NULL nothing is written and only the number of bytes the
+ * encoding would take is returned. */
+static unsigned int zipmapEncodeLength(unsigned char *p, unsigned int len) {
+    if (p == NULL) return ZIPMAP_LEN_BYTES(len);
+
+    if (len >= ZIPMAP_BIGLEN) {
+        /* Marker byte followed by the full unsigned int. */
+        p[0] = ZIPMAP_BIGLEN;
+        memcpy(p+1,&len,sizeof(len));
+        return 1+sizeof(len);
+    }
+    p[0] = len;
+    return 1;
+}
+
+/* Look up 'key' ('klen' bytes) and return a pointer to the start of its
+ * entry inside the zipmap, or NULL when the key is not present.
+ *
+ * When 'totlen' is not NULL the whole zipmap is always scanned and
+ * *totlen is set to its total size in bytes (end marker included), so
+ * that the caller is able to reallocate the zipmap. When 'totlen' is NULL
+ * the function returns as soon as the key is found. */
+static unsigned char *zipmapLookupRaw(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned int *totlen) {
+    unsigned char *cur = zm+1, *found = NULL;
+
+    while (*cur != ZIPMAP_END) {
+        unsigned int keylen = zipmapDecodeLength(cur);
+        unsigned int hdrlen = zipmapEncodeLength(NULL,keylen);
+        unsigned int vallen;
+        unsigned char unused;
+
+        /* Only the first match counts. */
+        if (found == NULL && keylen == klen && memcmp(cur+hdrlen,key,keylen) == 0) {
+            if (totlen == NULL) return cur;
+            found = cur;
+        }
+        cur += hdrlen+keylen;
+
+        /* Step over the value: length header, <free> byte, payload and
+         * the unused trailing bytes. */
+        vallen = zipmapDecodeLength(cur);
+        cur += zipmapEncodeLength(NULL,vallen);
+        unused = cur[0];
+        cur += vallen+1+unused;
+    }
+    if (totlen != NULL) *totlen = (unsigned int)(cur-zm)+1;
+    return found;
+}
+
+/* Return the number of bytes needed to store an entry with a key of
+ * 'klen' bytes and a value of 'vlen' bytes and no trailing free space:
+ * encoded key length + key + encoded value length + <free> byte + value.
+ *
+ * The size of the encoded lengths is obtained from zipmapEncodeLength()
+ * instead of assuming a 4 bytes unsigned int, so that this function
+ * always agrees with what the encoder actually writes. */
+static unsigned long zipmapRequiredLength(unsigned int klen, unsigned int vlen) {
+    unsigned int l;
+
+    l = klen+vlen+1; /* payloads plus the <free> byte */
+    l += zipmapEncodeLength(NULL,klen);
+    l += zipmapEncodeLength(NULL,vlen);
+    return l;
+}
+
+/* Number of bytes taken by the key stored at 'p':
+ * encoded length header + key payload. */
+static unsigned int zipmapRawKeyLength(unsigned char *p) {
+    unsigned int payload = zipmapDecodeLength(p);
+    unsigned int header = zipmapEncodeLength(NULL,payload);
+
+    return header + payload;
+}
+
+/* Number of bytes taken by the value stored at 'p': encoded length header
+ * + the single <free> byte + value payload + unused trailing bytes. */
+static unsigned int zipmapRawValueLength(unsigned char *p) {
+    unsigned int vlen = zipmapDecodeLength(p);
+    unsigned int hdr = zipmapEncodeLength(NULL,vlen);
+    unsigned int slack = p[hdr]; /* the <free> byte follows the header */
+
+    return hdr + 1 + vlen + slack;
+}
+
+/* Given 'p' pointing at the key of an entry, return the size in bytes of
+ * the whole entry: key, associated value and trailing free space. */
+static unsigned int zipmapRawEntryLength(unsigned char *p) {
+    unsigned int keybytes = zipmapRawKeyLength(p);
+    unsigned int valbytes = zipmapRawValueLength(p+keybytes);
+
+    return keybytes + valbytes;
+}
+
+/* Reallocate the zipmap to 'len' bytes and store the end marker in the
+ * last byte. Returns the (possibly moved) zipmap. */
+static inline unsigned char *zipmapResize(unsigned char *zm, unsigned int len) {
+    unsigned char *resized = zrealloc(zm, len);
+
+    resized[len-1] = ZIPMAP_END;
+    return resized;
+}
+
+/* Set key to value, creating the key if it does not already exist.
+ * If 'update' is not NULL, *update is set to 1 if the key was
+ * already present, otherwise to 0.
+ *
+ * The zipmap may be reallocated: the caller must replace its pointer with
+ * the returned one. */
+unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update) {
+ unsigned int zmlen, offset;
+ unsigned int freelen, reqlen = zipmapRequiredLength(klen,vlen);
+ unsigned int empty, vempty;
+ unsigned char *p;
+
+ /* Default for the insert path: the block written is exactly reqlen bytes. */
+ freelen = reqlen;
+ if (update) *update = 0;
+ /* zmlen receives the total size of the zipmap, end marker included. */
+ p = zipmapLookupRaw(zm,key,klen,&zmlen);
+ if (p == NULL) {
+ /* Key not found: enlarge */
+ zm = zipmapResize(zm, zmlen+reqlen);
+ /* The new entry starts where the old end marker was. */
+ p = zm+zmlen-1;
+ zmlen = zmlen+reqlen;
+
+ /* Increase zipmap length (this is an insert) */
+ if (zm[0] < ZIPMAP_BIGLEN) zm[0]++;
+ } else {
+ /* Key found. Is there enough space for the new value? */
+ /* Compute the total length: */
+ if (update) *update = 1;
+ freelen = zipmapRawEntryLength(p);
+ if (freelen < reqlen) {
+ /* Store the offset of this key within the current zipmap, so
+ * it can be resized. Then, move the tail backwards so this
+ * pair fits at the current position. */
+ offset = p-zm;
+ zm = zipmapResize(zm, zmlen-freelen+reqlen);
+ p = zm+offset;
+
+ /* The +1 in the number of bytes to be moved is caused by the
+ * end-of-zipmap byte. Note: the *original* zmlen is used. */
+ memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1));
+ zmlen = zmlen-freelen+reqlen;
+ freelen = reqlen;
+ }
+ }
+
+ /* We now have a suitable block where the key/value entry can
+ * be written. If there is too much free space, move the tail
+ * of the zipmap a few bytes to the front and shrink the zipmap,
+ * as we want zipmaps to be very space efficient. */
+ empty = freelen-reqlen;
+ if (empty >= ZIPMAP_VALUE_MAX_FREE) {
+ /* First, move the tail <empty> bytes to the front, then resize
+ * the zipmap to be <empty> bytes smaller. */
+ offset = p-zm;
+ memmove(p+reqlen, p+freelen, zmlen-(offset+freelen+1));
+ zmlen -= empty;
+ zm = zipmapResize(zm, zmlen);
+ p = zm+offset;
+ vempty = 0;
+ } else {
+ /* Few unused bytes: keep them and record their number in <free>. */
+ vempty = empty;
+ }
+
+ /* Just write the key + value and we are done. */
+ /* Key: */
+ p += zipmapEncodeLength(p,klen);
+ memcpy(p,key,klen);
+ p += klen;
+ /* Value: */
+ p += zipmapEncodeLength(p,vlen);
+ /* The <free> byte sits between the value length and the value payload. */
+ *p++ = vempty;
+ memcpy(p,val,vlen);
+ return zm;
+}
+
+/* Delete 'key' from the zipmap and return the zipmap to use afterwards.
+ * If 'deleted' is not NULL, *deleted is set to 1 when the key was found
+ * and removed, to 0 when the key was not found. */
+unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted) {
+    unsigned int total, entrylen;
+    unsigned char *entry = zipmapLookupRaw(zm,key,klen,&total);
+
+    if (entry == NULL) {
+        if (deleted) *deleted = 0;
+        return zm;
+    }
+
+    /* Slide what follows the entry over it (the end marker is rewritten
+     * by zipmapResize), then give the memory back. */
+    entrylen = zipmapRawEntryLength(entry);
+    memmove(entry, entry+entrylen, total-((entry-zm)+entrylen+1));
+    zm = zipmapResize(zm, total-entrylen);
+
+    /* The counter is only maintained while it is not saturated. */
+    if (zm[0] < ZIPMAP_BIGLEN) zm[0]--;
+
+    if (deleted) *deleted = 1;
+    return zm;
+}
+
+/* Call it before iterating through the elements via zipmapNext(): returns
+ * the position of the first entry, right after the <zmlen> byte. */
+unsigned char *zipmapRewind(unsigned char *zm) {
+    return &zm[1];
+}
+
+/* This function is used to iterate through all the zipmap elements.
+ * In the first call the first argument is the pointer to the zipmap + 1
+ * (see zipmapRewind()). In the next calls what zipmapNext returns is used
+ * as first argument. NULL is returned when there are no more elements.
+ *
+ * Each of 'key', 'klen', 'value' and 'vlen' may independently be NULL when
+ * the caller is not interested in that piece of information. The returned
+ * key and value pointers reference memory inside the zipmap.
+ *
+ * Example:
+ *
+ * unsigned char *i = zipmapRewind(my_zipmap);
+ * while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) {
+ *     printf("%u bytes key at %p\n", klen, (void*)key);
+ *     printf("%u bytes value at %p\n", vlen, (void*)value);
+ * }
+ */
+unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen) {
+    unsigned int l;
+
+    if (zm[0] == ZIPMAP_END) return NULL;
+
+    /* Key: the payload starts right after the encoded length. */
+    l = zipmapDecodeLength(zm);
+    if (key) *key = zm + ZIPMAP_LEN_BYTES(l);
+    if (klen) *klen = l;
+    zm += zipmapRawKeyLength(zm);
+
+    /* Value: the +1 skips the <free> byte that follows the encoded length. */
+    l = zipmapDecodeLength(zm);
+    if (value) *value = zm + ZIPMAP_LEN_BYTES(l) + 1;
+    if (vlen) *vlen = l;
+    zm += zipmapRawValueLength(zm);
+    return zm;
+}
+
+/* Look up 'key' and, when found, store in *value a pointer to the
+ * associated value (inside the zipmap) and in *vlen its length.
+ * Returns 1 if the key was found, 0 otherwise (outputs left untouched). */
+int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen) {
+    unsigned char *entry = zipmapLookupRaw(zm,key,klen,NULL);
+    unsigned char *v;
+    unsigned int len;
+
+    if (entry == NULL) return 0;
+
+    v = entry + zipmapRawKeyLength(entry);
+    len = zipmapDecodeLength(v);
+    *vlen = len;
+    /* Payload = after the encoded length and the <free> byte. */
+    *value = v + zipmapEncodeLength(NULL,len) + 1;
+    return 1;
+}
+
+/* Membership test: 1 when 'key' is stored in the zipmap, 0 otherwise. */
+int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen) {
+    unsigned char *hit = zipmapLookupRaw(zm,key,klen,NULL);
+
+    return hit ? 1 : 0;
+}
+
+/* Return the number of entries inside a zipmap. The count is read from
+ * the first byte when it is not saturated, otherwise the entries are
+ * counted one by one (and the first byte is refreshed if the count turns
+ * out to fit again). */
+unsigned int zipmapLen(unsigned char *zm) {
+    unsigned int count = 0;
+    unsigned char *it;
+
+    if (zm[0] < ZIPMAP_BIGLEN) return zm[0];
+
+    for (it = zipmapNext(zipmapRewind(zm),NULL,NULL,NULL,NULL);
+         it != NULL;
+         it = zipmapNext(it,NULL,NULL,NULL,NULL))
+    {
+        count++;
+    }
+
+    /* Re-store length if small enough */
+    if (count < ZIPMAP_BIGLEN) zm[0] = count;
+    return count;
+}
+
+/* Debugging aid: print on stdout a human readable dump of the zipmap at
+ * 'p': the first byte, then for every entry the key and value lengths
+ * and payloads, with one '.' for every unused trailing byte. */
+void zipmapRepr(unsigned char *p) {
+    printf("{status %u}",p[0]);
+    p++;
+
+    while (p[0] != ZIPMAP_END) {
+        unsigned int klen, vlen;
+        unsigned char slack;
+
+        klen = zipmapDecodeLength(p);
+        printf("{key %u}",klen);
+        p += zipmapEncodeLength(NULL,klen);
+        fwrite(p,klen,1,stdout);
+        p += klen;
+
+        vlen = zipmapDecodeLength(p);
+        printf("{value %u}",vlen);
+        p += zipmapEncodeLength(NULL,vlen);
+        slack = *p++; /* the <free> byte */
+        fwrite(p,vlen,1,stdout);
+        p += vlen+slack;
+        if (slack) {
+            printf("[");
+            for (; slack > 0; slack--) printf(".");
+            printf("]");
+        }
+    }
+    printf("{end}");
+    printf("\n");
+}
+
+#ifdef ZIPMAP_TEST_MAIN
+/* Self-test driver, compiled only when ZIPMAP_TEST_MAIN is defined.
+ * It exercises insert, update (shrinking and growing values), delete,
+ * large keys, direct lookup and iteration, printing the results on stdout
+ * for visual inspection. */
+int main(void) {
+    unsigned char *zm;
+
+    zm = zipmapNew();
+
+    zm = zipmapSet(zm,(unsigned char*) "name",4, (unsigned char*) "foo",3,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "surname",7, (unsigned char*) "foo",3,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "age",3, (unsigned char*) "foo",3,NULL);
+    zipmapRepr(zm);
+
+    zm = zipmapSet(zm,(unsigned char*) "hello",5, (unsigned char*) "world!",6,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "bar",3,NULL);
+    /* Shorter value: leaves free bytes after the value. */
+    zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "!",1,NULL);
+    zipmapRepr(zm);
+    /* Longer value: the entry has to grow. */
+    zm = zipmapSet(zm,(unsigned char*) "foo",3, (unsigned char*) "12345",5,NULL);
+    zipmapRepr(zm);
+    zm = zipmapSet(zm,(unsigned char*) "new",3, (unsigned char*) "xx",2,NULL);
+    zm = zipmapSet(zm,(unsigned char*) "noval",5, (unsigned char*) "",0,NULL);
+    zipmapRepr(zm);
+    zm = zipmapDel(zm,(unsigned char*) "new",3,NULL);
+    zipmapRepr(zm);
+
+    printf("\nLook up large key:\n");
+    {
+        unsigned char buf[512];
+        unsigned char *value;
+        unsigned int vlen, i;
+        for (i = 0; i < 512; i++) buf[i] = 'a';
+
+        zm = zipmapSet(zm,buf,512,(unsigned char*) "long",4,NULL);
+        if (zipmapGet(zm,buf,512,&value,&vlen)) {
+            /* The '*' precision of printf() takes an int argument. */
+            printf(" <long key> is associated to the %u bytes value: %.*s\n",
+                vlen, (int)vlen, value);
+        }
+    }
+
+    printf("\nPerform a direct lookup:\n");
+    {
+        unsigned char *value;
+        unsigned int vlen;
+
+        if (zipmapGet(zm,(unsigned char*) "foo",3,&value,&vlen)) {
+            printf(" foo is associated to the %u bytes value: %.*s\n",
+                vlen, (int)vlen, value);
+        }
+    }
+    printf("\nIterate trought elements:\n");
+    {
+        unsigned char *i = zipmapRewind(zm);
+        unsigned char *key, *value;
+        unsigned int klen, vlen;
+
+        while((i = zipmapNext(i,&key,&klen,&value,&vlen)) != NULL) {
+            printf(" %u:%.*s => %u:%.*s\n", klen, (int)klen, key, vlen, (int)vlen, value);
+        }
+    }
+    zfree(zm);
+    return 0;
+}
+#endif
diff --git a/src/zipmap.h b/src/zipmap.h
new file mode 100644
index 000000000..e5f6c9f28
--- /dev/null
+++ b/src/zipmap.h
@@ -0,0 +1,48 @@
+/* String -> String Map data structure optimized for size.
+ *
+ * See zipmap.c for more info.
+ *
+ * --------------------------------------------------------------------------
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Include guard. The macro tested here must be the same one defined on the
+ * next line, otherwise the guard never triggers. */
+#ifndef _ZIPMAP_H
+#define _ZIPMAP_H
+
+/* Create a new empty zipmap. */
+unsigned char *zipmapNew(void);
+/* Set 'key' to 'val'. If 'update' is not NULL, *update is set to 1 when the
+ * key already existed, to 0 otherwise. Returns the zipmap to use afterwards. */
+unsigned char *zipmapSet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char *val, unsigned int vlen, int *update);
+/* Remove 'key'. If 'deleted' is not NULL, *deleted is set to 1 when the key
+ * was found and removed, to 0 otherwise. Returns the zipmap to use afterwards. */
+unsigned char *zipmapDel(unsigned char *zm, unsigned char *key, unsigned int klen, int *deleted);
+/* Return the iterator position to pass to the first zipmapNext() call. */
+unsigned char *zipmapRewind(unsigned char *zm);
+/* Fetch the entry at the iterator position 'zm' and return the position of
+ * the next one, or NULL when there are no more entries. */
+unsigned char *zipmapNext(unsigned char *zm, unsigned char **key, unsigned int *klen, unsigned char **value, unsigned int *vlen);
+/* Look up 'key': returns 1 and fills *value / *vlen if found, 0 otherwise. */
+int zipmapGet(unsigned char *zm, unsigned char *key, unsigned int klen, unsigned char **value, unsigned int *vlen);
+/* Return 1 if 'key' exists, 0 otherwise. */
+int zipmapExists(unsigned char *zm, unsigned char *key, unsigned int klen);
+/* Return the number of entries inside the zipmap. */
+unsigned int zipmapLen(unsigned char *zm);
+/* Print a human readable representation of the zipmap on stdout. */
+void zipmapRepr(unsigned char *p);
+
+#endif /* _ZIPMAP_H */
diff --git a/src/zmalloc.c b/src/zmalloc.c
new file mode 100644
index 000000000..8658376a3
--- /dev/null
+++ b/src/zmalloc.c
@@ -0,0 +1,158 @@
+/* zmalloc - total amount of allocated memory aware version of malloc()
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include "config.h"
+
+/* Size of the header placed in front of every allocation when
+ * HAVE_MALLOC_SIZE is not defined: zmalloc() stores the requested size
+ * there and hands out the address PREFIX_SIZE bytes after it.
+ * NOTE(review): the larger prefix on __sun is presumably there to preserve
+ * the alignment of the returned pointer -- confirm. */
+#if defined(__sun)
+#define PREFIX_SIZE sizeof(long long)
+#else
+#define PREFIX_SIZE sizeof(size_t)
+#endif
+
+/* Add __n bytes to the used_memory counter. The amount is first rounded up
+ * to the next multiple of sizeof(long) (the bit mask assumes sizeof(long)
+ * is a power of two). The update is performed holding used_memory_mutex
+ * when zmalloc_thread_safe is set, without any locking otherwise. */
+#define increment_used_memory(__n) do { \
+ size_t _n = (__n); \
+ if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \
+ if (zmalloc_thread_safe) { \
+ pthread_mutex_lock(&used_memory_mutex); \
+ used_memory += _n; \
+ pthread_mutex_unlock(&used_memory_mutex); \
+ } else { \
+ used_memory += _n; \
+ } \
+} while(0)
+
+/* Subtract __n bytes from the used_memory counter. The amount is rounded up
+ * to the next multiple of sizeof(long) exactly as increment_used_memory()
+ * does, so that matching increments and decrements cancel out. The update
+ * is performed holding used_memory_mutex when zmalloc_thread_safe is set. */
+#define decrement_used_memory(__n) do { \
+ size_t _n = (__n); \
+ if (_n&(sizeof(long)-1)) _n += sizeof(long)-(_n&(sizeof(long)-1)); \
+ if (zmalloc_thread_safe) { \
+ pthread_mutex_lock(&used_memory_mutex); \
+ used_memory -= _n; \
+ pthread_mutex_unlock(&used_memory_mutex); \
+ } else { \
+ used_memory -= _n; \
+ } \
+} while(0)
+
+/* Running total maintained by increment/decrement_used_memory(). */
+static size_t used_memory = 0;
+/* When non-zero the counter is accessed holding used_memory_mutex.
+ * Set by zmalloc_enable_thread_safeness(). */
+static int zmalloc_thread_safe = 0;
+/* Protects used_memory when zmalloc_thread_safe is set. */
+pthread_mutex_t used_memory_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Out of memory handler: report on stderr the size of the allocation that
+ * failed, then terminate the process with abort(). */
+static void zmalloc_oom(size_t size) {
+    FILE *err = stderr;
+
+    fprintf(err, "zmalloc: Out of memory trying to allocate %zu bytes\n", size);
+    fflush(err);
+    abort();
+}
+
+/* Allocate 'size' bytes, account for them in used_memory and return the
+ * pointer. The process is aborted via zmalloc_oom() if the allocation fails,
+ * so NULL is never returned to the caller.
+ *
+ * With HAVE_MALLOC_SIZE the size of the block is obtained from the
+ * allocator itself, so no size prefix is stored and exactly 'size' bytes
+ * are requested (as zrealloc() does in the same configuration). Without it
+ * the requested size is stored in a PREFIX_SIZE header in front of the
+ * returned memory. */
+void *zmalloc(size_t size) {
+#ifdef HAVE_MALLOC_SIZE
+    void *ptr = malloc(size);
+
+    if (!ptr) zmalloc_oom(size);
+    increment_used_memory(redis_malloc_size(ptr));
+    return ptr;
+#else
+    void *ptr = malloc(size+PREFIX_SIZE);
+
+    if (!ptr) zmalloc_oom(size);
+    *((size_t*)ptr) = size;
+    increment_used_memory(size+PREFIX_SIZE);
+    return (char*)ptr+PREFIX_SIZE;
+#endif
+}
+
+/* Resize the block at 'ptr' (obtained from zmalloc/zrealloc) to 'size'
+ * bytes, updating used_memory. A NULL 'ptr' is the same as zmalloc(size).
+ * The process is aborted via zmalloc_oom() if the reallocation fails. */
+void *zrealloc(void *ptr, size_t size) {
+    size_t before;
+    void *moved;
+
+    if (ptr == NULL) return zmalloc(size);
+#ifdef HAVE_MALLOC_SIZE
+    before = redis_malloc_size(ptr);
+    moved = realloc(ptr,size);
+    if (moved == NULL) zmalloc_oom(size);
+
+    decrement_used_memory(before);
+    increment_used_memory(redis_malloc_size(moved));
+    return moved;
+#else
+    {
+        /* The real allocation starts PREFIX_SIZE bytes before 'ptr' and
+         * begins with the size requested by the user. */
+        char *base = (char*)ptr-PREFIX_SIZE;
+
+        before = *((size_t*)base);
+        moved = realloc(base,size+PREFIX_SIZE);
+        if (moved == NULL) zmalloc_oom(size);
+
+        *((size_t*)moved) = size;
+        decrement_used_memory(before);
+        increment_used_memory(size);
+        return (char*)moved+PREFIX_SIZE;
+    }
+#endif
+}
+
+/* Release a block obtained from zmalloc/zrealloc/zstrdup and subtract it
+ * from used_memory. Passing NULL is a no-op. */
+void zfree(void *ptr) {
+    if (ptr == NULL) return;
+#ifdef HAVE_MALLOC_SIZE
+    decrement_used_memory(redis_malloc_size(ptr));
+    free(ptr);
+#else
+    {
+        /* Step back to the size header written by zmalloc/zrealloc. */
+        size_t *header = (size_t*)((char*)ptr-PREFIX_SIZE);
+
+        decrement_used_memory(*header+PREFIX_SIZE);
+        free(header);
+    }
+#endif
+}
+
+/* Return a copy of the null terminated string 's' allocated with
+ * zmalloc(); release it with zfree(). */
+char *zstrdup(const char *s) {
+    size_t bytes = strlen(s)+1; /* include the null terminator */
+    char *copy = zmalloc(bytes);
+
+    return memcpy(copy,s,bytes);
+}
+
+/* Return the current value of the used_memory counter. The read is done
+ * holding used_memory_mutex when thread safeness is enabled. */
+size_t zmalloc_used_memory(void) {
+    size_t snapshot;
+
+    if (!zmalloc_thread_safe) return used_memory;
+
+    pthread_mutex_lock(&used_memory_mutex);
+    snapshot = used_memory;
+    pthread_mutex_unlock(&used_memory_mutex);
+    return snapshot;
+}
+
+/* Make every following access to the used_memory counter take
+ * used_memory_mutex. There is no function in this file to turn the flag
+ * off again. */
+void zmalloc_enable_thread_safeness(void) {
+ zmalloc_thread_safe = 1;
+}
diff --git a/src/zmalloc.h b/src/zmalloc.h
new file mode 100644
index 000000000..193e7eda5
--- /dev/null
+++ b/src/zmalloc.h
@@ -0,0 +1,41 @@
+/* zmalloc - total amount of allocated memory aware version of malloc()
+ *
+ * Copyright (c) 2009-2010, Salvatore Sanfilippo <antirez at gmail dot com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Redis nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _ZMALLOC_H
+#define _ZMALLOC_H
+
+/* Allocate 'size' bytes, tracking them in the used memory counter.
+ * Aborts the process on out of memory. */
+void *zmalloc(size_t size);
+/* Resize a block obtained from zmalloc(); a NULL 'ptr' behaves as
+ * zmalloc(size). Aborts the process on out of memory. */
+void *zrealloc(void *ptr, size_t size);
+/* Release a block obtained from the functions above; NULL is a no-op. */
+void zfree(void *ptr);
+/* Duplicate the null terminated string 's' using zmalloc(). */
+char *zstrdup(const char *s);
+/* Return the current value of the used memory counter. */
+size_t zmalloc_used_memory(void);
+/* Make the used memory counter updates mutex protected from now on. */
+void zmalloc_enable_thread_safeness(void);
+
+#endif /* _ZMALLOC_H */